[llvm] r283842 - Codegen: Tail-duplicate during placement.

Kyle Butt via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 10 18:20:34 PDT 2016


Author: iteratee
Date: Mon Oct 10 20:20:33 2016
New Revision: 283842

URL: http://llvm.org/viewvc/llvm-project?rev=283842&view=rev
Log:
Codegen: Tail-duplicate during placement.

The tail duplication pass uses an assumed layout when making duplication
decisions. This is fine, but passes up duplication opportunities that
may arise when blocks are outlined. Because we want the updated CFG to
affect subsequent placement decisions, this change must occur during
placement.

In order to achieve this goal, TailDuplicationPass is split into a
utility class, TailDuplicator, and the pass itself. The pass delegates
nearly everything to the TailDuplicator object, except for looping over
the blocks in a function. This allows the same code to be used for tail
duplication in both places.

This change, in concert with outlining optional branches, allows
triangle-shaped code to perform much better, especially when the
taken/untaken branches are correlated, as it creates a second spine when
the tests are small enough.

The issue from the previous rollback is fixed, and a new test was added for
that case as well. It was a worklist/scheduling/taildup issue in layout.

The issue from the 2nd rollback is fixed, with 2 additional tests. It was
tail merging/loop info/tail-duplication causing problems with loops that
share a header block.

The issue with early tail-duplication of blocks that branch to a fallthrough
predecessor is fixed, with test case: tail-dup-branch-to-fallthrough.ll

Differential revision: https://reviews.llvm.org/D18226

Added:
    llvm/trunk/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll
    llvm/trunk/test/CodeGen/PowerPC/tail-dup-branch-to-fallthrough.ll
    llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll
    llvm/trunk/test/CodeGen/X86/tail-dup-merge-loop-headers.ll
    llvm/trunk/test/CodeGen/X86/tail-dup-repeat.ll
Modified:
    llvm/trunk/include/llvm/Analysis/LoopInfoImpl.h
    llvm/trunk/include/llvm/CodeGen/TailDuplicator.h
    llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp
    llvm/trunk/lib/CodeGen/TailDuplication.cpp
    llvm/trunk/lib/CodeGen/TailDuplicator.cpp
    llvm/trunk/test/CodeGen/AArch64/arm64-extload-knownzero.ll
    llvm/trunk/test/CodeGen/AArch64/machine_cse.ll
    llvm/trunk/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll
    llvm/trunk/test/CodeGen/PowerPC/branch-opt.ll
    llvm/trunk/test/CodeGen/PowerPC/sjlj.ll
    llvm/trunk/test/CodeGen/WebAssembly/cfg-stackify.ll
    llvm/trunk/test/CodeGen/WebAssembly/mem-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/block-placement.ll
    llvm/trunk/test/CodeGen/X86/cmov-into-branch.ll
    llvm/trunk/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
    llvm/trunk/test/CodeGen/X86/fp-une-cmp.ll
    llvm/trunk/test/CodeGen/X86/pr11202.ll
    llvm/trunk/test/CodeGen/X86/ragreedy-bug.ll
    llvm/trunk/test/CodeGen/X86/sse1.ll
    llvm/trunk/test/CodeGen/X86/update-terminator.mir

Modified: llvm/trunk/include/llvm/Analysis/LoopInfoImpl.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/LoopInfoImpl.h?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/LoopInfoImpl.h (original)
+++ llvm/trunk/include/llvm/Analysis/LoopInfoImpl.h Mon Oct 10 20:20:33 2016
@@ -186,8 +186,13 @@ BlockT *LoopBase<BlockT, LoopT>::getLoop
 template<class BlockT, class LoopT>
 void LoopBase<BlockT, LoopT>::
 addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase<BlockT, LoopT> &LIB) {
-  assert((Blocks.empty() || LIB[getHeader()] == this) &&
-         "Incorrect LI specified for this loop!");
+#ifndef NDEBUG
+  if (!Blocks.empty()) {
+    auto SameHeader = LIB[getHeader()];
+    assert(contains(SameHeader) && getHeader() == SameHeader->getHeader()
+           && "Incorrect LI specified for this loop!");
+  }
+#endif
   assert(NewBB && "Cannot add a null basic block to the loop!");
   assert(!LIB[NewBB] && "BasicBlock already in the loop!");
 

Modified: llvm/trunk/include/llvm/CodeGen/TailDuplicator.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/TailDuplicator.h?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/TailDuplicator.h (original)
+++ llvm/trunk/include/llvm/CodeGen/TailDuplicator.h Mon Oct 10 20:20:33 2016
@@ -15,6 +15,7 @@
 #ifndef LLVM_CODEGEN_TAILDUPLICATOR_H
 #define LLVM_CODEGEN_TAILDUPLICATOR_H
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -37,6 +38,7 @@ class TailDuplicator {
   MachineRegisterInfo *MRI;
   MachineFunction *MF;
   bool PreRegAlloc;
+  bool LayoutMode;
   unsigned TailDupSize;
 
   // A list of virtual registers for which to update SSA form.
@@ -50,10 +52,16 @@ class TailDuplicator {
 
 public:
   /// Prepare to run on a specific machine function.
-  /// @param TailDupSize - Maxmimum size of blocks to tail-duplicate.
+  /// @param MF - Function that will be processed
+  /// @param MBPI - Branch Probability Info. Used to propagate correct
+  ///     probabilities when modifying the CFG.
+  /// @param LayoutMode - When true, don't use the existing layout to make
+  ///     decisions.
+  /// @param TailDupSize - Maxmimum size of blocks to tail-duplicate. Zero
+  ///     default implies using the command line value TailDupSize.
   void initMF(MachineFunction &MF,
               const MachineBranchProbabilityInfo *MBPI,
-              unsigned TailDupSize = 0);
+              bool LayoutMode, unsigned TailDupSize = 0);
   bool tailDuplicateBlocks();
   static bool isSimpleBB(MachineBasicBlock *TailBB);
   bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB);
@@ -63,9 +71,13 @@ public:
   /// up.
   /// If \p DuplicatePreds is not null, it will be updated to contain the list
   /// of predecessors that received a copy of \p MBB.
+  /// If \p RemovalCallback is non-null. It will be called before MBB is
+  /// deleted.
   bool tailDuplicateAndUpdate(
       bool IsSimple, MachineBasicBlock *MBB,
-      SmallVectorImpl<MachineBasicBlock*> *DuplicatedPreds = nullptr);
+      MachineBasicBlock *ForcedLayoutPred,
+      SmallVectorImpl<MachineBasicBlock*> *DuplicatedPreds = nullptr,
+      llvm::function_ref<void(MachineBasicBlock *)> *RemovalCallback = nullptr);
 
 private:
   typedef TargetInstrInfo::RegSubRegPair RegSubRegPair;
@@ -89,14 +101,18 @@ private:
                          SmallVectorImpl<MachineBasicBlock *> &TDBBs,
                          const DenseSet<unsigned> &RegsUsedByPhi,
                          SmallVectorImpl<MachineInstr *> &Copies);
-  bool tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
+  bool tailDuplicate(bool IsSimple,
+                     MachineBasicBlock *TailBB,
+                     MachineBasicBlock *ForcedLayoutPred,
                      SmallVectorImpl<MachineBasicBlock *> &TDBBs,
                      SmallVectorImpl<MachineInstr *> &Copies);
   void appendCopies(MachineBasicBlock *MBB,
                  SmallVectorImpl<std::pair<unsigned,RegSubRegPair>> &CopyInfos,
                  SmallVectorImpl<MachineInstr *> &Copies);
 
-  void removeDeadBlock(MachineBasicBlock *MBB);
+  void removeDeadBlock(
+      MachineBasicBlock *MBB,
+      llvm::function_ref<void(MachineBasicBlock *)> *RemovalCallback = nullptr);
 };
 
 } // End llvm namespace

Modified: llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp (original)
+++ llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp Mon Oct 10 20:20:33 2016
@@ -40,6 +40,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TailDuplicator.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -121,6 +122,12 @@ static cl::opt<unsigned> MisfetchCost(
 static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
                                       cl::desc("Cost of jump instructions."),
                                       cl::init(1), cl::Hidden);
+static cl::opt<bool>
+TailDupPlacement("tail-dup-placement",
+              cl::desc("Perform tail duplication during placement. "
+                       "Creates more fallthrough opportunites in "
+                       "outline branches."),
+              cl::init(true), cl::Hidden);
 
 static cl::opt<bool>
 BranchFoldPlacement("branch-fold-placement",
@@ -128,6 +135,14 @@ BranchFoldPlacement("branch-fold-placeme
                        "Reduces code size."),
               cl::init(true), cl::Hidden);
 
+// Heuristic for tail duplication.
+static cl::opt<unsigned> TailDuplicatePlacementThreshold(
+    "tail-dup-placement-threshold",
+    cl::desc("Instruction cutoff for tail duplication during layout. "
+             "Tail merging during layout is forced to have a threshold "
+             "that won't conflict."), cl::init(2),
+    cl::Hidden);
+
 extern cl::opt<unsigned> StaticLikelyProb;
 extern cl::opt<unsigned> ProfileLikelyProb;
 
@@ -185,6 +200,16 @@ public:
   /// \brief End of blocks within the chain.
   iterator end() { return Blocks.end(); }
 
+  bool remove(MachineBasicBlock* BB) {
+    for(iterator i = begin(); i != end(); ++i) {
+      if (*i == BB) {
+        Blocks.erase(i);
+        return true;
+      }
+    }
+    return false;
+  }
+
   /// \brief Merge a block chain into this one.
   ///
   /// This routine merges a block chain into this one. It takes care of forming
@@ -266,6 +291,13 @@ class MachineBlockPlacement : public Mac
   /// \brief A handle to the post dominator tree.
   MachineDominatorTree *MDT;
 
+  /// \brief Duplicator used to duplicate tails during placement.
+  ///
+  /// Placement decisions can open up new tail duplication opportunities, but
+  /// since tail duplication affects placement decisions of later blocks, it
+  /// must be done inline.
+  TailDuplicator TailDup;
+
   /// \brief A set of blocks that are unavoidably execute, i.e. they dominate
   /// all terminators of the MachineFunction.
   SmallPtrSet<MachineBasicBlock *, 4> UnavoidableBlocks;
@@ -287,8 +319,18 @@ class MachineBlockPlacement : public Mac
   /// between basic blocks.
   DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain;
 
+  /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
+  /// if the count goes to 0, add them to the appropriate work list.
   void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
                            const BlockFilterSet *BlockFilter = nullptr);
+
+  /// Decrease the UnscheduledPredecessors count for a single block, and
+  /// if the count goes to 0, add them to the appropriate work list.
+  void markBlockSuccessors(
+      BlockChain &Chain, MachineBasicBlock *BB, MachineBasicBlock *LoopHeaderBB,
+      const BlockFilterSet *BlockFilter = nullptr);
+
+
   BranchProbability
   collectViableSuccessors(MachineBasicBlock *BB, BlockChain &Chain,
                           const BlockFilterSet *BlockFilter,
@@ -298,6 +340,16 @@ class MachineBlockPlacement : public Mac
                                  const BlockFilterSet *BlockFilter,
                                  BranchProbability SuccProb,
                                  BranchProbability HotProb);
+  bool repeatedlyTailDuplicateBlock(
+      MachineBasicBlock *BB, MachineBasicBlock *&LPred,
+      MachineBasicBlock *LoopHeaderBB,
+      BlockChain &Chain, BlockFilterSet *BlockFilter,
+      MachineFunction::iterator &PrevUnplacedBlockIt);
+  bool maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred,
+                               const BlockChain &Chain,
+                               BlockFilterSet *BlockFilter,
+                               MachineFunction::iterator &PrevUnplacedBlockIt,
+                               bool &DuplicatedToPred);
   bool
   hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ,
                              BlockChain &SuccChain, BranchProbability SuccProb,
@@ -323,7 +375,7 @@ class MachineBlockPlacement : public Mac
                      SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
                      const BlockFilterSet *BlockFilter);
   void buildChain(MachineBasicBlock *BB, BlockChain &Chain,
-                  const BlockFilterSet *BlockFilter = nullptr);
+                  BlockFilterSet *BlockFilter = nullptr);
   MachineBasicBlock *findBestLoopTop(MachineLoop &L,
                                      const BlockFilterSet &LoopBlockSet);
   MachineBasicBlock *findBestLoopExit(MachineLoop &L,
@@ -388,37 +440,49 @@ static std::string getBlockName(MachineB
 /// When a chain is being merged into the "placed" chain, this routine will
 /// quickly walk the successors of each block in the chain and mark them as
 /// having one fewer active predecessor. It also adds any successors of this
-/// chain which reach the zero-predecessor state to the worklist passed in.
+/// chain which reach the zero-predecessor state to the appropriate worklist.
 void MachineBlockPlacement::markChainSuccessors(
     BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
     const BlockFilterSet *BlockFilter) {
   // Walk all the blocks in this chain, marking their successors as having
   // a predecessor placed.
   for (MachineBasicBlock *MBB : Chain) {
-    // Add any successors for which this is the only un-placed in-loop
-    // predecessor to the worklist as a viable candidate for CFG-neutral
-    // placement. No subsequent placement of this block will violate the CFG
-    // shape, so we get to use heuristics to choose a favorable placement.
-    for (MachineBasicBlock *Succ : MBB->successors()) {
-      if (BlockFilter && !BlockFilter->count(Succ))
-        continue;
-      BlockChain &SuccChain = *BlockToChain[Succ];
-      // Disregard edges within a fixed chain, or edges to the loop header.
-      if (&Chain == &SuccChain || Succ == LoopHeaderBB)
-        continue;
+    markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
+  }
+}
 
-      // This is a cross-chain edge that is within the loop, so decrement the
-      // loop predecessor count of the destination chain.
-      if (SuccChain.UnscheduledPredecessors == 0 ||
-          --SuccChain.UnscheduledPredecessors > 0)
-        continue;
+/// \brief Mark a single block's successors as having one fewer preds.
+///
+/// Under normal circumstances, this is only called by markChainSuccessors,
+/// but if a block that was to be placed is completely tail-duplicated away,
+/// and was duplicated into the chain end, we need to redo markBlockSuccessors
+/// for just that block.
+void MachineBlockPlacement::markBlockSuccessors(
+    BlockChain &Chain, MachineBasicBlock *MBB, MachineBasicBlock *LoopHeaderBB,
+    const BlockFilterSet *BlockFilter) {
+  // Add any successors for which this is the only un-placed in-loop
+  // predecessor to the worklist as a viable candidate for CFG-neutral
+  // placement. No subsequent placement of this block will violate the CFG
+  // shape, so we get to use heuristics to choose a favorable placement.
+  for (MachineBasicBlock *Succ : MBB->successors()) {
+    if (BlockFilter && !BlockFilter->count(Succ))
+      continue;
+    BlockChain &SuccChain = *BlockToChain[Succ];
+    // Disregard edges within a fixed chain, or edges to the loop header.
+    if (&Chain == &SuccChain || Succ == LoopHeaderBB)
+      continue;
 
-      auto *MBB = *SuccChain.begin();
-      if (MBB->isEHPad())
-        EHPadWorkList.push_back(MBB);
-      else
-        BlockWorkList.push_back(MBB);
-    }
+    // This is a cross-chain edge that is within the loop, so decrement the
+    // loop predecessor count of the destination chain.
+    if (SuccChain.UnscheduledPredecessors == 0 ||
+        --SuccChain.UnscheduledPredecessors > 0)
+      continue;
+
+    auto *NewBB = *SuccChain.begin();
+    if (NewBB->isEHPad())
+      EHPadWorkList.push_back(NewBB);
+    else
+      BlockWorkList.push_back(NewBB);
   }
 }
 
@@ -902,7 +966,7 @@ void MachineBlockPlacement::fillWorkList
 
 void MachineBlockPlacement::buildChain(
     MachineBasicBlock *BB, BlockChain &Chain,
-    const BlockFilterSet *BlockFilter) {
+    BlockFilterSet *BlockFilter) {
   assert(BB && "BB must not be null.\n");
   assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n");
   MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
@@ -937,6 +1001,17 @@ void MachineBlockPlacement::buildChain(
                       "layout successor until the CFG reduces\n");
     }
 
+    // Placement may have changed tail duplication opportunities.
+    // Check for that now.
+    if (TailDupPlacement && BestSucc) {
+      // If the chosen successor was duplicated into all its predecessors,
+      // don't bother laying it out, just go round the loop again with BB as
+      // the chain end.
+      if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
+                                       BlockFilter, PrevUnplacedBlockIt))
+        continue;
+    }
+
     // Place this block, updating the datastructures to reflect its placement.
     BlockChain &SuccChain = *BlockToChain[BestSucc];
     // Zero out UnscheduledPredecessors for the successor we're about to merge in case
@@ -1718,6 +1793,175 @@ void MachineBlockPlacement::alignBlocks(
   }
 }
 
+/// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
+/// it was duplicated into its chain predecessor and removed.
+/// \p BB    - Basic block that may be duplicated.
+///
+/// \p LPred - Chosen layout predecessor of \p BB.
+///            Updated to be the chain end if LPred is removed.
+/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
+/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
+///                  Used to identify which blocks to update predecessor
+///                  counts.
+/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
+///                          chosen in the given order due to unnatural CFG
+///                          only needed if \p BB is removed and
+///                          \p PrevUnplacedBlockIt pointed to \p BB.
+/// @return true if \p BB was removed.
+bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
+    MachineBasicBlock *BB, MachineBasicBlock *&LPred,
+    MachineBasicBlock *LoopHeaderBB,
+    BlockChain &Chain, BlockFilterSet *BlockFilter,
+    MachineFunction::iterator &PrevUnplacedBlockIt) {
+  bool Removed, DuplicatedToLPred;
+  bool DuplicatedToOriginalLPred;
+  Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
+                                    PrevUnplacedBlockIt,
+                                    DuplicatedToLPred);
+  if (!Removed)
+    return false;
+  DuplicatedToOriginalLPred = DuplicatedToLPred;
+  // Iteratively try to duplicate again. It can happen that a block that is
+  // duplicated into is still small enough to be duplicated again.
+  // No need to call markBlockSuccessors in this case, as the blocks being
+  // duplicated from here on are already scheduled.
+  // Note that DuplicatedToLPred always implies Removed.
+  while (DuplicatedToLPred) {
+    assert (Removed && "Block must have been removed to be duplicated into its "
+            "layout predecessor.");
+    MachineBasicBlock *DupBB, *DupPred;
+    // The removal callback causes Chain.end() to be updated when a block is
+    // removed. On the first pass through the loop, the chain end should be the
+    // same as it was on function entry. On subsequent passes, because we are
+    // duplicating the block at the end of the chain, if it is removed the
+    // chain will have shrunk by one block.
+    BlockChain::iterator ChainEnd = Chain.end();
+    DupBB = *(--ChainEnd);
+    // Now try to duplicate again.
+    if (ChainEnd == Chain.begin())
+      break;
+    DupPred = *std::prev(ChainEnd);
+    Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
+                                      PrevUnplacedBlockIt,
+                                      DuplicatedToLPred);
+  }
+  // If BB was duplicated into LPred, it is now scheduled. But because it was
+  // removed, markChainSuccessors won't be called for its chain. Instead we
+  // call markBlockSuccessors for LPred to achieve the same effect. This must go
+  // at the end because repeating the tail duplication can increase the number
+  // of unscheduled predecessors.
+  if (DuplicatedToOriginalLPred)
+    markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
+
+  LPred = *std::prev(Chain.end());
+  return true;
+}
+
+/// Tail duplicate \p BB into (some) predecessors if profitable.
+/// \p BB    - Basic block that may be duplicated
+/// \p LPred - Chosen layout predecessor of \p BB
+/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
+/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
+///                  Used to identify which blocks to update predecessor
+///                  counts.
+/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
+///                          chosen in the given order due to unnatural CFG
+///                          only needed if \p BB is removed and
+///                          \p PrevUnplacedBlockIt pointed to \p BB.
+/// \p DuplicatedToLPred - True if the block was duplicated into LPred. Will
+///                        only be true if the block was removed.
+/// \return  - True if the block was duplicated into all preds and removed.
+bool MachineBlockPlacement::maybeTailDuplicateBlock(
+    MachineBasicBlock *BB, MachineBasicBlock *LPred,
+    const BlockChain &Chain, BlockFilterSet *BlockFilter,
+    MachineFunction::iterator &PrevUnplacedBlockIt,
+    bool &DuplicatedToLPred) {
+
+  DuplicatedToLPred = false;
+  DEBUG(dbgs() << "Redoing tail duplication for Succ#"
+        << BB->getNumber() << "\n");
+  bool IsSimple = TailDup.isSimpleBB(BB);
+  // Blocks with single successors don't create additional fallthrough
+  // opportunities. Don't duplicate them. TODO: When conditional exits are
+  // analyzable, allow them to be duplicated.
+  if (!IsSimple && BB->succ_size() == 1)
+    return false;
+  if (!TailDup.shouldTailDuplicate(IsSimple, *BB))
+    return false;
+  // This has to be a callback because none of it can be done after
+  // BB is deleted.
+  bool Removed = false;
+  auto RemovalCallback =
+      [&](MachineBasicBlock *RemBB) {
+        // Signal to outer function
+        Removed = true;
+
+        // Conservative default.
+        bool InWorkList = true;
+        // Remove from the Chain and Chain Map
+        if (BlockToChain.count(RemBB)) {
+          BlockChain *Chain = BlockToChain[RemBB];
+          InWorkList = Chain->UnscheduledPredecessors == 0;
+          Chain->remove(RemBB);
+          BlockToChain.erase(RemBB);
+        }
+
+        // Handle the unplaced block iterator
+        if (&(*PrevUnplacedBlockIt) == RemBB) {
+          PrevUnplacedBlockIt++;
+        }
+
+        // Handle the Work Lists
+        if (InWorkList) {
+          SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
+          if (RemBB->isEHPad())
+            RemoveList = EHPadWorkList;
+          RemoveList.erase(
+              remove_if(RemoveList,
+                        [RemBB](MachineBasicBlock *BB) {return BB == RemBB;}),
+              RemoveList.end());
+        }
+
+        // Handle the filter set
+        if (BlockFilter) {
+          BlockFilter->erase(RemBB);
+        }
+
+        // Remove the block from loop info.
+        MLI->removeBlock(RemBB);
+
+        // TailDuplicator handles removing it from loops.
+        DEBUG(dbgs() << "TailDuplicator deleted block: "
+              << getBlockName(RemBB) << "\n");
+      };
+  auto RemovalCallbackRef =
+      llvm::function_ref<void(MachineBasicBlock*)>(RemovalCallback);
+
+  SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
+  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
+                                 &DuplicatedPreds, &RemovalCallbackRef);
+
+  // Update UnscheduledPredecessors to reflect tail-duplication.
+  DuplicatedToLPred = false;
+  for (MachineBasicBlock *Pred : DuplicatedPreds) {
+    // We're only looking for unscheduled predecessors that match the filter.
+    BlockChain* PredChain = BlockToChain[Pred];
+    if (Pred == LPred)
+      DuplicatedToLPred = true;
+    if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
+        || PredChain == &Chain)
+      continue;
+    for (MachineBasicBlock *NewSucc : Pred->successors()) {
+      if (BlockFilter && !BlockFilter->count(NewSucc))
+        continue;
+      BlockChain *NewChain = BlockToChain[NewSucc];
+      if (NewChain != &Chain && NewChain != PredChain)
+        NewChain->UnscheduledPredecessors++;
+    }
+  }
+  return Removed;
+}
+
 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(*MF.getFunction()))
     return false;
@@ -1734,6 +1978,13 @@ bool MachineBlockPlacement::runOnMachine
   TII = MF.getSubtarget().getInstrInfo();
   TLI = MF.getSubtarget().getTargetLowering();
   MDT = &getAnalysis<MachineDominatorTree>();
+  if (TailDupPlacement) {
+    unsigned TailDupSize = TailDuplicatePlacementThreshold;
+    if (MF.getFunction()->optForSize())
+      TailDupSize = 1;
+    TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
+  }
+
   assert(BlockToChain.empty());
 
   buildCFGChains();
@@ -1747,8 +1998,7 @@ bool MachineBlockPlacement::runOnMachine
                          BranchFoldPlacement;
   // No tail merging opportunities if the block number is less than four.
   if (MF.size() > 3 && EnableTailMerge) {
-    // Default to the standard tail-merge-size option.
-    unsigned TailMergeSize = 0;
+    unsigned TailMergeSize = TailDuplicatePlacementThreshold + 1;
     BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
                     *MBPI, TailMergeSize);
 
@@ -1757,6 +2007,8 @@ bool MachineBlockPlacement::runOnMachine
                             /*AfterBlockPlacement=*/true)) {
       // Redo the layout if tail merging creates/removes/moves blocks.
       BlockToChain.clear();
+      // Must redo the dominator tree if blocks were changed.
+      MDT->runOnMachineFunction(MF);
       ChainAllocator.DestroyAll();
       buildCFGChains();
     }

Modified: llvm/trunk/lib/CodeGen/TailDuplication.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/TailDuplication.cpp?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/TailDuplication.cpp (original)
+++ llvm/trunk/lib/CodeGen/TailDuplication.cpp Mon Oct 10 20:20:33 2016
@@ -49,7 +49,7 @@ bool TailDuplicatePass::runOnMachineFunc
 
   auto MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
 
-  Duplicator.initMF(MF, MBPI);
+  Duplicator.initMF(MF, MBPI, /* LayoutMode */ false);
 
   bool MadeChange = false;
   while (Duplicator.tailDuplicateBlocks())

Modified: llvm/trunk/lib/CodeGen/TailDuplicator.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/TailDuplicator.cpp?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/TailDuplicator.cpp (original)
+++ llvm/trunk/lib/CodeGen/TailDuplicator.cpp Mon Oct 10 20:20:33 2016
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Function.h"
@@ -64,7 +65,7 @@ static cl::opt<unsigned> TailDupLimit("t
 
 void TailDuplicator::initMF(MachineFunction &MFin,
                             const MachineBranchProbabilityInfo *MBPIin,
-                            unsigned TailDupSizeIn) {
+                            bool LayoutModeIn, unsigned TailDupSizeIn) {
   MF = &MFin;
   TII = MF->getSubtarget().getInstrInfo();
   TRI = MF->getSubtarget().getRegisterInfo();
@@ -75,6 +76,7 @@ void TailDuplicator::initMF(MachineFunct
 
   assert(MBPI != nullptr && "Machine Branch Probability Info required");
 
+  LayoutMode = LayoutModeIn;
   PreRegAlloc = MRI->isSSA();
 }
 
@@ -127,18 +129,23 @@ static void VerifyPHIs(MachineFunction &
 /// Tail duplicate the block and cleanup.
 /// \p IsSimple - return value of isSimpleBB
 /// \p MBB - block to be duplicated
+/// \p ForcedLayoutPred - If non-null, treat this block as the layout
+///     predecessor, instead of using the ordering in MF
 /// \p DuplicatedPreds - if non-null, \p DuplicatedPreds will contain a list of
 ///     all Preds that received a copy of \p MBB.
+/// \p RemovalCallback - if non-null, called just before MBB is deleted.
 bool TailDuplicator::tailDuplicateAndUpdate(
     bool IsSimple, MachineBasicBlock *MBB,
-    SmallVectorImpl<MachineBasicBlock*> *DuplicatedPreds) {
+    MachineBasicBlock *ForcedLayoutPred,
+    SmallVectorImpl<MachineBasicBlock*> *DuplicatedPreds,
+    llvm::function_ref<void(MachineBasicBlock *)> *RemovalCallback) {
   // Save the successors list.
   SmallSetVector<MachineBasicBlock *, 8> Succs(MBB->succ_begin(),
                                                MBB->succ_end());
 
   SmallVector<MachineBasicBlock *, 8> TDBBs;
   SmallVector<MachineInstr *, 16> Copies;
-  if (!tailDuplicate(IsSimple, MBB, TDBBs, Copies))
+  if (!tailDuplicate(IsSimple, MBB, ForcedLayoutPred, TDBBs, Copies))
     return false;
 
   ++NumTails;
@@ -156,7 +163,7 @@ bool TailDuplicator::tailDuplicateAndUpd
   // If it is dead, remove it.
   if (isDead) {
     NumTailDupRemoved += MBB->size();
-    removeDeadBlock(MBB);
+    removeDeadBlock(MBB, RemovalCallback);
     ++NumDeadBlocks;
   }
 
@@ -255,7 +262,7 @@ bool TailDuplicator::tailDuplicateBlocks
     if (!shouldTailDuplicate(IsSimple, *MBB))
       continue;
 
-    MadeChange |= tailDuplicateAndUpdate(IsSimple, MBB);
+    MadeChange |= tailDuplicateAndUpdate(IsSimple, MBB, nullptr);
   }
 
   if (PreRegAlloc && TailDupVerify)
@@ -514,8 +521,10 @@ void TailDuplicator::updateSuccessorsPHI
 /// Determine if it is profitable to duplicate this block.
 bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
                                          MachineBasicBlock &TailBB) {
-  // Only duplicate blocks that end with unconditional branches.
-  if (TailBB.canFallThrough())
+  // When doing tail-duplication during layout, the block ordering is in flux,
+  // so canFallThrough returns a result based on incorrect information and
+  // should just be ignored.
+  if (!LayoutMode && TailBB.canFallThrough())
     return false;
 
   // Don't try to tail-duplicate single-block loops.
@@ -735,7 +744,7 @@ bool TailDuplicator::duplicateSimpleBB(
 
 bool TailDuplicator::canTailDuplicate(MachineBasicBlock *TailBB,
                                       MachineBasicBlock *PredBB) {
-  // EH edges are ignored by AnalyzeBranch.
+  // EH edges are ignored by analyzeBranch.
   if (PredBB->succ_size() > 1)
     return false;
 
@@ -750,7 +759,16 @@ bool TailDuplicator::canTailDuplicate(Ma
 
 /// If it is profitable, duplicate TailBB's contents in each
 /// of its predecessors.
+/// \p IsSimple result of isSimpleBB
+/// \p TailBB   Block to be duplicated.
+/// \p ForcedLayoutPred  When non-null, use this block as the layout predecessor
+///                      instead of the previous block in MF's order.
+/// \p TDBBs             A vector to keep track of all blocks tail-duplicated
+///                      into.
+/// \p Copies            A vector of copy instructions inserted. Used later to
+///                      walk all the inserted copies and remove redundant ones.
 bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
+                                   MachineBasicBlock *ForcedLayoutPred,
                                    SmallVectorImpl<MachineBasicBlock *> &TDBBs,
                                    SmallVectorImpl<MachineInstr *> &Copies) {
   DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n');
@@ -775,7 +793,12 @@ bool TailDuplicator::tailDuplicate(bool
       continue;
 
     // Don't duplicate into a fall-through predecessor (at least for now).
-    if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
+    bool IsLayoutSuccessor = false;
+    if (ForcedLayoutPred)
+      IsLayoutSuccessor = (ForcedLayoutPred == PredBB);
+    else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
+      IsLayoutSuccessor = true;
+    if (IsLayoutSuccessor)
       continue;
 
     DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
@@ -828,19 +851,27 @@ bool TailDuplicator::tailDuplicate(bool
   // If TailBB was duplicated into all its predecessors except for the prior
   // block, which falls through unconditionally, move the contents of this
   // block into the prior block.
-  MachineBasicBlock *PrevBB = &*std::prev(TailBB->getIterator());
+  MachineBasicBlock *PrevBB = ForcedLayoutPred;
+  if (!PrevBB)
+    PrevBB = &*std::prev(TailBB->getIterator());
   MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr;
   SmallVector<MachineOperand, 4> PriorCond;
   // This has to check PrevBB->succ_size() because EH edges are ignored by
-  // AnalyzeBranch.
+  // analyzeBranch.
   if (PrevBB->succ_size() == 1 &&
       // Layout preds are not always CFG preds. Check.
       *PrevBB->succ_begin() == TailBB &&
       !TII->analyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) &&
-      PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 &&
+      PriorCond.empty() &&
+      (!PriorTBB || PriorTBB == TailBB) &&
+      TailBB->pred_size() == 1 &&
       !TailBB->hasAddressTaken()) {
     DEBUG(dbgs() << "\nMerging into block: " << *PrevBB
                  << "From MBB: " << *TailBB);
+    // There may be a branch to the layout successor. This is unlikely but it
+    // happens. The correct thing to do is to remove the branch before
+    // duplicating the instructions in all cases.
+    TII->removeBranch(*PrevBB);
     if (PreRegAlloc) {
       DenseMap<unsigned, RegSubRegPair> LocalVRMap;
       SmallVector<std::pair<unsigned, RegSubRegPair>, 4> CopyInfos;
@@ -864,6 +895,7 @@ bool TailDuplicator::tailDuplicate(bool
       }
       appendCopies(PrevBB, CopyInfos, Copies);
     } else {
+      TII->removeBranch(*PrevBB);
       // No PHIs to worry about, just splice the instructions over.
       PrevBB->splice(PrevBB->end(), TailBB, TailBB->begin(), TailBB->end());
     }
@@ -936,10 +968,15 @@ void TailDuplicator::appendCopies(Machin
 
 /// Remove the specified dead machine basic block from the function, updating
 /// the CFG.
-void TailDuplicator::removeDeadBlock(MachineBasicBlock *MBB) {
+void TailDuplicator::removeDeadBlock(
+    MachineBasicBlock *MBB,
+    llvm::function_ref<void(MachineBasicBlock *)> *RemovalCallback) {
   assert(MBB->pred_empty() && "MBB must be dead!");
   DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
 
+  if (RemovalCallback)
+    (*RemovalCallback)(MBB);
+
   // Remove all successors.
   while (!MBB->succ_empty())
     MBB->removeSuccessor(MBB->succ_end() - 1);

Modified: llvm/trunk/test/CodeGen/AArch64/arm64-extload-knownzero.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-extload-knownzero.ll?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-extload-knownzero.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-extload-knownzero.ll Mon Oct 10 20:20:33 2016
@@ -12,7 +12,6 @@ bb1:
   %tmp2 = load i16, i16* %ptr, align 2
   br label %bb2
 bb2:
-; CHECK: %bb2
 ; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff
 ; CHECK: cmp [[REG]], #23
   %tmp3 = phi i16 [ 0, %entry ], [ %tmp2, %bb1 ]

Modified: llvm/trunk/test/CodeGen/AArch64/machine_cse.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/machine_cse.ll?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/machine_cse.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/machine_cse.ll Mon Oct 10 20:20:33 2016
@@ -1,4 +1,8 @@
-; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 -tail-dup-placement=0 | FileCheck %s
+; -tail-dup-placement causes tail duplication during layout. This breaks the
+; assumptions of the test case as written (specifically, it creates an
+; additional cmp instruction, creating a false positive), so we pass
+; -tail-dup-placement=0 to restore the original behavior
 
 ; marked as external to prevent possible optimizations
 @a = external global i32

Added: llvm/trunk/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll?rev=283842&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll (added)
+++ llvm/trunk/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll Mon Oct 10 20:20:33 2016
@@ -0,0 +1,69 @@
+; RUN: llc -O3 -o - -verify-machineinstrs %s | FileCheck %s
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+%struct.s1 = type { %struct.s3*, %struct.s1* }
+%struct.s2 = type opaque
+%struct.s3 = type { i32 }
+
+; Function Attrs: nounwind
+define internal fastcc i32 @repeated_dup_worklist(%struct.s1** %pp1, %struct.s2* %p2, i32 %state, i1 %i1_1, i32 %i32_1) unnamed_addr #0 {
+entry:
+  br label %while.cond.outer
+
+; The loop gets laid out:
+; %while.cond.outer
+; %(null)
+; %(null)
+; %dup2
+; and then %dup1 gets chosen as the next block.
+; when dup2 is duplicated into dup1, %worklist could erroneously be placed on
+; the worklist, because all of its current predecessors are now scheduled.
+; However, after dup2 is tail-duplicated, %worklist can't be on the worklist
+; because it now has unscheduled predecessors.
+; CHECK-LABEL: repeated_dup_worklist
+; CHECK: // %entry
+; CHECK: // %while.cond.outer
+; first %(null) block
+; CHECK: // in Loop:
+; CHECK: ldr
+; CHECK-NEXT: tbnz
+; second %(null) block
+; CHECK: // in Loop:
+; CHECK: // %dup2
+; CHECK: // %worklist
+; CHECK: // %if.then96.i
+while.cond.outer:                                 ; preds = %dup1, %entry
+  %progress.0.ph = phi i32 [ 0, %entry ], [ %progress.1, %dup1 ]
+  %inc77 = add nsw i32 %progress.0.ph, 1
+  %cmp = icmp slt i32 %progress.0.ph, %i32_1
+  br i1 %cmp, label %dup2, label %dup1
+
+dup2:                       ; preds = %if.then96.i, %worklist, %while.cond.outer
+  %progress.1.ph = phi i32 [ 0, %while.cond.outer ], [ %progress.1, %if.then96.i ], [ %progress.1, %worklist ]
+  %.pr = load %struct.s1*, %struct.s1** %pp1, align 8
+  br label %dup1
+
+dup1:                                       ; preds = %dup2, %while.cond.outer
+  %0 = phi %struct.s1* [ %.pr, %dup2 ], [ undef, %while.cond.outer ]
+  %progress.1 = phi i32 [ %progress.1.ph, %dup2 ], [ %inc77, %while.cond.outer ]
+  br i1 %i1_1, label %while.cond.outer, label %worklist
+
+worklist:                                       ; preds = %dup1
+  %snode94 = getelementptr inbounds %struct.s1, %struct.s1* %0, i64 0, i32 0
+  %1 = load %struct.s3*, %struct.s3** %snode94, align 8
+  %2 = getelementptr inbounds %struct.s3, %struct.s3* %1, i32 0, i32 0
+  %3 = load i32, i32* %2, align 4
+  %tobool95.i = icmp eq i32 %3, 0
+  br i1 %tobool95.i, label %if.then96.i, label %dup2
+
+if.then96.i:                                      ; preds = %worklist
+  call fastcc void @free_s3(%struct.s2* %p2, %struct.s3* %1) #1
+  br label %dup2
+}
+
+; Function Attrs: nounwind
+declare fastcc void @free_s3(%struct.s2*, %struct.s3*) unnamed_addr #0
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }

Modified: llvm/trunk/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll Mon Oct 10 20:20:33 2016
@@ -25,7 +25,6 @@ bb1:
   br label %bb2
 
 bb2:                                              ; preds = %bb1, %entry
-; CHECK: bb2
 ; CHECK: cmp [[REG]], #0
 ; CHECK: ble
   %indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %entry ]

Modified: llvm/trunk/test/CodeGen/PowerPC/branch-opt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/branch-opt.ll?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/branch-opt.ll (original)
+++ llvm/trunk/test/CodeGen/PowerPC/branch-opt.ll Mon Oct 10 20:20:33 2016
@@ -1,9 +1,21 @@
-; RUN: llc -verify-machineinstrs < %s -march=ppc32 | \
-; RUN:   grep "b LBB.*" | count 4
+; RUN: llc -verify-machineinstrs < %s -march=ppc32 | FileCheck %s
 
 target datalayout = "E-p:32:32"
 target triple = "powerpc-apple-darwin8.7.0"
 
+;CHECK-LABEL: foo:
+; There are 4 inner loops (%bb, %bb12, %bb25, %bb38) that all exit to %cond_next48
+; The last (whichever it is) should have a fallthrough exit, and the other three
+; need an unconditional branch. No other block should have an unconditional
+; branch to cond_next48
+; One of the blocks ends up with a loop exit block that gets a tail-duplicated copy
+; of %cond_next48, so there should only be two unconditional branches.
+
+;CHECK: b LBB0_13
+;CHECK: b LBB0_13
+;CHECK-NOT: b LBB0_13
+;CHECK: LBB0_13: ; %cond_next48
+
 define void @foo(i32 %W, i32 %X, i32 %Y, i32 %Z) {
 entry:
 	%tmp1 = and i32 %W, 1		; <i32> [#uses=1]

Modified: llvm/trunk/test/CodeGen/PowerPC/sjlj.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/sjlj.ll?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/sjlj.ll (original)
+++ llvm/trunk/test/CodeGen/PowerPC/sjlj.ll Mon Oct 10 20:20:33 2016
@@ -74,24 +74,24 @@ return:
 ; CHECK-DAG: std [[REGA]], [[OFF:[0-9]+]](31)                  # 8-byte Folded Spill
 ; CHECK-DAG: std 1, 16([[REGA]])
 ; CHECK-DAG: std 2, 24([[REGA]])
-; CHECK: bcl 20, 31, .LBB1_5
+; CHECK: bcl 20, 31, .LBB1_3
 ; CHECK: li 3, 1
-; CHECK: #EH_SjLj_Setup	.LBB1_5
+; CHECK: #EH_SjLj_Setup	.LBB1_3
 ; CHECK: b .LBB1_1
 
-; CHECK: .LBB1_4:
+; CHECK: .LBB1_3:
+; CHECK: mflr [[REGL:[0-9]+]]
+; CHECK: ld [[REG2:[0-9]+]], [[OFF]](31)                   # 8-byte Folded Reload
+; CHECK: std [[REGL]], 8([[REG2]])
+; CHECK: li 3, 0
+
+; CHECK: .LBB1_5:
 
 ; CHECK: lfd
 ; CHECK: lxvd2x
 ; CHECK: ld
 ; CHECK: blr
 
-; CHECK: .LBB1_5:
-; CHECK: mflr [[REGL:[0-9]+]]
-; CHECK: ld [[REG2:[0-9]+]], [[OFF]](31)                   # 8-byte Folded Reload
-; CHECK: std [[REGL]], 8([[REG2]])
-; CHECK: li 3, 0
-
 ; CHECK-NOAV: @main
 ; CHECK-NOAV-NOT: stxvd2x
 ; CHECK-NOAV: bcl

Added: llvm/trunk/test/CodeGen/PowerPC/tail-dup-branch-to-fallthrough.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/tail-dup-branch-to-fallthrough.ll?rev=283842&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/tail-dup-branch-to-fallthrough.ll (added)
+++ llvm/trunk/test/CodeGen/PowerPC/tail-dup-branch-to-fallthrough.ll Mon Oct 10 20:20:33 2016
@@ -0,0 +1,65 @@
+; RUN: llc -O2 %s -o - | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+declare void @f1()
+declare void @f2()
+declare void @f3()
+declare void @f4()
+
+; Function Attrs: nounwind
+; CHECK-LABEL: tail_dup_fallthrough_with_branch
+; CHECK: # %entry
+; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
+; CHECK: # %entry
+; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
+; CHECK: # %sw.0
+; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
+; CHECK: # %sw.1
+; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
+; CHECK: # %sw.default
+; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
+; CHECK: # %if.then
+; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
+; CHECK: # %if.else
+; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
+; CHECK: .Lfunc_end0
+define fastcc void @tail_dup_fallthrough_with_branch(i32 %a, i1 %b) unnamed_addr #0 {
+entry:
+  switch i32 %a, label %sw.default [
+    i32 0, label %sw.0
+    i32 1, label %sw.1
+  ]
+
+sw.0:                                         ; preds = %entry
+  call void @f1() #0
+  br label %dup1
+
+sw.1:                                         ; preds = %entry
+  call void @f2() #0
+  br label %dup1
+
+sw.default:                                   ; preds = %entry
+  br i1 %b, label %if.then, label %if.else
+
+if.then:                                      ; preds = %sw.default
+  call void @f3() #0
+  br label %dup2
+
+if.else:                                      ; preds = %sw.default
+  call void @f4() #0
+  br label %dup2
+
+dup1:                                         ; preds = %sw.0, %sw.1
+  call void @llvm.lifetime.end(i64 8, i8* nonnull undef) #0
+  unreachable
+
+dup2:                                         ; preds = %if.then, %if.else
+  call void @llvm.lifetime.end(i64 8, i8* nonnull undef) #0
+  unreachable
+}
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll?rev=283842&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll (added)
+++ llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll Mon Oct 10 20:20:33 2016
@@ -0,0 +1,100 @@
+; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-grtev4-linux-gnu"
+
+; Intended layout:
+; The outlining flag produces the layout
+; test1
+; test2
+; test3
+; test4
+; exit
+; optional1
+; optional2
+; optional3
+; optional4
+; Tail duplication puts test n+1 at the end of optional n
+; so optional1 includes a copy of test2 at the end, and branches
+; to test3 (at the top) or falls through to optional 2.
+; The CHECK statements check for the whole string of tests and exit block,
+; and then check that the correct test has been duplicated into the end of
+; the optional blocks and that the optional blocks are in the correct order.
+;CHECK-LABEL: f:
+; test1 may have been merged with entry
+;CHECK: mr [[TAGREG:[0-9]+]], 3
+;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
+;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
+;CHECK: blr
+;CHECK-NEXT: [[OPT1LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: beq 0, [[TEST3LABEL]]
+;CHECK-NEXT: [[OPT2LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: beq 0, [[TEST4LABEL]]
+;CHECK-NEXT: [[OPT3LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: beq 0, [[EXITLABEL]]
+;CHECK-NEXT: [[OPT4LABEL]]
+;CHECK: b [[EXITLABEL]]
+
+define void @f(i32 %tag) {
+entry:
+  br label %test1
+test1:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %test2, label %optional1
+optional1:
+  call void @a()
+  call void @a()
+  call void @a()
+  call void @a()
+  br label %test2
+test2:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %test3, label %optional2
+optional2:
+  call void @b()
+  call void @b()
+  call void @b()
+  call void @b()
+  br label %test3
+test3:
+  %tagbit3 = and i32 %tag, 4
+  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+  br i1 %tagbit3eq0, label %test4, label %optional3
+optional3:
+  call void @c()
+  call void @c()
+  call void @c()
+  call void @c()
+  br label %test4
+test4:
+  %tagbit4 = and i32 %tag, 8
+  %tagbit4eq0 = icmp eq i32 %tagbit4, 0
+  br i1 %tagbit4eq0, label %exit, label %optional4
+optional4:
+  call void @d()
+  call void @d()
+  call void @d()
+  call void @d()
+  br label %exit
+exit:
+  ret void
+}
+
+declare void @a()
+declare void @b()
+declare void @c()
+declare void @d()

Modified: llvm/trunk/test/CodeGen/WebAssembly/cfg-stackify.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/WebAssembly/cfg-stackify.ll?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/WebAssembly/cfg-stackify.ll (original)
+++ llvm/trunk/test/CodeGen/WebAssembly/cfg-stackify.ll Mon Oct 10 20:20:33 2016
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s
 
 ; Test the CFG stackifier pass.
 

Modified: llvm/trunk/test/CodeGen/WebAssembly/mem-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/WebAssembly/mem-intrinsics.ll?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/WebAssembly/mem-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/WebAssembly/mem-intrinsics.ll Mon Oct 10 20:20:33 2016
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s
 
 ; Test memcpy, memmove, and memset intrinsics.
 

Modified: llvm/trunk/test/CodeGen/X86/block-placement.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/block-placement.ll?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/block-placement.ll (original)
+++ llvm/trunk/test/CodeGen/X86/block-placement.ll Mon Oct 10 20:20:33 2016
@@ -177,6 +177,12 @@ exit:
   ret i32 %sum
 }
 
+; Tail duplication during layout can entirely remove body0 by duplicating it
+; into the entry block and into body1. This is a good thing but it isn't what
+; this test is looking for. To make the blocks long enough that they don't get
+; duplicated, we add some calls to dummy.
+declare void @dummy()
+
 define i32 @test_loop_rotate(i32 %i, i32* %a) {
 ; Check that we rotate conditional exits from the loop to the bottom of the
 ; loop, eliminating unconditional branches to the top.
@@ -194,6 +200,8 @@ body0:
   %base = phi i32 [ 0, %entry ], [ %sum, %body1 ]
   %next = add i32 %iv, 1
   %exitcond = icmp eq i32 %next, %i
+  call void @dummy()
+  call void @dummy()
   br i1 %exitcond, label %exit, label %body1
 
 body1:
@@ -945,7 +953,7 @@ define void @benchmark_heapsort(i32 %n,
 ; First rotated loop top.
 ; CHECK: .p2align
 ; CHECK: %while.end
-; CHECK: %for.cond
+; %for.cond gets completely tail-duplicated away.
 ; CHECK: %if.then
 ; CHECK: %if.else
 ; CHECK: %if.end10

Modified: llvm/trunk/test/CodeGen/X86/cmov-into-branch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/cmov-into-branch.ll?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/cmov-into-branch.ll (original)
+++ llvm/trunk/test/CodeGen/X86/cmov-into-branch.ll Mon Oct 10 20:20:33 2016
@@ -105,9 +105,11 @@ define i32 @weighted_select3(i32 %a, i32
 ; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    je [[LABEL_BB6:.*]]
 ; CHECK:         movl %edi, %eax
+; CHECK-NEXT:    retq
 ; CHECK:         [[LABEL_BB6]]
 ; CHECK-NEXT:    movl %esi, %edi
-; CHECK-NEXT:    jmp
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    retq
 ;
   %cmp = icmp ne i32 %a, 0
   %sel = select i1 %cmp, i32 %a, i32 %b, !prof !2

Modified: llvm/trunk/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll Mon Oct 10 20:20:33 2016
@@ -2,7 +2,7 @@
 
 ; CHECK-LABEL: fmaddsubpd_loop_128:
 ; CHECK:   vfmaddsub231pd %xmm1, %xmm0, %xmm2
-; CHECK:   vmovaps %xmm2, %xmm0
+; CHECK:   vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
 define <2 x double> @fmaddsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
 entry:
@@ -28,7 +28,7 @@ for.end:
 
 ; CHECK-LABEL: fmsubaddpd_loop_128:
 ; CHECK:   vfmsubadd231pd %xmm1, %xmm0, %xmm2
-; CHECK:   vmovaps %xmm2, %xmm0
+; CHECK:   vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
 define <2 x double> @fmsubaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
 entry:
@@ -54,7 +54,7 @@ for.end:
 
 ; CHECK-LABEL: fmaddpd_loop_128:
 ; CHECK:   vfmadd231pd %xmm1, %xmm0, %xmm2
-; CHECK:   vmovaps %xmm2, %xmm0
+; CHECK:   vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
 define <2 x double> @fmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
 entry:
@@ -80,7 +80,7 @@ for.end:
 
 ; CHECK-LABEL: fmsubpd_loop_128:
 ; CHECK:   vfmsub231pd %xmm1, %xmm0, %xmm2
-; CHECK:   vmovaps %xmm2, %xmm0
+; CHECK:   vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
 define <2 x double> @fmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
 entry:
@@ -106,7 +106,7 @@ for.end:
 
 ; CHECK-LABEL: fnmaddpd_loop_128:
 ; CHECK:   vfnmadd231pd %xmm1, %xmm0, %xmm2
-; CHECK:   vmovaps %xmm2, %xmm0
+; CHECK:   vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
 define <2 x double> @fnmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
 entry:
@@ -132,7 +132,7 @@ for.end:
 
 ; CHECK-LABEL: fnmsubpd_loop_128:
 ; CHECK:   vfnmsub231pd %xmm1, %xmm0, %xmm2
-; CHECK:   vmovaps %xmm2, %xmm0
+; CHECK:   vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
 define <2 x double> @fnmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
 entry:
@@ -329,7 +329,7 @@ declare <4 x float> @llvm.x86.fma.vfnmsu
 
 ; CHECK-LABEL: fmaddsubpd_loop_256:
 ; CHECK:   vfmaddsub231pd %ymm1, %ymm0, %ymm2
-; CHECK:   vmovaps %ymm2, %ymm0
+; CHECK:   vmovapd %ymm2, %ymm0
 ; CHECK-NEXT: retq
 define <4 x double> @fmaddsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
 entry:
@@ -355,7 +355,7 @@ for.end:
 
 ; CHECK-LABEL: fmsubaddpd_loop_256:
 ; CHECK:   vfmsubadd231pd %ymm1, %ymm0, %ymm2
-; CHECK:   vmovaps %ymm2, %ymm0
+; CHECK:   vmovapd %ymm2, %ymm0
 ; CHECK-NEXT: retq
 define <4 x double> @fmsubaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
 entry:
@@ -381,7 +381,7 @@ for.end:
 
 ; CHECK-LABEL: fmaddpd_loop_256:
 ; CHECK:   vfmadd231pd %ymm1, %ymm0, %ymm2
-; CHECK:   vmovaps %ymm2, %ymm0
+; CHECK:   vmovapd %ymm2, %ymm0
 ; CHECK-NEXT: retq
 define <4 x double> @fmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
 entry:
@@ -407,7 +407,7 @@ for.end:
 
 ; CHECK-LABEL: fmsubpd_loop_256:
 ; CHECK:   vfmsub231pd %ymm1, %ymm0, %ymm2
-; CHECK:   vmovaps %ymm2, %ymm0
+; CHECK:   vmovapd %ymm2, %ymm0
 ; CHECK-NEXT: retq
 define <4 x double> @fmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
 entry:
@@ -433,7 +433,7 @@ for.end:
 
 ; CHECK-LABEL: fnmaddpd_loop_256:
 ; CHECK:   vfnmadd231pd %ymm1, %ymm0, %ymm2
-; CHECK:   vmovaps %ymm2, %ymm0
+; CHECK:   vmovapd %ymm2, %ymm0
 ; CHECK-NEXT: retq
 define <4 x double> @fnmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
 entry:
@@ -459,7 +459,7 @@ for.end:
 
 ; CHECK-LABEL: fnmsubpd_loop_256:
 ; CHECK:   vfnmsub231pd %ymm1, %ymm0, %ymm2
-; CHECK:   vmovaps %ymm2, %ymm0
+; CHECK:   vmovapd %ymm2, %ymm0
 ; CHECK-NEXT: retq
 define <4 x double> @fnmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
 entry:

Modified: llvm/trunk/test/CodeGen/X86/fp-une-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fp-une-cmp.ll?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fp-une-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fp-une-cmp.ll Mon Oct 10 20:20:33 2016
@@ -56,11 +56,11 @@ define double @profile_metadata(double %
 ; CHECK-NEXT:    ucomisd %xmm1, %xmm0
 ; CHECK-NEXT:    jne .LBB1_1
 ; CHECK-NEXT:    jp .LBB1_1
-; CHECK-NEXT:  .LBB1_2: # %bb2
+; CHECK-NEXT:  # %bb2
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB1_1: # %bb1
 ; CHECK-NEXT:    addsd {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    jmp .LBB1_2
+; CHECK-NEXT:    retq
 
 entry:
   %mul = fmul double %x, %y

Modified: llvm/trunk/test/CodeGen/X86/pr11202.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr11202.ll?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr11202.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr11202.ll Mon Oct 10 20:20:33 2016
@@ -15,5 +15,8 @@ l2:
   br label %l1
 }
 
-; CHECK: .Ltmp0:                                 # Address of block that was removed by CodeGen
+; It is correct for either l1 or l2 to be removed.
+; If l2 is removed, the message should be "Address of block that was removed by CodeGen"
+; If l1 is removed, it should be "Block address taken."
+; CHECK: .Ltmp0:                                 # {{Address of block that was removed by CodeGen|Block address taken}}
 ; CHECK: .quad	.Ltmp0

Modified: llvm/trunk/test/CodeGen/X86/ragreedy-bug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/ragreedy-bug.ll?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/ragreedy-bug.ll (original)
+++ llvm/trunk/test/CodeGen/X86/ragreedy-bug.ll Mon Oct 10 20:20:33 2016
@@ -3,16 +3,34 @@
 ; This testing case is reduced from 197.parser prune_match function.
 ; We make sure register copies are not generated on isupper.exit blocks.
 
-; CHECK: isupper.exit
+; isupper.exit and isupper.exit223 get tail-duplicated into all their
+; predecessors.
+; CHECK: cond.true.i.i
 ; CHECK-NEXT: in Loop
+; Mem-move
+; CHECK-NEXT: movl
+; CHECK-NEXT: andl
 ; CHECK-NEXT: testl
 ; CHECK-NEXT: jne
-; CHECK: isupper.exit
+; CHECK: cond.true.i.i217
 ; CHECK-NEXT: in Loop
+; Mem-move
+; CHECK-NEXT: movl
+; CHECK-NEXT: andl
 ; CHECK-NEXT: testl
 ; CHECK-NEXT: je
+; CHECK: cond.false.i.i
 ; CHECK: maskrune
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: testl
+; CHECK-NEXT: je
+; CHECK: cond.false.i.i219
 ; CHECK: maskrune
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: testl
+; CHECK-NEXT: jne
 
 %struct.List_o_links_struct = type { i32, i32, i32, %struct.List_o_links_struct* }
 %struct.Connector_struct = type { i16, i16, i8, i8, %struct.Connector_struct*, i8* }

Modified: llvm/trunk/test/CodeGen/X86/sse1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse1.ll?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse1.ll Mon Oct 10 20:20:33 2016
@@ -58,21 +58,23 @@ define <4 x float> @vselect(<4 x float>*
 ; X32-NEXT:    je .LBB1_1
 ; X32-NEXT:  # BB#2: # %entry
 ; X32-NEXT:    xorps %xmm1, %xmm1
-; X32-NEXT:    jmp .LBB1_3
+; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    jne .LBB1_5
+; X32-NEXT:    jmp .LBB1_4
 ; X32-NEXT:  .LBB1_1:
 ; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT:  .LBB1_3: # %entry
 ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    je .LBB1_4
-; X32-NEXT:  # BB#5: # %entry
+; X32-NEXT:  .LBB1_5: # %entry
 ; X32-NEXT:    xorps %xmm2, %xmm2
-; X32-NEXT:    jmp .LBB1_6
+; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    jne .LBB1_8
+; X32-NEXT:    jmp .LBB1_7
 ; X32-NEXT:  .LBB1_4:
 ; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT:  .LBB1_6: # %entry
 ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    je .LBB1_7
-; X32-NEXT:  # BB#8: # %entry
+; X32-NEXT:  .LBB1_8: # %entry
 ; X32-NEXT:    xorps %xmm3, %xmm3
 ; X32-NEXT:    jmp .LBB1_9
 ; X32-NEXT:  .LBB1_7:
@@ -95,21 +97,23 @@ define <4 x float> @vselect(<4 x float>*
 ; X64-NEXT:    je .LBB1_1
 ; X64-NEXT:  # BB#2: # %entry
 ; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    jmp .LBB1_3
+; X64-NEXT:    testl %edx, %edx
+; X64-NEXT:    jne .LBB1_5
+; X64-NEXT:    jmp .LBB1_4
 ; X64-NEXT:  .LBB1_1:
 ; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT:  .LBB1_3: # %entry
 ; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    je .LBB1_4
-; X64-NEXT:  # BB#5: # %entry
+; X64-NEXT:  .LBB1_5: # %entry
 ; X64-NEXT:    xorps %xmm2, %xmm2
-; X64-NEXT:    jmp .LBB1_6
+; X64-NEXT:    testl %r8d, %r8d
+; X64-NEXT:    jne .LBB1_8
+; X64-NEXT:    jmp .LBB1_7
 ; X64-NEXT:  .LBB1_4:
 ; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT:  .LBB1_6: # %entry
 ; X64-NEXT:    testl %r8d, %r8d
 ; X64-NEXT:    je .LBB1_7
-; X64-NEXT:  # BB#8: # %entry
+; X64-NEXT:  .LBB1_8: # %entry
 ; X64-NEXT:    xorps %xmm3, %xmm3
 ; X64-NEXT:    jmp .LBB1_9
 ; X64-NEXT:  .LBB1_7:

Added: llvm/trunk/test/CodeGen/X86/tail-dup-merge-loop-headers.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/tail-dup-merge-loop-headers.ll?rev=283842&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/tail-dup-merge-loop-headers.ll (added)
+++ llvm/trunk/test/CodeGen/X86/tail-dup-merge-loop-headers.ll Mon Oct 10 20:20:33 2016
@@ -0,0 +1,190 @@
+; RUN: llc -O2 -o - %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+; CHECK-LABEL: tail_dup_merge_loops
+; CHECK: # %entry
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %inner_loop_exit
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %inner_loop_latch
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %inner_loop_test
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %exit
+define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 {
+entry:
+  %notlhs674.i = icmp eq i32 %a, 0
+  br label %outer_loop_top
+
+outer_loop_top:                         ; preds = %inner_loop_exit, %entry
+  %dst.0.ph.i = phi i8* [ %b, %entry ], [ %scevgep679.i, %inner_loop_exit ]
+  br i1 %notlhs674.i, label %exit, label %inner_loop_preheader
+
+inner_loop_preheader:                           ; preds = %outer_loop_top
+  br label %inner_loop_top
+
+inner_loop_top:                                     ; preds = %inner_loop_latch, %inner_loop_preheader
+  %dst.0.i = phi i8* [ %inc, %inner_loop_latch ], [ %dst.0.ph.i, %inner_loop_preheader ]
+  %var = load i8, i8* %dst.0.i
+  %tobool1.i = icmp slt i8 %var, 0
+  br label %inner_loop_test
+
+inner_loop_test:                                       ; preds = %inner_loop_top
+  br i1 %tobool1.i, label %inner_loop_exit, label %inner_loop_latch
+
+inner_loop_exit:                       ; preds = %inner_loop_test
+  %scevgep.i = getelementptr i8, i8* %dst.0.i, i64 1
+  %scevgep679.i = getelementptr i8, i8* %scevgep.i, i64 0
+  br label %outer_loop_top
+
+inner_loop_latch:                                ; preds = %inner_loop_test
+  %cmp75.i = icmp ult i8* %dst.0.i, %c
+  %inc = getelementptr i8, i8* %dst.0.i, i64 2
+  br label %inner_loop_top
+
+exit:                              ; preds = %outer_loop_top
+  ret void
+}
+
+ at .str.6 = external unnamed_addr constant [23 x i8], align 1
+
+; There is an erroneous check in LoopBase::addBasicBlockToLoop(), where it
+; assumes that the header block for a loop is unique.
+; For most of compilation this assumption is true, but during layout we allow
+; this assumption to be violated. The following code will trigger the bug:
+
+; The loops in question are eventually headed by the block shared_loop_header
+;
+; During layout, the block labeled outer_loop_header gets tail-duplicated into
+; outer_loop_latch, and into shared_preheader, and then removed. This leaves
+; shared_loop_header as the header of both loops. The end result
+; is that there are 2 valid loops, and that they share a header. If we re-ran
+; the loop analysis, it would classify this as a single loop.
+; So far this is fine as far as layout is concerned.
+; After layout we tail merge blocks merge_other and merge_predecessor_split.
+; We do this even though they share only a single instruction, because
+; merge_predecessor_split falls through to their shared successor:
+; outer_loop_latch.
+; The rest of the blocks in the function are noise unfortunately. Bugpoint
+; couldn't shrink the test any further.
+
+; CHECK-LABEL: loop_shared_header
+; CHECK: # %entry
+; CHECK: # %shared_preheader
+; CHECK: # %shared_loop_header
+; CHECK: # %inner_loop_body
+; CHECK: # %merge_predecessor_split
+; CHECK: # %outer_loop_latch
+; CHECK: # %outer_loop_latch
+; CHECK: # %cleanup
+define i32 @loop_shared_header(i8* %exe, i32 %exesz, i32 %headsize, i32 %min, i32 %wwprva, i32 %e_lfanew, i8* readonly %wwp, i32 %wwpsz, i16 zeroext %sects) local_unnamed_addr #0 {
+entry:
+  %0 = load i32, i32* undef, align 4
+  %mul = shl nsw i32 %0, 2
+  br i1 undef, label %if.end19, label %cleanup
+
+if.end19:                                         ; preds = %entry
+  %conv = zext i32 %mul to i64
+  %call = tail call i8* @cli_calloc(i64 %conv, i64 1)
+  %1 = icmp eq i32 %exesz, 0
+  %notrhs = icmp eq i32 %0, 0
+  %or.cond117.not = or i1 %1, %notrhs
+  %or.cond202 = or i1 %or.cond117.not, undef
+  %cmp35 = icmp ult i8* undef, %exe
+  %or.cond203 = or i1 %or.cond202, %cmp35
+  br i1 %or.cond203, label %cleanup, label %if.end50
+
+if.end50:                                         ; preds = %if.end19
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %call, i8* undef, i64 %conv, i32 1, i1 false)
+  %cmp1.i.i = icmp ugt i32 %mul, 3
+  br i1 %cmp1.i.i, label %shared_preheader, label %wunpsect.exit.thread.loopexit391
+
+shared_preheader:                                 ; preds = %if.end50
+  br label %outer_loop_header
+
+outer_loop_header:                                ; preds = %outer_loop_latch, %shared_preheader
+  %bits.1.i = phi i8 [ 32, %shared_preheader ], [ %bits.43.i, %outer_loop_latch ]
+  %dst.0.ph.i = phi i8* [ undef, %shared_preheader ], [ %scevgep679.i, %outer_loop_latch ]
+  %2 = icmp eq i32 undef, 0
+  br i1 %2, label %while.cond.us1412.i, label %shared_loop_header
+
+while.cond.us1412.i:                              ; preds = %outer_loop_header
+  %.pre.i = add i8 %bits.1.i, -1
+  %tobool2.us1420.i = icmp eq i8 %.pre.i, 0
+  %or.cond.us1421.i = or i1 undef, %tobool2.us1420.i
+  br i1 %or.cond.us1421.i, label %if.end41.us1436.i, label %cleanup
+
+if.end41.us1436.i:                                ; preds = %while.cond.us1412.i
+  unreachable
+
+shared_loop_header:                               ; preds = %dup_early2, %dup_early1
+  %dst.0.i = phi i8* [ undef, %inner_loop_body ], [ %dst.0.ph.i, %outer_loop_header ], [ undef, %dead_block ]
+  %cmp3.i1172.i = icmp ult i8* undef, %call
+  br i1 %cmp3.i1172.i, label %wunpsect.exit.thread.loopexit389, label %inner_loop_body
+
+inner_loop_body:                                  ; preds = %shared_loop_header
+  %3 = icmp slt i32 undef, 0
+  br i1 %3, label %if.end96.i, label %shared_loop_header
+
+dead_block:                                       ; preds = %inner_loop_body
+  %cmp75.i = icmp ult i8* %dst.0.i, undef
+  br label %shared_loop_header
+
+if.end96.i:                                       ; preds = %inner_loop_body
+  %cmp97.i = icmp ugt i32 undef, 2
+  br i1 %cmp97.i, label %if.then99.i, label %if.end287.i
+
+if.then99.i:                                      ; preds = %if.end96.i
+  tail call void (i8*, ...) @cli_dbgmsg(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.6, i64 0, i64 0), i32 undef)
+  br label %cleanup
+
+if.end287.i:                                      ; preds = %if.end96.i
+  %cmp291.i = icmp ne i32 undef, 1
+  %conv294.i = select i1 %cmp291.i, i16 4, i16 3
+  br i1 undef, label %if.end308.i, label %outer_loop_latch
+
+if.end308.i:                                      ; preds = %if.end287.i
+  br i1 undef, label %if.end335.i, label %merge_predecessor_split
+
+merge_predecessor_split:                          ; preds = %if.end308.i
+  %4 = bitcast i8* undef to i32*
+  br label %outer_loop_latch
+
+if.end335.i:                                      ; preds = %if.end308.i
+  br i1 undef, label %outer_loop_latch, label %merge_other
+
+merge_other:                                      ; preds = %if.end335.i
+  br label %outer_loop_latch
+
+outer_loop_latch:                                 ; preds = %merge_other, %if.end335.i, %merge_predecessor_split, %if.end287.i
+  %bits.43.i = phi i8 [ undef, %if.end287.i ], [ undef, %merge_other ], [ 32, %merge_predecessor_split ], [ 0, %if.end335.i ]
+  %backsize.0.i = phi i16 [ %conv294.i, %if.end287.i ], [ 0, %merge_other ], [ 0, %merge_predecessor_split ], [ 0, %if.end335.i ]
+  %5 = add i16 %backsize.0.i, -1
+  %6 = zext i16 %5 to i64
+  %scevgep.i = getelementptr i8, i8* %dst.0.ph.i, i64 1
+  %scevgep679.i = getelementptr i8, i8* %scevgep.i, i64 %6
+  br label %outer_loop_header
+
+wunpsect.exit.thread.loopexit389:                 ; preds = %shared_loop_header
+  unreachable
+
+wunpsect.exit.thread.loopexit391:                 ; preds = %if.end50
+  unreachable
+
+cleanup:                                          ; preds = %if.then99.i, %while.cond.us1412.i, %if.end19, %entry
+  %retval.0 = phi i32 [ 0, %if.then99.i ], [ 1, %entry ], [ 1, %if.end19 ], [ 1, %while.cond.us1412.i ]
+  ret i32 %retval.0
+}
+
+; Function Attrs: nounwind
+declare void @cli_dbgmsg(i8*, ...) local_unnamed_addr #0
+
+; Function Attrs: nounwind
+declare i8* @cli_calloc(i64, i64) local_unnamed_addr #0
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }

Added: llvm/trunk/test/CodeGen/X86/tail-dup-repeat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/tail-dup-repeat.ll?rev=283842&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/tail-dup-repeat.ll (added)
+++ llvm/trunk/test/CodeGen/X86/tail-dup-repeat.ll Mon Oct 10 20:20:33 2016
@@ -0,0 +1,53 @@
+; RUN: llc -O2 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: uwtable
+; When tail-duplicating during placement, we work backward from blocks with
+; multiple successors. In this case, the block dup1 gets duplicated into dup2
+; and if.then64, and then the block dup2 gets duplicated into land.lhs.true
+; and if.end70
+; CHECK-LABEL: repeated_tail_dup:
+define void @repeated_tail_dup(i1 %a1, i1 %a2, i32* %a4, i32* %a5, i8* %a6) #0 align 2 {
+entry:
+  br label %for.cond
+
+; CHECK: {{^}}.[[HEADER:LBB0_[1-9]]]: # %for.cond
+for.cond:                                         ; preds = %dup1, %entry
+  br i1 %a1, label %land.lhs.true, label %if.end56
+
+land.lhs.true:                                    ; preds = %for.cond
+  store i32 10, i32* %a4, align 8
+  br label %dup2
+
+if.end56:                                         ; preds = %for.cond
+  br i1 %a2, label %if.then64, label %if.end70
+
+if.then64:                                        ; preds = %if.end56
+  store i8 1, i8* %a6, align 1
+  br label %dup1
+
+; CHECK:      # %if.end70
+; CHECK-NEXT: # in Loop:
+; CHECK-NEXT: movl $12, (%rdx)
+; CHECK-NEXT: movl $2, (%rcx)
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: je .[[HEADER]]
+if.end70:                                         ; preds = %if.end56
+  store i32 12, i32* %a4, align 8
+  br label %dup2
+
+dup2:                                             ; preds = %if.end70, %land.lhs.true
+  store i32 2, i32* %a5, align 4
+  br label %dup1
+
+dup1:                                             ; preds = %dup2, %if.then64
+  %val = load i32, i32* %a4, align 8
+  %switch = icmp ult i32 undef, 1
+  br i1 %switch, label %for.cond, label %for.end
+
+for.end:                                          ; preds = %dup1
+  ret void
+}
+
+attributes #0 = { uwtable }

Modified: llvm/trunk/test/CodeGen/X86/update-terminator.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/update-terminator.mir?rev=283842&r1=283841&r2=283842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/update-terminator.mir (original)
+++ llvm/trunk/test/CodeGen/X86/update-terminator.mir Mon Oct 10 20:20:33 2016
@@ -5,17 +5,30 @@
   @a = external global i16
   @b = external global i32
 
+  declare void @dummy1()
+  declare void @dummy2()
+  declare void @dummy3()
+
   ; Function Attrs: nounwind
   define void @f2() {
     br i1 undef, label %bb1, label %bb3
 
   bb1:
+    call void @dummy1()
+    call void @dummy1()
+    call void @dummy1()
     br i1 undef, label %bb2, label %bb2
 
   bb2:
+    call void @dummy2()
+    call void @dummy2()
+    call void @dummy2()
     br label %bb4
 
   bb3:
+    call void @dummy3()
+    call void @dummy3()
+    call void @dummy3()
     br label %bb2
 
   bb4:
@@ -40,15 +53,24 @@ body:             |
   bb.1:
     successors: %bb.2(100)
 
+    CALL64pcrel32 @dummy1, csr_64, implicit %rsp, implicit-def %rsp
+    CALL64pcrel32 @dummy1, csr_64, implicit %rsp, implicit-def %rsp
+    CALL64pcrel32 @dummy1, csr_64, implicit %rsp, implicit-def %rsp
     JNE_1 %bb.2, implicit %eflags
 
   bb.2:
     successors: %bb.4(100)
 
+    CALL64pcrel32 @dummy2, csr_64, implicit %rsp, implicit-def %rsp
+    CALL64pcrel32 @dummy2, csr_64, implicit %rsp, implicit-def %rsp
+    CALL64pcrel32 @dummy2, csr_64, implicit %rsp, implicit-def %rsp
     JMP_1 %bb.4
 
   bb.3:
     successors: %bb.2(100)
+    CALL64pcrel32 @dummy3, csr_64, implicit %rsp, implicit-def %rsp
+    CALL64pcrel32 @dummy3, csr_64, implicit %rsp, implicit-def %rsp
+    CALL64pcrel32 @dummy3, csr_64, implicit %rsp, implicit-def %rsp
     JMP_1 %bb.2
 
   bb.4:




More information about the llvm-commits mailing list