[llvm] r280216 - [SimplifyCFG] Change the algorithm in SinkThenElseCodeToEnd

Wed Aug 31 03:46:23 PDT 2016

Author: jamesm
Date: Wed Aug 31 05:46:23 2016
New Revision: 280216

URL: http://llvm.org/viewvc/llvm-project?rev=280216&view=rev
Log:
[SimplifyCFG] Change the algorithm in SinkThenElseCodeToEnd

r279460 rewrote this function to be able to handle more than two incoming edges and took pains to ensure this didn't regress anything.

This time we change the logic for determining if an instruction should be sunk. Previously we used a single pass greedy algorithm - sink instructions until one requires more than one PHI node or we run out of instructions to sink.

This had the problem that sinking instructions that had non-identical but trivially the same operands needed extra logic so we sunk them aggressively. For example:

    %a = load i32* %b          %d = load i32* %b
    %c = gep i32* %a, i32 0    %e = gep i32* %d, i32 1

Sinking %c and %e would naively require two PHI merges as %a != %d. But the loads are obviously equivalent (and maybe can't be hoisted because there is no common predecessor).

This is why we implemented the fairly complex function areValuesTriviallySame(), to look through trivial differences like this. However it's just not clever enough.

Instead, throw areValuesTriviallySame away, use pointer equality to check equivalence of operands and switch to a two-stage algorithm.

In the "scan" stage, we look at every sinkable instruction in isolation from end of block to front. If it's sinkable, we keep track of all operands that required PHI merging.

In the "sink" stage, we iteratively sink the last non-terminator in the source blocks. But when calculating how many PHIs are actually required to be inserted (to work out if we should stop or not) we remove any values that have already been sunk from the set of PHI-merges required, which allows us to be more aggressive.

This turns an algorithm with potentially recursive lookahead (looking through GEPs, casts, loads and any other instruction potentially not CSE'd) to two linear scans.

Modified:
    llvm/trunk/lib/Transforms/Utils/SimplifyCFG.cpp
    llvm/trunk/test/Transforms/SimplifyCFG/sink-common-code.ll

Modified: llvm/trunk/lib/Transforms/Utils/SimplifyCFG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Utils/SimplifyCFG.cpp?rev=280216&r1=280215&r2=280216&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Utils/SimplifyCFG.cpp (original)
+++ llvm/trunk/lib/Transforms/Utils/SimplifyCFG.cpp Wed Aug 31 05:46:23 2016
@@ -1319,63 +1319,6 @@ HoistTerminator:
   return true;
 }
 
-// Return true if V0 and V1 are equivalent. This handles the obvious cases
-// where V0 == V1 and V0 and V1 are both identical instructions, but also
-// handles loads and stores with identical operands.
-//
-// Because determining if two memory instructions are equivalent
-// depends on control flow, the \c At0 and \c At1 parameters specify a
-// location for the query. This function is essentially answering the
-// query "If V0 were moved to At0, and V1 were moved to At1, are V0 and V1
-// equivalent?". In practice this means checking that moving V0 to At0
-// doesn't cross any other memory instructions.
-static bool areValuesTriviallySame(Value *V0, BasicBlock::const_iterator At0,
-                                   Value *V1, BasicBlock::const_iterator At1) {
-  if (V0 == V1)
-    return true;
-  
-  // Also check for instructions that are identical but not pointer-identical.
-  // This can include load instructions that haven't been CSE'd.
-  if (!isa<Instruction>(V0) || !isa<Instruction>(V1))
-    return false;
-  const auto *I0 = cast<Instruction>(V0);
-  const auto *I1 = cast<Instruction>(V1);
-  if (!I0->isIdenticalToWhenDefined(I1))
-    return false;
-
-  if (!I0->mayReadOrWriteMemory())
-    return true;
-
-  // Instructions that may read or write memory have extra restrictions. We
-  // must ensure we don't treat %a and %b as equivalent in code such as:
-  //
-  //  %a = load %x
-  //  store %x, 1
-  //  if (%c) {
-  //    %b = load %x
-  //    %d = add %b, 1
-  //  } else {
-  //    %d = add %a, 1
-  //  }
-
-  // Be conservative. We don't want to search the entire CFG between def
-  // and use; if the def isn't in the same block as the use just bail.
-  if (I0->getParent() != At0->getParent() ||
-      I1->getParent() != At1->getParent())
-    return false;
-
-  // Again, be super conservative. Ideally we'd be able to query AliasAnalysis
-  // but we currently don't have that available.
-  auto WritesMemory = [](const Instruction &I) {
-    return I.mayReadOrWriteMemory();
-  };
-  if (std::any_of(std::next(I0->getIterator()), At0, WritesMemory))
-    return false;
-  if (std::any_of(std::next(I1->getIterator()), At1, WritesMemory))
-    return false;
-  return true;
-}
-
 // Is it legal to place a variable in operand \c OpIdx of \c I?
 // FIXME: This should be promoted to Instruction.
 static bool canReplaceOperandWithVariable(const Instruction *I,
@@ -1410,21 +1353,14 @@ static bool canReplaceOperandWithVariabl
   }
 }
 
-// All blocks in Blocks unconditionally jump to a common successor. Analyze
-// the last non-terminator instruction in each block and return true if it would
-// be possible to sink them into their successor, creating one common
-// instruction instead. Set NumPHIsRequired to the number of PHI nodes that
-// would need to be created during sinking.
-static bool canSinkLastInstruction(ArrayRef<BasicBlock*> Blocks,
-                                   unsigned &NumPHIsRequired) {
-  SmallVector<Instruction*,4> Insts;
-  for (auto *BB : Blocks) {
-    if (BB->getTerminator() == &BB->front())
-      // Block was empty.
-      return false;
-    Insts.push_back(BB->getTerminator()->getPrevNode());
-  }
-
+// All instructions in Insts belong to different blocks that all unconditionally
+// branch to a common successor. Analyze each instruction and return true if it
+// would be possible to sink them into their successor, creating one common
+// instruction instead. For every value that would be required to be provided by
+// PHI node (because an operand varies in each input block), add to PHIOperands.
+static bool canSinkInstructions(
+    ArrayRef<Instruction *> Insts,
+    DenseMap<Instruction *, SmallVector<Value *, 4>> &PHIOperands) {
   // Prune out obviously bad instructions to move. Any non-store instruction
   // must have exactly one use, and we check later that use is by a single,
   // common PHI instruction in the successor.
@@ -1444,24 +1380,26 @@ static bool canSinkLastInstruction(Array
     if (!I->isSameOperationAs(I0))
       return false;
 
-  // If this isn't a store, check the only user is a single PHI.
+  // All instructions in Insts are known to be the same opcode. If they aren't
+  // stores, check the only user of each is a PHI or in the same block as the
+  // instruction, because if a user is in the same block as an instruction
+  // we're contemplating sinking, it must already be determined to be sinkable.
   if (!isa<StoreInst>(I0)) {
     auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
-    if (!PNUse ||
-        !all_of(Insts, [&PNUse](const Instruction *I) {
-          return *I->user_begin() == PNUse;
+    if (!all_of(Insts, [&PNUse](const Instruction *I) -> bool {
+          auto *U = cast<Instruction>(*I->user_begin());
+          return U == PNUse || U->getParent() == I->getParent();
         }))
       return false;
   }
 
-  NumPHIsRequired = 0;
   for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) {
     if (I0->getOperand(OI)->getType()->isTokenTy())
       // Don't touch any operand of token type.
       return false;
     auto SameAsI0 = [&I0, OI](const Instruction *I) {
-      return areValuesTriviallySame(I->getOperand(OI), I->getIterator(),
-                                    I0->getOperand(OI), I0->getIterator());
+      assert(I->getNumOperands() == I0->getNumOperands());
+      return I->getOperand(OI) == I0->getOperand(OI);
     };
     if (!all_of(Insts, SameAsI0)) {
       if (!canReplaceOperandWithVariable(I0, OI))
@@ -1472,7 +1410,8 @@ static bool canSinkLastInstruction(Array
         // FIXME: if the call was *already* indirect, we should do this.
         return false;
       }
-      ++NumPHIsRequired;
+      for (auto *I : Insts)
+        PHIOperands[I].push_back(I->getOperand(OI));
     }
   }
   return true;
@@ -1482,10 +1421,6 @@ static bool canSinkLastInstruction(Array
 // instruction of every block in Blocks to their common successor, commoning
 // into one instruction.
 static void sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
-  unsigned Dummy;
-  (void)Dummy;
-  assert(canSinkLastInstruction(Blocks, Dummy) &&
-         "Must analyze before transforming!");
   auto *BBEnd = Blocks[0]->getTerminator()->getSuccessor(0);
 
   // canSinkLastInstruction returning true guarantees that every block has at
@@ -1550,6 +1485,60 @@ static void sinkLastInstruction(ArrayRef
       I->eraseFromParent();
 }
 
+namespace {
+  // LockstepReverseIterator - Iterates through instructions
+  // in a set of blocks in reverse order from the first non-terminator.
+  // For example (assume all blocks have size n):
+  //   LockstepReverseIterator I([B1, B2, B3]);
+  //   *I-- = [B1[n], B2[n], B3[n]];
+  //   *I-- = [B1[n-1], B2[n-1], B3[n-1]];
+  //   *I-- = [B1[n-2], B2[n-2], B3[n-2]];
+  //   ...
+  class LockstepReverseIterator {
+    ArrayRef<BasicBlock*> Blocks;
+    SmallVector<Instruction*,4> Insts;
+    bool Fail;
+  public:
+    LockstepReverseIterator(ArrayRef<BasicBlock*> Blocks) :
+      Blocks(Blocks) {
+      reset();
+    }
+
+    void reset() {
+      Fail = false;
+      Insts.clear();
+      for (auto *BB : Blocks) {
+        if (BB->size() <= 1) {
+          // Block wasn't big enough
+          Fail = true;
+          return;
+        }
+        Insts.push_back(BB->getTerminator()->getPrevNode());
+      }
+    }
+
+    bool isValid() const {
+      return !Fail;
+    }
+    
+    void operator -- () {
+      if (Fail)
+        return;
+      for (auto *&Inst : Insts) {
+        if (Inst == &Inst->getParent()->front()) {
+          Fail = true;
+          return;
+        }
+        Inst = Inst->getPrevNode();
+      }
+    }
+
+    ArrayRef<Instruction*> operator * () const {
+      return Insts;
+    }
+  };
+}
+
 /// Given an unconditional branch that goes to BBEnd,
 /// check whether BBEnd has only two predecessors and the other predecessor
 /// ends with an unconditional branch. If it is true, sink any common code
@@ -1558,6 +1547,8 @@ static bool SinkThenElseCodeToEnd(Branch
   assert(BI1->isUnconditional());
   BasicBlock *BBEnd = BI1->getSuccessor(0);
 
+  // We currently only support branch targets with two predecessors.
+  // FIXME: this is an arbitrary restriction and should be lifted.
   SmallVector<BasicBlock*,4> Blocks;
   for (auto *BB : predecessors(BBEnd))
     Blocks.push_back(BB);
@@ -1569,12 +1560,62 @@ static bool SinkThenElseCodeToEnd(Branch
     return false;
 
   bool Changed = false;
-  unsigned NumPHIsToInsert;
-  while (canSinkLastInstruction(Blocks, NumPHIsToInsert) && NumPHIsToInsert <= 1) {
+
+  // We take a two-step approach to tail sinking. First we scan from the end of
+  // each block upwards in lockstep. If the n'th instruction from the end of each
+  // block can be sunk, those instructions are added to ValuesToSink and we
+  // carry on. If we can sink an instruction but need to PHI-merge some operands
+  // (because they're not identical in each instruction) we add these to
+  // PHIOperands.
+  unsigned ScanIdx = 0;
+  SmallPtrSet<Value*,4> InstructionsToSink;
+  DenseMap<Instruction*, SmallVector<Value*,4>> PHIOperands;
+  LockstepReverseIterator LRI(Blocks);
+  while (LRI.isValid() &&
+         canSinkInstructions(*LRI, PHIOperands)) {
+    DEBUG(dbgs() << "SINK: instruction can be sunk: " << (*LRI)[0] << "\n");
+    InstructionsToSink.insert((*LRI).begin(), (*LRI).end());
+    ++ScanIdx;
+    --LRI;
+  }
+
+  // Now that we've analyzed all potential sinking candidates, perform the
+  // actual sink. We iteratively sink the last non-terminator of the source
+  // blocks into their common successor unless doing so would require too
+  // many PHI instructions to be generated (currently only one PHI is allowed
+  // per sunk instruction).
+  //
+  // We can use InstructionsToSink to discount values needing PHI-merging that will
+  // actually be sunk in a later iteration. This allows us to be more
+  // aggressive in what we sink. This does allow a false positive where we
+  // sink presuming a later value will also be sunk, but stop half way through
+  // and never actually sink it which means we produce more PHIs than intended.
+  // This is unlikely in practice though.
+  for (unsigned SinkIdx = 0; SinkIdx != ScanIdx; ++SinkIdx) {
+    DEBUG(dbgs() << "SINK: Sink: "
+                 << *Blocks[0]->getTerminator()->getPrevNode()
+                 << "\n");
+    // Because we've sunk every instruction in turn, the current instruction to
+    // sink is always at index 0.
+    LRI.reset();
+    unsigned NumPHIdValues = 0;
+    for (auto *I : *LRI)
+      for (auto *V : PHIOperands[I])
+        if (InstructionsToSink.count(V) == 0)
+          ++NumPHIdValues;
+    DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
+    assert((NumPHIdValues % Blocks.size() == 0) &&
+           "Every operand must either be PHId or not PHId!");
+    
+    if (NumPHIdValues / Blocks.size() > 1)
+      // Too many PHIs would be created.
+      break;
+    
     sinkLastInstruction(Blocks);
     NumSinkCommons++;
     Changed = true;
   }
+  
   return Changed;
 }
 

Modified: llvm/trunk/test/Transforms/SimplifyCFG/sink-common-code.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SimplifyCFG/sink-common-code.ll?rev=280216&r1=280215&r2=280216&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SimplifyCFG/sink-common-code.ll (original)
+++ llvm/trunk/test/Transforms/SimplifyCFG/sink-common-code.ll Wed Aug 31 05:46:23 2016
@@ -367,6 +367,66 @@ declare i32 @bar(i32)
 ; CHECK: %[[x:.*]] = select i1 %flag
 ; CHECK: call i32 @bar(i32 %[[x]])
 
+; The load should be commoned.
+define i32 @test14(i1 zeroext %flag, i32 %w, i32 %x, i32 %y, %struct.anon* %s) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %dummy = add i32 %x, 1
+  %gepa = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1
+  %sv1 = load i32, i32* %gepa
+  %cmp1 = icmp eq i32 %sv1, 56
+  br label %if.end
+
+if.else:
+  %dummy2 = add i32 %x, 4
+  %gepb = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1
+  %sv2 = load i32, i32* %gepb
+  %cmp2 = icmp eq i32 %sv2, 57
+  br label %if.end
+
+if.end:
+  %p = phi i1 [ %cmp1, %if.then ], [ %cmp2, %if.else ]
+  ret i32 1
+}
+
+; CHECK-LABEL: test14
+; CHECK: getelementptr
+; CHECK: load
+; CHECK-NOT: load
+
+; The load should be commoned.
+define i32 @test15(i1 zeroext %flag, i32 %w, i32 %x, i32 %y, %struct.anon* %s) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %dummy = add i32 %x, 1
+  %gepa = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 0
+  %sv1 = load i32, i32* %gepa
+  %ext1 = zext i32 %sv1 to i64
+  %cmp1 = icmp eq i64 %ext1, 56
+  br label %if.end
+
+if.else:
+  %dummy2 = add i32 %x, 4
+  %gepb = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1
+  %sv2 = load i32, i32* %gepb
+  %ext2 = zext i32 %sv2 to i64
+  %cmp2 = icmp eq i64 %ext2, 57
+  br label %if.end
+
+if.end:
+  %p = phi i1 [ %cmp1, %if.then ], [ %cmp2, %if.else ]
+  ret i32 1
+}
+
+; CHECK-LABEL: test15
+; CHECK: getelementptr
+; CHECK: load
+; CHECK-NOT: load
+
 ; CHECK: !0 = !{!1, !1, i64 0}
 ; CHECK: !1 = !{!"float", !2}
 ; CHECK: !2 = !{!"an example type tree"}