[llvm] r321377 - [SimplifyCFG] Don't do if-conversion if there is a long dependence chain

Fri Dec 22 10:54:04 PST 2017

Author: carrot
Date: Fri Dec 22 10:54:04 2017
New Revision: 321377

URL: http://llvm.org/viewvc/llvm-project?rev=321377&view=rev
Log:
[SimplifyCFG] Don't do if-conversion if there is a long dependence chain

If after if-conversion, most of the instructions in this new BB construct a long and slow dependence chain, it may be slower than cmp/branch, even if the branch has a high miss rate, because the control dependence is transformed into data dependence, and control dependence can be speculated, and thus, the second part can execute in parallel with the first part on modern OOO processor.

This patch checks for the long dependence chain, and give up if-conversion if find one.

Differential Revision: https://reviews.llvm.org/D39352


Added:
    llvm/trunk/test/Transforms/SimplifyCFG/X86/if-conversion.ll
Modified:
    llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
    llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
    llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
    llvm/trunk/lib/Transforms/Utils/SimplifyCFG.cpp

Modified: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h?rev=321377&r1=321376&r2=321377&view=diff
==============================================================================

--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h (original)
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h Fri Dec 22 10:54:04 2017
@@ -646,6 +646,9 @@ public:
   /// \brief Additional properties of an operand's values.
   enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };
 
+  /// \return True if target can execute instructions out of order.
+  bool isOutOfOrder() const;
+
   /// \return The number of scalar or vector registers that the target has.
   /// If 'Vectors' is true, it returns the number of vector registers. If it is
   /// set to false, it returns the number of scalar registers.
@@ -1018,6 +1021,7 @@ public:
                             Type *Ty) = 0;
   virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                             Type *Ty) = 0;
+  virtual bool isOutOfOrder() const = 0;
   virtual unsigned getNumberOfRegisters(bool Vector) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
@@ -1295,6 +1299,9 @@ public:
                     Type *Ty) override {
     return Impl.getIntImmCost(IID, Idx, Imm, Ty);
   }
+  bool isOutOfOrder() const override {
+    return Impl.isOutOfOrder();
+  }
   unsigned getNumberOfRegisters(bool Vector) override {
     return Impl.getNumberOfRegisters(Vector);
   }

Modified: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h?rev=321377&r1=321376&r2=321377&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h (original)
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h Fri Dec 22 10:54:04 2017
@@ -337,6 +337,8 @@ public:
     return TTI::TCC_Free;
   }
 
+  bool isOutOfOrder() const { return false; }
+
   unsigned getNumberOfRegisters(bool Vector) { return 8; }
 
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }

Modified: llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h?rev=321377&r1=321376&r2=321377&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h (original)
+++ llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h Fri Dec 22 10:54:04 2017
@@ -402,6 +402,10 @@ public:
     return BaseT::getInstructionLatency(I);
   }
 
+  bool isOutOfOrder() const {
+    return getST()->getSchedModel().isOutOfOrder();
+  }
+
   /// @}
 
   /// \name Vector TTI Implementations

Modified: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/TargetTransformInfo.cpp?rev=321377&r1=321376&r2=321377&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp Fri Dec 22 10:54:04 2017
@@ -314,6 +314,10 @@ int TargetTransformInfo::getIntImmCost(I
   return Cost;
 }
 
+bool TargetTransformInfo::isOutOfOrder() const {
+  return TTIImpl->isOutOfOrder();
+}
+
 unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const {
   return TTIImpl->getNumberOfRegisters(Vector);
 }

Modified: llvm/trunk/lib/Transforms/Utils/SimplifyCFG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Utils/SimplifyCFG.cpp?rev=321377&r1=321376&r2=321377&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Utils/SimplifyCFG.cpp (original)
+++ llvm/trunk/lib/Transforms/Utils/SimplifyCFG.cpp Fri Dec 22 10:54:04 2017
@@ -127,6 +127,16 @@ static cl::opt<unsigned> MaxSpeculationD
     cl::desc("Limit maximum recursion depth when calculating costs of "
              "speculatively executed instructions"));
 
+static cl::opt<unsigned> DependenceChainLatency(
+    "dependence-chain-latency", cl::Hidden, cl::init(8),
+    cl::desc("Limit the maximum latency of dependence chain containing cmp "
+             "for if conversion"));
+
+static cl::opt<unsigned> SmallBBSize(
+    "small-bb-size", cl::Hidden, cl::init(40),
+    cl::desc("Check dependence chain latency only in basic block smaller than "
+             "this number"));
+
 STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
 STATISTIC(NumLinearMaps,
           "Number of switch instructions turned into linear mapping");
@@ -395,6 +405,166 @@ static bool DominatesMergePoint(Value *V
   return true;
 }
 
+/// Estimate the code size of the specified BB.
+static unsigned CountBBCodeSize(BasicBlock *BB,
+                                const TargetTransformInfo &TTI) {
+  unsigned Size = 0;
+  for (auto II = BB->begin(); !isa<TerminatorInst>(II); ++II)
+    Size += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_CodeSize);
+  return Size;
+}
+
+/// Find out the latency of the longest dependence chain in the BB if
+/// LongestChain is true, or the dependence chain containing the compare
+/// instruction feeding the block's conditional branch.
+static unsigned FindDependenceChainLatency(BasicBlock *BB,
+                            DenseMap<Instruction *, unsigned> &Instructions,
+                            const TargetTransformInfo &TTI,
+                            bool LongestChain) {
+  unsigned MaxLatency = 0;
+
+  BasicBlock::iterator II;
+  for (II = BB->begin(); !isa<TerminatorInst>(II); ++II) {
+    unsigned Latency = 0;
+    for (unsigned O = 0, E = II->getNumOperands(); O != E; ++O) {
+      Instruction *Op = dyn_cast<Instruction>(II->getOperand(O));
+      if (Op && Instructions.count(Op)) {
+        auto OpLatency = Instructions[Op];
+        if (OpLatency > Latency)
+          Latency = OpLatency;
+      }
+    }
+    Latency += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_Latency);
+    Instructions[&(*II)] = Latency;
+
+    if (Latency > MaxLatency)
+      MaxLatency = Latency;
+  }
+
+  if (LongestChain)
+    return MaxLatency;
+
+  // The length of the dependence chain containing the compare instruction is
+  // wanted, so the terminator must be a BranchInst.
+  assert(isa<BranchInst>(II));
+  BranchInst* Br = cast<BranchInst>(II);
+  Instruction *Cmp = dyn_cast<Instruction>(Br->getCondition());
+  if (Cmp && Instructions.count(Cmp))
+    return Instructions[Cmp];
+  else
+    return 0;
+}
+
+/// Instructions in BB2 may depend on instructions in BB1, and instructions
+/// in BB1 may have users in BB2. If the last (in terms of latency) such kind
+/// of instruction in BB1 is I, then the instructions after I can be executed
+/// in parallel with instructions in BB2.
+/// This function returns the latency of I.
+static unsigned LatencyAdjustment(BasicBlock *BB1, BasicBlock *BB2,
+                        BasicBlock *IfBlock1, BasicBlock *IfBlock2,
+                        DenseMap<Instruction *, unsigned> &BB1Instructions) {
+  unsigned LastLatency = 0;
+  SmallVector<Instruction *, 16> Worklist;
+  BasicBlock::iterator II;
+  for (II = BB2->begin(); !isa<TerminatorInst>(II); ++II) {
+    if (PHINode *PN = dyn_cast<PHINode>(II)) {
+      // Look for users in BB2.
+      bool InBBUser = false;
+      for (User *U : PN->users()) {
+        if (cast<Instruction>(U)->getParent() == BB2) {
+          InBBUser = true;
+          break;
+        }
+      }
+      // No such user, we don't care about this instruction and its operands.
+      if (!InBBUser)
+        break;
+    }
+    Worklist.push_back(&(*II));
+  }
+
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+    for (unsigned O = 0, E = I->getNumOperands(); O != E; ++O) {
+      if (Instruction *Op = dyn_cast<Instruction>(I->getOperand(O))) {
+        if (Op->getParent() == IfBlock1 || Op->getParent() == IfBlock2)
+          Worklist.push_back(Op);
+        else if (Op->getParent() == BB1 && BB1Instructions.count(Op)) {
+          if (BB1Instructions[Op] > LastLatency)
+            LastLatency = BB1Instructions[Op];
+        }
+      }
+    }
+  }
+
+  return LastLatency;
+}
+
+/// If after if conversion, most of the instructions in this new BB construct a
+/// long and slow dependence chain, it may be slower than cmp/branch, even
+/// if the branch has a high miss rate, because the control dependence is
+/// transformed into data dependence, and control dependence can be speculated,
+/// and thus, the second part can execute in parallel with the first part on
+/// modern OOO processor.
+///
+/// To check this condition, this function finds the length of the dependence
+/// chain in BB1 (only the part that can be executed in parallel with code after
+/// branch in BB2) containing cmp, and if the length is longer than a threshold,
+/// don't perform if conversion.
+///
+/// BB1, BB2, IfBlock1 and IfBlock2 are candidate BBs for if conversion.
+/// SpeculationSize contains the code size of IfBlock1 and IfBlock2.
+static bool FindLongDependenceChain(BasicBlock *BB1, BasicBlock *BB2,
+                             BasicBlock *IfBlock1, BasicBlock *IfBlock2,
+                             unsigned SpeculationSize,
+                             const TargetTransformInfo &TTI) {
+  // Accumulated latency of each instruction in their BBs.
+  DenseMap<Instruction *, unsigned> BB1Instructions;
+  DenseMap<Instruction *, unsigned> BB2Instructions;
+
+  if (!TTI.isOutOfOrder())
+    return false;
+
+  unsigned NewBBSize = CountBBCodeSize(BB1, TTI) + CountBBCodeSize(BB2, TTI)
+                         + SpeculationSize;
+
+  // We check small BB only since it is more difficult to find unrelated
+  // instructions to fill functional units in a small BB.
+  if (NewBBSize > SmallBBSize)
+    return false;
+
+  auto BB1Chain =
+         FindDependenceChainLatency(BB1, BB1Instructions, TTI, false);
+  auto BB2Chain =
+         FindDependenceChainLatency(BB2, BB2Instructions, TTI, true);
+
+  // If there are many unrelated instructions in the new BB, there will be
+  // other instructions for the processor to issue regardless of the length
+  // of this new dependence chain.
+  // Modern processors can issue 3 or more instructions in each cycle. But in
+  // real world applications, an IPC of 2 is already very good for non-loop
+  // code with small basic blocks. Higher IPC is usually found in programs with
+  // small kernel. So IPC of 2 is more reasonable for most applications.
+  if ((BB1Chain + BB2Chain) * 2 <= NewBBSize)
+    return false;
+
+  // We only care about part of the dependence chain in BB1 that can be
+  // executed in parallel with BB2, so adjust the latency.
+  BB1Chain -=
+      LatencyAdjustment(BB1, BB2, IfBlock1, IfBlock2, BB1Instructions);
+
+  // Correctly predicted branch instruction can skip the dependence chain in
+  // BB1, but misprediction has a penalty, so only when the dependence chain is
+  // longer than DependenceChainLatency, then branch is better than select.
+  // Besides misprediction penalty, the threshold value DependenceChainLatency
+  // also depends on branch misprediction rate, taken branch latency and cmov
+  // latency.
+  if (BB1Chain >= DependenceChainLatency)
+    return true;
+
+  return false;
+}
+
 /// Extract ConstantInt from value, looking through IntToPtr
 /// and PointerNullValue. Return NULL if value is not a constant int.
 static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
@@ -2044,6 +2214,11 @@ static bool SpeculativelyExecuteBB(Branc
   if (!HaveRewritablePHIs && !(HoistCondStores && SpeculatedStoreValue))
     return false;
 
+  // Don't do if conversion for long dependence chain.
+  if (FindLongDependenceChain(BB, EndBB, ThenBB, nullptr,
+                              CountBBCodeSize(ThenBB, TTI), TTI))
+    return false;
+
   // If we get here, we can hoist the instruction and if-convert.
   DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
 
@@ -2351,6 +2526,10 @@ static bool FoldTwoEntryPHINode(PHINode
       }
   }
 
+  if (FindLongDependenceChain(DomBlock, BB, IfBlock1, IfBlock2,
+                              AggressiveInsts.size(), TTI))
+    return false;
+
   DEBUG(dbgs() << "FOUND IF CONDITION!  " << *IfCond << "  T: "
                << IfTrue->getName() << "  F: " << IfFalse->getName() << "\n");
 

Added: llvm/trunk/test/Transforms/SimplifyCFG/X86/if-conversion.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SimplifyCFG/X86/if-conversion.ll?rev=321377&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SimplifyCFG/X86/if-conversion.ll (added)
+++ llvm/trunk/test/Transforms/SimplifyCFG/X86/if-conversion.ll Fri Dec 22 10:54:04 2017
@@ -0,0 +1,231 @@
+; RUN: opt < %s -simplifycfg -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -S | FileCheck %s
+; Avoid if-conversion if there is a long dependence chain.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; The first several cases test FindLongDependenceChain returns true, so
+; if-conversion is blocked.
+
+define i64 @test1(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-NOT: select
+}
+
+define i64 @test2(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = add i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-LABEL: @test2
+; CHECK-NOT: select
+}
+
+; The following cases test FindLongDependenceChain returns false, so
+; if-conversion will proceed.
+
+; Non trivial LatencyAdjustment.
+define i64 @test3(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %p4 = add i64 %p3, %1
+  %ptr = inttoptr i64 %p4 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-LABEL: @test3
+; CHECK: select
+}
+
+; Short dependence chain.
+define i64 @test4(i64* %pp, i64* %p) {
+entry:
+  %0 = load i64, i64* %pp, align 8
+  %cmp = icmp slt i64 %0, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-LABEL: @test4
+; CHECK: select
+}
+
+; High IPC.
+define i64 @test5(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  %2 = add i64 %pint, 2
+  %3 = add i64 %pint, 3
+  %4 = or i64 %pint, 16
+  %5 = and i64 %pint, 255
+
+  %6 = or i64 %2, 9
+  %7 = and i64 %3, 255
+  %8 = add i64 %4, 4
+  %9 = add i64 %5, 5
+
+  %10 = add i64 %6, 2
+  %11 = add i64 %7, 3
+  %12 = add i64 %8, 4
+  %13 = add i64 %9, 5
+
+  %14 = add i64 %10, 6
+  %15 = add i64 %11, 7
+  %16 = add i64 %12, 8
+  %17 = add i64 %13, 9
+
+  %18 = add i64 %14, 10
+  %19 = add i64 %15, 11
+  %20 = add i64 %16, 12
+  %21 = add i64 %17, 13
+
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+
+  ret i64 %val
+
+; CHECK-LABEL: @test5
+; CHECK: select
+}
+
+; Large BB size.
+define i64 @test6(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  %2 = add i64 %pint, 2
+  %3 = add i64 %pint, 3
+  %4 = add i64 %2, 4
+  %5 = add i64 %3, 5
+  %6 = add i64 %4, 6
+  %7 = add i64 %5, 7
+  %8 = add i64 %6, 6
+  %9 = add i64 %7, 7
+  %10 = add i64 %8, 6
+  %11 = add i64 %9, 7
+  %12 = add i64 %10, 6
+  %13 = add i64 %11, 7
+  %14 = add i64 %12, 6
+  %15 = add i64 %13, 7
+  %16 = add i64 %14, 6
+  %17 = add i64 %15, 7
+  %18 = add i64 %16, 6
+  %19 = add i64 %17, 7
+  %20 = add i64 %18, 6
+  %21 = add i64 %19, 7
+  %22 = add i64 %20, 6
+  %23 = add i64 %21, 7
+  %24 = add i64 %22, 6
+  %25 = add i64 %23, 7
+  %26 = add i64 %24, 6
+  %27 = add i64 %25, 7
+  %28 = add i64 %26, 6
+  %29 = add i64 %27, 7
+  %30 = add i64 %28, 6
+  %31 = add i64 %29, 7
+  %32 = add i64 %30, 8
+  %33 = add i64 %31, 9
+  %34 = add i64 %32, %33
+  %35 = and i64 %34, 255
+  %res = add i64 %val, %35
+
+  ret i64 %res
+
+; CHECK-LABEL: @test6
+; CHECK: select
+}