[llvm] r208017 - Improve 'tail' call marking in TRE. A bootstrap of clang goes from 375k calls marked tail in the IR to 470k, however this improvement does not carry into an improvement of the call/jmp ratio on x86. The most common pattern is a tail call + br to a block with nothing but a 'ret'.

Rafael EspĂ­ndola rafael.espindola at gmail.com
Tue May 6 07:36:46 PDT 2014


This supersedes your exhaustive-tailcalls-1.patch?

On 5 May 2014 19:59, Nick Lewycky <nicholas at mxc.ca> wrote:
> Author: nicholas
> Date: Mon May  5 18:59:03 2014
> New Revision: 208017
>
> URL: http://llvm.org/viewvc/llvm-project?rev=208017&view=rev
> Log:
> Improve 'tail' call marking in TRE. A bootstrap of clang goes from 375k calls marked tail in the IR to 470k, however this improvement does not carry into an improvement of the call/jmp ratio on x86. The most common pattern is a tail call + br to a block with nothing but a 'ret'.
>
> The number of tail call to loop conversions remains the same (1618 by my count).
>
> The new algorithm does a local scan over the use-def chains to identify local "alloca-derived" values, as well as points where the alloca could escape. Then, a visit over the CFG marks blocks as being before or after the allocas have escaped, and annotates the calls accordingly.
>
> Modified:
>     llvm/trunk/include/llvm/IR/CallSite.h
>     llvm/trunk/lib/Transforms/Scalar/TailRecursionElimination.cpp
>     llvm/trunk/test/Transforms/TailCallElim/basic.ll
>
> Modified: llvm/trunk/include/llvm/IR/CallSite.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/CallSite.h?rev=208017&r1=208016&r2=208017&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/IR/CallSite.h (original)
> +++ llvm/trunk/include/llvm/IR/CallSite.h Mon May  5 18:59:03 2014
> @@ -166,6 +166,11 @@ public:
>      return isCall() && cast<CallInst>(getInstruction())->isMustTailCall();
>    }
>
> +  /// \brief Tests if this call site is marked as a tail call.
> +  bool isTailCall() const {
> +    return isCall() && cast<CallInst>(getInstruction())->isTailCall();
> +  }
> +
>  #define CALLSITE_DELEGATE_GETTER(METHOD) \
>    InstrTy *II = getInstruction();    \
>    return isCall()                        \
>
> Modified: llvm/trunk/lib/Transforms/Scalar/TailRecursionElimination.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/TailRecursionElimination.cpp?rev=208017&r1=208016&r2=208017&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Scalar/TailRecursionElimination.cpp (original)
> +++ llvm/trunk/lib/Transforms/Scalar/TailRecursionElimination.cpp Mon May  5 18:59:03 2014
> @@ -55,6 +55,7 @@
>  #include "llvm/ADT/SmallPtrSet.h"
>  #include "llvm/ADT/Statistic.h"
>  #include "llvm/Analysis/CaptureTracking.h"
> +#include "llvm/Analysis/CFG.h"
>  #include "llvm/Analysis/InlineCost.h"
>  #include "llvm/Analysis/InstructionSimplify.h"
>  #include "llvm/Analysis/Loads.h"
> @@ -95,6 +96,9 @@ namespace {
>      bool runOnFunction(Function &F) override;
>
>    private:
> +    bool runTRE(Function &F);
> +    bool markTails(Function &F, bool &AllCallsAreTailCalls);
> +
>      CallInst *FindTRECandidate(Instruction *I,
>                                 bool CannotTailCallElimCallsMarkedTail);
>      bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
> @@ -146,35 +150,232 @@ static bool CanTRE(AllocaInst *AI) {
>           isa<ConstantInt>(AI->getArraySize());
>  }
>
> +bool TailCallElim::runOnFunction(Function &F) {
> +  if (skipOptnoneFunction(F))
> +    return false;
> +
> +  bool AllCallsAreTailCalls = false;
> +  bool Modified = markTails(F, AllCallsAreTailCalls);
> +  if (AllCallsAreTailCalls)
> +    Modified |= runTRE(F);
> +  return Modified;
> +}
> +
>  namespace {
> -struct AllocaCaptureTracker : public CaptureTracker {
> -  AllocaCaptureTracker() : Captured(false) {}
> +struct AllocaDerivedValueTracker {
> +  // Start at a root value and walk its use-def chain to mark calls that use the
> +  // value or a derived value in AllocaUsers, and places where it may escape in
> +  // EscapePoints.
> +  void walk(Value *Root) {
> +    SmallVector<Use *, 32> Worklist;
> +    SmallPtrSet<Use *, 32> Visited;
> +
> +    auto AddUsesToWorklist = [&](Value *V) {
> +      for (auto &U : V->uses()) {
> +        if (!Visited.insert(&U))
> +          continue;
> +        Worklist.push_back(&U);
> +      }
> +    };
> +
> +    AddUsesToWorklist(Root);
>
> -  void tooManyUses() override { Captured = true; }
> +    while (!Worklist.empty()) {
> +      Use *U = Worklist.pop_back_val();
> +      Instruction *I = cast<Instruction>(U->getUser());
> +
> +      switch (I->getOpcode()) {
> +      case Instruction::Call:
> +      case Instruction::Invoke: {
> +        CallSite CS(I);
> +        bool IsNocapture = !CS.isCallee(U) &&
> +                           CS.doesNotCapture(CS.getArgumentNo(U));
> +        callUsesLocalStack(CS, IsNocapture);
> +        if (IsNocapture) {
> +          // If the alloca-derived argument is passed in as nocapture, then it
> +          // can't propagate to the call's return. That would be capturing.
> +          continue;
> +        }
> +        break;
> +      }
> +      case Instruction::Load: {
> +        // The result of a load is not alloca-derived (unless an alloca has
> +        // otherwise escaped, but this is a local analysis).
> +        continue;
> +      }
> +      case Instruction::Store: {
> +       if (U->getOperandNo() == 0)
> +          EscapePoints.insert(I);
> +        continue;  // Stores have no users to analyze.
> +      }
> +      case Instruction::BitCast:
> +      case Instruction::GetElementPtr:
> +      case Instruction::PHI:
> +      case Instruction::Select:
> +      case Instruction::AddrSpaceCast:
> +        break;
> +      default:
> +        EscapePoints.insert(I);
> +       break;
> +      }
>
> -  bool shouldExplore(const Use *U) override {
> -    Value *V = U->getUser();
> -    if (isa<CallInst>(V) || isa<InvokeInst>(V))
> -      UsesAlloca.insert(V);
> -    return true;
> +      AddUsesToWorklist(I);
> +    }
>    }
>
> -  bool captured(const Use *U) override {
> -    if (isa<ReturnInst>(U->getUser()))
> -      return false;
> -    Captured = true;
> -    return true;
> +  void callUsesLocalStack(CallSite CS, bool IsNocapture) {
> +    // Add it to the list of alloca users. If it's already there, skip further
> +    // processing.
> +    if (!AllocaUsers.insert(CS.getInstruction()))
> +      return;
> +
> +    // If it's nocapture then it can't capture the alloca.
> +    if (IsNocapture)
> +      return;
> +
> +    // If it can write to memory, it can leak the alloca value.
> +    if (!CS.onlyReadsMemory())
> +      EscapePoints.insert(CS.getInstruction());
>    }
>
> -  bool Captured;
> -  SmallPtrSet<const Value *, 16> UsesAlloca;
> +  SmallPtrSet<Instruction *, 32> AllocaUsers;
> +  SmallPtrSet<Instruction *, 32> EscapePoints;
>  };
> -} // end anonymous namespace
> +}
>
> -bool TailCallElim::runOnFunction(Function &F) {
> -  if (skipOptnoneFunction(F))
> +bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) {
> +  if (F.callsFunctionThatReturnsTwice())
>      return false;
> +  AllCallsAreTailCalls = true;
> +
> +  // The local stack holds all alloca instructions and all byval arguments.
> +  AllocaDerivedValueTracker Tracker;
> +  for (Argument &Arg : F.args()) {
> +    if (Arg.hasByValAttr())
> +      Tracker.walk(&Arg);
> +  }
> +  for (auto &BB : F) {
> +    for (auto &I : BB)
> +      if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
> +        Tracker.walk(AI);
> +  }
> +
> +  bool Modified = false;
> +
> +  // Track whether a block is reachable after an alloca has escaped. Blocks that
> +  // contain the escaping instruction will be marked as being visited without an
> +  // escaped alloca, since that is how the block began.
> +  enum VisitType {
> +    UNVISITED,
> +    UNESCAPED,
> +    ESCAPED
> +  };
> +  DenseMap<BasicBlock *, VisitType> Visited;
>
> +  // We propagate the fact that an alloca has escaped from block to successor.
> +  // Visit the blocks that are propagating the escapedness first. To do this, we
> +  // maintain two worklists.
> +  SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;
> +
> +  // We may enter a block and visit it thinking that no alloca has escaped yet,
> +  // then see an escape point and go back around a loop edge and come back to
> +  // the same block twice. Because of this, we defer setting tail on calls when
> +  // we first encounter them in a block. Every entry in this list does not
> +  // statically use an alloca via use-def chain analysis, but may find an alloca
> +  // through other means if the block turns out to be reachable after an escape
> +  // point.
> +  SmallVector<CallInst *, 32> DeferredTails;
> +
> +  BasicBlock *BB = &F.getEntryBlock();
> +  VisitType Escaped = UNESCAPED;
> +  do {
> +    for (auto &I : *BB) {
> +      if (Tracker.EscapePoints.count(&I))
> +        Escaped = ESCAPED;
> +
> +      CallInst *CI = dyn_cast<CallInst>(&I);
> +      if (!CI || CI->isTailCall())
> +        continue;
> +
> +      if (CI->doesNotAccessMemory()) {
> +        // A call to a readnone function whose arguments are all things computed
> +        // outside this function can be marked tail. Even if you stored the
> +        // alloca address into a global, a readnone function can't load the
> +        // global anyhow.
> +        //
> +        // Note that this runs whether we know an alloca has escaped or not. If
> +        // it has, then we can't trust Tracker.AllocaUsers to be accurate.
> +        bool SafeToTail = true;
> +        for (auto &Arg : CI->arg_operands()) {
> +          if (isa<Constant>(Arg.getUser()))
> +            continue;
> +          if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
> +            if (!A->hasByValAttr())
> +              continue;
> +          SafeToTail = false;
> +          break;
> +        }
> +        if (SafeToTail) {
> +          F.getContext().emitOptimizationRemark(
> +              "tailcallelim", F, CI->getDebugLoc(),
> +              "found readnone tail call candidate");
> +          CI->setTailCall();
> +          Modified = true;
> +          continue;
> +        }
> +      }
> +
> +      if (Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) {
> +        DeferredTails.push_back(CI);
> +      } else {
> +        AllCallsAreTailCalls = false;
> +      }
> +    }
> +
> +    for (auto *SuccBB : make_range(succ_begin(BB), succ_end(BB))) {
> +      auto &State = Visited[SuccBB];
> +      if (State < Escaped) {
> +        State = Escaped;
> +        if (State == ESCAPED)
> +          WorklistEscaped.push_back(SuccBB);
> +        else
> +          WorklistUnescaped.push_back(SuccBB);
> +      }
> +    }
> +
> +    if (!WorklistEscaped.empty()) {
> +      BB = WorklistEscaped.pop_back_val();
> +      Escaped = ESCAPED;
> +    } else {
> +      BB = nullptr;
> +      while (!WorklistUnescaped.empty()) {
> +        auto *NextBB = WorklistUnescaped.pop_back_val();
> +        if (Visited[NextBB] == UNESCAPED) {
> +          BB = NextBB;
> +          Escaped = UNESCAPED;
> +          break;
> +        }
> +      }
> +    }
> +  } while (BB);
> +
> +  for (CallInst *CI : DeferredTails) {
> +    if (Visited[CI->getParent()] != ESCAPED) {
> +      // If the escape point was part way through the block, calls after the
> +      // escape point wouldn't have been put into DeferredTails.
> +      F.getContext().emitOptimizationRemark(
> +          "tailcallelim", F, CI->getDebugLoc(), "found tail call candidate");
> +      CI->setTailCall();
> +      Modified = true;
> +    } else {
> +      AllCallsAreTailCalls = false;
> +    }
> +  }
> +
> +  return Modified;
> +}
> +
> +bool TailCallElim::runTRE(Function &F) {
>    // If this function is a varargs function, we won't be able to PHI the args
>    // right, so don't even try to convert it...
>    if (F.getFunctionType()->isVarArg()) return false;
> @@ -191,46 +392,30 @@ bool TailCallElim::runOnFunction(Functio
>    // doesn't).
>    bool CanTRETailMarkedCall = true;
>
> -  // Find calls that can be marked tail.
> -  AllocaCaptureTracker ACT;
> +  // Find dynamic allocas.
>    for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) {
>      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
>        if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
>          CanTRETailMarkedCall &= CanTRE(AI);
> -        PointerMayBeCaptured(AI, &ACT);
> -        // If any allocas are captured, exit.
> -        if (ACT.Captured)
> -          return false;
>        }
>      }
>    }
>
> -  // If any byval or inalloca args are captured, exit. They are also allocated
> -  // in our stack frame.
> -  for (Argument &Arg : F.args()) {
> -    if (Arg.hasByValOrInAllocaAttr())
> -      PointerMayBeCaptured(&Arg, &ACT);
> -    if (ACT.Captured)
> -      return false;
> -  }
> -
> -  // Second pass, change any tail recursive calls to loops.
> +  // Change any tail recursive calls to loops.
>    //
>    // FIXME: The code generator produces really bad code when an 'escaping
>    // alloca' is changed from being a static alloca to being a dynamic alloca.
>    // Until this is resolved, disable this transformation if that would ever
>    // happen.  This bug is PR962.
> -  if (ACT.UsesAlloca.empty()) {
> -    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
> -      if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
> -        bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
> -                                            ArgumentPHIs, !CanTRETailMarkedCall);
> -        if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
> -          Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
> -                                            TailCallsAreMarkedTail, ArgumentPHIs,
> -                                            !CanTRETailMarkedCall);
> -        MadeChange |= Change;
> -      }
> +  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
> +    if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
> +      bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
> +                                          ArgumentPHIs, !CanTRETailMarkedCall);
> +      if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
> +        Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
> +                                          TailCallsAreMarkedTail, ArgumentPHIs,
> +                                          !CanTRETailMarkedCall);
> +      MadeChange |= Change;
>      }
>    }
>
> @@ -239,34 +424,13 @@ bool TailCallElim::runOnFunction(Functio
>    // with themselves.  Check to see if we did and clean up our mess if so.  This
>    // occurs when a function passes an argument straight through to its tail
>    // call.
> -  if (!ArgumentPHIs.empty()) {
> -    for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) {
> -      PHINode *PN = ArgumentPHIs[i];
> -
> -      // If the PHI Node is a dynamic constant, replace it with the value it is.
> -      if (Value *PNV = SimplifyInstruction(PN)) {
> -        PN->replaceAllUsesWith(PNV);
> -        PN->eraseFromParent();
> -      }
> -    }
> -  }
> +  for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) {
> +    PHINode *PN = ArgumentPHIs[i];
>
> -  // At this point, we know that the function does not have any captured
> -  // allocas. If additionally the function does not call setjmp, mark all calls
> -  // in the function that do not access stack memory with the tail keyword. This
> -  // implies ensuring that there does not exist any path from a call that takes
> -  // in an alloca but does not capture it and the call which we wish to mark
> -  // with "tail".
> -  if (!F.callsFunctionThatReturnsTwice()) {
> -    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
> -      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
> -        if (CallInst *CI = dyn_cast<CallInst>(I)) {
> -          if (!ACT.UsesAlloca.count(CI)) {
> -            CI->setTailCall();
> -            MadeChange = true;
> -          }
> -        }
> -      }
> +    // If the PHI Node is a dynamic constant, replace it with the value it is.
> +    if (Value *PNV = SimplifyInstruction(PN)) {
> +      PN->replaceAllUsesWith(PNV);
> +      PN->eraseFromParent();
>      }
>    }
>
> @@ -520,6 +684,10 @@ bool TailCallElim::EliminateRecursiveTai
>    BasicBlock *BB = Ret->getParent();
>    Function *F = BB->getParent();
>
> +  F->getContext().emitOptimizationRemark(
> +      "tailcallelim", *F, CI->getDebugLoc(),
> +      "transforming tail recursion to loop");
> +
>    // OK! We can transform this tail call.  If this is the first one found,
>    // create the new entry block, allowing us to branch back to the old entry.
>    if (!OldEntry) {
>
> Modified: llvm/trunk/test/Transforms/TailCallElim/basic.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/TailCallElim/basic.ll?rev=208017&r1=208016&r2=208017&view=diff
> ==============================================================================
> --- llvm/trunk/test/Transforms/TailCallElim/basic.ll (original)
> +++ llvm/trunk/test/Transforms/TailCallElim/basic.ll Mon May  5 18:59:03 2014
> @@ -151,3 +151,26 @@ define void @test9(i32* byval %a) {
>    call void @use(i32* %a)
>    ret void
>  }
> +
> +%struct.X = type { i8* }
> +
> +declare void @ctor(%struct.X*)
> +define void @test10(%struct.X* noalias sret %agg.result, i1 zeroext %b) {
> +; CHECK-LABEL @test10
> +entry:
> +  %x = alloca %struct.X, align 8
> +  br i1 %b, label %if.then, label %if.end
> +
> +if.then:                                          ; preds = %entry
> +  call void @ctor(%struct.X* %agg.result)
> +; CHECK: tail call void @ctor
> +  br label %return
> +
> +if.end:
> +  call void @ctor(%struct.X* %x)
> +; CHECK: call void @ctor
> +  br label %return
> +
> +return:
> +  ret void
> +}
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits




More information about the llvm-commits mailing list