[llvm] 10c2e26 - [TRE] Reland: allow TRE for non-capturing calls.

Alexey Lapshin via llvm-commits llvm-commits at lists.llvm.org
Tue May 25 01:37:43 PDT 2021


Author: Alexey Lapshin
Date: 2021-05-25T11:35:48+03:00
New Revision: 10c2e261598a9c1b641b5adb10d87d937aba8b58

URL: https://github.com/llvm/llvm-project/commit/10c2e261598a9c1b641b5adb10d87d937aba8b58
DIFF: https://github.com/llvm/llvm-project/commit/10c2e261598a9c1b641b5adb10d87d937aba8b58.diff

LOG: [TRE] Reland: allow TRE for non-capturing calls.

The D82085 "allow TRE for non-capturing calls" caused failure during bootstrap.
This patch does the same as D82085 plus fixes bootstrap error.

The problem with D82085 is that it does not create copies of byval
operands when replacing the function call with a branch.

Consider the following example:

```
    int zoo ( S p1 );

    int foo ( int count, S p1 ) {
      if ( count > 10 )
        return zoo(p1);

      // the temporary variable created for passing byvalue parameter
      // p1 could be used when zoo(p1) is called (after TRE is done).
      // lifetime.start p1.byvalue.temp
      return foo(count+1, p1);
      // lifetime.end p1.byvalue.temp
    }
```

After the recursive call to foo is replaced with a jump to the
start of the function, its parameters could be passed to the
zoo function, i.e. the temporary variable created for the byvalue
parameter "p1" could be passed to zoo. As a result, zoo receives a
broken operand:

```
    int foo ( int count, S p1 ) {
    :tailrecurse
      p1_tr = phi p1, p1.byvalue.temp
      if ( count > 10 )
        return zoo(p1_tr);

      // the temporary variable created for passing byvalue parameter
      // p1 could be used when zoo(p1) is called (after TRE is done).
      lifetime.start p1.byvalue.temp
      memcpy (p1.byvalue.temp, p1_tr)
      count = count + 1
      lifetime.end p1.byvalue.temp
      br tailrecurse
    }
```

To prevent p1.byvalue.temp from being used after its scope is ended by
the lifetime.end marker, this patch copies the value from p1.byvalue.temp
into another temporary variable and then copies that variable
into the input parameter for the next iteration.

This patch passes bootstrap build and bootstrap build with AddressSanitizer.

Differential Revision: https://reviews.llvm.org/D85614

Added: 
    llvm/test/Transforms/TailCallElim/tre-byval-parameter-2.ll
    llvm/test/Transforms/TailCallElim/tre-byval-parameter.ll
    llvm/test/Transforms/TailCallElim/tre-multiple-exits.ll
    llvm/test/Transforms/TailCallElim/tre-noncapturing-alloca-calls.ll

Modified: 
    llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
    llvm/test/Transforms/TailCallElim/basic.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 6f52dae8613dc..846a9321f53e2 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -63,6 +63,7 @@
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -70,6 +71,7 @@
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -81,6 +83,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "tailcallelim"
@@ -92,10 +95,10 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced");
 /// Scan the specified function for alloca instructions.
 /// If it contains any dynamic allocas, returns false.
 static bool canTRE(Function &F) {
-  // FIXME: The code generator produces really bad code when an 'escaping
-  // alloca' is changed from being a static alloca to being a dynamic alloca.
-  // Until this is resolved, disable this transformation if that would ever
-  // happen.  This bug is PR962.
+  // TODO: We don't do TRE if dynamic allocas are used.
+  // Dynamic allocas allocate stack space which should be
+  // deallocated before a new iteration starts. That is
+  // currently not implemented.
   return llvm::all_of(instructions(F), [](Instruction &I) {
     auto *AI = dyn_cast<AllocaInst>(&I);
     return !AI || AI->isStaticAlloca();
@@ -188,11 +191,9 @@ struct AllocaDerivedValueTracker {
 };
 }
 
-static bool markTails(Function &F, bool &AllCallsAreTailCalls,
-                      OptimizationRemarkEmitter *ORE) {
+static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
   if (F.callsFunctionThatReturnsTwice())
     return false;
-  AllCallsAreTailCalls = true;
 
   // The local stack holds all alloca instructions and all byval arguments.
   AllocaDerivedValueTracker Tracker;
@@ -282,11 +283,8 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls,
         }
       }
 
-      if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) {
+      if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI))
         DeferredTails.push_back(CI);
-      } else {
-        AllCallsAreTailCalls = false;
-      }
     }
 
     for (auto *SuccBB : successors(BB)) {
@@ -323,8 +321,6 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls,
       LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
       CI->setTailCall();
       Modified = true;
-    } else {
-      AllCallsAreTailCalls = false;
     }
   }
 
@@ -336,6 +332,14 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls,
 /// instructions between the call and this instruction are movable.
 ///
 static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
+  if (isa<DbgInfoIntrinsic>(I))
+    return true;
+
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+    if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
+        llvm::findAllocaForValue(II->getArgOperand(1)))
+      return true;
+
   // FIXME: We can move load/store/call/free instructions above the call if the
   // call does not mod/ref the memory location being processed.
   if (I->mayHaveSideEffects())  // This also handles volatile loads.
@@ -402,7 +406,6 @@ class TailRecursionEliminator {
   // createTailRecurseLoopHeader the first time we find a call we can eliminate.
   BasicBlock *HeaderBB = nullptr;
   SmallVector<PHINode *, 8> ArgumentPHIs;
-  bool RemovableCallsMustBeMarkedTail = false;
 
   // PHI node to store our return value.
   PHINode *RetPN = nullptr;
@@ -429,8 +432,7 @@ class TailRecursionEliminator {
                           DomTreeUpdater &DTU)
       : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {}
 
-  CallInst *findTRECandidate(BasicBlock *BB,
-                             bool CannotTailCallElimCallsMarkedTail);
+  CallInst *findTRECandidate(BasicBlock *BB);
 
   void createTailRecurseLoopHeader(CallInst *CI);
 
@@ -440,7 +442,11 @@ class TailRecursionEliminator {
 
   void cleanupAndFinalize();
 
-  bool processBlock(BasicBlock &BB, bool CannotTailCallElimCallsMarkedTail);
+  bool processBlock(BasicBlock &BB);
+
+  void copyByValueOperandIntoLocalTemp(CallInst *CI, int OpndIdx);
+
+  void copyLocalTempOfByValueOperandIntoArguments(CallInst *CI, int OpndIdx);
 
 public:
   static bool eliminate(Function &F, const TargetTransformInfo *TTI,
@@ -449,8 +455,7 @@ class TailRecursionEliminator {
 };
 } // namespace
 
-CallInst *TailRecursionEliminator::findTRECandidate(
-    BasicBlock *BB, bool CannotTailCallElimCallsMarkedTail) {
+CallInst *TailRecursionEliminator::findTRECandidate(BasicBlock *BB) {
   Instruction *TI = BB->getTerminator();
 
   if (&BB->front() == TI) // Make sure there is something before the terminator.
@@ -470,9 +475,9 @@ CallInst *TailRecursionEliminator::findTRECandidate(
     --BBI;
   }
 
-  // If this call is marked as a tail call, and if there are dynamic allocas in
-  // the function, we cannot perform this optimization.
-  if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
+  assert((!CI->isTailCall() || !CI->isNoTailCall()) &&
+         "Incompatible call site attributes(Tail,NoTail)");
+  if (!CI->isTailCall())
     return nullptr;
 
   // As a special case, detect code like this:
@@ -504,26 +509,13 @@ void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) {
   BranchInst *BI = BranchInst::Create(HeaderBB, NewEntry);
   BI->setDebugLoc(CI->getDebugLoc());
 
-  // If this function has self recursive calls in the tail position where some
-  // are marked tail and some are not, only transform one flavor or another.
-  // We have to choose whether we move allocas in the entry block to the new
-  // entry block or not, so we can't make a good choice for both. We make this
-  // decision here based on whether the first call we found to remove is
-  // marked tail.
-  // NOTE: We could do slightly better here in the case that the function has
-  // no entry block allocas.
-  RemovableCallsMustBeMarkedTail = CI->isTailCall();
-
-  // If this tail call is marked 'tail' and if there are any allocas in the
-  // entry block, move them up to the new entry block.
-  if (RemovableCallsMustBeMarkedTail)
-    // Move all fixed sized allocas from HeaderBB to NewEntry.
-    for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(),
-                              NEBI = NewEntry->begin();
-         OEBI != E;)
-      if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
-        if (isa<ConstantInt>(AI->getArraySize()))
-          AI->moveBefore(&*NEBI);
+  // Move all fixed sized allocas from HeaderBB to NewEntry.
+  for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(),
+                            NEBI = NewEntry->begin();
+       OEBI != E;)
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
+      if (isa<ConstantInt>(AI->getArraySize()))
+        AI->moveBefore(&*NEBI);
 
   // Now that we have created a new block, which jumps to the entry
   // block, insert a PHI node for each argument of the function.
@@ -588,6 +580,54 @@ void TailRecursionEliminator::insertAccumulator(Instruction *AccRecInstr) {
   ++NumAccumAdded;
 }
 
+// Copies the contents of the ByVal operand of the specified
+// call instruction into a newly created temporary variable.
+void TailRecursionEliminator::copyByValueOperandIntoLocalTemp(CallInst *CI,
+                                                              int OpndIdx) {
+  PointerType *ArgTy = cast<PointerType>(CI->getArgOperand(OpndIdx)->getType());
+  Type *AggTy = ArgTy->getElementType();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
+  // Get alignment of byVal operand.
+  Align Alignment(CI->getParamAlign(OpndIdx).valueOrOne());
+
+  // Create alloca for temporarily byval operands.
+  // Put alloca into the entry block.
+  Value *NewAlloca = new AllocaInst(
+      AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment,
+      CI->getArgOperand(OpndIdx)->getName(), &*F.getEntryBlock().begin());
+
+  IRBuilder<> Builder(CI);
+  Value *Size = Builder.getInt64(DL.getTypeAllocSize(AggTy));
+
+  // Copy data from byvalue operand into the temporarily variable.
+  Builder.CreateMemCpy(NewAlloca, /*DstAlign*/ Alignment,
+                       CI->getArgOperand(OpndIdx),
+                       /*SrcAlign*/ Alignment, Size);
+  CI->setArgOperand(OpndIdx, NewAlloca);
+}
+
+// Copies the temporary variable (which holds the value of the ByVal argument)
+// into the corresponding function argument location.
+void TailRecursionEliminator::copyLocalTempOfByValueOperandIntoArguments(
+    CallInst *CI, int OpndIdx) {
+  PointerType *ArgTy = cast<PointerType>(CI->getArgOperand(OpndIdx)->getType());
+  Type *AggTy = ArgTy->getElementType();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
+  // Get alignment of byVal operand.
+  Align Alignment(CI->getParamAlign(OpndIdx).valueOrOne());
+
+  IRBuilder<> Builder(CI);
+  Value *Size = Builder.getInt64(DL.getTypeAllocSize(AggTy));
+
+  // Copy data from the temporarily variable into corresponding
+  // function argument location.
+  Builder.CreateMemCpy(F.getArg(OpndIdx), /*DstAlign*/ Alignment,
+                       CI->getArgOperand(OpndIdx),
+                       /*SrcAlign*/ Alignment, Size);
+}
+
 bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
   ReturnInst *Ret = cast<ReturnInst>(CI->getParent()->getTerminator());
 
@@ -626,14 +666,22 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
   if (!HeaderBB)
     createTailRecurseLoopHeader(CI);
 
-  if (RemovableCallsMustBeMarkedTail && !CI->isTailCall())
-    return false;
+  // Copy values of ByVal operands into local temporarily variables.
+  for (unsigned I = 0, E = CI->getNumArgOperands(); I != E; ++I) {
+    if (CI->isByValArgument(I))
+      copyByValueOperandIntoLocalTemp(CI, I);
+  }
 
   // Ok, now that we know we have a pseudo-entry block WITH all of the
   // required PHI nodes, add entries into the PHI node for the actual
   // parameters passed into the tail-recursive call.
-  for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
-    ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB);
+  for (unsigned I = 0, E = CI->getNumArgOperands(); I != E; ++I) {
+    if (CI->isByValArgument(I)) {
+      copyLocalTempOfByValueOperandIntoArguments(CI, I);
+      ArgumentPHIs[I]->addIncoming(F.getArg(I), BB);
+    } else
+      ArgumentPHIs[I]->addIncoming(CI->getArgOperand(I), BB);
+  }
 
   if (AccRecInstr) {
     insertAccumulator(AccRecInstr);
@@ -750,8 +798,7 @@ void TailRecursionEliminator::cleanupAndFinalize() {
   }
 }
 
-bool TailRecursionEliminator::processBlock(
-    BasicBlock &BB, bool CannotTailCallElimCallsMarkedTail) {
+bool TailRecursionEliminator::processBlock(BasicBlock &BB) {
   Instruction *TI = BB.getTerminator();
 
   if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
@@ -764,7 +811,7 @@ bool TailRecursionEliminator::processBlock(
     if (!Ret)
       return false;
 
-    CallInst *CI = findTRECandidate(&BB, CannotTailCallElimCallsMarkedTail);
+    CallInst *CI = findTRECandidate(&BB);
 
     if (!CI)
       return false;
@@ -785,7 +832,7 @@ bool TailRecursionEliminator::processBlock(
     eliminateCall(CI);
     return true;
   } else if (isa<ReturnInst>(TI)) {
-    CallInst *CI = findTRECandidate(&BB, CannotTailCallElimCallsMarkedTail);
+    CallInst *CI = findTRECandidate(&BB);
 
     if (CI)
       return eliminateCall(CI);
@@ -803,26 +850,21 @@ bool TailRecursionEliminator::eliminate(Function &F,
     return false;
 
   bool MadeChange = false;
-  bool AllCallsAreTailCalls = false;
-  MadeChange |= markTails(F, AllCallsAreTailCalls, ORE);
-  if (!AllCallsAreTailCalls)
-    return MadeChange;
+  MadeChange |= markTails(F, ORE);
 
   // If this function is a varargs function, we won't be able to PHI the args
   // right, so don't even try to convert it...
   if (F.getFunctionType()->isVarArg())
     return MadeChange;
 
-  // If false, we cannot perform TRE on tail calls marked with the 'tail'
-  // attribute, because doing so would cause the stack size to increase (real
-  // TRE would deallocate variable sized allocas, TRE doesn't).
-  bool CanTRETailMarkedCall = canTRE(F);
+  if (!canTRE(F))
+    return MadeChange;
 
   // Change any tail recursive calls to loops.
   TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU);
 
   for (BasicBlock &BB : F)
-    MadeChange |= TRE.processBlock(BB, !CanTRETailMarkedCall);
+    MadeChange |= TRE.processBlock(BB);
 
   TRE.cleanupAndFinalize();
 

diff  --git a/llvm/test/Transforms/TailCallElim/basic.ll b/llvm/test/Transforms/TailCallElim/basic.ll
index 284bd8916bc26..91350c59c7e99 100644
--- a/llvm/test/Transforms/TailCallElim/basic.ll
+++ b/llvm/test/Transforms/TailCallElim/basic.ll
@@ -12,15 +12,16 @@ define void @test0() {
 	ret void
 }
 
-; PR615. Make sure that we do not move the alloca so that it interferes with the tail call.
+; Make sure that we do not do TRE if pointer to local stack
+; escapes through function call.
 define i32 @test1() {
 ; CHECK: i32 @test1()
 ; CHECK-NEXT: alloca
 	%A = alloca i32		; <i32*> [#uses=2]
 	store i32 5, i32* %A
 	call void @use(i32* %A)
-; CHECK: tail call i32 @test1
-	%X = tail call i32 @test1()		; <i32> [#uses=1]
+; CHECK: call i32 @test1
+	%X = call i32 @test1()		; <i32> [#uses=1]
 	ret i32 %X
 }
 

diff  --git a/llvm/test/Transforms/TailCallElim/tre-byval-parameter-2.ll b/llvm/test/Transforms/TailCallElim/tre-byval-parameter-2.ll
new file mode 100644
index 0000000000000..6f45a23c1cdb4
--- /dev/null
+++ b/llvm/test/Transforms/TailCallElim/tre-byval-parameter-2.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
+
+; the test was generated from the following C++ source:
+;
+; #include <stdio.h>
+; typedef struct A { long long x[10] = {0}; } A;
+; A global;
+; void dostuff(A a, A b, int i) {
+;  if (i==10) return;
+;  a.x[5]++;
+;  printf("%lld %lld\n", a.x[5], b.x[5]); dostuff(b, a, i+1);
+; }
+; __attribute((optnone)) int main() { dostuff(global, global, 0); }
+;
+; This test checks that the values of two ByVal operands are copied
+; into temporary variables first and then the temporary
+; variables are copied into the original function argument locations.
+
+%struct.A = type { [10 x i64] }
+
+@global = dso_local local_unnamed_addr global %struct.A zeroinitializer, align 8
+@.str = private unnamed_addr constant [11 x i8] c"%lld %lld\0A\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @_Z7dostuff1AS_i(%struct.A* nocapture byval(%struct.A) align 8 %a, %struct.A* nocapture readonly byval(%struct.A) align 8 %b, i32 %i) local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z7dostuff1AS_i(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AGG_TMP52:%.*]] = alloca [[STRUCT_A:%.*]], align 8
+; CHECK-NEXT:    [[AGG_TMP1:%.*]] = alloca [[STRUCT_A]], align 8
+; CHECK-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_A]], align 8
+; CHECK-NEXT:    [[AGG_TMP5:%.*]] = alloca [[STRUCT_A]], align 8
+; CHECK-NEXT:    br label [[TAILRECURSE:%.*]]
+; CHECK:       tailrecurse:
+; CHECK-NEXT:    [[I_TR:%.*]] = phi i32 [ [[I:%.*]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I_TR]], 10
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_A]], %struct.A* [[A:%.*]], i64 0, i32 0, i64 5
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    store i64 [[INC]], i64* [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCT_A]], %struct.A* [[B:%.*]], i64 0, i32 0, i64 5
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[ARRAYIDX4]], align 8
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i64 0, i64 0), i64 [[INC]], i64 [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.A* [[AGG_TMP]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 80, i8* nonnull [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast %struct.A* [[B]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(80) [[TMP2]], i8* nonnull align 8 dereferenceable(80) [[TMP3]], i64 80, i1 false)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.A* [[AGG_TMP5]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 80, i8* nonnull [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.A* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(80) [[TMP4]], i8* nonnull align 8 dereferenceable(80) [[TMP5]], i64 80, i1 false)
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[I_TR]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.A* [[AGG_TMP1]] to i8*
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.A* [[AGG_TMP]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 80, i1 false)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast %struct.A* [[AGG_TMP52]] to i8*
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast %struct.A* [[AGG_TMP5]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 80, i1 false)
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast %struct.A* [[A]] to i8*
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.A* [[AGG_TMP1]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP10]], i8* align 8 [[TMP11]], i64 80, i1 false)
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.A* [[B]] to i8*
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast %struct.A* [[AGG_TMP52]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP12]], i8* align 8 [[TMP13]], i64 80, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 80, i8* nonnull [[TMP2]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 80, i8* nonnull [[TMP4]])
+; CHECK-NEXT:    br label [[TAILRECURSE]]
+; CHECK:       return:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %agg.tmp = alloca %struct.A, align 8
+  %agg.tmp5 = alloca %struct.A, align 8
+  %cmp = icmp eq i32 %i, 10
+  br i1 %cmp, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+  %arrayidx = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0, i64 5
+  %0 = load i64, i64* %arrayidx, align 8
+  %inc = add nsw i64 %0, 1
+  store i64 %inc, i64* %arrayidx, align 8
+  %arrayidx4 = getelementptr inbounds %struct.A, %struct.A* %b, i64 0, i32 0, i64 5
+  %1 = load i64, i64* %arrayidx4, align 8
+  %call = call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([11 x i8], [11 x i8]* @.str
+, i64 0, i64 0), i64 %inc, i64 %1)
+  %2 = bitcast %struct.A* %agg.tmp to i8*
+  call void @llvm.lifetime.start.p0i8(i64 80, i8* nonnull %2)
+  %3 = bitcast %struct.A* %b to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(80) %2, i8* nonnull align 8 dereferenceable(80) %3, i64 80, i1 false)
+  %4 = bitcast %struct.A* %agg.tmp5 to i8*
+  call void @llvm.lifetime.start.p0i8(i64 80, i8* nonnull %4)
+  %5 = bitcast %struct.A* %a to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(80) %4, i8* nonnull align 8 dereferenceable(80) %5, i64 80, i1 false)
+  %add = add nsw i32 %i, 1
+  call void @_Z7dostuff1AS_i(%struct.A* nonnull byval(%struct.A) align 8 %agg.tmp, %struct.A* nonnull byval(%struct.A) align 8 %agg.tmp5, i32 %add)
+  call void @llvm.lifetime.end.p0i8(i64 80, i8* nonnull %2)
+  call void @llvm.lifetime.end.p0i8(i64 80, i8* nonnull %4)
+  br label %return
+
+return:                                           ; preds = %entry, %if.end
+  ret void
+}
+
+; Function Attrs: nofree nounwind
+declare dso_local noundef i32 @printf(i8* nocapture noundef readonly, ...) local_unnamed_addr #1
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #2
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: noinline norecurse nounwind optnone uwtable
+define dso_local i32 @main() local_unnamed_addr #3 {
+; CHECK-LABEL: @main(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_A:%.*]], align 8
+; CHECK-NEXT:    [[AGG_TMP1:%.*]] = alloca [[STRUCT_A]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.A* [[AGG_TMP]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 bitcast (%struct.A* @global to i8*), i64 80, i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.A* [[AGG_TMP1]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 bitcast (%struct.A* @global to i8*), i64 80, i1 false)
+; CHECK-NEXT:    tail call void @_Z7dostuff1AS_i(%struct.A* byval(%struct.A) align 8 [[AGG_TMP]], %struct.A* byval(%struct.A) align 8 [[AGG_TMP1]], i32 0)
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %agg.tmp = alloca %struct.A, align 8
+  %agg.tmp1 = alloca %struct.A, align 8
+  %0 = bitcast %struct.A* %agg.tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 bitcast (%struct.A* @global to i8*), i64 80, i1 false)
+  %1 = bitcast %struct.A* %agg.tmp1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %1, i8* align 8 bitcast (%struct.A* @global to i8*), i64 80, i1 false)
+  call void @_Z7dostuff1AS_i(%struct.A* byval(%struct.A) align 8 %agg.tmp, %struct.A* byval(%struct.A) align 8 %agg.tmp1, i32 0)
+  ret i32 0
+}
+
+attributes #0 = { uwtable }
+attributes #1 = { uwtable }
+attributes #2 = { argmemonly nounwind willreturn }

diff  --git a/llvm/test/Transforms/TailCallElim/tre-byval-parameter.ll b/llvm/test/Transforms/TailCallElim/tre-byval-parameter.ll
new file mode 100644
index 0000000000000..1925ca91e1e06
--- /dev/null
+++ b/llvm/test/Transforms/TailCallElim/tre-byval-parameter.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
+
+; the test was generated from the following C++ source:
+;
+; int zoo ( S p1 );
+;
+; int foo ( int count, S p1 ) {
+;   if ( count > 10 )
+;     return zoo(p1);
+;
+;   // After TRE: the temporary variable created for passing byvalue parameter
+;   // p1 could be used when zoo(p1) is called.
+;   return foo(count+1, p1);
+; }
+
+; This test checks that the value of ByVal operand AGG_TMP_I of call site foo()
+; is copied into temporary variable AGG_TMP_I1 (the byval value holder) and
+; later the value from AGG_TMP_I1 is copied into function argument P1 before
+; a new iteration starts.
+
+%struct.S = type { i32, i32, float, %struct.B }
+%struct.B = type { i32, float }
+
+; Function Attrs: uwtable
+define dso_local i32 @_Z3fooi1S(i32 %count, %struct.S* nocapture readonly byval(%struct.S) align 8 %p1) local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z3fooi1S(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AGG_TMP_I1:%.*]] = alloca [[STRUCT_S:%.*]], align 8
+; CHECK-NEXT:    [[AGG_TMP_I:%.*]] = alloca [[STRUCT_S]], align 8
+; CHECK-NEXT:    [[AGG_TMP14:%.*]] = alloca [[STRUCT_S]], align 8
+; CHECK-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_S]], align 8
+; CHECK-NEXT:    [[AGG_TMP1:%.*]] = alloca [[STRUCT_S]], align 8
+; CHECK-NEXT:    br label [[TAILRECURSE:%.*]]
+; CHECK:       tailrecurse:
+; CHECK-NEXT:    [[COUNT_TR:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[COUNT_TR]], 10
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.S* [[AGG_TMP]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.S* [[P1:%.*]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) [[TMP0]], i8* nonnull align 8 dereferenceable(20) [[TMP1]], i64 20, i1 false)
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @_Z3zoo1S(%struct.S* nonnull byval(%struct.S) align 8 [[AGG_TMP]])
+; CHECK-NEXT:    br label [[RETURN:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[COUNT_TR]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.S* [[AGG_TMP1]] to i8*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast %struct.S* [[P1]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) [[TMP2]], i8* nonnull align 8 dereferenceable(20) [[TMP3]], i64 20, i1 false)
+; CHECK-NEXT:    [[AGG_TMP14_0__SROA_CAST:%.*]] = bitcast %struct.S* [[AGG_TMP14]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull [[AGG_TMP14_0__SROA_CAST]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.S* [[AGG_TMP_I]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull [[TMP4]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) [[AGG_TMP14_0__SROA_CAST]], i8* nonnull align 8 dereferenceable(20) [[TMP2]], i64 20, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) [[TMP4]], i8* nonnull align 8 dereferenceable(20) [[AGG_TMP14_0__SROA_CAST]], i64 20, i1 false)
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.S* [[AGG_TMP_I1]] to i8*
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.S* [[AGG_TMP_I]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 20, i1 false)
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.S* [[P1]] to i8*
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast %struct.S* [[AGG_TMP_I1]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 20, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull [[AGG_TMP14_0__SROA_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull [[TMP4]])
+; CHECK-NEXT:    br label [[TAILRECURSE]]
+; CHECK:       return:
+; CHECK-NEXT:    ret i32 [[CALL]]
+;
+entry:
+  %agg.tmp.i = alloca %struct.S, align 8
+  %agg.tmp14 = alloca %struct.S, align 8
+  %agg.tmp = alloca %struct.S, align 8
+  %agg.tmp1 = alloca %struct.S, align 8
+  %cmp = icmp sgt i32 %count, 10
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %0 = bitcast %struct.S* %agg.tmp to i8*
+  %1 = bitcast %struct.S* %p1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) %0, i8* nonnull align 8 dereferenceable(20) %1, i64 20, i1 false)
+  %call = call i32 @_Z3zoo1S(%struct.S* nonnull byval(%struct.S) align 8 %agg.tmp)
+  br label %return
+
+if.end:                                           ; preds = %entry
+  %add = add nsw i32 %count, 1
+  %2 = bitcast %struct.S* %agg.tmp1 to i8*
+  %3 = bitcast %struct.S* %p1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) %2, i8* nonnull align 8 dereferenceable(20) %3, i64 20, i1 false)
+  %agg.tmp14.0..sroa_cast = bitcast %struct.S* %agg.tmp14 to i8*
+  call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %agg.tmp14.0..sroa_cast)
+  %4 = bitcast %struct.S* %agg.tmp.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %4)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) %agg.tmp14.0..sroa_cast, i8* nonnull align 8 dereferenceable(20) %2, i64 20, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) %4, i8* nonnull align 8 dereferenceable(20) %agg.tmp14.0..sroa_cast, i64 20, i1 false)
+  %call.i = call i32 @_Z3fooi1S(i32 %add, %struct.S* nonnull byval(%struct.S) align 8 %agg.tmp.i)
+  call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %agg.tmp14.0..sroa_cast)
+  call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %4)
+  br label %return
+
+return:                                           ; preds = %if.end, %if.then
+  %retval.0 = phi i32 [ %call, %if.then ], [ %call.i, %if.end ]
+  ret i32 %retval.0
+}
+
+declare dso_local i32 @_Z3zoo1S(%struct.S* byval(%struct.S) align 8) local_unnamed_addr #1
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #2
+
+attributes #0 = { uwtable }
+attributes #1 = { uwtable }
+attributes #2 = { argmemonly nounwind willreturn }

diff  --git a/llvm/test/Transforms/TailCallElim/tre-multiple-exits.ll b/llvm/test/Transforms/TailCallElim/tre-multiple-exits.ll
new file mode 100644
index 0000000000000..8f69087dd879d
--- /dev/null
+++ b/llvm/test/Transforms/TailCallElim/tre-multiple-exits.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
+
+; This test checks that TRE is done for only one recursive call.
+; The test_multiple_exits function has three recursive calls.
+; The first recursive call cannot be eliminated because a pointer
+; to a local variable escapes. The second recursive call can
+; be eliminated. The third recursive call cannot be eliminated because
+; it is not the last call. Thus, the test checks that TRE is done
+; for only the second recursive call.
+
+; IR for that test was generated from the following C++ source:
+;
+; void capture_arg (int*);
+; void test_multiple_exits (int param) {
+;   if (param >= 0 && param < 10) {
+;     int temp;
+;     capture_arg(&temp);
+;     // TRE could not be done because pointer to local
+;     // variable "temp" is escaped.
+;     test_multiple_exits(param + 1);
+;   } else if (param >=10 && param < 20) {
+;     // TRE should be done.
+;     test_multiple_exits(param + 1);
+;   } else if (param >= 20 && param < 22) {
+;     // TRE could not be done since recursive
+;     // call is not last call.
+;     test_multiple_exits(param + 1);
+;     func();
+;   }
+;
+;   return;
+; }
+
+; Function Attrs: noinline optnone uwtable
+declare void @_Z11capture_argPi(i32* %param) #0
+
+; Function Attrs: noinline optnone uwtable
+declare void @_Z4funcv() #0
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @_Z19test_multiple_exitsi(i32 %param) local_unnamed_addr #2 {
+; CHECK-LABEL: @_Z19test_multiple_exitsi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TEMP:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[TAILRECURSE:%.*]]
+; CHECK:       tailrecurse:
+; CHECK-NEXT:    [[PARAM_TR:%.*]] = phi i32 [ [[PARAM:%.*]], [[ENTRY:%.*]] ], [ [[ADD6:%.*]], [[IF_THEN5:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[PARAM_TR]], 10
+; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TEMP]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP1]]) #1
+; CHECK-NEXT:    call void @_Z11capture_argPi(i32* nonnull [[TEMP]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[PARAM_TR]], 1
+; CHECK-NEXT:    call void @_Z19test_multiple_exitsi(i32 [[ADD]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP1]]) #1
+; CHECK-NEXT:    br label [[IF_END14:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[PARAM_OFF:%.*]] = add i32 [[PARAM_TR]], -10
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 [[PARAM_OFF]], 10
+; CHECK-NEXT:    br i1 [[TMP2]], label [[IF_THEN5]], label [[IF_ELSE7:%.*]]
+; CHECK:       if.then5:
+; CHECK-NEXT:    [[ADD6]] = add nuw nsw i32 [[PARAM_TR]], 1
+; CHECK-NEXT:    br label [[TAILRECURSE]]
+; CHECK:       if.else7:
+; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[PARAM_TR]], -2
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 20
+; CHECK-NEXT:    br i1 [[TMP4]], label [[IF_THEN11:%.*]], label [[IF_END14]]
+; CHECK:       if.then11:
+; CHECK-NEXT:    [[ADD12:%.*]] = add nsw i32 [[PARAM_TR]], 1
+; CHECK-NEXT:    tail call void @_Z19test_multiple_exitsi(i32 [[ADD12]])
+; CHECK-NEXT:    tail call void @_Z4funcv()
+; CHECK-NEXT:    ret void
+; CHECK:       if.end14:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %temp = alloca i32, align 4
+  %0 = icmp ult i32 %param, 10
+  br i1 %0, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %1 = bitcast i32* %temp to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #2
+  call void @_Z11capture_argPi(i32* nonnull %temp)
+  %add = add nuw nsw i32 %param, 1
+  call void @_Z19test_multiple_exitsi(i32 %add)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #2
+  br label %if.end14
+
+if.else:                                          ; preds = %entry
+  %param.off = add i32 %param, -10
+  %2 = icmp ult i32 %param.off, 10
+  br i1 %2, label %if.then5, label %if.else7
+
+if.then5:                                         ; preds = %if.else
+  %add6 = add nuw nsw i32 %param, 1
+  call void @_Z19test_multiple_exitsi(i32 %add6)
+  br label %if.end14
+
+if.else7:                                         ; preds = %if.else
+  %3 = and i32 %param, -2
+  %4 = icmp eq i32 %3, 20
+  br i1 %4, label %if.then11, label %if.end14
+
+if.then11:                                        ; preds = %if.else7
+  %add12 = add nsw i32 %param, 1
+  call void @_Z19test_multiple_exitsi(i32 %add12)
+  call void @_Z4funcv()
+  br label %if.end14
+
+if.end14:                                         ; preds = %if.then5, %if.then11, %if.else7, %if.then
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2
+
+attributes #0 = { nofree noinline norecurse nounwind uwtable }
+attributes #1 = { nounwind uwtable }
+attributes #2 = { argmemonly nounwind willreturn }

diff  --git a/llvm/test/Transforms/TailCallElim/tre-noncapturing-alloca-calls.ll b/llvm/test/Transforms/TailCallElim/tre-noncapturing-alloca-calls.ll
new file mode 100644
index 0000000000000..2168437fc5706
--- /dev/null
+++ b/llvm/test/Transforms/TailCallElim/tre-noncapturing-alloca-calls.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
+
+; IR for that test was generated from the following C++ source:
+;
+;int count;
+;__attribute__((noinline)) void globalIncrement(const int* param) { count += *param; }
+;
+;void test(int recurseCount)
+;{
+;    if (recurseCount == 0) return;
+;    int temp = 10;
+;    globalIncrement(&temp);
+;    test(recurseCount - 1);
+;}
+;
+
+@count = dso_local local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: nofree noinline norecurse nounwind uwtable
+declare void @_Z15globalIncrementPKi(i32* nocapture readonly %param) #0
+
+; Test that TRE could be done for recursive tail routine containing
+; call to function receiving a pointer to local stack.
+
+; Function Attrs: nounwind uwtable
+define dso_local void @_Z4testi(i32 %recurseCount) local_unnamed_addr #1 {
+; CHECK-LABEL: @_Z4testi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TEMP:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[TAILRECURSE:%.*]]
+; CHECK:       tailrecurse:
+; CHECK-NEXT:    [[RECURSECOUNT_TR:%.*]] = phi i32 [ [[RECURSECOUNT:%.*]], [[ENTRY:%.*]] ], [ [[SUB:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[RECURSECOUNT_TR]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[TEMP]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP0]])
+; CHECK-NEXT:    store i32 10, i32* [[TEMP]], align 4
+; CHECK-NEXT:    call void @_Z15globalIncrementPKi(i32* nonnull [[TEMP]])
+; CHECK-NEXT:    [[SUB]] = add nsw i32 [[RECURSECOUNT_TR]], -1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP0]])
+; CHECK-NEXT:    br label [[TAILRECURSE]]
+; CHECK:       return:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %temp = alloca i32, align 4
+  %cmp = icmp eq i32 %recurseCount, 0
+  br i1 %cmp, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+  %0 = bitcast i32* %temp to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #6
+  store i32 10, i32* %temp, align 4
+  call void @_Z15globalIncrementPKi(i32* nonnull %temp)
+  %sub = add nsw i32 %recurseCount, -1
+  call void @_Z4testi(i32 %sub)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #6
+  br label %return
+
+return:                                           ; preds = %entry, %if.end
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2
+
+attributes #0 = { nofree noinline norecurse nounwind uwtable }
+attributes #1 = { nounwind uwtable }
+attributes #2 = { argmemonly nounwind willreturn }


        


More information about the llvm-commits mailing list