[llvm] r187191 - Re-implement the analysis of uses in mem2reg to be significantly more

Chandler Carruth chandlerc at gmail.com
Fri Jul 26 01:20:40 PDT 2013


Author: chandlerc
Date: Fri Jul 26 03:20:39 2013
New Revision: 187191

URL: http://llvm.org/viewvc/llvm-project?rev=187191&view=rev
Log:
Re-implement the analysis of uses in mem2reg to be significantly more
robust. It now uses an InstVisitor and worklist to actually walk the
uses of the Alloca transitively and detect the pattern which we can
directly promote: loads & stores of the whole alloca and instructions we
can completely ignore.

Also, with this new implementation teach both the predicate for testing
whether we can promote and the promotion engine itself to use the same
code so we no longer have strange divergence between the two code paths.

I've added some silly test cases to demonstrate that we can handle
slightly more degenerate code patterns now. See the below for why this
is even interesting.

Performance impact: roughly 1% regression in the performance of SROA or
ScalarRepl on a large C++-ish test case where most of the allocas are
basically ready for promotion. The reason is because of silly redundant
work that I've left FIXMEs for and which I'll address in the next
commit. I wanted to separate this commit as it changes the behavior.
Once the redundant work in removing the dead uses of the alloca is
fixed, this code appears to be faster than the old version. =]

So why is this useful? Because the previous requirement for promotion
required a *specific* visit pattern of the uses of the alloca to verify:
we *had* to look for no more than 1 intervening use. The end goal is to
have SROA automatically detect when an alloca is already promotable and
directly hand it to the mem2reg machinery rather than trying to
partition and rewrite it. This is a 25% or more performance improvement
for SROA, and a significant chunk of the delta between it and
ScalarRepl. To get there, we need to make mem2reg actually capable of
promoting allocas which *look* promotable to SROA without having SROA do
tons of work to massage the code into just the right form.

This is actually the tip of the iceberg. There are tremendous potential
savings we can realize here by de-duplicating work between mem2reg and
SROA.

Added:
    llvm/trunk/test/Transforms/Mem2Reg/use-analysis.ll
Removed:
    llvm/trunk/test/Transforms/Mem2Reg/ignore-lifetime.ll
Modified:
    llvm/trunk/lib/Transforms/Utils/PromoteMemoryToRegister.cpp

Modified: llvm/trunk/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Utils/PromoteMemoryToRegister.cpp?rev=187191&r1=187190&r2=187191&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Utils/PromoteMemoryToRegister.cpp (original)
+++ llvm/trunk/lib/Transforms/Utils/PromoteMemoryToRegister.cpp Fri Jul 26 03:20:39 2013
@@ -30,6 +30,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -45,6 +46,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/InstVisitor.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
@@ -56,56 +58,13 @@ STATISTIC(NumSingleStore,   "Number of a
 STATISTIC(NumDeadAlloca,    "Number of dead alloca's removed");
 STATISTIC(NumPHIInsert,     "Number of PHI nodes inserted");
 
-bool llvm::isAllocaPromotable(const AllocaInst *AI) {
-  // FIXME: If the memory unit is of pointer or integer type, we can permit
-  // assignments to subsections of the memory unit.
-
-  // Only allow direct and non-volatile loads and stores...
-  for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end();
-       UI != UE; ++UI) { // Loop over all of the uses of the alloca
-    const User *U = *UI;
-    if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
-      // Note that atomic loads can be transformed; atomic semantics do
-      // not have any meaning for a local alloca.
-      if (LI->isVolatile())
-        return false;
-    } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
-      if (SI->getOperand(0) == AI)
-        return false; // Don't allow a store OF the AI, only INTO the AI.
-      // Note that atomic stores can be transformed; atomic semantics do
-      // not have any meaning for a local alloca.
-      if (SI->isVolatile())
-        return false;
-    } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
-      if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
-          II->getIntrinsicID() != Intrinsic::lifetime_end)
-        return false;
-    } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
-      if (BCI->getType() != Type::getInt8PtrTy(U->getContext()))
-        return false;
-      if (!onlyUsedByLifetimeMarkers(BCI))
-        return false;
-    } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
-      if (GEPI->getType() != Type::getInt8PtrTy(U->getContext()))
-        return false;
-      if (!GEPI->hasAllZeroIndices())
-        return false;
-      if (!onlyUsedByLifetimeMarkers(GEPI))
-        return false;
-    } else {
-      return false;
-    }
-  }
-
-  return true;
-}
-
 namespace {
 
-struct AllocaInfo {
+struct AllocaInfo : private InstVisitor<AllocaInfo, bool> {
   SmallVector<BasicBlock *, 32> DefiningBlocks;
   SmallVector<BasicBlock *, 32> UsingBlocks;
 
+  Type *AllocaTy;
   StoreInst *OnlyStore;
   BasicBlock *OnlyBlock;
   bool OnlyUsedInOneBlock;
@@ -116,6 +75,7 @@ struct AllocaInfo {
   void clear() {
     DefiningBlocks.clear();
     UsingBlocks.clear();
+    AllocaTy = 0;
     OnlyStore = 0;
     OnlyBlock = 0;
     OnlyUsedInOneBlock = true;
@@ -125,39 +85,107 @@ struct AllocaInfo {
 
   /// Scan the uses of the specified alloca, filling in the AllocaInfo used
   /// by the rest of the pass to reason about the uses of this alloca.
-  void AnalyzeAlloca(AllocaInst *AI) {
+  bool analyzeAlloca(AllocaInst &AI) {
     clear();
 
-    // As we scan the uses of the alloca instruction, keep track of stores,
-    // and decide whether all of the loads and stores to the alloca are within
-    // the same basic block.
-    for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
-         UI != E;) {
-      Instruction *User = cast<Instruction>(*UI++);
-
-      if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
-        // Remember the basic blocks which define new values for the alloca
-        DefiningBlocks.push_back(SI->getParent());
-        AllocaPointerVal = SI->getOperand(0);
-        OnlyStore = SI;
-      } else {
-        LoadInst *LI = cast<LoadInst>(User);
-        // Otherwise it must be a load instruction, keep track of variable
-        // reads.
-        UsingBlocks.push_back(LI->getParent());
-        AllocaPointerVal = LI;
-      }
+    AllocaTy = AI.getAllocatedType();
+    enqueueUsers(AI);
+
+    // Walk queued up uses in the worklist to handle nested uses.
+    while (!UseWorklist.empty()) {
+      U = UseWorklist.pop_back_val();
+      Instruction &I = *cast<Instruction>(U->getUser());
+      if (!visit(I))
+        return false; // Propagate failure to promote up.
 
       if (OnlyUsedInOneBlock) {
         if (OnlyBlock == 0)
-          OnlyBlock = User->getParent();
-        else if (OnlyBlock != User->getParent())
+          OnlyBlock = I.getParent();
+        else if (OnlyBlock != I.getParent())
           OnlyUsedInOneBlock = false;
       }
     }
 
-    DbgDeclare = FindAllocaDbgDeclare(AI);
+    DbgDeclare = FindAllocaDbgDeclare(&AI);
+    return true;
+  }
+
+private:
+  // Befriend the base class so it can call through private visitor methods.
+  friend class InstVisitor<AllocaInfo, bool>;
+
+  /// \brief A use pointer that is non-null when visiting uses.
+  Use *U;
+
+  /// \brief A worklist for recursively visiting all uses of an alloca.
+  SmallVector<Use *, 8> UseWorklist;
+
+  /// \brief A set for preventing cyclic visitation.
+  SmallPtrSet<Use *, 8> VisitedUses;
+
+  void enqueueUsers(Instruction &I) {
+    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
+         ++UI)
+      if (VisitedUses.insert(&UI.getUse()))
+        UseWorklist.push_back(&UI.getUse());
+  }
+
+  bool visitLoadInst(LoadInst &LI) {
+    if (LI.isVolatile() || LI.getType() != AllocaTy)
+      return false;
+
+    // Keep track of variable reads.
+    UsingBlocks.push_back(LI.getParent());
+    AllocaPointerVal = &LI;
+    return true;
+  }
+
+  bool visitStoreInst(StoreInst &SI) {
+    if (SI.isVolatile() || SI.getValueOperand() == U->get() ||
+        SI.getValueOperand()->getType() != AllocaTy)
+      return false;
+
+    // Remember the basic blocks which define new values for the alloca
+    DefiningBlocks.push_back(SI.getParent());
+    AllocaPointerVal = SI.getOperand(0);
+    OnlyStore = &SI;
+    return true;
+  }
+
+  bool visitBitCastInst(BitCastInst &BC) {
+    enqueueUsers(BC);
+    return true;
+  }
+
+  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+    if (GEPI.use_empty())
+      return true;
+
+    enqueueUsers(GEPI);
+
+    return GEPI.hasAllZeroIndices();
+  }
+
+  // We can promote through debug info intrinsics as they don't alter the
+  // value stored in memory.
+  bool visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) { return true; }
+
+  bool visitIntrinsicInst(IntrinsicInst &II) {
+    switch (II.getIntrinsicID()) {
+    default:
+      return false;
+
+      // Lifetime intrinsics don't preclude promoting the memory to a register.
+      // FIXME: We should use these to promote to undef when outside of a valid
+      // lifetime.
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+      return true;
+    }
   }
+
+  // The fallback is that the alloca cannot be promoted.
+  bool visitInstruction(Instruction &I) { return false; }
 };
 
 // Data package used by RenamePass()
@@ -316,24 +344,56 @@ private:
 static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
   // Knowing that this alloca is promotable, we know that it's safe to kill all
   // instructions except for load and store.
+  // FIXME: This function isn't actually specific to lifetime intrinsics. It
+  // also is redundant with the actual analysis of the loads and stores and
+  // should be folded into that.
+
+  SmallVector<Instruction *, 8> Worklist;
+  SmallPtrSet<Instruction *, 8> Visited;
+  SmallVector<Instruction *, 8> Dead;
+  Worklist.push_back(AI);
+
+  do {
+    Instruction *I = Worklist.pop_back_val();
 
-  for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
-       UI != UE;) {
-    Instruction *I = cast<Instruction>(*UI);
-    ++UI;
-    if (isa<LoadInst>(I) || isa<StoreInst>(I))
+    if (I->use_empty()) {
+      Dead.push_back(I);
       continue;
+    }
 
-    if (!I->getType()->isVoidTy()) {
-      // The only users of this bitcast/GEP instruction are lifetime intrinsics.
-      // Follow the use/def chain to erase them now instead of leaving it for
-      // dead code elimination later.
-      for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
-           UI != UE;) {
-        Instruction *Inst = cast<Instruction>(*UI);
-        ++UI;
-        Inst->eraseFromParent();
-      }
+    for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+         UI != UE; ++UI)
+      if (!isa<LoadInst>(*UI) && !isa<StoreInst>(*UI) &&
+          Visited.insert(cast<Instruction>(*UI)))
+        Worklist.push_back(cast<Instruction>(*UI));
+  } while (!Worklist.empty());
+
+  while (!Dead.empty()) {
+    Instruction *I = Dead.pop_back_val();
+
+    // Don't delete the alloca itself.
+    if (I == AI)
+      continue;
+
+    // Note that we open code the deletion algorithm here because we know
+    // a priori that all of the instructions using an alloca that reaches here
+    // are trivially dead when their use list becomes empty (The only risk is
+    // lifetime markers which we specifically want to nuke). By coding it here
+    // we can skip the triviality test and be more efficient.
+    //
+    // Null out all of the instruction's operands to see if any operand becomes
+    // dead as we go.
+    for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE;
+         ++OI) {
+      Instruction *Op = dyn_cast<Instruction>(*OI);
+      if (!Op)
+        continue;
+
+      OI->set(0);
+      if (!Op->use_empty())
+        continue;
+
+      Dead.push_back(Op);
     }
     I->eraseFromParent();
   }
@@ -540,7 +600,7 @@ void PromoteMem2Reg::run() {
   for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
     AllocaInst *AI = Allocas[AllocaNum];
 
-    assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!");
+    //assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!");
     assert(AI->getParent()->getParent() == &F &&
            "All allocas should be in the same function, which is same as DF!");
 
@@ -560,7 +620,9 @@ void PromoteMem2Reg::run() {
 
     // Calculate the set of read and write-locations for each alloca.  This is
     // analogous to finding the 'uses' and 'definitions' of each variable.
-    Info.AnalyzeAlloca(AI);
+    bool Good = Info.analyzeAlloca(*AI);
+    (void)Good;
+    assert(Good && "Cannot promote non-promotable alloca!");
 
     // If there is only a single store to this value, replace any loads of
     // it that are directly dominated by the definition with the value stored.
@@ -1087,8 +1149,16 @@ NextIteration:
   goto NextIteration;
 }
 
-void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
-                           AliasSetTracker *AST) {
+bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+  // We cast away constness because we re-use the non-const analysis that the
+  // actual promotion routine uses. While it is non-const, it doesn't actually
+  // mutate anything at this phase, and we discard the non-const results that
+  // promotion uses to mutate the alloca.
+  return AllocaInfo().analyzeAlloca(*const_cast<AllocaInst *>(AI));
+}
+
+void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas,
+                           DominatorTree &DT, AliasSetTracker *AST) {
   // If there is nothing to do, bail out...
   if (Allocas.empty())
     return;

Removed: llvm/trunk/test/Transforms/Mem2Reg/ignore-lifetime.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Mem2Reg/ignore-lifetime.ll?rev=187190&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Mem2Reg/ignore-lifetime.ll (original)
+++ llvm/trunk/test/Transforms/Mem2Reg/ignore-lifetime.ll (removed)
@@ -1,26 +0,0 @@
-; RUN: opt -mem2reg -S -o - < %s | FileCheck %s
-
-declare void @llvm.lifetime.start(i64 %size, i8* nocapture %ptr)
-declare void @llvm.lifetime.end(i64 %size, i8* nocapture %ptr)
-
-define void @test1() {
-; CHECK: test1
-; CHECK-NOT: alloca
-  %A = alloca i32
-  %B = bitcast i32* %A to i8*
-  call void @llvm.lifetime.start(i64 2, i8* %B)
-  store i32 1, i32* %A
-  call void @llvm.lifetime.end(i64 2, i8* %B)
-  ret void
-}
-
-define void @test2() {
-; CHECK: test2
-; CHECK-NOT: alloca
-  %A = alloca {i8, i16}
-  %B = getelementptr {i8, i16}* %A, i32 0, i32 0
-  call void @llvm.lifetime.start(i64 2, i8* %B)
-  store {i8, i16} zeroinitializer, {i8, i16}* %A
-  call void @llvm.lifetime.end(i64 2, i8* %B)
-  ret void
-}

Added: llvm/trunk/test/Transforms/Mem2Reg/use-analysis.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Mem2Reg/use-analysis.ll?rev=187191&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Mem2Reg/use-analysis.ll (added)
+++ llvm/trunk/test/Transforms/Mem2Reg/use-analysis.ll Fri Jul 26 03:20:39 2013
@@ -0,0 +1,70 @@
+; RUN: opt -mem2reg -S -o - < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+declare void @llvm.lifetime.start(i64 %size, i8* nocapture %ptr)
+declare void @llvm.lifetime.end(i64 %size, i8* nocapture %ptr)
+
+define void @test1() {
+; Ensure we can look through a bitcast to i8* and the addition of lifetime
+; markers.
+;
+; CHECK-LABEL: @test1(
+; CHECK-NOT: alloca
+; CHECK: ret void
+
+  %A = alloca i32
+  %B = bitcast i32* %A to i8*
+  call void @llvm.lifetime.start(i64 2, i8* %B)
+  store i32 1, i32* %A
+  call void @llvm.lifetime.end(i64 2, i8* %B)
+  ret void
+}
+
+define void @test2() {
+; Ensure we can look through a GEP to i8* and the addition of lifetime
+; markers.
+;
+; CHECK-LABEL: @test2(
+; CHECK-NOT: alloca
+; CHECK: ret void
+
+  %A = alloca {i8, i16}
+  %B = getelementptr {i8, i16}* %A, i32 0, i32 0
+  call void @llvm.lifetime.start(i64 2, i8* %B)
+  store {i8, i16} zeroinitializer, {i8, i16}* %A
+  call void @llvm.lifetime.end(i64 2, i8* %B)
+  ret void
+}
+
+define i32 @test3(i32 %x) {
+; CHECK-LABEL: @test3(
+;
+; Check that we recursively walk the uses of the alloca and thus can see
+; through round trip bitcasts, dead bitcasts, GEPs, multiple GEPs, and lifetime
+; markers.
+entry:
+  %a = alloca i32
+; CHECK-NOT: alloca
+
+  %b = bitcast i32* %a to i8*
+  %b2 = getelementptr inbounds i8* %b, i32 0
+  %b3 = getelementptr inbounds i8* %b2, i32 0
+  call void @llvm.lifetime.start(i64 -1, i8* %b3)
+; CHECK-NOT: call void @llvm.lifetime.start
+
+  store i32 %x, i32* %a
+; CHECK-NOT: store
+
+  %dead = bitcast i32* %a to i4096*
+  %dead1 = bitcast i4096* %dead to i42*
+  %dead2 = getelementptr inbounds i32* %a, i32 %x
+; CHECK-NOT: bitcast
+; CHECK-NOT: getelementptr
+
+  %ret = load i32* %a
+; CHECK-NOT: load
+
+  ret i32 %ret
+; CHECK: ret i32 %x
+}





More information about the llvm-commits mailing list