[Mlir-commits] [mlir] 4be504a - [mlir] Add support for detecting single use callables in the Inliner.

Wed Mar 18 13:17:05 PDT 2020

Author: River Riddle
Date: 2020-03-18T13:10:41-07:00
New Revision: 4be504a97f9176c9b1f9db24724e35122b1edc5f

URL: https://github.com/llvm/llvm-project/commit/4be504a97f9176c9b1f9db24724e35122b1edc5f
DIFF: https://github.com/llvm/llvm-project/commit/4be504a97f9176c9b1f9db24724e35122b1edc5f.diff

LOG: [mlir] Add support for detecting single use callables in the Inliner.

Summary: This is somewhat complex (annoying) as it involves directly tracking the uses within each of the callgraph nodes, and updating them as needed during inlining. The benefit of this is that we can have a more exact cost model, enable inlining some otherwise non-inlinable cases, and also ensure that newly dead callables are properly disposed of.

Differential Revision: https://reviews.llvm.org/D75476

Added: 
    mlir/test/Transforms/inlining-dce.mlir

Modified: 
    mlir/examples/toy/Ch4/toyc.cpp
    mlir/examples/toy/Ch5/toyc.cpp
    mlir/examples/toy/Ch6/toyc.cpp
    mlir/examples/toy/Ch7/toyc.cpp
    mlir/include/mlir/Analysis/CallGraph.h
    mlir/lib/Analysis/CallGraph.cpp
    mlir/lib/Transforms/Inliner.cpp

Removed: 
    


################################################################################
diff --git a/mlir/examples/toy/Ch4/toyc.cpp b/mlir/examples/toy/Ch4/toyc.cpp
index 51ff330cce67..6a89d0006239 100644
--- a/mlir/examples/toy/Ch4/toyc.cpp
+++ b/mlir/examples/toy/Ch4/toyc.cpp
@@ -119,7 +119,6 @@ int dumpMLIR() {
 
     // Inline all functions into main and then delete them.
     pm.addPass(mlir::createInlinerPass());
-    pm.addPass(mlir::createSymbolDCEPass());
 
     // Now that there is only one function, we can infer the shapes of each of
     // the operations.

diff --git a/mlir/examples/toy/Ch5/toyc.cpp b/mlir/examples/toy/Ch5/toyc.cpp
index c9a52c606b21..4bc2af2ff899 100644
--- a/mlir/examples/toy/Ch5/toyc.cpp
+++ b/mlir/examples/toy/Ch5/toyc.cpp
@@ -125,7 +125,6 @@ int dumpMLIR() {
   if (enableOpt || isLoweringToAffine) {
     // Inline all functions into main and then delete them.
     pm.addPass(mlir::createInlinerPass());
-    pm.addPass(mlir::createSymbolDCEPass());
 
     // Now that there is only one function, we can infer the shapes of each of
     // the operations.

diff --git a/mlir/examples/toy/Ch6/toyc.cpp b/mlir/examples/toy/Ch6/toyc.cpp
index 3c54f731ff42..558141c2ca89 100644
--- a/mlir/examples/toy/Ch6/toyc.cpp
+++ b/mlir/examples/toy/Ch6/toyc.cpp
@@ -139,7 +139,6 @@ int loadAndProcessMLIR(mlir::MLIRContext &context,
   if (enableOpt || isLoweringToAffine) {
     // Inline all functions into main and then delete them.
     pm.addPass(mlir::createInlinerPass());
-    pm.addPass(mlir::createSymbolDCEPass());
 
     // Now that there is only one function, we can infer the shapes of each of
     // the operations.

diff --git a/mlir/examples/toy/Ch7/toyc.cpp b/mlir/examples/toy/Ch7/toyc.cpp
index 1f5f988caca3..f3e12fb986cf 100644
--- a/mlir/examples/toy/Ch7/toyc.cpp
+++ b/mlir/examples/toy/Ch7/toyc.cpp
@@ -139,7 +139,6 @@ int loadAndProcessMLIR(mlir::MLIRContext &context,
   if (enableOpt || isLoweringToAffine) {
     // Inline all functions into main and then delete them.
     pm.addPass(mlir::createInlinerPass());
-    pm.addPass(mlir::createSymbolDCEPass());
 
     // Now that there is only one function, we can infer the shapes of each of
     // the operations.

diff --git a/mlir/include/mlir/Analysis/CallGraph.h b/mlir/include/mlir/Analysis/CallGraph.h
index cd25151da4c0..b4ef04969b5a 100644
--- a/mlir/include/mlir/Analysis/CallGraph.h
+++ b/mlir/include/mlir/Analysis/CallGraph.h
@@ -192,6 +192,9 @@ class CallGraph {
   /// external node if a valid node was not resolved.
   CallGraphNode *resolveCallable(CallOpInterface call) const;
 
+  /// Erase the given node from the callgraph.
+  void eraseNode(CallGraphNode *node);
+
   /// An iterator over the nodes of the graph.
   using iterator = NodeIterator;
   iterator begin() const { return nodes.begin(); }

diff --git a/mlir/lib/Analysis/CallGraph.cpp b/mlir/lib/Analysis/CallGraph.cpp
index 1a31f1347135..e31641a87e05 100644
--- a/mlir/lib/Analysis/CallGraph.cpp
+++ b/mlir/lib/Analysis/CallGraph.cpp
@@ -143,6 +143,23 @@ CallGraphNode *CallGraph::resolveCallable(CallOpInterface call) const {
   return getExternalNode();
 }
 
+/// Erase the given node from the callgraph.
+void CallGraph::eraseNode(CallGraphNode *node) {
+  // Erase any children of this node first.
+  if (node->hasChildren()) {
+    for (const CallGraphNode::Edge &edge : llvm::make_early_inc_range(*node))
+      if (edge.isChild())
+        eraseNode(edge.getTarget());
+  }
+  // Erase any edges to this node from any other nodes.
+  for (auto &it : nodes) {
+    it.second->edges.remove_if([node](const CallGraphNode::Edge &edge) {
+      return edge.getTarget() == node;
+    });
+  }
+  nodes.erase(node->getCallableRegion());
+}
+
 //===----------------------------------------------------------------------===//
 // Printing
 

diff --git a/mlir/lib/Transforms/Inliner.cpp b/mlir/lib/Transforms/Inliner.cpp
index b6fcf8bc3941..ea48582dc52a 100644
--- a/mlir/lib/Transforms/Inliner.cpp
+++ b/mlir/lib/Transforms/Inliner.cpp
@@ -14,8 +14,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Analysis/CallGraph.h"
-#include "mlir/IR/Builders.h"
 #include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/SideEffects.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "mlir/Transforms/Passes.h"
@@ -37,6 +37,259 @@ static llvm::cl::opt<unsigned> maxInliningIterations(
     llvm::cl::desc("Maximum number of iterations when inlining within an SCC"),
     llvm::cl::ReallyHidden, llvm::cl::init(4));
 
+//===----------------------------------------------------------------------===//
+// Symbol Use Tracking
+//===----------------------------------------------------------------------===//
+
+/// Returns true if this operation can be discarded if it is a symbol and has no
+/// uses. 'allUsesVisible' corresponds to if the parent symbol table is hidden
+/// from above.
+static bool canDiscardSymbolOnUseEmpty(Operation *op, bool allUsesVisible) {
+  if (!SymbolTable::isSymbol(op))
+    return false;
+
+  // TODO: This is essentially the same logic from SymbolDCE. Remove this when
+  // we have a 'Symbol' interface.
+  // Private symbols are always initially considered dead.
+  SymbolTable::Visibility visibility = SymbolTable::getSymbolVisibility(op);
+  if (visibility == mlir::SymbolTable::Visibility::Private)
+    return true;
+  // We only include nested visibility here if all uses are visible.
+  if (allUsesVisible && visibility == SymbolTable::Visibility::Nested)
+    return true;
+  // Otherwise, public symbols are never removable.
+  return false;
+}
+
+/// Walk all of the symbol table operations nested with 'op' along with a
+/// boolean signifying if the symbols within can be treated as if all uses are
+/// visible. The provided callback is invoked with the symbol table operation,
+/// and a boolean signaling if all of the uses within the symbol table are
+/// visible.
+static void walkSymbolTables(Operation *op, bool allSymUsesVisible,
+                             function_ref<void(Operation *, bool)> callback) {
+  if (op->hasTrait<OpTrait::SymbolTable>()) {
+    allSymUsesVisible = allSymUsesVisible || !SymbolTable::isSymbol(op) ||
+                        SymbolTable::getSymbolVisibility(op) ==
+                            SymbolTable::Visibility::Private;
+    callback(op, allSymUsesVisible);
+  } else {
+    // Otherwise if 'op' is not a symbol table, any nested symbols are
+    // guaranteed to be hidden.
+    allSymUsesVisible = true;
+  }
+
+  for (Region &region : op->getRegions())
+    for (Block &block : region)
+      for (Operation &nested : block)
+        walkSymbolTables(&nested, allSymUsesVisible, callback);
+}
+
+/// Walk all of the used symbol callgraph nodes referenced with the given op.
+static void walkReferencedSymbolNodes(
+    Operation *op, CallGraph &cg,
+    DenseMap<Attribute, CallGraphNode *> &resolvedRefs,
+    function_ref<void(CallGraphNode *, Operation *)> callback) {
+  auto symbolUses = SymbolTable::getSymbolUses(op);
+  assert(symbolUses && "expected uses to be valid");
+
+  Operation *symbolTableOp = op->getParentOp();
+  for (const SymbolTable::SymbolUse &use : *symbolUses) {
+    auto refIt = resolvedRefs.insert({use.getSymbolRef(), nullptr});
+    CallGraphNode *&node = refIt.first->second;
+
+    // If this is the first instance of this reference, try to resolve a
+    // callgraph node for it.
+    if (refIt.second) {
+      auto *symbolOp = SymbolTable::lookupNearestSymbolFrom(symbolTableOp,
+                                                            use.getSymbolRef());
+      auto callableOp = dyn_cast_or_null<CallableOpInterface>(symbolOp);
+      if (!callableOp)
+        continue;
+      node = cg.lookupNode(callableOp.getCallableRegion());
+    }
+    if (node)
+      callback(node, use.getUser());
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// CGUseList
+
+namespace {
+/// This struct tracks the uses of callgraph nodes that can be dropped when
+/// use_empty. It directly tracks and manages a use-list for all of the
+/// call-graph nodes. This is necessary because many callgraph nodes are
+/// referenced by SymbolRefAttr, which has no mechanism akin to the SSA `Use`
+/// class.
+struct CGUseList {
+  /// This struct tracks the uses of callgraph nodes within a specific
+  /// operation.
+  struct CGUser {
+    /// Any nodes referenced in the top-level attribute list of this user. We
+    /// use a set here because the number of references does not matter.
+    DenseSet<CallGraphNode *> topLevelUses;
+
+    /// Uses of nodes referenced by nested operations.
+    DenseMap<CallGraphNode *, int> innerUses;
+  };
+
+  CGUseList(Operation *op, CallGraph &cg);
+
+  /// Drop uses of nodes referred to by the given call operation that resides
+  /// within 'userNode'.
+  void dropCallUses(CallGraphNode *userNode, Operation *callOp, CallGraph &cg);
+
+  /// Remove the given node from the use list.
+  void eraseNode(CallGraphNode *node);
+
+  /// Returns true if the given callgraph node has no uses and can be pruned.
+  bool isDead(CallGraphNode *node) const;
+
+  /// Returns true if the given callgraph node has a single use and can be
+  /// discarded.
+  bool hasOneUseAndDiscardable(CallGraphNode *node) const;
+
+  /// Recompute the uses held by the given callgraph node.
+  void recomputeUses(CallGraphNode *node, CallGraph &cg);
+
+  /// Merge the uses of 'lhs' with the uses of the 'rhs' after inlining a copy
+  /// of 'lhs' into 'rhs'.
+  void mergeUsesAfterInlining(CallGraphNode *lhs, CallGraphNode *rhs);
+
+private:
+  /// Decrement the uses of discardable nodes referenced by the given user.
+  void decrementDiscardableUses(CGUser &uses);
+
+  /// A mapping between a discardable callgraph node (that is a symbol) and the
+  /// number of uses for this node.
+  DenseMap<CallGraphNode *, int> discardableSymNodeUses;
+  /// A mapping between a callgraph node and the symbol callgraph nodes that it
+  /// uses.
+  DenseMap<CallGraphNode *, CGUser> nodeUses;
+};
+} // end anonymous namespace
+
+CGUseList::CGUseList(Operation *op, CallGraph &cg) {
+  /// A set of callgraph nodes that are always known to be live during inlining.
+  DenseMap<Attribute, CallGraphNode *> alwaysLiveNodes;
+
+  // Walk each of the symbol tables looking for discardable callgraph nodes.
+  auto walkFn = [&](Operation *symbolTableOp, bool allUsesVisible) {
+    for (Block &block : symbolTableOp->getRegion(0)) {
+      for (Operation &op : block) {
+        // If this is a callgraph operation, check to see if it is discardable.
+        if (auto callable = dyn_cast<CallableOpInterface>(&op)) {
+          if (auto *node = cg.lookupNode(callable.getCallableRegion())) {
+            if (canDiscardSymbolOnUseEmpty(&op, allUsesVisible))
+              discardableSymNodeUses.try_emplace(node, 0);
+            continue;
+          }
+        }
+        // Otherwise, check for any referenced nodes. These will be always-live.
+        walkReferencedSymbolNodes(&op, cg, alwaysLiveNodes,
+                                  [](CallGraphNode *, Operation *) {});
+      }
+    }
+  };
+  walkSymbolTables(op, /*allSymUsesVisible=*/!op->getBlock(), walkFn);
+
+  // Drop the use information for any discardable nodes that are always live.
+  for (auto &it : alwaysLiveNodes)
+    discardableSymNodeUses.erase(it.second);
+
+  // Compute the uses for each of the callable nodes in the graph.
+  for (CallGraphNode *node : cg)
+    recomputeUses(node, cg);
+}
+
+void CGUseList::dropCallUses(CallGraphNode *userNode, Operation *callOp,
+                             CallGraph &cg) {
+  auto &userRefs = nodeUses[userNode].innerUses;
+  auto walkFn = [&](CallGraphNode *node, Operation *user) {
+    auto parentIt = userRefs.find(node);
+    if (parentIt == userRefs.end())
+      return;
+    --parentIt->second;
+    --discardableSymNodeUses[node];
+  };
+  DenseMap<Attribute, CallGraphNode *> resolvedRefs;
+  walkReferencedSymbolNodes(callOp, cg, resolvedRefs, walkFn);
+}
+
+void CGUseList::eraseNode(CallGraphNode *node) {
+  // Drop all child nodes.
+  for (auto &edge : *node)
+    if (edge.isChild())
+      eraseNode(edge.getTarget());
+
+  // Drop the uses held by this node and erase it.
+  auto useIt = nodeUses.find(node);
+  assert(useIt != nodeUses.end() && "expected node to be valid");
+  decrementDiscardableUses(useIt->getSecond());
+  nodeUses.erase(useIt);
+  discardableSymNodeUses.erase(node);
+}
+
+bool CGUseList::isDead(CallGraphNode *node) const {
+  // If the parent operation isn't a symbol, simply check normal SSA deadness.
+  Operation *nodeOp = node->getCallableRegion()->getParentOp();
+  if (!SymbolTable::isSymbol(nodeOp))
+    return MemoryEffectOpInterface::hasNoEffect(nodeOp) && nodeOp->use_empty();
+
+  // Otherwise, check the number of symbol uses.
+  auto symbolIt = discardableSymNodeUses.find(node);
+  return symbolIt != discardableSymNodeUses.end() && symbolIt->second == 0;
+}
+
+bool CGUseList::hasOneUseAndDiscardable(CallGraphNode *node) const {
+  // If this isn't a symbol node, check for side-effects and SSA use count.
+  Operation *nodeOp = node->getCallableRegion()->getParentOp();
+  if (!SymbolTable::isSymbol(nodeOp))
+    return MemoryEffectOpInterface::hasNoEffect(nodeOp) && nodeOp->hasOneUse();
+
+  // Otherwise, check the number of symbol uses.
+  auto symbolIt = discardableSymNodeUses.find(node);
+  return symbolIt != discardableSymNodeUses.end() && symbolIt->second == 1;
+}
+
+void CGUseList::recomputeUses(CallGraphNode *node, CallGraph &cg) {
+  Operation *parentOp = node->getCallableRegion()->getParentOp();
+  CGUser &uses = nodeUses[node];
+  decrementDiscardableUses(uses);
+
+  // Collect the new discardable uses within this node.
+  uses = CGUser();
+  DenseMap<Attribute, CallGraphNode *> resolvedRefs;
+  auto walkFn = [&](CallGraphNode *refNode, Operation *user) {
+    auto discardSymIt = discardableSymNodeUses.find(refNode);
+    if (discardSymIt == discardableSymNodeUses.end())
+      return;
+
+    if (user != parentOp)
+      ++uses.innerUses[refNode];
+    else if (!uses.topLevelUses.insert(refNode).second)
+      return;
+    ++discardSymIt->second;
+  };
+  walkReferencedSymbolNodes(parentOp, cg, resolvedRefs, walkFn);
+}
+
+void CGUseList::mergeUsesAfterInlining(CallGraphNode *lhs, CallGraphNode *rhs) {
+  auto &lhsUses = nodeUses[lhs], &rhsUses = nodeUses[rhs];
+  for (auto &useIt : lhsUses.innerUses) {
+    rhsUses.innerUses[useIt.first] += useIt.second;
+    discardableSymNodeUses[useIt.first] += useIt.second;
+  }
+}
+
+void CGUseList::decrementDiscardableUses(CGUser &uses) {
+  for (CallGraphNode *node : uses.topLevelUses)
+    --discardableSymNodeUses[node];
+  for (auto &it : uses.innerUses)
+    discardableSymNodeUses[it.first] -= it.second;
+}
+
 //===----------------------------------------------------------------------===//
 // CallGraph traversal
 //===----------------------------------------------------------------------===//
@@ -45,7 +298,7 @@ static llvm::cl::opt<unsigned> maxInliningIterations(
 /// traversal.
 static void runTransformOnCGSCCs(
     const CallGraph &cg,
-    function_ref<void(ArrayRef<CallGraphNode *>)> sccTransformer) {
+    function_ref<void(MutableArrayRef<CallGraphNode *>)> sccTransformer) {
   std::vector<CallGraphNode *> currentSCCVec;
   auto cgi = llvm::scc_begin(&cg);
   while (!cgi.isAtEnd()) {
@@ -63,10 +316,11 @@ namespace {
 /// Region(CallGraphNode) that it is dispatching to, we need to resolve them
 /// explicitly.
 struct ResolvedCall {
-  ResolvedCall(CallOpInterface call, CallGraphNode *targetNode)
-      : call(call), targetNode(targetNode) {}
+  ResolvedCall(CallOpInterface call, CallGraphNode *sourceNode,
+               CallGraphNode *targetNode)
+      : call(call), sourceNode(sourceNode), targetNode(targetNode) {}
   CallOpInterface call;
-  CallGraphNode *targetNode;
+  CallGraphNode *sourceNode, *targetNode;
 };
 } // end anonymous namespace
 
@@ -74,17 +328,22 @@ struct ResolvedCall {
 /// `traverseNestedCGNodes` is true, this will also collect call operations
 /// inside of nested callgraph nodes.
 static void collectCallOps(iterator_range<Region::iterator> blocks,
-                           CallGraph &cg, SmallVectorImpl<ResolvedCall> &calls,
+                           CallGraphNode *sourceNode, CallGraph &cg,
+                           SmallVectorImpl<ResolvedCall> &calls,
                            bool traverseNestedCGNodes) {
-  SmallVector<Block *, 8> worklist;
-  auto addToWorklist = [&](iterator_range<Region::iterator> blocks) {
+  SmallVector<std::pair<Block *, CallGraphNode *>, 8> worklist;
+  auto addToWorklist = [&](CallGraphNode *node,
+                           iterator_range<Region::iterator> blocks) {
     for (Block &block : blocks)
-      worklist.push_back(&block);
+      worklist.emplace_back(&block, node);
   };
 
-  addToWorklist(blocks);
+  addToWorklist(sourceNode, blocks);
   while (!worklist.empty()) {
-    for (Operation &op : *worklist.pop_back_val()) {
+    Block *block;
+    std::tie(block, sourceNode) = worklist.pop_back_val();
+
+    for (Operation &op : *block) {
       if (auto call = dyn_cast<CallOpInterface>(op)) {
         // TODO(riverriddle) Support inlining nested call references.
         CallInterfaceCallable callable = call.getCallableForCallee();
@@ -93,18 +352,20 @@ static void collectCallOps(iterator_range<Region::iterator> blocks,
             continue;
         }
 
-        CallGraphNode *node = cg.resolveCallable(call);
-        if (!node->isExternal())
-          calls.emplace_back(call, node);
+        CallGraphNode *targetNode = cg.resolveCallable(call);
+        if (!targetNode->isExternal())
+          calls.emplace_back(call, sourceNode, targetNode);
         continue;
       }
 
       // If this is not a call, traverse the nested regions. If
       // `traverseNestedCGNodes` is false, then don't traverse nested call graph
       // regions.
-      for (auto &nestedRegion : op.getRegions())
-        if (traverseNestedCGNodes || !cg.lookupNode(&nestedRegion))
-          addToWorklist(nestedRegion);
+      for (auto &nestedRegion : op.getRegions()) {
+        CallGraphNode *nestedNode = cg.lookupNode(&nestedRegion);
+        if (traverseNestedCGNodes || !nestedNode)
+          addToWorklist(nestedNode ? nestedNode : sourceNode, nestedRegion);
+      }
     }
   }
 }
@@ -122,7 +383,16 @@ struct Inliner : public InlinerInterface {
   /// *before* inlined terminator operations have been processed.
   void
   processInlinedBlocks(iterator_range<Region::iterator> inlinedBlocks) final {
-    collectCallOps(inlinedBlocks, cg, calls, /*traverseNestedCGNodes=*/true);
+    // Find the closest callgraph node from the first block.
+    CallGraphNode *node;
+    Region *region = inlinedBlocks.begin()->getParent();
+    while (!(node = cg.lookupNode(region))) {
+      region = region->getParentRegion();
+      assert(region && "expected valid parent node");
+    }
+
+    collectCallOps(inlinedBlocks, node, cg, calls,
+                   /*traverseNestedCGNodes=*/true);
   }
 
   /// The current set of call instructions to consider for inlining.
@@ -150,24 +420,47 @@ static bool shouldInline(ResolvedCall &resolvedCall) {
   return true;
 }
 
+/// Delete the given node and remove it from the current scc and the callgraph.
+static void deleteNode(CallGraphNode *node, CGUseList &useList, CallGraph &cg,
+                       MutableArrayRef<CallGraphNode *> currentSCC) {
+  // Erase the parent operation and remove it from the various lists.
+  node->getCallableRegion()->getParentOp()->erase();
+  cg.eraseNode(node);
+
+  // Replace this node in the currentSCC with the external node.
+  auto it = llvm::find(currentSCC, node);
+  if (it != currentSCC.end())
+    *it = cg.getExternalNode();
+}
+
 /// Attempt to inline calls within the given scc. This function returns
 /// success if any calls were inlined, failure otherwise.
-static LogicalResult inlineCallsInSCC(Inliner &inliner,
-                                      ArrayRef<CallGraphNode *> currentSCC) {
+static LogicalResult
+inlineCallsInSCC(Inliner &inliner, CGUseList &useList,
+                 MutableArrayRef<CallGraphNode *> currentSCC) {
   CallGraph &cg = inliner.cg;
   auto &calls = inliner.calls;
 
   // Collect all of the direct calls within the nodes of the current SCC. We
   // don't traverse nested callgraph nodes, because they are handled separately
   // likely within a different SCC.
-  for (auto *node : currentSCC) {
-    if (!node->isExternal())
-      collectCallOps(*node->getCallableRegion(), cg, calls,
+  for (CallGraphNode *node : currentSCC) {
+    if (node->isExternal())
+      continue;
+
+    // If this node is dead, just delete it now.
+    if (useList.isDead(node))
+      deleteNode(node, useList, cg, currentSCC);
+    else
+      collectCallOps(*node->getCallableRegion(), node, cg, calls,
                      /*traverseNestedCGNodes=*/false);
   }
   if (calls.empty())
     return failure();
 
+  // A set of dead nodes to remove after inlining.
+  SmallVector<CallGraphNode *, 1> deadNodes;
+
   // Try to inline each of the call operations. Don't cache the end iterator
   // here as more calls may be added during inlining.
   bool inlinedAnyCalls = false;
@@ -179,26 +472,44 @@ static LogicalResult inlineCallsInSCC(Inliner &inliner,
     });
     if (!shouldInline(it))
       continue;
-
     CallOpInterface call = it.call;
     Region *targetRegion = it.targetNode->getCallableRegion();
+
+    // If this is the last call to the target node and the node is discardable,
+    // then inline it in-place and delete the node if successful.
+    bool inlineInPlace = useList.hasOneUseAndDiscardable(it.targetNode);
+
     LogicalResult inlineResult = inlineCall(
         inliner, call, cast<CallableOpInterface>(targetRegion->getParentOp()),
-        targetRegion);
+        targetRegion, /*shouldCloneInlinedRegion=*/!inlineInPlace);
     if (failed(inlineResult))
       continue;
+    inlinedAnyCalls = true;
+
+    // If the inlining was successful, Merge the new uses into the source node.
+    useList.dropCallUses(it.sourceNode, call.getOperation(), cg);
+    useList.mergeUsesAfterInlining(it.targetNode, it.sourceNode);
 
-    // If the inlining was successful, then erase the call.
+    // then erase the call.
     call.erase();
-    inlinedAnyCalls = true;
+
+    // If we inlined in place, mark the node for deletion.
+    if (inlineInPlace) {
+      useList.eraseNode(it.targetNode);
+      deadNodes.push_back(it.targetNode);
+    }
   }
+
+  for (CallGraphNode *node : deadNodes)
+    deleteNode(node, useList, cg, currentSCC);
   calls.clear();
   return success(inlinedAnyCalls);
 }
 
 /// Canonicalize the nodes within the given SCC with the given set of
 /// canonicalization patterns.
-static void canonicalizeSCC(CallGraph &cg, ArrayRef<CallGraphNode *> currentSCC,
+static void canonicalizeSCC(CallGraph &cg, CGUseList &useList,
+                            MutableArrayRef<CallGraphNode *> currentSCC,
                             MLIRContext *context,
                             const OwningRewritePatternList &canonPatterns) {
   // Collect the sets of nodes to canonicalize.
@@ -246,12 +557,17 @@ static void canonicalizeSCC(CallGraph &cg, ArrayRef<CallGraphNode *> currentSCC,
         // thread may be used in a different context.
         canonicalizationHandler.eraseOrderIDForThread();
       });
+
+  // Recompute the uses held by each of the nodes.
+  for (CallGraphNode *node : nodesToCanonicalize)
+    useList.recomputeUses(node, cg);
 }
 
 /// Attempt to inline calls within the given scc, and run canonicalizations with
 /// the given patterns, until a fixed point is reached. This allows for the
 /// inlining of newly devirtualized calls.
-static void inlineSCC(Inliner &inliner, ArrayRef<CallGraphNode *> currentSCC,
+static void inlineSCC(Inliner &inliner, CGUseList &useList,
+                      MutableArrayRef<CallGraphNode *> currentSCC,
                       MLIRContext *context,
                       const OwningRewritePatternList &canonPatterns) {
   // If we successfully inlined any calls, run some simplifications on the
@@ -259,12 +575,12 @@ static void inlineSCC(Inliner &inliner, ArrayRef<CallGraphNode *> currentSCC,
   // point, or a maximum iteration count. We canonicalize here as it may
   // devirtualize new calls, as well as give us a better cost model.
   unsigned iterationCount = 0;
-  while (succeeded(inlineCallsInSCC(inliner, currentSCC))) {
+  while (succeeded(inlineCallsInSCC(inliner, useList, currentSCC))) {
     // If we aren't allowing simplifications or the max iteration count was
     // reached, then bail out early.
     if (disableCanonicalization || ++iterationCount >= maxInliningIterations)
       break;
-    canonicalizeSCC(inliner.cg, currentSCC, context, canonPatterns);
+    canonicalizeSCC(inliner.cg, useList, currentSCC, context, canonPatterns);
   }
 }
 
@@ -272,8 +588,6 @@ static void inlineSCC(Inliner &inliner, ArrayRef<CallGraphNode *> currentSCC,
 // InlinerPass
 //===----------------------------------------------------------------------===//
 
-// TODO(riverriddle) This pass should currently only be used for basic testing
-// of inlining functionality.
 namespace {
 struct InlinerPass : public OperationPass<InlinerPass> {
   void runOnOperation() override {
@@ -297,8 +611,9 @@ struct InlinerPass : public OperationPass<InlinerPass> {
 
     // Run the inline transform in post-order over the SCCs in the callgraph.
     Inliner inliner(context, cg);
-    runTransformOnCGSCCs(cg, [&](ArrayRef<CallGraphNode *> scc) {
-      inlineSCC(inliner, scc, context, canonPatterns);
+    CGUseList useList(getOperation(), cg);
+    runTransformOnCGSCCs(cg, [&](MutableArrayRef<CallGraphNode *> scc) {
+      inlineSCC(inliner, useList, scc, context, canonPatterns);
     });
   }
 };

diff --git a/mlir/test/Transforms/inlining-dce.mlir b/mlir/test/Transforms/inlining-dce.mlir
new file mode 100644
index 000000000000..d9c8bf983c19
--- /dev/null
+++ b/mlir/test/Transforms/inlining-dce.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-opt %s -inline | FileCheck %s
+
+// This file tests the callgraph dead code elimination performed by the inliner.
+
+// Function is already dead.
+// CHECK-NOT: func @dead_function
+func @dead_function() attributes {sym_visibility = "private"} {
+  return
+}
+
+// Function becomes dead after inlining.
+// CHECK-NOT: func @dead_function_b
+func @dead_function_b() attributes {sym_visibility = "private"} {
+  return
+}
+
+// CHECK: func @live_function()
+func @live_function() {
+  call @dead_function_b() : () -> ()
+  return
+}
+
+// Same as above, but a transitive example.
+
+// CHECK: func @live_function_b
+func @live_function_b() {
+  return
+}
+// CHECK-NOT: func @dead_function_c
+func @dead_function_c() attributes {sym_visibility = "private"} {
+  call @live_function_b() : () -> ()
+  return
+}
+// CHECK-NOT: func @dead_function_d
+func @dead_function_d() attributes {sym_visibility = "private"} {
+  call @dead_function_c() : () -> ()
+  call @dead_function_c() : () -> ()
+  return
+}
+// CHECK: func @live_function_c
+func @live_function_c() {
+  call @dead_function_c() : () -> ()
+  call @dead_function_d() : () -> ()
+  return
+}
+
+// Function is referenced by non-callable top-level user.
+// CHECK: func @live_function_d
+func @live_function_d() attributes {sym_visibility = "private"} {
+  return
+}
+
+"live.user"() {use = @live_function_d} : () -> ()