[llvm] [LCSSA] Cache the loop exit blocks across recursive analysis (NFC) (PR #101087)

Tue Jul 30 13:22:34 PDT 2024

https://github.com/teresajohnson updated https://github.com/llvm/llvm-project/pull/101087

>From 45deb5610d5060a9e7c6ce14ac19ae2e04566826 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Mon, 29 Jul 2024 14:21:20 -0700
Subject: [PATCH 1/2] [LCSSA] Cache the loop exit blocks across recursive
 analysis (NFC)

The computation of loop exit blocks recently showed up as a huge
compile-time cost for a large file. This computation was already cached
within a single invocation of formLCSSAForInstructions, but it can also
be cached across its callers formLCSSA and formLCSSARecursively (the
latter was what was being invoked in the examined case).

Since each of these functions is an external entry point invoked from
other passes, doing so required splitting each one into a worker
function that takes a LoopExitBlocks map and an externally callable
wrapper that declares the map. That way the map can be passed down from
the outermost formLCSSARecursively call.

This reduced the time spent in the LCSSA pass from ~110s to ~1s.
---
 llvm/lib/Transforms/Utils/LCSSA.cpp | 73 ++++++++++++++++++++++-------
 1 file changed, 56 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp
index ab1edf47d8db0..3f5d00f9d8e1f 100644
--- a/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -74,21 +74,17 @@ static bool isExitBlock(BasicBlock *BB,
 /// For every instruction from the worklist, check to see if it has any uses
 /// that are outside the current loop.  If so, insert LCSSA PHI nodes and
 /// rewrite the uses.
-bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
-                                    const DominatorTree &DT, const LoopInfo &LI,
-                                    ScalarEvolution *SE,
-                                    SmallVectorImpl<PHINode *> *PHIsToRemove,
-                                    SmallVectorImpl<PHINode *> *InsertedPHIs) {
+static bool formLCSSAForInstructionsWorker(
+    SmallVectorImpl<Instruction *> &Worklist, const DominatorTree &DT,
+    const LoopInfo &LI, ScalarEvolution *SE,
+    SmallVectorImpl<PHINode *> *PHIsToRemove,
+    SmallVectorImpl<PHINode *> *InsertedPHIs,
+    SmallDenseMap<Loop *, SmallVector<BasicBlock *, 1>> &LoopExitBlocks) {
   SmallVector<Use *, 16> UsesToRewrite;
   SmallSetVector<PHINode *, 16> LocalPHIsToRemove;
   PredIteratorCache PredCache;
   bool Changed = false;
 
-  // Cache the Loop ExitBlocks across this loop.  We expect to get a lot of
-  // instructions within the same loops, computing the exit blocks is
-  // expensive, and we're not mutating the loop structure.
-  SmallDenseMap<Loop*, SmallVector<BasicBlock *,1>> LoopExitBlocks;
-
   while (!Worklist.empty()) {
     UsesToRewrite.clear();
 
@@ -317,6 +313,23 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
   return Changed;
 }
 
+/// For every instruction from the worklist, check to see if it has any uses
+/// that are outside the current loop.  If so, insert LCSSA PHI nodes and
+/// rewrite the uses.
+bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
+                                    const DominatorTree &DT, const LoopInfo &LI,
+                                    ScalarEvolution *SE,
+                                    SmallVectorImpl<PHINode *> *PHIsToRemove,
+                                    SmallVectorImpl<PHINode *> *InsertedPHIs) {
+  // Cache the Loop ExitBlocks computed during the analysis.  We expect to get a
+  // lot of instructions within the same loops, computing the exit blocks is
+  // expensive, and we're not mutating the loop structure.
+  SmallDenseMap<Loop *, SmallVector<BasicBlock *, 1>> LoopExitBlocks;
+
+  return formLCSSAForInstructionsWorker(Worklist, DT, LI, SE, PHIsToRemove,
+                                        InsertedPHIs, LoopExitBlocks);
+}
+
 // Compute the set of BasicBlocks in the loop `L` dominating at least one exit.
 static void computeBlocksDominatingExits(
     Loop &L, const DominatorTree &DT, SmallVector<BasicBlock *, 8> &ExitBlocks,
@@ -360,8 +373,9 @@ static void computeBlocksDominatingExits(
   }
 }
 
-bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
-                     ScalarEvolution *SE) {
+static bool formLCSSAWorker(
+    Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE,
+    SmallDenseMap<Loop *, SmallVector<BasicBlock *, 1>> &LoopExitBlocks) {
   bool Changed = false;
 
 #ifdef EXPENSIVE_CHECKS
@@ -374,6 +388,8 @@ bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
 
   SmallVector<BasicBlock *, 8> ExitBlocks;
   L.getExitBlocks(ExitBlocks);
+  if (!LoopExitBlocks.count(&L))
+    L.getExitBlocks(LoopExitBlocks[&L]);
   if (ExitBlocks.empty())
     return false;
 
@@ -414,26 +430,49 @@ bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
     }
   }
 
-  Changed = formLCSSAForInstructions(Worklist, DT, *LI, SE);
+  Changed = formLCSSAForInstructionsWorker(Worklist, DT, *LI, SE, nullptr,
+                                           nullptr, LoopExitBlocks);
 
   assert(L.isLCSSAForm(DT));
 
   return Changed;
 }
 
+bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
+                     ScalarEvolution *SE) {
+  // Cache the Loop ExitBlocks computed during the analysis.  We expect to get a
+  // lot of instructions within the same loops, computing the exit blocks is
+  // expensive, and we're not mutating the loop structure.
+  SmallDenseMap<Loop *, SmallVector<BasicBlock *, 1>> LoopExitBlocks;
+
+  return formLCSSAWorker(L, DT, LI, SE, LoopExitBlocks);
+}
+
 /// Process a loop nest depth first.
-bool llvm::formLCSSARecursively(Loop &L, const DominatorTree &DT,
-                                const LoopInfo *LI, ScalarEvolution *SE) {
+static bool formLCSSARecursivelyWorker(
+    Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE,
+    SmallDenseMap<Loop *, SmallVector<BasicBlock *, 1>> &LoopExitBlocks) {
   bool Changed = false;
 
   // Recurse depth-first through inner loops.
   for (Loop *SubLoop : L.getSubLoops())
-    Changed |= formLCSSARecursively(*SubLoop, DT, LI, SE);
+    Changed |= formLCSSARecursivelyWorker(*SubLoop, DT, LI, SE, LoopExitBlocks);
 
-  Changed |= formLCSSA(L, DT, LI, SE);
+  Changed |= formLCSSAWorker(L, DT, LI, SE, LoopExitBlocks);
   return Changed;
 }
 
+/// Process a loop nest depth first.
+bool llvm::formLCSSARecursively(Loop &L, const DominatorTree &DT,
+                                const LoopInfo *LI, ScalarEvolution *SE) {
+  // Cache the Loop ExitBlocks computed during the analysis.  We expect to get a
+  // lot of instructions within the same loops, computing the exit blocks is
+  // expensive, and we're not mutating the loop structure.
+  SmallDenseMap<Loop *, SmallVector<BasicBlock *, 1>> LoopExitBlocks;
+
+  return formLCSSARecursivelyWorker(L, DT, LI, SE, LoopExitBlocks);
+}
+
 /// Process all loops in the function, inner-most out.
 static bool formLCSSAOnAllLoops(const LoopInfo *LI, const DominatorTree &DT,
                                 ScalarEvolution *SE) {

>From d4f3f8e7b3e68597206cfe6984ab018d82c7a120 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Tue, 30 Jul 2024 13:22:10 -0700
Subject: [PATCH 2/2] Address comments

---
 llvm/lib/Transforms/Utils/LCSSA.cpp | 49 +++++++++++++++--------------
 1 file changed, 26 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp
index 3f5d00f9d8e1f..34581cce0fe80 100644
--- a/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -71,15 +71,18 @@ static bool isExitBlock(BasicBlock *BB,
   return is_contained(ExitBlocks, BB);
 }
 
+using LoopExitBlocksTy = SmallDenseMap<Loop *, SmallVector<BasicBlock *, 1>>;
+
 /// For every instruction from the worklist, check to see if it has any uses
 /// that are outside the current loop.  If so, insert LCSSA PHI nodes and
 /// rewrite the uses.
-static bool formLCSSAForInstructionsWorker(
-    SmallVectorImpl<Instruction *> &Worklist, const DominatorTree &DT,
-    const LoopInfo &LI, ScalarEvolution *SE,
-    SmallVectorImpl<PHINode *> *PHIsToRemove,
-    SmallVectorImpl<PHINode *> *InsertedPHIs,
-    SmallDenseMap<Loop *, SmallVector<BasicBlock *, 1>> &LoopExitBlocks) {
+static bool
+formLCSSAForInstructionsImpl(SmallVectorImpl<Instruction *> &Worklist,
+                             const DominatorTree &DT, const LoopInfo &LI,
+                             ScalarEvolution *SE,
+                             SmallVectorImpl<PHINode *> *PHIsToRemove,
+                             SmallVectorImpl<PHINode *> *InsertedPHIs,
+                             LoopExitBlocksTy &LoopExitBlocks) {
   SmallVector<Use *, 16> UsesToRewrite;
   SmallSetVector<PHINode *, 16> LocalPHIsToRemove;
   PredIteratorCache PredCache;
@@ -324,10 +327,10 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
   // Cache the Loop ExitBlocks computed during the analysis.  We expect to get a
   // lot of instructions within the same loops, computing the exit blocks is
   // expensive, and we're not mutating the loop structure.
-  SmallDenseMap<Loop *, SmallVector<BasicBlock *, 1>> LoopExitBlocks;
+  LoopExitBlocksTy LoopExitBlocks;
 
-  return formLCSSAForInstructionsWorker(Worklist, DT, LI, SE, PHIsToRemove,
-                                        InsertedPHIs, LoopExitBlocks);
+  return formLCSSAForInstructionsImpl(Worklist, DT, LI, SE, PHIsToRemove,
+                                      InsertedPHIs, LoopExitBlocks);
 }
 
 // Compute the set of BasicBlocks in the loop `L` dominating at least one exit.
@@ -373,9 +376,9 @@ static void computeBlocksDominatingExits(
   }
 }
 
-static bool formLCSSAWorker(
-    Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE,
-    SmallDenseMap<Loop *, SmallVector<BasicBlock *, 1>> &LoopExitBlocks) {
+static bool formLCSSAImpl(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
+                          ScalarEvolution *SE,
+                          LoopExitBlocksTy &LoopExitBlocks) {
   bool Changed = false;
 
 #ifdef EXPENSIVE_CHECKS
@@ -430,8 +433,8 @@ static bool formLCSSAWorker(
     }
   }
 
-  Changed = formLCSSAForInstructionsWorker(Worklist, DT, *LI, SE, nullptr,
-                                           nullptr, LoopExitBlocks);
+  Changed = formLCSSAForInstructionsImpl(Worklist, DT, *LI, SE, nullptr,
+                                         nullptr, LoopExitBlocks);
 
   assert(L.isLCSSAForm(DT));
 
@@ -443,22 +446,22 @@ bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
   // Cache the Loop ExitBlocks computed during the analysis.  We expect to get a
   // lot of instructions within the same loops, computing the exit blocks is
   // expensive, and we're not mutating the loop structure.
-  SmallDenseMap<Loop *, SmallVector<BasicBlock *, 1>> LoopExitBlocks;
+  LoopExitBlocksTy LoopExitBlocks;
 
-  return formLCSSAWorker(L, DT, LI, SE, LoopExitBlocks);
+  return formLCSSAImpl(L, DT, LI, SE, LoopExitBlocks);
 }
 
 /// Process a loop nest depth first.
-static bool formLCSSARecursivelyWorker(
-    Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE,
-    SmallDenseMap<Loop *, SmallVector<BasicBlock *, 1>> &LoopExitBlocks) {
+static bool formLCSSARecursivelyImpl(Loop &L, const DominatorTree &DT,
+                                     const LoopInfo *LI, ScalarEvolution *SE,
+                                     LoopExitBlocksTy &LoopExitBlocks) {
   bool Changed = false;
 
   // Recurse depth-first through inner loops.
   for (Loop *SubLoop : L.getSubLoops())
-    Changed |= formLCSSARecursivelyWorker(*SubLoop, DT, LI, SE, LoopExitBlocks);
+    Changed |= formLCSSARecursivelyImpl(*SubLoop, DT, LI, SE, LoopExitBlocks);
 
-  Changed |= formLCSSAWorker(L, DT, LI, SE, LoopExitBlocks);
+  Changed |= formLCSSAImpl(L, DT, LI, SE, LoopExitBlocks);
   return Changed;
 }
 
@@ -468,9 +471,9 @@ bool llvm::formLCSSARecursively(Loop &L, const DominatorTree &DT,
   // Cache the Loop ExitBlocks computed during the analysis.  We expect to get a
   // lot of instructions within the same loops, computing the exit blocks is
   // expensive, and we're not mutating the loop structure.
-  SmallDenseMap<Loop *, SmallVector<BasicBlock *, 1>> LoopExitBlocks;
+  LoopExitBlocksTy LoopExitBlocks;
 
-  return formLCSSARecursivelyWorker(L, DT, LI, SE, LoopExitBlocks);
+  return formLCSSARecursivelyImpl(L, DT, LI, SE, LoopExitBlocks);
 }
 
 /// Process all loops in the function, inner-most out.