[llvm] r253168 - [GlobalOpt] Demote globals to locals more aggressively

James Molloy via llvm-commits llvm-commits at lists.llvm.org
Sun Nov 15 06:21:37 PST 2015


Author: jamesm
Date: Sun Nov 15 08:21:37 2015
New Revision: 253168

URL: http://llvm.org/viewvc/llvm-project?rev=253168&view=rev
Log:
[GlobalOpt] Demote globals to locals more aggressively

Global to local demotion can speed up programs that use globals a lot. It is particularly useful with LTO, when the entire call graph is known and most functions have been internalized.

For a global to be demoted, it must only be accessed by one function and that function:
  1. Must never recurse directly or indirectly, else the GV would be clobbered.
  2. Must never rely on the value in GV at the start of the function (apart from the initializer).

GlobalOpt can already do this, but it is hamstrung and only ever tries to demote globals inside "main", because C++ gives extra guarantees about how main is called - once and only once.

In LTO mode, we can often prove the first property (if the function is internal by this point, we know enough about the callgraph to determine if it could possibly recurse). FunctionAttrs now infers the "norecurse" attribute for this reason.

The second property can be proven for a subset of functions by proving that all loads from GV are dominated by a store to GV. This is conservative in the name of compile time - this only requires a DominatorTree which is fairly cheap in the grand scheme of things. We could do more fancy stuff with MemoryDependenceAnalysis too to catch more cases but this appears to catch most of the useful ones in my testing.

Added:
    llvm/trunk/test/Transforms/GlobalOpt/global-demotion.ll
Modified:
    llvm/trunk/lib/Transforms/IPO/GlobalOpt.cpp
    llvm/trunk/test/Transforms/GlobalOpt/metadata.ll
    llvm/trunk/test/Transforms/MergeFunc/crash2.ll

Modified: llvm/trunk/lib/Transforms/IPO/GlobalOpt.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/IPO/GlobalOpt.cpp?rev=253168&r1=253167&r2=253168&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/IPO/GlobalOpt.cpp (original)
+++ llvm/trunk/lib/Transforms/IPO/GlobalOpt.cpp Sun Nov 15 08:21:37 2015
@@ -28,6 +28,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -69,6 +70,7 @@ namespace {
   struct GlobalOpt : public ModulePass {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<TargetLibraryInfoWrapperPass>();
+      AU.addRequired<DominatorTreeWrapperPass>();
     }
     static char ID; // Pass identification, replacement for typeid
     GlobalOpt() : ModulePass(ID) {
@@ -85,7 +87,9 @@ namespace {
     bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI,
                                const GlobalStatus &GS);
     bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn);
-
+    
+    bool isPointerValueDeadOnEntryToFunction(const Function *F, GlobalValue *GV);
+    
     TargetLibraryInfo *TLI;
     SmallSet<const Comdat *, 8> NotDiscardableComdats;
   };
@@ -95,6 +99,7 @@ char GlobalOpt::ID = 0;
 INITIALIZE_PASS_BEGIN(GlobalOpt, "globalopt",
                 "Global Variable Optimizer", false, false)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_END(GlobalOpt, "globalopt",
                 "Global Variable Optimizer", false, false)
 
@@ -1718,15 +1723,78 @@ bool GlobalOpt::ProcessGlobal(GlobalVari
   return ProcessInternalGlobal(GV, GVI, GS);
 }
 
+bool GlobalOpt::isPointerValueDeadOnEntryToFunction(const Function *F, GlobalValue *GV) {
+  // Find all uses of GV. We expect them all to be in F, and if we can't
+  // identify any of the uses we bail out.
+  //
+  // On each of these uses, identify if the memory that GV points to is
+  // used/required/live at the start of the function. If it is not, for example
+  // if the first thing the function does is store to the GV, the GV can
+  // possibly be demoted.
+  //
+  // We don't do an exhaustive search for memory operations - simply look
+  // through bitcasts as they're quite common and benign.
+  const DataLayout &DL = GV->getParent()->getDataLayout();
+  SmallVector<LoadInst *, 4> Loads;
+  SmallVector<StoreInst *, 4> Stores;
+  for (auto *U : GV->users()) {
+    if (Operator::getOpcode(U) == Instruction::BitCast) {
+      for (auto *UU : U->users()) {
+        if (auto *LI = dyn_cast<LoadInst>(UU))
+          Loads.push_back(LI);
+        else if (auto *SI = dyn_cast<StoreInst>(UU))
+          Stores.push_back(SI);
+        else
+          return false;
+      }
+      continue;
+    }
+
+    Instruction *I = dyn_cast<Instruction>(U);
+    if (!I)
+      return false;
+    assert(I->getParent()->getParent() == F);
+
+    if (auto *LI = dyn_cast<LoadInst>(I))
+      Loads.push_back(LI);
+    else if (auto *SI = dyn_cast<StoreInst>(I))
+      Stores.push_back(SI);
+    else
+      return false;
+  }
+
+  // We have identified all uses of GV into loads and stores. Now check if all
+  // of them are known not to depend on the value of the global at the function
+  // entry point. We do this by ensuring that every load is dominated by at
+  // least one store.
+  auto &DT = getAnalysis<DominatorTreeWrapperPass>(*const_cast<Function *>(F))
+                 .getDomTree();
+
+  for (auto *L : Loads) {
+    auto *LTy = L->getType();
+    if (!std::any_of(Stores.begin(), Stores.end(), [&](StoreInst *S) {
+          auto *STy = S->getValueOperand()->getType();
+          // The load is only dominated by the store if DomTree says so
+          // and the number of bits loaded in L is less than or equal to
+          // the number of bits stored in S.
+          return DT.dominates(S, L) &&
+                 DL.getTypeStoreSize(LTy) <= DL.getTypeStoreSize(STy);
+        }))
+      return false;
+  }
+  // All loads have known dependences inside F, so the global can be localized.
+  return true;
+}
+
 /// Analyze the specified global variable and optimize
 /// it if possible.  If we make a change, return true.
 bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
                                       Module::global_iterator &GVI,
                                       const GlobalStatus &GS) {
   auto &DL = GV->getParent()->getDataLayout();
-  // If this is a first class global and has only one accessing function
-  // and this function is main (which we know is not recursive), we replace
-  // the global with a local alloca in this function.
+  // If this is a first class global and has only one accessing function and
+  // this function is non-recursive, we replace the global with a local alloca
+  // in this function.
   //
   // NOTE: It doesn't make sense to promote non-single-value types since we
   // are just replacing static memory to stack memory.
@@ -1735,9 +1803,10 @@ bool GlobalOpt::ProcessInternalGlobal(Gl
   if (!GS.HasMultipleAccessingFunctions &&
       GS.AccessingFunction && !GS.HasNonInstructionUser &&
       GV->getType()->getElementType()->isSingleValueType() &&
-      GS.AccessingFunction->getName() == "main" &&
-      GS.AccessingFunction->hasExternalLinkage() &&
-      GV->getType()->getAddressSpace() == 0) {
+      GV->getType()->getAddressSpace() == 0 &&
+      !GV->isExternallyInitialized() &&
+      GS.AccessingFunction->doesNotRecurse() &&
+      isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV) ) {
     DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
     Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
                                                    ->getEntryBlock().begin());

Added: llvm/trunk/test/Transforms/GlobalOpt/global-demotion.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/GlobalOpt/global-demotion.ll?rev=253168&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/GlobalOpt/global-demotion.ll (added)
+++ llvm/trunk/test/Transforms/GlobalOpt/global-demotion.ll Sun Nov 15 08:21:37 2015
@@ -0,0 +1,80 @@
+; RUN: opt -globalopt -S < %s | FileCheck %s
+
+ at G1 = internal global i32 5
+ at G2 = internal global i32 5
+ at G3 = internal global i32 5
+ at G4 = internal global i32 5
+ at G5 = internal global i32 5
+
+; CHECK-LABEL: @test1
+define internal i32 @test1() norecurse {
+; CHECK-NOT: @G1
+  store i32 4, i32* @G1
+  %a = load i32, i32* @G1
+; CHECK: ret
+  ret i32 %a
+}
+
+; The load comes before the store which makes @G2 live before the call.
+; CHECK-LABEL: @test2
+define internal i32 @test2() norecurse {
+; CHECK-NOT: %G2
+  %a = load i32, i32* @G2
+  store i32 4, i32* @G2
+; CHECK: ret
+  ret i32 %a
+}
+
+; This global is indexed by a GEP - this makes it partial alias and we bail out.
+; FIXME: We don't actually have to bail out in this case.
+
+; CHECK-LABEL: @test3
+define internal i32 @test3() norecurse {
+; CHECK-NOT: %G3
+  %x = getelementptr i32,i32* @G3, i32 0
+  %a = load i32, i32* %x
+  store i32 4, i32* @G3
+; CHECK: ret
+  ret i32 %a
+}
+
+; The global is casted away to a larger type then loaded. The store only partially
+; covers the load, so we must not demote.
+
+; CHECK-LABEL: @test4
+define internal i32 @test4() norecurse {
+; CHECK-NOT: %G4
+  store i32 4, i32* @G4
+  %x = bitcast i32* @G4 to i64*
+  %a = load i64, i64* %x
+  %b = trunc i64 %a to i32
+; CHECK: ret
+  ret i32 %b
+}
+
+; The global is casted away to a smaller type then loaded. This one is fine.
+
+; CHECK-LABEL: @test5
+define internal i32 @test5() norecurse {
+; CHECK-NOT: @G5
+  store i32 4, i32* @G5
+  %x = bitcast i32* @G5 to i16*
+  %a = load i16, i16* %x
+  %b = zext i16 %a to i32
+; CHECK: ret
+  ret i32 %b
+}
+
+define i32 @main() norecurse {
+  %a = call i32 @test1()
+  %b = call i32 @test2()
+  %c = call i32 @test3()
+  %d = call i32 @test4()
+  %e = call i32 @test5()
+
+  %x = or i32 %a, %b
+  %y = or i32 %x, %c
+  %z = or i32 %y, %d
+  %w = or i32 %z, %e
+  ret i32 %w
+}

Modified: llvm/trunk/test/Transforms/GlobalOpt/metadata.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/GlobalOpt/metadata.ll?rev=253168&r1=253167&r2=253168&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/GlobalOpt/metadata.ll (original)
+++ llvm/trunk/test/Transforms/GlobalOpt/metadata.ll Sun Nov 15 08:21:37 2015
@@ -5,7 +5,7 @@
 ; to that containing %G should likewise drop to null.
 @G = internal global i8** null
 
-define i32 @main(i32 %argc, i8** %argv) {
+define i32 @main(i32 %argc, i8** %argv) norecurse {
 ; CHECK-LABEL: @main(
 ; CHECK: %G = alloca
   store i8** %argv, i8*** @G

Modified: llvm/trunk/test/Transforms/MergeFunc/crash2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/MergeFunc/crash2.ll?rev=253168&r1=253167&r2=253168&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/MergeFunc/crash2.ll (original)
+++ llvm/trunk/test/Transforms/MergeFunc/crash2.ll Sun Nov 15 08:21:37 2015
@@ -11,7 +11,7 @@
 @G = internal global i8** null
 @G2 = internal global i8** null
 
-define i32 @main(i32 %argc, i8** %argv) {
+define i32 @main(i32 %argc, i8** %argv) norecurse {
 ; CHECK: alloca
   store i8** %argv, i8*** @G
   ret i32 0




More information about the llvm-commits mailing list