[llvm-branch-commits] [llvm] 1ab4db0 - [HotColdSplit] Reflect full cost of parameters in split penalty

Fri Dec 18 17:11:08 PST 2020

Author: Aditya Kumar
Date: 2020-12-18T17:06:17-08:00
New Revision: 1ab4db0f847fa1ddd394dbf54a5051b626eab160

URL: https://github.com/llvm/llvm-project/commit/1ab4db0f847fa1ddd394dbf54a5051b626eab160
DIFF: https://github.com/llvm/llvm-project/commit/1ab4db0f847fa1ddd394dbf54a5051b626eab160.diff

LOG: [HotColdSplit] Reflect full cost of parameters in split penalty

Make the penalty for splitting a region more accurately reflect the cost
of materializing all of the inputs/outputs to/from the region.

This almost entirely eliminates code growth within functions which
undergo splitting in key internal frameworks, and reduces the size of
those frameworks by 2.6% to 3%.

rdar://49167240

Patch by: Vedant Kumar(@vsk)
Reviewers: hiraditya,rjf,t.p.northover
Reviewed By: hiraditya,rjf

Differential Revision: https://reviews.llvm.org/D59715

Added: 
    

Modified: 
    llvm/lib/Transforms/IPO/HotColdSplitting.cpp
    llvm/test/Transforms/CodeExtractor/extract-assume.ll
    llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll
    llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll
    llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll
    llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
index b25b789d1dae..aa708ee520b1 100644

--- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -67,6 +67,7 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <algorithm>
+#include <limits>
 #include <cassert>
 #include <string>
 
@@ -96,6 +97,10 @@ static cl::opt<std::string>
                     cl::desc("Name for the section containing cold functions "
                              "extracted by hot-cold splitting."));
 
+static cl::opt<int> MaxParametersForSplit(
+    "hotcoldsplit-max-params", cl::init(4), cl::Hidden,
+    cl::desc("Maximum number of parameters for a split function"));
+
 namespace {
 // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
 // this function unless you modify the MBB version as well.
@@ -257,18 +262,6 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
   if (SplittingThreshold <= 0)
     return Penalty;
 
-  // The typical code size cost for materializing an argument for the outlined
-  // call.
-  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumInputs << " inputs\n");
-  const int CostForArgMaterialization = TargetTransformInfo::TCC_Basic;
-  Penalty += CostForArgMaterialization * NumInputs;
-
-  // The typical code size cost for an output alloca, its associated store, and
-  // its associated reload.
-  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputs << " outputs\n");
-  const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
-  Penalty += CostForRegionOutput * NumOutputs;
-
   // Find the number of distinct exit blocks for the region. Use a conservative
   // check to determine whether control returns from the region.
   bool NoBlocksReturn = true;
@@ -289,6 +282,48 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
     }
   }
 
+  // Count the number of phis in exit blocks with >= 2 incoming values from the
+  // outlining region. These phis are split (\ref severSplitPHINodesOfExits),
+  // and new outputs are created to supply the split phis. CodeExtractor can't
+  // report these new outputs until extraction begins, but it's important to
+  // factor the cost of the outputs into the cost calculation.
+  unsigned NumSplitExitPhis = 0;
+  for (BasicBlock *ExitBB : SuccsOutsideRegion) {
+    for (PHINode &PN : ExitBB->phis()) {
+      // Find all incoming values from the outlining region.
+      int NumIncomingVals = 0;
+      for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
+        if (find(Region, PN.getIncomingBlock(i)) != Region.end()) {
+          ++NumIncomingVals;
+          if (NumIncomingVals > 1) {
+            ++NumSplitExitPhis;
+            break;
+          }
+        }
+    }
+  }
+
+  // Apply a penalty for calling the split function. Factor in the cost of
+  // materializing all of the parameters.
+  int NumOutputsAndSplitPhis = NumOutputs + NumSplitExitPhis;
+  int NumParams = NumInputs + NumOutputsAndSplitPhis;
+  if (NumParams > MaxParametersForSplit) {
+    LLVM_DEBUG(dbgs() << NumInputs << " inputs and " << NumOutputsAndSplitPhis
+                      << " outputs exceeds parameter limit ("
+                      << MaxParametersForSplit << ")\n");
+    return std::numeric_limits<int>::max();
+  }
+  const int CostForArgMaterialization = 2 * TargetTransformInfo::TCC_Basic;
+  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumParams << " params\n");
+  Penalty += CostForArgMaterialization * NumParams;
+
+  // Apply the typical code size cost for an output alloca and its associated
+  // reload in the caller. Also penalize the associated store in the callee.
+  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputsAndSplitPhis
+                    << " outputs/split phis\n");
+  const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
+  Penalty += CostForRegionOutput * NumOutputsAndSplitPhis;
+
   // Apply a `noreturn` bonus.
   if (NoBlocksReturn) {
     LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
@@ -298,7 +333,7 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
 
   // Apply a penalty for having more than one successor outside of the region.
   // This penalty accounts for the switch needed in the caller.
-  if (!SuccsOutsideRegion.empty()) {
+  if (SuccsOutsideRegion.size() > 1) {
     LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
                       << " non-region successors\n");
     Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;

diff  --git a/llvm/test/Transforms/CodeExtractor/extract-assume.ll b/llvm/test/Transforms/CodeExtractor/extract-assume.ll
index bf0d2ecb2d6b..ffba771856fb 100644
--- a/llvm/test/Transforms/CodeExtractor/extract-assume.ll
+++ b/llvm/test/Transforms/CodeExtractor/extract-assume.ll
@@ -1,4 +1,4 @@
-; RUN: opt -passes="function(slp-vectorizer),module(hotcoldsplit),function(slp-vectorizer,print<assumptions>)" -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -passes="function(slp-vectorizer),module(hotcoldsplit),function(slp-vectorizer,print<assumptions>)" -hotcoldsplit-threshold=-1 -disable-output %s 2>&1 | FileCheck %s
 ;
 ; Make sure this compiles. Check that function assumption cache is refreshed
 ; after extracting blocks with assume calls from the function.

diff  --git a/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll
index fffd6f9f5dcf..4906316816e9 100644
--- a/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll
+++ b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -hotcoldsplit-threshold=2 -hotcoldsplit-max-params=2 -S < %s -o /dev/null 2>&1 | FileCheck %s
 
 declare void @sink(i32*, i32, i32) cold
 
@@ -10,10 +10,27 @@ define void @foo(i32 %arg) {
   br i1 undef, label %cold, label %exit
 
 cold:
-  ; CHECK: Applying penalty for: 2 inputs
+  ; CHECK: Applying penalty for splitting: 2
+  ; CHECK-NEXT: Applying penalty for: 2 params
+  ; CHECK-NEXT: Applying penalty for: 0 outputs/split phis
+  ; CHECK-NEXT: penalty = 6
   call void @sink(i32* @g, i32 %arg, i32 %local)
   ret void
 
 exit:
   ret void
 }
+
+define void @bar(i32* %p1, i32 %p2, i32 %p3) {
+  br i1 undef, label %cold, label %exit
+
+cold:
+  ; CHECK: Applying penalty for splitting: 2
+  ; CHECK-NEXT: 3 inputs and 0 outputs exceeds parameter limit (2)
+  ; CHECK-NEXT: penalty = 2147483647
+  call void @sink(i32* %p1, i32 %p2, i32 %p3)
+  ret void
+
+exit:
+  ret void
+}

diff  --git a/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll
index a7d9f97ab030..b7bf760b90c4 100644
--- a/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll
+++ b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -hotcoldsplit-threshold=2 -S < %s -o /dev/null 2>&1 | FileCheck %s
 
 declare void @sink() cold
 
@@ -10,8 +10,10 @@ entry:
   br i1 undef, label %cold, label %exit
 
 cold:
-  ; CHECK: Applying penalty for: 1 output
-  ; CHECK: Applying penalty for: 1 non-region successors
+  ; CHECK: Applying penalty for splitting: 2
+  ; CHECK-NEXT: Applying penalty for: 1 params
+  ; CHECK-NEXT: Applying penalty for: 1 outputs/split phis
+  ; CHECK-NEXT: penalty = 7
   %local = load i32, i32* @g
   call void @sink()
   br label %exit

diff  --git a/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll b/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll
index 3886d76da016..a9e3fc62e37c 100644
--- a/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll
+++ b/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -hotcoldsplit-threshold=2 -S < %s -o /dev/null 2>&1 | FileCheck %s
 
 declare void @sink() cold
 
@@ -9,7 +9,10 @@ entry:
   br i1 undef, label %cold1, label %exit
 
 cold1:
-  ; CHECK: Applying penalty for: 1 non-region successor
+  ; CHECK: Applying penalty for splitting: 2
+  ; CHECK-NEXT: Applying penalty for: 0 params
+  ; CHECK-NEXT: Applying penalty for: 0 outputs/split phis
+  ; CHECK-NEXT: penalty = 2
   call void @sink()
   br i1 undef, label %cold2, label %cold3
 
@@ -32,7 +35,11 @@ entry:
   br i1 undef, label %cold1, label %exit1
 
 cold1:
-  ; CHECK: Applying penalty for: 2 non-region successors
+  ; CHECK: Applying penalty for splitting: 2
+  ; CHECK-NEXT: Applying penalty for: 0 params
+  ; CHECK-NEXT: Applying penalty for: 0 outputs/split phis
+  ; CHECK-NEXT: Applying penalty for: 2 non-region successors
+  ; CHECK-NEXT: penalty = 3
   call void @sink()
   br i1 undef, label %cold2, label %cold3
 

diff  --git a/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll b/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll
index bdb46d584dcb..465d0e6add77 100644
--- a/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll
+++ b/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt -S -instsimplify -hotcoldsplit -debug < %s 2>&1 | FileCheck %s
+; RUN: opt -S -instsimplify -hotcoldsplit -hotcoldsplit-threshold=-1 -debug < %s 2>&1 | FileCheck %s
 ; RUN: opt -instcombine -hotcoldsplit -instsimplify %s -o /dev/null
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@@ -13,7 +13,10 @@ target triple = "aarch64"
 ; CHECK-NOT: @llvm.assume
 ; CHECK: }
 ; CHECK: declare {{.*}}@llvm.assume
-; CHECK: define {{.*}}@f.cold.1(i64 %0)
+; CHECK: define {{.*}}@f.cold.1()
+; CHECK-LABEL: newFuncRoot:
+; CHECK: }
+; CHECK: define {{.*}}@f.cold.2(i64 %0)
 ; CHECK-LABEL: newFuncRoot:
 ; CHECK: %1 = icmp eq i64 %0, 0
 ; CHECK-NOT: call void @llvm.assume