[llvm] 6327d26 - [CHR] Add a threshold for the code duplication

Rong Xu via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 22 11:38:07 PST 2022


Author: Rong Xu
Date: 2022-11-22T11:36:40-08:00
New Revision: 6327d263f5e2a18645979fac2525b574866adbe5

URL: https://github.com/llvm/llvm-project/commit/6327d263f5e2a18645979fac2525b574866adbe5
DIFF: https://github.com/llvm/llvm-project/commit/6327d263f5e2a18645979fac2525b574866adbe5.diff

LOG: [CHR] Add a threshold for the code duplication

ControlHeightReduction (CHR) clones the code region to reduce the
branches in the hot code path. The number of clones is linear to the
depth of the region.

Currently it does not have control over the code size increase. We are
seeing one ~9000 BB function get expanded to ~250000 BBs, a 25x
increase. This creates a big compile time issue for the downstream
optimizations.

This patch adds a cap on the number of clones for one region.

Differential Revision: https://reviews.llvm.org/D138333

Added: 
    llvm/test/Transforms/PGOProfile/chr-dup-threshold.ll

Modified: 
    llvm/lib/Passes/PassBuilderPipelines.cpp
    llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 285645b8ba47f..022861318ce95 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -675,8 +675,12 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   FPM.addPass(InstCombinePass());
   invokePeepholeEPCallbacks(FPM, Level);
 
+  // Don't add CHR pass for CSIRInstr build in PostLink as the profile
+  // is still the same as the PreLink compilation.
   if (EnableCHR && Level == OptimizationLevel::O3 && PGOOpt &&
-      (PGOOpt->Action == PGOOptions::IRUse ||
+      ((PGOOpt->Action == PGOOptions::IRUse &&
+        (Phase != ThinOrFullLTOPhase::ThinLTOPostLink ||
+         PGOOpt->CSAction != PGOOptions::CSIRInstr)) ||
        PGOOpt->Action == PGOOptions::SampleUse))
     FPM.addPass(ControlHeightReductionPass());
 

diff  --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 8ecf2cfdcbe13..aee9718ff0c78 100644
--- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -47,6 +47,9 @@ using namespace llvm;
 
 #define CHR_DEBUG(X) LLVM_DEBUG(X)
 
+static cl::opt<bool> DisableCHR("disable-chr", cl::init(false), cl::Hidden,
+                                cl::desc("Disable CHR for all functions"));
+
 static cl::opt<bool> ForceCHR("force-chr", cl::init(false), cl::Hidden,
                               cl::desc("Apply CHR for all functions"));
 
@@ -66,6 +69,10 @@ static cl::opt<std::string> CHRFunctionList(
     "chr-function-list", cl::init(""), cl::Hidden,
     cl::desc("Specify file to retrieve the list of functions to apply CHR to"));
 
+static cl::opt<unsigned> CHRDupThreshsold(
+    "chr-dup-threshold", cl::init(3), cl::Hidden,
+    cl::desc("Max number of duplications by CHR for a region"));
+
 static StringSet<> CHRModules;
 static StringSet<> CHRFunctions;
 
@@ -339,23 +346,27 @@ class CHR {
                                  BasicBlock *EntryBlock,
                                  BasicBlock *NewEntryBlock,
                                  ValueToValueMapTy &VMap);
-  void fixupBranchesAndSelects(CHRScope *Scope,
-                               BasicBlock *PreEntryBlock,
-                               BranchInst *MergedBR,
-                               uint64_t ProfileCount);
-  void fixupBranch(Region *R,
-                   CHRScope *Scope,
-                   IRBuilder<> &IRB,
+  void fixupBranchesAndSelects(CHRScope *Scope, BasicBlock *PreEntryBlock,
+                               BranchInst *MergedBR, uint64_t ProfileCount);
+  void fixupBranch(Region *R, CHRScope *Scope, IRBuilder<> &IRB,
                    Value *&MergedCondition, BranchProbability &CHRBranchBias);
-  void fixupSelect(SelectInst* SI,
-                   CHRScope *Scope,
-                   IRBuilder<> &IRB,
+  void fixupSelect(SelectInst *SI, CHRScope *Scope, IRBuilder<> &IRB,
                    Value *&MergedCondition, BranchProbability &CHRBranchBias);
   void addToMergedCondition(bool IsTrueBiased, Value *Cond,
-                            Instruction *BranchOrSelect,
-                            CHRScope *Scope,
-                            IRBuilder<> &IRB,
-                            Value *&MergedCondition);
+                            Instruction *BranchOrSelect, CHRScope *Scope,
+                            IRBuilder<> &IRB, Value *&MergedCondition);
+  // Returns how many times region R has been cloned. If the parent of R is
+  // cloned, R is also cloned, but R's own clone count is not updated from
+  // the clone of the parent, so accumulate the counts of R and all of its
+  // ancestors.
+  unsigned getRegionDuplicationCount(const Region *R) {
+    unsigned Count = 0;
+    // lookup() yields 0 for a region with no entry and, unlike operator[],
+    // does not insert one into DuplicationCount.
+    for (; R; R = R->getParent())
+      Count += DuplicationCount.lookup(R);
+    return Count;
+  }
 
   Function &F;
   BlockFrequencyInfo &BFI;
@@ -379,6 +390,8 @@ class CHR {
   DenseMap<SelectInst *, BranchProbability> SelectBiasMap;
   // All the scopes.
   DenseSet<CHRScope *> Scopes;
+  // Records how many times each region is cloned.
+  DenseMap<const Region *, unsigned> DuplicationCount;
 };
 
 } // end anonymous namespace
@@ -396,7 +409,10 @@ raw_ostream &operator<<(raw_ostream &OS, const CHRScope &Scope) {
   return OS;
 }
 
-static bool shouldApply(Function &F, ProfileSummaryInfo& PSI) {
+static bool shouldApply(Function &F, ProfileSummaryInfo &PSI) {
+  if (DisableCHR)
+    return false;
+
   if (ForceCHR)
     return true;
 
@@ -1666,6 +1682,26 @@ void CHR::transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs) {
   CHR_DEBUG(dbgs() << "transformScopes " << *Scope << "\n");
 
   assert(Scope->RegInfos.size() >= 1 && "Should have at least one Region");
+
+  // Skip this scope if any of its regions has reached the duplication cap.
+  for (RegInfo &RI : Scope->RegInfos) {
+    unsigned Duplication = getRegionDuplicationCount(RI.R);
+    CHR_DEBUG(dbgs() << "Dup count for R=" << RI.R << "  is " << Duplication
+                     << "\n");
+    if (Duplication >= CHRDupThreshsold) {
+      CHR_DEBUG(dbgs() << "Reached the dup threshold of " << Duplication
+                       << " for this region");
+      ORE.emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE, "DupThresholdReached",
+                                        RI.R->getEntry()->getTerminator())
+               << "Reached the duplication threshold for the region";
+      });
+      return;
+    }
+  }
+  for (RegInfo &RI : Scope->RegInfos)
+    DuplicationCount[RI.R]++;
+
   Region *FirstRegion = Scope->RegInfos[0].R;
   BasicBlock *EntryBlock = FirstRegion->getEntry();
   Region *LastRegion = Scope->RegInfos[Scope->RegInfos.size() - 1].R;

diff  --git a/llvm/test/Transforms/PGOProfile/chr-dup-threshold.ll b/llvm/test/Transforms/PGOProfile/chr-dup-threshold.ll
new file mode 100644
index 0000000000000..d3c3cb2e7ca70
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/chr-dup-threshold.ll
@@ -0,0 +1,195 @@
+; Test case for capping the cloning in CHR.
+; RUN: opt < %s -passes='require<profile-summary>,function(chr)' -chr-dup-threshold=2 -S | FileCheck %s
+
+; c sources for the test case.
+; extern void foo(int);
+; __attribute__((noinline)) void goo(int r, int s, int t) {
+;   if ((r & 2) != 0) {
+;     if ((s & 2) != 0) {
+;       if ((t & 2) != 0) {
+;         foo(111);
+;       }
+;       if ((t & 4) != 0) {
+;         foo(112);
+;       }
+;     }
+;     if ((s & 4) != 0) {
+;       if ((t & 2) != 0) {
+;         foo(121);
+;       }
+;       if ((t & 4) != 0) {
+;         foo(122);
+;       }
+;     }
+;   }
+;   if ((r & 4) != 0) {
+;     if ((s & 2) != 0) {
+;       if ((t & 2) != 0) {
+;         foo(211);
+;       }
+;       if ((t & 4) != 0) {
+;         foo(212);
+;       }
+;     }
+;     if ((s & 4) != 0) {
+;       if ((t & 2) != 0) {
+;         foo(221);
+;       }
+;       if ((t & 4) != 0) {
+;         foo(222);
+;       }
+;     }
+;   }
+; }
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local void @goo(i32 noundef %r, i32 noundef %s, i32 noundef %t) !prof !34 {
+entry:
+  %and = and i32 %r, 2
+  %cmp.not = icmp eq i32 %and, 0
+  br i1 %cmp.not, label %if.end24, label %if.then, !prof !35
+
+if.then:
+  %and1 = and i32 %s, 2
+  %cmp2.not = icmp eq i32 %and1, 0
+  br i1 %cmp2.not, label %if.end11, label %if.then3, !prof !35
+
+if.then3:
+  %and4 = and i32 %t, 2
+  %cmp5.not = icmp eq i32 %and4, 0
+  br i1 %cmp5.not, label %if.end, label %if.then6, !prof !35
+
+if.then6:
+  tail call void @foo(i32 noundef 111)
+  br label %if.end
+
+if.end:
+  %and7 = and i32 %t, 4
+  %cmp8.not = icmp eq i32 %and7, 0
+  br i1 %cmp8.not, label %if.end11, label %if.then9, !prof !35
+
+if.then9:
+  tail call void @foo(i32 noundef 112)
+  br label %if.end11
+
+if.end11:
+  %and12 = and i32 %s, 4
+  %cmp13.not = icmp eq i32 %and12, 0
+  br i1 %cmp13.not, label %if.end24, label %if.then14, !prof !35
+
+if.then14:
+  %and15 = and i32 %t, 2
+  %cmp16.not = icmp eq i32 %and15, 0
+  br i1 %cmp16.not, label %if.end18, label %if.then17, !prof !35
+
+if.then17:
+  tail call void @foo(i32 noundef 121)
+  br label %if.end18
+
+if.end18:
+  %and19 = and i32 %t, 4
+  %cmp20.not = icmp eq i32 %and19, 0
+  br i1 %cmp20.not, label %if.end24, label %if.then21, !prof !35
+
+if.then21:
+  tail call void @foo(i32 noundef 122)
+  br label %if.end24
+
+if.end24:
+  %and25 = and i32 %r, 4
+  %cmp26.not = icmp eq i32 %and25, 0
+  br i1 %cmp26.not, label %if.end52, label %if.then27, !prof !35
+
+if.then27:
+  %and28 = and i32 %s, 2
+  %cmp29.not = icmp eq i32 %and28, 0
+  br i1 %cmp29.not, label %if.end39, label %if.then30, !prof !35
+
+if.then30:
+  %and31 = and i32 %t, 2
+  %cmp32.not = icmp eq i32 %and31, 0
+  br i1 %cmp32.not, label %if.end34, label %if.then33, !prof !35
+
+if.then33:
+  tail call void @foo(i32 noundef 211)
+  br label %if.end34
+
+if.end34:
+  %and35 = and i32 %t, 4
+  %cmp36.not = icmp eq i32 %and35, 0
+  br i1 %cmp36.not, label %if.end39, label %if.then37, !prof !35
+
+if.then37:
+  tail call void @foo(i32 noundef 212)
+  br label %if.end39
+
+if.end39:
+  %and40 = and i32 %s, 4
+  %cmp41.not = icmp eq i32 %and40, 0
+  br i1 %cmp41.not, label %if.end52, label %if.then42, !prof !35
+
+if.then42:
+  %and43 = and i32 %t, 2
+  %cmp44.not = icmp eq i32 %and43, 0
+  br i1 %cmp44.not, label %if.end46, label %if.then45, !prof !35
+
+if.then45:
+  tail call void @foo(i32 noundef 221)
+  br label %if.end46
+
+if.end46:
+  %and47 = and i32 %t, 4
+  %cmp48.not = icmp eq i32 %and47, 0
+  br i1 %cmp48.not, label %if.end52, label %if.then49, !prof !35
+
+if.then49:
+  tail call void @foo(i32 noundef 222)
+  br label %if.end52
+
+if.end52:
+  ret void
+}
+
+; CHECK-LABEL: goo
+; CHECK-COUNT-3: {{.*}}.split:
+; CHECK-NOT: {{.*}}.split:
+
+declare void @foo(i32 noundef)
+
+!llvm.module.flags = !{!4}
+
+!4 = !{i32 1, !"ProfileSummary", !5}
+!5 = !{!6, !7, !8, !9, !10, !11, !12, !13, !14, !15}
+!6 = !{!"ProfileFormat", !"InstrProf"}
+!7 = !{!"TotalCount", i64 2400001}
+!8 = !{!"MaxCount", i64 800000}
+!9 = !{!"MaxInternalCount", i64 100000}
+!10 = !{!"MaxFunctionCount", i64 800000}
+!11 = !{!"NumCounts", i64 19}
+!12 = !{!"NumFunctions", i64 4}
+!13 = !{!"IsPartialProfile", i64 0}
+!14 = !{!"PartialProfileRatio", double 0.000000e+00}
+!15 = !{!"DetailedSummary", !16}
+!16 = !{!17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32}
+!17 = !{i32 10000, i64 800000, i32 1}
+!18 = !{i32 100000, i64 800000, i32 1}
+!19 = !{i32 200000, i64 800000, i32 1}
+!20 = !{i32 300000, i64 800000, i32 1}
+!21 = !{i32 400000, i64 100000, i32 17}
+!22 = !{i32 500000, i64 100000, i32 17}
+!23 = !{i32 600000, i64 100000, i32 17}
+!24 = !{i32 700000, i64 100000, i32 17}
+!25 = !{i32 800000, i64 100000, i32 17}
+!26 = !{i32 900000, i64 100000, i32 17}
+!27 = !{i32 950000, i64 100000, i32 17}
+!28 = !{i32 990000, i64 100000, i32 17}
+!29 = !{i32 999000, i64 100000, i32 17}
+!30 = !{i32 999900, i64 100000, i32 17}
+!31 = !{i32 999990, i64 100000, i32 17}
+!32 = !{i32 999999, i64 100000, i32 17}
+!34 = !{!"function_entry_count", i64 100000}
+!35 = !{!"branch_weights", i32 0, i32 100000}
+!36 = !{!"function_entry_count", i64 1}
+!37 = !{!"branch_weights", i32 100000, i32 1}


        


More information about the llvm-commits mailing list