[llvm] b9d40a7 - [llvm-profgen] Improve sample profile density (#92144)

Fri May 24 11:37:29 PDT 2024

Author: Lei Wang
Date: 2024-05-24T14:37:24-04:00
New Revision: b9d40a7ae4b3e5b9829eca8a497637c9fab6dd3e

URL: https://github.com/llvm/llvm-project/commit/b9d40a7ae4b3e5b9829eca8a497637c9fab6dd3e
DIFF: https://github.com/llvm/llvm-project/commit/b9d40a7ae4b3e5b9829eca8a497637c9fab6dd3e.diff

LOG: [llvm-profgen] Improve sample profile density (#92144)

The profile density feature(the amount of samples in the profile
relative to the program size) is used to identify insufficient sample
issue and provide hints for user to increase sample count. A low-density
profile can be inaccurate due to statistical noise, which can hurt FDO
performance.

This change introduces two improvements to the current density work. 
1. The density calculation/definition is changed. Previously, the
density of a profile was calculated as the minimum density for all warm
functions (a function was considered warm if its total samples were
within the top N percent of the profile). However, there is a problem
that a high total sample profile can have a very low density, which
makes the density value unstable.
- Instead, we want to find a density number such that if a function's
density is below this value, it is considered low-density function. We
consider the whole profile is bad if a group of low-density functions
have the sum of samples that exceeds N percent cut-off of the total
samples.

- In implementation, we sort the function profiles by density, iterate
them in descending order and keep accumulating the body samples until
the sum exceeds the (100% - N) percentage of the total_samples, the
profile-density is the last(minimum) function-density of processed
functions. We introduce the a flag(`--profile-density-threshold`) for
this percentage threshold.

2. The density is now calculated based on final(compiler used) profiles
instead of merged context-less profiles.

Added: 
    

Modified: 
    llvm/test/tools/llvm-profgen/profile-density.test
    llvm/tools/llvm-profgen/ProfileGenerator.cpp
    llvm/tools/llvm-profgen/ProfileGenerator.h

Removed: 
    


################################################################################
diff  --git a/llvm/test/tools/llvm-profgen/profile-density.test b/llvm/test/tools/llvm-profgen/profile-density.test
index 0eb83838d16e7..086697e8da0a5 100644

--- a/llvm/test/tools/llvm-profgen/profile-density.test
+++ b/llvm/test/tools/llvm-profgen/profile-density.test
@@ -1,13 +1,17 @@
-; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 --show-density -hot-function-density-threshold=10  --trim-cold-profile=0 &> %t2
+; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 --show-density -profile-density-threshold=10  --trim-cold-profile=0 &> %t2
 ; RUN: FileCheck %s --input-file %t2 --check-prefix=CHECK-DENSITY
-
-; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -hot-function-density-threshold=1 &> %t4
+; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -profile-density-threshold=1 -profile-density-threshold=10000 &> %t4
 ; RUN: FileCheck %s --input-file %t4 --check-prefix=CHECK-DENSITY-CS
+; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t5 --show-density -profile-density-threshold=1 -profile-density-cutoff-hot=800000  &> %t6
+; RUN: FileCheck %s --input-file %t6 --check-prefix=CHECK-DENSITY-CS-80
+
+;CHECK-DENSITY: Sample PGO is estimated to optimize better with 2.9x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
+;CHECK-DENSITY: Functions with density >= 3.5 account for 99.00% total sample counts.
 
-;CHECK-DENSITY: Sample PGO is estimated to optimize better with 3.1x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
-;CHECK-DENSITY: Minimum profile density for hot functions with top 99.00% total samples: 3.2
+;CHECK-DENSITY-CS: Sample PGO is estimated to optimize better with 12.5x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
+;CHECK-DENSITY-CS: Functions with density >= 800.1 account for 99.00% total sample counts.
 
-;CHECK-DENSITY-CS: Minimum profile density for hot functions with top 99.00% total samples: 128.3
+;CHECK-DENSITY-CS-80: Functions with density >= 1886.2 account for 80.00% total sample counts.
 
 ; original code:
 ; clang -O3 -g -fno-optimize-sibling-calls -fdebug-info-for-profiling qsort.c -o a.out

diff  --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 5aa44108f9660..2118e954fe543 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -75,14 +75,18 @@ static cl::opt<int, true> CSProfMaxContextDepth(
              "depth limit."),
     cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
 
-static cl::opt<double> HotFunctionDensityThreshold(
-    "hot-function-density-threshold", llvm::cl::init(1000),
-    llvm::cl::desc(
-        "specify density threshold for hot functions (default: 1000)"),
+static cl::opt<double> ProfileDensityThreshold(
+    "profile-density-threshold", llvm::cl::init(50),
+    llvm::cl::desc("If the profile density is below the given threshold, it "
+                   "will be suggested to increase the sampling rate."),
     llvm::cl::Optional);
 static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false),
                                  llvm::cl::desc("show profile density details"),
                                  llvm::cl::Optional);
+static cl::opt<int> ProfileDensityCutOffHot(
+    "profile-density-cutoff-hot", llvm::cl::init(990000),
+    llvm::cl::desc("Total samples cutoff for functions used to calculate "
+                   "profile density."));
 
 static cl::opt<bool> UpdateTotalSamples(
     "update-total-samples", llvm::cl::init(false),
@@ -179,21 +183,22 @@ void ProfileGeneratorBase::write() {
 
 void ProfileGeneratorBase::showDensitySuggestion(double Density) {
   if (Density == 0.0)
-    WithColor::warning() << "The --profile-summary-cutoff-hot option may be "
+    WithColor::warning() << "The output profile is empty or the "
+                            "--profile-density-cutoff-hot option is "
                             "set too low. Please check your command.\n";
-  else if (Density < HotFunctionDensityThreshold)
+  else if (Density < ProfileDensityThreshold)
     WithColor::warning()
         << "Sample PGO is estimated to optimize better with "
-        << format("%.1f", HotFunctionDensityThreshold / Density)
+        << format("%.1f", ProfileDensityThreshold / Density)
         << "x more samples. Please consider increasing sampling rate or "
            "profiling for longer duration to get more samples.\n";
 
   if (ShowDensity)
-    outs() << "Minimum profile density for hot functions with top "
+    outs() << "Functions with density >= " << format("%.1f", Density)
+           << " account for "
            << format("%.2f",
-                     static_cast<double>(ProfileSummaryCutoffHot.getValue()) /
-                         10000)
-           << "% total samples: " << format("%.1f", Density) << "\n";
+                     static_cast<double>(ProfileDensityCutOffHot) / 10000)
+           << "% total sample counts.\n";
 }
 
 bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) {
@@ -238,32 +243,6 @@ void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) {
   }
 }
 
-double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles,
-                                              uint64_t HotCntThreshold) {
-  double Density = DBL_MAX;
-  std::vector<const FunctionSamples *> HotFuncs;
-  for (auto &I : Profiles) {
-    auto &FuncSamples = I.second;
-    if (FuncSamples.getTotalSamples() < HotCntThreshold)
-      continue;
-    HotFuncs.emplace_back(&FuncSamples);
-  }
-
-  for (auto *FuncSamples : HotFuncs) {
-    auto *Func = Binary->getBinaryFunction(FuncSamples->getFunction());
-    if (!Func)
-      continue;
-    uint64_t FuncSize = Func->getFuncSize();
-    if (FuncSize == 0)
-      continue;
-    Density =
-        std::min(Density, static_cast<double>(FuncSamples->getTotalSamples()) /
-                              FuncSize);
-  }
-
-  return Density == DBL_MAX ? 0.0 : Density;
-}
-
 void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
                                               const RangeSample &Ranges) {
 
@@ -768,9 +747,95 @@ void ProfileGenerator::populateBoundarySamplesForAllFunctions(
   }
 }
 
+void ProfileGeneratorBase::calculateBodySamplesAndSize(
+    const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
+    uint64_t &FuncBodySize) {
+  // Note that ideally the size should be the number of function instruction.
+  // However, for probe-based profile, we don't have the accurate instruction
+  // count for each probe, instead, the probe sample is the samples count for
+  // the block, which is equivelant to
+  // total_instruction_samples/num_of_instruction in one block. Hence, we use
+  // the number of probe as a proxy for the function's size.
+  FuncBodySize += FSamples.getBodySamples().size();
+
+  // The accumulated body samples re-calculated here could be 
diff erent from the
+  // TotalSamples(getTotalSamples) field of FunctionSamples for line-number
+  // based profile. The reason is that TotalSamples is the sum of all the
+  // samples of the machine instruction in one source-code line, however, the
+  // entry of Bodysamples is the only max number of them, so the TotalSamples is
+  // usually much bigger than the accumulated body samples as one souce-code
+  // line can emit many machine instructions. We observed a regression when we
+  // switched to use the accumulated body samples(by using
+  // -update-total-samples). Hence, it's safer to re-calculate here to avoid
+  // such discrepancy. There is no problem for probe-based profile, as the
+  // TotalSamples is exactly the same as the accumulated body samples.
+  for (const auto &I : FSamples.getBodySamples())
+    TotalBodySamples += I.second.getSamples();
+
+  for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
+    for (const auto &Callee : CallsiteSamples.second) {
+      // For binary-level density, the inlinees' samples and size should be
+      // included in the calculation.
+      calculateBodySamplesAndSize(Callee.second, TotalBodySamples,
+                                  FuncBodySize);
+    }
+}
+
+// Calculate Profile-density:
+// Calculate the density for each function and sort them in descending order,
+// keep accumulating their total samples unitl it exceeds the
+// percentage_threshold(cut-off) of total profile samples, the profile-density
+// is the last(minimum) function-density of the processed functions, which means
+// all the functions hot to perf are on good density if the profile-density is
+// good. The percentage_threshold(--profile-density-cutoff-hot) is configurable
+// depending on how much regression the system want to tolerate.
+double
+ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
+  double ProfileDensity = 0.0;
+
+  uint64_t TotalProfileSamples = 0;
+  // A list of the function profile density and its total samples.
+  std::vector<std::pair<double, uint64_t>> FuncDensityList;
+  for (const auto &I : Profiles) {
+    uint64_t TotalBodySamples = 0;
+    uint64_t FuncBodySize = 0;
+    calculateBodySamplesAndSize(I.second, TotalBodySamples, FuncBodySize);
+
+    if (FuncBodySize == 0)
+      continue;
+
+    double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
+    TotalProfileSamples += TotalBodySamples;
+    FuncDensityList.emplace_back(FuncDensity, TotalBodySamples);
+  }
+
+  // Sorted by the density in descending order.
+  llvm::stable_sort(FuncDensityList, [&](const std::pair<double, uint64_t> &A,
+                                         const std::pair<double, uint64_t> &B) {
+    if (A.first != B.first)
+      return A.first > B.first;
+    return A.second < B.second;
+  });
+
+  uint64_t AccumulatedSamples = 0;
+  uint32_t I = 0;
+  assert(ProfileDensityCutOffHot <= 1000000 &&
+         "The cutoff value is greater than 1000000(100%)");
+  while (AccumulatedSamples < TotalProfileSamples *
+                                  static_cast<float>(ProfileDensityCutOffHot) /
+                                  1000000 &&
+         I < FuncDensityList.size()) {
+    AccumulatedSamples += FuncDensityList[I].second;
+    ProfileDensity = FuncDensityList[I].first;
+    I++;
+  }
+
+  return ProfileDensity;
+}
+
 void ProfileGeneratorBase::calculateAndShowDensity(
     const SampleProfileMap &Profiles) {
-  double Density = calculateDensity(Profiles, HotCountThreshold);
+  double Density = calculateDensity(Profiles);
   showDensitySuggestion(Density);
 }
 
@@ -1057,17 +1122,13 @@ void CSProfileGenerator::postProcessProfiles() {
             CSProfMaxColdContextDepth, EnableCSPreInliner);
   }
 
-  // Merge function samples of CS profile to calculate profile density.
-  sampleprof::SampleProfileMap ContextLessProfiles;
-  ProfileConverter::flattenProfile(ProfileMap, ContextLessProfiles, true);
-
-  calculateAndShowDensity(ContextLessProfiles);
   if (GenCSNestedProfile) {
     ProfileConverter CSConverter(ProfileMap);
     CSConverter.convertCSProfiles();
     FunctionSamples::ProfileIsCS = false;
   }
   filterAmbiguousProfile(ProfileMap);
+  ProfileGeneratorBase::calculateAndShowDensity(ProfileMap);
 }
 
 void ProfileGeneratorBase::computeSummaryAndThreshold(

diff  --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h
index d258fb78bfb11..5e36128530cd9 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.h
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.h
@@ -116,10 +116,13 @@ class ProfileGeneratorBase {
 
   void computeSummaryAndThreshold(SampleProfileMap &ProfileMap);
 
-  void calculateAndShowDensity(const SampleProfileMap &Profiles);
+  void calculateBodySamplesAndSize(const FunctionSamples &FSamples,
+                                   uint64_t &TotalBodySamples,
+                                   uint64_t &FuncBodySize);
+
+  double calculateDensity(const SampleProfileMap &Profiles);
 
-  double calculateDensity(const SampleProfileMap &Profiles,
-                          uint64_t HotCntThreshold);
+  void calculateAndShowDensity(const SampleProfileMap &Profiles);
 
   void showDensitySuggestion(double Density);