[llvm] [llvm-profgen] Improve sample profile density (PR #92144)
Lei Wang via llvm-commits
llvm-commits at lists.llvm.org
Thu May 23 12:21:39 PDT 2024
https://github.com/wlei-llvm updated https://github.com/llvm/llvm-project/pull/92144
>From 441a16d95c2deb4b50641241e283891d7765c50b Mon Sep 17 00:00:00 2001
From: wlei <wlei at fb.com>
Date: Mon, 13 May 2024 13:57:02 -0700
Subject: [PATCH 1/8] improve profile density
---
.../tools/llvm-profgen/profile-density.test | 2 +-
llvm/tools/llvm-profgen/ProfileGenerator.cpp | 90 +++++++++++++++++--
llvm/tools/llvm-profgen/ProfileGenerator.h | 5 +-
3 files changed, 89 insertions(+), 8 deletions(-)
diff --git a/llvm/test/tools/llvm-profgen/profile-density.test b/llvm/test/tools/llvm-profgen/profile-density.test
index 0eb83838d16e7..f22c6f04914aa 100644
--- a/llvm/test/tools/llvm-profgen/profile-density.test
+++ b/llvm/test/tools/llvm-profgen/profile-density.test
@@ -7,7 +7,7 @@
;CHECK-DENSITY: Sample PGO is estimated to optimize better with 3.1x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
;CHECK-DENSITY: Minimum profile density for hot functions with top 99.00% total samples: 3.2
-;CHECK-DENSITY-CS: Minimum profile density for hot functions with top 99.00% total samples: 128.3
+;CHECK-DENSITY-CS: Minimum profile density for hot functions with top 99.00% total samples: 619.0
; original code:
; clang -O3 -g -fno-optimize-sibling-calls -fdebug-info-for-profiling qsort.c -o a.out
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 5aa44108f9660..ecbc6763e56f1 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -83,6 +83,10 @@ static cl::opt<double> HotFunctionDensityThreshold(
static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false),
llvm::cl::desc("show profile density details"),
llvm::cl::Optional);
+static cl::opt<int> ProfileDensityHotFuncCutOff(
+ "profile-density-hot-func-cutoff", llvm::cl::init(990000),
+ llvm::cl::desc("Total sample cutoff for hot functions used to calculate "
+ "the profile density."));
static cl::opt<bool> UpdateTotalSamples(
"update-total-samples", llvm::cl::init(false),
@@ -177,7 +181,8 @@ void ProfileGeneratorBase::write() {
write(std::move(WriterOrErr.get()), ProfileMap);
}
-void ProfileGeneratorBase::showDensitySuggestion(double Density) {
+void ProfileGeneratorBase::showDensitySuggestion(double Density,
+ int DensityCutoffHot) {
if (Density == 0.0)
WithColor::warning() << "The --profile-summary-cutoff-hot option may be "
"set too low. Please check your command.\n";
@@ -190,9 +195,7 @@ void ProfileGeneratorBase::showDensitySuggestion(double Density) {
if (ShowDensity)
outs() << "Minimum profile density for hot functions with top "
- << format("%.2f",
- static_cast<double>(ProfileSummaryCutoffHot.getValue()) /
- 10000)
+ << format("%.2f", static_cast<double>(DensityCutoffHot) / 10000)
<< "% total samples: " << format("%.1f", Density) << "\n";
}
@@ -771,7 +774,7 @@ void ProfileGenerator::populateBoundarySamplesForAllFunctions(
void ProfileGeneratorBase::calculateAndShowDensity(
const SampleProfileMap &Profiles) {
double Density = calculateDensity(Profiles, HotCountThreshold);
- showDensitySuggestion(Density);
+ showDensitySuggestion(Density, ProfileSummaryCutoffHot);
}
FunctionSamples *
@@ -1032,6 +1035,78 @@ void CSProfileGenerator::convertToProfileMap() {
IsProfileValidOnTrie = false;
}
+void CSProfileGenerator::calculateAndShowDensity(
+ SampleContextTracker &CTracker) {
+ double Density = calculateDensity(CTracker);
+ showDensitySuggestion(Density, ProfileDensityHotFuncCutOff);
+}
+
+// Calculate Profile-density:
+// Sort the list of function-density in descending order and iterate them once
+// their accumulated total samples exceeds the percentage_threshold of total
+// profile samples, the profile-density is the last(minimum) function-density of
+// the processed functions, which means all the functions significant to perf
+// are on good density if the profile-density is good, or in other words, if the
+// profile-density is bad, the accumulated samples for all the bad density
+// profile exceeds the (100% - percentage_threshold).
+// The percentage_threshold(--profile-density-hot-func-cutoff) is configurable
+// depending on how much regression the system want to tolerate.
+double CSProfileGenerator::calculateDensity(SampleContextTracker &CTracker) {
+ double ProfileDensity = 0.0;
+
+ uint64_t TotalProfileSamples = 0;
+ // A list of the function profile density and total samples.
+ std::vector<std::pair<double, uint64_t>> DensityList;
+ for (const auto *Node : CTracker) {
+ const auto *FSamples = Node->getFunctionSamples();
+ if (!FSamples)
+ continue;
+
+ uint64_t TotalBodySamples = 0;
+ uint64_t FuncBodySize = 0;
+ for (const auto &I : FSamples->getBodySamples()) {
+ TotalBodySamples += I.second.getSamples();
+ FuncBodySize++;
+ }
+ // The whole function could be inlined and optimized out, use the callsite
+ // head samples instead to estimate the body count.
+ if (FuncBodySize == 0) {
+ for (const auto &CallsiteSamples : FSamples->getCallsiteSamples()) {
+ FuncBodySize++;
+ for (const auto &Callee : CallsiteSamples.second)
+ TotalBodySamples += Callee.second.getHeadSamplesEstimate();
+ }
+ }
+
+ if (FuncBodySize == 0)
+ continue;
+
+ double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
+ TotalProfileSamples += TotalBodySamples;
+ DensityList.emplace_back(FuncDensity, TotalBodySamples);
+ }
+
+ // Sorted by the density in descending order.
+ llvm::stable_sort(DensityList, [&](const std::pair<double, uint64_t> &A,
+ const std::pair<double, uint64_t> &B) {
+ if (A.first != B.first)
+ return A.first > B.first;
+ return A.second < B.second;
+ });
+
+ uint64_t AccumulatedSamples = 0;
+ for (const auto &P : DensityList) {
+ AccumulatedSamples += P.second;
+ ProfileDensity = P.first;
+ if (AccumulatedSamples >=
+ TotalProfileSamples * static_cast<float>(ProfileDensityHotFuncCutOff) /
+ 1000000)
+ break;
+ }
+
+ return ProfileDensity;
+}
+
void CSProfileGenerator::postProcessProfiles() {
// Compute hot/cold threshold based on profile. This will be used for cold
// context profile merging/trimming.
@@ -1041,6 +1116,7 @@ void CSProfileGenerator::postProcessProfiles() {
// inline decisions.
if (EnableCSPreInliner) {
ContextTracker.populateFuncToCtxtMap();
+ calculateAndShowDensity(ContextTracker);
CSPreInliner(ContextTracker, *Binary, Summary.get()).run();
// Turn off the profile merger by default unless it is explicitly enabled.
if (!CSProfMergeColdContext.getNumOccurrences())
@@ -1061,7 +1137,9 @@ void CSProfileGenerator::postProcessProfiles() {
sampleprof::SampleProfileMap ContextLessProfiles;
ProfileConverter::flattenProfile(ProfileMap, ContextLessProfiles, true);
- calculateAndShowDensity(ContextLessProfiles);
+ if (!EnableCSPreInliner)
+ ProfileGeneratorBase::calculateAndShowDensity(ContextLessProfiles);
+
if (GenCSNestedProfile) {
ProfileConverter CSConverter(ProfileMap);
CSConverter.convertCSProfiles();
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h
index d258fb78bfb11..cf451f9d1a1a4 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.h
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.h
@@ -121,7 +121,7 @@ class ProfileGeneratorBase {
double calculateDensity(const SampleProfileMap &Profiles,
uint64_t HotCntThreshold);
- void showDensitySuggestion(double Density);
+ void showDensitySuggestion(double Density, int DensityCutoffHot);
void collectProfiledFunctions();
@@ -363,6 +363,9 @@ class CSProfileGenerator : public ProfileGeneratorBase {
void computeSummaryAndThreshold();
+ void calculateAndShowDensity(SampleContextTracker &CTracker);
+ double calculateDensity(SampleContextTracker &CTracker);
+
bool collectFunctionsFromLLVMProfile(
std::unordered_set<const BinaryFunction *> &ProfiledFunctions) override;
>From 9082e49bad782088c8f7da0057027c6367f8d927 Mon Sep 17 00:00:00 2001
From: wlei <wlei at fb.com>
Date: Fri, 17 May 2024 11:50:25 -0700
Subject: [PATCH 2/8] change to base on final profile and addressing other
comments
---
.../tools/llvm-profgen/profile-density.test | 6 +-
llvm/tools/llvm-profgen/ProfileGenerator.cpp | 205 +++++++-----------
llvm/tools/llvm-profgen/ProfileGenerator.h | 14 +-
3 files changed, 94 insertions(+), 131 deletions(-)
diff --git a/llvm/test/tools/llvm-profgen/profile-density.test b/llvm/test/tools/llvm-profgen/profile-density.test
index f22c6f04914aa..e8bcc9a3a5028 100644
--- a/llvm/test/tools/llvm-profgen/profile-density.test
+++ b/llvm/test/tools/llvm-profgen/profile-density.test
@@ -4,10 +4,10 @@
; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -hot-function-density-threshold=1 &> %t4
; RUN: FileCheck %s --input-file %t4 --check-prefix=CHECK-DENSITY-CS
-;CHECK-DENSITY: Sample PGO is estimated to optimize better with 3.1x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
-;CHECK-DENSITY: Minimum profile density for hot functions with top 99.00% total samples: 3.2
+;CHECK-DENSITY: Sample PGO is estimated to optimize better with 2.9x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
+;CHECK-DENSITY: Functions with density >= 3.5 account for 99.00% total sample counts.
-;CHECK-DENSITY-CS: Minimum profile density for hot functions with top 99.00% total samples: 619.0
+;CHECK-DENSITY-CS: Functions with density >= 800.1 account for 99.00% total sample counts.
; original code:
; clang -O3 -g -fno-optimize-sibling-calls -fdebug-info-for-profiling qsort.c -o a.out
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index ecbc6763e56f1..e3e856ead918c 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -76,17 +76,16 @@ static cl::opt<int, true> CSProfMaxContextDepth(
cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
static cl::opt<double> HotFunctionDensityThreshold(
- "hot-function-density-threshold", llvm::cl::init(1000),
- llvm::cl::desc(
- "specify density threshold for hot functions (default: 1000)"),
+ "hot-function-density-threshold", llvm::cl::init(20),
+ llvm::cl::desc("specify density threshold for hot functions (default: 20)"),
llvm::cl::Optional);
static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false),
llvm::cl::desc("show profile density details"),
llvm::cl::Optional);
-static cl::opt<int> ProfileDensityHotFuncCutOff(
- "profile-density-hot-func-cutoff", llvm::cl::init(990000),
- llvm::cl::desc("Total sample cutoff for hot functions used to calculate "
- "the profile density."));
+static cl::opt<int> ProfileDensityCutOffHot(
+ "profile-density-cutoff-hot", llvm::cl::init(990000),
+ llvm::cl::desc("Total samples cutoff for functions used to calculate "
+ "profile density."));
static cl::opt<bool> UpdateTotalSamples(
"update-total-samples", llvm::cl::init(false),
@@ -181,10 +180,9 @@ void ProfileGeneratorBase::write() {
write(std::move(WriterOrErr.get()), ProfileMap);
}
-void ProfileGeneratorBase::showDensitySuggestion(double Density,
- int DensityCutoffHot) {
+void ProfileGeneratorBase::showDensitySuggestion(double Density) {
if (Density == 0.0)
- WithColor::warning() << "The --profile-summary-cutoff-hot option may be "
+ WithColor::warning() << "The --profile-density-cutoff-hot option may be "
"set too low. Please check your command.\n";
else if (Density < HotFunctionDensityThreshold)
WithColor::warning()
@@ -194,9 +192,11 @@ void ProfileGeneratorBase::showDensitySuggestion(double Density,
"profiling for longer duration to get more samples.\n";
if (ShowDensity)
- outs() << "Minimum profile density for hot functions with top "
- << format("%.2f", static_cast<double>(DensityCutoffHot) / 10000)
- << "% total samples: " << format("%.1f", Density) << "\n";
+ outs() << "Functions with density >= " << format("%.1f", Density)
+ << " account for "
+ << format("%.2f",
+ static_cast<double>(ProfileDensityCutOffHot) / 10000)
+ << "% total sample counts.\n";
}
bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) {
@@ -241,32 +241,6 @@ void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) {
}
}
-double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles,
- uint64_t HotCntThreshold) {
- double Density = DBL_MAX;
- std::vector<const FunctionSamples *> HotFuncs;
- for (auto &I : Profiles) {
- auto &FuncSamples = I.second;
- if (FuncSamples.getTotalSamples() < HotCntThreshold)
- continue;
- HotFuncs.emplace_back(&FuncSamples);
- }
-
- for (auto *FuncSamples : HotFuncs) {
- auto *Func = Binary->getBinaryFunction(FuncSamples->getFunction());
- if (!Func)
- continue;
- uint64_t FuncSize = Func->getFuncSize();
- if (FuncSize == 0)
- continue;
- Density =
- std::min(Density, static_cast<double>(FuncSamples->getTotalSamples()) /
- FuncSize);
- }
-
- return Density == DBL_MAX ? 0.0 : Density;
-}
-
void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
const RangeSample &Ranges) {
@@ -771,10 +745,78 @@ void ProfileGenerator::populateBoundarySamplesForAllFunctions(
}
}
+void ProfileGeneratorBase::calculateDensity(
+ const FunctionSamples &FSamples,
+ std::vector<std::pair<double, uint64_t>> &DensityList,
+ uint64_t &TotalProfileSamples) {
+ uint64_t TotalBodySamples = 0;
+ uint64_t FuncBodySize = 0;
+ for (const auto &I : FSamples.getBodySamples()) {
+ TotalBodySamples += I.second.getSamples();
+ FuncBodySize++;
+ }
+
+ // The whole function could be inlined and optimized out, use the callsite
+ // head samples instead to estimate the body count.
+ if (FuncBodySize == 0) {
+ for (const auto &CallsiteSamples : FSamples.getCallsiteSamples()) {
+ FuncBodySize++;
+ for (const auto &Callee : CallsiteSamples.second)
+ TotalBodySamples += Callee.second.getHeadSamplesEstimate();
+ }
+ }
+
+ if (FuncBodySize == 0)
+ return;
+
+ double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
+ TotalProfileSamples += TotalBodySamples;
+ DensityList.emplace_back(FuncDensity, TotalBodySamples);
+}
+
+// Calculate Profile-density:
+// Calculate the density for each function and sort them in descending order,
+// iterate them once their accumulated total samples exceeds the
+// percentage_threshold(cut-off) of total profile samples, the profile-density
+// is the last(minimum) function-density of the processed functions, which means
+// all the functions hot to perf are on good density if the profile-density is
+// good. The percentage_threshold(--profile-density-cutoff-hot) is configurable
+// depending on how much regression the system want to tolerate.
+double
+ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
+ double ProfileDensity = 0.0;
+
+ uint64_t TotalProfileSamples = 0;
+ // A list of the function profile density and its total samples.
+ std::vector<std::pair<double, uint64_t>> FuncDensityList;
+ for (const auto &I : Profiles)
+ calculateDensity(I.second, FuncDensityList, TotalProfileSamples);
+
+ // Sorted by the density in descending order.
+ llvm::stable_sort(FuncDensityList, [&](const std::pair<double, uint64_t> &A,
+ const std::pair<double, uint64_t> &B) {
+ if (A.first != B.first)
+ return A.first > B.first;
+ return A.second < B.second;
+ });
+
+ uint64_t AccumulatedSamples = 0;
+ for (const auto &P : FuncDensityList) {
+ AccumulatedSamples += P.second;
+ ProfileDensity = P.first;
+ if (AccumulatedSamples >= TotalProfileSamples *
+ static_cast<float>(ProfileDensityCutOffHot) /
+ 1000000)
+ break;
+ }
+
+ return ProfileDensity;
+}
+
void ProfileGeneratorBase::calculateAndShowDensity(
const SampleProfileMap &Profiles) {
- double Density = calculateDensity(Profiles, HotCountThreshold);
- showDensitySuggestion(Density, ProfileSummaryCutoffHot);
+ double Density = calculateDensity(Profiles);
+ showDensitySuggestion(Density);
}
FunctionSamples *
@@ -1035,78 +1077,6 @@ void CSProfileGenerator::convertToProfileMap() {
IsProfileValidOnTrie = false;
}
-void CSProfileGenerator::calculateAndShowDensity(
- SampleContextTracker &CTracker) {
- double Density = calculateDensity(CTracker);
- showDensitySuggestion(Density, ProfileDensityHotFuncCutOff);
-}
-
-// Calculate Profile-density:
-// Sort the list of function-density in descending order and iterate them once
-// their accumulated total samples exceeds the percentage_threshold of total
-// profile samples, the profile-density is the last(minimum) function-density of
-// the processed functions, which means all the functions significant to perf
-// are on good density if the profile-density is good, or in other words, if the
-// profile-density is bad, the accumulated samples for all the bad density
-// profile exceeds the (100% - percentage_threshold).
-// The percentage_threshold(--profile-density-hot-func-cutoff) is configurable
-// depending on how much regression the system want to tolerate.
-double CSProfileGenerator::calculateDensity(SampleContextTracker &CTracker) {
- double ProfileDensity = 0.0;
-
- uint64_t TotalProfileSamples = 0;
- // A list of the function profile density and total samples.
- std::vector<std::pair<double, uint64_t>> DensityList;
- for (const auto *Node : CTracker) {
- const auto *FSamples = Node->getFunctionSamples();
- if (!FSamples)
- continue;
-
- uint64_t TotalBodySamples = 0;
- uint64_t FuncBodySize = 0;
- for (const auto &I : FSamples->getBodySamples()) {
- TotalBodySamples += I.second.getSamples();
- FuncBodySize++;
- }
- // The whole function could be inlined and optimized out, use the callsite
- // head samples instead to estimate the body count.
- if (FuncBodySize == 0) {
- for (const auto &CallsiteSamples : FSamples->getCallsiteSamples()) {
- FuncBodySize++;
- for (const auto &Callee : CallsiteSamples.second)
- TotalBodySamples += Callee.second.getHeadSamplesEstimate();
- }
- }
-
- if (FuncBodySize == 0)
- continue;
-
- double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
- TotalProfileSamples += TotalBodySamples;
- DensityList.emplace_back(FuncDensity, TotalBodySamples);
- }
-
- // Sorted by the density in descending order.
- llvm::stable_sort(DensityList, [&](const std::pair<double, uint64_t> &A,
- const std::pair<double, uint64_t> &B) {
- if (A.first != B.first)
- return A.first > B.first;
- return A.second < B.second;
- });
-
- uint64_t AccumulatedSamples = 0;
- for (const auto &P : DensityList) {
- AccumulatedSamples += P.second;
- ProfileDensity = P.first;
- if (AccumulatedSamples >=
- TotalProfileSamples * static_cast<float>(ProfileDensityHotFuncCutOff) /
- 1000000)
- break;
- }
-
- return ProfileDensity;
-}
-
void CSProfileGenerator::postProcessProfiles() {
// Compute hot/cold threshold based on profile. This will be used for cold
// context profile merging/trimming.
@@ -1116,7 +1086,6 @@ void CSProfileGenerator::postProcessProfiles() {
// inline decisions.
if (EnableCSPreInliner) {
ContextTracker.populateFuncToCtxtMap();
- calculateAndShowDensity(ContextTracker);
CSPreInliner(ContextTracker, *Binary, Summary.get()).run();
// Turn off the profile merger by default unless it is explicitly enabled.
if (!CSProfMergeColdContext.getNumOccurrences())
@@ -1133,19 +1102,13 @@ void CSProfileGenerator::postProcessProfiles() {
CSProfMaxColdContextDepth, EnableCSPreInliner);
}
- // Merge function samples of CS profile to calculate profile density.
- sampleprof::SampleProfileMap ContextLessProfiles;
- ProfileConverter::flattenProfile(ProfileMap, ContextLessProfiles, true);
-
- if (!EnableCSPreInliner)
- ProfileGeneratorBase::calculateAndShowDensity(ContextLessProfiles);
-
if (GenCSNestedProfile) {
ProfileConverter CSConverter(ProfileMap);
CSConverter.convertCSProfiles();
FunctionSamples::ProfileIsCS = false;
}
filterAmbiguousProfile(ProfileMap);
+ ProfileGeneratorBase::calculateAndShowDensity(ProfileMap);
}
void ProfileGeneratorBase::computeSummaryAndThreshold(
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h
index cf451f9d1a1a4..d40a37d658829 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.h
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.h
@@ -116,12 +116,15 @@ class ProfileGeneratorBase {
void computeSummaryAndThreshold(SampleProfileMap &ProfileMap);
- void calculateAndShowDensity(const SampleProfileMap &Profiles);
+ void calculateDensity(const FunctionSamples &FSamples,
+ std::vector<std::pair<double, uint64_t>> &DensityList,
+ uint64_t &TotalProfileSamples);
+
+ double calculateDensity(const SampleProfileMap &Profiles);
- double calculateDensity(const SampleProfileMap &Profiles,
- uint64_t HotCntThreshold);
+ void calculateAndShowDensity(const SampleProfileMap &Profiles);
- void showDensitySuggestion(double Density, int DensityCutoffHot);
+ void showDensitySuggestion(double Density);
void collectProfiledFunctions();
@@ -363,9 +366,6 @@ class CSProfileGenerator : public ProfileGeneratorBase {
void computeSummaryAndThreshold();
- void calculateAndShowDensity(SampleContextTracker &CTracker);
- double calculateDensity(SampleContextTracker &CTracker);
-
bool collectFunctionsFromLLVMProfile(
std::unordered_set<const BinaryFunction *> &ProfiledFunctions) override;
>From 1a4679a9128a2f60bcd2158634326da6fe223821 Mon Sep 17 00:00:00 2001
From: wlei <wlei at fb.com>
Date: Fri, 17 May 2024 13:48:47 -0700
Subject: [PATCH 3/8] fix missing callee sample
---
llvm/tools/llvm-profgen/ProfileGenerator.cpp | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index e3e856ead918c..0bdf543d2f39a 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -761,8 +761,10 @@ void ProfileGeneratorBase::calculateDensity(
if (FuncBodySize == 0) {
for (const auto &CallsiteSamples : FSamples.getCallsiteSamples()) {
FuncBodySize++;
- for (const auto &Callee : CallsiteSamples.second)
+ for (const auto &Callee : CallsiteSamples.second) {
+ calculateDensity(Callee.second, DensityList, TotalProfileSamples);
TotalBodySamples += Callee.second.getHeadSamplesEstimate();
+ }
}
}
>From 9cef0d9ab287a2b3561a5656a4f7ac3872e492f8 Mon Sep 17 00:00:00 2001
From: wlei <wlei at fb.com>
Date: Wed, 22 May 2024 10:21:30 -0700
Subject: [PATCH 4/8] change to binary-level density
---
.../tools/llvm-profgen/profile-density.test | 9 +-
llvm/tools/llvm-profgen/ProfileGenerator.cpp | 83 +++++++++++--------
llvm/tools/llvm-profgen/ProfileGenerator.h | 6 +-
3 files changed, 57 insertions(+), 41 deletions(-)
diff --git a/llvm/test/tools/llvm-profgen/profile-density.test b/llvm/test/tools/llvm-profgen/profile-density.test
index e8bcc9a3a5028..05190c949597a 100644
--- a/llvm/test/tools/llvm-profgen/profile-density.test
+++ b/llvm/test/tools/llvm-profgen/profile-density.test
@@ -1,14 +1,17 @@
-; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 --show-density -hot-function-density-threshold=10 --trim-cold-profile=0 &> %t2
+; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 --show-density -profile-density-threshold=10 --trim-cold-profile=0 &> %t2
; RUN: FileCheck %s --input-file %t2 --check-prefix=CHECK-DENSITY
-
-; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -hot-function-density-threshold=1 &> %t4
+; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -profile-density-threshold=1 &> %t4
; RUN: FileCheck %s --input-file %t4 --check-prefix=CHECK-DENSITY-CS
+; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t5 --show-density -profile-density-threshold=1 -profile-density-cutoff-hot=800000 &> %t6
+; RUN: FileCheck %s --input-file %t6 --check-prefix=CHECK-DENSITY-CS-80
;CHECK-DENSITY: Sample PGO is estimated to optimize better with 2.9x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
;CHECK-DENSITY: Functions with density >= 3.5 account for 99.00% total sample counts.
;CHECK-DENSITY-CS: Functions with density >= 800.1 account for 99.00% total sample counts.
+;CHECK-DENSITY-CS-80: Functions with density >= 1860.5 account for 80.00% total sample counts.
+
; original code:
; clang -O3 -g -fno-optimize-sibling-calls -fdebug-info-for-profiling qsort.c -o a.out
#include <stdio.h>
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 0bdf543d2f39a..c7b6f859d881c 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -75,9 +75,11 @@ static cl::opt<int, true> CSProfMaxContextDepth(
"depth limit."),
cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
-static cl::opt<double> HotFunctionDensityThreshold(
- "hot-function-density-threshold", llvm::cl::init(20),
- llvm::cl::desc("specify density threshold for hot functions (default: 20)"),
+static cl::opt<double> ProfileDensityThreshold(
+ "profile-density-threshold", llvm::cl::init(20),
+ llvm::cl::desc(
+ "Set the profile density threshold(default: 20), which is used to "
+ "provide suggestions for user to increase the sampling rate.\n"),
llvm::cl::Optional);
static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false),
llvm::cl::desc("show profile density details"),
@@ -182,12 +184,13 @@ void ProfileGeneratorBase::write() {
void ProfileGeneratorBase::showDensitySuggestion(double Density) {
if (Density == 0.0)
- WithColor::warning() << "The --profile-density-cutoff-hot option may be "
+ WithColor::warning() << "The output profile is empty or the "
+ "--profile-density-cutoff-hot option is "
"set too low. Please check your command.\n";
- else if (Density < HotFunctionDensityThreshold)
+ else if (Density < ProfileDensityThreshold)
WithColor::warning()
<< "Sample PGO is estimated to optimize better with "
- << format("%.1f", HotFunctionDensityThreshold / Density)
+ << format("%.1f", ProfileDensityThreshold / Density)
<< "x more samples. Please consider increasing sampling rate or "
"profiling for longer duration to get more samples.\n";
@@ -745,12 +748,15 @@ void ProfileGenerator::populateBoundarySamplesForAllFunctions(
}
}
-void ProfileGeneratorBase::calculateDensity(
- const FunctionSamples &FSamples,
- std::vector<std::pair<double, uint64_t>> &DensityList,
- uint64_t &TotalProfileSamples) {
- uint64_t TotalBodySamples = 0;
- uint64_t FuncBodySize = 0;
+// Note that ideally the size should be the number of the function's
+// instructions. However, for probe-based profile, we don't have the accurate
+// instruction count for each probe. Instead, the probe sample is the sample
+// count for the block, which is equivalent to
+// total_instruction_samples/num_instruction in one block. Hence, we use the
+// number of probes as a proxy for the function's size.
+void ProfileGeneratorBase::calculateBodySamplesAndSize(
+ const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
+ uint64_t &FuncBodySize) {
for (const auto &I : FSamples.getBodySamples()) {
TotalBodySamples += I.second.getSamples();
FuncBodySize++;
@@ -758,27 +764,21 @@ void ProfileGeneratorBase::calculateDensity(
// The whole function could be inlined and optimized out, use the callsite
// head samples instead to estimate the body count.
- if (FuncBodySize == 0) {
- for (const auto &CallsiteSamples : FSamples.getCallsiteSamples()) {
- FuncBodySize++;
- for (const auto &Callee : CallsiteSamples.second) {
- calculateDensity(Callee.second, DensityList, TotalProfileSamples);
- TotalBodySamples += Callee.second.getHeadSamplesEstimate();
- }
+ for (const auto &CallsiteSamples : FSamples.getCallsiteSamples()) {
+ FuncBodySize++;
+ for (const auto &Callee : CallsiteSamples.second) {
+ // This is used for calculating the binary-level density, so the
+ // inlinees' samples and size should be included in the calculation.
+ calculateBodySamplesAndSize(Callee.second, TotalBodySamples,
+ FuncBodySize);
+ TotalBodySamples += Callee.second.getHeadSamplesEstimate();
}
}
-
- if (FuncBodySize == 0)
- return;
-
- double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
- TotalProfileSamples += TotalBodySamples;
- DensityList.emplace_back(FuncDensity, TotalBodySamples);
}
// Calculate Profile-density:
// Calculate the density for each function and sort them in descending order,
-// iterate them once their accumulated total samples exceeds the
+// keep accumulating their total samples until it exceeds the
// percentage_threshold(cut-off) of total profile samples, the profile-density
// is the last(minimum) function-density of the processed functions, which means
// all the functions hot to perf are on good density if the profile-density is
@@ -791,8 +791,18 @@ ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
uint64_t TotalProfileSamples = 0;
// A list of the function profile density and its total samples.
std::vector<std::pair<double, uint64_t>> FuncDensityList;
- for (const auto &I : Profiles)
- calculateDensity(I.second, FuncDensityList, TotalProfileSamples);
+ for (const auto &I : Profiles) {
+ uint64_t TotalBodySamples = 0;
+ uint64_t FuncBodySize = 0;
+ calculateBodySamplesAndSize(I.second, TotalBodySamples, FuncBodySize);
+
+ if (FuncBodySize == 0)
+ continue;
+
+ double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
+ TotalProfileSamples += TotalBodySamples;
+ FuncDensityList.emplace_back(FuncDensity, TotalBodySamples);
+ }
// Sorted by the density in descending order.
llvm::stable_sort(FuncDensityList, [&](const std::pair<double, uint64_t> &A,
@@ -803,13 +813,16 @@ ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
});
uint64_t AccumulatedSamples = 0;
- for (const auto &P : FuncDensityList) {
- AccumulatedSamples += P.second;
- ProfileDensity = P.first;
- if (AccumulatedSamples >= TotalProfileSamples *
+ uint32_t I = 0;
+ assert(ProfileDensityCutOffHot <= 1000000 &&
+ "The cutoff value is greater than 1000000(100%)");
+ while (AccumulatedSamples < TotalProfileSamples *
static_cast<float>(ProfileDensityCutOffHot) /
- 1000000)
- break;
+ 1000000 &&
+ I < FuncDensityList.size()) {
+ AccumulatedSamples += FuncDensityList[I].second;
+ ProfileDensity = FuncDensityList[I].first;
+ I++;
}
return ProfileDensity;
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h
index d40a37d658829..5e36128530cd9 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.h
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.h
@@ -116,9 +116,9 @@ class ProfileGeneratorBase {
void computeSummaryAndThreshold(SampleProfileMap &ProfileMap);
- void calculateDensity(const FunctionSamples &FSamples,
- std::vector<std::pair<double, uint64_t>> &DensityList,
- uint64_t &TotalProfileSamples);
+ void calculateBodySamplesAndSize(const FunctionSamples &FSamples,
+ uint64_t &TotalBodySamples,
+ uint64_t &FuncBodySize);
double calculateDensity(const SampleProfileMap &Profiles);
>From a42f48051a9c9edabbfee8aa599564fb36bc7c0e Mon Sep 17 00:00:00 2001
From: wlei <wlei at fb.com>
Date: Wed, 22 May 2024 13:56:21 -0700
Subject: [PATCH 5/8] set profile density threshold to 50 and address other
comments
---
.../tools/llvm-profgen/profile-density.test | 3 ++-
llvm/tools/llvm-profgen/ProfileGenerator.cpp | 24 +++++++++----------
2 files changed, 13 insertions(+), 14 deletions(-)
diff --git a/llvm/test/tools/llvm-profgen/profile-density.test b/llvm/test/tools/llvm-profgen/profile-density.test
index 05190c949597a..14a6a77a5983b 100644
--- a/llvm/test/tools/llvm-profgen/profile-density.test
+++ b/llvm/test/tools/llvm-profgen/profile-density.test
@@ -1,6 +1,6 @@
; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 --show-density -profile-density-threshold=10 --trim-cold-profile=0 &> %t2
; RUN: FileCheck %s --input-file %t2 --check-prefix=CHECK-DENSITY
-; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -profile-density-threshold=1 &> %t4
+; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -profile-density-threshold=1 -profile-density-threshold=10000 &> %t4
; RUN: FileCheck %s --input-file %t4 --check-prefix=CHECK-DENSITY-CS
; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t5 --show-density -profile-density-threshold=1 -profile-density-cutoff-hot=800000 &> %t6
; RUN: FileCheck %s --input-file %t6 --check-prefix=CHECK-DENSITY-CS-80
@@ -8,6 +8,7 @@
;CHECK-DENSITY: Sample PGO is estimated to optimize better with 2.9x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
;CHECK-DENSITY: Functions with density >= 3.5 account for 99.00% total sample counts.
+;CHECK-DENSITY-CS: Sample PGO is estimated to optimize better with 12.5x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
;CHECK-DENSITY-CS: Functions with density >= 800.1 account for 99.00% total sample counts.
;CHECK-DENSITY-CS-80: Functions with density >= 1860.5 account for 80.00% total sample counts.
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index c7b6f859d881c..8ee0ccff22a5f 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -76,10 +76,9 @@ static cl::opt<int, true> CSProfMaxContextDepth(
cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
static cl::opt<double> ProfileDensityThreshold(
- "profile-density-threshold", llvm::cl::init(20),
- llvm::cl::desc(
- "Set the profile density threshold(default: 20), which is used to "
- "provide suggestions for user to increase the sampling rate.\n"),
+ "profile-density-threshold", llvm::cl::init(50),
+ llvm::cl::desc("If the profile density is below the given threshold, it "
+ "will be suggested to increase the sampling rate."),
llvm::cl::Optional);
static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false),
llvm::cl::desc("show profile density details"),
@@ -748,24 +747,24 @@ void ProfileGenerator::populateBoundarySamplesForAllFunctions(
}
}
-// Note taht ideally the size should be the number of function's instruction.
+// Note that ideally the size should be the number of function instruction.
// However, for probe-based profile, we don't have the accurate instruction
-// count for each probe, Instead, the probe sample is the samples count for the
-// block, which is equivelant to total_instruction_samples/num_instruction in
+// count for each probe, instead, the probe sample is the samples count for the
+// block, which is equivelant to total_instruction_samples/num_of_instruction in
// one block. Hence, we use the number of probe as a proxy for the function's
// size.
void ProfileGeneratorBase::calculateBodySamplesAndSize(
const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
uint64_t &FuncBodySize) {
- for (const auto &I : FSamples.getBodySamples()) {
+ FuncBodySize +=
+ FSamples.getBodySamples().size() + FSamples.getCallsiteSamples().size();
+
+ for (const auto &I : FSamples.getBodySamples())
TotalBodySamples += I.second.getSamples();
- FuncBodySize++;
- }
// The whole function could be inlined and optimized out, use the callsite
// head samples instead to estimate the body count.
- for (const auto &CallsiteSamples : FSamples.getCallsiteSamples()) {
- FuncBodySize++;
+ for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
for (const auto &Callee : CallsiteSamples.second) {
// This is used for caluculating the binary-level density, so the
// inlinees' samples and size should be included in the calculation.
@@ -773,7 +772,6 @@ void ProfileGeneratorBase::calculateBodySamplesAndSize(
FuncBodySize);
TotalBodySamples += Callee.second.getHeadSamplesEstimate();
}
- }
}
// Calculate Profile-density:
>From 5974082a7060ebfe910fc6b958ef01603e755527 Mon Sep 17 00:00:00 2001
From: wlei <wlei at fb.com>
Date: Wed, 22 May 2024 16:20:38 -0700
Subject: [PATCH 6/8] fix comment
---
llvm/tools/llvm-profgen/ProfileGenerator.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 8ee0ccff22a5f..98b12de1fd13a 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -766,8 +766,8 @@ void ProfileGeneratorBase::calculateBodySamplesAndSize(
// head samples instead to estimate the body count.
for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
for (const auto &Callee : CallsiteSamples.second) {
- // This is used for caluculating the binary-level density, so the
- // inlinees' samples and size should be included in the calculation.
+ // For binary-level density, the inlinees' samples and size should be
+ // included in the calculation.
calculateBodySamplesAndSize(Callee.second, TotalBodySamples,
FuncBodySize);
TotalBodySamples += Callee.second.getHeadSamplesEstimate();
>From 7769d0fa8b0cb4724d7362e9667fea1dd4f366ea Mon Sep 17 00:00:00 2001
From: wlei <wlei at fb.com>
Date: Thu, 23 May 2024 10:34:24 -0700
Subject: [PATCH 7/8] fix body samples and update comments
---
.../tools/llvm-profgen/profile-density.test | 2 +-
llvm/tools/llvm-profgen/ProfileGenerator.cpp | 30 +++++++++++--------
2 files changed, 19 insertions(+), 13 deletions(-)
diff --git a/llvm/test/tools/llvm-profgen/profile-density.test b/llvm/test/tools/llvm-profgen/profile-density.test
index 14a6a77a5983b..086697e8da0a5 100644
--- a/llvm/test/tools/llvm-profgen/profile-density.test
+++ b/llvm/test/tools/llvm-profgen/profile-density.test
@@ -11,7 +11,7 @@
;CHECK-DENSITY-CS: Sample PGO is estimated to optimize better with 12.5x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
;CHECK-DENSITY-CS: Functions with density >= 800.1 account for 99.00% total sample counts.
-;CHECK-DENSITY-CS-80: Functions with density >= 1860.5 account for 80.00% total sample counts.
+;CHECK-DENSITY-CS-80: Functions with density >= 1886.2 account for 80.00% total sample counts.
; original code:
; clang -O3 -g -fno-optimize-sibling-calls -fdebug-info-for-profiling qsort.c -o a.out
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 98b12de1fd13a..fdc1f1e9e79aa 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -747,30 +747,36 @@ void ProfileGenerator::populateBoundarySamplesForAllFunctions(
}
}
-// Note that ideally the size should be the number of function instruction.
-// However, for probe-based profile, we don't have the accurate instruction
-// count for each probe, instead, the probe sample is the samples count for the
-// block, which is equivelant to total_instruction_samples/num_of_instruction in
-// one block. Hence, we use the number of probe as a proxy for the function's
-// size.
void ProfileGeneratorBase::calculateBodySamplesAndSize(
const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
uint64_t &FuncBodySize) {
- FuncBodySize +=
- FSamples.getBodySamples().size() + FSamples.getCallsiteSamples().size();
-
+ // Note that ideally the size should be the number of function instruction.
+ // However, for probe-based profile, we don't have the accurate instruction
+ // count for each probe, instead, the probe sample is the samples count for
+ // the block, which is equivalent to
+ // total_instruction_samples/num_of_instruction in one block. Hence, we use
+ // the number of probe as a proxy for the function's size.
+ FuncBodySize += FSamples.getBodySamples().size();
+
+ // The accumulated body samples re-calculated here could be different from the
+ // TotalSamples(getTotalSamples) field of FunctionSamples for line-number
+ // based profile. The reason is that TotalSamples is the sum of all the
+ // samples of the machine instruction in one source-code line, however, the
+ // entry of BodySamples is only the max number of them, so the TotalSamples is
+ // usually much bigger than the accumulated body samples as one source-code
+ // line can emit many machine instructions. We observed a regression when we
+ // switched to use the accumulated body samples(by using
+ // -update-total-samples). Hence, it's safer to re-calculate here to avoid
+ // such discrepancy.
for (const auto &I : FSamples.getBodySamples())
TotalBodySamples += I.second.getSamples();
- // The whole function could be inlined and optimized out, use the callsite
- // head samples instead to estimate the body count.
for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
for (const auto &Callee : CallsiteSamples.second) {
// For binary-level density, the inlinees' samples and size should be
// included in the calculation.
calculateBodySamplesAndSize(Callee.second, TotalBodySamples,
FuncBodySize);
- TotalBodySamples += Callee.second.getHeadSamplesEstimate();
}
}
>From 4539a39060549a370f3e1cb31e78a01877de2975 Mon Sep 17 00:00:00 2001
From: wlei <wlei at fb.com>
Date: Thu, 23 May 2024 12:20:09 -0700
Subject: [PATCH 8/8] update comments
---
llvm/tools/llvm-profgen/ProfileGenerator.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index fdc1f1e9e79aa..2118e954fe543 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -767,7 +767,8 @@ void ProfileGeneratorBase::calculateBodySamplesAndSize(
// line can emit many machine instructions. We observed a regression when we
// switched to use the accumulated body samples(by using
// -update-total-samples). Hence, it's safer to re-calculate here to avoid
- // such discrepancy.
+ // such discrepancy. There is no problem for probe-based profile, as the
+ // TotalSamples is exactly the same as the accumulated body samples.
for (const auto &I : FSamples.getBodySamples())
TotalBodySamples += I.second.getSamples();
More information about the llvm-commits
mailing list