[llvm] f4711e0 - [CSSPGO] Sort function offset table to speed up profile loading.
Hongtao Yu via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 1 12:18:05 PDT 2021
Author: Hongtao Yu
Date: 2021-09-01T12:17:48-07:00
New Revision: f4711e0d009b7e62fc1fa3c070878ad366710487
URL: https://github.com/llvm/llvm-project/commit/f4711e0d009b7e62fc1fa3c070878ad366710487
DIFF: https://github.com/llvm/llvm-project/commit/f4711e0d009b7e62fc1fa3c070878ad366710487.diff
LOG: [CSSPGO] Sort function offset table to speed up profile loading.
With the context split work, the context-based (an array of strings) sorting performed at profile load time is much more expensive than single-string-based sorting. This is likely due to auxiliary operations done on each array element, such as indirect references and std::min operations, and likely also cache misses. In this change I'm presorting profiles during profile generation time to avoid sorting at compile time.
Compared to the previous context-split work, this effectively cuts compile time by 20% for one of our large services and brings us closer to the non-CS build, though a small gap in build time remains.
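To make the intent concrete, here is a minimal sketch of the write-time presorting idea, using hypothetical toy types (plain strings and an unordered map) rather than the real SampleContext machinery: the writer pays the sorting cost once at profile generation, so every compile that consumes the profile reads the entries already in order.

    #include <cstdint>
    #include <map>
    #include <string>
    #include <unordered_map>

    // Writer side (profile generation): copy the unordered offset table into an
    // ordered container once, then emit the entries in that order.
    std::map<std::string, uint64_t>
    presortOffsets(const std::unordered_map<std::string, uint64_t> &Unordered) {
      // std::map keeps keys sorted, so iteration yields the presorted layout.
      return std::map<std::string, uint64_t>(Unordered.begin(), Unordered.end());
    }

The reader can then append entries to a vector as it decodes them and consume them sequentially, with no per-compile sort.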
Reviewed By: wenlei, wmi
Differential Revision: https://reviews.llvm.org/D109036
Added:
Modified:
llvm/include/llvm/ProfileData/SampleProf.h
llvm/include/llvm/ProfileData/SampleProfReader.h
llvm/lib/ProfileData/SampleProfReader.cpp
llvm/lib/ProfileData/SampleProfWriter.cpp
llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h
index e67ea80362432..043e402603075 100644
--- a/llvm/include/llvm/ProfileData/SampleProf.h
+++ b/llvm/include/llvm/ProfileData/SampleProf.h
@@ -208,6 +208,13 @@ enum class SecFuncMetadataFlags : uint32_t {
SecFlagHasAttribute = (1 << 1)
};
+enum class SecFuncOffsetFlags : uint32_t {
+ SecFlagInvalid = 0,
+ // Store function offsets in the order of contexts. The order ensures that
+ // callee contexts of a given context are laid out next to it.
+ SecFlagOrdered = (1 << 0),
+};
+
// Verify section specific flag is used for the correct section.
template <class SecFlagType>
static inline void verifySecFlag(SecType Type, SecFlagType Flag) {
@@ -228,6 +235,8 @@ static inline void verifySecFlag(SecType Type, SecFlagType Flag) {
IsFlagLegal = std::is_same<SecFuncMetadataFlags, SecFlagType>();
break;
default:
+ case SecFuncOffsetTable:
+ IsFlagLegal = std::is_same<SecFuncOffsetFlags, SecFlagType>();
break;
}
if (!IsFlagLegal)
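To illustrate what SecFlagOrdered advertises (the context names below are hypothetical), an ordered function offset table lays each callee context out immediately after its caller context, for example:

    main
    main:3 @ foo
    main:3 @ foo:5 @ bar
    main:8 @ baz

With this layout, loading every context under main, or under main:3 @ foo, is a single contiguous scan rather than a search.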
diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h
index a27a6f2a05f2c..f9a7830335bb7 100644
--- a/llvm/include/llvm/ProfileData/SampleProfReader.h
+++ b/llvm/include/llvm/ProfileData/SampleProfReader.h
@@ -720,6 +720,11 @@ class SampleProfileReaderExtBinaryBase : public SampleProfileReaderBinary {
/// The table mapping from function context to the offset of its
/// FunctionSample towards file start.
DenseMap<SampleContext, uint64_t> FuncOffsetTable;
+
+ /// Function offset mapping ordered by contexts.
+ std::unique_ptr<std::vector<std::pair<SampleContext, uint64_t>>>
+ OrderedFuncOffsets;
+
/// The set containing the functions to use when compiling a module.
DenseSet<StringRef> FuncsToUse;
@@ -746,6 +751,8 @@ class SampleProfileReaderExtBinaryBase : public SampleProfileReaderBinary {
/// SecFlagFlat flag.
bool SkipFlatProf = false;
+ bool FuncOffsetsOrdered = false;
+
public:
SampleProfileReaderExtBinaryBase(std::unique_ptr<MemoryBuffer> B,
LLVMContext &C, SampleProfileFormat Format)
diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp
index 838b4038438eb..71f203502c975 100644
--- a/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -675,6 +675,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection(
return EC;
break;
case SecFuncOffsetTable:
+ FuncOffsetsOrdered = hasSecFlag(Entry, SecFuncOffsetFlags::SecFlagOrdered);
if (std::error_code EC = readFuncOffsetTable())
return EC;
break;
@@ -720,17 +721,27 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncOffsetTable() {
return EC;
FuncOffsetTable.reserve(*Size);
+
+ if (FuncOffsetsOrdered) {
+ OrderedFuncOffsets =
+ std::make_unique<std::vector<std::pair<SampleContext, uint64_t>>>();
+ OrderedFuncOffsets->reserve(*Size);
+ }
+
for (uint32_t I = 0; I < *Size; ++I) {
- auto FName(readSampleContextFromTable());
- if (std::error_code EC = FName.getError())
+ auto FContext(readSampleContextFromTable());
+ if (std::error_code EC = FContext.getError())
return EC;
auto Offset = readNumber<uint64_t>();
if (std::error_code EC = Offset.getError())
return EC;
- FuncOffsetTable[*FName] = *Offset;
+ FuncOffsetTable[*FContext] = *Offset;
+ if (FuncOffsetsOrdered)
+ OrderedFuncOffsets->emplace_back(*FContext, *Offset);
}
+
return sampleprof_error::success;
}
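The extra OrderedFuncOffsets vector is needed because DenseMap iteration order is unspecified, so only a sequence container can preserve the sorted order the writer produced. A minimal sketch of the shape of the two containers, using an unordered map and plain string keys as stand-ins for the real types:

    #include <cstdint>
    #include <string>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    struct OffsetTables {
      // Fast point lookups by context; iteration order is unspecified.
      std::unordered_map<std::string, uint64_t> ByContext;
      // Mirrors the on-disk (sorted) order for sequential subtree walks.
      std::vector<std::pair<std::string, uint64_t>> InFileOrder;

      void add(const std::string &Ctx, uint64_t Offset) {
        ByContext[Ctx] = Offset;
        InFileOrder.emplace_back(Ctx, Offset);
      }
    };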
@@ -760,42 +771,43 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncProfiles() {
}
if (ProfileIsCS) {
- // Compute the ordered set of names, so we can
- // get all context profiles under a subtree by
- // iterating through the ordered names.
- std::set<SampleContext> OrderedContexts;
- for (auto Name : FuncOffsetTable) {
- OrderedContexts.insert(Name.first);
- }
-
DenseSet<uint64_t> FuncGuidsToUse;
if (useMD5()) {
for (auto Name : FuncsToUse)
FuncGuidsToUse.insert(Function::getGUID(Name));
}
- // For each function in current module, load all
- // context profiles for the function.
- for (auto NameOffset : FuncOffsetTable) {
- SampleContext FContext = NameOffset.first;
- auto FuncName = FContext.getName();
- if ((useMD5() && !FuncGuidsToUse.count(std::stoull(FuncName.data()))) ||
- (!useMD5() && !FuncsToUse.count(FuncName) &&
- (!Remapper || !Remapper->exist(FuncName))))
- continue;
-
- // For each context profile we need, try to load
- // all context profile in the subtree. This can
- // help profile guided importing for ThinLTO.
- auto It = OrderedContexts.find(FContext);
- while (It != OrderedContexts.end() && FContext.IsPrefixOf(*It)) {
- const uint8_t *FuncProfileAddr = Start + FuncOffsetTable[*It];
+ // For each function in the current module, load all context profiles for
+ // the function as well as their callee contexts, which can help
+ // profile-guided importing for ThinLTO. This can be achieved by walking
+ // through an ordered context container, where contexts are laid out
+ // as if they were visited in preorder of a context trie. While
+ // traversing the trie, a link to the highest common ancestor node is
+ // kept so that all of its descendants will be loaded.
+ assert(OrderedFuncOffsets.get() &&
+ "func offset table should always be sorted in CS profile");
+ const SampleContext *CommonContext = nullptr;
+ for (const auto &NameOffset : *OrderedFuncOffsets) {
+ const auto &FContext = NameOffset.first;
+ auto FName = FContext.getName();
+ // For a function in the current module, keep its farthest ancestor
+ // context. This can be used to load the function itself and its child
+ // and sibling contexts.
+ if ((useMD5() && FuncGuidsToUse.count(std::stoull(FName.data()))) ||
+ (!useMD5() && (FuncsToUse.count(FName) ||
+ (Remapper && Remapper->exist(FName))))) {
+ if (!CommonContext || !CommonContext->IsPrefixOf(FContext))
+ CommonContext = &FContext;
+ }
+
+ if (CommonContext == &FContext ||
+ (CommonContext && CommonContext->IsPrefixOf(FContext))) {
+ // Load profile for the current context which originated from
+ // the common ancestor.
+ const uint8_t *FuncProfileAddr = Start + NameOffset.second;
assert(FuncProfileAddr < End && "out of LBRProfile section");
if (std::error_code EC = readFuncProfile(FuncProfileAddr))
return EC;
- // Remove loaded context profile so we won't
- // load it repeatedly.
- It = OrderedContexts.erase(It);
}
}
} else {
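The loop above depends only on the sorted order plus a prefix test. A standalone sketch of the same single-pass walk over plain strings, with hypothetical contexts, a string-prefix stand-in for SampleContext::IsPrefixOf, and a toy substring match in place of the FuncsToUse/GUID lookup:

    #include <string>
    #include <vector>

    // Return every context that should be loaded: each context matching a
    // module function plus all of its descendant contexts, found in one
    // forward pass over the sorted list.
    std::vector<std::string>
    selectContexts(const std::vector<std::string> &SortedContexts,
                   const std::vector<std::string> &FuncsInModule) {
      auto IsPrefixOf = [](const std::string &A, const std::string &B) {
        return B.size() >= A.size() && B.compare(0, A.size(), A) == 0;
      };
      auto IsInteresting = [&](const std::string &Ctx) {
        for (const auto &F : FuncsInModule)
          if (Ctx.find(F) != std::string::npos) // toy stand-in for FuncsToUse
            return true;
        return false;
      };

      std::vector<std::string> ToLoad;
      const std::string *Common = nullptr; // farthest matching ancestor so far
      for (const auto &Ctx : SortedContexts) {
        if (IsInteresting(Ctx) && (!Common || !IsPrefixOf(*Common, Ctx)))
          Common = &Ctx; // start a new subtree rooted at this context
        if (Common && IsPrefixOf(*Common, Ctx))
          ToLoad.push_back(Ctx); // the ancestor itself or one of its descendants
      }
      return ToLoad;
    }

For example, with SortedContexts = {"main", "main:3 @ foo", "main:3 @ foo:5 @ bar", "other"} and FuncsInModule = {"main"}, the walk loads the first three entries and skips "other".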
@@ -1212,6 +1224,10 @@ static std::string getSecFlagsStr(const SecHdrTableEntry &Entry) {
if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFSDiscriminator))
Flags.append("fs-discriminator,");
break;
+ case SecFuncOffsetTable:
+ if (hasSecFlag(Entry, SecFuncOffsetFlags::SecFlagOrdered))
+ Flags.append("ordered,");
+ break;
default:
break;
}
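With the getSecFlagsStr change, the new flag becomes visible when dumping section info. The field names and numbers below are placeholders and the output shape is approximate (the updated test at the end of this patch checks the real format), but it looks roughly like:

    $ llvm-profdata show --sample -show-sec-info-only foo.profdata
    ...
    FuncOffsetTableSection - Offset: <offset>, Size: <size>, Flags: {ordered}
    ...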
diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp
index 415a7e70ac532..7fecd0dc7388f 100644
--- a/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -165,11 +165,31 @@ std::error_code SampleProfileWriterExtBinaryBase::writeFuncOffsetTable() {
encodeULEB128(FuncOffsetTable.size(), OS);
// Write out FuncOffsetTable.
- for (auto Entry : FuncOffsetTable) {
- if (std::error_code EC = writeContextIdx(Entry.first))
+ auto WriteItem = [&](const SampleContext &Context, uint64_t Offset) {
+ if (std::error_code EC = writeContextIdx(Context))
return EC;
- encodeULEB128(Entry.second, OS);
+ encodeULEB128(Offset, OS);
+ return (std::error_code)sampleprof_error::success;
+ };
+
+ if (FunctionSamples::ProfileIsCS) {
+ // Sort the contexts before writing them out. This helps load all context
+ // profiles for a function, as well as their callee contexts, quickly at
+ // compile time, which in turn helps profile-guided importing for ThinLTO.
+ std::map<SampleContext, uint64_t> OrderedFuncOffsetTable(
+ FuncOffsetTable.begin(), FuncOffsetTable.end());
+ for (const auto &Entry : OrderedFuncOffsetTable) {
+ if (std::error_code EC = WriteItem(Entry.first, Entry.second))
+ return EC;
+ }
+ addSectionFlag(SecFuncOffsetTable, SecFuncOffsetFlags::SecFlagOrdered);
+ } else {
+ for (const auto &Entry : FuncOffsetTable) {
+ if (std::error_code EC = WriteItem(Entry.first, Entry.second))
+ return EC;
+ }
}
+
FuncOffsetTable.clear();
return sampleprof_error::success;
}
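A std::map keyed by the context works for this because, under the frame-wise comparison, a context sorts before any context it is a prefix of, so callers are laid out immediately before their callees. A toy illustration of that ordering with plain strings (hypothetical context names):

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <string>

    int main() {
      // Inserted out of order; std::map keeps the keys sorted.
      std::map<std::string, uint64_t> Offsets = {
          {"main:8 @ baz", 300},
          {"main", 100},
          {"main:3 @ foo:5 @ bar", 250},
          {"main:3 @ foo", 200},
      };
      // Iteration visits: main, main:3 @ foo, main:3 @ foo:5 @ bar,
      // main:8 @ baz -- every callee context directly follows its caller.
      for (const auto &KV : Offsets)
        std::cout << KV.first << " -> " << KV.second << "\n";
      return 0;
    }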
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll b/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
index 5665d7759efe2..cd8ac4188ac45 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
@@ -2,8 +2,11 @@
; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list.prof -S | FileCheck %s
; RUN: llvm-profdata merge --sample --extbinary %S/Inputs/csspgo-import-list.prof -o %t.prof
; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%t.prof -S | FileCheck %s
+; RUN: llvm-profdata show --sample -show-sec-info-only %t.prof | FileCheck %s --check-prefix=CHECK-ORDERED
; RUN: llvm-profdata merge --sample --extbinary --use-md5 %S/Inputs/csspgo-import-list.prof -o %t.md5
; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%t.md5 -S | FileCheck %s
+; RUN: llvm-profdata show --sample -show-sec-info-only %t.md5 | FileCheck %s --check-prefix=CHECK-ORDERED
+
declare i32 @_Z5funcBi(i32 %x)
declare i32 @_Z5funcAi(i32 %x)
@@ -32,6 +35,7 @@ for.body: ; preds = %for.body, %entry
; CHECK: distinct !DISubprogram(name: "main"
; CHECK: !{!"function_entry_count", i64 3, i64 446061515086924981, i64 3815895320998406042, i64 7102633082150537521, i64 -2862076748587597320}
+; CHECK-ORDERED: FuncOffsetTableSection {{.*}} {ordered}
attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }