[llvm] f4711e0 - [CSSPGO] Sort function offset table to speed up profile loading.

Hongtao Yu via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 1 12:18:05 PDT 2021


Author: Hongtao Yu
Date: 2021-09-01T12:17:48-07:00
New Revision: f4711e0d009b7e62fc1fa3c070878ad366710487

URL: https://github.com/llvm/llvm-project/commit/f4711e0d009b7e62fc1fa3c070878ad366710487
DIFF: https://github.com/llvm/llvm-project/commit/f4711e0d009b7e62fc1fa3c070878ad366710487.diff

LOG: [CSSPGO] Sort function offset table to speed up profile loading.

With the context split work, the context-based (an array of strings) sorting performed at profile load time is way more expansive than single-string-based sorting. This is likely due to auxiliary operations done on each array element, such as indirect references, std::min operations, also likely cache misses. In this change I'm presorting profiles during profile generation time to avoid sorting at compile time.

Compared to the previous context-split work, this effectively cuts down compile time by 20% for one of our large services and brings us closer to non-CS build, with still a small gap in build time.

Reviewed By: wenlei, wmi

Differential Revision: https://reviews.llvm.org/D109036

Added: 
    

Modified: 
    llvm/include/llvm/ProfileData/SampleProf.h
    llvm/include/llvm/ProfileData/SampleProfReader.h
    llvm/lib/ProfileData/SampleProfReader.cpp
    llvm/lib/ProfileData/SampleProfWriter.cpp
    llvm/test/Transforms/SampleProfile/csspgo-import-list.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h
index e67ea80362432..043e402603075 100644
--- a/llvm/include/llvm/ProfileData/SampleProf.h
+++ b/llvm/include/llvm/ProfileData/SampleProf.h
@@ -208,6 +208,13 @@ enum class SecFuncMetadataFlags : uint32_t {
   SecFlagHasAttribute = (1 << 1)
 };
 
+enum class SecFuncOffsetFlags : uint32_t {
+  SecFlagInvalid = 0,
+  // Store function offsets in an order of contexts. The order ensures that
+  // callee contexts of a given context laid out next to it.
+  SecFlagOrdered = (1 << 0),
+};
+
 // Verify section specific flag is used for the correct section.
 template <class SecFlagType>
 static inline void verifySecFlag(SecType Type, SecFlagType Flag) {
@@ -228,6 +235,8 @@ static inline void verifySecFlag(SecType Type, SecFlagType Flag) {
     IsFlagLegal = std::is_same<SecFuncMetadataFlags, SecFlagType>();
     break;
   default:
+  case SecFuncOffsetTable:
+    IsFlagLegal = std::is_same<SecFuncOffsetFlags, SecFlagType>();
     break;
   }
   if (!IsFlagLegal)

diff  --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h
index a27a6f2a05f2c..f9a7830335bb7 100644
--- a/llvm/include/llvm/ProfileData/SampleProfReader.h
+++ b/llvm/include/llvm/ProfileData/SampleProfReader.h
@@ -720,6 +720,11 @@ class SampleProfileReaderExtBinaryBase : public SampleProfileReaderBinary {
   /// The table mapping from function context to the offset of its
   /// FunctionSample towards file start.
   DenseMap<SampleContext, uint64_t> FuncOffsetTable;
+
+  /// Function offset mapping ordered by contexts.
+  std::unique_ptr<std::vector<std::pair<SampleContext, uint64_t>>>
+      OrderedFuncOffsets;
+
   /// The set containing the functions to use when compiling a module.
   DenseSet<StringRef> FuncsToUse;
 
@@ -746,6 +751,8 @@ class SampleProfileReaderExtBinaryBase : public SampleProfileReaderBinary {
   /// SecFlagFlat flag.
   bool SkipFlatProf = false;
 
+  bool FuncOffsetsOrdered = false;
+
 public:
   SampleProfileReaderExtBinaryBase(std::unique_ptr<MemoryBuffer> B,
                                    LLVMContext &C, SampleProfileFormat Format)

diff  --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp
index 838b4038438eb..71f203502c975 100644
--- a/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -675,6 +675,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection(
       return EC;
     break;
   case SecFuncOffsetTable:
+    FuncOffsetsOrdered = hasSecFlag(Entry, SecFuncOffsetFlags::SecFlagOrdered);
     if (std::error_code EC = readFuncOffsetTable())
       return EC;
     break;
@@ -720,17 +721,27 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncOffsetTable() {
     return EC;
 
   FuncOffsetTable.reserve(*Size);
+
+  if (FuncOffsetsOrdered) {
+    OrderedFuncOffsets =
+        std::make_unique<std::vector<std::pair<SampleContext, uint64_t>>>();
+    OrderedFuncOffsets->reserve(*Size);
+  }
+
   for (uint32_t I = 0; I < *Size; ++I) {
-    auto FName(readSampleContextFromTable());
-    if (std::error_code EC = FName.getError())
+    auto FContext(readSampleContextFromTable());
+    if (std::error_code EC = FContext.getError())
       return EC;
 
     auto Offset = readNumber<uint64_t>();
     if (std::error_code EC = Offset.getError())
       return EC;
 
-    FuncOffsetTable[*FName] = *Offset;
+    FuncOffsetTable[*FContext] = *Offset;
+    if (FuncOffsetsOrdered)
+      OrderedFuncOffsets->emplace_back(*FContext, *Offset);
   }
+
   return sampleprof_error::success;
 }
 
@@ -760,42 +771,43 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncProfiles() {
     }
 
     if (ProfileIsCS) {
-      // Compute the ordered set of names, so we can
-      // get all context profiles under a subtree by
-      // iterating through the ordered names.
-      std::set<SampleContext> OrderedContexts;
-      for (auto Name : FuncOffsetTable) {
-        OrderedContexts.insert(Name.first);
-      }
-
       DenseSet<uint64_t> FuncGuidsToUse;
       if (useMD5()) {
         for (auto Name : FuncsToUse)
           FuncGuidsToUse.insert(Function::getGUID(Name));
       }
 
-      // For each function in current module, load all
-      // context profiles for the function.
-      for (auto NameOffset : FuncOffsetTable) {
-        SampleContext FContext = NameOffset.first;
-        auto FuncName = FContext.getName();
-        if ((useMD5() && !FuncGuidsToUse.count(std::stoull(FuncName.data()))) ||
-            (!useMD5() && !FuncsToUse.count(FuncName) &&
-             (!Remapper || !Remapper->exist(FuncName))))
-          continue;
-
-        // For each context profile we need, try to load
-        // all context profile in the subtree. This can
-        // help profile guided importing for ThinLTO.
-        auto It = OrderedContexts.find(FContext);
-        while (It != OrderedContexts.end() && FContext.IsPrefixOf(*It)) {
-          const uint8_t *FuncProfileAddr = Start + FuncOffsetTable[*It];
+      // For each function in current module, load all context profiles for
+      // the function as well as their callee contexts which can help profile
+      // guided importing for ThinLTO. This can be achieved by walking
+      // through an ordered context container, where contexts are laid out
+      // as if they were walked in preorder of a context trie. While
+      // traversing the trie, a link to the highest common ancestor node is
+      // kept so that all of its decendants will be loaded.
+      assert(OrderedFuncOffsets.get() &&
+             "func offset table should always be sorted in CS profile");
+      const SampleContext *CommonContext = nullptr;
+      for (const auto &NameOffset : *OrderedFuncOffsets) {
+        const auto &FContext = NameOffset.first;
+        auto FName = FContext.getName();
+        // For function in the current module, keep its farthest ancestor
+        // context. This can be used to load itself and its child and
+        // sibling contexts.
+        if ((useMD5() && FuncGuidsToUse.count(std::stoull(FName.data()))) ||
+            (!useMD5() && (FuncsToUse.count(FName) ||
+                           (Remapper && Remapper->exist(FName))))) {
+          if (!CommonContext || !CommonContext->IsPrefixOf(FContext))
+            CommonContext = &FContext;
+        }
+
+        if (CommonContext == &FContext ||
+            (CommonContext && CommonContext->IsPrefixOf(FContext))) {
+          // Load profile for the current context which originated from
+          // the common ancestor.
+          const uint8_t *FuncProfileAddr = Start + NameOffset.second;
           assert(FuncProfileAddr < End && "out of LBRProfile section");
           if (std::error_code EC = readFuncProfile(FuncProfileAddr))
             return EC;
-          // Remove loaded context profile so we won't
-          // load it repeatedly.
-          It = OrderedContexts.erase(It);
         }
       }
     } else {
@@ -1212,6 +1224,10 @@ static std::string getSecFlagsStr(const SecHdrTableEntry &Entry) {
     if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFSDiscriminator))
       Flags.append("fs-discriminator,");
     break;
+  case SecFuncOffsetTable:
+    if (hasSecFlag(Entry, SecFuncOffsetFlags::SecFlagOrdered))
+      Flags.append("ordered,");
+    break;
   default:
     break;
   }

diff  --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp
index 415a7e70ac532..7fecd0dc7388f 100644
--- a/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -165,11 +165,31 @@ std::error_code SampleProfileWriterExtBinaryBase::writeFuncOffsetTable() {
   encodeULEB128(FuncOffsetTable.size(), OS);
 
   // Write out FuncOffsetTable.
-  for (auto Entry : FuncOffsetTable) {
-    if (std::error_code EC = writeContextIdx(Entry.first))
+  auto WriteItem = [&](const SampleContext &Context, uint64_t Offset) {
+    if (std::error_code EC = writeContextIdx(Context))
       return EC;
-    encodeULEB128(Entry.second, OS);
+    encodeULEB128(Offset, OS);
+    return (std::error_code)sampleprof_error::success;
+  };
+
+  if (FunctionSamples::ProfileIsCS) {
+    // Sort the contexts before writing them out. This is to help fast load all
+    // context profiles for a function as well as their callee contexts which
+    // can help profile-guided importing for ThinLTO.
+    std::map<SampleContext, uint64_t> OrderedFuncOffsetTable(
+        FuncOffsetTable.begin(), FuncOffsetTable.end());
+    for (const auto &Entry : OrderedFuncOffsetTable) {
+      if (std::error_code EC = WriteItem(Entry.first, Entry.second))
+        return EC;
+    }
+    addSectionFlag(SecFuncOffsetTable, SecFuncOffsetFlags::SecFlagOrdered);
+  } else {
+    for (const auto &Entry : FuncOffsetTable) {
+      if (std::error_code EC = WriteItem(Entry.first, Entry.second))
+        return EC;
+    }
   }
+
   FuncOffsetTable.clear();
   return sampleprof_error::success;
 }

diff  --git a/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll b/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
index 5665d7759efe2..cd8ac4188ac45 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
@@ -2,8 +2,11 @@
 ; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list.prof -S | FileCheck %s
 ; RUN: llvm-profdata merge --sample --extbinary %S/Inputs/csspgo-import-list.prof -o %t.prof
 ; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%t.prof -S | FileCheck %s
+; RUN: llvm-profdata show --sample -show-sec-info-only %t.prof | FileCheck %s --check-prefix=CHECK-ORDERED
 ; RUN: llvm-profdata merge --sample --extbinary --use-md5 %S/Inputs/csspgo-import-list.prof -o %t.md5
 ; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%t.md5 -S | FileCheck %s
+; RUN: llvm-profdata show --sample -show-sec-info-only %t.md5 | FileCheck %s --check-prefix=CHECK-ORDERED
+
 
 declare i32 @_Z5funcBi(i32 %x)
 declare i32 @_Z5funcAi(i32 %x)
@@ -32,6 +35,7 @@ for.body:                                         ; preds = %for.body, %entry
 ; CHECK: distinct !DISubprogram(name: "main"
 ; CHECK: !{!"function_entry_count", i64 3, i64 446061515086924981, i64 3815895320998406042, i64 7102633082150537521, i64 -2862076748587597320}
 
+; CHECK-ORDERED: FuncOffsetTableSection {{.*}} {ordered}
 
 attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
 


        


More information about the llvm-commits mailing list