[llvm] [BOLT] Use symbol table info in registerFragment (PR #89648)

Amir Ayupov via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 22 11:42:18 PDT 2024


https://github.com/aaupov updated https://github.com/llvm/llvm-project/pull/89648

>From 4245b120999b4a1c23a1bfcdb21eb72ec842a897 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Sun, 21 Apr 2024 01:15:30 -0700
Subject: [PATCH 1/2] [BOLT] Use symbol table info in registerFragment

Fragment matching relies on symbol names to identify and register split
function fragments. However, as split fragments are local symbols,
name aliasing is possible. For such cases, use symbol table to resolve
ambiguities:
- find FILE symbol the fragment belongs to,
- iterate over local symbols belonging to the same file and check if
  parent symbol is present,
- if a local parent symbol is found, use it, otherwise use global
  parent symbol.

This requires the presence of FILE symbols in the input binary. As BOLT
requires non-stripped binary, this is a reasonable assumption. Note
however that `strip -g` removes FILE symbols by default, but
`--keep-file-symbols` can be used to preserve these.

We opted to resolve ambiguous cases one by one because there are
typically very few such cases and symbol table parsing is relatively
expensive:

python3.8.6:
Total symbols: 17522
Function symbols: 7877 (45% total)
Fragments: 2551 (15% total)
Fragments with ambiguous parent: 31 (0.1% total)

Large binary:
Total symbols: 3032180
Function symbols: 1637209 (54% total)
Fragments: 43077 (0.1% total)
Fragments with ambiguous parent: 180 (0.006% total)

Test Plan:
Updated X86/fragment-lite.s
---
 bolt/include/bolt/Rewrite/RewriteInstance.h |   3 +
 bolt/include/bolt/Utils/NameResolver.h      |   7 ++
 bolt/lib/Rewrite/RewriteInstance.cpp        | 105 +++++++++++++++-----
 bolt/test/X86/fragment-lite.s               |  54 ++++++++--
 4 files changed, 135 insertions(+), 34 deletions(-)

diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h
index af832b4c7c84cf..d35c85567c33e3 100644
--- a/bolt/include/bolt/Rewrite/RewriteInstance.h
+++ b/bolt/include/bolt/Rewrite/RewriteInstance.h
@@ -490,6 +490,9 @@ class RewriteInstance {
   /// Store all non-zero symbols in this map for a quick address lookup.
   std::map<uint64_t, llvm::object::SymbolRef> FileSymRefs;
 
+  /// FILE symbols.
+  std::vector<object::DataRefImpl> FileSymbols;
+
   std::unique_ptr<DWARFRewriter> DebugInfoRewriter;
 
   std::unique_ptr<BoltAddressTranslation> BAT;
diff --git a/bolt/include/bolt/Utils/NameResolver.h b/bolt/include/bolt/Utils/NameResolver.h
index 2e3ac20a532d76..b6312642e72c8c 100644
--- a/bolt/include/bolt/Utils/NameResolver.h
+++ b/bolt/include/bolt/Utils/NameResolver.h
@@ -28,6 +28,13 @@ class NameResolver {
   static constexpr char Sep = '/';
 
 public:
+  /// Return the number of duplicates registered for a given \p Name.
+  uint64_t getNumDuplicates(StringRef Name) {
+    if (Counters.contains(Name))
+      return Counters.at(Name);
+    return 0;
+  }
+
   /// Return unique version of the \p Name in the form "Name<Sep><Number>".
   std::string uniquify(StringRef Name) {
     const uint64_t ID = ++Counters[Name];
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 4e0096cf988aed..dc2105d090b6a9 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -840,6 +840,7 @@ void RewriteInstance::discoverFileObjects() {
       continue;
 
     if (cantFail(Symbol.getType()) == SymbolRef::ST_File) {
+      FileSymbols.emplace_back(Symbol.getRawDataRefImpl());
       StringRef Name =
           cantFail(std::move(NameOrError), "cannot get symbol name for file");
       // Ignore Clang LTO artificial FILE symbol as it is not always generated,
@@ -1417,50 +1418,106 @@ void RewriteInstance::registerFragments() {
   if (!BC->HasSplitFunctions)
     return;
 
+  std::vector<std::pair<StringRef, BinaryFunction *>> AmbiguousFragments;
   for (auto &BFI : BC->getBinaryFunctions()) {
     BinaryFunction &Function = BFI.second;
     if (!Function.isFragment())
       continue;
-    unsigned ParentsFound = 0;
-    for (StringRef Name : Function.getNames()) {
+    for (MCSymbol *Symbol : Function.Symbols) {
+      StringRef Name = Symbol->getName();
       StringRef BaseName, Suffix;
       std::tie(BaseName, Suffix) = Name.split('/');
       const size_t ColdSuffixPos = BaseName.find(".cold");
       if (ColdSuffixPos == StringRef::npos)
         continue;
-      // For cold function with local (foo.cold/1) symbol, prefer a parent with
-      // local symbol as well (foo/1) over global symbol (foo).
-      std::string ParentName = BaseName.substr(0, ColdSuffixPos).str();
+      StringRef ParentName = BaseName.substr(0, ColdSuffixPos);
       const BinaryData *BD = BC->getBinaryDataByName(ParentName);
-      if (Suffix != "") {
-        ParentName.append(Twine("/", Suffix).str());
-        const BinaryData *BDLocal = BC->getBinaryDataByName(ParentName);
-        if (BDLocal || !BD)
-          BD = BDLocal;
-      }
-      if (!BD) {
-        if (opts::Verbosity >= 1)
-          BC->outs() << "BOLT-INFO: parent function not found for " << Name
-                     << "\n";
+      uint64_t NumPossibleLocalParents = NR.getNumDuplicates(ParentName);
+      // The most common case: single local parent fragment.
+      if (!BD && NumPossibleLocalParents == 1) {
+        BD = BC->getBinaryDataByName((Twine(ParentName) + "/1").str());
+      } else if (BD && (!NumPossibleLocalParents || Suffix.empty())) {
+        // Global parent and either no local candidates (second most common), or
+        // the fragment is global as well (uncommon).
+      } else {
+        // Any other case: need to disambiguate using FILE symbols.
+        AmbiguousFragments.emplace_back(ParentName, &Function);
         continue;
       }
+      assert(BD);
       const uint64_t Address = BD->getAddress();
       BinaryFunction *BF = BC->getBinaryFunctionAtAddress(Address);
       if (!BF) {
-        if (opts::Verbosity >= 1)
-          BC->outs() << formatv(
-              "BOLT-INFO: parent function not found at {0:x}\n", Address);
-        continue;
+        BC->errs() << "BOLT-ERROR: parent function not found for " << Function
+                   << '\n';
+        exit(1);
       }
       BC->registerFragment(Function, *BF);
-      ++ParentsFound;
     }
-    if (!ParentsFound) {
-      BC->errs() << "BOLT-ERROR: parent function not found for " << Function
-                 << '\n';
-      exit(1);
+  }
+
+  if (AmbiguousFragments.empty())
+    return;
+
+  if (!BC->hasSymbolsWithFileName()) {
+    BC->errs() << "BOLT-ERROR: input file has split functions but does not "
+                  "have FILE symbols. If the binary was stripped, preserve "
+                  "FILE symbols with --keep-file-symbols strip option.";
+    exit(1);
+  }
+
+  // The first global symbol is identified by the symbol table sh_info value.
+  // Used as local symbol search stopping point.
+  uint32_t FirstGlobal{0};
+  auto *ELF64LEFile = cast<ELF64LEObjectFile>(InputFile);
+  const ELFFile<ELF64LE> &Obj = ELF64LEFile->getELFFile();
+  for (const auto &Sec : cantFail(Obj.sections())) {
+    if (Sec.sh_type == ELF::SHT_SYMTAB) {
+      FirstGlobal = Sec.sh_info;
+      break;
     }
   }
+
+  for (auto &[ParentName, BF]: AmbiguousFragments) {
+    const uint64_t Address = BF->getAddress();
+
+    // Get fragment's own symbol
+    const auto SymIt = FileSymRefs.find(Address);
+    assert(SymIt != FileSymRefs.end());
+
+    // Find containing FILE symbol
+    auto FSI =
+        llvm::upper_bound(FileSymbols, SymIt->second.getRawDataRefImpl(),
+                          [](const DataRefImpl &A, const DataRefImpl &B) {
+                            return A.d.b < B.d.b;
+                          });
+    assert(FSI != FileSymbols.begin());
+
+    uint32_t StopSymbolNum = FirstGlobal;
+    if (FSI != FileSymbols.end())
+      StopSymbolNum = FSI->d.b;
+
+    uint64_t ParentAddress{0};
+    // Iterate over local file symbols and check symbol names to match parent.
+    for (SymbolRef Symbol(FSI[-1], InputFile);
+         Symbol.getRawDataRefImpl().d.b != StopSymbolNum; Symbol.moveNext()) {
+      if (cantFail(Symbol.getName()) == ParentName) {
+        ParentAddress = cantFail(Symbol.getAddress());
+        break;
+      }
+    }
+
+    // No local parent is found, use global parent function.
+    if (!ParentAddress) {
+      BinaryData *ParentBD = BC->getBinaryDataByName(ParentName);
+      assert(ParentBD);
+      ParentAddress = ParentBD->getAddress();
+    }
+
+    BinaryFunction *ParentBF = BC->getBinaryFunctionAtAddress(ParentAddress);
+    assert(ParentBF);
+    BC->registerFragment(*BF, *ParentBF);
+  }
 }
 
 void RewriteInstance::createPLTBinaryFunction(uint64_t TargetAddress,
diff --git a/bolt/test/X86/fragment-lite.s b/bolt/test/X86/fragment-lite.s
index 97069bf8096e1c..32d1f5a98b64a3 100644
--- a/bolt/test/X86/fragment-lite.s
+++ b/bolt/test/X86/fragment-lite.s
@@ -3,35 +3,42 @@
 # RUN: split-file %s %t
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/baz.s -o %t.baz.o
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/baz2.s -o %t.baz2.o
 # RUN: link_fdata %s %t.o %t.main.fdata
 # RUN: link_fdata %s %t.baz.o %t.baz.fdata
-# RUN: merge-fdata %t.main.fdata %t.baz.fdata > %t.fdata
-# RUN: %clang %cflags %t.o %t.baz.o -o %t.exe -Wl,-q
+# RUN: link_fdata %s %t.baz2.o %t.baz2.fdata
+# RUN: merge-fdata %t.main.fdata %t.baz.fdata %t.baz2.fdata > %t.fdata
+# RUN: %clang %cflags %t.o %t.baz.o %t.baz2.o -o %t.exe -Wl,-q
 # RUN: llvm-bolt %t.exe -o %t.out --lite=1 --data %t.fdata -v=1 -print-cfg \
 # RUN:   2>&1 | FileCheck %s
 
 # CHECK: BOLT-INFO: processing main.cold.1 as a sibling of non-ignored function
-# CHECK: BOLT-INFO: processing foo.cold.1/1 as a sibling of non-ignored function
-# CHECK: BOLT-INFO: processing bar.cold.1/1 as a sibling of non-ignored function
+# CHECK: BOLT-INFO: processing foo.cold.1/1(*2) as a sibling of non-ignored function
+# CHECK: BOLT-INFO: processing bar.cold.1/1(*2) as a sibling of non-ignored function
 # CHECK: BOLT-INFO: processing baz.cold.1 as a sibling of non-ignored function
-# CHECK: BOLT-INFO: processing baz.cold.1/1 as a sibling of non-ignored function
+# CHECK: BOLT-INFO: processing baz.cold.1/1(*2) as a sibling of non-ignored function
+# CHECK: BOLT-INFO: processing baz.cold.1/2(*2) as a sibling of non-ignored function
 
 # CHECK: Binary Function "main.cold.1" after building cfg
 # CHECK: Parent : main
 
-# CHECK: Binary Function "foo.cold.1/1" after building cfg
+# CHECK: Binary Function "foo.cold.1/1(*2)" after building cfg
 # CHECK: Parent : foo
 
-# CHECK: Binary Function "bar.cold.1/1" after building cfg
-# CHECK: Parent : bar/1
+# CHECK: Binary Function "bar.cold.1/1(*2)" after building cfg
+# CHECK: Parent : bar/1(*2)
 
 # CHECK: Binary Function "baz.cold.1" after building cfg
 # CHECK: Parent : baz{{$}}
 
-# CHECK: Binary Function "baz.cold.1/1" after building cfg
-# CHECK: Parent : baz/1
+# CHECK: Binary Function "baz.cold.1/1(*2)" after building cfg
+# CHECK: Parent : baz/1(*2)
+
+# CHECK: Binary Function "baz.cold.1/2(*2)" after building cfg
+# CHECK: Parent : baz/2(*2)
 
 #--- main.s
+.file "main.s"
   .globl main
   .type main, %function
 main:
@@ -126,6 +133,7 @@ baz.cold.1:
 .size baz.cold.1, .-baz.cold.1
 
 #--- baz.s
+.file "baz.s"
   .local baz
   .type baz, %function
 baz:
@@ -149,3 +157,29 @@ baz.cold.1:
   retq
   .cfi_endproc
 .size baz.cold.1, .-baz.cold.1
+
+#--- baz2.s
+.file "baz2.s"
+  .local baz
+  .type baz, %function
+baz:
+  .cfi_startproc
+# FDATA: 0 [unknown] 0 1 baz/2 0 1 0
+  cmpl	$0x0, %eax
+  je	baz.cold.1
+  retq
+  .cfi_endproc
+.size baz, .-baz
+
+  .section .text.cold
+  .local baz.cold.1
+  .type baz.cold.1, %function
+baz.cold.1:
+  .cfi_startproc
+  pushq	%rbp
+  movq	%rsp, %rbp
+  movl	$0x0, %eax
+  popq	%rbp
+  retq
+  .cfi_endproc
+.size baz.cold.1, .-baz.cold.1

>From 325ae69c00c5e171d963387672942f0e76f475cc Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Mon, 22 Apr 2024 11:41:55 -0700
Subject: [PATCH 2/2] fixup! [BOLT] Use symbol table info in registerFragment

---
 bolt/lib/Rewrite/RewriteInstance.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index dc2105d090b6a9..f7b11684b48a69 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -1448,7 +1448,7 @@ void RewriteInstance::registerFragments() {
       const uint64_t Address = BD->getAddress();
       BinaryFunction *BF = BC->getBinaryFunctionAtAddress(Address);
       if (!BF) {
-        BC->errs() << "BOLT-ERROR: parent function not found for " << Function
+        BC->errs() << "BOLT-ERROR: parent function not found for " << Name
                    << '\n';
         exit(1);
       }
@@ -1478,7 +1478,7 @@ void RewriteInstance::registerFragments() {
     }
   }
 
-  for (auto &[ParentName, BF]: AmbiguousFragments) {
+  for (auto &[ParentName, BF] : AmbiguousFragments) {
     const uint64_t Address = BF->getAddress();
 
     // Get fragment's own symbol



More information about the llvm-commits mailing list