[lld] ceec610 - [lld-macho] Fix & refactor symbol size calculations
Jez Ng via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 6 12:10:24 PDT 2021
Author: Jez Ng
Date: 2021-04-06T15:10:01-04:00
New Revision: ceec610754b045cdac4ae0d705d8d9651e323fc4
URL: https://github.com/llvm/llvm-project/commit/ceec610754b045cdac4ae0d705d8d9651e323fc4
DIFF: https://github.com/llvm/llvm-project/commit/ceec610754b045cdac4ae0d705d8d9651e323fc4.diff
LOG: [lld-macho] Fix & refactor symbol size calculations
I noticed two problems with the previous implementation:
* N_ALT_ENTRY symbols weren't being handled correctly -- they should
determine the size of the previous symbol, even though they don't
cause a new section to be created
* The last symbol in a section had its size calculated wrongly;
the first subsection's size was used instead of the last one
I decided to take the opportunity to refactor things as well, mainly to
realize my observation
[here](https://reviews.llvm.org/D98837#inline-931511) that we could
avoid doing a binary search to match symbols with subsections. I think
the resulting code is a bit simpler too.
N Min Max Median Avg Stddev
x 20 4.31 4.43 4.37 4.3775 0.034162922
+ 20 4.32 4.43 4.38 4.3755 0.02799906
No difference proven at 95.0% confidence
Reviewed By: #lld-macho, alexshap
Differential Revision: https://reviews.llvm.org/D99972
Added:
Modified:
lld/MachO/InputFiles.cpp
lld/MachO/SymbolTable.cpp
lld/MachO/Symbols.h
lld/test/MachO/stabs.s
lld/test/MachO/symtab.s
Removed:
################################################################################
diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp
index 0445bfffc264..66dcca085cb5 100644
--- a/lld/MachO/InputFiles.cpp
+++ b/lld/MachO/InputFiles.cpp
@@ -198,8 +198,8 @@ void ObjFile::parseSections(ArrayRef<Section> sections) {
static InputSection *findContainingSubsection(SubsectionMap &map,
uint64_t *offset) {
auto it = std::prev(llvm::upper_bound(
- map, *offset, [](uint64_t value, SubsectionEntry subsectionEntry) {
- return value < subsectionEntry.offset;
+ map, *offset, [](uint64_t value, SubsectionEntry subsecEntry) {
+ return value < subsecEntry.offset;
}));
*offset -= it->offset;
return it->isec;
@@ -423,96 +423,79 @@ template <class LP>
void ObjFile::parseSymbols(ArrayRef<typename LP::section> sectionHeaders,
ArrayRef<typename LP::nlist> nList,
const char *strtab, bool subsectionsViaSymbols) {
- using Section = typename LP::section;
using NList = typename LP::nlist;
- // Precompute the boundaries of symbols within a section.
- // If subsectionsViaSymbols is True then the corresponding subsections will be
- // created, otherwise these boundaries are used for the calculation of symbols
- // sizes only.
- for (const NList &sym : nList) {
- if ((sym.n_type & N_TYPE) == N_SECT && !(sym.n_desc & N_ALT_ENTRY) &&
- !subsections[sym.n_sect - 1].empty()) {
- SubsectionMap &subsecMapping = subsections[sym.n_sect - 1];
- subsecMapping.push_back(
- {sym.n_value - sectionHeaders[sym.n_sect - 1].addr,
- subsecMapping.front().isec});
- }
- }
-
- for (SubsectionMap &subsecMap : subsections) {
- if (subsecMap.empty())
- continue;
- llvm::sort(subsecMap,
- [](const SubsectionEntry &lhs, const SubsectionEntry &rhs) {
- return lhs.offset < rhs.offset;
- });
- subsecMap.erase(
- std::unique(subsecMap.begin(), subsecMap.end(),
- [](const SubsectionEntry &lhs, const SubsectionEntry &rhs) {
- return lhs.offset == rhs.offset;
- }),
- subsecMap.end());
- if (!subsectionsViaSymbols)
- continue;
- for (size_t i = 0; i < subsecMap.size(); ++i) {
- uint32_t offset = subsecMap[i].offset;
- InputSection *&isec = subsecMap[i].isec;
- uint32_t end = i + 1 < subsecMap.size() ? subsecMap[i + 1].offset
- : isec->data.size();
- isec = make<InputSection>(*isec);
- isec->data = isec->data.slice(offset, end - offset);
- // TODO: ld64 appears to preserve the original alignment as well as each
- // subsection's offset from the last aligned address. We should consider
- // emulating that behavior.
- isec->align = MinAlign(isec->align, offset);
- }
- }
-
+ // Groups indices of the symbols by the sections that contain them.
+ std::vector<std::vector<uint32_t>> symbolsBySection(subsections.size());
symbols.resize(nList.size());
- for (size_t i = 0, n = nList.size(); i < n; ++i) {
+ for (uint32_t i = 0; i < nList.size(); ++i) {
const NList &sym = nList[i];
StringRef name = strtab + sym.n_strx;
-
- if ((sym.n_type & N_TYPE) != N_SECT) {
+ if ((sym.n_type & N_TYPE) == N_SECT) {
+ SubsectionMap &subsecMap = subsections[sym.n_sect - 1];
+ // parseSections() may have chosen not to parse this section.
+ if (subsecMap.empty())
+ continue;
+ symbolsBySection[sym.n_sect - 1].push_back(i);
+ } else {
symbols[i] = parseNonSectionSymbol(sym, name);
- continue;
}
+ }
- const Section &sec = sectionHeaders[sym.n_sect - 1];
- SubsectionMap &subsecMap = subsections[sym.n_sect - 1];
-
- // parseSections() may have chosen not to parse this section.
+ // Calculate symbol sizes and create subsections by splitting the sections
+ // along symbol boundaries.
+ for (size_t i = 0; i < subsections.size(); ++i) {
+ SubsectionMap &subsecMap = subsections[i];
if (subsecMap.empty())
continue;
- uint64_t offset = sym.n_value - sec.addr;
-
- auto it = llvm::upper_bound(
- subsecMap, offset, [](uint64_t value, SubsectionEntry subsectionEntry) {
- return value < subsectionEntry.offset;
- });
- uint32_t size = it != subsecMap.end()
- ? it->offset - offset
- : subsecMap.front().isec->getSize() - offset;
-
- // If the input file does not use subsections-via-symbols, all symbols can
- // use the same subsection. Otherwise, we must split the sections along
- // symbol boundaries.
- if (!subsectionsViaSymbols) {
- symbols[i] =
- createDefined(sym, name, subsecMap.front().isec, offset, size);
- continue;
- }
+ std::vector<uint32_t> &symbolIndices = symbolsBySection[i];
+ llvm::sort(symbolIndices, [&](uint32_t lhs, uint32_t rhs) {
+ return nList[lhs].n_value < nList[rhs].n_value;
+ });
+ uint64_t sectionAddr = sectionHeaders[i].addr;
+
+ // We populate subsecMap by repeatedly splitting the last (highest address)
+ // subsection.
+ SubsectionEntry subsecEntry = subsecMap.back();
+ for (size_t j = 0; j < symbolIndices.size(); ++j) {
+ uint32_t symIndex = symbolIndices[j];
+ const NList &sym = nList[symIndex];
+ StringRef name = strtab + sym.n_strx;
+ InputSection *isec = subsecEntry.isec;
+
+ uint64_t subsecAddr = sectionAddr + subsecEntry.offset;
+ uint64_t symbolOffset = sym.n_value - subsecAddr;
+ uint64_t symbolSize =
+ j + 1 < symbolIndices.size()
+ ? nList[symbolIndices[j + 1]].n_value - sym.n_value
+ : isec->data.size() - symbolOffset;
+ // There are 3 cases where we do not need to create a new subsection:
+ // 1. If the input file does not use subsections-via-symbols.
+ // 2. Multiple symbols at the same address only induce one subsection.
+ // 3. Alternative entry points do not induce new subsections.
+ if (!subsectionsViaSymbols || symbolOffset == 0 ||
+ sym.n_desc & N_ALT_ENTRY) {
+ symbols[symIndex] =
+ createDefined(sym, name, isec, symbolOffset, symbolSize);
+ continue;
+ }
- InputSection *subsec = (--it)->isec;
- symbols[i] = createDefined(sym, name, subsec, offset - it->offset, size);
- }
+ auto *nextIsec = make<InputSection>(*isec);
+ nextIsec->data = isec->data.slice(symbolOffset);
+ isec->data = isec->data.slice(0, symbolOffset);
- if (!subsectionsViaSymbols)
- for (SubsectionMap &subsecMap : subsections)
- if (!subsecMap.empty())
- subsecMap = {subsecMap.front()};
+ // By construction, the symbol will be at offset zero in the new section.
+ symbols[symIndex] =
+ createDefined(sym, name, nextIsec, /*value=*/0, symbolSize);
+ // TODO: ld64 appears to preserve the original alignment as well as each
+ // subsection's offset from the last aligned address. We should consider
+ // emulating that behavior.
+ nextIsec->align = MinAlign(isec->align, sym.n_value);
+ subsecMap.push_back({sym.n_value - sectionAddr, nextIsec});
+ subsecEntry = subsecMap.back();
+ }
+ }
}
OpaqueFile::OpaqueFile(MemoryBufferRef mb, StringRef segName,
diff --git a/lld/MachO/SymbolTable.cpp b/lld/MachO/SymbolTable.cpp
index 0a565b3c275c..127d513d1dda 100644
--- a/lld/MachO/SymbolTable.cpp
+++ b/lld/MachO/SymbolTable.cpp
@@ -161,7 +161,7 @@ Symbol *SymbolTable::addLazy(StringRef name, ArchiveFile *file,
Defined *SymbolTable::addSynthetic(StringRef name, InputSection *isec,
uint32_t value, bool isPrivateExtern,
bool includeInSymtab) {
- Defined *s = addDefined(name, nullptr, isec, value, /*size*/ 0,
+ Defined *s = addDefined(name, nullptr, isec, value, /*size=*/0,
/*isWeakDef=*/false, isPrivateExtern);
s->includeInSymtab = includeInSymtab;
return s;
diff --git a/lld/MachO/Symbols.h b/lld/MachO/Symbols.h
index 631d84954831..054834206d6f 100644
--- a/lld/MachO/Symbols.h
+++ b/lld/MachO/Symbols.h
@@ -118,7 +118,10 @@ class Defined : public Symbol {
static bool classof(const Symbol *s) { return s->kind() == DefinedKind; }
InputSection *isec;
+ // Contains the offset from the containing subsection. Note that this is
+ //
diff erent from nlist::n_value, which is the absolute address of the symbol.
uint64_t value;
+ // size is only calculated for regular (non-bitcode) symbols.
uint64_t size;
bool overridesWeakDef : 1;
diff --git a/lld/test/MachO/stabs.s b/lld/test/MachO/stabs.s
index 6a72ff246e1d..cf6f3ac17e18 100644
--- a/lld/test/MachO/stabs.s
+++ b/lld/test/MachO/stabs.s
@@ -42,25 +42,41 @@
# CHECK-NEXT: [[#%x, STATIC:]] - 0[[#MORE_DATA_ID + 1]] 0000 STSYM _static_var
# CHECK-NEXT: [[#%x, MAIN:]] - 0[[#TEXT_ID + 1]] 0000 FUN _main
# CHECK-NEXT: 0000000000000006 - 00 0000 FUN
-# CHECK-NEXT: [[#%x, FUN:]] - 0[[#MORE_TEXT_ID + 1]] 0000 FUN _fun
+# CHECK-NEXT: [[#%x, BAR:]] - 0[[#TEXT_ID + 1]] 0000 FUN _bar
+# CHECK-NEXT: 0000000000000000 - 00 0000 FUN
+# CHECK-NEXT: [[#%x, BAR2:]] - 0[[#TEXT_ID + 1]] 0000 FUN _bar2
# CHECK-NEXT: 0000000000000001 - 00 0000 FUN
+# CHECK-NEXT: [[#%x, BAZ:]] - 0[[#TEXT_ID + 1]] 0000 FUN _baz
+# CHECK-NEXT: 0000000000000000 - 00 0000 FUN
+# CHECK-NEXT: [[#%x, BAZ2:]] - 0[[#TEXT_ID + 1]] 0000 FUN _baz
+# CHECK-NEXT: 0000000000000002 - 00 0000 FUN
+# CHECK-NEXT: [[#%x, QUX:]] - 0[[#TEXT_ID + 1]] 0000 FUN _qux
+# CHECK-NEXT: 0000000000000003 - 00 0000 FUN
+# CHECK-NEXT: [[#%x, QUUX:]] - 0[[#TEXT_ID + 1]] 0000 FUN _quux
+# CHECK-NEXT: 0000000000000004 - 00 0000 FUN
# CHECK-NEXT: [[#%x, GLOB:]] - 0[[#DATA_ID + 1]] 0000 GSYM _global_var
# CHECK-NEXT: [[#%x, ZERO:]] - 0[[#COMM_ID + 1]] 0000 GSYM _zero
+# CHECK-NEXT: [[#%x, FUN:]] - 0[[#MORE_TEXT_ID + 1]] 0000 FUN _fun
+# CHECK-NEXT: 0000000000000001 - 00 0000 FUN
# CHECK-NEXT: 0000000000000000 - 01 0000 SO
# CHECK-NEXT: 0000000000000000 - 00 0000 SO /foo.cpp
# CHECK-NEXT: 0000000000000020 - 03 0001 OSO [[FOO_PATH]]
# CHECK-NEXT: [[#%x, FOO:]] - 0[[#TEXT_ID + 1]] 0000 FUN _foo
# CHECK-NEXT: 0000000000000001 - 00 0000 FUN
# CHECK-NEXT: 0000000000000000 - 01 0000 SO
-# CHECK-NEXT: [[#STATIC]] s _static_var
-# CHECK-NEXT: [[#MAIN]] T _main
-# CHECK-NEXT: {{[0-9af]+}} A _abs
-# CHECK-NEXT: [[#FUN]] S _fun
-# CHECK-NEXT: [[#GLOB]] D _global_var
-# CHECK-NEXT: [[#ZERO]] S _zero
-# CHECK-NEXT: [[#FOO]] T _foo
-# CHECK-NEXT: {{[0-9af]+}} T _no_debug
-# CHECK-NEXT: {{0+}} A __mh_execute_header
+# CHECK-DAG: [[#STATIC]] s _static_var
+# CHECK-DAG: [[#MAIN]] T _main
+# CHECK-DAG: {{[0-9af]+}} A _abs
+# CHECK-DAG: [[#BAR]] T _bar
+# CHECK-DAG: [[#BAR2]] T _bar2
+# CHECK-DAG: [[#BAZ]] T _baz
+# CHECK-DAG: [[#BAZ2]] T _baz2
+# CHECK-DAG: [[#FUN]] S _fun
+# CHECK-DAG: [[#GLOB]] D _global_var
+# CHECK-DAG: [[#ZERO]] S _zero
+# CHECK-DAG: [[#FOO]] T _foo
+# CHECK-DAG: {{[0-9af]+}} T _no_debug
+# CHECK-DAG: {{0+}} A __mh_execute_header
# CHECK-EMPTY:
## Check that we don't attempt to emit rebase opcodes for the debug sections
@@ -90,13 +106,30 @@ _static_var:
.zerofill __DATA,__common,_zero,4,2
.text
-.globl _main
+.globl _main, _bar, _bar2, _baz, _baz2, _qux, _quux
+.alt_entry _baz
+.alt_entry _qux
+
+_bar:
+_bar2:
+ .space 1
+
+_baz:
+_baz2:
+ .space 2
+
_main:
Lfunc_begin0:
callq _foo
retq
Lfunc_end0:
+_qux:
+ .space 3
+
+_quux:
+ .space 4
+
.section __DWARF,__debug_str,regular,debug
.asciz "test.cpp" ## string offset=0
.asciz "/tmp" ## string offset=9
diff --git a/lld/test/MachO/symtab.s b/lld/test/MachO/symtab.s
index 35fc961d1b35..708334173632 100644
--- a/lld/test/MachO/symtab.s
+++ b/lld/test/MachO/symtab.s
@@ -36,23 +36,23 @@
# CHECK-NEXT: Value: 0x1{{[0-9a-f]*}}
# CHECK-NEXT: }
# CHECK-NEXT: Symbol {
-# CHECK-NEXT: Name: _external
+# CHECK-NEXT: Name: _external_weak
# CHECK-NEXT: Extern
# CHECK-NEXT: Type: Section (0xE)
-# CHECK-NEXT: Section: __data
+# CHECK-NEXT: Section: __text (0x1)
# CHECK-NEXT: RefType: UndefinedNonLazy (0x0)
-# CHECK-NEXT: Flags [ (0x0)
+# CHECK-NEXT: Flags [ (0x80)
+# CHECK-NEXT: WeakDef (0x80)
# CHECK-NEXT: ]
# CHECK-NEXT: Value: 0x1{{[0-9a-f]*}}
# CHECK-NEXT: }
# CHECK-NEXT: Symbol {
-# CHECK-NEXT: Name: _external_weak
+# CHECK-NEXT: Name: _external
# CHECK-NEXT: Extern
# CHECK-NEXT: Type: Section (0xE)
-# CHECK-NEXT: Section: __text (0x1)
+# CHECK-NEXT: Section: __data
# CHECK-NEXT: RefType: UndefinedNonLazy (0x0)
-# CHECK-NEXT: Flags [ (0x80)
-# CHECK-NEXT: WeakDef (0x80)
+# CHECK-NEXT: Flags [ (0x0)
# CHECK-NEXT: ]
# CHECK-NEXT: Value: 0x1{{[0-9a-f]*}}
# CHECK-NEXT: }
More information about the llvm-commits
mailing list