[lld] [lld-macho] Fix compatibility between --icf=safe_thunks and --keep-icf-stabs (PR #116687)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 18 23:09:46 PST 2024
https://github.com/alx32 updated https://github.com/llvm/llvm-project/pull/116687
>From 474d9a2696d2c296e98869fa3cd7bff3e96e5a32 Mon Sep 17 00:00:00 2001
From: Alex B <alexborcan at meta.com>
Date: Sat, 16 Nov 2024 06:28:17 -0800
Subject: [PATCH 1/3] [lld-macho] Fix safe_thunks / --keep-icf-stabs
compatibility
---
lld/MachO/Arch/ARM64.cpp | 11 +++++
lld/MachO/ICF.cpp | 28 +++++++++++
lld/MachO/ICF.h | 6 +++
lld/MachO/SyntheticSections.cpp | 62 +++++++++++++------------
lld/MachO/SyntheticSections.h | 1 +
lld/MachO/Target.h | 6 +++
lld/test/MachO/icf-safe-thunks-dwarf.ll | 53 +++++++++++++++++++++
7 files changed, 138 insertions(+), 29 deletions(-)
diff --git a/lld/MachO/Arch/ARM64.cpp b/lld/MachO/Arch/ARM64.cpp
index 195a8f09f47c1a..882873ae5de0c1 100644
--- a/lld/MachO/Arch/ARM64.cpp
+++ b/lld/MachO/Arch/ARM64.cpp
@@ -44,6 +44,7 @@ struct ARM64 : ARM64Common {
void initICFSafeThunkBody(InputSection *thunk,
InputSection *branchTarget) const override;
+ InputSection *getThunkBranchTarget(InputSection *thunk) const override;
uint32_t getICFSafeThunkSize() const override;
};
@@ -197,6 +198,16 @@ void ARM64::initICFSafeThunkBody(InputSection *thunk,
/*referent=*/branchTarget);
}
+InputSection *ARM64::getThunkBranchTarget(InputSection *thunk) const {
+ assert(thunk->relocs.size() == 1 &&
+ "expected a single reloc on ARM64 ICF thunk");
+ auto &reloc = thunk->relocs[0];
+ assert(reloc.referent.is<InputSection *>() &&
+ "ARM64 thunk reloc is expected to point to an InputSection");
+
+ return reloc.referent.dyn_cast<InputSection *>();
+}
+
uint32_t ARM64::getICFSafeThunkSize() const { return sizeof(icfSafeThunkCode); }
ARM64::ARM64() : ARM64Common(LP64()) {
diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp
index aedaecfdeb2c01..451aaa7caaf0c1 100644
--- a/lld/MachO/ICF.cpp
+++ b/lld/MachO/ICF.cpp
@@ -481,6 +481,34 @@ void macho::markAddrSigSymbols() {
}
}
+// Given a symbol that was folded into a thunk, return the symbol pointing to
+// the actual body of the function. We use this approach rather than storing the
+// needed info in the Defined itself in order to minimize memory usage.
+Defined *macho::getBodyForThunkFoldedSym(Defined *foldedSym) {
+ assert(isa<ConcatInputSection>(foldedSym->originalIsec) &&
+ "thunk-folded ICF symbol expected to be on a ConcatInputSection");
+ // foldedSec is the InputSection that was marked as deleted upon fold
+ ConcatInputSection *foldedSec =
+ cast<ConcatInputSection>(foldedSym->originalIsec);
+
+ // thunkBody is the actual live thunk, containing the code that branches to
+ // the actual body of the function.
+ InputSection *thunkBody = foldedSec->replacement;
+
+ // the actual (merged) body of the function that the thunk jumps to. This will
+ // end up in the final binary.
+ InputSection *functionBody = target->getThunkBranchTarget(thunkBody);
+
+ for (Symbol *sym : functionBody->symbols) {
+ Defined *d = dyn_cast<Defined>(sym);
+ // The symbol needs to be at the start of the InputSection
+ if (d && d->value == 0)
+ return d;
+ }
+
+ llvm_unreachable("could not find body symbol for ICF-generated thunk");
+ return nullptr;
+}
void macho::foldIdenticalSections(bool onlyCfStrings) {
TimeTraceScope timeScope("Fold Identical Code Sections");
// The ICF equivalence-class segregation algorithm relies on pre-computed
diff --git a/lld/MachO/ICF.h b/lld/MachO/ICF.h
index 34ceb1cf284bf3..e382fd6c60956c 100644
--- a/lld/MachO/ICF.h
+++ b/lld/MachO/ICF.h
@@ -15,11 +15,17 @@
namespace lld::macho {
class Symbol;
+class Defined;
void markAddrSigSymbols();
void markSymAsAddrSig(Symbol *s);
void foldIdenticalSections(bool onlyCfStrings);
+// Given a symbol that was folded into a thunk, return the symbol pointing to
+// the actual body of the function. We expose this function to allow getting the
+// main function body for a symbol that was folded via a thunk.
+Defined *getBodyForThunkFoldedSym(Defined *foldedSym);
+
} // namespace lld::macho
#endif
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
index eee87b3a6cb4de..7e0ba8d501c994 100644
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -10,6 +10,7 @@
#include "ConcatOutputSection.h"
#include "Config.h"
#include "ExportTrie.h"
+#include "ICF.h"
#include "InputFiles.h"
#include "MachOStructs.h"
#include "ObjC.h"
@@ -1204,6 +1205,14 @@ void SymtabSection::emitEndFunStab(Defined *defined) {
stabs.emplace_back(std::move(stab));
}
+Defined *SymtabSection::getFuncBodySym(Defined *originalSym) {
+ // If the Defined is not a thunk, we can use it directly
+ if (originalSym->identicalCodeFoldingKind != Symbol::ICFFoldKind::Thunk)
+ return originalSym;
+
+ return macho::getBodyForThunkFoldedSym(originalSym);
+}
+
void SymtabSection::emitStabs() {
if (config->omitDebugInfo)
return;
@@ -1214,9 +1223,14 @@ void SymtabSection::emitStabs() {
stabs.emplace_back(std::move(astStab));
}
- // Cache the file ID for each symbol in an std::pair for faster sorting.
- using SortingPair = std::pair<Defined *, int>;
- std::vector<SortingPair> symbolsNeedingStabs;
+ struct SymbolStabInfo {
+ Defined *originalSym; // Original Defined symbol - this may be an ICF thunk
+ int fileId; // File ID associated with the STABS symbol
+ Defined *mainBodySym; // Symbol that consists of the full function body -
+ // use this for the STABS entry
+ };
+
+ std::vector<SymbolStabInfo> symbolsNeedingStabs;
for (const SymtabEntry &entry :
concat<SymtabEntry>(localSymbols, externalSymbols)) {
Symbol *sym = entry.sym;
@@ -1229,20 +1243,10 @@ void SymtabSection::emitStabs() {
if (defined->isAbsolute())
continue;
- // Never generate a STABS entry for a symbol that has been ICF'ed using a
- // thunk, just as we do for fully ICF'ed functions. Otherwise, we end up
- // generating invalid DWARF as dsymutil will assume the entire function
- // body is at that location, when, in reality, only the thunk is
- // present. This will end up causing overlapping DWARF entries.
- // TODO: Find an implementation that works in combination with
- // `--keep-icf-stabs`.
- if (defined->identicalCodeFoldingKind == Symbol::ICFFoldKind::Thunk)
- continue;
-
// Constant-folded symbols go in the executable's symbol table, but don't
- // get a stabs entry unless --keep-icf-stabs flag is specified
+ // get a stabs entry unless --keep-icf-stabs flag is specified.
if (!config->keepICFStabs &&
- defined->identicalCodeFoldingKind == Symbol::ICFFoldKind::Body)
+ defined->identicalCodeFoldingKind != Symbol::ICFFoldKind::None)
continue;
ObjFile *file = defined->getObjectFile();
@@ -1251,26 +1255,26 @@ void SymtabSection::emitStabs() {
// We use 'originalIsec' to get the file id of the symbol since 'isec()'
// might point to the merged ICF symbol's file
- symbolsNeedingStabs.emplace_back(defined,
- defined->originalIsec->getFile()->id);
+ Defined *funcBodySym = getFuncBodySym(defined);
+ symbolsNeedingStabs.emplace_back(SymbolStabInfo{
+ defined, funcBodySym->originalIsec->getFile()->id, funcBodySym});
}
}
llvm::stable_sort(symbolsNeedingStabs,
- [&](const SortingPair &a, const SortingPair &b) {
- return a.second < b.second;
+ [&](const SymbolStabInfo &a, const SymbolStabInfo &b) {
+ return a.fileId < b.fileId;
});
// Emit STABS symbols so that dsymutil and/or the debugger can map address
// regions in the final binary to the source and object files from which they
// originated.
InputFile *lastFile = nullptr;
- for (SortingPair &pair : symbolsNeedingStabs) {
- Defined *defined = pair.first;
+ for (const SymbolStabInfo &info : symbolsNeedingStabs) {
// We use 'originalIsec' of the symbol since we care about the actual origin
// of the symbol, not the canonical location returned by `isec()`.
- InputSection *isec = defined->originalIsec;
- ObjFile *file = cast<ObjFile>(isec->getFile());
+ InputSection *bodyIsec = info.mainBodySym->originalIsec;
+ ObjFile *file = cast<ObjFile>(bodyIsec->getFile());
if (lastFile == nullptr || lastFile != file) {
if (lastFile != nullptr)
@@ -1282,16 +1286,16 @@ void SymtabSection::emitStabs() {
}
StabsEntry symStab;
- symStab.sect = isec->parent->index;
- symStab.strx = stringTableSection.addString(defined->getName());
- symStab.value = defined->getVA();
+ symStab.sect = bodyIsec->parent->index;
+ symStab.strx = stringTableSection.addString(info.originalSym->getName());
+ symStab.value = info.mainBodySym->getVA();
- if (isCodeSection(isec)) {
+ if (isCodeSection(bodyIsec)) {
symStab.type = N_FUN;
stabs.emplace_back(std::move(symStab));
- emitEndFunStab(defined);
+ emitEndFunStab(info.mainBodySym);
} else {
- symStab.type = defined->isExternal() ? N_GSYM : N_STSYM;
+ symStab.type = info.originalSym->isExternal() ? N_GSYM : N_STSYM;
stabs.emplace_back(std::move(symStab));
}
}
diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h
index a4c7f58481aa14..af99f22788d6e9 100644
--- a/lld/MachO/SyntheticSections.h
+++ b/lld/MachO/SyntheticSections.h
@@ -485,6 +485,7 @@ class SymtabSection : public LinkEditSection {
void emitEndSourceStab();
void emitObjectFileStab(ObjFile *);
void emitEndFunStab(Defined *);
+ Defined *getFuncBodySym(Defined *);
void emitStabs();
protected:
diff --git a/lld/MachO/Target.h b/lld/MachO/Target.h
index eaa0336e70cb6b..b5b80e083a6c34 100644
--- a/lld/MachO/Target.h
+++ b/lld/MachO/Target.h
@@ -80,6 +80,12 @@ class TargetInfo {
llvm_unreachable("target does not support ICF safe thunks");
}
+ // Given a thunk for which `initICFSafeThunkBody` was called, return the
+ // branchTarget it was initialized with.
+ virtual InputSection *getThunkBranchTarget(InputSection *thunk) const {
+ llvm_unreachable("target does not support ICF safe thunks");
+ }
+
virtual uint32_t getICFSafeThunkSize() const {
llvm_unreachable("target does not support ICF safe thunks");
}
diff --git a/lld/test/MachO/icf-safe-thunks-dwarf.ll b/lld/test/MachO/icf-safe-thunks-dwarf.ll
index 1e4422a3313239..9edad60946837e 100644
--- a/lld/test/MachO/icf-safe-thunks-dwarf.ll
+++ b/lld/test/MachO/icf-safe-thunks-dwarf.ll
@@ -20,6 +20,59 @@
; VERIFY-STABS: N_FUN{{.*}}_func_A
; VERIFY-STABS: N_FUN{{.*}}_take_func_addr
+;;;;;;;;;;;;;;;;;;;;;;;;;;;; Check safe_thunks ICF + keeping STABS entries ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Check scenario with where we do safe_thunks ICF and also generate STABS entries
+; RUN: %lld -arch arm64 -lSystem --icf=safe_thunks --keep-icf-stabs -dylib -o %t/a_thunks.dylib %t/a.o
+; RUN: dsymutil -s %t/a_thunks.dylib > %t/a_thunks.txt
+
+
+; RUN: dsymutil --flat --verify-dwarf=none %t/a_thunks.dylib -o %t/a_thunks.dSYM
+; RUN: dsymutil -s %t/a_thunks.dSYM >> %t/a_thunks.txt
+; RUN: llvm-dwarfdump -a %t/a_thunks.dSYM >> %t/a_thunks.txt
+
+; RUN: cat %t/a_thunks.txt | FileCheck %s --check-prefix=VERIFY-THUNK
+
+# VERIFY-THUNK-LABEL: Symbol table for: '{{.*}}/a_thunks.dylib'
+# Capture the 'n_value's for N_FUN entries of _func_A, _func_B, and _func_C
+# VERIFY-THUNK: [[MERGED_FUN_ADDR:[0-9a-f]+]] '_func_A'
+# VERIFY-THUNK: [[MERGED_FUN_ADDR]] '_func_B'
+# VERIFY-THUNK: [[MERGED_FUN_ADDR]] '_func_C'
+
+# Capture the 'n_value's for SECT EXT entries in the first part
+# VERIFY-THUNK: SECT EXT{{.*}} [[SECT_EXT_A_NVALUE:[0-9a-f]+]] '_func_A'
+# VERIFY-THUNK: SECT EXT{{.*}} [[SECT_EXT_B_NVALUE:[0-9a-f]+]] '_func_B'
+# VERIFY-THUNK: SECT EXT{{.*}} [[SECT_EXT_C_NVALUE:[0-9a-f]+]] '_func_C'
+
+# VERIFY-THUNK: ----------------------------------------------------------------------
+# VERIFY-THUNK-LABEL: Symbol table for: '{{.*}}/a_thunks.dSYM'
+
+# Verify that the SECT EXT 'n_value's in the second part match the first part
+# VERIFY-THUNK: SECT EXT{{.*}} [[SECT_EXT_A_NVALUE]] '_func_A'
+# VERIFY-THUNK: SECT EXT{{.*}} [[SECT_EXT_B_NVALUE]] '_func_B'
+# VERIFY-THUNK: SECT EXT{{.*}} [[SECT_EXT_C_NVALUE]] '_func_C'
+
+# Ensure that N_FUN 'n_value's match the DW_TAG_subprogram's DW_AT_low_pc
+# and that the DW_AT_name is at a specific relative position
+
+# VERIFY-THUNK-LABEL: .debug_info contents:
+# VERIFY-THUNK: Compile Unit: length = {{.*}}
+
+# Match the subprogram for func_A
+# VERIFY-THUNK: : DW_TAG_subprogram
+# VERIFY-THUNK-NEXT: {{ +}}DW_AT_low_pc (0x[[MERGED_FUN_ADDR]])
+# VERIFY-THUNK-NEXT-NEXT-NEXT-NEXT-NEXT: {{ +}}DW_AT_name ("func_A")
+
+# Match the subprogram for func_B
+# VERIFY-THUNK: : DW_TAG_subprogram
+# VERIFY-THUNK-NEXT: {{ +}}DW_AT_low_pc (0x[[MERGED_FUN_ADDR]])
+# VERIFY-THUNK-NEXT-NEXT-NEXT-NEXT-NEXT: {{ +}}DW_AT_name ("func_B")
+
+# Match the subprogram for func_C
+# VERIFY-THUNK: : DW_TAG_subprogram
+# VERIFY-THUNK-NEXT: {{ +}}DW_AT_low_pc (0x[[MERGED_FUN_ADDR]])
+# VERIFY-THUNK-NEXT-NEXT-NEXT-NEXT-NEXT: {{ +}}DW_AT_name ("func_C")
+
;--- a.cpp
#define ATTR __attribute__((noinline)) extern "C"
typedef unsigned long long ULL;
>From 5ab60755500acf8542af4eeac74d3ba2bee57fa2 Mon Sep 17 00:00:00 2001
From: Alex B <alexborcan at meta.com>
Date: Mon, 18 Nov 2024 17:08:23 -0800
Subject: [PATCH 2/3] Address feedback nr.1
---
lld/MachO/ICF.cpp | 1 -
lld/MachO/SyntheticSections.cpp | 18 ++++++++++--------
2 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp
index 451aaa7caaf0c1..a20da12b097f2d 100644
--- a/lld/MachO/ICF.cpp
+++ b/lld/MachO/ICF.cpp
@@ -507,7 +507,6 @@ Defined *macho::getBodyForThunkFoldedSym(Defined *foldedSym) {
}
llvm_unreachable("could not find body symbol for ICF-generated thunk");
- return nullptr;
}
void macho::foldIdenticalSections(bool onlyCfStrings) {
TimeTraceScope timeScope("Fold Identical Code Sections");
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
index 7e0ba8d501c994..7c76de1cca00f6 100644
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -1205,8 +1205,11 @@ void SymtabSection::emitEndFunStab(Defined *defined) {
stabs.emplace_back(std::move(stab));
}
+// Given a pointer to a function symbol, return the symbol that points to the
+// actual function body that will go in the final binary. Generally this is the
+// symbol itself, but if the symbol was folded using a thunk, we retrieve the
+// target function body from the thunk.
Defined *SymtabSection::getFuncBodySym(Defined *originalSym) {
- // If the Defined is not a thunk, we can use it directly
if (originalSym->identicalCodeFoldingKind != Symbol::ICFFoldKind::Thunk)
return originalSym;
@@ -1226,8 +1229,6 @@ void SymtabSection::emitStabs() {
struct SymbolStabInfo {
Defined *originalSym; // Original Defined symbol - this may be an ICF thunk
int fileId; // File ID associated with the STABS symbol
- Defined *mainBodySym; // Symbol that consists of the full function body -
- // use this for the STABS entry
};
std::vector<SymbolStabInfo> symbolsNeedingStabs;
@@ -1256,8 +1257,8 @@ void SymtabSection::emitStabs() {
// We use 'originalIsec' to get the file id of the symbol since 'isec()'
// might point to the merged ICF symbol's file
Defined *funcBodySym = getFuncBodySym(defined);
- symbolsNeedingStabs.emplace_back(SymbolStabInfo{
- defined, funcBodySym->originalIsec->getFile()->id, funcBodySym});
+ symbolsNeedingStabs.emplace_back(
+ SymbolStabInfo{defined, funcBodySym->originalIsec->getFile()->id});
}
}
@@ -1273,7 +1274,8 @@ void SymtabSection::emitStabs() {
for (const SymbolStabInfo &info : symbolsNeedingStabs) {
// We use 'originalIsec' of the symbol since we care about the actual origin
// of the symbol, not the canonical location returned by `isec()`.
- InputSection *bodyIsec = info.mainBodySym->originalIsec;
+ Defined *funcBodySym = getFuncBodySym(info.originalSym);
+ InputSection *bodyIsec = funcBodySym->originalIsec;
ObjFile *file = cast<ObjFile>(bodyIsec->getFile());
if (lastFile == nullptr || lastFile != file) {
@@ -1288,12 +1290,12 @@ void SymtabSection::emitStabs() {
StabsEntry symStab;
symStab.sect = bodyIsec->parent->index;
symStab.strx = stringTableSection.addString(info.originalSym->getName());
- symStab.value = info.mainBodySym->getVA();
+ symStab.value = funcBodySym->getVA();
if (isCodeSection(bodyIsec)) {
symStab.type = N_FUN;
stabs.emplace_back(std::move(symStab));
- emitEndFunStab(info.mainBodySym);
+ emitEndFunStab(funcBodySym);
} else {
symStab.type = info.originalSym->isExternal() ? N_GSYM : N_STSYM;
stabs.emplace_back(std::move(symStab));
>From 56d75f47f4ad59597b452c192c275692e7fbee9b Mon Sep 17 00:00:00 2001
From: Alex B <alexborcan at meta.com>
Date: Mon, 18 Nov 2024 23:09:34 -0800
Subject: [PATCH 3/3] Address Feedbak Nr.2
---
lld/MachO/SyntheticSections.cpp | 33 +++++++++++++++------------------
1 file changed, 15 insertions(+), 18 deletions(-)
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
index 7c76de1cca00f6..a897a8889bd1f6 100644
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -1226,12 +1226,9 @@ void SymtabSection::emitStabs() {
stabs.emplace_back(std::move(astStab));
}
- struct SymbolStabInfo {
- Defined *originalSym; // Original Defined symbol - this may be an ICF thunk
- int fileId; // File ID associated with the STABS symbol
- };
-
- std::vector<SymbolStabInfo> symbolsNeedingStabs;
+ // Cache the file ID for each symbol in an std::pair for faster sorting.
+ using SortingPair = std::pair<Defined *, int>;
+ std::vector<SortingPair> symbolsNeedingStabs;
for (const SymtabEntry &entry :
concat<SymtabEntry>(localSymbols, externalSymbols)) {
Symbol *sym = entry.sym;
@@ -1256,27 +1253,27 @@ void SymtabSection::emitStabs() {
// We use 'originalIsec' to get the file id of the symbol since 'isec()'
// might point to the merged ICF symbol's file
- Defined *funcBodySym = getFuncBodySym(defined);
symbolsNeedingStabs.emplace_back(
- SymbolStabInfo{defined, funcBodySym->originalIsec->getFile()->id});
+ defined, getFuncBodySym(defined)->originalIsec->getFile()->id);
}
}
llvm::stable_sort(symbolsNeedingStabs,
- [&](const SymbolStabInfo &a, const SymbolStabInfo &b) {
- return a.fileId < b.fileId;
+ [&](const SortingPair &a, const SortingPair &b) {
+ return a.second < b.second;
});
// Emit STABS symbols so that dsymutil and/or the debugger can map address
// regions in the final binary to the source and object files from which they
// originated.
InputFile *lastFile = nullptr;
- for (const SymbolStabInfo &info : symbolsNeedingStabs) {
+ for (SortingPair &pair : symbolsNeedingStabs) {
+ Defined *defined = pair.first;
// We use 'originalIsec' of the symbol since we care about the actual origin
// of the symbol, not the canonical location returned by `isec()`.
- Defined *funcBodySym = getFuncBodySym(info.originalSym);
- InputSection *bodyIsec = funcBodySym->originalIsec;
- ObjFile *file = cast<ObjFile>(bodyIsec->getFile());
+ Defined *funcBodySym = getFuncBodySym(defined);
+ InputSection *isec = funcBodySym->originalIsec;
+ ObjFile *file = cast<ObjFile>(isec->getFile());
if (lastFile == nullptr || lastFile != file) {
if (lastFile != nullptr)
@@ -1288,16 +1285,16 @@ void SymtabSection::emitStabs() {
}
StabsEntry symStab;
- symStab.sect = bodyIsec->parent->index;
- symStab.strx = stringTableSection.addString(info.originalSym->getName());
+ symStab.sect = isec->parent->index;
+ symStab.strx = stringTableSection.addString(defined->getName());
symStab.value = funcBodySym->getVA();
- if (isCodeSection(bodyIsec)) {
+ if (isCodeSection(isec)) {
symStab.type = N_FUN;
stabs.emplace_back(std::move(symStab));
emitEndFunStab(funcBodySym);
} else {
- symStab.type = info.originalSym->isExternal() ? N_GSYM : N_STSYM;
+ symStab.type = defined->isExternal() ? N_GSYM : N_STSYM;
stabs.emplace_back(std::move(symStab));
}
}
More information about the llvm-commits
mailing list