[lld] [lld-macho] Enhance safe ICF with thunk-based deduplication (PR #106573)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 29 14:24:07 PDT 2024
https://github.com/alx32 updated https://github.com/llvm/llvm-project/pull/106573
From e2f8e5c4ed8be9a453f92227e04f30fbc9d1d3d3 Mon Sep 17 00:00:00 2001
From: Alex B <alexborcan at meta.com>
Date: Thu, 29 Aug 2024 07:23:19 -0700
Subject: [PATCH] [lld-macho] Enhance safe ICF with thunk-based deduplication
Currently, our `safe` ICF mode only merges non-address-significant code, leaving duplicate address-significant functions in the output. This patch introduces `safe_thunks` ICF mode, which keeps a single master copy of each function and replaces address-significant duplicates with thunks that branch to the master copy.
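As a rough sketch of the intended effect (the function names below are hypothetical and only for illustration, not taken from this patch), two identical address-significant functions end up sharing a single body, with the duplicate reduced to a one-instruction thunk:

    _dup_a:            // master copy, keeps the full function body
        mov w0, #0x2
        ret
    _dup_b:            // address-significant duplicate, now a thunk
        b   _dup_a     // single direct branch to the master copy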
---
lld/MachO/Arch/ARM64.cpp | 23 +++
lld/MachO/Config.h | 1 +
lld/MachO/Driver.cpp | 10 +-
lld/MachO/ICF.cpp | 86 ++++++++++-
lld/MachO/Target.h | 10 ++
lld/test/MachO/icf-safe-thunks.ll | 241 ++++++++++++++++++++++++++++++
6 files changed, 367 insertions(+), 4 deletions(-)
create mode 100644 lld/test/MachO/icf-safe-thunks.ll
diff --git a/lld/MachO/Arch/ARM64.cpp b/lld/MachO/Arch/ARM64.cpp
index e192676394c965..30b6c27ff99f89 100644
--- a/lld/MachO/Arch/ARM64.cpp
+++ b/lld/MachO/Arch/ARM64.cpp
@@ -41,6 +41,10 @@ struct ARM64 : ARM64Common {
Symbol *objcMsgSend) const override;
void populateThunk(InputSection *thunk, Symbol *funcSym) override;
void applyOptimizationHints(uint8_t *, const ObjFile &) const override;
+
+ virtual void initICFSafeThunkBody(InputSection *thunk,
+ InputSection *branchTarget) const override;
+ virtual uint32_t getICFSafeThunkSize() const override;
};
} // namespace
@@ -175,6 +179,25 @@ void ARM64::populateThunk(InputSection *thunk, Symbol *funcSym) {
/*offset=*/0, /*addend=*/0,
/*referent=*/funcSym);
}
+// Just a single direct branch to the target function.
+static constexpr uint32_t icfSafeThunkCode[] = {
+ 0x14000000, // b target
+};
+
+void ARM64::initICFSafeThunkBody(InputSection *thunk,
+ InputSection *branchTarget) const {
+ // The base data here will not itself be modified; we'll just be adding a
+ // reloc below, so we can use the constexpr above directly as the data.
+ thunk->data = {reinterpret_cast<const uint8_t *>(icfSafeThunkCode),
+ sizeof(icfSafeThunkCode)};
+
+ thunk->relocs.emplace_back(/*type=*/ARM64_RELOC_BRANCH26,
+ /*pcrel=*/true, /*length=*/2,
+ /*offset=*/0, /*addend=*/0,
+ /*referent=*/branchTarget);
+}
+
+uint32_t ARM64::getICFSafeThunkSize() const { return sizeof(icfSafeThunkCode); }
ARM64::ARM64() : ARM64Common(LP64()) {
cpuType = CPU_TYPE_ARM64;
diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h
index 5beb0662ba7274..4e940693602c95 100644
--- a/lld/MachO/Config.h
+++ b/lld/MachO/Config.h
@@ -68,6 +68,7 @@ enum class ICFLevel {
unknown,
none,
safe,
+ safe_thunks,
all,
};
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 6a1ff96ed65697..7b23fdfea1303d 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -847,8 +847,15 @@ static ICFLevel getICFLevel(const ArgList &args) {
auto icfLevel = StringSwitch<ICFLevel>(icfLevelStr)
.Cases("none", "", ICFLevel::none)
.Case("safe", ICFLevel::safe)
+ .Case("safe_thunks", ICFLevel::safe_thunks)
.Case("all", ICFLevel::all)
.Default(ICFLevel::unknown);
+
+ if (icfLevel == ICFLevel::safe_thunks &&
+ !is_contained({AK_arm64}, config->arch())) {
+ error("--icf=safe_thunks is only supported on arm64 targets");
+ }
+
if (icfLevel == ICFLevel::unknown) {
warn(Twine("unknown --icf=OPTION `") + icfLevelStr +
"', defaulting to `none'");
@@ -2104,7 +2111,8 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
// foldIdenticalLiterals before foldIdenticalSections.
foldIdenticalLiterals();
if (config->icfLevel != ICFLevel::none) {
- if (config->icfLevel == ICFLevel::safe)
+ if (config->icfLevel == ICFLevel::safe ||
+ config->icfLevel == ICFLevel::safe_thunks)
markAddrSigSymbols();
foldIdenticalSections(/*onlyCfStrings=*/false);
} else if (config->dedupStrings) {
diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp
index fc786b571dc64f..cac5e673986829 100644
--- a/lld/MachO/ICF.cpp
+++ b/lld/MachO/ICF.cpp
@@ -45,6 +45,7 @@ class ICF {
const ConcatInputSection *ib);
bool equalsVariable(const ConcatInputSection *ia,
const ConcatInputSection *ib);
+ void applySafeThunksToRange(size_t begin, size_t end);
// ICF needs a copy of the inputs vector because its equivalence-class
// segregation algorithm destroys the proper sequence.
@@ -251,6 +252,63 @@ void ICF::forEachClassRange(size_t begin, size_t end,
}
}
+// Given a range of identical icfInputs, replace the address-significant
+// functions with a thunk that is just a direct branch to the first function
+// in the series. This way we keep only one copy of the function body, while
+// still retaining the address uniqueness of the relevant functions: each of
+// them becomes a direct-branch thunk rather than a full copy of the actual
+// function body.
+void ICF::applySafeThunksToRange(size_t begin, size_t end) {
+ // If we need to create a unique ICF thunk, use the first section as the
+ // section that all thunks will branch to.
+ ConcatInputSection *masterIsec = icfInputs[begin];
+ uint32_t thunkSize = target->getICFSafeThunkSize();
+ static std::mutex thunkInsertionMutex;
+
+ uint32_t keepUniqueCount = masterIsec->keepUnique ? 1 : 0;
+ for (size_t i = begin + 1; i < end; ++i) {
+ ConcatInputSection *isec = icfInputs[i];
+ if (isec->keepUnique)
+ ++keepUniqueCount;
+
+ // We create thunks for the 2nd, 3rd, ... keepUnique sections. The first
+ // keepUnique section is left as-is, since it will not end up sharing an
+ // address with any other keepUnique section.
+ if (keepUniqueCount >= 2 && isec->keepUnique) {
+ // If the function to be folded is no larger than the thunk itself, just
+ // leave it as-is - creating the thunk would be a net loss.
+ if (isec->data.size() <= thunkSize)
+ return;
+
+ // applySafeThunksToRange is called from multiple threads, but
+ // `makeSyntheticInputSection` and `addInputSection` are not thread safe.
+ // So we need to guard them with a mutex.
+ ConcatInputSection *thunk;
+ {
+ std::lock_guard<std::mutex> lock(thunkInsertionMutex);
+ thunk = makeSyntheticInputSection(isec->getSegName(), isec->getName());
+ addInputSection(thunk);
+ }
+
+ target->initICFSafeThunkBody(thunk, masterIsec);
+ thunk->foldIdentical(isec);
+
+ // Since we're folding the target function into a thunk, we need to adjust
+ // the symbols that have now been moved from the target function to the
+ // thunk.
+ // Since the thunk is a single branch, we move all symbols to offset 0 and
+ // make sure that the size of every non-zero-size symbol is equal to the
+ // size of the branch.
+ for (auto *sym : isec->symbols) {
+ if (sym->value != 0)
+ sym->value = 0;
+ if (sym->size != 0)
+ sym->size = thunkSize;
+ }
+ }
+ }
+}
+
// Split icfInputs into shards, then parallelize invocation of FUNC on subranges
// with matching equivalence class
void ICF::forEachClass(llvm::function_ref<void(size_t, size_t)> func) {
@@ -335,9 +393,20 @@ void ICF::run() {
forEachClass([&](size_t begin, size_t end) {
if (end - begin < 2)
return;
+ bool useSafeThunks = config->icfLevel == ICFLevel::safe_thunks;
+
+ // For ICF level safe_thunks, replace keepUnique function bodies with
+ // thunks. For all other ICF levels, directly merge the functions.
+ if (useSafeThunks)
+ applySafeThunksToRange(begin, end);
+
ConcatInputSection *beginIsec = icfInputs[begin];
- for (size_t i = begin + 1; i < end; ++i)
+ for (size_t i = begin + 1; i < end; ++i) {
+ // When using safe_thunks, keepUnique inputs are already handled above.
+ if (useSafeThunks && icfInputs[i]->keepUnique)
+ continue;
beginIsec->foldIdentical(icfInputs[i]);
+ }
});
}
@@ -421,11 +490,22 @@ void macho::foldIdenticalSections(bool onlyCfStrings) {
// can still fold it.
bool hasFoldableFlags = (isSelRefsSection(isec) ||
sectionType(isec->getFlags()) == MachO::S_REGULAR);
+
+ bool isCodeSec = isCodeSection(isec);
+
+ // When keepUnique is true, the section is normally not foldable. The
+ // exception is ICF level safe_thunks, where we still want to fold code
+ // sections; in that case the safe_thunks logic is applied at merge time
+ // based on the 'keepUnique' flag.
+ bool noUniqueRequirement =
+ !isec->keepUnique ||
+ ((config->icfLevel == ICFLevel::safe_thunks) && isCodeSec);
+
// FIXME: consider non-code __text sections as foldable?
bool isFoldable = (!onlyCfStrings || isCfStringSection(isec)) &&
- (isCodeSection(isec) || isFoldableWithAddendsRemoved ||
+ (isCodeSec || isFoldableWithAddendsRemoved ||
isGccExceptTabSection(isec)) &&
- !isec->keepUnique && !isec->hasAltEntry &&
+ noUniqueRequirement && !isec->hasAltEntry &&
!isec->shouldOmitFromOutput() && hasFoldableFlags;
if (isFoldable) {
foldable.push_back(isec);
diff --git a/lld/MachO/Target.h b/lld/MachO/Target.h
index cc47ae4386b477..eaa0336e70cb6b 100644
--- a/lld/MachO/Target.h
+++ b/lld/MachO/Target.h
@@ -74,6 +74,16 @@ class TargetInfo {
uint64_t selrefVA,
Symbol *objcMsgSend) const = 0;
+ // Initialize 'thunk' so that it is a direct branch to 'branchTarget'.
+ virtual void initICFSafeThunkBody(InputSection *thunk,
+ InputSection *branchTarget) const {
+ llvm_unreachable("target does not support ICF safe thunks");
+ }
+
+ virtual uint32_t getICFSafeThunkSize() const {
+ llvm_unreachable("target does not support ICF safe thunks");
+ }
+
// Symbols may be referenced via either the GOT or the stubs section,
// depending on the relocation type. prepareSymbolRelocation() will set up the
// GOT/stubs entries, and resolveSymbolVA() will return the addresses of those
diff --git a/lld/test/MachO/icf-safe-thunks.ll b/lld/test/MachO/icf-safe-thunks.ll
new file mode 100644
index 00000000000000..2a0ca8314036f8
--- /dev/null
+++ b/lld/test/MachO/icf-safe-thunks.ll
@@ -0,0 +1,241 @@
+; REQUIRES: aarch64
+
+; RUN: rm -rf %t; mkdir %t
+; RUN: llc -filetype=obj %s -O3 -o %t/icf-obj-safe-thunks.o -enable-machine-outliner=never -mtriple arm64-apple-macos -addrsig
+; RUN: %lld -arch arm64 -lSystem --icf=safe_thunks -dylib -o %t/icf-safe.dylib %t/icf-obj-safe-thunks.o
+; RUN: llvm-objdump %t/icf-safe.dylib -d --macho | FileCheck %s --check-prefixes=CHECK-ARM64
+
+; CHECK-ARM64: (__TEXT,__text) section
+; CHECK-ARM64-NEXT: _func_unique_1:
+; CHECK-ARM64-NEXT: mov {{.*}}, #0x1
+;
+; CHECK-ARM64: _func_unique_2_canmerge:
+; CHECK-ARM64-NEXT: mov {{.*}}, #0x2
+;
+; CHECK-ARM64: _func_2identical_v1:
+; CHECK-ARM64-NEXT: mov {{.*}}, #0x2
+;
+; CHECK-ARM64: _func_3identical_v1:
+; CHECK-ARM64-NEXT: mov {{.*}}, #0x3
+;
+; CHECK-ARM64: _func_3identical_v1_canmerge:
+; CHECK-ARM64-NEXT: _func_3identical_v2_canmerge:
+; CHECK-ARM64-NEXT: _func_3identical_v3_canmerge:
+; CHECK-ARM64-NEXT: mov {{.*}}, #0x21
+;
+; CHECK-ARM64: _call_all_funcs:
+; CHECK-ARM64-NEXT: stp x29
+;
+; CHECK-ARM64: _take_func_addr:
+; CHECK-ARM64-NEXT: adr
+;
+; CHECK-ARM64: _func_2identical_v2:
+; CHECK-ARM64-NEXT: b _func_unique_2_canmerge
+; CHECK-ARM64-NEXT: _func_3identical_v2:
+; CHECK-ARM64-NEXT: b _func_3identical_v1
+; CHECK-ARM64-NEXT: _func_3identical_v3:
+; CHECK-ARM64-NEXT: b _func_3identical_v1
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "arm64-apple-macosx11.0.0"
+
+@g_val = global i8 0, align 1
+@g_ptr = global ptr null, align 8
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_unique_1() #0 {
+entry:
+ store volatile i8 1, ptr @g_val, align 1, !tbaa !5
+ ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_unique_2_canmerge() local_unnamed_addr #0 {
+entry:
+ store volatile i8 2, ptr @g_val, align 1, !tbaa !5
+ ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_2identical_v1() #0 {
+entry:
+ store volatile i8 2, ptr @g_val, align 1, !tbaa !5
+ ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_2identical_v2() #0 {
+entry:
+ store volatile i8 2, ptr @g_val, align 1, !tbaa !5
+ ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_3identical_v1() #0 {
+entry:
+ store volatile i8 3, ptr @g_val, align 1, !tbaa !5
+ ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_3identical_v2() #0 {
+entry:
+ store volatile i8 3, ptr @g_val, align 1, !tbaa !5
+ ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_3identical_v3() #0 {
+entry:
+ store volatile i8 3, ptr @g_val, align 1, !tbaa !5
+ ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_3identical_v1_canmerge() local_unnamed_addr #0 {
+entry:
+ store volatile i8 33, ptr @g_val, align 1, !tbaa !5
+ ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_3identical_v2_canmerge() local_unnamed_addr #0 {
+entry:
+ store volatile i8 33, ptr @g_val, align 1, !tbaa !5
+ ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_3identical_v3_canmerge() local_unnamed_addr #0 {
+entry:
+ store volatile i8 33, ptr @g_val, align 1, !tbaa !5
+ ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp uwtable(sync)
+define void @call_all_funcs() local_unnamed_addr #1 {
+entry:
+ tail call void @func_unique_1()
+ tail call void @func_unique_2_canmerge()
+ tail call void @func_2identical_v1()
+ tail call void @func_2identical_v2()
+ tail call void @func_3identical_v1()
+ tail call void @func_3identical_v2()
+ tail call void @func_3identical_v3()
+ tail call void @func_3identical_v1_canmerge()
+ tail call void @func_3identical_v2_canmerge()
+ tail call void @func_3identical_v3_canmerge()
+ ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @take_func_addr() local_unnamed_addr #0 {
+entry:
+ store volatile ptr @func_unique_1, ptr @g_ptr, align 8, !tbaa !8
+ store volatile ptr @func_2identical_v1, ptr @g_ptr, align 8, !tbaa !8
+ store volatile ptr @func_2identical_v2, ptr @g_ptr, align 8, !tbaa !8
+ store volatile ptr @func_3identical_v1, ptr @g_ptr, align 8, !tbaa !8
+ store volatile ptr @func_3identical_v2, ptr @g_ptr, align 8, !tbaa !8
+ store volatile ptr @func_3identical_v3, ptr @g_ptr, align 8, !tbaa !8
+ ret void
+}
+
+attributes #0 = { mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,+zcm,+zcz" }
+attributes #1 = { mustprogress nofree noinline norecurse nounwind ssp uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,+zcm,+zcz" }
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+!llvm.ident = !{!4}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"uwtable", i32 1}
+!3 = !{i32 7, !"frame-pointer", i32 1}
+!4 = !{!"clang"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C++ TBAA"}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"any pointer", !6, i64 0}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;; Generate the above LLVM IR with the below script ;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; #!/bin/bash
+; set -ex
+; TOOLCHAIN_BIN="llvm-project/build/Debug/bin"
+;
+; # Create icf-safe-thunks.cpp file
+; cat > icf-safe-thunks.cpp <<EOF
+;
+; #define ATTR __attribute__((noinline)) extern "C"
+; typedef unsigned long long ULL;
+;
+; volatile char g_val = 0;
+; void *volatile g_ptr = 0;
+;
+; ATTR void func_unique_1() {
+; g_val = 1;
+; }
+;
+; ATTR void func_unique_2_canmerge() {
+; g_val = 2;
+; }
+;
+; ATTR void func_2identical_v1() {
+; g_val = 2;
+; }
+;
+; ATTR void func_2identical_v2() {
+; g_val = 2;
+; }
+;
+; ATTR void func_3identical_v1() {
+; g_val = 3;
+; }
+;
+; ATTR void func_3identical_v2() {
+; g_val = 3;
+; }
+;
+; ATTR void func_3identical_v3() {
+; g_val = 3;
+; }
+;
+; ATTR void func_3identical_v1_canmerge() {
+; g_val = 33;
+; }
+;
+; ATTR void func_3identical_v2_canmerge() {
+; g_val = 33;
+; }
+;
+; ATTR void func_3identical_v3_canmerge() {
+; g_val = 33;
+; }
+;
+; ATTR void call_all_funcs() {
+; func_unique_1();
+; func_unique_2_canmerge();
+; func_2identical_v1();
+; func_2identical_v2();
+; func_3identical_v1();
+; func_3identical_v2();
+; func_3identical_v3();
+; func_3identical_v1_canmerge();
+; func_3identical_v2_canmerge();
+; func_3identical_v3_canmerge();
+; }
+;
+; ATTR void take_func_addr() {
+; g_ptr = (void*)func_unique_1;
+; g_ptr = (void*)func_2identical_v1;
+; g_ptr = (void*)func_2identical_v2;
+; g_ptr = (void*)func_3identical_v1;
+; g_ptr = (void*)func_3identical_v2;
+; g_ptr = (void*)func_3identical_v3;
+; }
+; EOF
+;
+; $TOOLCHAIN_BIN/clang -target arm64-apple-macos11.0 -S -emit-llvm \
+; icf-safe-thunks.cpp -O3 -o icf-safe-thunks.ll