[lld] [lld-macho] Enhance safe ICF with thunk-based deduplication (PR #106573)

via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 29 14:24:07 PDT 2024


https://github.com/alx32 updated https://github.com/llvm/llvm-project/pull/106573

>From e2f8e5c4ed8be9a453f92227e04f30fbc9d1d3d3 Mon Sep 17 00:00:00 2001
From: Alex B <alexborcan at meta.com>
Date: Thu, 29 Aug 2024 07:23:19 -0700
Subject: [PATCH] [lld-macho] Enhance safe ICF with thunk-based deduplication

Currently, our `safe` ICF mode only merges non-address-significant code, leaving duplicate address-significant functions in the output. This patch introduces `safe_thunks` ICF mode, which keeps a single master copy of each function and replaces address-significant duplicates with thunks that branch to the master copy.
---
 lld/MachO/Arch/ARM64.cpp          |  23 +++
 lld/MachO/Config.h                |   1 +
 lld/MachO/Driver.cpp              |  10 +-
 lld/MachO/ICF.cpp                 |  86 ++++++++++-
 lld/MachO/Target.h                |  10 ++
 lld/test/MachO/icf-safe-thunks.ll | 241 ++++++++++++++++++++++++++++++
 6 files changed, 367 insertions(+), 4 deletions(-)
 create mode 100644 lld/test/MachO/icf-safe-thunks.ll

diff --git a/lld/MachO/Arch/ARM64.cpp b/lld/MachO/Arch/ARM64.cpp
index e192676394c965..30b6c27ff99f89 100644
--- a/lld/MachO/Arch/ARM64.cpp
+++ b/lld/MachO/Arch/ARM64.cpp
@@ -41,6 +41,10 @@ struct ARM64 : ARM64Common {
                             Symbol *objcMsgSend) const override;
   void populateThunk(InputSection *thunk, Symbol *funcSym) override;
   void applyOptimizationHints(uint8_t *, const ObjFile &) const override;
+
+  void initICFSafeThunkBody(InputSection *thunk,
+                            InputSection *branchTarget) const override;
+  uint32_t getICFSafeThunkSize() const override;
 };
 
 } // namespace
@@ -175,6 +179,25 @@ void ARM64::populateThunk(InputSection *thunk, Symbol *funcSym) {
                              /*offset=*/0, /*addend=*/0,
                              /*referent=*/funcSym);
 }
+// Just a single direct branch (B, not BL, so x30 is preserved) to the target.
+static constexpr uint32_t icfSafeThunkCode[] = {
+    0x14000000, // 00: b    target
+};
+
+void ARM64::initICFSafeThunkBody(InputSection *thunk,
+                                 InputSection *branchTarget) const {
+  // The thunk bytes are never patched in place; only a relocation is attached
+  // below. That lets the section alias the constexpr template directly.
+  const auto *code = reinterpret_cast<const uint8_t *>(icfSafeThunkCode);
+  thunk->data = {code, sizeof(icfSafeThunkCode)};
+
+  thunk->relocs.emplace_back(/*type=*/ARM64_RELOC_BRANCH26,
+                             /*pcrel=*/true, /*length=*/2,
+                             /*offset=*/0, /*addend=*/0,
+                             /*referent=*/branchTarget);
+}
+
+uint32_t ARM64::getICFSafeThunkSize() const { return sizeof(icfSafeThunkCode); }
 
 ARM64::ARM64() : ARM64Common(LP64()) {
   cpuType = CPU_TYPE_ARM64;
diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h
index 5beb0662ba7274..4e940693602c95 100644
--- a/lld/MachO/Config.h
+++ b/lld/MachO/Config.h
@@ -68,6 +68,7 @@ enum class ICFLevel {
   unknown,
   none,
   safe,
+  safe_thunks,
   all,
 };
 
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 6a1ff96ed65697..7b23fdfea1303d 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -847,8 +847,15 @@ static ICFLevel getICFLevel(const ArgList &args) {
   auto icfLevel = StringSwitch<ICFLevel>(icfLevelStr)
                       .Cases("none", "", ICFLevel::none)
                       .Case("safe", ICFLevel::safe)
+                      .Case("safe_thunks", ICFLevel::safe_thunks)
                       .Case("all", ICFLevel::all)
                       .Default(ICFLevel::unknown);
+
+  // Only the arm64 target overrides the ICF safe-thunk hooks of TargetInfo.
+  if (icfLevel == ICFLevel::safe_thunks && config->arch() != AK_arm64) {
+    error("--icf=safe_thunks is only supported on arm64 targets");
+  }
+
   if (icfLevel == ICFLevel::unknown) {
     warn(Twine("unknown --icf=OPTION `") + icfLevelStr +
          "', defaulting to `none'");
@@ -2104,7 +2111,8 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
     // foldIdenticalLiterals before foldIdenticalSections.
     foldIdenticalLiterals();
     if (config->icfLevel != ICFLevel::none) {
-      if (config->icfLevel == ICFLevel::safe)
+      if (config->icfLevel == ICFLevel::safe ||
+          config->icfLevel == ICFLevel::safe_thunks)
         markAddrSigSymbols();
       foldIdenticalSections(/*onlyCfStrings=*/false);
     } else if (config->dedupStrings) {
diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp
index fc786b571dc64f..cac5e673986829 100644
--- a/lld/MachO/ICF.cpp
+++ b/lld/MachO/ICF.cpp
@@ -45,6 +45,7 @@ class ICF {
                       const ConcatInputSection *ib);
   bool equalsVariable(const ConcatInputSection *ia,
                       const ConcatInputSection *ib);
+  void applySafeThunksToRange(size_t begin, size_t end);
 
   // ICF needs a copy of the inputs vector because its equivalence-class
   // segregation algorithm destroys the proper sequence.
@@ -251,6 +252,63 @@ void ICF::forEachClassRange(size_t begin, size_t end,
   }
 }
 
+// Given a range of identical icfInputs, replace address-significant functions
+// with a thunk that is just a direct branch to the first function in the
+// range. This way we keep only one main body of the function, but we still
+// retain address uniqueness of the relevant functions by having each of them
+// be a direct-branch thunk rather than a full copy of the actual function
+// body.
+void ICF::applySafeThunksToRange(size_t begin, size_t end) {
+  // makeSyntheticInputSection and addInputSection are not thread safe, but
+  // this function is called from multiple threads, so guard them.
+  static std::mutex thunkInsertionMutex;
+
+  // All thunks created for this range branch to the range's first section.
+  ConcatInputSection *const branchTarget = icfInputs[begin];
+  const uint32_t thunkSize = target->getICFSafeThunkSize();
+
+  // Number of keepUnique sections seen so far, branchTarget included.
+  uint32_t uniqueSeen = branchTarget->keepUnique ? 1 : 0;
+  for (size_t idx = begin + 1; idx < end; ++idx) {
+    ConcatInputSection *isec = icfInputs[idx];
+    if (!isec->keepUnique)
+      continue;
+
+    // The first keepUnique section is left as is - it will not end up sharing
+    // an address with any other keepUnique section. Thunks are created only
+    // for the 2nd, 3rd, ... keepUnique sections.
+    if (++uniqueSeen < 2)
+      continue;
+
+    // If the section to be folded is not larger than a thunk, creating the
+    // thunk would be a net loss, so stop processing this range altogether.
+    if (isec->data.size() <= thunkSize)
+      return;
+
+    ConcatInputSection *thunk = nullptr;
+    {
+      std::lock_guard<std::mutex> guard(thunkInsertionMutex);
+      thunk = makeSyntheticInputSection(isec->getSegName(), isec->getName());
+      addInputSection(thunk);
+    }
+
+    target->initICFSafeThunkBody(thunk, branchTarget);
+    thunk->foldIdentical(isec);
+
+    // isec was just folded into the thunk, so its symbols have to describe the
+    // thunk rather than the original body. The thunk is a single branch: move
+    // every symbol to offset 0 and give every non-zero-size symbol the size of
+    // that branch.
+    // NOTE(review): foldIdentical() is not shown here; if it hands the symbol
+    // list over to the thunk, this loop must walk thunk->symbols - confirm.
+    for (auto *sym : isec->symbols) {
+      sym->value = 0;
+      if (sym->size != 0)
+        sym->size = thunkSize;
+    }
+  }
+}
+
 // Split icfInputs into shards, then parallelize invocation of FUNC on subranges
 // with matching equivalence class
 void ICF::forEachClass(llvm::function_ref<void(size_t, size_t)> func) {
@@ -335,9 +393,20 @@ void ICF::run() {
   forEachClass([&](size_t begin, size_t end) {
     if (end - begin < 2)
       return;
+    bool useSafeThunks = config->icfLevel == ICFLevel::safe_thunks;
+
+    // For ICF level safe_thunks, replace keepUnique function bodies with
+    // thunks. For all other ICF levels, directly merge the functions.
+    if (useSafeThunks)
+      applySafeThunksToRange(begin, end);
+
     ConcatInputSection *beginIsec = icfInputs[begin];
-    for (size_t i = begin + 1; i < end; ++i)
+    for (size_t i = begin + 1; i < end; ++i) {
+      // When using safe_thunks, keepUnique inputs are already handled above
+      if (useSafeThunks && icfInputs[i]->keepUnique)
+        continue;
       beginIsec->foldIdentical(icfInputs[i]);
+    }
   });
 }
 
@@ -421,11 +490,22 @@ void macho::foldIdenticalSections(bool onlyCfStrings) {
     // can still fold it.
     bool hasFoldableFlags = (isSelRefsSection(isec) ||
                              sectionType(isec->getFlags()) == MachO::S_REGULAR);
+
+    bool isCodeSec = isCodeSection(isec);
+
+    // When keepUnique is true, the section is not foldable. Unless we are at
+    // icf level safe_thunks, in which case we still want to fold code sections.
+    // When using safe_thunks we'll apply the safe_thunks logic at merge time
+    // based on the 'keepUnique' flag.
+    bool noUniqueRequirement =
+        !isec->keepUnique ||
+        ((config->icfLevel == ICFLevel::safe_thunks) && isCodeSec);
+
     // FIXME: consider non-code __text sections as foldable?
     bool isFoldable = (!onlyCfStrings || isCfStringSection(isec)) &&
-                      (isCodeSection(isec) || isFoldableWithAddendsRemoved ||
+                      (isCodeSec || isFoldableWithAddendsRemoved ||
                        isGccExceptTabSection(isec)) &&
-                      !isec->keepUnique && !isec->hasAltEntry &&
+                      noUniqueRequirement && !isec->hasAltEntry &&
                       !isec->shouldOmitFromOutput() && hasFoldableFlags;
     if (isFoldable) {
       foldable.push_back(isec);
diff --git a/lld/MachO/Target.h b/lld/MachO/Target.h
index cc47ae4386b477..eaa0336e70cb6b 100644
--- a/lld/MachO/Target.h
+++ b/lld/MachO/Target.h
@@ -74,6 +74,16 @@ class TargetInfo {
                                     uint64_t selrefVA,
                                     Symbol *objcMsgSend) const = 0;
 
+  // Init 'thunk' so that it is a direct jump to 'branchTarget'.
+  virtual void initICFSafeThunkBody(InputSection *thunk,
+                                    InputSection *branchTarget) const {
+    llvm_unreachable("target does not support ICF safe thunks");
+  }
+  // Size in bytes of the thunk body set up by initICFSafeThunkBody().
+  virtual uint32_t getICFSafeThunkSize() const {
+    llvm_unreachable("target does not support ICF safe thunks");
+  }
+
   // Symbols may be referenced via either the GOT or the stubs section,
   // depending on the relocation type. prepareSymbolRelocation() will set up the
   // GOT/stubs entries, and resolveSymbolVA() will return the addresses of those
diff --git a/lld/test/MachO/icf-safe-thunks.ll b/lld/test/MachO/icf-safe-thunks.ll
new file mode 100644
index 00000000000000..2a0ca8314036f8
--- /dev/null
+++ b/lld/test/MachO/icf-safe-thunks.ll
@@ -0,0 +1,241 @@
+; REQUIRES: aarch64
+
+; RUN: rm -rf %t; mkdir %t
+; RUN: llc -filetype=obj %s -O3 -o %t/icf-obj-safe-thunks.o -enable-machine-outliner=never -mtriple arm64-apple-macos -addrsig
+; RUN: %lld -arch arm64 -lSystem --icf=safe_thunks -dylib -o %t/icf-safe.dylib %t/icf-obj-safe-thunks.o
+; RUN: llvm-objdump %t/icf-safe.dylib -d --macho | FileCheck %s --check-prefixes=CHECK-ARM64
+
+; CHECK-ARM64:        (__TEXT,__text) section
+; CHECK-ARM64-NEXT:   _func_unique_1:
+; CHECK-ARM64-NEXT:        mov {{.*}}, #0x1
+;
+; CHECK-ARM64:        _func_unique_2_canmerge:
+; CHECK-ARM64-NEXT:        mov {{.*}}, #0x2
+;
+; CHECK-ARM64:        _func_2identical_v1:
+; CHECK-ARM64-NEXT:        mov {{.*}}, #0x2
+;
+; CHECK-ARM64:        _func_3identical_v1:
+; CHECK-ARM64-NEXT:        mov {{.*}}, #0x3
+;
+; CHECK-ARM64:        _func_3identical_v1_canmerge:
+; CHECK-ARM64-NEXT:   _func_3identical_v2_canmerge:
+; CHECK-ARM64-NEXT:   _func_3identical_v3_canmerge:
+; CHECK-ARM64-NEXT:        mov {{.*}}, #0x21
+;
+; CHECK-ARM64:        _call_all_funcs:
+; CHECK-ARM64-NEXT:        stp  x29
+;
+; CHECK-ARM64:        _take_func_addr:
+; CHECK-ARM64-NEXT:        adr
+;
+; CHECK-ARM64:        _func_2identical_v2:
+; CHECK-ARM64-NEXT:        b  _func_unique_2_canmerge
+; CHECK-ARM64-NEXT:   _func_3identical_v2:
+; CHECK-ARM64-NEXT:        b  _func_3identical_v1
+; CHECK-ARM64-NEXT:   _func_3identical_v3:
+; CHECK-ARM64-NEXT:        b  _func_3identical_v1
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "arm64-apple-macosx11.0.0"
+
+ at g_val = global i8 0, align 1
+ at g_ptr = global ptr null, align 8
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_unique_1() #0 {
+entry:
+  store volatile i8 1, ptr @g_val, align 1, !tbaa !5
+  ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_unique_2_canmerge() local_unnamed_addr #0 {
+entry:
+  store volatile i8 2, ptr @g_val, align 1, !tbaa !5
+  ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_2identical_v1() #0 {
+entry:
+  store volatile i8 2, ptr @g_val, align 1, !tbaa !5
+  ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_2identical_v2() #0 {
+entry:
+  store volatile i8 2, ptr @g_val, align 1, !tbaa !5
+  ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_3identical_v1() #0 {
+entry:
+  store volatile i8 3, ptr @g_val, align 1, !tbaa !5
+  ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_3identical_v2() #0 {
+entry:
+  store volatile i8 3, ptr @g_val, align 1, !tbaa !5
+  ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_3identical_v3() #0 {
+entry:
+  store volatile i8 3, ptr @g_val, align 1, !tbaa !5
+  ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_3identical_v1_canmerge() local_unnamed_addr #0 {
+entry:
+  store volatile i8 33, ptr @g_val, align 1, !tbaa !5
+  ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_3identical_v2_canmerge() local_unnamed_addr #0 {
+entry:
+  store volatile i8 33, ptr @g_val, align 1, !tbaa !5
+  ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @func_3identical_v3_canmerge() local_unnamed_addr #0 {
+entry:
+  store volatile i8 33, ptr @g_val, align 1, !tbaa !5
+  ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp uwtable(sync)
+define void @call_all_funcs() local_unnamed_addr #1 {
+entry:
+  tail call void @func_unique_1()
+  tail call void @func_unique_2_canmerge()
+  tail call void @func_2identical_v1()
+  tail call void @func_2identical_v2()
+  tail call void @func_3identical_v1()
+  tail call void @func_3identical_v2()
+  tail call void @func_3identical_v3()
+  tail call void @func_3identical_v1_canmerge()
+  tail call void @func_3identical_v2_canmerge()
+  tail call void @func_3identical_v3_canmerge()
+  ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync)
+define void @take_func_addr() local_unnamed_addr #0 {
+entry:
+  store volatile ptr @func_unique_1, ptr @g_ptr, align 8, !tbaa !8
+  store volatile ptr @func_2identical_v1, ptr @g_ptr, align 8, !tbaa !8
+  store volatile ptr @func_2identical_v2, ptr @g_ptr, align 8, !tbaa !8
+  store volatile ptr @func_3identical_v1, ptr @g_ptr, align 8, !tbaa !8
+  store volatile ptr @func_3identical_v2, ptr @g_ptr, align 8, !tbaa !8
+  store volatile ptr @func_3identical_v3, ptr @g_ptr, align 8, !tbaa !8
+  ret void
+}
+
+attributes #0 = { mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,+zcm,+zcz" }
+attributes #1 = { mustprogress nofree noinline norecurse nounwind ssp uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,+zcm,+zcz" }
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+!llvm.ident = !{!4}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"uwtable", i32 1}
+!3 = !{i32 7, !"frame-pointer", i32 1}
+!4 = !{!"clang"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C++ TBAA"}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"any pointer", !6, i64 0}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;; Generate the above LLVM IR with the below script ;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; #!/bin/bash
+; set -ex
+; TOOLCHAIN_BIN="llvm-project/build/Debug/bin"
+;
+; # Create icf-safe-thunks.cpp file
+; cat > icf-safe-thunks.cpp <<EOF
+;
+; #define ATTR __attribute__((noinline)) extern "C"
+; typedef unsigned long long ULL;
+;
+; volatile char g_val = 0;
+; void *volatile g_ptr = 0;
+;
+; ATTR void func_unique_1() {
+;     g_val = 1;
+; }
+;
+; ATTR void func_unique_2_canmerge() {
+;     g_val = 2;
+; }
+;
+; ATTR void func_2identical_v1() {
+;     g_val = 2;
+; }
+;
+; ATTR void func_2identical_v2() {
+;     g_val = 2;
+; }
+;
+; ATTR void func_3identical_v1() {
+;     g_val = 3;
+; }
+;
+; ATTR void func_3identical_v2() {
+;     g_val = 3;
+; }
+;
+; ATTR void func_3identical_v3() {
+;     g_val = 3;
+; }
+;
+; ATTR void func_3identical_v1_canmerge() {
+;     g_val = 33;
+; }
+;
+; ATTR void func_3identical_v2_canmerge() {
+;     g_val = 33;
+; }
+;
+; ATTR void func_3identical_v3_canmerge() {
+;     g_val = 33;
+; }
+;
+; ATTR void call_all_funcs() {
+;     func_unique_1();
+;     func_unique_2_canmerge();
+;     func_2identical_v1();
+;     func_2identical_v2();
+;     func_3identical_v1();
+;     func_3identical_v2();
+;     func_3identical_v3();
+;     func_3identical_v1_canmerge();
+;     func_3identical_v2_canmerge();
+;     func_3identical_v3_canmerge();
+; }
+;
+; ATTR void take_func_addr() {
+;     g_ptr = (void*)func_unique_1;
+;     g_ptr = (void*)func_2identical_v1;
+;     g_ptr = (void*)func_2identical_v2;
+;     g_ptr = (void*)func_3identical_v1;
+;     g_ptr = (void*)func_3identical_v2;
+;     g_ptr = (void*)func_3identical_v3;
+; }
+; EOF
+;
+; $TOOLCHAIN_BIN/clang -target arm64-apple-macos11.0 -S -emit-llvm \
+;                      icf-safe-thunks.cpp -O3 -o icf-safe-thunks.ll



More information about the llvm-commits mailing list