[lld] [lld-macho] Choose ICF root function deterministically based on symbol names (PR #158157)

Kyungwoo Lee via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 12 11:48:51 PDT 2025


https://github.com/kyulee-com updated https://github.com/llvm/llvm-project/pull/158157

>From d02f446a3d6c0394fe9e58f62a7523e428ea74fd Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Thu, 11 Sep 2025 14:30:29 -0700
Subject: [PATCH 1/2] [lld-macho] Choose ICF root function deterministically
 based on symbol names

Our codebase has global function merging and outlining which enables many ICF instances, but small code changes often pick different root functions which confuses developers. This change improves determinism in ICF root function selection by sorting equivalent functions based on their primary symbol names, preferring external symbols at offset 0 over local symbols, with section names as fallback, ensuring more consistent and predictable folding results across builds.
---
 lld/MachO/ICF.cpp                 | 37 +++++++++++++++++++++++---
 lld/test/MachO/arm64-thunks.s     | 22 ++++++++--------
 lld/test/MachO/icf-name-order.s   | 44 +++++++++++++++++++++++++++++++
 lld/test/MachO/icf-safe-thunks.ll | 16 +++++------
 lld/test/MachO/map-file.s         |  4 +--
 5 files changed, 99 insertions(+), 24 deletions(-)
 create mode 100644 lld/test/MachO/icf-name-order.s

diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp
index 7b31378c3781e..25148c8849e9b 100644
--- a/lld/MachO/ICF.cpp
+++ b/lld/MachO/ICF.cpp
@@ -408,9 +408,40 @@ void ICF::run() {
         // When using safe_thunks, ensure that we first sort by icfEqClass and
         // then by keepUnique (descending). This guarantees that within an
         // equivalence class, the keepUnique inputs are always first.
-        if (config->icfLevel == ICFLevel::safe_thunks)
-          if (a->icfEqClass[0] == b->icfEqClass[0])
-            return a->keepUnique > b->keepUnique;
+        if (a->icfEqClass[0] == b->icfEqClass[0]) {
+          if (config->icfLevel == ICFLevel::safe_thunks) {
+            if (a->keepUnique != b->keepUnique)
+              return a->keepUnique > b->keepUnique;
+          }
+          // Within each equivalence class, sort by primary (user) symbol name
+          // for determinism. Prefer the first non-local (external) symbol at
+          // offset 0.
+          auto getPrimarySymbolName = [](const ConcatInputSection *isec) {
+            const Symbol *firstLocalSymAtZero = nullptr;
+
+            // Single pass through symbols to find the best candidate
+            for (const Symbol *sym : isec->symbols) {
+              if (auto *d = dyn_cast<Defined>(sym)) {
+                if (d->value == 0) {
+                  // If we find an external symbol at offset 0, return it
+                  // immediately
+                  if (d->isExternal())
+                    return d->getName();
+                  // Otherwise, remember the first local symbol at offset 0
+                  if (!firstLocalSymAtZero)
+                    firstLocalSymAtZero = sym;
+                }
+              }
+            }
+            // Return the local symbol at offset 0 if we found one
+            if (firstLocalSymAtZero)
+              return firstLocalSymAtZero->getName();
+
+            // Fallback: use section name
+            return isec->getName();
+          };
+          return getPrimarySymbolName(a) < getPrimarySymbolName(b);
+        }
         return a->icfEqClass[0] < b->icfEqClass[0];
       });
   forEachClass([&](size_t begin, size_t end) {
diff --git a/lld/test/MachO/arm64-thunks.s b/lld/test/MachO/arm64-thunks.s
index cd3842895c47e..79e4350f4ca5e 100644
--- a/lld/test/MachO/arm64-thunks.s
+++ b/lld/test/MachO/arm64-thunks.s
@@ -40,7 +40,7 @@
 # OBJC: _objc_msgSend$bar:
 # OBJC: _objc_msgSend$foo:
 
-# MAP:      0x{{[[:xdigit:]]+}} {{.*}} _fold_func_low_addr
+# MAP:      0x{{[[:xdigit:]]+}} {{.*}} _fold_func_first_addr
 # MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _a
 # MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _b
 # MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _c
@@ -58,12 +58,12 @@
 # MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _b.thunk.0
 # MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _h
 # MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _main
-# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _fold_func_high_addr
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _fold_func_second_addr
 # MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _c.thunk.0
 # MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _d.thunk.1
 # MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _e.thunk.1
 # MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _f.thunk.1
-# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _fold_func_low_addr.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _fold_func_first_addr.thunk.0
 # MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _z
 
 
@@ -243,14 +243,14 @@ _objc_msgSend:
 .subsections_via_symbols
 
 .addrsig
-.addrsig_sym _fold_func_low_addr
-.addrsig_sym _fold_func_high_addr
+.addrsig_sym _fold_func_first_addr
+.addrsig_sym _fold_func_second_addr
 
 .text
 
-.globl _fold_func_low_addr
+.globl _fold_func_first_addr
 .p2align 2
-_fold_func_low_addr:
+_fold_func_first_addr:
   add x0, x0, x0
   add x1, x0, x1
   add x2, x0, x2
@@ -383,16 +383,16 @@ _main:
   bl _f
   bl _g
   bl _h
-  bl _fold_func_low_addr
-  bl _fold_func_high_addr
+  bl _fold_func_first_addr
+  bl _fold_func_second_addr
   bl ___nan
   bl _objc_msgSend$foo
   bl _objc_msgSend$bar
   ret
 
-.globl _fold_func_high_addr
+.globl _fold_func_second_addr
 .p2align 2
-_fold_func_high_addr:
+_fold_func_second_addr:
   add x0, x0, x0
   add x1, x0, x1
   add x2, x0, x2
diff --git a/lld/test/MachO/icf-name-order.s b/lld/test/MachO/icf-name-order.s
new file mode 100644
index 0000000000000..856f944b217f0
--- /dev/null
+++ b/lld/test/MachO/icf-name-order.s
@@ -0,0 +1,44 @@
+# REQUIRES: aarch64
+# RUN: rm -rf %t; split-file %s %t
+
+# This test verifies ICF deterministically chooses root functions based on symbol names
+# regardless of input object file order, with all three identical functions being folded together
+# and _a chosen as the root due to lexicographic ordering.
+
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/a.s -o %t/a.o
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/b.s -o %t/b.o
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/c.s -o %t/c.o
+
+# RUN: %lld -dylib -arch arm64 -lSystem -o %t/abc --icf=all -map %t/abc.txt %t/a.o %t/b.o %t/c.o
+# RUN: %lld -dylib -arch arm64 -lSystem -o %t/bac --icf=all -map %t/bac.txt %t/b.o %t/a.o %t/c.o
+# RUN: %lld -dylib -arch arm64 -lSystem -o %t/cba --icf=all -map %t/cba.txt %t/c.o %t/b.o %t/a.o
+
+# RUN: cat %t/abc.txt | FileCheck %s
+# RUN: cat %t/bac.txt | FileCheck %s
+# RUN: cat %t/cba.txt | FileCheck %s
+
+# CHECK: Symbols:
+# CHECK: [[#%X,ADDR:]] 0x00000008  {{.*}} _a
+# CHECK-NEXT: [[#ADDR]] 0x00000000 {{.*}} _b
+# CHECK-NEXT: [[#ADDR]] 0x00000000 {{.*}} _c
+
+#--- a.s
+.section __TEXT,__text,regular,pure_instructions
+  .globl _a
+_a:
+  mov x0, 100
+  ret
+
+#--- b.s
+.section __TEXT,__text,regular,pure_instructions
+  .globl _b
+_b:
+  mov x0, 100
+  ret
+
+#--- c.s
+.section __TEXT,__text,regular,pure_instructions
+  .globl _c
+_c:
+  mov x0, 100
+  ret
diff --git a/lld/test/MachO/icf-safe-thunks.ll b/lld/test/MachO/icf-safe-thunks.ll
index 12f1e81bdf3e8..de15a067e6a5c 100644
--- a/lld/test/MachO/icf-safe-thunks.ll
+++ b/lld/test/MachO/icf-safe-thunks.ll
@@ -27,8 +27,8 @@
 ; CHECK-ARM64:        _func_call_thunked_1_nomerge:
 ; CHECK-ARM64-NEXT:        stp	x29
 ;
-; CHECK-ARM64:        _func_call_thunked_2_nomerge:
-; CHECK-ARM64-NEXT:   _func_call_thunked_2_merge:
+; CHECK-ARM64:        _func_call_thunked_2_first:
+; CHECK-ARM64-NEXT:   _func_call_thunked_2_second:
 ; CHECK-ARM64-NEXT:        stp	x29
 ;
 ; CHECK-ARM64:        _call_all_funcs:
@@ -53,8 +53,8 @@
 ; CHECK-ARM64-MAP-NEXT: 0x00000000 [  2] _func_3identical_v2_canmerge
 ; CHECK-ARM64-MAP-NEXT: 0x00000000 [  2] _func_3identical_v3_canmerge
 ; CHECK-ARM64-MAP-NEXT: 0x00000020 [  2] _func_call_thunked_1_nomerge
-; CHECK-ARM64-MAP-NEXT: 0x00000020 [  2] _func_call_thunked_2_nomerge
-; CHECK-ARM64-MAP-NEXT: 0x00000000 [  2] _func_call_thunked_2_merge
+; CHECK-ARM64-MAP-NEXT: 0x00000020 [  2] _func_call_thunked_2_first
+; CHECK-ARM64-MAP-NEXT: 0x00000000 [  2] _func_call_thunked_2_second
 ; CHECK-ARM64-MAP-NEXT: 0x00000034 [  2] _call_all_funcs
 ; CHECK-ARM64-MAP-NEXT: 0x00000050 [  2] _take_func_addr
 ; CHECK-ARM64-MAP-NEXT: 0x00000004 [  2] _func_2identical_v2
@@ -93,12 +93,12 @@ ATTR void func_call_thunked_1_nomerge() {
     g_val = 77;
 }
 
-ATTR void func_call_thunked_2_nomerge() {
+ATTR void func_call_thunked_2_first() {
     func_2identical_v2();
     g_val = 77;
 }
 
-ATTR void func_call_thunked_2_merge() {
+ATTR void func_call_thunked_2_second() {
     func_2identical_v2();
     g_val = 77;
 }
@@ -205,14 +205,14 @@ define void @func_call_thunked_1_nomerge() local_unnamed_addr #1 {
 }
 
 ; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp uwtable(sync)
-define void @func_call_thunked_2_nomerge() local_unnamed_addr #1 {
+define void @func_call_thunked_2_first() local_unnamed_addr #1 {
   tail call void @func_2identical_v2()
   store volatile i8 77, ptr @g_val, align 1, !tbaa !4
   ret void
 }
 
 ; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp uwtable(sync)
-define void @func_call_thunked_2_merge() local_unnamed_addr #1 {
+define void @func_call_thunked_2_second() local_unnamed_addr #1 {
   tail call void @func_2identical_v2()
   store volatile i8 77, ptr @g_val, align 1, !tbaa !4
   ret void
diff --git a/lld/test/MachO/map-file.s b/lld/test/MachO/map-file.s
index aa9fff9938eb2..acdfcb04a2245 100644
--- a/lld/test/MachO/map-file.s
+++ b/lld/test/MachO/map-file.s
@@ -110,8 +110,8 @@
 ## folded symbols but not folded cstrings; we print both.
 
 # ICF:     Symbols:
-# ICF-DAG: 0x[[#%X,FOO:]]     0x00000000  [  3] __ZTIN3foo3bar4MethE
-# ICF-DAG: 0x[[#FOO]]         0x00000001  [  2] _bar
+# ICF-DAG: 0x[[#%X,FOO:]]     0x00000001  [  3] __ZTIN3foo3bar4MethE
+# ICF-DAG: 0x[[#FOO]]         0x00000000  [  2] _bar
 # ICF-DAG: 0x[[#%X,HIWORLD:]] 0x0000000E  [  4]  literal string: Hello world!\n
 # ICF-DAG: 0x[[#%X,HIWORLD]]  0x00000000  [  4]  literal string: Hello world!\n
 

>From 176868953b0f55005343083daf844ef27aa24bd1 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Fri, 12 Sep 2025 11:48:23 -0700
Subject: [PATCH 2/2] Simplify the logic

---
 lld/MachO/ICF.cpp | 41 +++++++++++++++++------------------------
 1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp
index 25148c8849e9b..d87fb42dcf19a 100644
--- a/lld/MachO/ICF.cpp
+++ b/lld/MachO/ICF.cpp
@@ -413,37 +413,30 @@ void ICF::run() {
             if (a->keepUnique != b->keepUnique)
               return a->keepUnique > b->keepUnique;
           }
-          // Within each equivalence class, sort by primary (user) symbol name
-          // for determinism. Prefer the first non-local (external) symbol at
-          // offset 0.
-          auto getPrimarySymbolName = [](const ConcatInputSection *isec) {
-            const Symbol *firstLocalSymAtZero = nullptr;
-
-            // Single pass through symbols to find the best candidate
+          // Within each equivalence class, sort by symbol name for determinism.
+          // If a section has an external symbol at offset 0, use that name.
+          // If either section doesn't have such a symbol, retain the original
+          // order.
+          auto getExternalSymbolName =
+              [](const ConcatInputSection *isec) -> std::optional<StringRef> {
             for (const Symbol *sym : isec->symbols) {
-              if (auto *d = dyn_cast<Defined>(sym)) {
-                if (d->value == 0) {
-                  // If we find an external symbol at offset 0, return it
-                  // immediately
-                  if (d->isExternal())
-                    return d->getName();
-                  // Otherwise, remember the first local symbol at offset 0
-                  if (!firstLocalSymAtZero)
-                    firstLocalSymAtZero = sym;
-                }
+              if (const auto *d = dyn_cast<Defined>(sym)) {
+                if (d->value == 0 && d->isExternal())
+                  return d->getName();
               }
             }
-            // Return the local symbol at offset 0 if we found one
-            if (firstLocalSymAtZero)
-              return firstLocalSymAtZero->getName();
-
-            // Fallback: use section name
-            return isec->getName();
+            return std::nullopt;
           };
-          return getPrimarySymbolName(a) < getPrimarySymbolName(b);
+          auto nameA = getExternalSymbolName(a);
+          auto nameB = getExternalSymbolName(b);
+          // For determinism, define a strict ordering only when both sections
+          // have a primary symbol. Otherwise, they are considered equivalent
+          // (the expression returns false), preserving the stable sort order.
+          return nameA && nameB && *nameA < *nameB;
         }
         return a->icfEqClass[0] < b->icfEqClass[0];
       });
+
   forEachClass([&](size_t begin, size_t end) {
     segregate(begin, end, &ICF::equalsConstant);
   });



More information about the llvm-commits mailing list