[lld] [lld] Support order cstrings with -order_file_cstring (PR #140307)
via llvm-commits
llvm-commits at lists.llvm.org
Fri May 16 15:02:59 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-lld-macho
@llvm/pr-subscribers-lld
Author: None (SharonXSharon)
<details>
<summary>Changes</summary>
Add a new -order_file_cstring flag, which takes an order file for ordering cstrings, similar to the existing -order_file.
Because cstring literals are not symbolized, they cannot be referred to by symbol name in the order file the way entries in the existing -order_file are, so each entry in the cstring order file is expected to be `<hash of cstring literal content>`. Given that cstrings are deduplicated by default, the hash should be able to identify each cstring. The order file can also contain comments starting with #, the same as the existing -order_file.
The ordering of cstrings has to happen during or before the finalization of the cstring section content in the `finalizeContents()` function, which happens before the writer is run.
---
Full diff: https://github.com/llvm/llvm-project/pull/140307.diff
7 Files Affected:
- (modified) lld/MachO/Config.h (+1)
- (modified) lld/MachO/Driver.cpp (+12-9)
- (modified) lld/MachO/Options.td (+4)
- (modified) lld/MachO/SectionPriorities.cpp (+71)
- (modified) lld/MachO/SectionPriorities.h (+20)
- (modified) lld/MachO/SyntheticSections.cpp (+29-29)
- (added) lld/test/MachO/ordre-file-cstring.s (+222)
``````````diff
diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h
index a01e60efbe761..d0217b38c3007 100644
--- a/lld/MachO/Config.h
+++ b/lld/MachO/Config.h
@@ -225,6 +225,7 @@ struct Configuration {
bool callGraphProfileSort = false;
llvm::StringRef printSymbolOrder;
+ llvm::StringRef cStringOrderFilePath;
llvm::StringRef irpgoProfilePath;
bool bpStartupFunctionSort = false;
bool bpCompressionSortStartupFunctions = false;
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 5c32055166da6..0f2957295d136 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -337,15 +337,15 @@ static InputFile *addFile(StringRef path, LoadType loadType,
for (const object::Archive::Child &c : file->getArchive().children(e)) {
StringRef reason;
switch (loadType) {
- case LoadType::LCLinkerOption:
- reason = "LC_LINKER_OPTION";
- break;
- case LoadType::CommandLineForce:
- reason = "-force_load";
- break;
- case LoadType::CommandLine:
- reason = "-all_load";
- break;
+ case LoadType::LCLinkerOption:
+ reason = "LC_LINKER_OPTION";
+ break;
+ case LoadType::CommandLineForce:
+ reason = "-force_load";
+ break;
+ case LoadType::CommandLine:
+ reason = "-all_load";
+ break;
}
if (Error e = file->fetch(c, reason)) {
if (config->warnThinArchiveMissingMembers)
@@ -2178,6 +2178,9 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
StringRef orderFile = args.getLastArgValue(OPT_order_file);
if (!orderFile.empty())
priorityBuilder.parseOrderFile(orderFile);
+ config->cStringOrderFilePath = args.getLastArgValue(OPT_order_file_cstring);
+ if (!config->cStringOrderFilePath.empty())
+ priorityBuilder.parseOrderFileCString(config->cStringOrderFilePath);
referenceStubBinder();
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index 4f0602f59812b..34faa75103224 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -400,6 +400,10 @@ def order_file : Separate<["-"], "order_file">,
MetaVarName<"<file>">,
HelpText<"Layout functions and data according to specification in <file>">,
Group<grp_opts>;
+def order_file_cstring : Separate<["-"], "order_file_cstring">,
+ MetaVarName<"<file>">,
+ HelpText<"Layout cstrings according to specification in <file>">,
+ Group<grp_opts>;
def no_order_inits : Flag<["-"], "no_order_inits">,
HelpText<"Disable default reordering of initializer and terminator functions">,
Flags<[HelpHidden]>,
diff --git a/lld/MachO/SectionPriorities.cpp b/lld/MachO/SectionPriorities.cpp
index 7a4a5d8465f64..e7372050b7601 100644
--- a/lld/MachO/SectionPriorities.cpp
+++ b/lld/MachO/SectionPriorities.cpp
@@ -388,3 +388,74 @@ macho::PriorityBuilder::buildInputSectionPriorities() {
return sectionPriorities;
}
+
+void macho::PriorityBuilder::parseOrderFileCString(StringRef path) { // Fills cStringPriorities from the cstring order file at `path`, one hash per line.
+ std::optional<MemoryBufferRef> buffer = readFile(path);
+ if (!buffer) {
+ error("Could not read cstring order file at " + path);
+ return; // Nothing is added to cStringPriorities when the file cannot be read.
+ }
+ MemoryBufferRef mbref = *buffer;
+ int priority = std::numeric_limits<int>::min(); // Pre-incremented per new hash below, so earlier lines get smaller values.
+ for (StringRef line : args::getLines(mbref)) {
+ if (line.empty())
+ continue;
+ uint32_t hash = 0;
+ if (!to_integer(line, hash))
+ continue; // Lines that do not parse as an integer are skipped without a diagnostic.
+ auto it = cStringPriorities.find(hash);
+ if (it == cStringPriorities.end())
+ cStringPriorities[hash] = ++priority;
+ else
+ assert(it->second <= priority); // Repeated hash: the priority from its first occurrence is kept.
+ }
+}
+
+std::vector<StringPiecePair> macho::PriorityBuilder::buildCStringPriorities( // Returns the live string pieces of `inputs` in the order they should be laid out.
+ ArrayRef<CStringInputSection *> inputs) {
+ std::vector<StringPiecePair> orderedStringPieces;
+ if (config->cStringOrderFilePath.empty()) { // No cstring order file: keep live pieces in input order.
+ for (CStringInputSection *isec : inputs) {
+ for (const auto &[stringPieceIdx, piece] :
+ llvm::enumerate(isec->pieces)) {
+ if (!piece.live)
+ continue; // Pieces that are not live are left out of the result.
+ orderedStringPieces.emplace_back(isec, stringPieceIdx);
+ }
+ }
+ return orderedStringPieces;
+ }
+
+ // Split the input strings into hot and cold sets.
+ // Order hot set based on -order_file_cstring for performance improvement;
+ // TODO: Order cold set of cstrings for compression via BP.
+ std::vector<std::pair<int, StringPiecePair>>
+ hotStringPrioritiesAndStringPieces;
+ std::vector<StringPiecePair> coldStringPieces;
+
+ for (CStringInputSection *isec : inputs) {
+ for (const auto &[stringPieceIdx, piece] : llvm::enumerate(isec->pieces)) {
+ if (!piece.live)
+ continue;
+
+ auto it = cStringPriorities.find(piece.hash); // A piece is hot when its hash has an entry in cStringPriorities.
+ if (it != cStringPriorities.end())
+ hotStringPrioritiesAndStringPieces.emplace_back(
+ it->second, std::make_pair(isec, stringPieceIdx));
+ else
+ coldStringPieces.emplace_back(isec, stringPieceIdx);
+ }
+ }
+
+ // Order hot set for perf
+ llvm::stable_sort(hotStringPrioritiesAndStringPieces); // NOTE(review): no comparator is passed, so equal priorities presumably fall back to comparing (isec pointer, index) - confirm this is intended.
+ for (auto &[priority, stringPiecePair] : hotStringPrioritiesAndStringPieces)
+ orderedStringPieces.push_back(stringPiecePair);
+
+ // TODO: Order cold set for compression
+
+ orderedStringPieces.insert(orderedStringPieces.end(), // Cold pieces follow the hot ones, in the order they were collected.
+ coldStringPieces.begin(), coldStringPieces.end());
+
+ return orderedStringPieces;
+}
diff --git a/lld/MachO/SectionPriorities.h b/lld/MachO/SectionPriorities.h
index 44fb101990c51..5593494d8a274 100644
--- a/lld/MachO/SectionPriorities.h
+++ b/lld/MachO/SectionPriorities.h
@@ -16,6 +16,7 @@
namespace lld::macho {
using SectionPair = std::pair<const InputSection *, const InputSection *>;
+using StringPiecePair = std::pair<CStringInputSection *, size_t>;
class PriorityBuilder {
public:
@@ -55,6 +56,23 @@ class PriorityBuilder {
// contains.
llvm::DenseMap<const InputSection *, int> buildInputSectionPriorities();
+ // Reads the cstring order file at `path` into cStringPriorities.
+ // A cstring order file has one entry per line, in the following format:
+ //
+ // <hash of cstring literal content>
+ //
+ // Cstring literals are not symbolized, so we can't identify them by name.
+ // However, cstrings are deduplicated, hence unique, so we use the hash of
+ // the content of cstring literals to identify them and assign priorities.
+ // We use the same hash as used in StringPiece, i.e. 31 bit:
+ // xxh3_64bits(string) & 0x7fffffff
+ //
+ // Additionally, given they are deduplicated and unique, we don't need to know
+ // which object file they are from.
+ void parseOrderFileCString(StringRef path);
+ std::vector<StringPiecePair>
+ buildCStringPriorities(ArrayRef<CStringInputSection *>);
+
private:
// The symbol with the smallest priority should be ordered first in the output
// section (modulo input section contiguity constraints).
@@ -68,6 +86,8 @@ class PriorityBuilder {
std::optional<int> getSymbolPriority(const Defined *sym);
llvm::DenseMap<llvm::StringRef, SymbolPriorityEntry> priorities;
+ /// A map from cstring literal hashes to priorities
+ llvm::DenseMap<uint32_t, int> cStringPriorities;
llvm::MapVector<SectionPair, uint64_t> callGraphProfile;
};
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
index dfacaf2ef4e0d..beddcfa2174e0 100644
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -15,6 +15,7 @@
#include "MachOStructs.h"
#include "ObjC.h"
#include "OutputSegment.h"
+#include "SectionPriorities.h"
#include "SymbolTable.h"
#include "Symbols.h"
@@ -1766,26 +1767,25 @@ void DeduplicatedCStringSection::finalizeContents() {
}
}
- // Assign an offset for each string and save it to the corresponding
+ // Sort the strings for performance and compression size win, and then
+ // assign an offset for each string and save it to the corresponding
// StringPieces for easy access.
- for (CStringInputSection *isec : inputs) {
- for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) {
- if (!piece.live)
- continue;
- auto s = isec->getCachedHashStringRef(i);
- auto it = stringOffsetMap.find(s);
- assert(it != stringOffsetMap.end());
- StringOffset &offsetInfo = it->second;
- if (offsetInfo.outSecOff == UINT64_MAX) {
- offsetInfo.outSecOff =
- alignToPowerOf2(size, 1ULL << offsetInfo.trailingZeros);
- size =
- offsetInfo.outSecOff + s.size() + 1; // account for null terminator
- }
- piece.outSecOff = offsetInfo.outSecOff;
+ for (auto &[isec, i] : priorityBuilder.buildCStringPriorities(inputs)) {
+ auto &piece = isec->pieces[i];
+ auto s = isec->getCachedHashStringRef(i);
+ auto it = stringOffsetMap.find(s);
+ assert(it != stringOffsetMap.end());
+ lld::macho::DeduplicatedCStringSection::StringOffset &offsetInfo =
+ it->second;
+ if (offsetInfo.outSecOff == UINT64_MAX) {
+ offsetInfo.outSecOff =
+ alignToPowerOf2(size, 1ULL << offsetInfo.trailingZeros);
+ size = offsetInfo.outSecOff + s.size() + 1; // account for null terminator
}
- isec->isFinal = true;
+ piece.outSecOff = offsetInfo.outSecOff;
}
+ for (CStringInputSection *isec : inputs)
+ isec->isFinal = true;
}
void DeduplicatedCStringSection::writeTo(uint8_t *buf) const {
@@ -1908,18 +1908,18 @@ ObjCImageInfoSection::parseImageInfo(const InputFile *file) {
static std::string swiftVersionString(uint8_t version) {
switch (version) {
- case 1:
- return "1.0";
- case 2:
- return "1.1";
- case 3:
- return "2.0";
- case 4:
- return "3.0";
- case 5:
- return "4.0";
- default:
- return ("0x" + Twine::utohexstr(version)).str();
+ case 1:
+ return "1.0";
+ case 2:
+ return "1.1";
+ case 3:
+ return "2.0";
+ case 4:
+ return "3.0";
+ case 5:
+ return "4.0";
+ default:
+ return ("0x" + Twine::utohexstr(version)).str();
}
}
diff --git a/lld/test/MachO/ordre-file-cstring.s b/lld/test/MachO/ordre-file-cstring.s
new file mode 100644
index 0000000000000..138f1685467ee
--- /dev/null
+++ b/lld/test/MachO/ordre-file-cstring.s
@@ -0,0 +1,222 @@
+# RUN: rm -rf %t; split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/test.s -o %t/test.o
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/more-cstrings.s -o %t/more-cstrings.o
+
+# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-0 %t/test.o %t/more-cstrings.o
+# RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-0 | FileCheck %s --check-prefix=ORIGIN_SYM
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-0 | FileCheck %s --check-prefix=ORIGIN_SEC
+
+# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-1 %t/test.o %t/more-cstrings.o -order_file_cstring %t/ord-1
+# RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-1 | FileCheck %s --check-prefix=ONE_SYM
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-1 | FileCheck %s --check-prefix=ONE_SEC
+
+
+# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-2 %t/test.o %t/more-cstrings.o -order_file_cstring %t/ord-2
+# RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-2 | FileCheck %s --check-prefix=TWO_SYM
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-2 | FileCheck %s --check-prefix=TWO_SEC
+
+# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-3 %t/test.o %t/more-cstrings.o -order_file_cstring %t/ord-3
+# RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-3 | FileCheck %s --check-prefix=THREE_SYM
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-3 | FileCheck %s --check-prefix=THREE_SEC
+
+# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-4 %t/test.o %t/more-cstrings.o -order_file_cstring %t/ord-4
+# RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-4 | FileCheck %s --check-prefix=FOUR_SYM
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-4 | FileCheck %s --check-prefix=FOUR_SEC
+# RUN: llvm-readobj --string-dump=__cstring %t/test-4 | FileCheck %s --check-prefix=FOUR_SEC_ESCAPE
+
+
+# We expect:
+# 1) Covered cstring symbols are reordered
+# 2) the rest of the cstring symbols remain original relative order within the cstring section
+
+# ORIGIN_SYM: _local_foo1
+# ORIGIN_SYM: _globl_foo2
+# ORIGIN_SYM: _local_foo2
+# ORIGIN_SYM: _bar
+# ORIGIN_SYM: _baz
+# ORIGIN_SYM: _baz_dup
+# ORIGIN_SYM: _bar2
+# ORIGIN_SYM: _globl_foo3
+
+# ORIGIN_SEC: foo1
+# ORIGIN_SEC: foo2
+# ORIGIN_SEC: bar
+# ORIGIN_SEC: baz
+# ORIGIN_SEC: bar2
+# ORIGIN_SEC: foo3
+
+
+# ONE_SYM: _globl_foo2
+# ONE_SYM: _local_foo2
+# ONE_SYM: _bar
+# ONE_SYM: _bar2
+# ONE_SYM: _globl_foo3
+# ONE_SYM: _local_foo1
+# ONE_SYM: _baz
+# ONE_SYM: _baz_dup
+
+# ONE_SEC: foo2
+# ONE_SEC: bar
+# ONE_SEC: bar2
+# ONE_SEC: foo3
+# ONE_SEC: foo1
+# ONE_SEC: baz
+
+
+# TWO_SYM: _globl_foo2
+# TWO_SYM: _local_foo2
+# TWO_SYM: _local_foo1
+# TWO_SYM: _baz
+# TWO_SYM: _baz_dup
+# TWO_SYM: _bar
+# TWO_SYM: _bar2
+# TWO_SYM: _globl_foo3
+
+# TWO_SEC: foo2
+# TWO_SEC: foo1
+# TWO_SEC: baz
+# TWO_SEC: bar
+# TWO_SEC: bar2
+# TWO_SEC: foo3
+
+
+# THREE_SYM: _local_foo1
+# THREE_SYM: _baz
+# THREE_SYM: _baz_dup
+# THREE_SYM: _bar
+# THREE_SYM: _bar2
+# THREE_SYM: _globl_foo2
+# THREE_SYM: _local_foo2
+# THREE_SYM: _globl_foo3
+
+# THREE_SEC: foo1
+# THREE_SEC: baz
+# THREE_SEC: bar
+# THREE_SEC: bar2
+# THREE_SEC: foo2
+# THREE_SEC: foo3
+
+
+# FOUR_SYM: _local_escape_white_space
+# FOUR_SYM: _globl_foo2
+# FOUR_SYM: _local_foo2
+# FOUR_SYM: _local_escape
+# FOUR_SYM: _globl_foo3
+# FOUR_SYM: _bar
+# FOUR_SYM: _local_foo1
+# FOUR_SYM: _baz
+# FOUR_SYM: _baz_dup
+# FOUR_SYM: _bar2
+
+# FOUR_SEC: \t\n
+# FOUR_SEC: foo2
+# FOUR_SEC: @\"NSDictionary\"
+# FOUR_SEC: foo3
+# FOUR_SEC: bar
+# FOUR_SEC: foo1
+# FOUR_SEC: baz
+# FOUR_SEC: bar2
+
+# FOUR_SEC_ESCAPE: ..
+# FOUR_SEC_ESCAPE: foo2
+# FOUR_SEC_ESCAPE: @"NSDictionary"
+# FOUR_SEC_ESCAPE: foo3
+# FOUR_SEC_ESCAPE: bar
+# FOUR_SEC_ESCAPE: foo1
+# FOUR_SEC_ESCAPE: baz
+# FOUR_SEC_ESCAPE: bar2
+
+# original order, but only partially covered
+#--- ord-1
+#foo2
+0x55783A95
+#bar
+0x2032D362
+#bar2
+0x592F855B
+#foo3
+0x501BCC31
+
+# change order, partially covered
+#--- ord-2
+#foo2
+0x55783A95
+#foo1
+0x6326A039
+#baz
+0x336F8925
+#bar
+0x2032D362
+#bar2
+0x592F855B
+
+# change order, partially covered, with mismatches, duplicates
+#--- ord-3
+foo2222
+0x11111111
+#foo1
+0x6326A039
+#baz
+0x336F8925
+#bar
+0x2032D362
+#bar2
+0x592F855B
+#baz
+0x336F8925
+
+# test escape strings
+#--- ord-4
+#\t\n
+0x3DBEA0C9
+#foo2
+0x55783A95
+#@\"NSDictionary\"
+0x47AF4776
+#foo3
+0x501BCC31
+#bar
+0x2032D362
+
+
+#--- test.s
+.text
+.globl _main
+
+_main:
+ ret
+
+.cstring
+.p2align 2
+_local_foo1:
+ .asciz "foo1"
+_local_foo2:
+ .asciz "foo2"
+L_.foo1_dup:
+ .asciz "foo1"
+L_.foo2_dup:
+ .asciz "foo2"
+_local_escape:
+ .asciz "@\"NSDictionary\""
+_local_escape_white_space:
+ .asciz "\t\n"
+
+_bar:
+ .asciz "bar"
+_baz:
+ .asciz "baz"
+_bar2:
+ .asciz "bar2"
+_baz_dup:
+ .asciz "baz"
+
+.subsections_via_symbols
+
+#--- more-cstrings.s
+.globl _globl_foo1, _globl_foo3
+.cstring
+.p2align 4
+_globl_foo3:
+ .asciz "foo3"
+_globl_foo2:
+ .asciz "foo2"
``````````
</details>
https://github.com/llvm/llvm-project/pull/140307
More information about the llvm-commits
mailing list