[lld] [lld][MachO] Read cstring order for non deduped sections (PR #161879)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 3 10:04:17 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-lld-macho
Author: Ellis Hoag (ellishg)
<details>
<summary>Changes</summary>
https://github.com/llvm/llvm-project/pull/140307 added support for cstring hashes in the orderfile to layout cstrings in a specific order, but only when `--deduplicate-strings` is used. This PR supports cstring ordering when `--no-deduplicate-strings` is used.
1. Create `cStringPriorities`, separate from `priorities`, to hold only priorities for cstring pieces. This allows us to lookup by hash directly, instead of first converting to a string. It also fixes a contrived bug where we want to order a symbol named `CSTR;12345` rather than a cstring.
2. Rather than calling `buildCStringPriorities()` which always constructs and returns a vector, we use `forEachStringPiece()` to efficiently iterate over cstring pieces without creating a new vector if no cstring is ordered.
3. Create `SymbolPriorityEntry::{get,set}Priority()` helper functions to simplify code.
---
Full diff: https://github.com/llvm/llvm-project/pull/161879.diff
4 Files Affected:
- (modified) lld/MachO/SectionPriorities.cpp (+65-62)
- (modified) lld/MachO/SectionPriorities.h (+19-9)
- (modified) lld/MachO/SyntheticSections.cpp (+30-32)
- (modified) lld/test/MachO/order-file-cstring.s (+18-18)
``````````diff
diff --git a/lld/MachO/SectionPriorities.cpp b/lld/MachO/SectionPriorities.cpp
index cf657aad5d145..183c90f9e5905 100644
--- a/lld/MachO/SectionPriorities.cpp
+++ b/lld/MachO/SectionPriorities.cpp
@@ -27,6 +27,7 @@
#include "llvm/Support/Path.h"
#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/xxhash.h"
#include <numeric>
@@ -246,33 +247,45 @@ DenseMap<const InputSection *, int> CallGraphSort::run() {
return orderMap;
}
-std::optional<int>
-macho::PriorityBuilder::getSymbolOrCStringPriority(const StringRef key,
- InputFile *f) {
+void macho::PriorityBuilder::SymbolPriorityEntry::setPriority(
+ int priority, StringRef objectFile) {
+ if (!objectFile.empty())
+ objectFiles.try_emplace(objectFile, priority);
+ else
+ anyObjectFile = std::min(anyObjectFile, priority);
+}
- auto it = priorities.find(key);
- if (it == priorities.end())
- return std::nullopt;
- const SymbolPriorityEntry &entry = it->second;
+int macho::PriorityBuilder::SymbolPriorityEntry::getPriority(
+ const InputFile *f) const {
if (!f)
- return entry.anyObjectFile;
+ return anyObjectFile;
// We don't use toString(InputFile *) here because it returns the full path
// for object files, and we only want the basename.
- StringRef filename;
- if (f->archiveName.empty())
- filename = path::filename(f->getName());
- else
- filename = saver().save(path::filename(f->archiveName) + "(" +
- path::filename(f->getName()) + ")");
- return std::min(entry.objectFiles.lookup(filename), entry.anyObjectFile);
+ StringRef basename = path::filename(f->getName());
+ StringRef filename =
+ f->archiveName.empty()
+ ? basename
+ : saver().save(path::filename(f->archiveName) + "(" + basename + ")");
+ return std::min(objectFiles.lookup(filename), anyObjectFile);
}
std::optional<int>
-macho::PriorityBuilder::getSymbolPriority(const Defined *sym) {
+macho::PriorityBuilder::getCStringPriority(uint32_t hash,
+ const InputFile *f) const {
+ auto it = cStringPriorities.find(hash);
+ if (it == cStringPriorities.end())
+ return std::nullopt;
+ return it->second.getPriority(f);
+}
+
+std::optional<int>
+macho::PriorityBuilder::getSymbolPriority(const Defined *sym) const {
if (sym->isAbsolute())
return std::nullopt;
- return getSymbolOrCStringPriority(utils::getRootSymbol(sym->getName()),
- sym->isec()->getFile());
+ auto it = priorities.find(utils::getRootSymbol(sym->getName()));
+ if (it == priorities.end())
+ return std::nullopt;
+ return it->second.getPriority(sym->isec()->getFile());
}
void macho::PriorityBuilder::extractCallGraphProfile() {
@@ -307,7 +320,7 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) {
int prio = std::numeric_limits<int>::min();
MemoryBufferRef mbref = *buffer;
for (StringRef line : args::getLines(mbref)) {
- StringRef objectFile, symbolOrCStrHash;
+ StringRef objectFile;
line = line.take_until([](char c) { return c == '#'; }); // ignore comments
line = line.ltrim();
@@ -338,23 +351,14 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) {
}
// The rest of the line is either <symbol name> or
- // CStringEntryPrefix<cstring hash>
+ // cStringEntryPrefix<cstring hash>
line = line.trim();
- if (line.starts_with(CStringEntryPrefix)) {
- StringRef possibleHash = line.drop_front(CStringEntryPrefix.size());
+ if (line.consume_front(cStringEntryPrefix)) {
uint32_t hash = 0;
- if (to_integer(possibleHash, hash))
- symbolOrCStrHash = possibleHash;
+ if (to_integer(line, hash))
+ cStringPriorities[hash].setPriority(prio, objectFile);
} else
- symbolOrCStrHash = utils::getRootSymbol(line);
-
- if (!symbolOrCStrHash.empty()) {
- SymbolPriorityEntry &entry = priorities[symbolOrCStrHash];
- if (!objectFile.empty())
- entry.objectFiles.insert(std::make_pair(objectFile, prio));
- else
- entry.anyObjectFile = std::min(entry.anyObjectFile, prio);
- }
+ priorities[utils::getRootSymbol(line)].setPriority(prio, objectFile);
++prio;
}
@@ -405,40 +409,39 @@ macho::PriorityBuilder::buildInputSectionPriorities() {
return sectionPriorities;
}
-std::vector<StringPiecePair> macho::PriorityBuilder::buildCStringPriorities(
- ArrayRef<CStringInputSection *> inputs) {
- // Split the input strings into hold and cold sets.
- // Order hot set based on -order_file_cstring for performance improvement;
- // TODO: Order cold set of cstrings for compression via BP.
- std::vector<std::pair<int, StringPiecePair>>
- hotStringPrioritiesAndStringPieces;
- std::vector<StringPiecePair> coldStringPieces;
- std::vector<StringPiecePair> orderedStringPieces;
-
+void macho::PriorityBuilder::forEachStringPiece(
+ ArrayRef<CStringInputSection *> inputs,
+ std::function<void(CStringInputSection &, StringPiece &, size_t)> f,
+ bool forceInputOrder, bool computeHash) const {
+ std::vector<std::tuple<int, CStringInputSection *, size_t>> orderedPieces;
+ std::vector<std::pair<CStringInputSection *, size_t>> unorderedPieces;
for (CStringInputSection *isec : inputs) {
for (const auto &[stringPieceIdx, piece] : llvm::enumerate(isec->pieces)) {
if (!piece.live)
continue;
-
- std::optional<int> priority = getSymbolOrCStringPriority(
- std::to_string(piece.hash), isec->getFile());
- if (!priority)
- coldStringPieces.emplace_back(isec, stringPieceIdx);
+ // Process pieces in input order if we have no cstrings in our orderfile
+ if (forceInputOrder || cStringPriorities.empty()) {
+ f(*isec, piece, stringPieceIdx);
+ continue;
+ }
+ uint32_t hash =
+ computeHash
+ ? (xxh3_64bits(isec->getStringRef(stringPieceIdx)) & 0x7fffffff)
+ : piece.hash;
+ if (auto priority = getCStringPriority(hash, isec->getFile()))
+ orderedPieces.emplace_back(*priority, isec, stringPieceIdx);
else
- hotStringPrioritiesAndStringPieces.emplace_back(
- *priority, std::make_pair(isec, stringPieceIdx));
+ unorderedPieces.emplace_back(isec, stringPieceIdx);
}
}
-
- // Order hot set for perf
- llvm::stable_sort(hotStringPrioritiesAndStringPieces);
- for (auto &[priority, stringPiecePair] : hotStringPrioritiesAndStringPieces)
- orderedStringPieces.push_back(stringPiecePair);
-
- // TODO: Order cold set for compression
-
- orderedStringPieces.insert(orderedStringPieces.end(),
- coldStringPieces.begin(), coldStringPieces.end());
-
- return orderedStringPieces;
+ if (orderedPieces.empty() && unorderedPieces.empty())
+ return;
+ llvm::stable_sort(orderedPieces, [](const auto &left, const auto &right) {
+ return std::get<0>(left) < std::get<0>(right);
+ });
+ for (auto &[priority, isec, pieceIdx] : orderedPieces)
+ f(*isec, isec->pieces[pieceIdx], pieceIdx);
+ // TODO: Add option to order the remaining cstrings for compression
+ for (auto &[isec, pieceIdx] : unorderedPieces)
+ f(*isec, isec->pieces[pieceIdx], pieceIdx);
}
diff --git a/lld/MachO/SectionPriorities.h b/lld/MachO/SectionPriorities.h
index cc4e30fffc600..a29615f5f0df0 100644
--- a/lld/MachO/SectionPriorities.h
+++ b/lld/MachO/SectionPriorities.h
@@ -16,7 +16,6 @@
namespace lld::macho {
using SectionPair = std::pair<const InputSection *, const InputSection *>;
-using StringPiecePair = std::pair<CStringInputSection *, size_t>;
class PriorityBuilder {
public:
@@ -29,7 +28,7 @@ class PriorityBuilder {
//
// An order file has one entry per line, in the following format:
//
- // <cpu>:<object file>:[<symbol name> | CStringEntryPrefix <cstring hash>]
+ // <cpu>:<object file>:[<symbol name> | cStringEntryPrefix <cstring hash>]
//
// <cpu> and <object file> are optional.
// If not specified, then that entry tries to match either,
@@ -42,7 +41,7 @@ class PriorityBuilder {
// lowest-ordered entry (the one nearest to the front of the list.)
//
// or 2) any cstring literal with the given hash, if the entry has the
- // CStringEntryPrefix prefix defined below in the file. <cstring hash> is the
+ // cStringEntryPrefix prefix defined below in the file. <cstring hash> is the
// hash of cstring literal content.
//
// Cstring literals are not symbolized, we can't identify them by name
@@ -54,6 +53,16 @@ class PriorityBuilder {
// The file can also have line comments that start with '#'.
void parseOrderFile(StringRef path);
+ /// Call \p f for each string piece in \p inputs. If there are any cstring
+ /// literals in the orderfile (and \p forceInputOrder is false) then string
+ /// pieces are ordered by the orderfile. \p computeHash must be set when
+ /// \p deduplicateLiterals is false because then the string piece hash is not
+ /// set.
+ void forEachStringPiece(
+ ArrayRef<CStringInputSection *> inputs,
+ std::function<void(CStringInputSection &, StringPiece &, size_t)> f,
+ bool forceInputOrder = false, bool computeHash = false) const;
+
// Returns layout priorities for some or all input sections. Sections are laid
// out in decreasing order; that is, a higher priority section will be closer
// to the beginning of its output section.
@@ -66,8 +75,6 @@ class PriorityBuilder {
// Each section gets assigned the priority of the highest-priority symbol it
// contains.
llvm::DenseMap<const InputSection *, int> buildInputSectionPriorities();
- std::vector<StringPiecePair>
- buildCStringPriorities(ArrayRef<CStringInputSection *>);
private:
// The symbol with the smallest priority should be ordered first in the output
@@ -78,13 +85,16 @@ class PriorityBuilder {
int anyObjectFile = 0;
// The priority given to a matching symbol from a particular object file.
llvm::DenseMap<llvm::StringRef, int> objectFiles;
+ void setPriority(int priority, StringRef objectFile);
+ int getPriority(const InputFile *f) const;
};
- const llvm::StringRef CStringEntryPrefix = "CSTR;";
+ const llvm::StringRef cStringEntryPrefix = "CSTR;";
- std::optional<int> getSymbolPriority(const Defined *sym);
- std::optional<int> getSymbolOrCStringPriority(const StringRef key,
- InputFile *f);
+ std::optional<int> getSymbolPriority(const Defined *sym) const;
+ std::optional<int> getCStringPriority(uint32_t hash,
+ const InputFile *f) const;
llvm::DenseMap<llvm::StringRef, SymbolPriorityEntry> priorities;
+ llvm::DenseMap<int32_t, SymbolPriorityEntry> cStringPriorities;
llvm::MapVector<SectionPair, uint64_t> callGraphProfile;
};
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
index 187cccbe90dbc..fecc51f912b08 100644
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -1721,26 +1721,24 @@ void CStringSection::writeTo(uint8_t *buf) const {
// and don't need this alignment. They will be emitted at some arbitrary address
// `A`, but ld64 will treat them as being 16-byte aligned with an offset of
// `16 % A`.
-static Align getStringPieceAlignment(const CStringInputSection *isec,
+static Align getStringPieceAlignment(const CStringInputSection &isec,
const StringPiece &piece) {
- return llvm::Align(1ULL << llvm::countr_zero(isec->align | piece.inSecOff));
+ return llvm::Align(1ULL << llvm::countr_zero(isec.align | piece.inSecOff));
}
void CStringSection::finalizeContents() {
size = 0;
- // TODO: Call buildCStringPriorities() to support cstring ordering when
- // deduplication is off, although this may negatively impact build
- // performance.
- for (CStringInputSection *isec : inputs) {
- for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) {
- if (!piece.live)
- continue;
- piece.outSecOff = alignTo(size, getStringPieceAlignment(isec, piece));
- StringRef string = isec->getStringRef(i);
- size = piece.outSecOff + string.size() + 1; // account for null terminator
- }
+ priorityBuilder.forEachStringPiece(
+ inputs,
+ [&](CStringInputSection &isec, StringPiece &piece, size_t pieceIdx) {
+ piece.outSecOff = alignTo(size, getStringPieceAlignment(isec, piece));
+ StringRef string = isec.getStringRef(pieceIdx);
+ size =
+ piece.outSecOff + string.size() + 1; // account for null terminator
+ },
+ /*forceInputOrder=*/false, /*computeHash=*/true);
+ for (CStringInputSection *isec : inputs)
isec->isFinal = true;
- }
}
void DeduplicatedCStringSection::finalizeContents() {
@@ -1748,20 +1746,19 @@ void DeduplicatedCStringSection::finalizeContents() {
DenseMap<CachedHashStringRef, Align> strToAlignment;
// Used for tail merging only
std::vector<CachedHashStringRef> deduplicatedStrs;
- for (const CStringInputSection *isec : inputs) {
- for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) {
- if (!piece.live)
- continue;
- auto s = isec->getCachedHashStringRef(i);
- assert(isec->align != 0);
- auto align = getStringPieceAlignment(isec, piece);
- auto [it, wasInserted] = strToAlignment.try_emplace(s, align);
- if (config->tailMergeStrings && wasInserted)
- deduplicatedStrs.push_back(s);
- if (!wasInserted && it->second < align)
- it->second = align;
- }
- }
+ priorityBuilder.forEachStringPiece(
+ inputs,
+ [&](CStringInputSection &isec, StringPiece &piece, size_t pieceIdx) {
+ auto s = isec.getCachedHashStringRef(pieceIdx);
+ assert(isec.align != 0);
+ auto align = getStringPieceAlignment(isec, piece);
+ auto [it, wasInserted] = strToAlignment.try_emplace(s, align);
+ if (config->tailMergeStrings && wasInserted)
+ deduplicatedStrs.push_back(s);
+ if (!wasInserted && it->second < align)
+ it->second = align;
+ },
+ /*forceInputOrder=*/true);
// Like lexigraphical sort, except we read strings in reverse and take the
// longest string first
@@ -1801,9 +1798,10 @@ void DeduplicatedCStringSection::finalizeContents() {
// Sort the strings for performance and compression size win, and then
// assign an offset for each string and save it to the corresponding
// StringPieces for easy access.
- for (auto &[isec, i] : priorityBuilder.buildCStringPriorities(inputs)) {
- auto &piece = isec->pieces[i];
- auto s = isec->getCachedHashStringRef(i);
+ priorityBuilder.forEachStringPiece(inputs, [&](CStringInputSection &isec,
+ StringPiece &piece,
+ size_t pieceIdx) {
+ auto s = isec.getCachedHashStringRef(pieceIdx);
// Any string can be tail merged with itself with an offset of zero
uint64_t tailMergeOffset = 0;
auto mergeIt =
@@ -1829,7 +1827,7 @@ void DeduplicatedCStringSection::finalizeContents() {
stringOffsetMap[tailMergedString] = piece.outSecOff;
assert(isAligned(strToAlignment.at(tailMergedString), piece.outSecOff));
}
- }
+ });
for (CStringInputSection *isec : inputs)
isec->isFinal = true;
}
diff --git a/lld/test/MachO/order-file-cstring.s b/lld/test/MachO/order-file-cstring.s
index 3c6d2a377dc38..2b471bbdabd49 100644
--- a/lld/test/MachO/order-file-cstring.s
+++ b/lld/test/MachO/order-file-cstring.s
@@ -4,32 +4,34 @@
# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/test.s -o %t/test.o
# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/more-cstrings.s -o %t/more-cstrings.o
-# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-0 %t/test.o %t/more-cstrings.o
+# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-0 %t/test.o %t/more-cstrings.o
# RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-0 | FileCheck %s --check-prefix=ORIGIN_SYM
# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-0 | FileCheck %s --check-prefix=ORIGIN_SEC
-# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-1 %t/test.o %t/more-cstrings.o -order_file %t/ord-1
+# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-1 %t/test.o %t/more-cstrings.o -order_file %t/ord-1
# RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-1 | FileCheck %s --check-prefix=ONE_SYM
# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-1 | FileCheck %s --check-prefix=ONE_SEC
+# RUN: %lld --no-deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-1-dup %t/test.o %t/more-cstrings.o -order_file %t/ord-1
+# RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-1-dup | FileCheck %s --check-prefix=ONE_SYM
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-1-dup | FileCheck %s --check-prefix=ONE_SEC
-# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-2 %t/test.o %t/more-cstrings.o -order_file %t/ord-2
+# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-2 %t/test.o %t/more-cstrings.o -order_file %t/ord-2
# RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-2 | FileCheck %s --check-prefix=TWO_SYM
# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-2 | FileCheck %s --check-prefix=TWO_SEC
-# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-3 %t/test.o %t/more-cstrings.o -order_file %t/ord-3
+# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-3 %t/test.o %t/more-cstrings.o -order_file %t/ord-3
# RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-3 | FileCheck %s --check-prefix=THREE_SYM
# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-3 | FileCheck %s --check-prefix=THREE_SEC
-# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-4 %t/test.o %t/more-cstrings.o -order_file %t/ord-4
+# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-4 %t/test.o %t/more-cstrings.o -order_file %t/ord-4
# RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-4 | FileCheck %s --check-prefix=FOUR_SYM
# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-4 | FileCheck %s --check-prefix=FOUR_SEC
# RUN: llvm-readobj --string-dump=__cstring %t/test-4 | FileCheck %s --check-prefix=FOUR_SEC_ESCAPE
-
# We expect:
# 1) Covered cstring symbols are reordered
-# 2) the rest of the cstring symbols remain original relative order within the cstring section
+# 2) the rest of the cstring symbols remain in the original relative order within the cstring section
# ORIGIN_SYM: _local_foo1
# ORIGIN_SYM: _globl_foo2
@@ -49,29 +51,27 @@
# original order, but only parital covered
#--- ord-1
-#foo2
-CSTR;1433942677
-#bar
-CSTR;540201826
#bar2
CSTR;1496286555
+#foo2
+CSTR;1433942677
#foo3
CSTR;1343999025
+#bar
+CSTR;540201826
-# ONE_SYM: _globl_foo2
-# ONE_SYM: _local_foo2
-# ONE_SYM: _bar
# ONE_SYM: _bar2
+# ONE_SYM-DAG: _globl_foo2
+# ONE_SYM-DAG: _local_foo2
# ONE_SYM: _globl_foo3
-# ONE_SYM: _local_foo1
+# ONE_SYM: _bar
# ONE_SYM: _baz
# ONE_SYM: _baz_dup
-# ONE_SEC: foo2
-# ONE_SEC: bar
# ONE_SEC: bar2
+# ONE_SEC: foo2
# ONE_SEC: foo3
-# ONE_SEC: foo1
+# ONE_SEC: bar
# ONE_SEC: baz
``````````
</details>
https://github.com/llvm/llvm-project/pull/161879
More information about the llvm-commits
mailing list