[lld] ac2dd06 - [lld-macho] Deduplicate CFStrings
Jez Ng via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 1 18:23:01 PDT 2021
Author: Jez Ng
Date: 2021-07-01T21:22:38-04:00
New Revision: ac2dd06b91ae7264fa7d396c15c7647510ede231
URL: https://github.com/llvm/llvm-project/commit/ac2dd06b91ae7264fa7d396c15c7647510ede231
DIFF: https://github.com/llvm/llvm-project/commit/ac2dd06b91ae7264fa7d396c15c7647510ede231.diff
LOG: [lld-macho] Deduplicate CFStrings
`__cfstring` is a special literal section, so instead of breaking it up
at symbol boundaries, we break it up at fixed-width boundaries (since
each literal is the same size). Symbols can only occur at one of those
boundaries, so this is strictly more powerful than
`.subsections_via_symbols`.
With that in place, we then run the section through ICF.
This change is about perf-neutral when linking chromium_framework.
Reviewed By: #lld-macho, gkm
Differential Revision: https://reviews.llvm.org/D105045
Added:
lld/test/MachO/cfstring-dedup.s
lld/test/MachO/invalid/cfstring.s
Modified:
lld/MachO/ICF.cpp
lld/MachO/InputFiles.cpp
lld/MachO/InputSection.cpp
lld/MachO/InputSection.h
lld/test/MachO/Inputs/MacOSX.sdk/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation.tbd
Removed:
################################################################################
diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp
index c1b8325d2a6c8..fbd7cb36514fa 100644
--- a/lld/MachO/ICF.cpp
+++ b/lld/MachO/ICF.cpp
@@ -324,6 +324,7 @@ void macho::foldIdenticalSections() {
// parallelization. Therefore, we hash every InputSection here where we have
// them all accessible as simple vectors.
std::vector<ConcatInputSection *> codeSections;
+ std::vector<ConcatInputSection *> cfStringSections;
// ICF can't fold functions with unwind info
DenseSet<const InputSection *> functionsWithUnwindInfo =
@@ -339,18 +340,30 @@ void macho::foldIdenticalSections() {
// ICF::segregate()
uint64_t icfUniqueID = inputSections.size();
for (ConcatInputSection *isec : inputSections) {
- bool isHashable = isCodeSection(isec) && !isec->shouldOmitFromOutput() &&
+ bool isHashable = (isCodeSection(isec) || isCfStringSection(isec)) &&
+ !isec->shouldOmitFromOutput() &&
!functionsWithUnwindInfo.contains(isec) &&
isec->isHashableForICF();
if (isHashable) {
- codeSections.push_back(isec);
+ if (isCodeSection(isec))
+ codeSections.push_back(isec);
+ else {
+ assert(isCfStringSection(isec));
+ cfStringSections.push_back(isec);
+ }
} else {
isec->icfEqClass[0] = ++icfUniqueID;
}
}
- parallelForEach(codeSections,
+ std::vector<ConcatInputSection *> hashable(codeSections);
+ hashable.insert(hashable.end(), cfStringSections.begin(),
+ cfStringSections.end());
+ parallelForEach(hashable,
[](ConcatInputSection *isec) { isec->hashForICF(); });
// Now that every input section is either hashed or marked as unique, run the
// segregation algorithm to detect foldable subsections.
+ // We dedup cfStringSections first since code sections may refer to them, but
+ // not vice-versa.
+ ICF(cfStringSections).run();
ICF(codeSections).run();
}
diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp
index f75c65f9370d8..6caf48232ad9b 100644
--- a/lld/MachO/InputFiles.cpp
+++ b/lld/MachO/InputFiles.cpp
@@ -281,6 +281,17 @@ void ObjFile::parseSections(ArrayRef<Section> sections) {
flags);
}
subsections.push_back({{0, isec}});
+ } else if (config->icfLevel != ICFLevel::none &&
+ (name == section_names::cfString &&
+ segname == segment_names::data)) {
+ uint64_t literalSize = target->wordSize == 8 ? 32 : 16;
+ subsections.push_back({});
+ SubsectionMap &subsecMap = subsections.back();
+ for (uint64_t off = 0; off < data.size(); off += literalSize)
+ subsecMap.push_back(
+ {off, make<ConcatInputSection>(segname, name, this,
+ data.slice(off, literalSize), align,
+ flags)});
} else {
auto *isec =
make<ConcatInputSection>(segname, name, this, data, align, flags);
@@ -593,22 +604,43 @@ void ObjFile::parseSymbols(ArrayRef<typename LP::section> sectionHeaders,
}
}
- // Calculate symbol sizes and create subsections by splitting the sections
- // along symbol boundaries.
for (size_t i = 0; i < subsections.size(); ++i) {
SubsectionMap &subsecMap = subsections[i];
if (subsecMap.empty())
continue;
std::vector<uint32_t> &symbolIndices = symbolsBySection[i];
- llvm::stable_sort(symbolIndices, [&](uint32_t lhs, uint32_t rhs) {
- return nList[lhs].n_value < nList[rhs].n_value;
- });
uint64_t sectionAddr = sectionHeaders[i].addr;
uint32_t sectionAlign = 1u << sectionHeaders[i].align;
+ InputSection *isec = subsecMap.back().isec;
+ // __cfstring has already been split into subsections during
+ // parseSections(), so we simply need to match Symbols to the corresponding
+ // subsection here.
+ if (config->icfLevel != ICFLevel::none && isCfStringSection(isec)) {
+ for (size_t j = 0; j < symbolIndices.size(); ++j) {
+ uint32_t symIndex = symbolIndices[j];
+ const NList &sym = nList[symIndex];
+ StringRef name = strtab + sym.n_strx;
+ uint64_t symbolOffset = sym.n_value - sectionAddr;
+ InputSection *isec = findContainingSubsection(subsecMap, &symbolOffset);
+ if (symbolOffset != 0) {
+ error(toString(this) + ": __cfstring contains symbol " + name +
+ " at misaligned offset");
+ continue;
+ }
+ symbols[symIndex] = createDefined(sym, name, isec, 0, isec->getSize());
+ }
+ continue;
+ }
+
+ // Calculate symbol sizes and create subsections by splitting the sections
+ // along symbol boundaries.
// We populate subsecMap by repeatedly splitting the last (highest address)
// subsection.
+ llvm::stable_sort(symbolIndices, [&](uint32_t lhs, uint32_t rhs) {
+ return nList[lhs].n_value < nList[rhs].n_value;
+ });
SubsectionEntry subsecEntry = subsecMap.back();
for (size_t j = 0; j < symbolIndices.size(); ++j) {
uint32_t symIndex = symbolIndices[j];
diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp
index 4ad790377676f..5f1eab349ba5a 100644
--- a/lld/MachO/InputSection.cpp
+++ b/lld/MachO/InputSection.cpp
@@ -232,6 +232,11 @@ bool macho::isCodeSection(const InputSection *isec) {
return false;
}
+bool macho::isCfStringSection(const InputSection *isec) {
+ return isec->name == section_names::cfString &&
+ isec->segname == segment_names::data;
+}
+
std::string lld::toString(const InputSection *isec) {
return (toString(isec->file) + ":(" + isec->name + ")").str();
}
diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h
index efa175e0bfc7f..a7d8b6e29fcf9 100644
--- a/lld/MachO/InputSection.h
+++ b/lld/MachO/InputSection.h
@@ -257,6 +257,8 @@ inline bool isWordLiteralSection(uint32_t flags) {
bool isCodeSection(const InputSection *);
+bool isCfStringSection(const InputSection *);
+
extern std::vector<ConcatInputSection *> inputSections;
namespace section_names {
diff --git a/lld/test/MachO/Inputs/MacOSX.sdk/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation.tbd b/lld/test/MachO/Inputs/MacOSX.sdk/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation.tbd
index 643dff44bde46..4faf3f2be6bbb 100644
--- a/lld/test/MachO/Inputs/MacOSX.sdk/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation.tbd
+++ b/lld/test/MachO/Inputs/MacOSX.sdk/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation.tbd
@@ -7,7 +7,7 @@ current-version: 0001.001.1
compatibility-version: 150
exports:
- archs: [ 'x86_64' ]
- symbols: [ '__CFBigNumGetInt128' ]
+ symbols: [ __CFBigNumGetInt128, ___CFConstantStringClassReference ]
objc-classes: [ NSObject ]
objc-ivars: [ NSConstantArray._count ]
objc-eh-types: [ NSException ]
diff --git a/lld/test/MachO/cfstring-dedup.s b/lld/test/MachO/cfstring-dedup.s
new file mode 100644
index 0000000000000..1a043064b6e0d
--- /dev/null
+++ b/lld/test/MachO/cfstring-dedup.s
@@ -0,0 +1,146 @@
+# REQUIRES: x86
+# RUN: rm -rf %t; split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/foo1.s -o %t/foo1.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/foo2.s -o %t/foo2.o
+# RUN: %lld -dylib --icf=all -framework CoreFoundation %t/foo1.o %t/foo2.o -o %t/foo
+# RUN: llvm-objdump --macho --rebase --bind --syms -d %t/foo | FileCheck %s
+
+# CHECK: (__TEXT,__text) section
+# CHECK-NEXT: _foo1:
+# CHECK-NEXT: _foo2:
+# CHECK-NEXT: movq _named_cfstring(%rip), %rax
+# CHECK-NEXT: _foo1_utf16:
+# CHECK-NEXT: movq [[#]](%rip), %rax
+# CHECK-NEXT: _named_foo1:
+# CHECK-NEXT: _named_foo2:
+# CHECK-NEXT: movq _named_cfstring(%rip), %rax
+# CHECK-NEXT: _foo2_utf16:
+# CHECK-NEXT: movq [[#]](%rip), %rax
+
+# CHECK: SYMBOL TABLE:
+# CHECK-DAG: [[#%.16x,FOO:]] g F __TEXT,__text _foo1
+# CHECK-DAG: [[#FOO]] g F __TEXT,__text _foo2
+
+## Make sure we don't emit redundant bind / rebase opcodes for folded sections.
+# CHECK: Rebase table:
+# CHECK-NEXT: segment section address type
+# CHECK-NEXT: __DATA_CONST __cfstring {{.*}} pointer
+# CHECK-NEXT: __DATA_CONST __cfstring {{.*}} pointer
+# CHECK-NEXT: __DATA_CONST __cfstring {{.*}} pointer
+# CHECK-EMPTY:
+# CHECK-NEXT: Bind table:
+# CHECK-NEXT: segment section address type addend dylib symbol
+# CHECK-NEXT: __DATA_CONST __cfstring {{.*}} pointer 0 CoreFoundation ___CFConstantStringClassReference
+# CHECK-NEXT: __DATA_CONST __cfstring {{.*}} pointer 0 CoreFoundation ___CFConstantStringClassReference
+# CHECK-NEXT: __DATA_CONST __cfstring {{.*}} pointer 0 CoreFoundation ___CFConstantStringClassReference
+# CHECK-EMPTY:
+
+#--- foo1.s
+.cstring
+L_.str:
+ .asciz "foo"
+
+.section __DATA,__cfstring
+.p2align 3
+L__unnamed_cfstring_:
+ .quad ___CFConstantStringClassReference
+ .long 1992 ## utf-8
+ .space 4
+ .quad L_.str
+ .quad 3 ## strlen
+
+_named_cfstring:
+ .quad ___CFConstantStringClassReference
+ .long 1992 ## utf-8
+ .space 4
+ .quad L_.str
+ .quad 3 ## strlen
+
+.section __TEXT,__ustring
+l_.str.2:
+ .short 102 ## f
+ .short 111 ## o
+ .short 0 ## \0
+ .short 111 ## o
+ .short 0 ## \0
+
+## FIXME: We should be able to deduplicate UTF-16 CFStrings too.
+## Note that this string contains a null byte in the middle -- any dedup code
+## we add should take care to handle this correctly.
+## Technically, UTF-8 should support encoding null bytes too, but since we
+## atomize the __cstring section at every null byte, this isn't supported. ld64
+## doesn't support it either, and clang seems to always emit a UTF-16 CFString
+## if it needs to contain a null, so I think we're good here.
+.section __DATA,__cfstring
+.p2align 3
+L__unnamed_cfstring_.2:
+ .quad ___CFConstantStringClassReference
+ .long 2000 ## utf-16
+ .space 4
+ .quad l_.str.2
+ .quad 4 ## strlen
+
+.text
+.globl _foo1, _foo1_utf16, _named_foo1
+_foo1:
+ movq L__unnamed_cfstring_(%rip), %rax
+
+_foo1_utf16:
+ movq L__unnamed_cfstring_.2(%rip), %rax
+
+_named_foo1:
+ movq _named_cfstring(%rip), %rax
+
+.subsections_via_symbols
+
+#--- foo2.s
+.cstring
+L_.str:
+ .asciz "foo"
+
+.section __DATA,__cfstring
+.p2align 3
+L__unnamed_cfstring_:
+ .quad ___CFConstantStringClassReference
+ .long 1992 ## utf-8
+ .space 4
+ .quad L_.str
+ .quad 3 ## strlen
+
+_named_cfstring:
+ .quad ___CFConstantStringClassReference
+ .long 1992 ## utf-8
+ .space 4
+ .quad L_.str
+ .quad 3 ## strlen
+
+.section __TEXT,__ustring
+ .p2align 1
+l_.str.2:
+ .short 102 ## f
+ .short 111 ## o
+ .short 0 ## \0
+ .short 111 ## o
+ .short 0 ## \0
+
+.section __DATA,__cfstring
+.p2align 3
+L__unnamed_cfstring_.2:
+ .quad ___CFConstantStringClassReference
+ .long 2000 ## utf-16
+ .space 4
+ .quad l_.str.2
+ .quad 4 ## strlen
+
+.text
+.globl _foo2, _foo2_utf16, _named_foo2
+_foo2:
+ movq L__unnamed_cfstring_(%rip), %rax
+
+_foo2_utf16:
+ movq L__unnamed_cfstring_.2(%rip), %rax
+
+_named_foo2:
+ movq _named_cfstring(%rip), %rax
+
+.subsections_via_symbols
diff --git a/lld/test/MachO/invalid/cfstring.s b/lld/test/MachO/invalid/cfstring.s
new file mode 100644
index 0000000000000..50a1038cb7522
--- /dev/null
+++ b/lld/test/MachO/invalid/cfstring.s
@@ -0,0 +1,19 @@
+# REQUIRES: x86
+# RUN: rm -rf %t; mkdir %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/test.o
+# RUN: not %lld -dylib -framework CoreFoundation --icf=all %t/test.o 2>&1 | FileCheck %s
+# CHECK: error: {{.*}}test.o: __cfstring contains symbol _uh_oh at misaligned offset
+
+.cstring
+L_.str:
+ .asciz "foo"
+
+.section __DATA,__cfstring
+.p2align 3
+L__unnamed_cfstring_:
+ .quad ___CFConstantStringClassReference
+ .long 1992 ## utf-8
+_uh_oh:
+ .space 4
+ .quad L_.str
+ .quad 3 ## strlen
More information about the llvm-commits
mailing list