[llvm] [BOLT][DWARF] Slice .debug_str from the DWP for each CU (PR #159540)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 18 02:52:07 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-bolt
Author: Liu Ke (Sockke)
<details>
<summary>Changes</summary>
Slice .debug_str from the DWP for each CU using .debug_str_offsets and emit it, instead of directly copying the global .debug_str, in order to address the bloat issue of DWO after updates. (more details here - [#<!-- -->155766](https://github.com/llvm/llvm-project/issues/155766))
I have added a test case for DWARF5; the test case for DWARF4 depends on [#<!-- -->155619](https://github.com/llvm/llvm-project/pull/155619).
---
Patch is 30.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159540.diff
4 Files Affected:
- (modified) bolt/lib/Rewrite/DWARFRewriter.cpp (+100-1)
- (added) bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-helper.s (+226)
- (added) bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-main.s (+225)
- (added) bolt/test/X86/dwarf5-str-dwp-input-dwo-output.test (+78)
``````````diff
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 6752489ad562a..7847eab7e4822 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -1725,6 +1725,65 @@ StringRef getSectionName(const SectionRef &Section) {
return Name;
}
+// Extracts the slices of .debug_str.dwo that one CU references from the DWP
+// string table and updates that CU's .debug_str_offsets.dwo to match.
+//
+// StrDWOContent is the string section to slice. StrOffsetsContent holds the
+// CU's 32-bit string offsets (DWARF64 is not handled), preceded by an 8-byte
+// header when DwarfVersion >= 5. Strings are laid out in the order in which
+// the offsets reference them; runs that are already adjacent in StrDWOContent
+// are merged into a single slice. The slices are appended to StrDWOOutData as
+// views into StrDWOContent, so that buffer must outlive them.
+// StrOffsetsOutData is left untouched when no offset changes; otherwise it
+// receives a copy of StrOffsetsContent with the offsets rewritten for the new
+// layout, encoded according to IsLittleEndian.
+void UpdateStrAndStrOffsets(StringRef StrDWOContent,
+                            StringRef StrOffsetsContent,
+                            SmallVectorImpl<StringRef> &StrDWOOutData,
+                            std::string &StrOffsetsOutData,
+                            unsigned DwarfVersion, bool IsLittleEndian) {
+  const llvm::endianness Endian =
+      IsLittleEndian ? llvm::endianness::little : llvm::endianness::big;
+  // Only DWARF32 is handled: 4-byte offsets and an 8-byte DWARF5 header.
+  const uint64_t HeaderOffset = (DwarfVersion >= 5) ? 8 : 0;
+  // A missing or truncated contribution has no offsets to process. Bail out
+  // before the unsigned subtraction below, which would otherwise wrap around
+  // and make the loop run far past the end of the data.
+  if (StrOffsetsContent.size() <= HeaderOffset)
+    return;
+  const uint64_t NumOffsets = (StrOffsetsContent.size() - HeaderOffset) / 4;
+
+  DataExtractor Extractor(StrOffsetsContent, IsLittleEndian, 0);
+  uint64_t ExtractionOffset = HeaderOffset;
+
+  // SectionContribution is reused here as a plain (offset, length) pair.
+  using StringFragment = DWARFUnitIndex::Entry::SectionContribution;
+  // Length of the string at Offset including its NUL terminator, or 0 when no
+  // terminator is found at or after Offset.
+  auto getStringLength = [](StringRef Content, uint64_t Offset) -> uint64_t {
+    size_t NullPos = Content.find('\0', Offset);
+    return (NullPos != StringRef::npos) ? (NullPos - Offset + 1) : 0;
+  };
+  // True when NextOffset starts exactly where Fragment ends.
+  auto isContiguous = [](const StringFragment &Fragment,
+                         uint64_t NextOffset) -> bool {
+    return NextOffset == Fragment.getOffset() + Fragment.getLength();
+  };
+  std::optional<StringFragment> CurrentFragment;
+  // Total length of the strings emitted so far, i.e. the offset the current
+  // string will have in the sliced output.
+  uint64_t AccumulatedStrLen = 0;
+  for (uint64_t I = 0; I < NumOffsets; ++I) {
+    const uint64_t StrOffset = Extractor.getU32(&ExtractionOffset);
+    const uint64_t StringLength = getStringLength(StrDWOContent, StrOffset);
+    if (!CurrentFragment) {
+      // First string: start the first fragment.
+      CurrentFragment = StringFragment(StrOffset, StringLength);
+    } else {
+      if (isContiguous(*CurrentFragment, StrOffset)) {
+        // Extend the current fragment to cover this string as well.
+        CurrentFragment->setLength(CurrentFragment->getLength() + StringLength);
+      } else {
+        // Save the current fragment and start a new one.
+        StrDWOOutData.push_back(StrDWOContent.substr(
+            CurrentFragment->getOffset(), CurrentFragment->getLength()));
+        CurrentFragment = StringFragment(StrOffset, StringLength);
+      }
+    }
+    if (AccumulatedStrLen != StrOffset) {
+      // The string moves in the sliced table, so patch its offset. The
+      // writable copy is created lazily, on the first offset that changes.
+      if (StrOffsetsOutData.empty())
+        StrOffsetsOutData = StrOffsetsContent.str();
+      llvm::support::endian::write32(&StrOffsetsOutData[HeaderOffset + I * 4],
+                                     static_cast<uint32_t>(AccumulatedStrLen),
+                                     Endian);
+    }
+    AccumulatedStrLen += StringLength;
+  }
+  // Flush the fragment that was still open when the loop ended.
+  if (CurrentFragment)
+    StrDWOOutData.push_back(StrDWOContent.substr(CurrentFragment->getOffset(),
+                                                 CurrentFragment->getLength()));
+}
+
// Exctracts an appropriate slice if input is DWP.
// Applies patches or overwrites the section.
std::optional<StringRef> updateDebugData(
@@ -1890,6 +1949,10 @@ void DWARFRewriter::writeDWOFiles(
}
}
+ StringRef StrDWOContent;
+ StringRef StrOffsetsContent;
+ llvm::SmallVector<StringRef, 3> StrDWOOutData;
+ std::string StrOffsetsOutData;
for (const SectionRef &Section : File->sections()) {
std::unique_ptr<DebugBufferVector> OutputData;
StringRef SectionName = getSectionName(Section);
@@ -1897,11 +1960,47 @@ void DWARFRewriter::writeDWOFiles(
continue;
Expected<StringRef> ContentsExp = Section.getContents();
assert(ContentsExp && "Invalid contents.");
+ if (IsDWP && SectionName == "debug_str.dwo") {
+ StrDWOContent = *ContentsExp;
+ continue;
+ }
if (std::optional<StringRef> OutData = updateDebugData(
(*DWOCU)->getContext(), SectionName, *ContentsExp, KnownSections,
*Streamer, *this, CUDWOEntry, DWOId, OutputData, RangeListssWriter,
- LocWriter, StrOffstsWriter, StrWriter, OverridenSections))
+ LocWriter, StrOffstsWriter, StrWriter, OverridenSections)) {
+ if (IsDWP && SectionName == "debug_str_offsets.dwo") {
+ StrOffsetsContent = *OutData;
+ continue;
+ }
Streamer->emitBytes(*OutData);
+ }
+ }
+
+ if (IsDWP) {
+ // Handling both .debug_str.dwo and .debug_str_offsets.dwo concurrently. In
+ // the original DWP, .debug_str is a deduplicated global table, and the
+ // .debug_str.dwo slice for a single CU needs to be extracted according to
+ // .debug_str_offsets.dwo.
+ UpdateStrAndStrOffsets(StrDWOContent, StrOffsetsContent, StrDWOOutData,
+ StrOffsetsOutData, CU.getVersion(),
+ (*DWOCU)->getContext().isLittleEndian());
+ auto SectionIter = KnownSections.find("debug_str.dwo");
+ if (SectionIter != KnownSections.end()) {
+ Streamer->switchSection(SectionIter->second.first);
+ for (size_t i = 0; i < StrDWOOutData.size(); ++i) {
+ StringRef OutData = StrDWOOutData[i];
+ if (!OutData.empty())
+ Streamer->emitBytes(OutData);
+ }
+ }
+ SectionIter = KnownSections.find("debug_str_offsets.dwo");
+ if (SectionIter != KnownSections.end()) {
+ Streamer->switchSection(SectionIter->second.first);
+ if (!StrOffsetsOutData.empty())
+ Streamer->emitBytes(StrOffsetsOutData);
+ else
+ Streamer->emitBytes(StrOffsetsContent);
+ }
}
Streamer->finish();
TempOut->keep();
diff --git a/bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-helper.s b/bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-helper.s
new file mode 100644
index 0000000000000..540f41a462123
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-helper.s
@@ -0,0 +1,226 @@
+# clang++ -g2 -gdwarf-5 -gsplit-dwarf=split -S helper.cpp
+# int getReturn() {
+# return 0;
+# }
+ .file "helper.cpp"
+ .text
+ .globl _Z9getReturnv # -- Begin function _Z9getReturnv
+ .p2align 4
+ .type _Z9getReturnv, at function
+_Z9getReturnv: # @_Z9getReturnv
+.Lfunc_begin0:
+ .file 0 "." "helper.cpp" md5 0xc7d7879297b54325c71b3e0cfbb65e2d
+ .loc 0 1 0 # helper.cpp:1:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp0:
+ .loc 0 2 3 prologue_end # helper.cpp:2:3
+ xorl %eax, %eax
+ .loc 0 2 3 epilogue_begin is_stmt 0 # helper.cpp:2:3
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size _Z9getReturnv, .Lfunc_end0-_Z9getReturnv
+ .cfi_endproc
+ # -- End function
+ .section .debug_abbrev,"", at progbits
+ .byte 1 # Abbreviation Code
+ .byte 74 # DW_TAG_skeleton_unit
+ .byte 0 # DW_CHILDREN_no
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .ascii "\264B" # DW_AT_GNU_pubnames
+ .byte 25 # DW_FORM_flag_present
+ .byte 118 # DW_AT_dwo_name
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"", at progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 4 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .quad 5976014880088676049
+ .byte 1 # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .byte 0 # DW_AT_comp_dir
+ # DW_AT_GNU_pubnames
+ .byte 1 # DW_AT_dwo_name
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .long .Laddr_table_base0 # DW_AT_addr_base
+.Ldebug_info_end0:
+ .section .debug_str_offsets,"", at progbits
+ .long 12 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS", at progbits,1
+.Lskel_string0:
+ .asciz "." # string offset=0
+.Lskel_string1:
+ .asciz "helper.dwo" # string offset=2
+ .section .debug_str_offsets,"", at progbits
+ .long .Lskel_string0
+ .long .Lskel_string1
+ .section .debug_str_offsets.dwo,"e", at progbits
+ .long 28 # Length of String Offsets Set
+ .short 5
+ .short 0
+ .section .debug_str.dwo,"eMS", at progbits,1
+.Linfo_string0:
+ .asciz "_Z9getReturnv" # string offset=0
+.Linfo_string1:
+ .asciz "getReturn" # string offset=14
+.Linfo_string2:
+ .asciz "int" # string offset=24
+.Linfo_string3:
+ .asciz "clang version 22.0.0" # string offset=28
+.Linfo_string4:
+ .asciz "helper.cpp" # string offset=49
+.Linfo_string5:
+ .asciz "helper.dwo" # string offset=60
+ .section .debug_str_offsets.dwo,"e", at progbits
+ .long 0
+ .long 14
+ .long 24
+ .long 28
+ .long 49
+ .long 60
+ .section .debug_info.dwo,"e", at progbits
+ .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+ .short 5 # DWARF version number
+ .byte 5 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long 0 # Offset Into Abbrev. Section
+ .quad 5976014880088676049
+ .byte 1 # Abbrev [1] 0x14:0x1b DW_TAG_compile_unit
+ .byte 3 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 4 # DW_AT_name
+ .byte 5 # DW_AT_dwo_name
+ .byte 2 # Abbrev [2] 0x1a:0x10 DW_TAG_subprogram
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 0 # DW_AT_linkage_name
+ .byte 1 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .long 42 # DW_AT_type
+ # DW_AT_external
+ .byte 3 # Abbrev [3] 0x2a:0x4 DW_TAG_base_type
+ .byte 2 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 0 # End Of Children Mark
+.Ldebug_info_dwo_end0:
+ .section .debug_abbrev.dwo,"e", at progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 118 # DW_AT_dwo_name
+ .byte 37 # DW_FORM_strx1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 0 # DW_CHILDREN_no
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 110 # DW_AT_linkage_name
+ .byte 37 # DW_FORM_strx1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_addr,"", at progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad .Lfunc_begin0
+.Ldebug_addr_end0:
+ .section .debug_gnu_pubnames,"", at progbits
+ .long .LpubNames_end0-.LpubNames_start0 # Length of Public Names Info
+.LpubNames_start0:
+ .short 2 # DWARF Version
+ .long .Lcu_begin0 # Offset of Compilation Unit Info
+ .long 40 # Compilation Unit Length
+ .long 26 # DIE offset
+ .byte 48 # Attributes: FUNCTION, EXTERNAL
+ .asciz "getReturn" # External Name
+ .long 0 # End Mark
+.LpubNames_end0:
+ .section .debug_gnu_pubtypes,"", at progbits
+ .long .LpubTypes_end0-.LpubTypes_start0 # Length of Public Types Info
+.LpubTypes_start0:
+ .short 2 # DWARF Version
+ .long .Lcu_begin0 # Offset of Compilation Unit Info
+ .long 40 # Compilation Unit Length
+ .long 42 # DIE offset
+ .byte 144 # Attributes: TYPE, STATIC
+ .asciz "int" # External Name
+ .long 0 # End Mark
+.LpubTypes_end0:
+ .ident "clang version 22.0.0"
+ .section ".note.GNU-stack","", at progbits
+ .addrsig
+ .section .debug_line,"", at progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-main.s b/bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-main.s
new file mode 100644
index 0000000000000..d20eab14edf00
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-main.s
@@ -0,0 +1,225 @@
+# clang++ -g2 -gdwarf-5 -gsplit-dwarf=split -S main.cpp
+# extern int getReturn();
+# int main() {
+# return getReturn();
+# }
+ .file "main.cpp"
+ .text
+ .globl main # -- Begin function main
+ .p2align 4
+ .type main, at function
+main: # @main
+.Lfunc_begin0:
+ .file 0 "." "main.cpp" md5 0x9cdef858e26cf684ed9ef3b60e05bdad
+ .loc 0 2 0 # main.cpp:2:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ subq $16, %rsp
+ movl $0, -4(%rbp)
+.Ltmp0:
+ .loc 0 3 10 prologue_end # main.cpp:3:10
+ callq _Z9getReturnv at PLT
+ .loc 0 3 3 epilogue_begin is_stmt 0 # main.cpp:3:3
+ addq $16, %rsp
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+ .cfi_endproc
+ # -- End function
+ .section .debug_abbrev,"", at progbits
+ .byte 1 # Abbreviation Code
+ .byte 74 # DW_TAG_skeleton_unit
+ .byte 0 # DW_CHILDREN_no
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .ascii "\264B" # DW_AT_GNU_pubnames
+ .byte 25 # DW_FORM_flag_present
+ .byte 118 # DW_AT_dwo_name
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"", at progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 4 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .quad -9094791692727444213
+ .byte 1 # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .byte 0 # DW_AT_comp_dir
+ # DW_AT_GNU_pubnames
+ .byte 1 # DW_AT_dwo_name
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .long .Laddr_table_base0 # DW_AT_addr_base
+.Ldebug_info_end0:
+ .section .debug_str_offsets,"", at progbits
+ .long 12 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS", at progbits,1
+.Lskel_string0:
+ .asciz "." # string offset=0
+.Lskel_string1:
+ .asciz "main.dwo" # string offset=2
+ .section .debug_str_offsets,"", at progbits
+ .long .Lskel_string0
+ .long .Lskel_string1
+ .section .debug_str_offsets.dwo,"e", at progbits
+ .long 24 ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/159540
More information about the llvm-commits
mailing list