[llvm] [BOLT][DWARF] Slice .debug_str from the DWP for each CU (PR #159540)

via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 18 02:52:07 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-bolt

Author: Liu Ke (Sockke)

<details>
<summary>Changes</summary>

For each CU, slice the strings it references out of the DWP's .debug_str (located via .debug_str_offsets) and emit only that slice, instead of copying the entire global .debug_str. This addresses the size bloat of DWO files after updates. (More details in [#<!-- -->155766](https://github.com/llvm/llvm-project/issues/155766).)

I have added a test case for DWARF5; the test case for DWARF4 depends on [#<!-- -->155619](https://github.com/llvm/llvm-project/pull/155619).

---

Patch is 30.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159540.diff


4 Files Affected:

- (modified) bolt/lib/Rewrite/DWARFRewriter.cpp (+100-1) 
- (added) bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-helper.s (+226) 
- (added) bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-main.s (+225) 
- (added) bolt/test/X86/dwarf5-str-dwp-input-dwo-output.test (+78) 


``````````diff
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 6752489ad562a..7847eab7e4822 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -1725,6 +1725,65 @@ StringRef getSectionName(const SectionRef &Section) {
   return Name;
 }
 
+// Extracts the appropriate slices of .debug_str.dwo from the DWP.
+// Updates the .debug_str_offsets.dwo for CUs.
+void UpdateStrAndStrOffsets(StringRef StrDWOContent,
+                            StringRef StrOffsetsContent,
+                            SmallVectorImpl<StringRef> &StrDWOOutData,
+                            std::string &StrOffsetsOutData,
+                            unsigned DwarfVersion, bool IsLittleEndian) {
+  const llvm::endianness Endian =
+      IsLittleEndian ? llvm::endianness::little : llvm::endianness::big;
+  // ignore DWARF64
+  const uint64_t HeaderOffset = (DwarfVersion >= 5) ? 8 : 0;
+  const uint64_t NumOffsets = (StrOffsetsContent.size() - HeaderOffset) / 4;
+
+  DataExtractor Extractor(StrOffsetsContent, IsLittleEndian, 0);
+  uint64_t ExtractionOffset = HeaderOffset;
+
+  using StringFragment = DWARFUnitIndex::Entry::SectionContribution;
+  auto getStringLength = [](StringRef Content, uint64_t Offset) -> uint64_t {
+    size_t NullPos = Content.find('\0', Offset);
+    return (NullPos != StringRef::npos) ? (NullPos - Offset + 1) : 0;
+  };
+  auto isContiguous = [](const StringFragment &Fragment,
+                         uint64_t NextOffset) -> bool {
+    return NextOffset == Fragment.getOffset() + Fragment.getLength();
+  };
+  std::optional<StringFragment> CurrentFragment;
+  uint64_t AccumulatedStrLen = 0;
+  for (uint64_t I = 0; I < NumOffsets; ++I) {
+    const uint64_t StrOffset = Extractor.getU32(&ExtractionOffset);
+    const uint64_t StringLength = getStringLength(StrDWOContent, StrOffset);
+    if (!CurrentFragment) {
+      // first init
+      CurrentFragment = StringFragment(StrOffset, StringLength);
+    } else {
+      if (isContiguous(*CurrentFragment, StrOffset)) {
+        // expand the current fragment
+        CurrentFragment->setLength(CurrentFragment->getLength() + StringLength);
+      } else {
+        // save the current fragment and start a new one
+        StrDWOOutData.push_back(StrDWOContent.substr(
+            CurrentFragment->getOffset(), CurrentFragment->getLength()));
+        CurrentFragment = StringFragment(StrOffset, StringLength);
+      }
+    }
+    if (AccumulatedStrLen != StrOffset) {
+      // update str offsets
+      if (StrOffsetsOutData.empty())
+        StrOffsetsOutData = StrOffsetsContent.str();
+      llvm::support::endian::write32(&StrOffsetsOutData[HeaderOffset + I * 4],
+                                     static_cast<uint32_t>(AccumulatedStrLen),
+                                     Endian);
+    }
+    AccumulatedStrLen += StringLength;
+  }
+  if (CurrentFragment)
+    StrDWOOutData.push_back(StrDWOContent.substr(CurrentFragment->getOffset(),
+                                                 CurrentFragment->getLength()));
+}
+
 // Exctracts an appropriate slice if input is DWP.
 // Applies patches or overwrites the section.
 std::optional<StringRef> updateDebugData(
@@ -1890,6 +1949,10 @@ void DWARFRewriter::writeDWOFiles(
     }
   }
 
+  StringRef StrDWOContent;
+  StringRef StrOffsetsContent;
+  llvm::SmallVector<StringRef, 3> StrDWOOutData;
+  std::string StrOffsetsOutData;
   for (const SectionRef &Section : File->sections()) {
     std::unique_ptr<DebugBufferVector> OutputData;
     StringRef SectionName = getSectionName(Section);
@@ -1897,11 +1960,47 @@ void DWARFRewriter::writeDWOFiles(
       continue;
     Expected<StringRef> ContentsExp = Section.getContents();
     assert(ContentsExp && "Invalid contents.");
+    if (IsDWP && SectionName == "debug_str.dwo") {
+      StrDWOContent = *ContentsExp;
+      continue;
+    }
     if (std::optional<StringRef> OutData = updateDebugData(
             (*DWOCU)->getContext(), SectionName, *ContentsExp, KnownSections,
             *Streamer, *this, CUDWOEntry, DWOId, OutputData, RangeListssWriter,
-            LocWriter, StrOffstsWriter, StrWriter, OverridenSections))
+            LocWriter, StrOffstsWriter, StrWriter, OverridenSections)) {
+      if (IsDWP && SectionName == "debug_str_offsets.dwo") {
+        StrOffsetsContent = *OutData;
+        continue;
+      }
       Streamer->emitBytes(*OutData);
+    }
+  }
+
+  if (IsDWP) {
+    // Handling both .debug_str.dwo and .debug_str_offsets.dwo concurrently. In
+    // the original DWP, .debug_str is a deduplicated global table, and the
+    // .debug_str.dwo slice for a single CU needs to be extracted according to
+    // .debug_str_offsets.dwo.
+    UpdateStrAndStrOffsets(StrDWOContent, StrOffsetsContent, StrDWOOutData,
+                           StrOffsetsOutData, CU.getVersion(),
+                           (*DWOCU)->getContext().isLittleEndian());
+    auto SectionIter = KnownSections.find("debug_str.dwo");
+    if (SectionIter != KnownSections.end()) {
+      Streamer->switchSection(SectionIter->second.first);
+      for (size_t i = 0; i < StrDWOOutData.size(); ++i) {
+        StringRef OutData = StrDWOOutData[i];
+        if (!OutData.empty())
+          Streamer->emitBytes(OutData);
+      }
+    }
+    SectionIter = KnownSections.find("debug_str_offsets.dwo");
+    if (SectionIter != KnownSections.end()) {
+      Streamer->switchSection(SectionIter->second.first);
+      if (!StrOffsetsOutData.empty())
+        Streamer->emitBytes(StrOffsetsOutData);
+      else
+        Streamer->emitBytes(StrOffsetsContent);
+    }
   }
   Streamer->finish();
   TempOut->keep();
diff --git a/bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-helper.s b/bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-helper.s
new file mode 100644
index 0000000000000..540f41a462123
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-helper.s
@@ -0,0 +1,226 @@
+# clang++ -g2 -gdwarf-5 -gsplit-dwarf=split -S helper.cpp
+# int getReturn() {
+#   return 0;
+# }
+	.file	"helper.cpp"
+	.text
+	.globl	_Z9getReturnv                   # -- Begin function _Z9getReturnv
+	.p2align	4
+	.type	_Z9getReturnv,@function
+_Z9getReturnv:                          # @_Z9getReturnv
+.Lfunc_begin0:
+	.file	0 "." "helper.cpp" md5 0xc7d7879297b54325c71b3e0cfbb65e2d
+	.loc	0 1 0                           # helper.cpp:1:0
+	.cfi_startproc
+# %bb.0:                                # %entry
+	pushq	%rbp
+	.cfi_def_cfa_offset 16
+	.cfi_offset %rbp, -16
+	movq	%rsp, %rbp
+	.cfi_def_cfa_register %rbp
+.Ltmp0:
+	.loc	0 2 3 prologue_end              # helper.cpp:2:3
+	xorl	%eax, %eax
+	.loc	0 2 3 epilogue_begin is_stmt 0  # helper.cpp:2:3
+	popq	%rbp
+	.cfi_def_cfa %rsp, 8
+	retq
+.Ltmp1:
+.Lfunc_end0:
+	.size	_Z9getReturnv, .Lfunc_end0-_Z9getReturnv
+	.cfi_endproc
+                                        # -- End function
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	74                              # DW_TAG_skeleton_unit
+	.byte	0                               # DW_CHILDREN_no
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	27                              # DW_AT_comp_dir
+	.byte	37                              # DW_FORM_strx1
+	.ascii	"\264B"                         # DW_AT_GNU_pubnames
+	.byte	25                              # DW_FORM_flag_present
+	.byte	118                             # DW_AT_dwo_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	17                              # DW_AT_low_pc
+	.byte	27                              # DW_FORM_addrx
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	115                             # DW_AT_addr_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.short	5                               # DWARF version number
+	.byte	4                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.quad	5976014880088676049
+	.byte	1                               # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.byte	0                               # DW_AT_comp_dir
+                                        # DW_AT_GNU_pubnames
+	.byte	1                               # DW_AT_dwo_name
+	.byte	0                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.long	.Laddr_table_base0              # DW_AT_addr_base
+.Ldebug_info_end0:
+	.section	.debug_str_offsets,"",@progbits
+	.long	12                              # Length of String Offsets Set
+	.short	5
+	.short	0
+.Lstr_offsets_base0:
+	.section	.debug_str,"MS",@progbits,1
+.Lskel_string0:
+	.asciz	"." # string offset=0
+.Lskel_string1:
+	.asciz	"helper.dwo"                    # string offset=2
+	.section	.debug_str_offsets,"",@progbits
+	.long	.Lskel_string0
+	.long	.Lskel_string1
+	.section	.debug_str_offsets.dwo,"e",@progbits
+	.long	28                              # Length of String Offsets Set
+	.short	5
+	.short	0
+	.section	.debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+	.asciz	"_Z9getReturnv"                 # string offset=0
+.Linfo_string1:
+	.asciz	"getReturn"                     # string offset=14
+.Linfo_string2:
+	.asciz	"int"                           # string offset=24
+.Linfo_string3:
+	.asciz	"clang version 22.0.0" # string offset=28
+.Linfo_string4:
+	.asciz	"helper.cpp"                    # string offset=49
+.Linfo_string5:
+	.asciz	"helper.dwo"                    # string offset=60
+	.section	.debug_str_offsets.dwo,"e",@progbits
+	.long	0
+	.long	14
+	.long	24
+	.long	28
+	.long	49
+	.long	60
+	.section	.debug_info.dwo,"e",@progbits
+	.long	.Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+	.short	5                               # DWARF version number
+	.byte	5                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	0                               # Offset Into Abbrev. Section
+	.quad	5976014880088676049
+	.byte	1                               # Abbrev [1] 0x14:0x1b DW_TAG_compile_unit
+	.byte	3                               # DW_AT_producer
+	.short	33                              # DW_AT_language
+	.byte	4                               # DW_AT_name
+	.byte	5                               # DW_AT_dwo_name
+	.byte	2                               # Abbrev [2] 0x1a:0x10 DW_TAG_subprogram
+	.byte	0                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.byte	1                               # DW_AT_frame_base
+	.byte	86
+	.byte	0                               # DW_AT_linkage_name
+	.byte	1                               # DW_AT_name
+	.byte	0                               # DW_AT_decl_file
+	.byte	1                               # DW_AT_decl_line
+	.long	42                              # DW_AT_type
+                                        # DW_AT_external
+	.byte	3                               # Abbrev [3] 0x2a:0x4 DW_TAG_base_type
+	.byte	2                               # DW_AT_name
+	.byte	5                               # DW_AT_encoding
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_dwo_end0:
+	.section	.debug_abbrev.dwo,"e",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	17                              # DW_TAG_compile_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	37                              # DW_AT_producer
+	.byte	37                              # DW_FORM_strx1
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	118                             # DW_AT_dwo_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	2                               # Abbreviation Code
+	.byte	46                              # DW_TAG_subprogram
+	.byte	0                               # DW_CHILDREN_no
+	.byte	17                              # DW_AT_low_pc
+	.byte	27                              # DW_FORM_addrx
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	64                              # DW_AT_frame_base
+	.byte	24                              # DW_FORM_exprloc
+	.byte	110                             # DW_AT_linkage_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	3                               # Abbreviation Code
+	.byte	36                              # DW_TAG_base_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	62                              # DW_AT_encoding
+	.byte	11                              # DW_FORM_data1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_addr,"",@progbits
+	.long	.Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+	.short	5                               # DWARF version number
+	.byte	8                               # Address size
+	.byte	0                               # Segment selector size
+.Laddr_table_base0:
+	.quad	.Lfunc_begin0
+.Ldebug_addr_end0:
+	.section	.debug_gnu_pubnames,"",@progbits
+	.long	.LpubNames_end0-.LpubNames_start0 # Length of Public Names Info
+.LpubNames_start0:
+	.short	2                               # DWARF Version
+	.long	.Lcu_begin0                     # Offset of Compilation Unit Info
+	.long	40                              # Compilation Unit Length
+	.long	26                              # DIE offset
+	.byte	48                              # Attributes: FUNCTION, EXTERNAL
+	.asciz	"getReturn"                     # External Name
+	.long	0                               # End Mark
+.LpubNames_end0:
+	.section	.debug_gnu_pubtypes,"",@progbits
+	.long	.LpubTypes_end0-.LpubTypes_start0 # Length of Public Types Info
+.LpubTypes_start0:
+	.short	2                               # DWARF Version
+	.long	.Lcu_begin0                     # Offset of Compilation Unit Info
+	.long	40                              # Compilation Unit Length
+	.long	42                              # DIE offset
+	.byte	144                             # Attributes: TYPE, STATIC
+	.asciz	"int"                           # External Name
+	.long	0                               # End Mark
+.LpubTypes_end0:
+	.ident	"clang version 22.0.0"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
+	.section	.debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-main.s b/bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-main.s
new file mode 100644
index 0000000000000..d20eab14edf00
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-debug-str-split-dwarf-main.s
@@ -0,0 +1,225 @@
+# clang++ -g2 -gdwarf-5 -gsplit-dwarf=split -S main.cpp
+# extern int getReturn();
+# int main() {
+#   return getReturn();
+# }
+	.file	"main.cpp"
+	.text
+	.globl	main                            # -- Begin function main
+	.p2align	4
+	.type	main,@function
+main:                                   # @main
+.Lfunc_begin0:
+	.file	0 "." "main.cpp" md5 0x9cdef858e26cf684ed9ef3b60e05bdad
+	.loc	0 2 0                           # main.cpp:2:0
+	.cfi_startproc
+# %bb.0:                                # %entry
+	pushq	%rbp
+	.cfi_def_cfa_offset 16
+	.cfi_offset %rbp, -16
+	movq	%rsp, %rbp
+	.cfi_def_cfa_register %rbp
+	subq	$16, %rsp
+	movl	$0, -4(%rbp)
+.Ltmp0:
+	.loc	0 3 10 prologue_end             # main.cpp:3:10
+	callq	_Z9getReturnv@PLT
+	.loc	0 3 3 epilogue_begin is_stmt 0  # main.cpp:3:3
+	addq	$16, %rsp
+	popq	%rbp
+	.cfi_def_cfa %rsp, 8
+	retq
+.Ltmp1:
+.Lfunc_end0:
+	.size	main, .Lfunc_end0-main
+	.cfi_endproc
+                                        # -- End function
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	74                              # DW_TAG_skeleton_unit
+	.byte	0                               # DW_CHILDREN_no
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	27                              # DW_AT_comp_dir
+	.byte	37                              # DW_FORM_strx1
+	.ascii	"\264B"                         # DW_AT_GNU_pubnames
+	.byte	25                              # DW_FORM_flag_present
+	.byte	118                             # DW_AT_dwo_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	17                              # DW_AT_low_pc
+	.byte	27                              # DW_FORM_addrx
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	115                             # DW_AT_addr_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.short	5                               # DWARF version number
+	.byte	4                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.quad	-9094791692727444213
+	.byte	1                               # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.byte	0                               # DW_AT_comp_dir
+                                        # DW_AT_GNU_pubnames
+	.byte	1                               # DW_AT_dwo_name
+	.byte	0                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.long	.Laddr_table_base0              # DW_AT_addr_base
+.Ldebug_info_end0:
+	.section	.debug_str_offsets,"",@progbits
+	.long	12                              # Length of String Offsets Set
+	.short	5
+	.short	0
+.Lstr_offsets_base0:
+	.section	.debug_str,"MS",@progbits,1
+.Lskel_string0:
+	.asciz	"." # string offset=0
+.Lskel_string1:
+	.asciz	"main.dwo"                      # string offset=2
+	.section	.debug_str_offsets,"",@progbits
+	.long	.Lskel_string0
+	.long	.Lskel_string1
+	.section	.debug_str_offsets.dwo,"e",@progbits
+	.long	24 ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/159540


More information about the llvm-commits mailing list