[Lldb-commits] [lldb] [llvm] Modify llvm-dwp to be able to emit string tables over 4GB without losing data (PR #167457)
Greg Clayton via lldb-commits
lldb-commits at lists.llvm.org
Mon Nov 10 21:29:21 PST 2025
https://github.com/clayborg created https://github.com/llvm/llvm-project/pull/167457
We can change llvm-dwp to emit a DWARF64 version of the .debug_str_offsets tables for .dwo files in a .dwp file. This allows the string table to exceed 4GB without truncating string offsets into the .debug_str section and losing data. llvm-dwp will append all strings to the .debug_str section for a .dwo file, and if any of the new string offsets exceed UINT32_MAX, it will upgrade the .debug_str_offsets table to a DWARF64 header so that each string offset in that table can be stored as a 64-bit offset.
Fixed LLDB to be able to successfully load the 64-bit string tables in .dwp files.
Fixed llvm-dwarfdump and LLVM DWARF parsing code to do the right thing with DWARF64 string table headers.
>From a2431068c087edc09893009448d41b5a83ca03c9 Mon Sep 17 00:00:00 2001
From: Greg Clayton <clayborg at gmail.com>
Date: Mon, 10 Nov 2025 21:22:39 -0800
Subject: [PATCH] Modify llvm-dwp to be able to emit string tables over 4GB
without losing data.
We can change llvm-dwp to emit a DWARF64 version of the .debug_str_offsets tables for .dwo files in a .dwp file. This allows the string table to exceed 4GB without truncating string offsets into the .debug_str section and losing data. llvm-dwp will append all strings to the .debug_str section for a .dwo file, and if any of the new string offsets exceed UINT32_MAX, it will upgrade the .debug_str_offsets table to a DWARF64 header so that each string offset in that table can be stored as a 64-bit offset.
Fixed LLDB to be able to successfully load the 64-bit string tables in .dwp files.
Fixed llvm-dwarfdump and LLVM DWARF parsing code to do the right thing with DWARF64 string table headers.
---
.../Plugins/SymbolFile/DWARF/DWARFUnit.cpp | 13 +++-
.../Plugins/SymbolFile/DWARF/DWARFUnit.h | 2 +-
llvm/include/llvm/DWP/DWP.h | 4 +-
llvm/include/llvm/DWP/DWPStringPool.h | 6 +-
llvm/lib/DWP/DWP.cpp | 78 +++++++++++++++----
llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp | 13 +++-
6 files changed, 89 insertions(+), 27 deletions(-)
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
index 94fc2e83e899d..7b7864caf8c09 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
@@ -360,8 +360,10 @@ void DWARFUnit::SetDwoStrOffsetsBase() {
const DWARFDataExtractor &strOffsets =
GetSymbolFileDWARF().GetDWARFContext().getOrLoadStrOffsetsData();
uint64_t length = strOffsets.GetU32(&baseOffset);
- if (length == 0xffffffff)
+ if (length == 0xffffffff) {
length = strOffsets.GetU64(&baseOffset);
+ m_str_offsets_size = 8;
+ }
// Check version.
if (strOffsets.GetU16(&baseOffset) < 5)
@@ -369,6 +371,10 @@ void DWARFUnit::SetDwoStrOffsetsBase() {
// Skip padding.
baseOffset += 2;
+ } else {
+ // The size of an offset in .debug_str_offsets is the same as the DWARF
+ // offset byte size of the DWARFUnit for DWARF version 4 and earlier.
+ m_str_offsets_size = m_header.getDwarfOffsetByteSize();
}
SetStrOffsetsBase(baseOffset);
@@ -1079,10 +1085,9 @@ uint32_t DWARFUnit::GetHeaderByteSize() const { return m_header.getSize(); }
std::optional<uint64_t>
DWARFUnit::GetStringOffsetSectionItem(uint32_t index) const {
- lldb::offset_t offset =
- GetStrOffsetsBase() + index * m_header.getDwarfOffsetByteSize();
+ lldb::offset_t offset = GetStrOffsetsBase() + index * m_str_offsets_size;
return m_dwarf.GetDWARFContext().getOrLoadStrOffsetsData().GetMaxU64(
- &offset, m_header.getDwarfOffsetByteSize());
+ &offset, m_str_offsets_size);
}
llvm::Expected<llvm::DWARFAddressRangesVector>
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
index 91a693860c55a..856db5e4101cd 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
@@ -364,7 +364,7 @@ class DWARFUnit : public DWARFExpression::Delegate, public UserID {
dw_offset_t m_line_table_offset = DW_INVALID_OFFSET;
dw_offset_t m_str_offsets_base = 0; // Value of DW_AT_str_offsets_base.
-
+ dw_offset_t m_str_offsets_size = 4; // Size in bytes of the string offsets.
std::optional<llvm::DWARFDebugRnglistTable> m_rnglist_table;
bool m_rnglist_table_done = false;
std::optional<llvm::DWARFListTableHeader> m_loclist_table_header;
diff --git a/llvm/include/llvm/DWP/DWP.h b/llvm/include/llvm/DWP/DWP.h
index a759bae10d160..cc38369658eaa 100644
--- a/llvm/include/llvm/DWP/DWP.h
+++ b/llvm/include/llvm/DWP/DWP.h
@@ -70,6 +70,8 @@ struct CompileUnitIdentifiers {
LLVM_ABI Error write(MCStreamer &Out, ArrayRef<std::string> Inputs,
OnCuIndexOverflow OverflowOptValue);
+typedef std::vector<std::pair<DWARFSectionKind, uint32_t>> SectionLengths;
+
LLVM_ABI Error handleSection(
const StringMap<std::pair<MCSection *, DWARFSectionKind>> &KnownSections,
const MCSection *StrSection, const MCSection *StrOffsetSection,
@@ -82,7 +84,7 @@ LLVM_ABI Error handleSection(
std::vector<StringRef> &CurTypesSection,
std::vector<StringRef> &CurInfoSection, StringRef &AbbrevSection,
StringRef &CurCUIndexSection, StringRef &CurTUIndexSection,
- std::vector<std::pair<DWARFSectionKind, uint32_t>> &SectionLength);
+ SectionLengths &SectionLength);
LLVM_ABI Expected<InfoSectionUnitHeader>
parseInfoSectionUnitHeader(StringRef Info);
diff --git a/llvm/include/llvm/DWP/DWPStringPool.h b/llvm/include/llvm/DWP/DWPStringPool.h
index 1354b46f156b6..d1486ff7872e1 100644
--- a/llvm/include/llvm/DWP/DWPStringPool.h
+++ b/llvm/include/llvm/DWP/DWPStringPool.h
@@ -32,13 +32,13 @@ class DWPStringPool {
MCStreamer &Out;
MCSection *Sec;
- DenseMap<const char *, uint32_t, CStrDenseMapInfo> Pool;
- uint32_t Offset = 0;
+ DenseMap<const char *, uint64_t, CStrDenseMapInfo> Pool;
+ uint64_t Offset = 0;
public:
DWPStringPool(MCStreamer &Out, MCSection *Sec) : Out(Out), Sec(Sec) {}
- uint32_t getOffset(const char *Str, unsigned Length) {
+ uint64_t getOffset(const char *Str, unsigned Length) {
assert(strlen(Str) + 1 == Length && "Ensure length hint is correct");
auto Pair = Pool.insert(std::make_pair(Str, Offset));
diff --git a/llvm/lib/DWP/DWP.cpp b/llvm/lib/DWP/DWP.cpp
index b565edbfe96db..54edce81208b5 100644
--- a/llvm/lib/DWP/DWP.cpp
+++ b/llvm/lib/DWP/DWP.cpp
@@ -413,33 +413,43 @@ Expected<InfoSectionUnitHeader> parseInfoSectionUnitHeader(StringRef Info) {
}
static void writeNewOffsetsTo(MCStreamer &Out, DataExtractor &Data,
- DenseMap<uint64_t, uint32_t> &OffsetRemapping,
- uint64_t &Offset, uint64_t &Size) {
+ DenseMap<uint64_t, uint64_t> &OffsetRemapping,
+ uint64_t &Offset, const uint64_t Size,
+ uint32_t OldOffsetSize, uint32_t NewOffsetSize) {
while (Offset < Size) {
- auto OldOffset = Data.getU32(&Offset);
- auto NewOffset = OffsetRemapping[OldOffset];
- Out.emitIntValue(NewOffset, 4);
+ const uint64_t OldOffset = Data.getUnsigned(&Offset, OldOffsetSize);
+ const uint64_t NewOffset = OffsetRemapping[OldOffset];
+ assert(NewOffsetSize == 8 || NewOffset <= UINT32_MAX);
+ Out.emitIntValue(NewOffset, NewOffsetSize);
}
}
void writeStringsAndOffsets(MCStreamer &Out, DWPStringPool &Strings,
MCSection *StrOffsetSection,
StringRef CurStrSection,
- StringRef CurStrOffsetSection, uint16_t Version) {
+ StringRef CurStrOffsetSection, uint16_t Version,
+ SectionLengths &SectionLength) {
// Could possibly produce an error or warning if one of these was non-null but
// the other was null.
if (CurStrSection.empty() || CurStrOffsetSection.empty())
return;
- DenseMap<uint64_t, uint32_t> OffsetRemapping;
+ DenseMap<uint64_t, uint64_t> OffsetRemapping;
DataExtractor Data(CurStrSection, true, 0);
uint64_t LocalOffset = 0;
uint64_t PrevOffset = 0;
+
+ // Track whether any new string offset exceeds UINT32_MAX. If any do, we can
+ // emit a DWARF64 .debug_str_offsets table for this compile unit.
+ uint32_t OldOffsetSize = 4;
+ uint32_t NewOffsetSize = 4;
while (const char *S = Data.getCStr(&LocalOffset)) {
- OffsetRemapping[PrevOffset] =
- Strings.getOffset(S, LocalOffset - PrevOffset);
+ uint64_t NewOffset = Strings.getOffset(S, LocalOffset - PrevOffset);
+ OffsetRemapping[PrevOffset] = NewOffset;
+ if (NewOffset > UINT32_MAX)
+ NewOffsetSize = 8;
PrevOffset = LocalOffset;
}
@@ -451,7 +461,7 @@ void writeStringsAndOffsets(MCStreamer &Out, DWPStringPool &Strings,
uint64_t Size = CurStrOffsetSection.size();
if (Version > 4) {
while (Offset < Size) {
- uint64_t HeaderSize = debugStrOffsetsHeaderSize(Data, Version);
+ const uint64_t HeaderSize = debugStrOffsetsHeaderSize(Data, Version);
assert(HeaderSize <= Size - Offset &&
"StrOffsetSection size is less than its header");
@@ -461,16 +471,52 @@ void writeStringsAndOffsets(MCStreamer &Out, DWPStringPool &Strings,
if (HeaderSize == 8) {
ContributionSize = Data.getU32(&HeaderLengthOffset);
} else if (HeaderSize == 16) {
+ OldOffsetSize = 8;
HeaderLengthOffset += 4; // skip the dwarf64 marker
ContributionSize = Data.getU64(&HeaderLengthOffset);
}
ContributionEnd = ContributionSize + HeaderLengthOffset;
- Out.emitBytes(Data.getBytes(&Offset, HeaderSize));
- writeNewOffsetsTo(Out, Data, OffsetRemapping, Offset, ContributionEnd);
+
+ StringRef HeaderBytes = Data.getBytes(&Offset, HeaderSize);
+ if (OldOffsetSize == 4 && NewOffsetSize == 8) {
+ // We had a DWARF32 .debug_str_offsets header, but we need to emit
+ // some string offsets that require 64 bit offsets on the .debug_str
+ // section. Emit the .debug_str_offsets header in DWARF64 format so we
+ // can emit string offsets that exceed UINT32_MAX without truncating
+ // the string offset.
+
+ // 2 bytes for DWARF version, 2 bytes pad.
+ const uint64_t VersionPadSize = 4;
+ const uint64_t NewLength =
+ (ContributionSize - VersionPadSize) * 2 + VersionPadSize;
+ // Emit the DWARF64 length that starts with a 4 byte DW_LENGTH_DWARF64
+ // value followed by the 8 byte updated length.
+ Out.emitIntValue(llvm::dwarf::DW_LENGTH_DWARF64, 4);
+ Out.emitIntValue(NewLength, 8);
+ // Emit DWARF version as a 2 byte integer.
+ Out.emitIntValue(Version, 2);
+ // Emit 2 bytes of padding.
+ Out.emitIntValue(0, 2);
+ // Update the .debug_str_offsets section length contribution for this
+ // .dwo file.
+ for (auto &Pair : SectionLength) {
+ if (Pair.first == DW_SECT_STR_OFFSETS) {
+ Pair.second = NewLength + 12;
+ break;
+ }
+ }
+ } else {
+ // Just emit the same .debug_str_offsets header.
+ Out.emitBytes(HeaderBytes);
+ }
+ writeNewOffsetsTo(Out, Data, OffsetRemapping, Offset, ContributionEnd,
+ OldOffsetSize, NewOffsetSize);
}
} else {
- writeNewOffsetsTo(Out, Data, OffsetRemapping, Offset, Size);
+ assert(OldOffsetSize == NewOffsetSize);
+ writeNewOffsetsTo(Out, Data, OffsetRemapping, Offset, Size, OldOffsetSize,
+ NewOffsetSize);
}
}
@@ -562,7 +608,7 @@ Error handleSection(
std::vector<StringRef> &CurTypesSection,
std::vector<StringRef> &CurInfoSection, StringRef &AbbrevSection,
StringRef &CurCUIndexSection, StringRef &CurTUIndexSection,
- std::vector<std::pair<DWARFSectionKind, uint32_t>> &SectionLength) {
+ SectionLengths &SectionLength) {
if (Section.isBSS())
return Error::success();
@@ -684,7 +730,7 @@ Error write(MCStreamer &Out, ArrayRef<std::string> Inputs,
// This maps each section contained in this file to its length.
// This information is later on used to calculate the contributions,
// i.e. offset and length, of each compile/type unit to a section.
- std::vector<std::pair<DWARFSectionKind, uint32_t>> SectionLength;
+ SectionLengths SectionLength;
for (const auto &Section : Obj.sections())
if (auto Err = handleSection(
@@ -713,7 +759,7 @@ Error write(MCStreamer &Out, ArrayRef<std::string> Inputs,
}
writeStringsAndOffsets(Out, Strings, StrOffsetSection, CurStrSection,
- CurStrOffsetSection, Header.Version);
+ CurStrOffsetSection, Header.Version, SectionLength);
for (auto Pair : SectionLength) {
auto Index = getContributionIndex(Pair.first, IndexVersion);
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
index da0bf03e1ac57..b4256ae13914c 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -1187,9 +1187,18 @@ DWARFUnit::determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA) {
if (getVersion() >= 5) {
if (DA.getData().data() == nullptr)
return std::nullopt;
- Offset += Header.getFormat() == dwarf::DwarfFormat::DWARF32 ? 8 : 16;
+ // For .dwo files, the section contribution for the .debug_str_offsets
+ // points to the string offsets table header. Decode the format from this
+ // data as llvm-dwp has been modified to be able to emit a
+ // .debug_str_offsets table as DWARF64 even if the compile unit is DWARF32.
+ // This allows .dwp files to have string tables that exceed UINT32_MAX in
+ // size.
+ uint64_t Length = 0;
+ DwarfFormat Format = dwarf::DwarfFormat::DWARF32;
+ std::tie(Length, Format) = DA.getInitialLength(&Offset);
+ Offset += 4; // Skip the DWARF version uint16_t and the uint16_t padding.
// Look for a valid contribution at the given offset.
- auto DescOrError = parseDWARFStringOffsetsTableHeader(DA, Header.getFormat(), Offset);
+ auto DescOrError = parseDWARFStringOffsetsTableHeader(DA, Format, Offset);
if (!DescOrError)
return DescOrError.takeError();
return *DescOrError;
More information about the lldb-commits
mailing list