[clang] [llvm] [clang] Deduplicate builtin diagnostic descriptions (PR #202624)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Jun 11 12:46:22 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-tablegen
Author: David Zbarsky (dzbarsky)
<details>
<summary>Changes</summary>
Generate each distinct builtin diagnostic description once in the existing `DiagnosticStableIDs.inc` output, emit pointer-free character storage, and store its 20-bit offset and 12-bit length in `StaticDiagInfoRec`. Diagtool also stores each diagnostic name once, uses naturally aligned six-byte `DiagnosticRecord` entries, and replaces the duplicate ID-sorted record table with a direct `uint16_t` index, without changing the generated `DIAG` macro signature.
On arm64 macOS Release builds, stripped standalone clang decreases by 49,536 bytes and the LLVM 22 stripped all-tools multicall binary decreases by 49,544 bytes and linked fixups decrease from 14,604 to 4.
Work towards #<!-- -->202616
AI tool disclosure: Co-authored with OpenAI Codex.
---
Full diff: https://github.com/llvm/llvm-project/pull/202624.diff
6 Files Affected:
- (modified) clang/lib/Basic/DiagnosticIDs.cpp (+16-42)
- (modified) clang/tools/diagtool/DiagnosticNames.cpp (+44-30)
- (modified) clang/tools/diagtool/DiagnosticNames.h (+70-77)
- (modified) clang/utils/TableGen/ClangDiagnosticsEmitter.cpp (+25-2)
- (modified) llvm/include/llvm/TableGen/StringToOffsetTable.h (+6)
- (modified) llvm/lib/TableGen/StringToOffsetTable.cpp (+10-5)
``````````diff
diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp
index d9c5e4082c1a7..2ebe3b5f5cc4e 100644
--- a/clang/lib/Basic/DiagnosticIDs.cpp
+++ b/clang/lib/Basic/DiagnosticIDs.cpp
@@ -36,40 +36,8 @@ struct StaticDiagInfoRec;
#include "clang/Basic/DiagnosticStableIDs.inc"
#undef GET_DIAG_STABLE_ID_ARRAYS
-// Store the descriptions in a separate table to avoid pointers that need to
-// be relocated, and also decrease the amount of data needed on 64-bit
-// platforms. See "How To Write Shared Libraries" by Ulrich Drepper.
-struct StaticDiagInfoDescriptionStringTable {
-#define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \
- SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY, STABLE_ID, \
- LEGACY_STABLE_IDS) \
- char ENUM##_desc[sizeof(DESC)];
-#include "clang/Basic/AllDiagnosticKinds.inc"
-#undef DIAG
-};
-
-const StaticDiagInfoDescriptionStringTable StaticDiagInfoDescriptions = {
-#define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \
- SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY, STABLE_ID, \
- LEGACY_STABLE_IDS) \
- DESC,
-#include "clang/Basic/AllDiagnosticKinds.inc"
-#undef DIAG
-};
-
extern const StaticDiagInfoRec StaticDiagInfo[];
-// Stored separately from StaticDiagInfoRec to pack better. Otherwise,
-// StaticDiagInfoRec would have extra padding on 64-bit platforms.
-const uint32_t StaticDiagInfoDescriptionOffsets[] = {
-#define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \
- SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY, STABLE_ID, \
- LEGACY_STABLE_IDS) \
- offsetof(StaticDiagInfoDescriptionStringTable, ENUM##_desc),
-#include "clang/Basic/AllDiagnosticKinds.inc"
-#undef DIAG
-};
-
const uint32_t StaticDiagInfoStableIDOffsets[] = {
#define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \
SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY, STABLE_ID, \
@@ -111,25 +79,26 @@ struct StaticDiagInfoRec {
uint16_t WarnNoWerror : 1;
LLVM_PREFERRED_TYPE(bool)
uint16_t WarnShowInSystemHeader : 1;
+ LLVM_PREFERRED_TYPE(diag::Group)
+ uint16_t OptionGroupIndex : 14;
LLVM_PREFERRED_TYPE(bool)
uint16_t WarnShowInSystemMacro : 1;
-
- LLVM_PREFERRED_TYPE(diag::Group)
- uint16_t OptionGroupIndex : 15;
LLVM_PREFERRED_TYPE(bool)
uint16_t Deferrable : 1;
- uint16_t DescriptionLen;
+ uint16_t DescriptionOffsetLow;
+ uint16_t DescriptionOffsetHigh : 4;
+ uint16_t DescriptionLen : 12;
unsigned getOptionGroupIndex() const {
return OptionGroupIndex;
}
StringRef getDescription() const {
- size_t MyIndex = this - &StaticDiagInfo[0];
- uint32_t StringOffset = StaticDiagInfoDescriptionOffsets[MyIndex];
- const char* Table = reinterpret_cast<const char*>(&StaticDiagInfoDescriptions);
- return StringRef(&Table[StringOffset], DescriptionLen);
+ uint32_t Offset =
+ DescriptionOffsetLow | (uint32_t(DescriptionOffsetHigh) << 16);
+ return StringRef(StaticDiagInfoDescriptionsStorage + Offset,
+ DescriptionLen);
}
StringRef getStableID() const {
@@ -159,6 +128,9 @@ struct StaticDiagInfoRec {
return DiagID < RHS.DiagID;
}
};
+static_assert(sizeof(StaticDiagInfoRec) == 10);
+static_assert(static_cast<unsigned>(diag::Group::NUM_GROUPS) < (1U << 14),
+ "too many diagnostic groups for StaticDiagInfoRec");
#define STRINGIFY_NAME(NAME) #NAME
#define VALIDATE_DIAG_SIZE(NAME) \
@@ -200,9 +172,11 @@ const StaticDiagInfoRec StaticDiagInfo[] = {
CATEGORY, \
NOWERROR, \
SHOWINSYSHEADER, \
- SHOWINSYSMACRO, \
GROUP, \
- DEFERRABLE, \
+ SHOWINSYSMACRO, \
+ DEFERRABLE, \
+ uint16_t(DIAG_DESC_OFFSET_##ENUM), \
+ uint16_t(DIAG_DESC_OFFSET_##ENUM >> 16), \
STR_SIZE(DESC, uint16_t)},
#include "clang/Basic/DiagnosticCommonKinds.inc"
#include "clang/Basic/DiagnosticDriverKinds.inc"
diff --git a/clang/tools/diagtool/DiagnosticNames.cpp b/clang/tools/diagtool/DiagnosticNames.cpp
index 1538167022aca..0c5f6bc771cff 100644
--- a/clang/tools/diagtool/DiagnosticNames.cpp
+++ b/clang/tools/diagtool/DiagnosticNames.cpp
@@ -10,53 +10,67 @@
#include "clang/Basic/AllDiagnostics.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringTable.h"
+#include <array>
+#include <cstddef>
+#include <cstdint>
using namespace clang;
using namespace diagtool;
-static const DiagnosticRecord BuiltinDiagnosticsByName[] = {
-#define DIAG_NAME_INDEX(ENUM) { #ENUM, diag::ENUM, STR_SIZE(#ENUM, uint8_t) },
+struct BuiltinDiagnosticNameStorage {
+#define DIAG_NAME_INDEX(ENUM) char ENUM[sizeof(#ENUM)];
#include "clang/Basic/DiagnosticIndexName.inc"
#undef DIAG_NAME_INDEX
};
+static_assert(sizeof(BuiltinDiagnosticNameStorage) <= uint64_t(1) << 24);
+
+static constexpr BuiltinDiagnosticNameStorage BuiltinDiagnosticNames = {
+#define DIAG_NAME_INDEX(ENUM) #ENUM,
+#include "clang/Basic/DiagnosticIndexName.inc"
+#undef DIAG_NAME_INDEX
+};
+
+#define DIAGNOSTIC_RECORD(ENUM) \
+ {diag::ENUM, uint16_t(offsetof(BuiltinDiagnosticNameStorage, ENUM)), \
+ uint8_t(offsetof(BuiltinDiagnosticNameStorage, ENUM) >> 16), \
+ STR_SIZE(#ENUM, uint8_t)}
+
+static constexpr DiagnosticRecord BuiltinDiagnosticsByName[] = {
+#define DIAG_NAME_INDEX(ENUM) DIAGNOSTIC_RECORD(ENUM),
+#include "clang/Basic/DiagnosticIndexName.inc"
+#undef DIAG_NAME_INDEX
+};
+static_assert(std::size(BuiltinDiagnosticsByName) < (1U << 16));
+#undef DIAGNOSTIC_RECORD
+
llvm::ArrayRef<DiagnosticRecord> diagtool::getBuiltinDiagnosticsByName() {
return llvm::ArrayRef(BuiltinDiagnosticsByName);
}
-// FIXME: Is it worth having two tables, especially when this one can get
-// out of sync easily?
-static const DiagnosticRecord BuiltinDiagnosticsByID[] = {
-#define DIAG(ENUM, CLASS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \
- SHOWINSYSHEADER, SHOWINSYSMACRO, DEFER, CATEGORY, STABLE_ID, \
- LEGACY_STABLE_IDS) \
- {#ENUM, diag::ENUM, STR_SIZE(#ENUM, uint8_t)},
-#include "clang/Basic/AllDiagnosticKinds.inc"
-#undef DIAG
-};
-
-static bool orderByID(const DiagnosticRecord &Left,
- const DiagnosticRecord &Right) {
- return Left.DiagID < Right.DiagID;
+static constexpr auto BuiltinDiagnosticIndexByID = [] {
+ std::array<uint16_t, diag::DIAG_UPPER_LIMIT> Result = {};
+ uint16_t Index = 0;
+#define DIAG_NAME_INDEX(ENUM) Result[diag::ENUM] = ++Index;
+#include "clang/Basic/DiagnosticIndexName.inc"
+#undef DIAG_NAME_INDEX
+ return Result;
+}();
+
+llvm::StringRef DiagnosticRecord::getName() const {
+ const char *Names = reinterpret_cast<const char *>(&BuiltinDiagnosticNames);
+ uint32_t NameOffset =
+ uint32_t(NameOffsetLow) | (uint32_t(NameOffsetHigh) << 16);
+ return llvm::StringRef(Names + NameOffset, NameLen);
}
const DiagnosticRecord &diagtool::getDiagnosticForID(short DiagID) {
- DiagnosticRecord Key = {nullptr, DiagID, 0};
-
- // The requirement for lower_bound to produce a valid result it is
- // enough if the BuiltinDiagnosticsByID is partitioned (by DiagID),
- // but as we want this function to work for all possible values of
- // DiagID sent in as argument it is better to right away check if
- // BuiltinDiagnosticsByID is sorted.
- assert(llvm::is_sorted(BuiltinDiagnosticsByID, orderByID) &&
- "IDs in BuiltinDiagnosticsByID must be sorted.");
- const DiagnosticRecord *Result =
- llvm::lower_bound(BuiltinDiagnosticsByID, Key, orderByID);
- assert(Result && "diagnostic not found; table may be out of date");
- return *Result;
+ assert(DiagID >= 0 &&
+ static_cast<unsigned>(DiagID) < BuiltinDiagnosticIndexByID.size() &&
+ BuiltinDiagnosticIndexByID[DiagID] != 0 && "diagnostic not found");
+ return BuiltinDiagnosticsByName[BuiltinDiagnosticIndexByID[DiagID] - 1];
}
-
#define GET_DIAG_ARRAYS
#include "clang/Basic/DiagnosticGroups.inc"
#undef GET_DIAG_ARRAYS
diff --git a/clang/tools/diagtool/DiagnosticNames.h b/clang/tools/diagtool/DiagnosticNames.h
index f541e88577cc5..52622875175af 100644
--- a/clang/tools/diagtool/DiagnosticNames.h
+++ b/clang/tools/diagtool/DiagnosticNames.h
@@ -15,91 +15,84 @@
namespace diagtool {
- struct DiagnosticRecord {
- const char *NameStr;
- short DiagID;
- uint8_t NameLen;
+struct DiagnosticRecord {
+ short DiagID;
+ uint16_t NameOffsetLow;
+ uint8_t NameOffsetHigh;
+ uint8_t NameLen;
- llvm::StringRef getName() const {
- return llvm::StringRef(NameStr, NameLen);
+ llvm::StringRef getName() const;
+
+ bool operator<(const DiagnosticRecord &Other) const {
+ return getName() < Other.getName();
+ }
+};
+static_assert(sizeof(DiagnosticRecord) == 6,
+ "DiagnosticRecord must remain compact");
+
+/// Get every diagnostic in the system, sorted by name.
+llvm::ArrayRef<DiagnosticRecord> getBuiltinDiagnosticsByName();
+
+/// Get a diagnostic by its ID.
+const DiagnosticRecord &getDiagnosticForID(short DiagID);
+
+struct GroupRecord {
+ uint16_t NameOffset;
+ uint16_t Members;
+ uint16_t SubGroups;
+
+ llvm::StringRef getName() const;
+
+ template <typename RecordType> class group_iterator {
+ const short *CurrentID;
+
+ friend struct GroupRecord;
+ group_iterator(const short *Start) : CurrentID(Start) {
+ if (CurrentID && *CurrentID == -1)
+ CurrentID = nullptr;
}
- bool operator<(const DiagnosticRecord &Other) const {
- return getName() < Other.getName();
+ public:
+ typedef RecordType value_type;
+ typedef const value_type &reference;
+ typedef const value_type *pointer;
+ typedef std::forward_iterator_tag iterator_category;
+ typedef std::ptrdiff_t difference_type;
+
+ inline reference operator*() const;
+ inline pointer operator->() const { return &operator*(); }
+
+ inline short getID() const { return *CurrentID; }
+
+ group_iterator &operator++() {
+ ++CurrentID;
+ if (*CurrentID == -1)
+ CurrentID = nullptr;
+ return *this;
+ }
+
+ bool operator==(const group_iterator &Other) const {
+ return CurrentID == Other.CurrentID;
}
- };
- /// Get every diagnostic in the system, sorted by name.
- llvm::ArrayRef<DiagnosticRecord> getBuiltinDiagnosticsByName();
-
- /// Get a diagnostic by its ID.
- const DiagnosticRecord &getDiagnosticForID(short DiagID);
-
-
- struct GroupRecord {
- uint16_t NameOffset;
- uint16_t Members;
- uint16_t SubGroups;
-
- llvm::StringRef getName() const;
-
- template<typename RecordType>
- class group_iterator {
- const short *CurrentID;
-
- friend struct GroupRecord;
- group_iterator(const short *Start) : CurrentID(Start) {
- if (CurrentID && *CurrentID == -1)
- CurrentID = nullptr;
- }
-
- public:
- typedef RecordType value_type;
- typedef const value_type & reference;
- typedef const value_type * pointer;
- typedef std::forward_iterator_tag iterator_category;
- typedef std::ptrdiff_t difference_type;
-
- inline reference operator*() const;
- inline pointer operator->() const {
- return &operator*();
- }
-
- inline short getID() const {
- return *CurrentID;
- }
-
- group_iterator &operator++() {
- ++CurrentID;
- if (*CurrentID == -1)
- CurrentID = nullptr;
- return *this;
- }
-
- bool operator==(const group_iterator &Other) const {
- return CurrentID == Other.CurrentID;
- }
-
- bool operator!=(const group_iterator &Other) const {
- return CurrentID != Other.CurrentID;
- }
- };
-
- typedef group_iterator<GroupRecord> subgroup_iterator;
- subgroup_iterator subgroup_begin() const;
- subgroup_iterator subgroup_end() const;
- llvm::iterator_range<subgroup_iterator> subgroups() const;
-
- typedef group_iterator<DiagnosticRecord> diagnostics_iterator;
- diagnostics_iterator diagnostics_begin() const;
- diagnostics_iterator diagnostics_end() const;
- llvm::iterator_range<diagnostics_iterator> diagnostics() const;
-
- bool operator<(llvm::StringRef Other) const {
- return getName() < Other;
+ bool operator!=(const group_iterator &Other) const {
+ return CurrentID != Other.CurrentID;
}
};
+ typedef group_iterator<GroupRecord> subgroup_iterator;
+ subgroup_iterator subgroup_begin() const;
+ subgroup_iterator subgroup_end() const;
+ llvm::iterator_range<subgroup_iterator> subgroups() const;
+
+ typedef group_iterator<DiagnosticRecord> diagnostics_iterator;
+ diagnostics_iterator diagnostics_begin() const;
+ diagnostics_iterator diagnostics_end() const;
+ llvm::iterator_range<diagnostics_iterator> diagnostics() const;
+
+ bool operator<(llvm::StringRef Other) const { return getName() < Other; }
+};
+
/// Get every diagnostic group in the system, sorted by name.
llvm::ArrayRef<GroupRecord> getDiagnosticGroups();
diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
index ba10be060c20a..193cfa69e8d88 100644
--- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
+++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
@@ -1736,10 +1736,11 @@ class DiagStableIDsMap {
/// static constexpr llvm::StringTable DiagStableIds;
/// #endif
/// \endcode
- void emit(raw_ostream &OS) const {
+ void emit(raw_ostream &OS, const RecordKeeper &Records) const {
OS << "\n#ifdef GET_DIAG_STABLE_ID_ARRAYS\n";
emitStableIDs(OS);
emitLegacyStableIDs(OS);
+ emitDescriptions(OS, Records);
OS << "#endif // GET_DIAG_STABLE_ID_ARRAYS\n\n";
}
@@ -1795,6 +1796,28 @@ class DiagStableIDsMap {
}
OS << "};\n\n";
}
+
+ static void emitDescriptions(raw_ostream &OS, const RecordKeeper &Records) {
+ DiagnosticTextBuilder DiagTextBuilder(Records);
+ StringToOffsetTable Descriptions;
+
+ OS << "enum {\n";
+ for (const Record &R :
+ make_pointee_range(Records.getAllDerivedDefinitions("Diagnostic"))) {
+ std::string Description = DiagTextBuilder.buildForDefinition(&R);
+ if (Description.size() >= (1U << 12))
+ PrintFatalError(R.getLoc(),
+ "diagnostic description exceeds 12-bit length limit");
+
+ unsigned Offset = Descriptions.GetOrAddStringOffset(Description);
+ OS << " DIAG_DESC_OFFSET_" << R.getName() << " = " << Offset << ",\n";
+ }
+ OS << "};\n\n";
+
+ if (Descriptions.size() >= (1U << 20))
+ PrintFatalError("diagnostic descriptions exceed 20-bit offset limit");
+ Descriptions.EmitStringTableStorageDef(OS, "StaticDiagInfoDescriptions");
+ }
};
} // namespace
@@ -1802,7 +1825,7 @@ class DiagStableIDsMap {
void clang::EmitClangDiagsStableIDs(const RecordKeeper &Records,
raw_ostream &OS) {
DiagStableIDsMap StableIDs(Records);
- StableIDs.emit(OS);
+ StableIDs.emit(OS, Records);
}
/// ClangDiagsDefsEmitter - The top-level class emits .def files containing
diff --git a/llvm/include/llvm/TableGen/StringToOffsetTable.h b/llvm/include/llvm/TableGen/StringToOffsetTable.h
index 5ee5fb538f3eb..fcf517ad56918 100644
--- a/llvm/include/llvm/TableGen/StringToOffsetTable.h
+++ b/llvm/include/llvm/TableGen/StringToOffsetTable.h
@@ -67,6 +67,12 @@ class StringToOffsetTable {
// valid identifiers to declare.
void EmitStringTableDef(raw_ostream &OS, const Twine &Name) const;
+ // Emit only the character storage used by a string table definition.
+ //
+ // This preserves the large-string handling of EmitStringTableDef without
+ // emitting an llvm::StringTable object.
+ void EmitStringTableStorageDef(raw_ostream &OS, const Twine &Name) const;
+
// Emit the string as one single string.
void EmitString(raw_ostream &O) const;
};
diff --git a/llvm/lib/TableGen/StringToOffsetTable.cpp b/llvm/lib/TableGen/StringToOffsetTable.cpp
index 06d8240486245..826752c23ff64 100644
--- a/llvm/lib/TableGen/StringToOffsetTable.cpp
+++ b/llvm/lib/TableGen/StringToOffsetTable.cpp
@@ -26,10 +26,9 @@ unsigned StringToOffsetTable::GetOrAddStringOffset(StringRef Str) {
return II->second;
}
-void StringToOffsetTable::EmitStringTableDef(raw_ostream &OS,
- const Twine &Name) const {
- // This generates a `llvm::StringTable` which expects that entries are null
- // terminated. So fail with an error if `AppendZero` is false.
+void StringToOffsetTable::EmitStringTableStorageDef(raw_ostream &OS,
+ const Twine &Name) const {
+ // String table entries must be null terminated.
if (!AppendZero)
PrintFatalError("llvm::StringTable requires null terminated strings");
@@ -79,11 +78,17 @@ void StringToOffsetTable::EmitStringTableDef(raw_ostream &OS,
}
OS << LineSep << (UseChars ? "};" : " ;");
- OS << formatv(R"(
+ OS << R"(
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
+)";
+}
+void StringToOffsetTable::EmitStringTableDef(raw_ostream &OS,
+ const Twine &Name) const {
+ EmitStringTableStorageDef(OS, Name);
+ OS << formatv(R"(
{1} llvm::StringTable
{2}{0} = {0}Storage;
)",
``````````
</details>
https://github.com/llvm/llvm-project/pull/202624
More information about the cfe-commits
mailing list