[llvm] bc6f84a - [StrTable] Switch diag group names to `llvm::StringTable` (#123302)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 22 00:41:33 PST 2025


Author: Chandler Carruth
Date: 2025-01-22T00:41:27-08:00
New Revision: bc6f84a2db6e7d60d70cf9be8d6cce2a101d0faa

URL: https://github.com/llvm/llvm-project/commit/bc6f84a2db6e7d60d70cf9be8d6cce2a101d0faa
DIFF: https://github.com/llvm/llvm-project/commit/bc6f84a2db6e7d60d70cf9be8d6cce2a101d0faa.diff

LOG: [StrTable] Switch diag group names to `llvm::StringTable` (#123302)

Previously, they used a hand-rolled Pascal-string encoding different
from all the other string tables produced from TableGen. This moves them
to use the newly introduced runtime abstraction, and enhances that
abstraction to support iterating over the string table as used in this
case.

>From what I can tell the Pascal-string encoding isn't critical here to
avoid expensive `strlen` calls, so I think this is a simpler and more
consistent model. But if folks would prefer a Pascal-string style
encoding, I can instead work to switch the `StringTable` abstraction
towards that. It would require some tricky tradeoffs though to make it
reasonably general: either using 4 bytes instead of 1 byte to encode the
size, or having a fallback to `strlen` for long strings.

Added: 
    

Modified: 
    clang/lib/Basic/DiagnosticIDs.cpp
    clang/tools/diagtool/DiagnosticNames.cpp
    clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
    llvm/include/llvm/ADT/StringTable.h

Removed: 
    


################################################################################
diff  --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp
index 81194bbf2538e6..de1de6f61f3a1b 100644
--- a/clang/lib/Basic/DiagnosticIDs.cpp
+++ b/clang/lib/Basic/DiagnosticIDs.cpp
@@ -16,6 +16,7 @@
 #include "clang/Basic/SourceManager.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringTable.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Path.h"
 #include <map>
@@ -576,11 +577,7 @@ namespace {
     uint16_t SubGroups;
     StringRef Documentation;
 
-    // String is stored with a pascal-style length byte.
-    StringRef getName() const {
-      return StringRef(DiagGroupNames + NameOffset + 1,
-                       DiagGroupNames[NameOffset]);
-    }
+    StringRef getName() const { return DiagGroupNames[NameOffset]; }
   };
 }
 
@@ -627,11 +624,12 @@ StringRef DiagnosticIDs::getWarningOptionForDiag(unsigned DiagID) {
 
 std::vector<std::string> DiagnosticIDs::getDiagnosticFlags() {
   std::vector<std::string> Res{"-W", "-Wno-"};
-  for (size_t I = 1; DiagGroupNames[I] != '\0';) {
-    std::string Diag(DiagGroupNames + I + 1, DiagGroupNames[I]);
-    I += DiagGroupNames[I] + 1;
-    Res.push_back("-W" + Diag);
-    Res.push_back("-Wno-" + Diag);
+  for (StringRef Name : DiagGroupNames) {
+    if (Name.empty())
+      continue;
+
+    Res.push_back((Twine("-W") + Name).str());
+    Res.push_back((Twine("-Wno-") + Name).str());
   }
 
   return Res;

diff  --git a/clang/tools/diagtool/DiagnosticNames.cpp b/clang/tools/diagtool/DiagnosticNames.cpp
index c3a3002889c736..4ac9825848ef39 100644
--- a/clang/tools/diagtool/DiagnosticNames.cpp
+++ b/clang/tools/diagtool/DiagnosticNames.cpp
@@ -9,6 +9,7 @@
 #include "DiagnosticNames.h"
 #include "clang/Basic/AllDiagnostics.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringTable.h"
 
 using namespace clang;
 using namespace diagtool;
@@ -68,7 +69,7 @@ static const GroupRecord OptionTable[] = {
 };
 
 llvm::StringRef GroupRecord::getName() const {
-  return StringRef(DiagGroupNames + NameOffset + 1, DiagGroupNames[NameOffset]);
+  return DiagGroupNames[NameOffset];
 }
 
 GroupRecord::subgroup_iterator GroupRecord::subgroup_begin() const {

diff  --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
index fb00c640d6b144..5f03efdb804344 100644
--- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
+++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
@@ -1782,19 +1782,12 @@ static void emitDiagArrays(DiagsInGroupTy &DiagsInGroup,
 
 /// Emit a list of group names.
 ///
-/// This creates a long string which by itself contains a list of pascal style
-/// strings, which consist of a length byte directly followed by the string.
-///
-/// \code
-///   static const char DiagGroupNames[] = {
-///     \000\020#pragma-messages\t#warnings\020CFString-literal"
-///   };
-/// \endcode
+/// This creates an `llvm::StringTable` of all the diagnostic group names.
 static void emitDiagGroupNames(const StringToOffsetTable &GroupNames,
                                raw_ostream &OS) {
-  OS << "static const char DiagGroupNames[] = {\n";
-  GroupNames.EmitString(OS);
-  OS << "};\n\n";
+  GroupNames.EmitStringLiteralDef(
+      OS, "static constexpr llvm::StringTable DiagGroupNames");
+  OS << "\n";
 }
 
 /// Emit diagnostic arrays and related data structures.
@@ -1806,7 +1799,7 @@ static void emitDiagGroupNames(const StringToOffsetTable &GroupNames,
 ///  #ifdef GET_DIAG_ARRAYS
 ///     static const int16_t DiagArrays[];
 ///     static const int16_t DiagSubGroups[];
-///     static const char DiagGroupNames[];
+///     static constexpr llvm::StringTable DiagGroupNames;
 ///  #endif
 ///  \endcode
 static void emitAllDiagArrays(DiagsInGroupTy &DiagsInGroup,
@@ -1858,9 +1851,7 @@ static void emitDiagTable(DiagsInGroupTy &DiagsInGroup,
                                "0123456789!@#$%^*-+=:?") != std::string::npos)
       PrintFatalError("Invalid character in diagnostic group '" + Name + "'");
     OS << Name << " */, ";
-    // Store a pascal-style length byte at the beginning of the string.
-    std::string PascalName = char(Name.size()) + Name.str();
-    OS << *GroupNames.GetStringOffset(PascalName) << ", ";
+    OS << *GroupNames.GetStringOffset(Name) << ", ";
 
     // Special handling for 'pedantic'.
     const bool IsPedantic = Name == "pedantic";
@@ -1948,10 +1939,11 @@ void clang::EmitClangDiagGroups(const RecordKeeper &Records, raw_ostream &OS) {
   inferPedantic.compute(&DiagsInPedantic, &GroupsInPedantic);
 
   StringToOffsetTable GroupNames;
+  // Add an empty string to the table first so we can use `llvm::StringTable`.
+  // TODO: Factor this into `StringToOffsetTable`.
+  GroupNames.GetOrAddStringOffset("");
   for (const auto &[Name, Group] : DiagsInGroup) {
-    // Store a pascal-style length byte at the beginning of the string.
-    std::string PascalName = char(Name.size()) + Name.str();
-    GroupNames.GetOrAddStringOffset(PascalName, false);
+    GroupNames.GetOrAddStringOffset(Name);
   }
 
   emitAllDiagArrays(DiagsInGroup, DiagsInPedantic, GroupsInPedantic, GroupNames,

diff  --git a/llvm/include/llvm/ADT/StringTable.h b/llvm/include/llvm/ADT/StringTable.h
index 4049f892fa66e0..ce5efa1e06ea61 100644
--- a/llvm/include/llvm/ADT/StringTable.h
+++ b/llvm/include/llvm/ADT/StringTable.h
@@ -10,6 +10,8 @@
 #define LLVM_ADT_STRING_TABLE_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator.h"
+#include <iterator>
 #include <limits>
 
 namespace llvm {
@@ -51,6 +53,14 @@ class StringTable {
     constexpr Offset() = default;
     constexpr Offset(unsigned Value) : Value(Value) {}
 
+    friend constexpr bool operator==(const Offset &LHS, const Offset &RHS) {
+      return LHS.Value == RHS.Value;
+    }
+
+    friend constexpr bool operator!=(const Offset &LHS, const Offset &RHS) {
+      return LHS.Value != RHS.Value;
+    }
+
     constexpr unsigned value() const { return Value; }
   };
 
@@ -69,9 +79,13 @@ class StringTable {
     assert(!Table.empty() && "Requires at least a valid empty string.");
     assert(Table.data()[0] == '\0' && "Offset zero must be the empty string.");
     // Ensure that `strlen` from any offset cannot overflow the end of the table
-    // by insisting on a null byte at the end.
+    // by insisting on a null byte at the end. We also insist on the last string
+    // within the table being *separately* null terminated. This structure is
+    // used to enable predictable iteration over all the strings when needed.
     assert(Table.data()[Table.size() - 1] == '\0' &&
            "Last byte must be a null byte.");
+    assert(Table.data()[Table.size() - 2] == '\0' &&
+           "Next-to-last byte must be a null byte.");
   }
 
   // Get a string from the table starting with the provided offset. The returned
@@ -84,6 +98,43 @@ class StringTable {
 
   /// Returns the byte size of the table.
   constexpr size_t size() const { return Table.size(); }
+
+  class Iterator
+      : public iterator_facade_base<Iterator, std::forward_iterator_tag,
+                                    const StringRef> {
+    friend StringTable;
+
+    const StringTable *Table;
+    Offset O;
+
+    // A cache of one value to allow `*` to return a reference.
+    mutable StringRef S;
+
+    explicit constexpr Iterator(const StringTable &Table, Offset O)
+        : Table(&Table), O(O) {}
+
+  public:
+    constexpr Iterator(const Iterator &RHS) = default;
+    constexpr Iterator(Iterator &&RHS) = default;
+
+    bool operator==(const Iterator &RHS) const {
+      assert(Table == RHS.Table && "Compared iterators for unrelated tables!");
+      return O == RHS.O;
+    }
+
+    const StringRef &operator*() const {
+      S = (*Table)[O];
+      return S;
+    }
+
+    Iterator &operator++() {
+      O = O.value() + (*Table)[O].size() + 1;
+      return *this;
+    }
+  };
+
+  constexpr Iterator begin() const { return Iterator(*this, 0); }
+  constexpr Iterator end() const { return Iterator(*this, size() - 1); }
 };
 
 } // namespace llvm


        


More information about the llvm-commits mailing list