[llvm] [StrTable] Use string literal emission for intrinsics on non-MSVC platforms (PR #124856)

Reid Kleckner via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 28 15:18:55 PST 2025


https://github.com/rnk created https://github.com/llvm/llvm-project/pull/124856

This mainly transitions the LLVM intrinsic string table from character emission to string literal emission, which I confirmed happens for me locally.

I moved the guts of StringToOffsetTable to a cpp file so I could move the `EmitLongStrLiterals` cl::opt global to a non-vague linkage home in the `TableGen` library. I had to add missing FormatVariadic.h includes to account for moving other includes to a cpp file.

>From c63fe050475e1eb2982bc590ecb74180a0d99086 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk at google.com>
Date: Tue, 28 Jan 2025 22:29:31 +0000
Subject: [PATCH] [StrTable] Use string literal emission for non-MSVC platforms

This mainly transitions the LLVM intrinsic string table from character
emission to string literal emission.

Add missing FormatVariadic.h includes to account for moving the include
to a cpp file.
---
 .../llvm/TableGen/StringToOffsetTable.h       | 104 +-------------
 llvm/lib/TableGen/CMakeLists.txt              |   1 +
 llvm/lib/TableGen/StringToOffsetTable.cpp     | 129 ++++++++++++++++++
 llvm/utils/TableGen/AsmMatcherEmitter.cpp     |   1 +
 llvm/utils/TableGen/Basic/TableGen.cpp        |   9 --
 llvm/utils/TableGen/SDNodeInfoEmitter.cpp     |   1 +
 .../gn/secondary/llvm/lib/TableGen/BUILD.gn   |   1 +
 7 files changed, 136 insertions(+), 110 deletions(-)
 create mode 100644 llvm/lib/TableGen/StringToOffsetTable.cpp

diff --git a/llvm/include/llvm/TableGen/StringToOffsetTable.h b/llvm/include/llvm/TableGen/StringToOffsetTable.h
index e716411514bd63..21795644d4bd67 100644
--- a/llvm/include/llvm/TableGen/StringToOffsetTable.h
+++ b/llvm/include/llvm/TableGen/StringToOffsetTable.h
@@ -12,8 +12,6 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
 #include <optional>
 
 namespace llvm {
@@ -36,17 +34,7 @@ class StringToOffsetTable {
   bool empty() const { return StringOffset.empty(); }
   size_t size() const { return AggregateString.size(); }
 
-  unsigned GetOrAddStringOffset(StringRef Str, bool appendZero = true) {
-    auto [II, Inserted] = StringOffset.insert({Str, size()});
-    if (Inserted) {
-      // Add the string to the aggregate if this is the first time found.
-      AggregateString.append(Str.begin(), Str.end());
-      if (appendZero)
-        AggregateString += '\0';
-    }
-
-    return II->second;
-  }
+  unsigned GetOrAddStringOffset(StringRef Str, bool appendZero = true);
 
   // Returns the offset of `Str` in the table if its preset, else return
   // std::nullopt.
@@ -69,96 +57,10 @@ class StringToOffsetTable {
   // `static` and `constexpr`. Both `Name` and (`Name` + "Storage") must be
   // valid identifiers to declare.
   void EmitStringTableDef(raw_ostream &OS, const Twine &Name,
-                          const Twine &Indent = "") const {
-    OS << formatv(R"(
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Woverlength-strings"
-#endif
-{0}static constexpr char {1}Storage[] = )",
-                  Indent, Name);
-
-    // MSVC silently miscompiles string literals longer than 64k in some
-    // circumstances. When the string table is longer, emit it as an array of
-    // character literals.
-    bool UseChars = AggregateString.size() > (64 * 1024);
-    OS << (UseChars ? "{\n" : "\n");
-
-    llvm::ListSeparator LineSep(UseChars ? ",\n" : "\n");
-    llvm::SmallVector<StringRef> Strings(split(AggregateString, '\0'));
-    // We should always have an empty string at the start, and because these are
-    // null terminators rather than separators, we'll have one at the end as
-    // well. Skip the end one.
-    assert(Strings.front().empty() && "Expected empty initial string!");
-    assert(Strings.back().empty() &&
-           "Expected empty string at the end due to terminators!");
-    Strings.pop_back();
-    for (StringRef Str : Strings) {
-      OS << LineSep << Indent << "  ";
-      // If we can, just emit this as a string literal to be concatenated.
-      if (!UseChars) {
-        OS << "\"";
-        OS.write_escaped(Str);
-        OS << "\\0\"";
-        continue;
-      }
-
-      llvm::ListSeparator CharSep(", ");
-      for (char C : Str) {
-        OS << CharSep << "'";
-        OS.write_escaped(StringRef(&C, 1));
-        OS << "'";
-      }
-      OS << CharSep << "'\\0'";
-    }
-    OS << LineSep << Indent << (UseChars ? "};" : "  ;");
-
-    OS << formatv(R"(
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-
-{0}static constexpr llvm::StringTable {1} =
-{0}    {1}Storage;
-)",
-                  Indent, Name);
-  }
+                          const Twine &Indent = "") const;
 
   // Emit the string as one single string.
-  void EmitString(raw_ostream &O) const {
-    // Escape the string.
-    SmallString<256> EscapedStr;
-    raw_svector_ostream(EscapedStr).write_escaped(AggregateString);
-
-    O << "    \"";
-    unsigned CharsPrinted = 0;
-    for (unsigned i = 0, e = EscapedStr.size(); i != e; ++i) {
-      if (CharsPrinted > 70) {
-        O << "\"\n    \"";
-        CharsPrinted = 0;
-      }
-      O << EscapedStr[i];
-      ++CharsPrinted;
-
-      // Print escape sequences all together.
-      if (EscapedStr[i] != '\\')
-        continue;
-
-      assert(i + 1 < EscapedStr.size() && "Incomplete escape sequence!");
-      if (isDigit(EscapedStr[i + 1])) {
-        assert(isDigit(EscapedStr[i + 2]) && isDigit(EscapedStr[i + 3]) &&
-               "Expected 3 digit octal escape!");
-        O << EscapedStr[++i];
-        O << EscapedStr[++i];
-        O << EscapedStr[++i];
-        CharsPrinted += 3;
-      } else {
-        O << EscapedStr[++i];
-        ++CharsPrinted;
-      }
-    }
-    O << "\"";
-  }
+  void EmitString(raw_ostream &O) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/TableGen/CMakeLists.txt b/llvm/lib/TableGen/CMakeLists.txt
index 84815c77369979..0f9284c8bb999d 100644
--- a/llvm/lib/TableGen/CMakeLists.txt
+++ b/llvm/lib/TableGen/CMakeLists.txt
@@ -7,6 +7,7 @@ add_llvm_component_library(LLVMTableGen
   Record.cpp
   SetTheory.cpp
   StringMatcher.cpp
+  StringToOffsetTable.cpp
   TableGenBackend.cpp
   TableGenBackendSkeleton.cpp
   TGLexer.cpp
diff --git a/llvm/lib/TableGen/StringToOffsetTable.cpp b/llvm/lib/TableGen/StringToOffsetTable.cpp
new file mode 100644
index 00000000000000..6de889e1a10ee3
--- /dev/null
+++ b/llvm/lib/TableGen/StringToOffsetTable.cpp
@@ -0,0 +1,129 @@
+//===- StringToOffsetTable.h - Emit a big concatenated string ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/TableGen/StringToOffsetTable.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace llvm {
+cl::opt<bool> EmitLongStrLiterals(
+    "long-string-literals",
+    cl::desc("when emitting large string tables, prefer string literals over "
+             "comma-separated char literals. This can be a readability and "
+             "compile-time performance win, but upsets some compilers"),
+    cl::Hidden, cl::init(true));
+} // end namespace llvm
+
+
+unsigned StringToOffsetTable::GetOrAddStringOffset(StringRef Str,
+                                                   bool appendZero) {
+  auto [II, Inserted] = StringOffset.insert({Str, size()});
+  if (Inserted) {
+    // Add the string to the aggregate if this is the first time found.
+    AggregateString.append(Str.begin(), Str.end());
+    if (appendZero)
+      AggregateString += '\0';
+  }
+
+  return II->second;
+}
+
+void StringToOffsetTable::EmitStringTableDef(raw_ostream &OS, const Twine &Name,
+                                             const Twine &Indent) const {
+  OS << formatv(R"(
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+#endif
+{0}static constexpr char {1}Storage[] = )",
+                Indent, Name);
+
+  // MSVC silently miscompiles string literals longer than 64k in some
+  // circumstances. In this case, the build system sets EmitLongStrLiterals to
+  // false. When that option is false and the string table is longer than 64k,
+  // emit it as an array of character literals.
+  bool UseChars = !EmitLongStrLiterals && AggregateString.size() > (64 * 1024);
+  OS << (UseChars ? "{\n" : "\n");
+
+  llvm::ListSeparator LineSep(UseChars ? ",\n" : "\n");
+  llvm::SmallVector<StringRef> Strings(split(AggregateString, '\0'));
+  // We should always have an empty string at the start, and because these are
+  // null terminators rather than separators, we'll have one at the end as
+  // well. Skip the end one.
+  assert(Strings.front().empty() && "Expected empty initial string!");
+  assert(Strings.back().empty() &&
+         "Expected empty string at the end due to terminators!");
+  Strings.pop_back();
+  for (StringRef Str : Strings) {
+    OS << LineSep << Indent << "  ";
+    // If we can, just emit this as a string literal to be concatenated.
+    if (!UseChars) {
+      OS << "\"";
+      OS.write_escaped(Str);
+      OS << "\\0\"";
+      continue;
+    }
+
+    llvm::ListSeparator CharSep(", ");
+    for (char C : Str) {
+      OS << CharSep << "'";
+      OS.write_escaped(StringRef(&C, 1));
+      OS << "'";
+    }
+    OS << CharSep << "'\\0'";
+  }
+  OS << LineSep << Indent << (UseChars ? "};" : "  ;");
+
+  OS << formatv(R"(
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+{0}static constexpr llvm::StringTable {1} =
+{0}    {1}Storage;
+)",
+                Indent, Name);
+}
+
+void StringToOffsetTable::EmitString(raw_ostream &O) const {
+  // Escape the string.
+  SmallString<256> EscapedStr;
+  raw_svector_ostream(EscapedStr).write_escaped(AggregateString);
+
+  O << "    \"";
+  unsigned CharsPrinted = 0;
+  for (unsigned i = 0, e = EscapedStr.size(); i != e; ++i) {
+    if (CharsPrinted > 70) {
+      O << "\"\n    \"";
+      CharsPrinted = 0;
+    }
+    O << EscapedStr[i];
+    ++CharsPrinted;
+
+    // Print escape sequences all together.
+    if (EscapedStr[i] != '\\')
+      continue;
+
+    assert(i + 1 < EscapedStr.size() && "Incomplete escape sequence!");
+    if (isDigit(EscapedStr[i + 1])) {
+      assert(isDigit(EscapedStr[i + 2]) && isDigit(EscapedStr[i + 3]) &&
+             "Expected 3 digit octal escape!");
+      O << EscapedStr[++i];
+      O << EscapedStr[++i];
+      O << EscapedStr[++i];
+      CharsPrinted += 3;
+    } else {
+      O << EscapedStr[++i];
+      ++CharsPrinted;
+    }
+  }
+  O << "\"";
+}
diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index 7684387d80fe24..0f2e20eb37649b 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -110,6 +110,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/StringMatcher.h"
diff --git a/llvm/utils/TableGen/Basic/TableGen.cpp b/llvm/utils/TableGen/Basic/TableGen.cpp
index 80ac93f2b54fb6..edb7791500699f 100644
--- a/llvm/utils/TableGen/Basic/TableGen.cpp
+++ b/llvm/utils/TableGen/Basic/TableGen.cpp
@@ -26,15 +26,6 @@
 
 using namespace llvm;
 
-namespace llvm {
-cl::opt<bool> EmitLongStrLiterals(
-    "long-string-literals",
-    cl::desc("when emitting large string tables, prefer string literals over "
-             "comma-separated char literals. This can be a readability and "
-             "compile-time performance win, but upsets some compilers"),
-    cl::Hidden, cl::init(true));
-} // end namespace llvm
-
 static cl::OptionCategory PrintEnumsCat("Options for -print-enums");
 static cl::opt<std::string> Class("class",
                                   cl::desc("Print Enum list for this class"),
diff --git a/llvm/utils/TableGen/SDNodeInfoEmitter.cpp b/llvm/utils/TableGen/SDNodeInfoEmitter.cpp
index 63ee0deb871109..64f03dae83e7de 100644
--- a/llvm/utils/TableGen/SDNodeInfoEmitter.cpp
+++ b/llvm/utils/TableGen/SDNodeInfoEmitter.cpp
@@ -9,6 +9,7 @@
 #include "Basic/SequenceToOffsetTable.h"
 #include "Common/CodeGenDAGPatterns.h" // For SDNodeInfo.
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/StringToOffsetTable.h"
 #include "llvm/TableGen/TableGenBackend.h"
diff --git a/llvm/utils/gn/secondary/llvm/lib/TableGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/TableGen/BUILD.gn
index d90df7bc0e57a5..b40fdf154b01a2 100644
--- a/llvm/utils/gn/secondary/llvm/lib/TableGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/TableGen/BUILD.gn
@@ -10,6 +10,7 @@ static_library("TableGen") {
     "Record.cpp",
     "SetTheory.cpp",
     "StringMatcher.cpp",
+    "StringToOffsetTable.cpp",
     "TGLexer.cpp",
     "TGParser.cpp",
     "TGTimer.cpp",



More information about the llvm-commits mailing list