[llvm] Add a super simple wrapper for a merged string table. (PR #119488)

Chandler Carruth via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 11 17:37:52 PST 2024


https://github.com/chandlerc updated https://github.com/llvm/llvm-project/pull/119488

>From fb7bc415c1f4493a6d80abbf0da4eca5ea3f457d Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc at gmail.com>
Date: Wed, 11 Dec 2024 02:47:28 +0000
Subject: [PATCH] Add a super simple wrapper for a merged string table.

Suggestions welcome on a better name for this -- `StringTable`, as
I currently have it, seems too general, but I wasn't sure what other name
would be better.

It currently has a *very* minimal API. I'm happy to expand it if folks
have ideas for what API would be useful, but this actually seemed like
it might be all we really need.
---
 llvm/include/llvm/ADT/StringTable.h    | 91 ++++++++++++++++++++++++++
 llvm/unittests/ADT/CMakeLists.txt      |  1 +
 llvm/unittests/ADT/StringTableTest.cpp | 41 ++++++++++++
 3 files changed, 133 insertions(+)
 create mode 100644 llvm/include/llvm/ADT/StringTable.h
 create mode 100644 llvm/unittests/ADT/StringTableTest.cpp

diff --git a/llvm/include/llvm/ADT/StringTable.h b/llvm/include/llvm/ADT/StringTable.h
new file mode 100644
index 00000000000000..4049f892fa66e0
--- /dev/null
+++ b/llvm/include/llvm/ADT/StringTable.h
@@ -0,0 +1,91 @@
+//===- StringTable.h - Table of strings tracked by offset -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_STRING_TABLE_H
+#define LLVM_ADT_STRING_TABLE_H
+
+#include "llvm/ADT/StringRef.h"
+#include <limits>
+
+namespace llvm {
+
+/// A table of densely packed, null-terminated strings indexed by offset.
+///
+/// This table abstracts a densely concatenated list of null-terminated strings,
+/// each of which can be referenced using an offset into the table.
+///
+/// This requires and ensures that the string at offset 0 is also the empty
+/// string. This allows zero-initialized offsets to form empty strings and
+/// avoids non-zero initialization in cases where using a string literal
+/// pointer would have allowed a null pointer.
+///
+/// The primary use case is having a single global string literal for the table
+/// contents, and offsets into it in other global data structures to avoid
+/// dynamic relocations of individual string literal pointers in those global
+/// data structures.
+class StringTable {
+  StringRef Table;
+
+public:
+  // An offset into one of these packed string tables, used to select a string
+  // within the table.
+  //
+  // Typically these are created by TableGen or another code generator from
+  // computed offsets, and this type just wraps that integer until it is
+  // used with the relevant table.
+  //
+  // We also ensure that the empty string is at offset zero and default
+  // constructing this class gives you an offset of zero. This makes default
+  // constructing this type work similarly (after indexing the table) to default
+  // constructing a `StringRef`.
+  class Offset {
+    // Defaults to zero, which the table requires to be the empty string.
+    unsigned Value = 0;
+
+  public:
+    constexpr Offset() = default;
+    constexpr Offset(unsigned Value) : Value(Value) {}
+
+    constexpr unsigned value() const { return Value; }
+  };
+
+  // We directly handle string literals with a templated converting constructor
+  // because we *don't* want to do `strlen` on them -- we fully expect null
+  // bytes in this input, so the table spans all `N` bytes of the array. This
+  // is somewhat the opposite of how `StringLiteral` works.
+  template <size_t N>
+  constexpr StringTable(const char (&RawTable)[N]) : Table(RawTable, N) {
+    static_assert(N <= std::numeric_limits<unsigned>::max(),
+                  "We only support table sizes that can be indexed by an "
+                  "`unsigned` offset.");
+
+    // Note that we can only use `empty`, `data`, and `size` in these asserts to
+    // support `constexpr`.
+    assert(!Table.empty() && "Requires at least a valid empty string.");
+    assert(Table.data()[0] == '\0' && "Offset zero must be the empty string.");
+    // Ensure that `strlen` from any offset cannot overflow the end of the table
+    // by insisting on a null byte at the end.
+    assert(Table.data()[Table.size() - 1] == '\0' &&
+           "Last byte must be a null byte.");
+  }
+
+  // Get the string starting at the provided offset, which must be in bounds.
+  // The returned `StringRef` is null terminated, and so can be converted
+  // safely to a C-string if necessary for a system API.
+  constexpr StringRef operator[](Offset O) const {
+    assert(O.value() < Table.size() && "Out of bounds offset!");
+    return Table.data() + O.value();
+  }
+
+  /// Returns the byte size of the table, including the final null byte.
+  constexpr size_t size() const { return Table.size(); }
+};
+
+} // namespace llvm
+
+#endif // LLVM_ADT_STRING_TABLE_H
diff --git a/llvm/unittests/ADT/CMakeLists.txt b/llvm/unittests/ADT/CMakeLists.txt
index c9bc58f45f08cf..07568ad0c64e33 100644
--- a/llvm/unittests/ADT/CMakeLists.txt
+++ b/llvm/unittests/ADT/CMakeLists.txt
@@ -86,6 +86,7 @@ add_llvm_unittest(ADTTests
   StringRefTest.cpp
   StringSetTest.cpp
   StringSwitchTest.cpp
+  StringTableTest.cpp
   TinyPtrVectorTest.cpp
   TrieRawHashMapTest.cpp
   TwineTest.cpp
diff --git a/llvm/unittests/ADT/StringTableTest.cpp b/llvm/unittests/ADT/StringTableTest.cpp
new file mode 100644
index 00000000000000..0fc4ba7b50b804
--- /dev/null
+++ b/llvm/unittests/ADT/StringTableTest.cpp
@@ -0,0 +1,41 @@
+//===- llvm/unittest/ADT/StringTableTest.cpp - StringTable tests ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringTable.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::StrEq;
+
+TEST(StringTableTest, Basic) {
+  static constexpr char InputTable[] = "\0test\0";
+  constexpr StringTable T = InputTable;
+
+  // Some operations are supported in `constexpr` contexts; check those first.
+  static_assert(T.size() == sizeof(InputTable));
+  static_assert(T[0].empty());
+  static_assert(T[StringTable::Offset()].empty());
+  static_assert(T[1].size() == 4);
+
+  // And use normal Google Test runtime assertions to check the contents and
+  // give more complete error messages.
+  EXPECT_THAT(T[0], Eq(""));
+  EXPECT_THAT(T[StringTable::Offset()], Eq(""));
+  EXPECT_THAT(T[1], Eq("test"));
+
+  // Also check that the result's data pointer is a valid C-string.
+  EXPECT_THAT(T[1].data(), StrEq("test"));
+}
+
+} // anonymous namespace



More information about the llvm-commits mailing list