[llvm] 2936a2c - [CAS] Add OnDiskTrieRawHashMap (#114100)

via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 29 12:47:57 PDT 2025


Author: Steven Wu
Date: 2025-09-29T12:47:52-07:00
New Revision: 2936a2c882d76c719f9a96e443ad3f75b366bc8f

URL: https://github.com/llvm/llvm-project/commit/2936a2c882d76c719f9a96e443ad3f75b366bc8f
DIFF: https://github.com/llvm/llvm-project/commit/2936a2c882d76c719f9a96e443ad3f75b366bc8f.diff

LOG: [CAS] Add OnDiskTrieRawHashMap (#114100)

Add OnDiskTrieRawHashMap. This is an on-disk persistent hash map that
uses a Trie data structure that is similar to TrieRawHashMap.
OnDiskTrieRawHashMap is thread safe and process safe. It is mostly lock
free, except it internally coordinates cross process creation and
closing using a file lock.

Added: 
    llvm/include/llvm/CAS/FileOffset.h
    llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h
    llvm/lib/CAS/DatabaseFile.cpp
    llvm/lib/CAS/DatabaseFile.h
    llvm/lib/CAS/OnDiskTrieRawHashMap.cpp
    llvm/unittests/CAS/OnDiskTrieRawHashMapTest.cpp

Modified: 
    llvm/lib/CAS/CMakeLists.txt
    llvm/unittests/CAS/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CAS/FileOffset.h b/llvm/include/llvm/CAS/FileOffset.h
new file mode 100644
index 0000000000000..21d045e8c9d78
--- /dev/null
+++ b/llvm/include/llvm/CAS/FileOffset.h
@@ -0,0 +1,39 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file declares interface for FileOffset that represent stored data at an
+/// offset from the beginning of a file.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_FILEOFFSET_H
+#define LLVM_CAS_FILEOFFSET_H
+
+#include <cstdlib>
+
+namespace llvm::cas {
+
+/// Strongly-typed `uint64_t` byte offset measured from the beginning of a
+/// file. A default-constructed FileOffset is 0 and converts to `false`.
+class FileOffset {
+  uint64_t Offset = 0;
+
+public:
+  FileOffset() = default;
+  explicit FileOffset(uint64_t Offset) : Offset(Offset) {}
+
+  /// Raw offset value.
+  uint64_t get() const { return Offset; }
+
+  explicit operator bool() const { return Offset != 0; }
+};
+
+} // namespace llvm::cas
+
+#endif // LLVM_CAS_FILEOFFSET_H

diff  --git a/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h b/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h
new file mode 100644
index 0000000000000..5e41bf6ab571e
--- /dev/null
+++ b/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h
@@ -0,0 +1,236 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file declares interface for OnDiskTrieRawHashMap, a thread-safe and
+/// (mostly) lock-free hash map stored as trie and backed by persistent files on
+/// disk.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_ONDISKTRIERAWHASHMAP_H
+#define LLVM_CAS_ONDISKTRIERAWHASHMAP_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/FileOffset.h"
+#include "llvm/Support/Error.h"
+#include <optional>
+
+namespace llvm {
+
+class raw_ostream;
+
+namespace cas {
+
+/// OnDiskTrieRawHashMap is a persistent trie data structure used as hash maps.
+/// The keys are fixed length, and are expected to be binary hashes with a
+/// normal distribution.
+///
+/// - Thread-safety is achieved through the use of atomics within a shared
+///   memory mapping. Atomic access does not work on networked filesystems.
+/// - Filesystem locks are used, but only sparingly:
+///     - during initialization, for creating / opening an existing store;
+///     - for the lifetime of the instance, a shared/reader lock is held
+///     - during destruction, if there are no concurrent readers, to shrink the
+///       files to their minimum size.
+/// - Path is used as a directory:
+///     - "index" stores the root trie and subtries.
+///     - "data" stores (most of) the entries, like a bump-ptr-allocator.
+///     - Large entries are stored externally in a file named by the key.
+/// - Code is system-dependent and binary format itself is not portable. These
+///   are not artifacts that can/should be moved between different systems; they
+///   are only appropriate for local storage.
+class OnDiskTrieRawHashMap {
+public:
+  LLVM_DUMP_METHOD void dump() const;
+  void
+  print(raw_ostream &OS,
+        function_ref<void(ArrayRef<char>)> PrintRecordData = nullptr) const;
+
+public:
+  /// Const value proxy to access the records stored in OnDiskTrieRawHashMap.
+  struct ConstValueProxy {
+    ConstValueProxy() = default;
+    ConstValueProxy(ArrayRef<uint8_t> Hash, ArrayRef<char> Data)
+        : Hash(Hash), Data(Data) {}
+    ConstValueProxy(ArrayRef<uint8_t> Hash, StringRef Data)
+        : Hash(Hash), Data(Data.begin(), Data.size()) {}
+
+    ArrayRef<uint8_t> Hash;
+    ArrayRef<char> Data;
+  };
+
+  /// Value proxy to access the records stored in OnDiskTrieRawHashMap.
+  struct ValueProxy {
+    operator ConstValueProxy() const { return ConstValueProxy(Hash, Data); }
+
+    ValueProxy() = default;
+    ValueProxy(ArrayRef<uint8_t> Hash, MutableArrayRef<char> Data)
+        : Hash(Hash), Data(Data) {}
+
+    ArrayRef<uint8_t> Hash;
+    MutableArrayRef<char> Data;
+  };
+
+  /// Validate the trie data structure.
+  ///
+  /// Callback receives the file offset to the data entry and the data stored.
+  Error validate(
+      function_ref<Error(FileOffset, ConstValueProxy)> RecordVerifier) const;
+
+  /// Check that a file offset fits in the 48 bits a pointer can store.
+  static bool validOffset(FileOffset Offset) {
+    return Offset.get() < (1LL << 48);
+  }
+
+public:
+  /// Template class to implement a `pointer` type into the trie data structure.
+  ///
+  /// It provides pointer-like operation, e.g., dereference to get underlying
+  /// data. It also reserves the top 16 bits of the pointer value, which can be
+  /// used to pack additional information if needed.
+  template <class ProxyT> class PointerImpl {
+  public:
+    FileOffset getOffset() const {
+      return FileOffset(OffsetLow32 | (uint64_t)OffsetHigh16 << 32);
+    }
+
+    explicit operator bool() const { return IsValue; }
+
+    const ProxyT &operator*() const {
+      assert(IsValue);
+      return Value;
+    }
+    const ProxyT *operator->() const {
+      assert(IsValue);
+      return &Value;
+    }
+
+    PointerImpl() = default;
+
+  protected:
+    PointerImpl(ProxyT Value, FileOffset Offset, bool IsValue = true)
+        : Value(Value), OffsetLow32((uint64_t)Offset.get()),
+          OffsetHigh16((uint64_t)Offset.get() >> 32), IsValue(IsValue) {
+      if (IsValue)
+        assert(validOffset(Offset));
+    }
+
+    ProxyT Value;
+    uint32_t OffsetLow32 = 0;
+    uint16_t OffsetHigh16 = 0;
+
+    // True if this points to a value (not a "nullptr"). Use an extra field
+    // because 0 can be a valid offset.
+    bool IsValue = false;
+  };
+
+  class pointer;
+  class const_pointer : public PointerImpl<ConstValueProxy> {
+  public:
+    const_pointer() = default;
+
+  private:
+    friend class pointer;
+    friend class OnDiskTrieRawHashMap;
+    using const_pointer::PointerImpl::PointerImpl;
+  };
+
+  class pointer : public PointerImpl<ValueProxy> {
+  public:
+    operator const_pointer() const {
+      return const_pointer(Value, getOffset(), IsValue);
+    }
+
+    pointer() = default;
+
+  private:
+    friend class OnDiskTrieRawHashMap;
+    using pointer::PointerImpl::PointerImpl;
+  };
+
+  /// Find the value from hash.
+  ///
+  /// \returns pointer to the value if exists, otherwise returns a non-value
+  /// pointer that evaluates to `false` when converted to a boolean.
+  const_pointer find(ArrayRef<uint8_t> Hash) const;
+
+  /// Helper function to recover a pointer into the trie from file offset.
+  Expected<const_pointer> recoverFromFileOffset(FileOffset Offset) const;
+
+  using LazyInsertOnConstructCB =
+      function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue)>;
+  using LazyInsertOnLeakCB =
+      function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue,
+                        FileOffset FinalOffset, ValueProxy FinalValue)>;
+
+  /// Insert lazily.
+  ///
+  /// \p OnConstruct is called when ready to insert a value, after allocating
+  /// space for the data. It is called at most once.
+  ///
+  /// \p OnLeak is called only if \p OnConstruct has been called and a race
+  /// occurred before insertion, causing the tentative offset and data to be
+  /// abandoned. This allows clients to clean up other results or update any
+  /// references.
+  ///
+  /// NOTE: Does *not* guarantee that \p OnConstruct is only called on success.
+  /// The in-memory \a TrieRawHashMap uses LazyAtomicPointer to synchronize
+  /// simultaneous writes, but that seems dangerous to use in a memory-mapped
+  /// file in case a process crashes in the busy state.
+  Expected<pointer> insertLazy(ArrayRef<uint8_t> Hash,
+                               LazyInsertOnConstructCB OnConstruct = nullptr,
+                               LazyInsertOnLeakCB OnLeak = nullptr);
+
+  Expected<pointer> insert(const ConstValueProxy &Value) {
+    return insertLazy(Value.Hash, [&](FileOffset, ValueProxy Allocated) {
+      assert(Allocated.Hash == Value.Hash);
+      assert(Allocated.Data.size() == Value.Data.size());
+      llvm::copy(Value.Data, Allocated.Data.begin());
+    });
+  }
+
+  size_t size() const;
+  size_t capacity() const;
+
+  /// Gets or creates a file at \p Path with a hash-mapped trie named \p
+  /// TrieName. The hash size is \p NumHashBits (in bits) and the records store
+  /// data of size \p DataSize (in bytes).
+  ///
+  /// \p MaxFileSize controls the maximum file size to support, limiting the
+  /// size of the \a mapped_file_region. \p NewFileInitialSize is the starting
+  /// size if a new file is created.
+  ///
+  /// \p NewTableNumRootBits and \p NewTableNumSubtrieBits are hints to
+  /// configure the trie, if it doesn't already exist.
+  ///
+  /// \pre NumHashBits is a multiple of 8 (byte-aligned).
+  static Expected<OnDiskTrieRawHashMap>
+  create(const Twine &Path, const Twine &TrieName, size_t NumHashBits,
+         uint64_t DataSize, uint64_t MaxFileSize,
+         std::optional<uint64_t> NewFileInitialSize,
+         std::optional<size_t> NewTableNumRootBits = std::nullopt,
+         std::optional<size_t> NewTableNumSubtrieBits = std::nullopt);
+
+  OnDiskTrieRawHashMap(OnDiskTrieRawHashMap &&RHS);
+  OnDiskTrieRawHashMap &operator=(OnDiskTrieRawHashMap &&RHS);
+  ~OnDiskTrieRawHashMap();
+
+private:
+  struct ImplType;
+  explicit OnDiskTrieRawHashMap(std::unique_ptr<ImplType> Impl);
+  std::unique_ptr<ImplType> Impl;
+};
+
+} // namespace cas
+} // namespace llvm
+
+#endif // LLVM_CAS_ONDISKTRIERAWHASHMAP_H

diff  --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
index 6ed724bc2fd76..cc866f25f3240 100644
--- a/llvm/lib/CAS/CMakeLists.txt
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -2,10 +2,12 @@ add_llvm_component_library(LLVMCAS
   ActionCache.cpp
   ActionCaches.cpp
   BuiltinCAS.cpp
+  DatabaseFile.cpp
   InMemoryCAS.cpp
   MappedFileRegionArena.cpp
   ObjectStore.cpp
   OnDiskCommon.cpp
+  OnDiskTrieRawHashMap.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS

diff  --git a/llvm/lib/CAS/DatabaseFile.cpp b/llvm/lib/CAS/DatabaseFile.cpp
new file mode 100644
index 0000000000000..db8ce1dc5bb14
--- /dev/null
+++ b/llvm/lib/CAS/DatabaseFile.cpp
@@ -0,0 +1,123 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This file implements the common abstractions for CAS database file.
+///
+//===----------------------------------------------------------------------===//
+
+#include "DatabaseFile.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+Error ondisk::createTableConfigError(std::errc ErrC, StringRef Path,
+                                     StringRef TableName, const Twine &Msg) {
+  return createStringError(make_error_code(ErrC),
+                           Path + "[" + TableName + "]: " + Msg);
+}
+
+/// Compare an \p Expected table parameter against the \p Observed one.
+///
+/// Returns success when they match; otherwise returns an invalid_argument
+/// table-config error for \p Path / \p TrieName that names \p Label and both
+/// values.
+Error ondisk::checkTable(StringRef Label, size_t Expected, size_t Observed,
+                         StringRef Path, StringRef TrieName) {
+  if (Expected != Observed)
+    return createTableConfigError(
+        std::errc::invalid_argument, Path, TrieName,
+        "mismatched " + Label + " (expected: " + Twine(Expected) +
+            ", observed: " + Twine(Observed) + ")");
+  return Error::success();
+}
+
+/// Get or create the database file at \p Path with the given \p Capacity.
+///
+/// \p NewDBConstructor is only invoked from the new-file initializer handed to
+/// MappedFileRegionArena::create, after the database header has been written.
+/// The resulting arena is validated before the DatabaseFile is returned.
+Expected<DatabaseFile>
+DatabaseFile::create(const Twine &Path, uint64_t Capacity,
+                     function_ref<Error(DatabaseFile &)> NewDBConstructor) {
+  // Initializer used when the file does not exist yet.
+  auto InitNewFile = [&](MappedFileRegionArena &Arena) -> Error {
+    constexpr size_t MinCapacity =
+        sizeof(Header) + sizeof(MappedFileRegionArena::Header);
+    if (Arena.capacity() < MinCapacity)
+      return createTableConfigError(std::errc::argument_out_of_domain,
+                                    Path.str(), "datafile",
+                                    "Allocator too small for header");
+    // Placement-construct the header at the very start of the mapping.
+    (void)new (Arena.data()) Header{getMagic(), getVersion(), {0}};
+    DatabaseFile NewDB(Arena);
+    return NewDBConstructor(NewDB);
+  };
+
+  // Get or create the file.
+  auto ArenaOrErr = MappedFileRegionArena::create(Path, Capacity,
+                                                  sizeof(Header), InitNewFile);
+  if (!ArenaOrErr)
+    return ArenaOrErr.takeError();
+
+  return DatabaseFile::get(
+      std::make_unique<MappedFileRegionArena>(std::move(*ArenaOrErr)));
+}
+
+/// Install \p Table as the root table of this database file.
+///
+/// Succeeds when no root has been set yet or when \p Table already is the
+/// root; otherwise returns a not_supported error describing the conflict.
+Error DatabaseFile::addTable(TableHandle Table) {
+  assert(Table);
+  assert(&Table.getRegion() == &getRegion());
+
+  // Offset of the table header from the start of the mapped region.
+  const int64_t TableOffset =
+      reinterpret_cast<const char *>(&Table.getHeader()) - getRegion().data();
+
+  // Claim the root slot only while it is still unset (0). On failure,
+  // CurrentRoot receives the offset that is already stored.
+  int64_t CurrentRoot = 0;
+  const bool Installed =
+      H->RootTableOffset.compare_exchange_strong(CurrentRoot, TableOffset);
+
+  // Setting the root to itself is silently accepted.
+  if (Installed || CurrentRoot == TableOffset)
+    return Error::success();
+
+  // A different root exists: report which kind of conflict this is.
+  TableHandle Root(getRegion(), CurrentRoot);
+  const std::error_code EC = make_error_code(std::errc::not_supported);
+  if (Root.getName() == Table.getName())
+    return createStringError(
+        EC, "collision with existing table of the same name '" +
+                Table.getName() + "'");
+
+  return createStringError(EC, "cannot add new table '" + Table.getName() +
+                                   "' to existing root '" + Root.getName() +
+                                   "'");
+}
+
+/// Look up the table called \p Name.
+///
+/// Only the root table is consulted: returns it when a root is set and its
+/// name equals \p Name, and std::nullopt otherwise.
+std::optional<TableHandle> DatabaseFile::findTable(StringRef Name) {
+  const int64_t RootOffset = H->RootTableOffset.load();
+  if (RootOffset != 0) {
+    TableHandle Root(getRegion(), RootOffset);
+    if (Root.getName() == Name)
+      return Root;
+  }
+  return std::nullopt;
+}
+
+/// Sanity-check the headers stored at the start of \p Region.
+///
+/// Verifies that the region can hold both the database header and the arena
+/// header that follows it, that magic and version match, and that the arena's
+/// bump pointer is not inside the database header. Returns an
+/// invalid_argument error describing the first failed check.
+Error DatabaseFile::validate(MappedFileRegion &Region) {
+  // Both the database header and the MappedFileRegionArena header placed
+  // right after it are dereferenced below, so the region has to be large
+  // enough for the pair; checking sizeof(Header) alone would allow an
+  // out-of-bounds read of the arena header.
+  if (Region.size() < sizeof(Header) + sizeof(MappedFileRegionArena::Header))
+    return createStringError(std::errc::invalid_argument,
+                             "database: missing header");
+
+  // Check the magic and version.
+  auto *H = reinterpret_cast<Header *>(Region.data());
+  if (H->Magic != getMagic())
+    return createStringError(std::errc::invalid_argument,
+                             "database: bad magic");
+  if (H->Version != getVersion())
+    return createStringError(std::errc::invalid_argument,
+                             "database: wrong version");
+
+  auto *MFH = reinterpret_cast<MappedFileRegionArena::Header *>(Region.data() +
+                                                                sizeof(Header));
+  // Check the bump-ptr, which should point past the header.
+  if (MFH->BumpPtr.load() < static_cast<int64_t>(sizeof(Header)))
+    return createStringError(std::errc::invalid_argument,
+                             "database: corrupt bump-ptr");
+
+  return Error::success();
+}

diff  --git a/llvm/lib/CAS/DatabaseFile.h b/llvm/lib/CAS/DatabaseFile.h
new file mode 100644
index 0000000000000..609e5f1357190
--- /dev/null
+++ b/llvm/lib/CAS/DatabaseFile.h
@@ -0,0 +1,153 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file declares the common interface for a DatabaseFile that is used to
+/// implement OnDiskCAS.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CAS_DATABASEFILE_H
+#define LLVM_LIB_CAS_DATABASEFILE_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/MappedFileRegionArena.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm::cas::ondisk {
+
+using MappedFileRegion = MappedFileRegionArena::RegionT;
+
+/// Generic handle for a table: a header pointer plus its mapped region.
+///
+/// Generic table header layout:
+/// - 2-bytes: TableKind
+/// - 2-bytes: TableNameSize
+/// - 4-bytes: TableNameRelOffset (relative to header)
+class TableHandle {
+public:
+  enum class TableKind : uint16_t {
+    TrieRawHashMap = 1,
+    DataAllocator = 2,
+  };
+  struct Header {
+    TableKind Kind;
+    uint16_t NameSize;
+    int32_t NameRelOffset; ///< Relative to Header.
+  };
+
+  explicit operator bool() const { return H; } // False for a default handle.
+  const Header &getHeader() const { return *H; }
+  MappedFileRegion &getRegion() const { return *Region; }
+
+  template <class T> static void check() {
+    static_assert(
+        std::is_same<decltype(T::Header::GenericHeader), Header>::value,
+        "T::GenericHeader should be of type TableHandle::Header");
+    static_assert(offsetof(typename T::Header, GenericHeader) == 0,
+                  "T::GenericHeader must be the head of T::Header");
+  }
+  template <class T> bool is() const { return T::Kind == H->Kind; }
+  template <class T> T dyn_cast() const {
+    check<T>();
+    if (is<T>())
+      return T(*Region, *reinterpret_cast<typename T::Header *>(H));
+    return T(); // Kind mismatch: hand back a default-constructed T.
+  }
+  template <class T> T cast() const {
+    assert(is<T>());
+    return dyn_cast<T>();
+  }
+
+  StringRef getName() const {
+    auto *Begin = reinterpret_cast<const char *>(H) + H->NameRelOffset;
+    return StringRef(Begin, H->NameSize);
+  }
+
+  TableHandle() = default;
+  TableHandle(MappedFileRegion &Region, Header &H) : Region(&Region), H(&H) {}
+  TableHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+      : TableHandle(Region,
+                    *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) {
+  }
+
+private:
+  MappedFileRegion *Region = nullptr;
+  Header *H = nullptr;
+};
+
+/// Encapsulate a database file, which:
+/// - Sets/checks magic.
+/// - Sets/checks version.
+/// - Points at an arbitrary root table.
+/// - Sets up a MappedFileRegionArena for allocation.
+///
+/// Top-level layout:
+/// - 4-bytes: Magic
+/// - 4-bytes: Version
+/// - 8-bytes: RootTableOffset (16-bits: Kind; 48-bits: Offset)
+/// - 8-bytes: BumpPtr from MappedFileRegionArena
+class DatabaseFile {
+public:
+  static constexpr uint32_t getMagic() { return 0xDA7ABA53UL; }
+  static constexpr uint32_t getVersion() { return 1UL; }
+  struct Header {
+    uint32_t Magic;
+    uint32_t Version;
+    std::atomic<int64_t> RootTableOffset;
+  };
+
+  const Header &getHeader() { return *H; }
+  MappedFileRegionArena &getAlloc() { return Alloc; }
+  MappedFileRegion &getRegion() { return Alloc.getRegion(); }
+
+  /// Add a table. This is currently not thread safe and should be called inside
+  /// NewDBConstructor.
+  Error addTable(TableHandle Table);
+
+  /// Find a table by name. Returns std::nullopt if there is no such table.
+  std::optional<TableHandle> findTable(StringRef Name);
+
+  /// Create the DatabaseFile at Path with Capacity.
+  static Expected<DatabaseFile>
+  create(const Twine &Path, uint64_t Capacity,
+         function_ref<Error(DatabaseFile &)> NewDBConstructor);
+
+  size_t size() const { return Alloc.size(); }
+
+private:
+  static Expected<DatabaseFile>
+  get(std::unique_ptr<MappedFileRegionArena> Alloc) {
+    if (Error E = validate(Alloc->getRegion()))
+      return std::move(E);
+    return DatabaseFile(std::move(Alloc));
+  }
+
+  static Error validate(MappedFileRegion &Region);
+
+  DatabaseFile(MappedFileRegionArena &Alloc)
+      : H(reinterpret_cast<Header *>(Alloc.data())), Alloc(Alloc) {}
+  DatabaseFile(std::unique_ptr<MappedFileRegionArena> Alloc)
+      : DatabaseFile(*Alloc) {
+    OwnedAlloc = std::move(Alloc);
+  }
+
+  Header *H = nullptr;
+  MappedFileRegionArena &Alloc;
+  std::unique_ptr<MappedFileRegionArena> OwnedAlloc;
+};
+
+Error createTableConfigError(std::errc ErrC, StringRef Path,
+                             StringRef TableName, const Twine &Msg);
+
+Error checkTable(StringRef Label, size_t Expected, size_t Observed,
+                 StringRef Path, StringRef TrieName);
+
+} // namespace llvm::cas::ondisk
+
+#endif

diff  --git a/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp
new file mode 100644
index 0000000000000..3000c0f0e46f1
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp
@@ -0,0 +1,1178 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file Implements OnDiskTrieRawHashMap.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskTrieRawHashMap.h"
+#include "DatabaseFile.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/TrieHashIndexGenerator.h"
+#include "llvm/CAS/MappedFileRegionArena.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/ThreadPool.h"
+#include "llvm/Support/Threading.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+#if LLVM_ENABLE_ONDISK_CAS
+
+//===----------------------------------------------------------------------===//
+// TrieRawHashMap data structures.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class SubtrieHandle;
+class TrieRawHashMapHandle;
+class TrieVisitor;
+
+/// A value stored in the slots inside a SubTrie. A stored value can either be a
+/// subtrie (encoded after negation) which is the file offset to another
+/// subtrie, or it can be a file offset to a DataRecord.
+///
+/// Encoding of the raw 64-bit value: zero means the slot is empty, a positive
+/// value is a data offset, and a negative value is a negated subtrie offset.
+class SubtrieSlotValue {
+public:
+  explicit operator bool() const { return !isEmpty(); }
+  bool isEmpty() const { return !Offset; }
+  bool isData() const { return Offset > 0; }
+  bool isSubtrie() const { return Offset < 0; }
+
+  /// The data offset. \pre isData().
+  uint64_t asData() const {
+    assert(isData());
+    return Offset;
+  }
+
+  /// The subtrie offset with the negation undone. \pre isSubtrie().
+  uint64_t asSubtrie() const {
+    assert(isSubtrie());
+    return -Offset;
+  }
+
+  FileOffset asSubtrieFileOffset() const { return FileOffset(asSubtrie()); }
+
+  FileOffset asDataFileOffset() const { return FileOffset(asData()); }
+
+  /// The encoded value exactly as it is stored in a slot.
+  int64_t getRawOffset() const { return Offset; }
+
+  /// Encode \p Offset as a data slot value (stored as-is).
+  static SubtrieSlotValue getDataOffset(int64_t Offset) {
+    return SubtrieSlotValue(Offset);
+  }
+
+  /// Encode \p Offset as a subtrie slot value (stored negated).
+  static SubtrieSlotValue getSubtrieOffset(int64_t Offset) {
+    return SubtrieSlotValue(-Offset);
+  }
+
+  static SubtrieSlotValue getDataOffset(FileOffset Offset) {
+    return getDataOffset(static_cast<int64_t>(Offset.get()));
+  }
+
+  static SubtrieSlotValue getSubtrieOffset(FileOffset Offset) {
+    // Must use the negating overload. Forwarding to getDataOffset() would
+    // produce a positive (data) encoding, so a later asSubtrie() would fail
+    // its isSubtrie() precondition and compute a bogus offset.
+    return getSubtrieOffset(static_cast<int64_t>(Offset.get()));
+  }
+
+  /// Decode whatever is currently stored in \p Slot.
+  static SubtrieSlotValue getFromSlot(std::atomic<int64_t> &Slot) {
+    return SubtrieSlotValue(Slot.load());
+  }
+
+  SubtrieSlotValue() = default;
+
+private:
+  friend class SubtrieHandle;
+  explicit SubtrieSlotValue(int64_t Offset) : Offset(Offset) {}
+  int64_t Offset = 0;
+};
+
+/// Subtrie layout:
+/// - 2-bytes: StartBit
+/// - 1-byte:  NumBits=lg(num-slots)
+/// - 5-bytes: 0-pad
+/// - <slots>
+class SubtrieHandle {
+public:
+  struct Header {
+    /// The bit this subtrie starts on.
+    uint16_t StartBit;
+
+    /// The number of bits this subtrie handles. It has 2^NumBits slots.
+    uint8_t NumBits;
+
+    /// 0-pad to 8B.
+    uint8_t ZeroPad1B;
+    uint32_t ZeroPad4B;
+  };
+
+  /// Slot storage:
+  /// - zero:     Empty
+  /// - positive: RecordOffset
+  /// - negative: SubtrieOffset
+  using SlotT = std::atomic<int64_t>;
+
+  static int64_t getSlotsSize(uint32_t NumBits) {
+    return sizeof(int64_t) * (1u << NumBits);
+  }
+
+  static int64_t getSize(uint32_t NumBits) {
+    return sizeof(SubtrieHandle::Header) + getSlotsSize(NumBits);
+  }
+
+  int64_t getSize() const { return getSize(H->NumBits); }
+  size_t getNumSlots() const { return Slots.size(); }
+
+  SubtrieSlotValue load(size_t I) const {
+    return SubtrieSlotValue(Slots[I].load());
+  }
+  void store(size_t I, SubtrieSlotValue V) {
+    return Slots[I].store(V.getRawOffset());
+  }
+
+  void printHash(raw_ostream &OS, ArrayRef<uint8_t> Bytes) const;
+
+  /// Returns true on success; on failure \p Expected gets the stored value.
+  bool compare_exchange_strong(size_t I, SubtrieSlotValue &Expected,
+                               SubtrieSlotValue New) {
+    return Slots[I].compare_exchange_strong(Expected.Offset, New.Offset);
+  }
+
+  /// Sink \p V from \p I in this subtrie down to \p NewI in a new subtrie with
+  /// \p NumSubtrieBits.
+  ///
+  /// \p UnusedSubtrie maintains a 1-item "free" list of unused subtries. If a
+  /// new subtrie is created that isn't used because of a lost race, then it
+  /// should be returned as an out parameter to be passed back in the future.
+  /// If it's already valid, it should be used instead of allocating a new
+  /// one.
+  ///
+  /// Returns the subtrie that now lives at \p I.
+  Expected<SubtrieHandle> sink(size_t I, SubtrieSlotValue V,
+                               MappedFileRegionArena &Alloc,
+                               size_t NumSubtrieBits,
+                               SubtrieHandle &UnusedSubtrie, size_t NewI);
+
+  /// Only safe if the subtrie is empty.
+  void reinitialize(uint32_t StartBit, uint32_t NumBits);
+
+  SubtrieSlotValue getOffset() const {
+    return SubtrieSlotValue::getSubtrieOffset(
+        reinterpret_cast<const char *>(H) - Region->data());
+  }
+
+  FileOffset getFileOffset() const { return getOffset().asSubtrieFileOffset(); }
+
+  explicit operator bool() const { return H; }
+
+  Header &getHeader() const { return *H; }
+  uint32_t getStartBit() const { return H->StartBit; }
+  uint32_t getNumBits() const { return H->NumBits; }
+
+  static Expected<SubtrieHandle> create(MappedFileRegionArena &Alloc,
+                                        uint32_t StartBit, uint32_t NumBits);
+
+  static SubtrieHandle getFromFileOffset(MappedFileRegion &Region,
+                                         FileOffset Offset) {
+    return SubtrieHandle(Region, SubtrieSlotValue::getSubtrieOffset(Offset));
+  }
+
+  SubtrieHandle() = default;
+  SubtrieHandle(MappedFileRegion &Region, Header &H)
+      : Region(&Region), H(&H), Slots(getSlots(H)) {}
+  SubtrieHandle(MappedFileRegion &Region, SubtrieSlotValue Offset)
+      : SubtrieHandle(Region, *reinterpret_cast<Header *>(
+                                  Region.data() + Offset.asSubtrie())) {}
+
+private:
+  MappedFileRegion *Region = nullptr;
+  Header *H = nullptr;
+  MutableArrayRef<SlotT> Slots;
+
+  static MutableArrayRef<SlotT> getSlots(Header &H) {
+    return MutableArrayRef(reinterpret_cast<SlotT *>(&H + 1), 1u << H.NumBits);
+  }
+};
+
+/// Handle for a TrieRawHashMap table.
+///
+/// TrieRawHashMap table layout:
+/// - [8-bytes: Generic table header]
+/// - 1-byte:  NumSubtrieBits
+/// - 1-byte:  Flags (not used yet)
+/// - 2-bytes: NumHashBits
+/// - 4-bytes: RecordDataSize (in bytes)
+/// - 8-bytes: RootTrieOffset
+/// - 8-bytes: AllocatorOffset (reserved for implementing free lists)
+/// - <name> '\0'
+///
+/// Record layout:
+/// - <hash>
+/// - <data>
+class TrieRawHashMapHandle {
+public:
+  static constexpr TableHandle::TableKind Kind =
+      TableHandle::TableKind::TrieRawHashMap;
+
+  struct Header {
+    TableHandle::Header GenericHeader;
+    uint8_t NumSubtrieBits;
+    uint8_t Flags; ///< None used yet.
+    uint16_t NumHashBits;
+    uint32_t RecordDataSize;
+    std::atomic<int64_t> RootTrieOffset;
+    std::atomic<int64_t> AllocatorOffset;
+  };
+
+  operator TableHandle() const {
+    if (!H)
+      return TableHandle();
+    return TableHandle(*Region, H->GenericHeader);
+  }
+
+  struct RecordData {
+    OnDiskTrieRawHashMap::ValueProxy Proxy;
+    SubtrieSlotValue Offset;
+    FileOffset getFileOffset() const { return Offset.asDataFileOffset(); }
+  };
+
+  enum Limits : size_t {
+    /// Seems like 65528 hash bits ought to be enough.
+    MaxNumHashBytes = UINT16_MAX >> 3,
+    MaxNumHashBits = MaxNumHashBytes << 3,
+
+    /// 2^16 bits in a trie is 65536 slots. This restricts us to a 16-bit
+    /// index. This many slots is suspicously large anyway.
+    MaxNumRootBits = 16,
+
+    /// 2^10 bits in a trie is 1024 slots. This many slots seems suspiciously
+    /// large for subtries.
+    MaxNumSubtrieBits = 10,
+  };
+
+  static constexpr size_t getNumHashBytes(size_t NumHashBits) {
+    assert(NumHashBits % 8 == 0);
+    return NumHashBits / 8;
+  }
+  static constexpr size_t getRecordSize(size_t RecordDataSize,
+                                        size_t NumHashBits) {
+    return RecordDataSize + getNumHashBytes(NumHashBits);
+  }
+
+  RecordData getRecord(SubtrieSlotValue Offset);
+  Expected<RecordData> createRecord(MappedFileRegionArena &Alloc,
+                                    ArrayRef<uint8_t> Hash);
+
+  explicit operator bool() const { return H; }
+  const Header &getHeader() const { return *H; }
+  SubtrieHandle getRoot() const;
+  Expected<SubtrieHandle> getOrCreateRoot(MappedFileRegionArena &Alloc);
+  MappedFileRegion &getRegion() const { return *Region; }
+
+  size_t getFlags() const { return H->Flags; }
+  uint64_t getNumSubtrieBits() const { return H->NumSubtrieBits; }
+  uint64_t getNumHashBits() const { return H->NumHashBits; }
+  size_t getNumHashBytes() const { return getNumHashBytes(H->NumHashBits); }
+  size_t getRecordDataSize() const { return H->RecordDataSize; }
+  size_t getRecordSize() const {
+    return getRecordSize(H->RecordDataSize, H->NumHashBits);
+  }
+
+  /// Builds the generator that yields successive slot indexes for \p Hash,
+  /// using the root's bit width for the first index and the table's subtrie
+  /// bit width afterwards. \p Root must start at bit 0 and \p Hash must have
+  /// exactly the table's hash size.
+  TrieHashIndexGenerator getIndexGen(SubtrieHandle Root,
+                                     ArrayRef<uint8_t> Hash) {
+    assert(Root.getStartBit() == 0);
+    assert(Hash.size() == getNumHashBytes());
+    assert(Hash.size() * 8 == getNumHashBits());
+    const auto RootBits = Root.getNumBits();
+    const auto SubtrieBits = getNumSubtrieBits();
+    return TrieHashIndexGenerator{RootBits, SubtrieBits, Hash};
+  }
+
+  /// Allocates and initializes a new trie table named \p Name in \p Alloc.
+  static Expected<TrieRawHashMapHandle>
+  create(MappedFileRegionArena &Alloc, StringRef Name,
+         std::optional<uint64_t> NumRootBits, uint64_t NumSubtrieBits,
+         uint64_t NumHashBits, uint64_t RecordDataSize);
+
+  /// Prints the trie to \p OS. When \p PrintRecordData is provided it is used
+  /// to print each record's data bytes.
+  void
+  print(raw_ostream &OS,
+        function_ref<void(ArrayRef<char>)> PrintRecordData = nullptr) const;
+
+  /// Checks the trie structure, then passes each data record to
+  /// \p RecordVerifier.
+  Error validate(
+      function_ref<Error(FileOffset, OnDiskTrieRawHashMap::ConstValueProxy)>
+          RecordVerifier) const;
+  /// Constructs a null handle.
+  TrieRawHashMapHandle() = default;
+  /// Wraps the header \p H located inside \p Region.
+  TrieRawHashMapHandle(MappedFileRegion &Region, Header &H)
+      : Region(&Region), H(&H) {}
+  /// Wraps the header found \p HeaderOffset bytes from the start of
+  /// \p Region.
+  TrieRawHashMapHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+      : TrieRawHashMapHandle(
+            Region, *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) {
+  }
+
+private:
+  MappedFileRegion *Region = nullptr;
+  Header *H = nullptr;
+};
+
+} // end anonymous namespace
+
+/// Private state of an OnDiskTrieRawHashMap: the database file and the handle
+/// of the trie table inside it.
+struct OnDiskTrieRawHashMap::ImplType {
+  DatabaseFile File;
+  TrieRawHashMapHandle Trie;
+};
+
+/// Allocates a subtrie with 2^NumBits slots from \p Alloc, writes its header
+/// and initializes every slot to 0.
+///
+/// \returns a handle to the new subtrie, or the allocation error.
+Expected<SubtrieHandle> SubtrieHandle::create(MappedFileRegionArena &Alloc,
+                                              uint32_t StartBit,
+                                              uint32_t NumBits) {
+  assert(StartBit <= TrieRawHashMapHandle::MaxNumHashBits);
+  assert(NumBits <= UINT8_MAX);
+  assert(NumBits <= TrieRawHashMapHandle::MaxNumRootBits);
+
+  auto Storage = Alloc.allocate(getSize(NumBits));
+  if (LLVM_UNLIKELY(!Storage))
+    return Storage.takeError();
+
+  // Construct the header in place at the start of the allocation.
+  auto *NewHeader = new (*Storage) SubtrieHandle::Header{
+      static_cast<uint16_t>(StartBit), static_cast<uint8_t>(NumBits),
+      /*ZeroPad1B=*/0, /*ZeroPad4B=*/0};
+
+  // Construct every slot in place with the value 0.
+  SubtrieHandle NewSubtrie(Alloc.getRegion(), *NewHeader);
+  for (auto &Slot : NewSubtrie.Slots)
+    new (&Slot) SlotT(0);
+  return NewSubtrie;
+}
+
+/// Returns the root subtrie, or a null handle while the header's root offset
+/// is still 0.
+SubtrieHandle TrieRawHashMapHandle::getRoot() const {
+  const int64_t RootOffset = H->RootTrieOffset;
+  if (RootOffset == 0)
+    return SubtrieHandle();
+  return SubtrieHandle(getRegion(),
+                       SubtrieSlotValue::getSubtrieOffset(RootOffset));
+}
+
+/// Returns the root subtrie, creating one if the table has none yet.
+///
+/// A freshly allocated root is installed with a compare-exchange on the
+/// header's root offset; if that fails, the root found there is returned
+/// instead.
+Expected<SubtrieHandle>
+TrieRawHashMapHandle::getOrCreateRoot(MappedFileRegionArena &Alloc) {
+  assert(&Alloc.getRegion() == &getRegion());
+  if (SubtrieHandle ExistingRoot = getRoot())
+    return ExistingRoot;
+
+  auto Candidate = SubtrieHandle::create(Alloc, 0, H->NumSubtrieBits);
+  if (LLVM_UNLIKELY(!Candidate))
+    return Candidate.takeError();
+
+  // Install the candidate only if the root offset is still 0. On failure
+  // ObservedOffset receives the offset that is stored there.
+  int64_t ObservedOffset = 0;
+  const bool Installed = H->RootTrieOffset.compare_exchange_strong(
+      ObservedOffset, Candidate->getOffset().asSubtrie());
+  if (Installed)
+    return *Candidate;
+
+  // There was a race. Return the other root.
+  //
+  // TODO: Avoid leaking the lazy root by storing it in an allocator.
+  return SubtrieHandle(getRegion(),
+                       SubtrieSlotValue::getSubtrieOffset(ObservedOffset));
+}
+
+/// Allocates a trie table header followed by the NUL-terminated table name in
+/// \p Alloc and initializes it from the given parameters.
+///
+/// \param NumRootBits when set, a root subtrie with that many index bits is
+/// created immediately; when empty, the root offset is left at 0 and no root
+/// is allocated here.
+/// \returns a handle to the new table, or the error from a failed allocation.
+Expected<TrieRawHashMapHandle>
+TrieRawHashMapHandle::create(MappedFileRegionArena &Alloc, StringRef Name,
+                             std::optional<uint64_t> NumRootBits,
+                             uint64_t NumSubtrieBits, uint64_t NumHashBits,
+                             uint64_t RecordDataSize) {
+  // Allocate.
+  auto Offset = Alloc.allocateOffset(sizeof(Header) + Name.size() + 1);
+  if (LLVM_UNLIKELY(!Offset))
+    return Offset.takeError();
+
+  // Construct the header and the name. The asserts guard the narrowing casts
+  // into the fixed-width header fields below.
+  assert(Name.size() <= UINT16_MAX && "Expected smaller table name");
+  assert(NumSubtrieBits <= UINT8_MAX && "Expected valid subtrie bits");
+  assert(NumHashBits <= UINT16_MAX && "Expected valid hash size");
+  assert(RecordDataSize <= UINT32_MAX && "Expected smaller record data size");
+  auto *H = new (Alloc.getRegion().data() + *Offset)
+      Header{{TableHandle::TableKind::TrieRawHashMap, (uint16_t)Name.size(),
+              (uint32_t)sizeof(Header)},
+             (uint8_t)NumSubtrieBits,
+             /*Flags=*/0,
+             (uint16_t)NumHashBits,
+             (uint32_t)RecordDataSize,
+             /*RootTrieOffset=*/{0},
+             /*AllocatorOffset=*/{0}};
+  char *NameStorage = reinterpret_cast<char *>(H + 1);
+  llvm::copy(Name, NameStorage);
+  NameStorage[Name.size()] = 0;
+
+  // Construct a root trie, if requested. The optional must be checked before
+  // it is dereferenced; an empty optional means no root is created here.
+  TrieRawHashMapHandle Trie(Alloc.getRegion(), *H);
+  if (NumRootBits) {
+    auto Sub = SubtrieHandle::create(Alloc, 0, *NumRootBits);
+    if (LLVM_UNLIKELY(!Sub))
+      return Sub.takeError();
+    H->RootTrieOffset = Sub->getOffset().asSubtrie();
+  }
+  return Trie;
+}
+
+/// Builds views of the record at data offset \p Offset: the data bytes come
+/// first and the hash bytes follow immediately after them.
+TrieRawHashMapHandle::RecordData
+TrieRawHashMapHandle::getRecord(SubtrieSlotValue Offset) {
+  char *DataBegin = Region->data() + Offset.asData();
+  OnDiskTrieRawHashMap::ValueProxy View;
+  View.Data = MutableArrayRef(DataBegin, getRecordDataSize());
+
+  // The hash starts where the data view ends.
+  const auto *HashBegin = reinterpret_cast<const uint8_t *>(View.Data.end());
+  View.Hash = ArrayRef(HashBegin, getNumHashBytes());
+  return RecordData{View, Offset};
+}
+
+/// Allocates space for one record in \p Alloc and copies \p Hash into the
+/// record's hash bytes. The data bytes are not written here.
+///
+/// \returns the new record, or the allocation error.
+Expected<TrieRawHashMapHandle::RecordData>
+TrieRawHashMapHandle::createRecord(MappedFileRegionArena &Alloc,
+                                   ArrayRef<uint8_t> Hash) {
+  assert(&Alloc.getRegion() == Region);
+  assert(Hash.size() == getNumHashBytes());
+  auto RecordOffset = Alloc.allocateOffset(getRecordSize());
+  if (LLVM_UNLIKELY(!RecordOffset))
+    return RecordOffset.takeError();
+
+  RecordData NewRecord =
+      getRecord(SubtrieSlotValue::getDataOffset(*RecordOffset));
+  // The proxy exposes the hash as read-only; cast away const to fill it in.
+  uint8_t *HashStorage = const_cast<uint8_t *>(NewRecord.Proxy.Hash.begin());
+  llvm::copy(Hash, HashStorage);
+  return NewRecord;
+}
+
+/// Rebuilds a record pointer from a previously obtained file offset.
+///
+/// The offset is rejected when it is not aligned to the arena alignment, when
+/// it is not a valid offset, or when a record starting there would extend
+/// past the allocated size.
+Expected<OnDiskTrieRawHashMap::const_pointer>
+OnDiskTrieRawHashMap::recoverFromFileOffset(FileOffset Offset) const {
+  const auto RawOffset = Offset.get();
+
+  // Check alignment.
+  if (!isAligned(MappedFileRegionArena::getAlign(), RawOffset))
+    return createStringError(make_error_code(std::errc::protocol_error),
+                             "unaligned file offset at 0x" +
+                                 utohexstr(RawOffset, /*LowerCase=*/true));
+
+  // Check bounds.
+  //
+  // Note: There's no potential overflow when using \c uint64_t because Offset
+  // is in valid offset range and the record size is in \c [0,UINT32_MAX].
+  const bool InBounds =
+      validOffset(Offset) &&
+      RawOffset + Impl->Trie.getRecordSize() <= Impl->File.getAlloc().size();
+  if (!InBounds)
+    return createStringError(make_error_code(std::errc::protocol_error),
+                             "file offset too large: 0x" +
+                                 utohexstr(RawOffset, /*LowerCase=*/true));
+
+  // Looks okay...
+  TrieRawHashMapHandle::RecordData Record =
+      Impl->Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset));
+  return const_pointer(Record.Proxy, Record.getFileOffset());
+}
+
+/// Looks up the record stored for \p Hash.
+///
+/// \returns a pointer to the record, or a null pointer when the trie has no
+/// root, the walk reaches an empty slot, or the record found has a different
+/// hash.
+OnDiskTrieRawHashMap::const_pointer
+OnDiskTrieRawHashMap::find(ArrayRef<uint8_t> Hash) const {
+  TrieRawHashMapHandle Trie = Impl->Trie;
+  assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash");
+
+  SubtrieHandle Node = Trie.getRoot();
+  if (!Node)
+    return const_pointer();
+
+  TrieHashIndexGenerator IndexGen = Trie.getIndexGen(Node, Hash);
+  for (size_t Index = IndexGen.next();; Index = IndexGen.next()) {
+    SubtrieSlotValue Slot = Node.load(Index);
+
+    // An empty slot means the hash is not present.
+    if (!Slot)
+      return const_pointer();
+
+    // Not data: descend into the child subtrie using the next index.
+    if (!Slot.isData()) {
+      Node = SubtrieHandle(Trie.getRegion(), Slot);
+      continue;
+    }
+
+    // Check for an exact match.
+    TrieRawHashMapHandle::RecordData Found = Trie.getRecord(Slot);
+    if (Found.Proxy.Hash == Hash)
+      return const_pointer(Found.Proxy, Found.getFileOffset());
+    return const_pointer();
+  }
+}
+
+/// Rewrites the header of this subtrie so it can be reused at a deeper
+/// position: \p StartBit must be larger than the current start bit and
+/// \p NumBits must not exceed the current bit count.
+///
+/// Only safe if the subtrie is empty.
+void SubtrieHandle::reinitialize(uint32_t StartBit, uint32_t NumBits) {
+  assert(StartBit > H->StartBit);
+  assert(NumBits <= H->NumBits);
+  // Ideally would also assert that all slots are empty, but that's expensive.
+
+  H->StartBit = StartBit;
+  H->NumBits = NumBits;
+}
+
+/// Finds the record for \p Hash, inserting a new one if none exists yet.
+///
+/// A new record is allocated at most once per call, the first time an empty
+/// slot is encountered. \p OnConstruct (if set) is invoked right after that
+/// allocation, before the record is published into a slot. If a record with
+/// the same hash is found after a new record was already allocated, the
+/// existing record is returned and \p OnLeak (if set) is told about both.
+///
+/// \returns a pointer to the inserted or pre-existing record, or the error
+/// from a failed root, record or subtrie creation.
+Expected<OnDiskTrieRawHashMap::pointer>
+OnDiskTrieRawHashMap::insertLazy(ArrayRef<uint8_t> Hash,
+                                 LazyInsertOnConstructCB OnConstruct,
+                                 LazyInsertOnLeakCB OnLeak) {
+  TrieRawHashMapHandle Trie = Impl->Trie;
+  assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash");
+
+  MappedFileRegionArena &Alloc = Impl->File.getAlloc();
+  std::optional<SubtrieHandle> S;
+  auto Err = Trie.getOrCreateRoot(Alloc).moveInto(S);
+  if (LLVM_UNLIKELY(Err))
+    return std::move(Err);
+
+  TrieHashIndexGenerator IndexGen = Trie.getIndexGen(*S, Hash);
+  size_t Index = IndexGen.next();
+
+  // Walk through the hash bytes and insert into correct trie position.
+  std::optional<TrieRawHashMapHandle::RecordData> NewRecord;
+  // Subtrie left over from a lost race in sink(); reused by later sinks.
+  SubtrieHandle UnusedSubtrie;
+  for (;;) {
+    SubtrieSlotValue Existing = S->load(Index);
+
+    // Try to set it, if it's empty.
+    if (!Existing) {
+      if (!NewRecord) {
+        auto Err = Trie.createRecord(Alloc, Hash).moveInto(NewRecord);
+        if (LLVM_UNLIKELY(Err))
+          return std::move(Err);
+        if (OnConstruct)
+          OnConstruct(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy);
+      }
+
+      if (S->compare_exchange_strong(Index, Existing, NewRecord->Offset))
+        return pointer(NewRecord->Proxy, NewRecord->Offset.asDataFileOffset());
+
+      // Race means that Existing is no longer empty; fall through...
+    }
+
+    // The slot holds a child subtrie: descend and retry with the next index.
+    if (Existing.isSubtrie()) {
+      S = SubtrieHandle(Trie.getRegion(), Existing);
+      Index = IndexGen.next();
+      continue;
+    }
+
+    // Check for an exact match.
+    TrieRawHashMapHandle::RecordData ExistingRecord = Trie.getRecord(Existing);
+    if (ExistingRecord.Proxy.Hash == Hash) {
+      if (NewRecord && OnLeak)
+        OnLeak(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy,
+               ExistingRecord.Offset.asDataFileOffset(), ExistingRecord.Proxy);
+      return pointer(ExistingRecord.Proxy,
+                     ExistingRecord.Offset.asDataFileOffset());
+    }
+
+    // Sink the existing content as long as the indexes match.
+    for (;;) {
+      size_t NextIndex = IndexGen.next();
+      size_t NewIndexForExistingContent =
+          IndexGen.getCollidingBits(ExistingRecord.Proxy.Hash);
+
+      auto Err = S->sink(Index, Existing, Alloc, IndexGen.getNumBits(),
+                         UnusedSubtrie, NewIndexForExistingContent)
+                     .moveInto(S);
+      if (LLVM_UNLIKELY(Err))
+        return std::move(Err);
+      Index = NextIndex;
+
+      // Found the difference.
+      if (NextIndex != NewIndexForExistingContent)
+        break;
+    }
+  }
+}
+
+/// Replaces the value \p V in slot \p I of this subtrie with a new child
+/// subtrie that holds \p V at index \p NewI.
+///
+/// The child is taken from \p UnusedSubtrie when that handle is non-null;
+/// otherwise it is allocated from \p Alloc. If the compare-exchange on slot
+/// \p I fails, the child is emptied again and stored in \p UnusedSubtrie, and
+/// a handle to the subtrie found in the slot is returned instead.
+///
+/// \returns the subtrie now occupying slot \p I, or the allocation error.
+Expected<SubtrieHandle> SubtrieHandle::sink(size_t I, SubtrieSlotValue V,
+                                            MappedFileRegionArena &Alloc,
+                                            size_t NumSubtrieBits,
+                                            SubtrieHandle &UnusedSubtrie,
+                                            size_t NewI) {
+  std::optional<SubtrieHandle> NewS;
+  if (UnusedSubtrie) {
+    // Steal UnusedSubtrie and initialize it.
+    NewS.emplace();
+    std::swap(*NewS, UnusedSubtrie);
+    NewS->reinitialize(getStartBit() + getNumBits(), NumSubtrieBits);
+  } else {
+    // Allocate a new, empty subtrie.
+    auto Err = SubtrieHandle::create(Alloc, getStartBit() + getNumBits(),
+                                     NumSubtrieBits)
+                   .moveInto(NewS);
+    if (LLVM_UNLIKELY(Err))
+      return std::move(Err);
+  }
+
+  // Fill the child's slot first, then try to swap the child into slot I.
+  NewS->store(NewI, V);
+  if (compare_exchange_strong(I, V, NewS->getOffset()))
+    return *NewS; // Success!
+
+  // Raced.
+  assert(V.isSubtrie() && "Expected racing sink() to add a subtrie");
+
+  // Wipe out the new slot so NewS can be reused and set the out parameter.
+  NewS->store(NewI, SubtrieSlotValue());
+  UnusedSubtrie = *NewS;
+
+  // Return the subtrie added by the concurrent sink() call.
+  return SubtrieHandle(Alloc.getRegion(), V);
+}
+
+/// Prints the trie to \p OS by forwarding to the trie handle's print().
+void OnDiskTrieRawHashMap::print(
+    raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const {
+  Impl->Trie.print(OS, PrintRecordData);
+}
+
+/// Validates the trie by forwarding to the trie handle's validate().
+Error OnDiskTrieRawHashMap::validate(
+    function_ref<Error(FileOffset, ConstValueProxy)> RecordVerifier) const {
+  return Impl->Trie.validate(RecordVerifier);
+}
+
+// Prints the bits [StartBit, StartBit + NumBits) of Bytes as lower-case hex
+// digits, most significant nibble of each byte first. Both the start and the
+// length must be multiples of 4, so printing may begin or end mid-byte.
+static void printHexDigits(raw_ostream &OS, ArrayRef<uint8_t> Bytes,
+                           size_t StartBit, size_t NumBits) {
+  assert(StartBit % 4 == 0);
+  assert(NumBits % 4 == 0);
+  const size_t EndBit = StartBit + NumBits;
+  for (size_t Bit = StartBit; Bit != EndBit; Bit += 4) {
+    const uint8_t Byte = Bytes[Bit / 8];
+    // A byte-aligned position selects the high nibble, otherwise the low one.
+    const bool IsHighNibble = Bit % 8 == 0;
+    const uint8_t Nibble = IsHighNibble ? Byte >> 4 : Byte & 0xf;
+    OS << hexdigit(Nibble, /*LowerCase=*/true);
+  }
+}
+
+// Prints the bits [StartBit, StartBit + NumBits) of Bytes as '0'/'1'
+// characters, most significant bit of each byte first.
+static void printBits(raw_ostream &OS, ArrayRef<uint8_t> Bytes, size_t StartBit,
+                      size_t NumBits) {
+  assert(StartBit + NumBits <= Bytes.size() * 8u);
+  const size_t EndBit = StartBit + NumBits;
+  for (size_t Bit = StartBit; Bit != EndBit; ++Bit) {
+    // Bit position 0 within a byte is its most significant bit.
+    const size_t Shift = 7 - Bit % 8;
+    const bool IsSet = (Bytes[Bit / 8] >> Shift) & 0x1;
+    OS << (IsSet ? '1' : '0');
+  }
+}
+
+/// Prints the hash \p Bytes in three parts: hex digits before this subtrie's
+/// bit range, the nibbles overlapping the range as binary inside brackets,
+/// and hex digits for the remainder. Example shape:
+///   afb[1c:00*01110*0]def
+void SubtrieHandle::printHash(raw_ostream &OS, ArrayRef<uint8_t> Bytes) const {
+  // Widen the subtrie's bit range outwards to nibble boundaries.
+  const size_t FirstBinaryBit = getStartBit() & ~0x3u;
+  const size_t EndBit = getStartBit() + getNumBits();
+  const size_t LastBinaryBit = (EndBit + 3u) & ~0x3u;
+
+  printHexDigits(OS, Bytes, 0, FirstBinaryBit);
+
+  OS << "[";
+  printBits(OS, Bytes, FirstBinaryBit, LastBinaryBit - FirstBinaryBit);
+  OS << "]";
+
+  const size_t HashEndBit = Bytes.size() * 8u;
+  printHexDigits(OS, Bytes, LastBinaryBit, HashEndBit - LastBinaryBit);
+}
+
+// Appends Index to Prefix as a binary string, most significant bit first.
+// The number of digits is the number of doublings needed for 1 to reach
+// NumSlots.
+static void appendIndexBits(std::string &Prefix, size_t Index,
+                            size_t NumSlots) {
+  unsigned Width = 0;
+  for (size_t Capacity = 1u; Capacity < NumSlots; Capacity <<= 1)
+    ++Width;
+  for (unsigned Bit = Width; Bit-- > 0;)
+    Prefix += static_cast<char>('0' + ((Index >> Bit) & 0x1));
+}
+
+// Prints a prefix given as a string of binary digits: each full group of four
+// digits becomes one lower-case hex digit, and any remaining digits are
+// printed as binary inside brackets.
+static void printPrefix(raw_ostream &OS, StringRef Prefix) {
+  for (; Prefix.size() >= 4; Prefix = Prefix.drop_front(4)) {
+    uint8_t Nibble;
+    bool ErrorParsingBinary = Prefix.take_front(4).getAsInteger(2, Nibble);
+    assert(!ErrorParsingBinary);
+    (void)ErrorParsingBinary;
+    OS << hexdigit(Nibble, /*LowerCase=*/true);
+  }
+  if (Prefix.empty())
+    return;
+  OS << "[" << Prefix << "]";
+}
+
+/// Prints the trie to the debug stream.
+LLVM_DUMP_METHOD void OnDiskTrieRawHashMap::dump() const { print(dbgs()); }
+
+/// Resolves an optional table parameter.
+///
+/// \returns \p Default when \p Value is not set, \p Value when it does not
+/// exceed \p Max, and otherwise a table configuration error naming \p Label.
+static Expected<size_t> checkParameter(StringRef Label, size_t Max,
+                                       std::optional<size_t> Value,
+                                       std::optional<size_t> Default,
+                                       StringRef Path, StringRef TableName) {
+  assert(Value || Default);
+  assert(!Default || *Default <= Max);
+
+  // Nothing requested explicitly: fall back to the default.
+  if (!Value.has_value())
+    return *Default;
+
+  const size_t Requested = *Value;
+  if (Requested > Max)
+    return createTableConfigError(
+        std::errc::argument_out_of_domain, Path, TableName,
+        "invalid " + Label + ": " + Twine(Requested) + " (max: " + Twine(Max) +
+            ")");
+  return Requested;
+}
+
+/// Returns the size reported by the underlying database file.
+size_t OnDiskTrieRawHashMap::size() const { return Impl->File.size(); }
+/// Returns the size of the mapped region backing the database file.
+size_t OnDiskTrieRawHashMap::capacity() const {
+  return Impl->File.getRegion().size();
+}
+
+/// Opens the database file at \p PathTwine, creating it with a trie table
+/// named \p TrieNameTwine if the file does not exist, and checks that the
+/// table found matches the requested hash and data sizes.
+///
+/// The root and subtrie bit counts are only passed to the constructor used
+/// for a new file; they default to 10 and 6 when not provided.
+/// NOTE(review): \p NewFileInitialSize is not referenced in this function
+/// body - confirm whether that is intended.
+///
+/// \returns the opened map, or an error if a parameter is out of range, the
+/// file cannot be opened or created, the table is missing, or the table does
+/// not match the request.
+Expected<OnDiskTrieRawHashMap>
+OnDiskTrieRawHashMap::create(const Twine &PathTwine, const Twine &TrieNameTwine,
+                             size_t NumHashBits, uint64_t DataSize,
+                             uint64_t MaxFileSize,
+                             std::optional<uint64_t> NewFileInitialSize,
+                             std::optional<size_t> NewTableNumRootBits,
+                             std::optional<size_t> NewTableNumSubtrieBits) {
+  SmallString<128> PathStorage;
+  StringRef Path = PathTwine.toStringRef(PathStorage);
+  SmallString<128> TrieNameStorage;
+  StringRef TrieName = TrieNameTwine.toStringRef(TrieNameStorage);
+
+  constexpr size_t DefaultNumRootBits = 10;
+  constexpr size_t DefaultNumSubtrieBits = 6;
+
+  // Resolve the root and subtrie bit counts against their limits.
+  size_t NumRootBits;
+  if (Error E = checkParameter(
+                    "root bits", TrieRawHashMapHandle::MaxNumRootBits,
+                    NewTableNumRootBits, DefaultNumRootBits, Path, TrieName)
+                    .moveInto(NumRootBits))
+    return std::move(E);
+
+  size_t NumSubtrieBits;
+  if (Error E = checkParameter("subtrie bits",
+                               TrieRawHashMapHandle::MaxNumSubtrieBits,
+                               NewTableNumSubtrieBits, DefaultNumSubtrieBits,
+                               Path, TrieName)
+                    .moveInto(NumSubtrieBits))
+    return std::move(E);
+
+  // The hash size must be within range and a whole number of bytes.
+  size_t NumHashBytes = NumHashBits >> 3;
+  if (Error E =
+          checkParameter("hash size", TrieRawHashMapHandle::MaxNumHashBits,
+                         NumHashBits, std::nullopt, Path, TrieName)
+              .takeError())
+    return std::move(E);
+  assert(NumHashBits == NumHashBytes << 3 &&
+         "Expected hash size to be byte-aligned");
+  if (NumHashBits != NumHashBytes << 3)
+    return createTableConfigError(
+        std::errc::argument_out_of_domain, Path, TrieName,
+        "invalid hash size: " + Twine(NumHashBits) + " (not byte-aligned)");
+
+  // Constructor for if the file doesn't exist.
+  auto NewDBConstructor = [&](DatabaseFile &DB) -> Error {
+    auto Trie =
+        TrieRawHashMapHandle::create(DB.getAlloc(), TrieName, NumRootBits,
+                                     NumSubtrieBits, NumHashBits, DataSize);
+    if (LLVM_UNLIKELY(!Trie))
+      return Trie.takeError();
+
+    return DB.addTable(*Trie);
+  };
+
+  // Get or create the file.
+  Expected<DatabaseFile> File =
+      DatabaseFile::create(Path, MaxFileSize, NewDBConstructor);
+  if (!File)
+    return File.takeError();
+
+  // Find the trie and validate it.
+  std::optional<TableHandle> Table = File->findTable(TrieName);
+  if (!Table)
+    return createTableConfigError(std::errc::argument_out_of_domain, Path,
+                                  TrieName, "table not found");
+  if (Error E = checkTable("table kind", (size_t)TrieRawHashMapHandle::Kind,
+                           (size_t)Table->getHeader().Kind, Path, TrieName))
+    return std::move(E);
+  auto Trie = Table->cast<TrieRawHashMapHandle>();
+  assert(Trie && "Already checked the kind");
+
+  // Check the hash and data size.
+  if (Error E = checkTable("hash size", NumHashBits, Trie.getNumHashBits(),
+                           Path, TrieName))
+    return std::move(E);
+  if (Error E = checkTable("data size", DataSize, Trie.getRecordDataSize(),
+                           Path, TrieName))
+    return std::move(E);
+
+  // No flags supported right now. Either corrupt, or coming from a future
+  // writer.
+  if (size_t Flags = Trie.getFlags())
+    return createTableConfigError(std::errc::invalid_argument, Path, TrieName,
+                                  "unsupported flags: " + Twine(Flags));
+
+  // Success.
+  OnDiskTrieRawHashMap::ImplType Impl{DatabaseFile(std::move(*File)), Trie};
+  return OnDiskTrieRawHashMap(std::make_unique<ImplType>(std::move(Impl)));
+}
+
+/// Builds a protocol_error whose message names the hex \p Offset of the bad
+/// trie node or slot, followed by \p Msg.
+static Error createInvalidTrieError(uint64_t Offset, const Twine &Msg) {
+  const std::string Location =
+      "invalid trie at 0x" + utohexstr(Offset, /*LowerCase=*/true) + ": ";
+  return createStringError(make_error_code(std::errc::protocol_error),
+                           Location + Msg);
+}
+
+//===----------------------------------------------------------------------===//
+// TrieVisitor data structures.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// A multi-threaded visitor to traverse the Trie.
+///
+/// TODO: add more sanity checks that aren't just plain data corruption. For
+/// example, some ill-formed data can be constructed to form a cycle using
+/// Sub-Tries and it can lead to an infinite loop when visiting (or inserting
+/// data).
+class TrieVisitor {
+public:
+  /// \p ThreadCount is forwarded to hardware_concurrency() to size the thread
+  /// pool; \p ErrorLimit caps how many errors are collected.
+  TrieVisitor(TrieRawHashMapHandle Trie, unsigned ThreadCount = 0,
+              unsigned ErrorLimit = 50)
+      : Trie(Trie), ErrorLimit(ErrorLimit),
+        Threads(hardware_concurrency(ThreadCount)) {}
+  virtual ~TrieVisitor() = default;
+  /// Visits the root on the calling thread, then each subtrie referenced from
+  /// the root as a task on the thread pool.
+  Error visit();
+
+private:
+  // Virtual method to implement the action when visiting a sub-trie.
+  virtual Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) {
+    return Error::success();
+  }
+
+  // Virtual method to implement the action when visiting a slot in a trie node.
+  virtual Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix,
+                          SubtrieSlotValue Slot) {
+    return Error::success();
+  }
+
+protected:
+  TrieRawHashMapHandle Trie;
+
+private:
+  // Validates and visits one non-root node, then recurses into its children.
+  Error traverseTrieNode(SubtrieHandle Node, StringRef Prefix);
+
+  // Checks a node's bounds and header fields against the trie configuration.
+  Error validateSubTrie(SubtrieHandle Node, bool IsRoot);
+
+  // Helper function to capture errors when visiting the trie nodes.
+  void addError(Error NewError) {
+    assert(NewError && "not an error");
+    std::lock_guard<std::mutex> ErrorLock(Lock);
+    if (NumError >= ErrorLimit) {
+      // Too many errors.
+      consumeError(std::move(NewError));
+      return;
+    }
+
+    if (Err)
+      Err = joinErrors(std::move(*Err), std::move(NewError));
+    else
+      Err = std::move(NewError);
+    NumError++;
+  }
+
+  // True once an error is recorded and the error limit has been reached.
+  bool tooManyErrors() {
+    std::lock_guard<std::mutex> ErrorLock(Lock);
+    return (bool)Err && NumError >= ErrorLimit;
+  }
+
+  const unsigned ErrorLimit;
+  // Errors collected so far; addError() and tooManyErrors() access it under
+  // Lock.
+  std::optional<Error> Err;
+  unsigned NumError = 0;
+  std::mutex Lock;
+  DefaultThreadPool Threads;
+};
+
+/// A visitor that traverses the trie and prints every node and slot to a
+/// stream. Data offsets seen along the way are remembered so the records can
+/// be printed afterwards with printRecords().
+class TriePrinter : public TrieVisitor {
+public:
+  TriePrinter(TrieRawHashMapHandle Trie, raw_ostream &OS,
+              function_ref<void(ArrayRef<char>)> PrintRecordData)
+      : TrieVisitor(Trie, /*ThreadCount=*/1), OS(OS),
+        PrintRecordData(PrintRecordData) {}
+
+  /// Prints all records collected during the visit, sorted by offset. Prints
+  /// nothing when no record was seen.
+  Error printRecords() {
+    if (!Records.empty()) {
+      OS << "records\n";
+      llvm::sort(Records);
+      for (int64_t RecordOffset : Records) {
+        TrieRawHashMapHandle::RecordData Record =
+            Trie.getRecord(SubtrieSlotValue::getDataOffset(RecordOffset));
+        if (auto E = printRecord(Record))
+          return E;
+      }
+    }
+    return Error::success();
+  }
+
+  /// Prints one record: its address, then its data through the user callback
+  /// if one was given, or as hex bytes otherwise.
+  Error printRecord(TrieRawHashMapHandle::RecordData &Record) {
+    OS << "- addr=" << (void *)Record.getFileOffset().get() << " ";
+    if (!PrintRecordData) {
+      OS << "bytes=";
+      ArrayRef<uint8_t> Bytes(
+          reinterpret_cast<const uint8_t *>(Record.Proxy.Data.data()),
+          Record.Proxy.Data.size());
+      printHexDigits(OS, Bytes, 0, Bytes.size() * 8);
+    } else {
+      PrintRecordData(Record.Proxy.Data);
+    }
+    OS << "\n";
+    return Error::success();
+  }
+
+  /// Prints the header line of a node: "root" for an empty prefix, otherwise
+  /// the prefix, followed by the node's offset in the region and slot count.
+  Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) override {
+    if (!Prefix.empty()) {
+      OS << "subtrie=";
+      printPrefix(OS, Prefix);
+    } else {
+      OS << "root";
+    }
+
+    const char *HeaderAddr =
+        reinterpret_cast<const char *>(&SubTrie.getHeader());
+    OS << " addr=" << (void *)(HeaderAddr - Trie.getRegion().data());
+    OS << " num-slots=" << SubTrie.getNumSlots() << "\n";
+    return Error::success();
+  }
+
+  /// Prints one occupied slot. Data slots also record their offset for
+  /// printRecords().
+  Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix,
+                  SubtrieSlotValue Slot) override {
+    OS << "- index=";
+    // Zero-pad the index to the width of the largest index in this node.
+    for (size_t Pad : {10, 100, 1000})
+      if (I < Pad && Subtrie.getNumSlots() >= Pad)
+        OS << "0";
+    OS << I << " ";
+
+    if (!Slot.isSubtrie()) {
+      TrieRawHashMapHandle::RecordData Record = Trie.getRecord(Slot);
+      OS << "addr=" << (void *)Record.getFileOffset().get();
+      OS << " content=";
+      Subtrie.printHash(OS, Record.Proxy.Hash);
+      OS << "\n";
+      Records.push_back(Slot.asData());
+      return Error::success();
+    }
+
+    OS << "addr=" << (void *)Slot.asSubtrie();
+    OS << " subtrie=";
+    printPrefix(OS, Prefix);
+    OS << "\n";
+    return Error::success();
+  }
+
+private:
+  raw_ostream &OS;
+  function_ref<void(ArrayRef<char>)> PrintRecordData;
+  SmallVector<int64_t> Records;
+};
+
+/// TrieVerifier that adds additional verification on top of the basic visitor:
+/// every data slot is checked for alignment and bounds and then handed to the
+/// user-provided record verifier.
+class TrieVerifier : public TrieVisitor {
+public:
+  TrieVerifier(
+      TrieRawHashMapHandle Trie,
+      function_ref<Error(FileOffset, OnDiskTrieRawHashMap::ConstValueProxy)>
+          RecordVerifier)
+      : TrieVisitor(Trie), RecordVerifier(RecordVerifier) {}
+
+private:
+  Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) final {
+    return Error::success();
+  }
+
+  /// Verifies one slot. Subtrie slots, and all slots when no record verifier
+  /// was given, are accepted without further checks.
+  Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix,
+                  SubtrieSlotValue Slot) final {
+    if (RecordVerifier && Slot.isData()) {
+      if (!isAligned(MappedFileRegionArena::getAlign(), Slot.asData()))
+        return createInvalidTrieError(Slot.asData(), "mis-aligned data entry");
+
+      // The whole record, not just its first byte, must lie inside the mapped
+      // region before views of it are handed to the verifier.
+      if ((uint64_t)Slot.asData() + Trie.getRecordSize() >
+          (uint64_t)Trie.getRegion().size())
+        return createInvalidTrieError(Slot.asData(),
+                                      "data entry spans out of bound");
+
+      TrieRawHashMapHandle::RecordData Record =
+          Trie.getRecord(SubtrieSlotValue::getDataOffset(Slot.asData()));
+      return RecordVerifier(Slot.asDataFileOffset(),
+                            OnDiskTrieRawHashMap::ConstValueProxy{
+                                Record.Proxy.Hash, Record.Proxy.Data});
+    }
+    return Error::success();
+  }
+
+  function_ref<Error(FileOffset, OnDiskTrieRawHashMap::ConstValueProxy)>
+      RecordVerifier;
+};
+} // namespace
+
+/// Visits the whole trie.
+///
+/// The root node and its slots are handled on the calling thread, where the
+/// first error is returned immediately. Each subtrie referenced from the root
+/// is then traversed as a task on the thread pool; errors from those tasks
+/// are collected with addError() and returned after all tasks finish.
+Error TrieVisitor::visit() {
+  auto Root = Trie.getRoot();
+  if (!Root)
+    return Error::success();
+
+  if (auto Err = validateSubTrie(Root, /*IsRoot=*/true))
+    return Err;
+
+  if (auto Err = visitSubTrie("", Root))
+    return Err;
+
+  // Subtries found in the root, with the bit prefix leading to each of them.
+  SmallVector<SubtrieHandle> Subs;
+  SmallVector<std::string> Prefixes;
+  const size_t NumSlots = Root.getNumSlots();
+  for (size_t I = 0, E = NumSlots; I != E; ++I) {
+    SubtrieSlotValue Slot = Root.load(I);
+    if (!Slot)
+      continue;
+    uint64_t Offset = Slot.isSubtrie() ? Slot.asSubtrie() : Slot.asData();
+    if (Offset >= (uint64_t)Trie.getRegion().size())
+      return createInvalidTrieError(Offset, "slot points out of bound");
+    std::string SubtriePrefix;
+    appendIndexBits(SubtriePrefix, I, NumSlots);
+    if (Slot.isSubtrie()) {
+      SubtrieHandle S(Trie.getRegion(), Slot);
+      Subs.push_back(S);
+      Prefixes.push_back(SubtriePrefix);
+    }
+    if (auto Err = visitSlot(I, Root, SubtriePrefix, Slot))
+      return Err;
+  }
+
+  // The tasks capture Subs and Prefixes by reference; both stay alive because
+  // Threads.wait() below runs before this function returns.
+  for (size_t I = 0, E = Subs.size(); I != E; ++I) {
+    Threads.async(
+        [&](unsigned Idx) {
+          // Don't run if there is an error already.
+          if (tooManyErrors())
+            return;
+          if (auto Err = traverseTrieNode(Subs[Idx], Prefixes[Idx]))
+            addError(std::move(Err));
+        },
+        I);
+  }
+
+  Threads.wait();
+  if (Err)
+    return std::move(*Err);
+  return Error::success();
+}
+
+/// Checks that \p Node lies inside the mapped region and that its header
+/// fields are consistent with the trie configuration.
+///
+/// The root only has to start at bit 0. Any other node must not extend past
+/// the hash size and must not use more index bits than the table's subtrie
+/// bit count.
+Error TrieVisitor::validateSubTrie(SubtrieHandle Node, bool IsRoot) {
+  char *Addr = reinterpret_cast<char *>(&Node.getHeader());
+  const int64_t Offset = Node.getFileOffset().get();
+  // A node that ends exactly at the end of the region is still in bounds, so
+  // only reject nodes whose end lies strictly past it.
+  if (Addr + Node.getSize() >
+      Trie.getRegion().data() + Trie.getRegion().size())
+    return createInvalidTrieError(Offset, "subtrie node spans out of bound");
+
+  if (IsRoot) {
+    if (Node.getStartBit() != 0)
+      return createInvalidTrieError(Offset,
+                                    "root node doesn't start at 0 index");
+
+    return Error::success();
+  }
+
+  if (Node.getStartBit() + Node.getNumBits() > Trie.getNumHashBits())
+    return createInvalidTrieError(Offset,
+                                  "subtrie represents too many hash bits");
+
+  if (Node.getNumBits() > Trie.getNumSubtrieBits())
+    return createInvalidTrieError(Offset, "subtrie has wrong number of slots");
+
+  return Error::success();
+}
+
+/// Validates and visits the non-root node \p Node, whose position in the trie
+/// is described by the bit string \p Prefix, then recurses into each child
+/// subtrie. The first error encountered stops the traversal.
+Error TrieVisitor::traverseTrieNode(SubtrieHandle Node, StringRef Prefix) {
+  if (auto E = validateSubTrie(Node, /*IsRoot=*/false))
+    return E;
+
+  if (auto E = visitSubTrie(Prefix, Node))
+    return E;
+
+  // Child subtries of this node, with the bit prefix leading to each.
+  SmallVector<SubtrieHandle> ChildNodes;
+  SmallVector<std::string> ChildPrefixes;
+  const size_t SlotCount = Node.getNumSlots();
+  for (size_t SlotIdx = 0; SlotIdx != SlotCount; ++SlotIdx) {
+    SubtrieSlotValue Slot = Node.load(SlotIdx);
+    if (!Slot)
+      continue;
+
+    // Whatever the slot refers to has to start inside the region.
+    const bool IsSubtrie = Slot.isSubtrie();
+    uint64_t Target;
+    if (IsSubtrie)
+      Target = Slot.asSubtrie();
+    else
+      Target = Slot.asData();
+    if (Target >= (uint64_t)Trie.getRegion().size())
+      return createInvalidTrieError(Target, "slot points out of bound");
+
+    std::string ChildPrefix = Prefix.str();
+    appendIndexBits(ChildPrefix, SlotIdx, SlotCount);
+    if (IsSubtrie) {
+      ChildNodes.push_back(SubtrieHandle(Trie.getRegion(), Slot));
+      ChildPrefixes.push_back(ChildPrefix);
+    }
+    if (auto E = visitSlot(SlotIdx, Node, ChildPrefix, Slot))
+      return E;
+  }
+
+  for (size_t Child = 0, NumChildren = ChildNodes.size(); Child != NumChildren;
+       ++Child)
+    if (auto E = traverseTrieNode(ChildNodes[Child], ChildPrefixes[Child]))
+      return E;
+
+  return Error::success();
+}
+
+/// Prints a summary line with the table parameters, then the trie structure,
+/// then the records. Errors from either pass are written to \p OS as
+/// "error: ..." lines instead of being returned.
+void TrieRawHashMapHandle::print(
+    raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const {
+  OS << "hash-num-bits=" << getNumHashBits();
+  OS << " hash-size=" << getNumHashBytes();
+  OS << " record-data-size=" << getRecordDataSize() << "\n";
+
+  // Checks the result of a pass and prints it if it is an error.
+  auto ReportIfError = [&OS](Error E) {
+    if (E)
+      OS << "error: " << toString(std::move(E)) << "\n";
+  };
+
+  TriePrinter Printer(*this, OS, PrintRecordData);
+  ReportIfError(Printer.visit());
+  ReportIfError(Printer.printRecords());
+}
+
+/// Validates the trie in two passes: a structural pass with the base visitor,
+/// and, only if that succeeds, a pass that runs \p RecordVerifier on the data
+/// records.
+Error TrieRawHashMapHandle::validate(
+    function_ref<Error(FileOffset, OnDiskTrieRawHashMap::ConstValueProxy)>
+        RecordVerifier) const {
+  // Use the base TrieVisitor to identify the errors inside trie first.
+  TrieVisitor StructureChecker(*this);
+  Error StructureErr = StructureChecker.visit();
+  if (StructureErr)
+    return StructureErr;
+
+  // If the trie data structure is sound, do a second pass to verify data and
+  // verifier function can assume the index is correct. However, there can be
+  // newly added bad entries that can still produce error.
+  TrieVerifier RecordChecker(*this, RecordVerifier);
+  return RecordChecker.visit();
+}
+
+#else // !LLVM_ENABLE_ONDISK_CAS
+
+/// Empty placeholder used when the on-disk CAS is disabled.
+struct OnDiskTrieRawHashMap::ImplType {};
+
+/// Stub for builds without the on-disk CAS: always fails with not_supported.
+Expected<OnDiskTrieRawHashMap>
+OnDiskTrieRawHashMap::create(const Twine &PathTwine, const Twine &TrieNameTwine,
+                             size_t NumHashBits, uint64_t DataSize,
+                             uint64_t MaxFileSize,
+                             std::optional<uint64_t> NewFileInitialSize,
+                             std::optional<size_t> NewTableNumRootBits,
+                             std::optional<size_t> NewTableNumSubtrieBits) {
+  return createStringError(make_error_code(std::errc::not_supported),
+                           "OnDiskTrieRawHashMap is not supported");
+}
+
+/// Stub for builds without the on-disk CAS: always fails with not_supported.
+Expected<OnDiskTrieRawHashMap::pointer>
+OnDiskTrieRawHashMap::insertLazy(ArrayRef<uint8_t> Hash,
+                                 LazyInsertOnConstructCB OnConstruct,
+                                 LazyInsertOnLeakCB OnLeak) {
+  return createStringError(make_error_code(std::errc::not_supported),
+                           "OnDiskTrieRawHashMap is not supported");
+}
+
+/// Stub for builds without the on-disk CAS: always fails with not_supported.
+Expected<OnDiskTrieRawHashMap::const_pointer>
+OnDiskTrieRawHashMap::recoverFromFileOffset(FileOffset Offset) const {
+  return createStringError(make_error_code(std::errc::not_supported),
+                           "OnDiskTrieRawHashMap is not supported");
+}
+
+/// Stub for builds without the on-disk CAS: always returns a null pointer.
+OnDiskTrieRawHashMap::const_pointer
+OnDiskTrieRawHashMap::find(ArrayRef<uint8_t> Hash) const {
+  return const_pointer();
+}
+
+/// Stub for builds without the on-disk CAS: prints nothing.
+void OnDiskTrieRawHashMap::print(
+    raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const {
+}
+
+/// Stub for builds without the on-disk CAS: always fails with not_supported.
+Error OnDiskTrieRawHashMap::validate(
+    function_ref<Error(FileOffset, OnDiskTrieRawHashMap::ConstValueProxy)>
+        RecordVerifier) const {
+  return createStringError(make_error_code(std::errc::not_supported),
+                           "OnDiskTrieRawHashMap is not supported");
+}
+
+/// Stub for builds without the on-disk CAS: always 0.
+size_t OnDiskTrieRawHashMap::size() const { return 0; }
+/// Stub for builds without the on-disk CAS: always 0.
+size_t OnDiskTrieRawHashMap::capacity() const { return 0; }
+
+#endif // LLVM_ENABLE_ONDISK_CAS
+
+// Special members are defined here, outside the LLVM_ENABLE_ONDISK_CAS
+// conditional, after ImplType has been defined in either configuration.
+
+/// Takes ownership of the implementation object.
+OnDiskTrieRawHashMap::OnDiskTrieRawHashMap(std::unique_ptr<ImplType> Impl)
+    : Impl(std::move(Impl)) {}
+OnDiskTrieRawHashMap::OnDiskTrieRawHashMap(OnDiskTrieRawHashMap &&RHS) =
+    default;
+OnDiskTrieRawHashMap &
+OnDiskTrieRawHashMap::operator=(OnDiskTrieRawHashMap &&RHS) = default;
+OnDiskTrieRawHashMap::~OnDiskTrieRawHashMap() = default;

diff  --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt
index ab709e30369bf..0f8fcb9e98954 100644
--- a/llvm/unittests/CAS/CMakeLists.txt
+++ b/llvm/unittests/CAS/CMakeLists.txt
@@ -1,7 +1,3 @@
-if (LLVM_ENABLE_ONDISK_CAS)
-  add_definitions(-DLLVM_ENABLE_ONDISK_CAS=1)
-endif()
-
 set(LLVM_LINK_COMPONENTS
   Support
   CAS
@@ -12,6 +8,7 @@ add_llvm_unittest(CASTests
   ActionCacheTest.cpp
   CASTestConfig.cpp
   ObjectStoreTest.cpp
+  OnDiskTrieRawHashMapTest.cpp
   ProgramTest.cpp
   )
 

diff  --git a/llvm/unittests/CAS/OnDiskTrieRawHashMapTest.cpp b/llvm/unittests/CAS/OnDiskTrieRawHashMapTest.cpp
new file mode 100644
index 0000000000000..7bedfe4b29e30
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskTrieRawHashMapTest.cpp
@@ -0,0 +1,220 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskTrieRawHashMap.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+
+#if LLVM_ENABLE_ONDISK_CAS
+using namespace llvm;
+using namespace llvm::cas;
+
+namespace {
+
+struct OnDiskTrieRawHashMapTestFixture
+    : public ::testing::TestWithParam<size_t> {
+  static constexpr size_t MB = 1024u * 1024u;
+  static constexpr size_t DataSize = 8; // Multiple of 8B.
+
+  std::optional<unittest::TempDir> Temp;
+  size_t NumHashBytes;
+
+  void SetUp() override {
+    Temp.emplace("trie-raw-hash-map", /*Unique=*/true);
+    NumHashBytes = GetParam();
+  }
+  void TearDown() override { Temp.reset(); }
+
+  Expected<OnDiskTrieRawHashMap> createTrie() {
+    size_t NumHashBits = NumHashBytes * 8;
+    return OnDiskTrieRawHashMap::create(
+        Temp->path((Twine(NumHashBytes) + "B").str()), "index",
+        /*NumHashBits=*/NumHashBits, DataSize, /*MaxFileSize=*/MB,
+        /*NewInitialFileSize=*/std::nullopt);
+  }
+};
+
+// Create tries with various sizes of hash and with data.
+TEST_P(OnDiskTrieRawHashMapTestFixture, General) {
+  std::optional<OnDiskTrieRawHashMap> Trie1;
+  ASSERT_THAT_ERROR(createTrie().moveInto(Trie1), Succeeded());
+  std::optional<OnDiskTrieRawHashMap> Trie2;
+  ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded());
+
+  uint8_t Hash0Bytes[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+  uint8_t Hash1Bytes[8] = {1, 0, 0, 0, 0, 0, 0, 0};
+  auto Hash0 = ArrayRef(Hash0Bytes).take_front(NumHashBytes);
+  auto Hash1 = ArrayRef(Hash1Bytes).take_front(NumHashBytes);
+  constexpr StringLiteral Data0v1Bytes = "data0.v1";
+  constexpr StringLiteral Data0v2Bytes = "data0.v2";
+  constexpr StringLiteral Data1Bytes = "data1...";
+  static_assert(Data0v1Bytes.size() == DataSize, "math error");
+  static_assert(Data0v2Bytes.size() == DataSize, "math error");
+  static_assert(Data1Bytes.size() == DataSize, "math error");
+  ArrayRef<char> Data0v1 = ArrayRef(Data0v1Bytes.data(), Data0v1Bytes.size());
+  ArrayRef<char> Data0v2 = ArrayRef(Data0v2Bytes.data(), Data0v2Bytes.size());
+  ArrayRef<char> Data1 = ArrayRef(Data1Bytes.data(), Data1Bytes.size());
+
+  // Lookup when trie is empty.
+  EXPECT_FALSE(Trie1->find(Hash0));
+
+  // Insert.
+  std::optional<FileOffset> Offset;
+  std::optional<MutableArrayRef<char>> Data;
+  {
+    std::optional<OnDiskTrieRawHashMap::pointer> Insertion;
+    ASSERT_THAT_ERROR(Trie1->insert({Hash0, Data0v1}).moveInto(Insertion),
+                      Succeeded());
+    EXPECT_EQ(Hash0, (*Insertion)->Hash);
+    EXPECT_EQ(Data0v1, (*Insertion)->Data);
+    EXPECT_TRUE(isAddrAligned(Align(8), (*Insertion)->Data.data()));
+
+    Offset = Insertion->getOffset();
+    Data = (*Insertion)->Data;
+  }
+
+  // Find.
+  {
+    auto Lookup = Trie1->find(Hash0);
+    ASSERT_TRUE(Lookup);
+    EXPECT_EQ(Hash0, Lookup->Hash);
+    EXPECT_EQ(Data0v1, Lookup->Data);
+    EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+  }
+
+  // Find in a different instance of the same on-disk trie that existed
+  // before the insertion.
+  {
+    auto Lookup = Trie2->find(Hash0);
+    ASSERT_TRUE(Lookup);
+    EXPECT_EQ(Hash0, Lookup->Hash);
+    EXPECT_EQ(Data0v1, Lookup->Data);
+    EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+  }
+
+  // Create a new instance and check that too.
+  Trie2.reset();
+  ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded());
+  {
+    auto Lookup = Trie2->find(Hash0);
+    ASSERT_TRUE(Lookup);
+    EXPECT_EQ(Hash0, Lookup->Hash);
+    EXPECT_EQ(Data0v1, Lookup->Data);
+    EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+  }
+
+  // Change the data.
+  llvm::copy(Data0v2, Data->data());
+  {
+    auto Lookup = Trie2->find(Hash0);
+    ASSERT_TRUE(Lookup);
+    EXPECT_EQ(Hash0, Lookup->Hash);
+    EXPECT_EQ(Data0v2, Lookup->Data);
+    EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+  }
+
+  // Find different hash.
+  EXPECT_FALSE(Trie1->find(Hash1));
+  EXPECT_FALSE(Trie2->find(Hash1));
+
+  // Recover from an offset.
+  {
+    OnDiskTrieRawHashMap::const_pointer Recovered;
+    ASSERT_THAT_ERROR(Trie1->recoverFromFileOffset(*Offset).moveInto(Recovered),
+                      Succeeded());
+    ASSERT_TRUE(Recovered);
+    EXPECT_EQ(Offset->get(), Recovered.getOffset().get());
+    EXPECT_EQ(Hash0, Recovered->Hash);
+    EXPECT_EQ(Data0v2, Recovered->Data);
+  }
+
+  // Recover from a bad offset.
+  {
+    FileOffset BadOffset(1);
+    OnDiskTrieRawHashMap::const_pointer Recovered;
+    ASSERT_THAT_ERROR(
+        Trie1->recoverFromFileOffset(BadOffset).moveInto(Recovered), Failed());
+  }
+
+  // Insert another thing.
+  {
+    std::optional<OnDiskTrieRawHashMap::pointer> Insertion;
+    ASSERT_THAT_ERROR(Trie1->insert({Hash1, Data1}).moveInto(Insertion),
+                      Succeeded());
+    EXPECT_EQ(Hash1, (*Insertion)->Hash);
+    EXPECT_EQ(Data1, (*Insertion)->Data);
+    EXPECT_TRUE(isAddrAligned(Align(8), (*Insertion)->Data.data()));
+
+    EXPECT_NE(Offset->get(), Insertion->getOffset().get());
+  }
+
+  // Validate.
+  {
+    auto RecordVerify =
+        [&](FileOffset Offset,
+            OnDiskTrieRawHashMap::ConstValueProxy Proxy) -> Error {
+      if (Proxy.Hash.size() != NumHashBytes)
+        return createStringError("wrong hash size");
+      if (Proxy.Data.size() != DataSize)
+        return createStringError("wrong data size");
+
+      return Error::success();
+    };
+    ASSERT_THAT_ERROR(Trie1->validate(RecordVerify), Succeeded());
+    ASSERT_THAT_ERROR(Trie2->validate(RecordVerify), Succeeded());
+  }
+
+  // Size and capacity.
+  {
+    EXPECT_EQ(Trie1->capacity(), MB);
+    EXPECT_EQ(Trie2->capacity(), MB);
+    EXPECT_LE(Trie1->size(), MB);
+    EXPECT_LE(Trie2->size(), MB);
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(OnDiskTrieRawHashMapTest,
+                         OnDiskTrieRawHashMapTestFixture,
+                         ::testing::Values(1, 2, 4, 8));
+
+TEST(OnDiskTrieRawHashMapTest, OutOfSpace) {
+  unittest::TempDir Temp("trie-raw-hash-map", /*Unique=*/true);
+  std::optional<OnDiskTrieRawHashMap> Trie;
+
+  // Too small to create header.
+  ASSERT_THAT_ERROR(OnDiskTrieRawHashMap::create(
+                        Temp.path("NoSpace1").str(), "index",
+                        /*NumHashBits=*/8, /*DataSize=*/8, /*MaxFileSize=*/8,
+                        /*NewInitialFileSize=*/std::nullopt)
+                        .moveInto(Trie),
+                    Failed());
+
+  // Just enough for root node but not enough for any insertion.
+  ASSERT_THAT_ERROR(OnDiskTrieRawHashMap::create(
+                        Temp.path("NoSpace2").str(), "index",
+                        /*NumHashBits=*/8, /*DataSize=*/8, /*MaxFileSize=*/118,
+                        /*NewInitialFileSize=*/std::nullopt,
+                        /*NewTableNumRootBits=*/1, /*NewTableNumSubtrieBits=*/1)
+                        .moveInto(Trie),
+                    Succeeded());
+  uint8_t Hash0Bytes[1] = {0};
+  auto Hash0 = ArrayRef(Hash0Bytes);
+  constexpr StringLiteral Data0v1Bytes = "data0.v1";
+  ArrayRef<char> Data0v1 = ArrayRef(Data0v1Bytes.data(), Data0v1Bytes.size());
+  std::optional<OnDiskTrieRawHashMap::pointer> Insertion;
+  ASSERT_THAT_ERROR(Trie->insert({Hash0, Data0v1}).moveInto(Insertion),
+                    Failed());
+}
+
+} // namespace
+
+#endif // LLVM_ENABLE_ONDISK_CAS


        


More information about the llvm-commits mailing list