[llvm] be9c083 - [CAS] Add OnDiskGraphDB and OnDiskKeyValueDB (#114102)

via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 20 13:16:13 PDT 2025


Author: Steven Wu
Date: 2025-10-20T13:16:09-07:00
New Revision: be9c083cf7ec7dc9b6e93f29981d7b7e71273241

URL: https://github.com/llvm/llvm-project/commit/be9c083cf7ec7dc9b6e93f29981d7b7e71273241
DIFF: https://github.com/llvm/llvm-project/commit/be9c083cf7ec7dc9b6e93f29981d7b7e71273241.diff

LOG: [CAS] Add OnDiskGraphDB and OnDiskKeyValueDB (#114102)

Add OnDiskGraphDB and OnDiskKeyValueDB that can be used to implement
ObjectStore and ActionCache respectively. Those are on-disk persistent
storage that build upon OnDiskTrieHashMap and implement key functions
that are required by LLVMCAS interfaces.

This abstraction layer defines how the objects are hashed and stored on
disk. OnDiskKeyValueDB is a basic OnDiskTrieHashMap while OnDiskGraphDB
also defines:
* How objects of various sizes are stored on disk and are referenced by
  the trie nodes.
* How to store the references from one stored object to another object
  that is referenced.

In addition to basic APIs for ObjectStore and ActionCache, other
advanced database configuration features can be implemented in this
layer without exposing to the users of the LLVMCAS interface. For
example, OnDiskGraphDB has a fault-in function to fetch data from an
upstream OnDiskGraphDB if the data is missing.

Added: 
    llvm/include/llvm/CAS/OnDiskGraphDB.h
    llvm/include/llvm/CAS/OnDiskKeyValueDB.h
    llvm/lib/CAS/OnDiskGraphDB.cpp
    llvm/lib/CAS/OnDiskKeyValueDB.cpp
    llvm/unittests/CAS/OnDiskCommonUtils.h
    llvm/unittests/CAS/OnDiskGraphDBTest.cpp
    llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp

Modified: 
    llvm/include/llvm/CAS/OnDiskDataAllocator.h
    llvm/lib/CAS/CMakeLists.txt
    llvm/lib/CAS/OnDiskCommon.cpp
    llvm/lib/CAS/OnDiskCommon.h
    llvm/lib/CAS/OnDiskDataAllocator.cpp
    llvm/unittests/CAS/CASTestConfig.cpp
    llvm/unittests/CAS/CASTestConfig.h
    llvm/unittests/CAS/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CAS/OnDiskDataAllocator.h b/llvm/include/llvm/CAS/OnDiskDataAllocator.h
index 2809df800621b..b7099dce2069d 100644
--- a/llvm/include/llvm/CAS/OnDiskDataAllocator.h
+++ b/llvm/include/llvm/CAS/OnDiskDataAllocator.h
@@ -64,7 +64,7 @@ class OnDiskDataAllocator {
 
   /// \returns the buffer that was allocated at \p create time, with size
   /// \p UserHeaderSize.
-  MutableArrayRef<uint8_t> getUserHeader();
+  MutableArrayRef<uint8_t> getUserHeader() const;
 
   size_t size() const;
   size_t capacity() const;

diff  --git a/llvm/include/llvm/CAS/OnDiskGraphDB.h b/llvm/include/llvm/CAS/OnDiskGraphDB.h
new file mode 100644
index 0000000000000..83017a6a54fee
--- /dev/null
+++ b/llvm/include/llvm/CAS/OnDiskGraphDB.h
@@ -0,0 +1,469 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This declares OnDiskGraphDB, an ondisk CAS database with a fixed length
+/// hash. This is the class that implements the database storage scheme without
+/// exposing the hashing algorithm.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_ONDISKGRAPHDB_H
+#define LLVM_CAS_ONDISKGRAPHDB_H
+
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/CAS/OnDiskDataAllocator.h"
+#include "llvm/CAS/OnDiskTrieRawHashMap.h"
+
+namespace llvm::cas::ondisk {
+
+/// Standard 8 byte reference inside OnDiskGraphDB.
+class InternalRef {
+public:
+  FileOffset getFileOffset() const { return FileOffset(Data); }
+  uint64_t getRawData() const { return Data; }
+
+  static InternalRef getFromRawData(uint64_t Data) { return InternalRef(Data); }
+  static InternalRef getFromOffset(FileOffset Offset) {
+    return InternalRef(Offset.get());
+  }
+
+  friend bool operator==(InternalRef LHS, InternalRef RHS) {
+    return LHS.Data == RHS.Data;
+  }
+
+private:
+  InternalRef(FileOffset Offset) : Data((uint64_t)Offset.get()) {}
+  InternalRef(uint64_t Data) : Data(Data) {}
+  uint64_t Data;
+};
+
+/// Compact 4 byte reference inside OnDiskGraphDB for smaller references.
+class InternalRef4B {
+public:
+  FileOffset getFileOffset() const { return FileOffset(Data); }
+  uint32_t getRawData() const { return Data; }
+
+  /// Shrink to 4B reference.
+  static std::optional<InternalRef4B> tryToShrink(InternalRef Ref) {
+    uint64_t Offset = Ref.getRawData();
+    if (Offset > UINT32_MAX)
+      return std::nullopt;
+    return InternalRef4B(Offset);
+  }
+
+  operator InternalRef() const {
+    return InternalRef::getFromOffset(getFileOffset());
+  }
+
+private:
+  friend class InternalRef;
+  InternalRef4B(uint32_t Data) : Data(Data) {}
+  uint32_t Data;
+};
+
+/// Array of internal node references.
+class InternalRefArrayRef {
+public:
+  size_t size() const { return Size; }
+  bool empty() const { return !Size; }
+
+  class iterator
+      : public iterator_facade_base<iterator, std::random_access_iterator_tag,
+                                    const InternalRef> {
+  public:
+    bool operator==(const iterator &RHS) const { return I == RHS.I; }
+    InternalRef operator*() const {
+      if (auto *Ref = dyn_cast<const InternalRef *>(I))
+        return *Ref;
+      return InternalRef(*cast<const InternalRef4B *>(I));
+    }
+    bool operator<(const iterator &RHS) const {
+      assert(isa<const InternalRef *>(I) == isa<const InternalRef *>(RHS.I));
+      if (auto *Ref = dyn_cast<const InternalRef *>(I))
+        return Ref < cast<const InternalRef *>(RHS.I);
+      return cast<const InternalRef4B *>(I) -
+             cast<const InternalRef4B *>(RHS.I);
+    }
+    ptrdiff_t operator-(const iterator &RHS) const {
+      assert(isa<const InternalRef *>(I) == isa<const InternalRef *>(RHS.I));
+      if (auto *Ref = dyn_cast<const InternalRef *>(I))
+        return Ref - cast<const InternalRef *>(RHS.I);
+      return cast<const InternalRef4B *>(I) -
+             cast<const InternalRef4B *>(RHS.I);
+    }
+    iterator &operator+=(ptrdiff_t N) {
+      if (auto *Ref = dyn_cast<const InternalRef *>(I))
+        I = Ref + N;
+      else
+        I = cast<const InternalRef4B *>(I) + N;
+      return *this;
+    }
+    iterator &operator-=(ptrdiff_t N) {
+      if (auto *Ref = dyn_cast<const InternalRef *>(I))
+        I = Ref - N;
+      else
+        I = cast<const InternalRef4B *>(I) - N;
+      return *this;
+    }
+    InternalRef operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
+
+    iterator() = default;
+
+    uint64_t getOpaqueData() const { return uintptr_t(I.getOpaqueValue()); }
+
+    static iterator fromOpaqueData(uint64_t Opaque) {
+      return iterator(
+          PointerUnion<const InternalRef *,
+                       const InternalRef4B *>::getFromOpaqueValue((void *)
+                                                                      Opaque));
+    }
+
+  private:
+    friend class InternalRefArrayRef;
+    explicit iterator(
+        PointerUnion<const InternalRef *, const InternalRef4B *> I)
+        : I(I) {}
+    PointerUnion<const InternalRef *, const InternalRef4B *> I;
+  };
+
+  bool operator==(const InternalRefArrayRef &RHS) const {
+    return size() == RHS.size() && std::equal(begin(), end(), RHS.begin());
+  }
+
+  iterator begin() const { return iterator(Begin); }
+  iterator end() const { return begin() + Size; }
+
+  /// Array accessor.
+  InternalRef operator[](ptrdiff_t N) const { return begin()[N]; }
+
+  bool is4B() const { return isa<const InternalRef4B *>(Begin); }
+  bool is8B() const { return isa<const InternalRef *>(Begin); }
+
+  ArrayRef<uint8_t> getBuffer() const {
+    if (is4B()) {
+      auto *B = cast<const InternalRef4B *>(Begin);
+      return ArrayRef((const uint8_t *)B, sizeof(InternalRef4B) * Size);
+    }
+    auto *B = cast<const InternalRef *>(Begin);
+    return ArrayRef((const uint8_t *)B, sizeof(InternalRef) * Size);
+  }
+
+  InternalRefArrayRef(std::nullopt_t = std::nullopt) {
+    // This is useful so that all the casts in the \p iterator functions can
+    // operate without needing to check for a null value.
+    static InternalRef PlaceHolder = InternalRef::getFromRawData(0);
+    Begin = &PlaceHolder;
+  }
+
+  InternalRefArrayRef(ArrayRef<InternalRef> Refs)
+      : Begin(Refs.begin()), Size(Refs.size()) {}
+
+  InternalRefArrayRef(ArrayRef<InternalRef4B> Refs)
+      : Begin(Refs.begin()), Size(Refs.size()) {}
+
+private:
+  PointerUnion<const InternalRef *, const InternalRef4B *> Begin;
+  size_t Size = 0;
+};
+
+/// Reference to a node. The node's data may not be stored in the database.
+/// An \p ObjectID instance can only be used with the \p OnDiskGraphDB instance
+/// it came from. \p ObjectIDs from different \p OnDiskGraphDB instances are not
+/// comparable.
+class ObjectID {
+public:
+  uint64_t getOpaqueData() const { return Opaque; }
+
+  static ObjectID fromOpaqueData(uint64_t Opaque) { return ObjectID(Opaque); }
+
+  friend bool operator==(const ObjectID &LHS, const ObjectID &RHS) {
+    return LHS.Opaque == RHS.Opaque;
+  }
+  friend bool operator!=(const ObjectID &LHS, const ObjectID &RHS) {
+    return !(LHS == RHS);
+  }
+
+private:
+  explicit ObjectID(uint64_t Opaque) : Opaque(Opaque) {}
+  uint64_t Opaque;
+};
+
+/// Handle for a loaded node object.
+class ObjectHandle {
+public:
+  explicit ObjectHandle(uint64_t Opaque) : Opaque(Opaque) {}
+  uint64_t getOpaqueData() const { return Opaque; }
+
+  static ObjectHandle fromFileOffset(FileOffset Offset);
+  static ObjectHandle fromMemory(uintptr_t Ptr);
+
+  friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS) {
+    return LHS.Opaque == RHS.Opaque;
+  }
+  friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS) {
+    return !(LHS == RHS);
+  }
+
+private:
+  uint64_t Opaque;
+};
+
+/// Iterator for ObjectID.
+class object_refs_iterator
+    : public iterator_facade_base<object_refs_iterator,
+                                  std::random_access_iterator_tag, ObjectID> {
+public:
+  bool operator==(const object_refs_iterator &RHS) const { return I == RHS.I; }
+  ObjectID operator*() const {
+    return ObjectID::fromOpaqueData((*I).getRawData());
+  }
+  bool operator<(const object_refs_iterator &RHS) const { return I < RHS.I; }
+  ptrdiff_t operator-(const object_refs_iterator &RHS) const {
+    return I - RHS.I;
+  }
+  object_refs_iterator &operator+=(ptrdiff_t N) {
+    I += N;
+    return *this;
+  }
+  object_refs_iterator &operator-=(ptrdiff_t N) {
+    I -= N;
+    return *this;
+  }
+  ObjectID operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
+
+  object_refs_iterator() = default;
+  object_refs_iterator(InternalRefArrayRef::iterator I) : I(I) {}
+
+  uint64_t getOpaqueData() const { return I.getOpaqueData(); }
+
+  static object_refs_iterator fromOpaqueData(uint64_t Opaque) {
+    return InternalRefArrayRef::iterator::fromOpaqueData(Opaque);
+  }
+
+private:
+  InternalRefArrayRef::iterator I;
+};
+
+using object_refs_range = llvm::iterator_range<object_refs_iterator>;
+
+/// On-disk CAS nodes database, independent of a particular hashing algorithm.
+class OnDiskGraphDB {
+public:
+  /// Associate data & references with a particular object ID. If there is
+  /// already a record for this object the operation is a no-op. \param ID the
+  /// object ID to associate the data & references with. \param Refs references
+  /// \param Data data buffer.
+  Error store(ObjectID ID, ArrayRef<ObjectID> Refs, ArrayRef<char> Data);
+
+  /// \returns \p nullopt if the object associated with \p Ref does not exist.
+  Expected<std::optional<ObjectHandle>> load(ObjectID Ref);
+
+  /// \returns the hash bytes digest for the object reference.
+  ArrayRef<uint8_t> getDigest(ObjectID Ref) const {
+    // ObjectID should be valid to fetch Digest.
+    return cantFail(getDigest(getInternalRef(Ref)));
+  }
+
+  /// Form a reference for the provided hash. The reference can be used as part
+  /// of a CAS object even if it's not associated with an object yet.
+  Expected<ObjectID> getReference(ArrayRef<uint8_t> Hash);
+
+  /// Get an existing reference to the object \p Digest.
+  ///
+  /// Returns \p nullopt if the object is not stored in this CAS.
+  std::optional<ObjectID> getExistingReference(ArrayRef<uint8_t> Digest);
+
+  /// Check whether the object associated with \p Ref is stored in the CAS.
+  /// Note that this function will fault-in according to the policy.
+  Expected<bool> isMaterialized(ObjectID Ref);
+
+  /// Check whether the object associated with \p Ref is stored in the CAS.
+  /// Note that this function does not fault-in.
+  bool containsObject(ObjectID Ref) const {
+    return containsObject(Ref, /*CheckUpstream=*/true);
+  }
+
+  /// \returns the data part of the provided object handle.
+  ArrayRef<char> getObjectData(ObjectHandle Node) const;
+
+  /// \returns the object referenced by the provided object handle.
+  object_refs_range getObjectRefs(ObjectHandle Node) const {
+    InternalRefArrayRef Refs = getInternalRefs(Node);
+    return make_range(Refs.begin(), Refs.end());
+  }
+
+  /// \returns Total size of stored objects.
+  ///
+  /// NOTE: There's a possibility that the returned size is not including a
+  /// large object if the process crashed right at the point of inserting it.
+  size_t getStorageSize() const;
+
+  /// \returns The percentage of space utilization of hard space limits.
+  ///
+  /// Return value is an integer between 0 and 100 for percentage.
+  unsigned getHardStorageLimitUtilization() const;
+
+  void print(raw_ostream &OS) const;
+
+  /// Hashing function type for validation.
+  using HashingFuncT = function_ref<void(
+      ArrayRef<ArrayRef<uint8_t>>, ArrayRef<char>, SmallVectorImpl<uint8_t> &)>;
+
+  /// Validate the OnDiskGraphDB.
+  ///
+  /// \param Deep if true, rehash all the objects to ensure no data
+  /// corruption in stored objects, otherwise just validate the structure of
+  /// CAS database.
+  /// \param Hasher is the hashing function used for objects inside CAS.
+  Error validate(bool Deep, HashingFuncT Hasher) const;
+
+  /// How to fault-in nodes if an upstream database is used.
+  enum class FaultInPolicy {
+    /// Copy only the requested node.
+    SingleNode,
+    /// Copy the entire graph of a node.
+    FullTree,
+  };
+
+  /// Open the on-disk store from a directory.
+  ///
+  /// \param Path directory for the on-disk store. The directory will be created
+  /// if it doesn't exist.
+  /// \param HashName Identifier name for the hashing algorithm that is going to
+  /// be used.
+  /// \param HashByteSize Size for the object digest hash bytes.
+  /// \param UpstreamDB Optional on-disk store to be used for faulting-in nodes
+  /// if they don't exist in the primary store. The upstream store is only used
+  /// for reading nodes, new nodes are only written to the primary store.
+  /// \param Policy If \p UpstreamDB is provided, controls how nodes are copied
+  /// to primary store. This is recorded at creation time and subsequent opens
+  /// need to pass the same policy otherwise the \p open will fail.
+  static Expected<std::unique_ptr<OnDiskGraphDB>>
+  open(StringRef Path, StringRef HashName, unsigned HashByteSize,
+       std::unique_ptr<OnDiskGraphDB> UpstreamDB = nullptr,
+       FaultInPolicy Policy = FaultInPolicy::FullTree);
+
+  ~OnDiskGraphDB();
+
+private:
+  /// Forward declaration for a proxy for an ondisk index record.
+  struct IndexProxy;
+
+  enum class ObjectPresence {
+    Missing,
+    InPrimaryDB,
+    OnlyInUpstreamDB,
+  };
+
+  /// Check if object exists and if it is on upstream only.
+  Expected<ObjectPresence> getObjectPresence(ObjectID Ref,
+                                             bool CheckUpstream) const;
+
+  /// \returns true if object can be found in database.
+  bool containsObject(ObjectID Ref, bool CheckUpstream) const {
+    auto Presence = getObjectPresence(Ref, CheckUpstream);
+    if (!Presence) {
+      consumeError(Presence.takeError());
+      return false;
+    }
+    switch (*Presence) {
+    case ObjectPresence::Missing:
+      return false;
+    case ObjectPresence::InPrimaryDB:
+      return true;
+    case ObjectPresence::OnlyInUpstreamDB:
+      return true;
+    }
+  }
+
+  /// When \p load is called for a node that doesn't exist, this function tries
+  /// to load it from the upstream store and copy it to the primary one.
+  Expected<std::optional<ObjectHandle>> faultInFromUpstream(ObjectID PrimaryID);
+
+  /// Import the entire tree from upstream with \p UpstreamNode as root.
+  Error importFullTree(ObjectID PrimaryID, ObjectHandle UpstreamNode);
+  /// Import only the \param UpstreamNode.
+  Error importSingleNode(ObjectID PrimaryID, ObjectHandle UpstreamNode);
+
+  /// Found the IndexProxy for the hash.
+  Expected<IndexProxy> indexHash(ArrayRef<uint8_t> Hash);
+
+  /// Get path for creating standalone data file.
+  void getStandalonePath(StringRef FileSuffix, const IndexProxy &I,
+                         SmallVectorImpl<char> &Path) const;
+  /// Create a standalone leaf file.
+  Error createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data);
+
+  /// \name Helper functions for internal data structures.
+  /// \{
+  static InternalRef getInternalRef(ObjectID Ref) {
+    return InternalRef::getFromRawData(Ref.getOpaqueData());
+  }
+
+  static ObjectID getExternalReference(InternalRef Ref) {
+    return ObjectID::fromOpaqueData(Ref.getRawData());
+  }
+
+  static ObjectID getExternalReference(const IndexProxy &I);
+
+  static InternalRef makeInternalRef(FileOffset IndexOffset);
+
+  Expected<ArrayRef<uint8_t>> getDigest(InternalRef Ref) const;
+
+  ArrayRef<uint8_t> getDigest(const IndexProxy &I) const;
+
+  Expected<IndexProxy> getIndexProxyFromRef(InternalRef Ref) const;
+
+  IndexProxy
+  getIndexProxyFromPointer(OnDiskTrieRawHashMap::ConstOnDiskPtr P) const;
+
+  InternalRefArrayRef getInternalRefs(ObjectHandle Node) const;
+  /// \}
+
+  /// Get the atomic variable that keeps track of the standalone data storage
+  /// size.
+  std::atomic<uint64_t> &standaloneStorageSize() const;
+
+  /// Increase the standalone data size.
+  void recordStandaloneSizeIncrease(size_t SizeIncrease);
+  /// Get the standalone data size.
+  uint64_t getStandaloneStorageSize() const;
+
+  // Private constructor.
+  OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index,
+                OnDiskDataAllocator DataPool,
+                std::unique_ptr<OnDiskGraphDB> UpstreamDB,
+                FaultInPolicy Policy);
+
+  /// Mapping from hash to object reference.
+  ///
+  /// Data type is TrieRecord.
+  OnDiskTrieRawHashMap Index;
+
+  /// Storage for most objects.
+  ///
+  /// Data type is DataRecordHandle.
+  OnDiskDataAllocator DataPool;
+
+  /// A StandaloneDataMap.
+  void *StandaloneData = nullptr;
+
+  /// Path to the root directory.
+  std::string RootPath;
+
+  /// Optional on-disk store to be used for faulting-in nodes.
+  std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+
+  /// The policy used to fault in data from upstream.
+  FaultInPolicy FIPolicy;
+};
+
+} // namespace llvm::cas::ondisk
+
+#endif // LLVM_CAS_ONDISKGRAPHDB_H

diff  --git a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
new file mode 100644
index 0000000000000..b762518366c21
--- /dev/null
+++ b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
@@ -0,0 +1,82 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This declares OnDiskKeyValueDB, a key value storage database of fixed size
+/// key and value.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_ONDISKKEYVALUEDB_H
+#define LLVM_CAS_ONDISKKEYVALUEDB_H
+
+#include "llvm/CAS/OnDiskTrieRawHashMap.h"
+
+namespace llvm::cas::ondisk {
+
+/// An on-disk key-value data store with the following properties:
+/// * Keys are fixed length binary hashes with expected normal distribution.
+/// * Values are buffers of the same size, specified at creation time.
+/// * The value of a key cannot be changed once it is set.
+/// * The value buffers returned from a key lookup have 8-byte alignment.
+class OnDiskKeyValueDB {
+public:
+  /// Associate a value with a key.
+  ///
+  /// \param Key the hash bytes for the key
+  /// \param Value the value bytes, same size as \p ValueSize parameter of
+  /// \p open call.
+  ///
+  /// \returns the value associated with the \p Key. It may be different than
+  /// \p Value if another value is already associated with this key.
+  Expected<ArrayRef<char>> put(ArrayRef<uint8_t> Key, ArrayRef<char> Value);
+
+  /// \returns the value associated with the \p Key, or \p std::nullopt if the
+  /// key does not exist.
+  Expected<std::optional<ArrayRef<char>>> get(ArrayRef<uint8_t> Key);
+
+  /// \returns Total size of stored data.
+  size_t getStorageSize() const { return Cache.size(); }
+
+  /// \returns The percentage of space utilization of hard space limits.
+  ///
+  /// Return value is an integer between 0 and 100 for percentage.
+  unsigned getHardStorageLimitUtilization() const {
+    return Cache.size() * 100ULL / Cache.capacity();
+  }
+
+  /// Open the on-disk store from a directory.
+  ///
+  /// \param Path directory for the on-disk store. The directory will be created
+  /// if it doesn't exist.
+  /// \param HashName Identifier name for the hashing algorithm that is going to
+  /// be used.
+  /// \param KeySize Size for the key hash bytes.
+  /// \param ValueName Identifier name for the values.
+  /// \param ValueSize Size for the value bytes.
+  static Expected<std::unique_ptr<OnDiskKeyValueDB>>
+  open(StringRef Path, StringRef HashName, unsigned KeySize,
+       StringRef ValueName, size_t ValueSize);
+
+  using CheckValueT =
+      function_ref<Error(FileOffset Offset, ArrayRef<char> Data)>;
+  /// Validate the storage with a callback \p CheckValue to check the stored
+  /// value.
+  Error validate(CheckValueT CheckValue) const;
+
+private:
+  OnDiskKeyValueDB(size_t ValueSize, OnDiskTrieRawHashMap Cache)
+      : ValueSize(ValueSize), Cache(std::move(Cache)) {}
+
+  const size_t ValueSize;
+  OnDiskTrieRawHashMap Cache;
+};
+
+} // namespace llvm::cas::ondisk
+
+#endif // LLVM_CAS_ONDISKKEYVALUEDB_H

diff  --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
index bca39b645af45..a2f8c49e50145 100644
--- a/llvm/lib/CAS/CMakeLists.txt
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -8,6 +8,8 @@ add_llvm_component_library(LLVMCAS
   ObjectStore.cpp
   OnDiskCommon.cpp
   OnDiskDataAllocator.cpp
+  OnDiskGraphDB.cpp
+  OnDiskKeyValueDB.cpp
   OnDiskTrieRawHashMap.cpp
 
   ADDITIONAL_HEADER_DIRS

diff  --git a/llvm/lib/CAS/OnDiskCommon.cpp b/llvm/lib/CAS/OnDiskCommon.cpp
index 25aa06bfe64da..281bde981457b 100644
--- a/llvm/lib/CAS/OnDiskCommon.cpp
+++ b/llvm/lib/CAS/OnDiskCommon.cpp
@@ -7,9 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "OnDiskCommon.h"
-#include "llvm/Config/config.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Process.h"
+#include <mutex>
 #include <thread>
 
 #if __has_include(<sys/file.h>)
@@ -25,8 +26,44 @@
 #include <fcntl.h>
 #endif
 
+#if __has_include(<sys/mount.h>)
+#include <sys/mount.h> // statfs
+#endif
+
 using namespace llvm;
 
+static uint64_t OnDiskCASMaxMappingSize = 0;
+
+Expected<std::optional<uint64_t>> cas::ondisk::getOverriddenMaxMappingSize() {
+  static std::once_flag Flag;
+  Error Err = Error::success();
+  std::call_once(Flag, [&Err] {
+    ErrorAsOutParameter EAO(&Err);
+    constexpr const char *EnvVar = "LLVM_CAS_MAX_MAPPING_SIZE";
+    auto Value = sys::Process::GetEnv(EnvVar);
+    if (!Value)
+      return;
+
+    uint64_t Size;
+    if (StringRef(*Value).getAsInteger(/*auto*/ 0, Size))
+      Err = createStringError(inconvertibleErrorCode(),
+                              "invalid value for %s: expected integer", EnvVar);
+    OnDiskCASMaxMappingSize = Size;
+  });
+
+  if (Err)
+    return std::move(Err);
+
+  if (OnDiskCASMaxMappingSize == 0)
+    return std::nullopt;
+
+  return OnDiskCASMaxMappingSize;
+}
+
+void cas::ondisk::setMaxMappingSize(uint64_t Size) {
+  OnDiskCASMaxMappingSize = Size;
+}
+
 std::error_code cas::ondisk::lockFileThreadSafe(int FD,
                                                 sys::fs::LockKind Kind) {
 #if HAVE_FLOCK
@@ -125,3 +162,20 @@ Expected<size_t> cas::ondisk::preallocateFileTail(int FD, size_t CurrentSize,
   return NewSize;    // Pretend it worked.
 #endif
 }
+
+bool cas::ondisk::useSmallMappingSize(const Twine &P) {
+  // Add exceptions to use small database file here.
+#if defined(__APPLE__) && __has_include(<sys/mount.h>)
+  // macOS tmpfs does not support sparse tails.
+  SmallString<128> PathStorage;
+  StringRef Path = P.toNullTerminatedStringRef(PathStorage);
+  struct statfs StatFS;
+  if (statfs(Path.data(), &StatFS) != 0)
+    return false;
+
+  if (strcmp(StatFS.f_fstypename, "tmpfs") == 0)
+    return true;
+#endif
+  // Default to use regular database file.
+  return false;
+}

diff  --git a/llvm/lib/CAS/OnDiskCommon.h b/llvm/lib/CAS/OnDiskCommon.h
index 8b79ffe5c3158..ac00662a2e91e 100644
--- a/llvm/lib/CAS/OnDiskCommon.h
+++ b/llvm/lib/CAS/OnDiskCommon.h
@@ -12,9 +12,31 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
 #include <chrono>
+#include <optional>
 
 namespace llvm::cas::ondisk {
 
+/// The version for all the ondisk database files. It needs to be bumped when
+/// compatibility breaking changes are introduced.
+constexpr StringLiteral CASFormatVersion = "v1";
+
+/// Retrieves an overridden maximum mapping size for CAS files, if any,
+/// specified by LLVM_CAS_MAX_MAPPING_SIZE in the environment or set by
+/// `setMaxMappingSize()`. If the value from environment is unreadable, returns
+/// an error.
+Expected<std::optional<uint64_t>> getOverriddenMaxMappingSize();
+
+/// Set MaxMappingSize for ondisk CAS. This function is not thread-safe and
+/// should be set before creating any ondisk CAS and does not affect CAS already
+/// created. Set value 0 to use default size.
+void setMaxMappingSize(uint64_t Size);
+
+/// Whether to use a small file mapping for ondisk databases created in \p Path.
+///
+/// For some file system that doesn't support sparse file, use a smaller file
+/// mapping to avoid consuming too much disk space on creation.
+bool useSmallMappingSize(const Twine &Path);
+
 /// Thread-safe alternative to \c sys::fs::lockFile. This does not support all
 /// the platforms that \c sys::fs::lockFile does, so keep it in the CAS library
 /// for now.

diff  --git a/llvm/lib/CAS/OnDiskDataAllocator.cpp b/llvm/lib/CAS/OnDiskDataAllocator.cpp
index 13bbd66139178..9c68bc48d533a 100644
--- a/llvm/lib/CAS/OnDiskDataAllocator.cpp
+++ b/llvm/lib/CAS/OnDiskDataAllocator.cpp
@@ -185,7 +185,7 @@ Expected<ArrayRef<char>> OnDiskDataAllocator::get(FileOffset Offset,
   return ArrayRef<char>{Impl->File.getRegion().data() + Offset.get(), Size};
 }
 
-MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() {
+MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() const {
   return Impl->Store.getUserHeader();
 }
 
@@ -221,7 +221,9 @@ Expected<ArrayRef<char>> OnDiskDataAllocator::get(FileOffset Offset,
                            "OnDiskDataAllocator is not supported");
 }
 
-MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() { return {}; }
+MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() const {
+  return {};
+}
 
 size_t OnDiskDataAllocator::size() const { return 0; }
 size_t OnDiskDataAllocator::capacity() const { return 0; }

diff  --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp
new file mode 100644
index 0000000000000..72bb98c4bf65d
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskGraphDB.cpp
@@ -0,0 +1,1755 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements OnDiskGraphDB, an on-disk CAS nodes database,
+/// independent of a particular hashing algorithm. It only needs to be
+/// configured for the hash size and controls the schema of the storage.
+///
+/// OnDiskGraphDB defines:
+///
+/// - How the data is stored inside database, either as a standalone file, or
+///   allocated inside a datapool.
+/// - How references to other objects inside the same database are stored. They
+///   are stored as internal references, instead of full hash values, to save
+///   space.
+/// - How to chain databases together and import objects from upstream
+///   databases.
+///
+/// Here's a top-level description of the current layout:
+///
+/// - db/index.<version>: a file for the "index" table, named by \a
+///   IndexTableName and managed by \a TrieRawHashMap. The contents are 8B
+///   that are accessed atomically, describing the object kind and where/how
+///   it's stored (including an optional file offset). See \a TrieRecord for
+///   more details.
+/// - db/data.<version>: a file for the "data" table, named by \a
+///   DataPoolTableName and managed by \a DataStore. New objects within
+///   TrieRecord::MaxEmbeddedSize are inserted here as \a
+///   TrieRecord::StorageKind::DataPool.
+/// - db/obj.<offset>.<version>: a file storing an object outside the main
+///   "data" table, named by its offset into the "index" table, with the
+///   format of \a TrieRecord::StorageKind::Standalone.
+/// - db/leaf.<offset>.<version>: a file storing a leaf node outside the
+///   main "data" table, named by its offset into the "index" table, with
+///   the format of \a TrieRecord::StorageKind::StandaloneLeaf.
+/// - db/leaf+0.<offset>.<version>: a file storing a null-terminated leaf object
+///   outside the main "data" table, named by its offset into the "index" table,
+///   with the format of \a TrieRecord::StorageKind::StandaloneLeaf0.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "OnDiskCommon.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CAS/OnDiskDataAllocator.h"
+#include "llvm/CAS/OnDiskTrieRawHashMap.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include <atomic>
+#include <mutex>
+#include <optional>
+
+#define DEBUG_TYPE "on-disk-cas"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+static constexpr StringLiteral IndexTableName = "llvm.cas.index";
+static constexpr StringLiteral DataPoolTableName = "llvm.cas.data";
+
+static constexpr StringLiteral IndexFilePrefix = "index.";
+static constexpr StringLiteral DataPoolFilePrefix = "data.";
+
+static constexpr StringLiteral FilePrefixObject = "obj.";
+static constexpr StringLiteral FilePrefixLeaf = "leaf.";
+static constexpr StringLiteral FilePrefixLeaf0 = "leaf+0.";
+
+/// Build a "corrupt object" error for the object identified by \p ID, or
+/// forward the error unchanged if computing the ID itself already failed.
+static Error createCorruptObjectError(Expected<ArrayRef<uint8_t>> ID) {
+  if (!ID)
+    return ID.takeError();
+
+  return createStringError(llvm::errc::invalid_argument,
+                           "corrupt object '" + toHex(*ID) + "'");
+}
+
+namespace {
+
+/// Trie record data: 8 bytes, atomic<uint64_t>
+/// - 1-byte: StorageKind
+/// - 7-bytes: DataStoreOffset (offset into referenced file)
+class TrieRecord {
+public:
+  enum class StorageKind : uint8_t {
+    /// Unknown object.
+    Unknown = 0,
+
+    /// data.vX: main pool, full DataStore record.
+    DataPool = 1,
+
+    /// obj.<TrieRecordOffset>.vX: standalone, with a full DataStore record.
+    Standalone = 10,
+
+    /// leaf.<TrieRecordOffset>.vX: standalone, just the data. File contents
+    /// exactly the data content and file size matches the data size. No refs.
+    StandaloneLeaf = 11,
+
+    /// leaf+0.<TrieRecordOffset>.vX: standalone, just the data plus an
+    /// extra null character ('\0'). File size is 1 bigger than the data size.
+    /// No refs.
+    StandaloneLeaf0 = 12,
+  };
+
+  /// Map a standalone storage kind to the filename prefix of its on-disk file.
+  static StringRef getStandaloneFilePrefix(StorageKind SK) {
+    switch (SK) {
+    default:
+      llvm_unreachable("Expected standalone storage kind");
+    case TrieRecord::StorageKind::Standalone:
+      return FilePrefixObject;
+    case TrieRecord::StorageKind::StandaloneLeaf:
+      return FilePrefixLeaf;
+    case TrieRecord::StorageKind::StandaloneLeaf0:
+      return FilePrefixLeaf0;
+    }
+  }
+
+  enum Limits : int64_t {
+    /// Saves files bigger than 64KB standalone instead of embedding them.
+    MaxEmbeddedSize = 64LL * 1024LL - 1,
+  };
+
+  /// Unpacked view of one trie record: where the object lives and at what
+  /// offset.
+  struct Data {
+    StorageKind SK = StorageKind::Unknown;
+    FileOffset Offset;
+  };
+
+  /// Pack StorageKind and Offset from Data into 8 byte TrieRecord.
+  /// The offset must fit in 7 bytes (see the assert below).
+  static uint64_t pack(Data D) {
+    assert(D.Offset.get() < (int64_t)(1ULL << 56));
+    uint64_t Packed = uint64_t(D.SK) << 56 | D.Offset.get();
+    assert(D.SK != StorageKind::Unknown || Packed == 0);
+#ifndef NDEBUG
+    Data RoundTrip = unpack(Packed);
+    assert(D.SK == RoundTrip.SK);
+    assert(D.Offset.get() == RoundTrip.Offset.get());
+#endif
+    return Packed;
+  }
+
+  // Unpack TrieRecord into Data. A zero value decodes as Unknown/no offset.
+  static Data unpack(uint64_t Packed) {
+    Data D;
+    if (!Packed)
+      return D;
+    D.SK = (StorageKind)(Packed >> 56);
+    D.Offset = FileOffset(Packed & (UINT64_MAX >> 8));
+    return D;
+  }
+
+  /// Initialized to zero, i.e. StorageKind::Unknown with no offset.
+  TrieRecord() : Storage(0) {}
+
+  /// Atomically read and unpack the current record value.
+  Data load() const { return unpack(Storage); }
+  /// Atomically replace \p Existing with \p New; on failure \p Existing is
+  /// updated with the value actually observed in storage (see definition).
+  bool compare_exchange_strong(Data &Existing, Data New);
+
+private:
+  std::atomic<uint64_t> Storage;
+};
+
+/// DataStore record data: 4B + size? + refs? + data + 0
+/// - 4-bytes: Header
+/// - {0,4,8}-bytes: DataSize     (may be packed in Header)
+/// - {0,4,8}-bytes: NumRefs      (may be packed in Header)
+/// - NumRefs*{4,8}-bytes: Refs[] (end-ptr is 8-byte aligned)
+/// - <data>
+/// - 1-byte: 0-term
+struct DataRecordHandle {
+  /// NumRefs storage: 4B, 2B, 1B, or 0B (no refs). Or, 8B, for alignment
+  /// convenience to avoid computing padding later.
+  enum class NumRefsFlags : uint8_t {
+    Uses0B = 0U,
+    Uses1B = 1U,
+    Uses2B = 2U,
+    Uses4B = 3U,
+    Uses8B = 4U,
+    Max = Uses8B,
+  };
+
+  /// DataSize storage: 8B, 4B, 2B, or 1B.
+  enum class DataSizeFlags {
+    Uses1B = 0U,
+    Uses2B = 1U,
+    Uses4B = 2U,
+    Uses8B = 3U,
+    Max = Uses8B,
+  };
+
+  /// Kind of ref stored in Refs[]: InternalRef or InternalRef4B.
+  enum class RefKindFlags {
+    InternalRef = 0U,
+    InternalRef4B = 1U,
+    Max = InternalRef4B,
+  };
+
+  /// Bit positions/widths of the three flag fields inside the packed
+  /// LayoutFlags byte.
+  enum Counts : int {
+    NumRefsShift = 0,
+    NumRefsBits = 3,
+    DataSizeShift = NumRefsShift + NumRefsBits,
+    DataSizeBits = 2,
+    RefKindShift = DataSizeShift + DataSizeBits,
+    RefKindBits = 1,
+  };
+  static_assert(((UINT32_MAX << NumRefsBits) & (uint32_t)NumRefsFlags::Max) ==
+                    0,
+                "Not enough bits");
+  static_assert(((UINT32_MAX << DataSizeBits) & (uint32_t)DataSizeFlags::Max) ==
+                    0,
+                "Not enough bits");
+  static_assert(((UINT32_MAX << RefKindBits) & (uint32_t)RefKindFlags::Max) ==
+                    0,
+                "Not enough bits");
+
+  /// Layout of the DataRecordHandle and how to decode it.
+  struct LayoutFlags {
+    NumRefsFlags NumRefs;
+    DataSizeFlags DataSize;
+    RefKindFlags RefKind;
+
+    /// Pack the three flag fields into a single byte-sized value.
+    static uint64_t pack(LayoutFlags LF) {
+      unsigned Packed = ((unsigned)LF.NumRefs << NumRefsShift) |
+                        ((unsigned)LF.DataSize << DataSizeShift) |
+                        ((unsigned)LF.RefKind << RefKindShift);
+#ifndef NDEBUG
+      LayoutFlags RoundTrip = unpack(Packed);
+      assert(LF.NumRefs == RoundTrip.NumRefs);
+      assert(LF.DataSize == RoundTrip.DataSize);
+      assert(LF.RefKind == RoundTrip.RefKind);
+#endif
+      return Packed;
+    }
+    /// Decode the flags previously produced by \a pack().
+    static LayoutFlags unpack(uint64_t Storage) {
+      assert(Storage <= UINT8_MAX && "Expect storage to fit in a byte");
+      LayoutFlags LF;
+      LF.NumRefs =
+          (NumRefsFlags)((Storage >> NumRefsShift) & ((1U << NumRefsBits) - 1));
+      LF.DataSize = (DataSizeFlags)((Storage >> DataSizeShift) &
+                                    ((1U << DataSizeBits) - 1));
+      LF.RefKind =
+          (RefKindFlags)((Storage >> RefKindShift) & ((1U << RefKindBits) - 1));
+      return LF;
+    }
+  };
+
+  /// Header layout:
+  /// - 1-byte:      LayoutFlags
+  /// - 1-byte:      1B size field
+  /// - {0,2}-bytes: 2B size field
+  struct Header {
+    using PackTy = uint32_t;
+    PackTy Packed;
+
+    // LayoutFlags occupy the most-significant byte of Packed.
+    static constexpr unsigned LayoutFlagsShift =
+        (sizeof(PackTy) - 1) * CHAR_BIT;
+  };
+
+  /// Refs and data used to construct a new record.
+  struct Input {
+    InternalRefArrayRef Refs;
+    ArrayRef<char> Data;
+  };
+
+  /// Decode the layout flags from the top byte of the header.
+  LayoutFlags getLayoutFlags() const {
+    return LayoutFlags::unpack(H->Packed >> Header::LayoutFlagsShift);
+  }
+
+  uint64_t getDataSize() const;
+  void skipDataSize(LayoutFlags LF, int64_t &RelOffset) const;
+  uint32_t getNumRefs() const;
+  void skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const;
+  int64_t getRefsRelOffset() const;
+  int64_t getDataRelOffset() const;
+
+  /// Total record size; the trailing "+ 1" accounts for the 0-terminator byte.
+  static uint64_t getTotalSize(uint64_t DataRelOffset, uint64_t DataSize) {
+    return DataRelOffset + DataSize + 1;
+  }
+  uint64_t getTotalSize() const {
+    return getDataRelOffset() + getDataSize() + 1;
+  }
+
+  /// Describe the layout of data stored and how to decode from
+  /// DataRecordHandle.
+  struct Layout {
+    explicit Layout(const Input &I);
+
+    LayoutFlags Flags;
+    uint64_t DataSize = 0;
+    uint32_t NumRefs = 0;
+    int64_t RefsRelOffset = 0;
+    int64_t DataRelOffset = 0;
+    uint64_t getTotalSize() const {
+      return DataRecordHandle::getTotalSize(DataRelOffset, DataSize);
+    }
+  };
+
+  /// View of the reference array; interprets the bytes as 4B or 8B refs
+  /// depending on the layout flags.
+  InternalRefArrayRef getRefs() const {
+    assert(H && "Expected valid handle");
+    auto *BeginByte = reinterpret_cast<const char *>(H) + getRefsRelOffset();
+    size_t Size = getNumRefs();
+    if (!Size)
+      return InternalRefArrayRef();
+    if (getLayoutFlags().RefKind == RefKindFlags::InternalRef4B)
+      return ArrayRef(reinterpret_cast<const InternalRef4B *>(BeginByte), Size);
+    return ArrayRef(reinterpret_cast<const InternalRef *>(BeginByte), Size);
+  }
+
+  /// View of the record's payload bytes (excludes the 0-terminator).
+  ArrayRef<char> getData() const {
+    assert(H && "Expected valid handle");
+    return ArrayRef(reinterpret_cast<const char *>(H) + getDataRelOffset(),
+                    getDataSize());
+  }
+
+  static DataRecordHandle create(function_ref<char *(size_t Size)> Alloc,
+                                 const Input &I);
+  static Expected<DataRecordHandle>
+  createWithError(function_ref<Expected<char *>(size_t Size)> Alloc,
+                  const Input &I);
+  static DataRecordHandle construct(char *Mem, const Input &I);
+
+  /// Wrap existing record bytes (no validation of the contents).
+  static DataRecordHandle get(const char *Mem) {
+    return DataRecordHandle(
+        *reinterpret_cast<const DataRecordHandle::Header *>(Mem));
+  }
+  static Expected<DataRecordHandle>
+  getFromDataPool(const OnDiskDataAllocator &Pool, FileOffset Offset);
+
+  explicit operator bool() const { return H; }
+  const Header &getHeader() const { return *H; }
+
+  DataRecordHandle() = default;
+  explicit DataRecordHandle(const Header &H) : H(&H) {}
+
+private:
+  static DataRecordHandle constructImpl(char *Mem, const Input &I,
+                                        const Layout &L);
+  // Non-owning pointer into mapped/allocated storage.
+  const Header *H = nullptr;
+};
+
+/// Proxy for any on-disk object or raw data.
+struct OnDiskContent {
+  std::optional<DataRecordHandle> Record; // Full record (header + refs + data).
+  std::optional<ArrayRef<char>> Bytes;    // Raw leaf bytes without a record.
+};
+
+/// Data loaded inside the memory from standalone file.
+class StandaloneDataInMemory {
+public:
+  OnDiskContent getContent() const;
+
+  /// Takes ownership of \p Region; only standalone storage kinds are valid
+  /// (checked by the assertion below in debug builds).
+  StandaloneDataInMemory(std::unique_ptr<sys::fs::mapped_file_region> Region,
+                         TrieRecord::StorageKind SK)
+      : Region(std::move(Region)), SK(SK) {
+#ifndef NDEBUG
+    bool IsStandalone = false;
+    switch (SK) {
+    case TrieRecord::StorageKind::Standalone:
+    case TrieRecord::StorageKind::StandaloneLeaf:
+    case TrieRecord::StorageKind::StandaloneLeaf0:
+      IsStandalone = true;
+      break;
+    default:
+      break;
+    }
+    assert(IsStandalone);
+#endif
+  }
+
+private:
+  // Keeps the file mapping alive for the lifetime of this object.
+  std::unique_ptr<sys::fs::mapped_file_region> Region;
+  TrieRecord::StorageKind SK;
+};
+
+/// Container to lookup loaded standalone objects.
+///
+/// Sharded by the first byte of the hash, each shard guarded by its own
+/// mutex, to reduce lock contention.
+template <size_t NumShards> class StandaloneDataMap {
+  static_assert(isPowerOf2_64(NumShards), "Expected power of 2");
+
+public:
+  uintptr_t insert(ArrayRef<uint8_t> Hash, TrieRecord::StorageKind SK,
+                   std::unique_ptr<sys::fs::mapped_file_region> Region);
+
+  const StandaloneDataInMemory *lookup(ArrayRef<uint8_t> Hash) const;
+  bool count(ArrayRef<uint8_t> Hash) const { return bool(lookup(Hash)); }
+
+private:
+  struct Shard {
+    /// Needs to store a std::unique_ptr for a stable address identity.
+    DenseMap<const uint8_t *, std::unique_ptr<StandaloneDataInMemory>> Map;
+    mutable std::mutex Mutex;
+  };
+  Shard &getShard(ArrayRef<uint8_t> Hash) {
+    return const_cast<Shard &>(
+        const_cast<const StandaloneDataMap *>(this)->getShard(Hash));
+  }
+  const Shard &getShard(ArrayRef<uint8_t> Hash) const {
+    static_assert(NumShards <= 256, "Expected only 8 bits of shard");
+    return Shards[Hash[0] % NumShards];
+  }
+
+  Shard Shards[NumShards];
+};
+
+using StandaloneDataMapTy = StandaloneDataMap<16>;
+
+/// A vector of internal node references.
+///
+/// Stores compact 4-byte refs while every ref fits; on the first ref that
+/// cannot shrink, all accumulated refs are upgraded to full 8-byte refs.
+class InternalRefVector {
+public:
+  void push_back(InternalRef Ref) {
+    if (NeedsFull)
+      return FullRefs.push_back(Ref);
+    if (std::optional<InternalRef4B> Small = InternalRef4B::tryToShrink(Ref))
+      return SmallRefs.push_back(*Small);
+    // First non-shrinkable ref: migrate everything to the full-width vector.
+    NeedsFull = true;
+    assert(FullRefs.empty());
+    FullRefs.reserve(SmallRefs.size() + 1);
+    for (InternalRef4B Small : SmallRefs)
+      FullRefs.push_back(Small);
+    FullRefs.push_back(Ref);
+    SmallRefs.clear();
+  }
+
+  operator InternalRefArrayRef() const {
+    assert(SmallRefs.empty() || FullRefs.empty());
+    return NeedsFull ? InternalRefArrayRef(FullRefs)
+                     : InternalRefArrayRef(SmallRefs);
+  }
+
+private:
+  bool NeedsFull = false;
+  SmallVector<InternalRef4B> SmallRefs;
+  SmallVector<InternalRef> FullRefs;
+};
+
+} // namespace
+
+/// Compute the layout for \p I, allocate storage via \p Alloc, and construct
+/// the record; forwards any allocation error.
+Expected<DataRecordHandle> DataRecordHandle::createWithError(
+    function_ref<Expected<char *>(size_t Size)> Alloc, const Input &I) {
+  Layout L(I);
+  if (Expected<char *> Mem = Alloc(L.getTotalSize()))
+    return constructImpl(*Mem, I, L);
+  else
+    return Mem.takeError();
+}
+
+/// Infallible variant of \a createWithError(); \p Alloc must return a valid
+/// buffer of the requested size.
+DataRecordHandle
+DataRecordHandle::create(function_ref<char *(size_t Size)> Alloc,
+                         const Input &I) {
+  Layout L(I);
+  return constructImpl(Alloc(L.getTotalSize()), I, L);
+}
+
+/// Create a handle encoding a file offset; the cleared low bit distinguishes
+/// it from an in-memory handle (see \a fromMemory).
+ObjectHandle ObjectHandle::fromFileOffset(FileOffset Offset) {
+  // Store the file offset as it is.
+  assert(!(Offset.get() & 0x1));
+  return ObjectHandle(Offset.get());
+}
+
+/// Create a handle encoding an in-memory pointer, tagged by setting the low
+/// bit so it can be told apart from a file offset.
+ObjectHandle ObjectHandle::fromMemory(uintptr_t Ptr) {
+  // Store the pointer from memory with lowest bit set.
+  assert(!(Ptr & 0x1));
+  return ObjectHandle(Ptr | 1);
+}
+
+/// Proxy for an on-disk index record.
+struct OnDiskGraphDB::IndexProxy {
+  FileOffset Offset;      // Offset of the record in the index trie.
+  ArrayRef<uint8_t> Hash; // Full hash of the object.
+  TrieRecord &Ref;        // The atomic trie record itself.
+};
+
+/// Insert a mapped region keyed by \p Hash. If an entry already exists, the
+/// new mapping is discarded and the existing entry's address is returned, so
+/// the returned pointer is stable across duplicate insertions.
+template <size_t N>
+uintptr_t StandaloneDataMap<N>::insert(
+    ArrayRef<uint8_t> Hash, TrieRecord::StorageKind SK,
+    std::unique_ptr<sys::fs::mapped_file_region> Region) {
+  auto &S = getShard(Hash);
+  std::lock_guard<std::mutex> Lock(S.Mutex);
+  auto &V = S.Map[Hash.data()];
+  if (!V)
+    V = std::make_unique<StandaloneDataInMemory>(std::move(Region), SK);
+  return reinterpret_cast<uintptr_t>(V.get());
+}
+
+/// Find a previously inserted standalone object by hash; returns nullptr if
+/// it has not been loaded.
+template <size_t N>
+const StandaloneDataInMemory *
+StandaloneDataMap<N>::lookup(ArrayRef<uint8_t> Hash) const {
+  auto &S = getShard(Hash);
+  std::lock_guard<std::mutex> Lock(S.Mutex);
+  auto I = S.Map.find(Hash.data());
+  if (I == S.Map.end())
+    return nullptr;
+  return &*I->second;
+}
+
+namespace {
+
+/// Copy of \a sys::fs::TempFile that skips RemoveOnSignal, which is too
+/// expensive to register/unregister at this rate.
+///
+/// FIXME: Add a TempFileManager that maintains a thread-safe list of open temp
+/// files and has a signal handler registered that removes them all.
+class TempFile {
+  bool Done = false; // Set once keep() or discard() has run.
+  TempFile(StringRef Name, int FD) : TmpName(std::string(Name)), FD(FD) {}
+
+public:
+  /// This creates a temporary file with createUniqueFile.
+  static Expected<TempFile> create(const Twine &Model);
+  TempFile(TempFile &&Other) { *this = std::move(Other); }
+  TempFile &operator=(TempFile &&Other) {
+    TmpName = std::move(Other.TmpName);
+    FD = Other.FD;
+    // Neuter the moved-from object so its destructor is a no-op.
+    Other.Done = true;
+    Other.FD = -1;
+    return *this;
+  }
+
+  // Name of the temporary file.
+  std::string TmpName;
+
+  // The open file descriptor.
+  int FD = -1;
+
+  // Keep this with the given name.
+  Error keep(const Twine &Name);
+  Error discard();
+
+  // This checks that keep or delete was called.
+  ~TempFile() { consumeError(discard()); }
+};
+
+/// A TempFile whose contents are mapped into memory. The mapping is released
+/// before the file is kept or discarded.
+class MappedTempFile {
+public:
+  char *data() const { return Map.data(); }
+  size_t size() const { return Map.size(); }
+
+  /// Unmap and delete the underlying temporary file.
+  Error discard() {
+    assert(Map && "Map already destroyed");
+    Map.unmap();
+    return Temp.discard();
+  }
+
+  /// Unmap and rename the underlying temporary file to \p Name.
+  Error keep(const Twine &Name) {
+    assert(Map && "Map already destroyed");
+    Map.unmap();
+    return Temp.keep(Name);
+  }
+
+  MappedTempFile(TempFile Temp, sys::fs::mapped_file_region Map)
+      : Temp(std::move(Temp)), Map(std::move(Map)) {}
+
+private:
+  TempFile Temp;
+  sys::fs::mapped_file_region Map;
+};
+} // namespace
+
+/// Close the descriptor (if open) and remove the file from disk. Safe to call
+/// more than once; also called from the destructor.
+Error TempFile::discard() {
+  Done = true;
+  if (FD != -1) {
+    sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+    if (std::error_code EC = sys::fs::closeFile(File))
+      return errorCodeToError(EC);
+  }
+  FD = -1;
+
+  // Always try to close and remove.
+  // NOTE(review): RemoveEC is never used below — candidate for removal
+  // upstream.
+  std::error_code RemoveEC;
+  if (!TmpName.empty()) {
+    std::error_code EC = sys::fs::remove(TmpName);
+    if (EC)
+      return errorCodeToError(EC);
+  }
+  TmpName = "";
+
+  return Error::success();
+}
+
+/// Rename the temporary file to \p Name and close the descriptor. The close
+/// error (if any) takes precedence; otherwise the rename result is returned.
+Error TempFile::keep(const Twine &Name) {
+  assert(!Done);
+  Done = true;
+  // Always try to close and rename.
+  std::error_code RenameEC = sys::fs::rename(TmpName, Name);
+
+  if (!RenameEC)
+    TmpName = "";
+
+  sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+  if (std::error_code EC = sys::fs::closeFile(File))
+    return errorCodeToError(EC);
+  FD = -1;
+
+  return errorCodeToError(RenameEC);
+}
+
+/// Create a unique temporary file from \p Model (see
+/// sys::fs::createUniqueFile for the model syntax).
+Expected<TempFile> TempFile::create(const Twine &Model) {
+  int FD;
+  SmallString<128> ResultPath;
+  if (std::error_code EC = sys::fs::createUniqueFile(Model, FD, ResultPath))
+    return errorCodeToError(EC);
+
+  TempFile Ret(ResultPath, FD);
+  return std::move(Ret);
+}
+
+/// Atomically swap \p Existing for \p New. On failure, \p Existing is
+/// overwritten with the value actually found in storage so the caller can
+/// retry or inspect the concurrent update.
+bool TrieRecord::compare_exchange_strong(Data &Existing, Data New) {
+  uint64_t ExistingPacked = pack(Existing);
+  uint64_t NewPacked = pack(New);
+  if (Storage.compare_exchange_strong(ExistingPacked, NewPacked))
+    return true;
+  Existing = unpack(ExistingPacked);
+  return false;
+}
+
+/// Construct a record in caller-provided memory \p Mem, computing the layout
+/// from \p I. \p Mem must be large enough for Layout(I).getTotalSize() bytes.
+DataRecordHandle DataRecordHandle::construct(char *Mem, const Input &I) {
+  return constructImpl(Mem, I, Layout(I));
+}
+
+/// Read a record header at \p Offset in \p Pool and validate that the full
+/// record, as described by that header, fits inside the pool.
+Expected<DataRecordHandle>
+DataRecordHandle::getFromDataPool(const OnDiskDataAllocator &Pool,
+                                  FileOffset Offset) {
+  auto HeaderData = Pool.get(Offset, sizeof(DataRecordHandle::Header));
+  if (!HeaderData)
+    return HeaderData.takeError();
+
+  auto Record = DataRecordHandle::get(HeaderData->data());
+  // Bounds check using sizes decoded from the (untrusted) header.
+  if (Record.getTotalSize() + Offset.get() > Pool.size())
+    return createStringError(
+        make_error_code(std::errc::illegal_byte_sequence),
+        "data record span passed the end of the data pool");
+
+  return Record;
+}
+
+/// Serialize \p I into \p Mem following the precomputed layout \p L: header,
+/// then optional out-of-line size fields, then refs, then data plus a trailing
+/// null byte. Debug builds re-read every field to verify the round trip.
+DataRecordHandle DataRecordHandle::constructImpl(char *Mem, const Input &I,
+                                                 const Layout &L) {
+  char *Next = Mem + sizeof(Header);
+
+  // Fill in Packed and set other data, then come back to construct the header.
+  Header::PackTy Packed = 0;
+  Packed |= LayoutFlags::pack(L.Flags) << Header::LayoutFlagsShift;
+
+  // Construct DataSize. 1B/2B sizes live inside the packed header; 4B/8B
+  // sizes are written little-endian right after it.
+  switch (L.Flags.DataSize) {
+  case DataSizeFlags::Uses1B:
+    assert(I.Data.size() <= UINT8_MAX);
+    Packed |= (Header::PackTy)I.Data.size()
+              << ((sizeof(Packed) - 2) * CHAR_BIT);
+    break;
+  case DataSizeFlags::Uses2B:
+    assert(I.Data.size() <= UINT16_MAX);
+    Packed |= (Header::PackTy)I.Data.size()
+              << ((sizeof(Packed) - 4) * CHAR_BIT);
+    break;
+  case DataSizeFlags::Uses4B:
+    support::endian::write32le(Next, I.Data.size());
+    Next += 4;
+    break;
+  case DataSizeFlags::Uses8B:
+    support::endian::write64le(Next, I.Data.size());
+    Next += 8;
+    break;
+  }
+
+  // Construct NumRefs.
+  //
+  // NOTE: May be writing NumRefs even if there are zero refs in order to fix
+  // alignment.
+  switch (L.Flags.NumRefs) {
+  case NumRefsFlags::Uses0B:
+    break;
+  case NumRefsFlags::Uses1B:
+    assert(I.Refs.size() <= UINT8_MAX);
+    Packed |= (Header::PackTy)I.Refs.size()
+              << ((sizeof(Packed) - 2) * CHAR_BIT);
+    break;
+  case NumRefsFlags::Uses2B:
+    assert(I.Refs.size() <= UINT16_MAX);
+    Packed |= (Header::PackTy)I.Refs.size()
+              << ((sizeof(Packed) - 4) * CHAR_BIT);
+    break;
+  case NumRefsFlags::Uses4B:
+    support::endian::write32le(Next, I.Refs.size());
+    Next += 4;
+    break;
+  case NumRefsFlags::Uses8B:
+    support::endian::write64le(Next, I.Refs.size());
+    Next += 8;
+    break;
+  }
+
+  // Construct Refs[].
+  if (!I.Refs.empty()) {
+    assert((L.Flags.RefKind == RefKindFlags::InternalRef4B) == I.Refs.is4B());
+    ArrayRef<uint8_t> RefsBuffer = I.Refs.getBuffer();
+    llvm::copy(RefsBuffer, Next);
+    Next += RefsBuffer.size();
+  }
+
+  // Construct Data and the trailing null.
+  assert(isAddrAligned(Align(8), Next));
+  llvm::copy(I.Data, Next);
+  Next[I.Data.size()] = 0;
+
+  // Construct the header itself and return.
+  Header *H = new (Mem) Header{Packed};
+  DataRecordHandle Record(*H);
+  assert(Record.getData() == I.Data);
+  assert(Record.getNumRefs() == I.Refs.size());
+  assert(Record.getRefs() == I.Refs);
+  assert(Record.getLayoutFlags().DataSize == L.Flags.DataSize);
+  assert(Record.getLayoutFlags().NumRefs == L.Flags.NumRefs);
+  assert(Record.getLayoutFlags().RefKind == L.Flags.RefKind);
+  return Record;
+}
+
+/// Compute the most compact on-disk layout for \p I: choose the smallest
+/// storage for DataSize and NumRefs (the packed header has one 1B and one 2B
+/// slot shared between them), then grow a size field if needed so that the
+/// refs array / data end up 8B-aligned without explicit padding.
+DataRecordHandle::Layout::Layout(const Input &I) {
+  // Start initial relative offsets right after the Header.
+  uint64_t RelOffset = sizeof(Header);
+
+  // Initialize the easy stuff.
+  DataSize = I.Data.size();
+  NumRefs = I.Refs.size();
+
+  // Check refs size.
+  Flags.RefKind =
+      I.Refs.is4B() ? RefKindFlags::InternalRef4B : RefKindFlags::InternalRef;
+
+  // Find the smallest slot available for DataSize. Has1B/Has2B track which of
+  // the two in-header slots are still free for NumRefs below.
+  bool Has1B = true;
+  bool Has2B = true;
+  if (DataSize <= UINT8_MAX && Has1B) {
+    Flags.DataSize = DataSizeFlags::Uses1B;
+    Has1B = false;
+  } else if (DataSize <= UINT16_MAX && Has2B) {
+    Flags.DataSize = DataSizeFlags::Uses2B;
+    Has2B = false;
+  } else if (DataSize <= UINT32_MAX) {
+    Flags.DataSize = DataSizeFlags::Uses4B;
+    RelOffset += 4;
+  } else {
+    Flags.DataSize = DataSizeFlags::Uses8B;
+    RelOffset += 8;
+  }
+
+  // Find the smallest slot available for NumRefs. Never sets NumRefs8B here.
+  if (!NumRefs) {
+    Flags.NumRefs = NumRefsFlags::Uses0B;
+  } else if (NumRefs <= UINT8_MAX && Has1B) {
+    Flags.NumRefs = NumRefsFlags::Uses1B;
+    Has1B = false;
+  } else if (NumRefs <= UINT16_MAX && Has2B) {
+    Flags.NumRefs = NumRefsFlags::Uses2B;
+    Has2B = false;
+  } else {
+    Flags.NumRefs = NumRefsFlags::Uses4B;
+    RelOffset += 4;
+  }
+
+  // Helper to "upgrade" either DataSize or NumRefs by 4B to avoid complicated
+  // padding rules when reading and writing. This also bumps RelOffset.
+  //
+  // The value for NumRefs is strictly limited to UINT32_MAX, but it can be
+  // stored as 8B. This means we can *always* find a size to grow.
+  //
+  // NOTE: Only call this once.
+  auto GrowSizeFieldsBy4B = [&]() {
+    assert(isAligned(Align(4), RelOffset));
+    RelOffset += 4;
+
+    assert(Flags.NumRefs != NumRefsFlags::Uses8B &&
+           "Expected to be able to grow NumRefs8B");
+
+    // First try to grow DataSize. NumRefs will not (yet) be 8B, and if
+    // DataSize is upgraded to 8B it'll already be aligned.
+    //
+    // Failing that, grow NumRefs.
+    if (Flags.DataSize < DataSizeFlags::Uses4B)
+      Flags.DataSize = DataSizeFlags::Uses4B; // DataSize: Packed => 4B.
+    else if (Flags.DataSize < DataSizeFlags::Uses8B)
+      Flags.DataSize = DataSizeFlags::Uses8B; // DataSize: 4B => 8B.
+    else if (Flags.NumRefs < NumRefsFlags::Uses4B)
+      Flags.NumRefs = NumRefsFlags::Uses4B; // NumRefs: Packed => 4B.
+    else
+      Flags.NumRefs = NumRefsFlags::Uses8B; // NumRefs: 4B => 8B.
+  };
+
+  assert(isAligned(Align(4), RelOffset));
+  if (Flags.RefKind == RefKindFlags::InternalRef) {
+    // List of 8B refs should be 8B-aligned. Grow one of the sizes to get this
+    // without padding.
+    if (!isAligned(Align(8), RelOffset))
+      GrowSizeFieldsBy4B();
+
+    assert(isAligned(Align(8), RelOffset));
+    RefsRelOffset = RelOffset;
+    RelOffset += 8 * NumRefs;
+  } else {
+    // The array of 4B refs doesn't need 8B alignment, but the data will need
+    // to be 8B-aligned. Detect this now, and, if necessary, shift everything
+    // by 4B by growing one of the sizes.
+    // If we remove the need for 8B-alignment for data there is <1% savings in
+    // disk storage for a clang build using MCCAS but the 8B-alignment may be
+    // useful in the future so keep it for now.
+    uint64_t RefListSize = 4 * NumRefs;
+    if (!isAligned(Align(8), RelOffset + RefListSize))
+      GrowSizeFieldsBy4B();
+    RefsRelOffset = RelOffset;
+    RelOffset += RefListSize;
+  }
+
+  assert(isAligned(Align(8), RelOffset));
+  DataRelOffset = RelOffset;
+}
+
+/// Decode DataSize: 1B/2B values are packed in the header; 4B/8B values are
+/// stored little-endian immediately after it.
+// NOTE(review): the switch covers every enum value but there is no
+// llvm_unreachable/return after it; some compilers warn about control reaching
+// the end of a non-void function — candidate to add llvm_unreachable upstream.
+uint64_t DataRecordHandle::getDataSize() const {
+  int64_t RelOffset = sizeof(Header);
+  auto *DataSizePtr = reinterpret_cast<const char *>(H) + RelOffset;
+  switch (getLayoutFlags().DataSize) {
+  case DataSizeFlags::Uses1B:
+    return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX;
+  case DataSizeFlags::Uses2B:
+    return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) &
+           UINT16_MAX;
+  case DataSizeFlags::Uses4B:
+    return support::endian::read32le(DataSizePtr);
+  case DataSizeFlags::Uses8B:
+    return support::endian::read64le(DataSizePtr);
+  }
+}
+
+/// Advance \p RelOffset past the out-of-line DataSize field, if any.
+/// Both conditions hold for Uses8B, adding 8 bytes total.
+void DataRecordHandle::skipDataSize(LayoutFlags LF, int64_t &RelOffset) const {
+  if (LF.DataSize >= DataSizeFlags::Uses4B)
+    RelOffset += 4;
+  if (LF.DataSize >= DataSizeFlags::Uses8B)
+    RelOffset += 4;
+}
+
+/// Decode NumRefs, mirroring \a getDataSize(). For Uses8B, the 64-bit read is
+/// truncated to the uint32_t return type; this relies on the invariant that
+/// NumRefs is strictly limited to UINT32_MAX (see Layout's comments).
+uint32_t DataRecordHandle::getNumRefs() const {
+  LayoutFlags LF = getLayoutFlags();
+  int64_t RelOffset = sizeof(Header);
+  skipDataSize(LF, RelOffset);
+  auto *NumRefsPtr = reinterpret_cast<const char *>(H) + RelOffset;
+  switch (LF.NumRefs) {
+  case NumRefsFlags::Uses0B:
+    return 0;
+  case NumRefsFlags::Uses1B:
+    return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX;
+  case NumRefsFlags::Uses2B:
+    return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) &
+           UINT16_MAX;
+  case NumRefsFlags::Uses4B:
+    return support::endian::read32le(NumRefsPtr);
+  case NumRefsFlags::Uses8B:
+    return support::endian::read64le(NumRefsPtr);
+  }
+}
+
+/// Advance \p RelOffset past the out-of-line NumRefs field, if any.
+/// Both conditions hold for Uses8B, adding 8 bytes total.
+void DataRecordHandle::skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const {
+  if (LF.NumRefs >= NumRefsFlags::Uses4B)
+    RelOffset += 4;
+  if (LF.NumRefs >= NumRefsFlags::Uses8B)
+    RelOffset += 4;
+}
+
+/// Byte offset of the refs array relative to the record start (header plus
+/// any out-of-line size fields).
+int64_t DataRecordHandle::getRefsRelOffset() const {
+  LayoutFlags LF = getLayoutFlags();
+  int64_t RelOffset = sizeof(Header);
+  skipDataSize(LF, RelOffset);
+  skipNumRefs(LF, RelOffset);
+  return RelOffset;
+}
+
+/// Byte offset of the data payload relative to the record start: the refs
+/// offset plus the size of the refs array (4B or 8B per ref).
+int64_t DataRecordHandle::getDataRelOffset() const {
+  LayoutFlags LF = getLayoutFlags();
+  int64_t RelOffset = sizeof(Header);
+  skipDataSize(LF, RelOffset);
+  skipNumRefs(LF, RelOffset);
+  uint32_t RefSize = LF.RefKind == RefKindFlags::InternalRef4B ? 4 : 8;
+  RelOffset += RefSize * getNumRefs();
+  return RelOffset;
+}
+
+Error OnDiskGraphDB::validate(bool Deep, HashingFuncT Hasher) const {
+  return Index.validate([&](FileOffset Offset,
+                            OnDiskTrieRawHashMap::ConstValueProxy Record)
+                            -> Error {
+    auto formatError = [&](Twine Msg) {
+      return createStringError(
+          llvm::errc::illegal_byte_sequence,
+          "bad record at 0x" +
+              utohexstr((unsigned)Offset.get(), /*LowerCase=*/true) + ": " +
+              Msg.str());
+    };
+
+    if (Record.Data.size() != sizeof(TrieRecord))
+      return formatError("wrong data record size");
+    if (!isAligned(Align::Of<TrieRecord>(), Record.Data.size()))
+      return formatError("wrong data record alignment");
+
+    auto *R = reinterpret_cast<const TrieRecord *>(Record.Data.data());
+    TrieRecord::Data D = R->load();
+    std::unique_ptr<MemoryBuffer> FileBuffer;
+    if ((uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::Unknown &&
+        (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::DataPool &&
+        (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::Standalone &&
+        (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::StandaloneLeaf &&
+        (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::StandaloneLeaf0)
+      return formatError("invalid record kind value");
+
+    auto Ref = InternalRef::getFromOffset(Offset);
+    auto I = getIndexProxyFromRef(Ref);
+    if (!I)
+      return I.takeError();
+
+    switch (D.SK) {
+    case TrieRecord::StorageKind::Unknown:
+      // This could be an abandoned entry due to a termination before updating
+      // the record. It can be reused by later insertion so just skip this entry
+      // for now.
+      return Error::success();
+    case TrieRecord::StorageKind::DataPool:
+      // Check offset is a positive value, and large enough to hold the
+      // header for the data record.
+      if (D.Offset.get() <= 0 ||
+          (uint64_t)D.Offset.get() + sizeof(DataRecordHandle::Header) >=
+              DataPool.size())
+        return formatError("datapool record out of bound");
+      break;
+    case TrieRecord::StorageKind::Standalone:
+    case TrieRecord::StorageKind::StandaloneLeaf:
+    case TrieRecord::StorageKind::StandaloneLeaf0:
+      SmallString<256> Path;
+      getStandalonePath(TrieRecord::getStandaloneFilePrefix(D.SK), *I, Path);
+      // If we need to validate the content of the file later, load the buffer
+      // here. Otherwise, just check the existence of the file.
+      if (Deep) {
+        auto File = MemoryBuffer::getFile(Path, /*IsText=*/false,
+                                          /*RequiresNullTerminator=*/false);
+        if (!File || !*File)
+          return formatError("record file \'" + Path + "\' does not exist");
+
+        FileBuffer = std::move(*File);
+      } else if (!llvm::sys::fs::exists(Path))
+        return formatError("record file \'" + Path + "\' does not exist");
+    }
+
+    if (!Deep)
+      return Error::success();
+
+    auto dataError = [&](Twine Msg) {
+      return createStringError(llvm::errc::illegal_byte_sequence,
+                               "bad data for digest \'" + toHex(I->Hash) +
+                                   "\': " + Msg.str());
+    };
+    SmallVector<ArrayRef<uint8_t>> Refs;
+    ArrayRef<char> StoredData;
+
+    switch (D.SK) {
+    case TrieRecord::StorageKind::Unknown:
+      llvm_unreachable("already handled");
+    case TrieRecord::StorageKind::DataPool: {
+      auto DataRecord = DataRecordHandle::getFromDataPool(DataPool, D.Offset);
+      if (!DataRecord)
+        return dataError(toString(DataRecord.takeError()));
+
+      for (auto InternRef : DataRecord->getRefs()) {
+        auto Index = getIndexProxyFromRef(InternRef);
+        if (!Index)
+          return Index.takeError();
+        Refs.push_back(Index->Hash);
+      }
+      StoredData = DataRecord->getData();
+      break;
+    }
+    case TrieRecord::StorageKind::Standalone: {
+      if (FileBuffer->getBufferSize() < sizeof(DataRecordHandle::Header))
+        return dataError("data record is not big enough to read the header");
+      auto DataRecord = DataRecordHandle::get(FileBuffer->getBufferStart());
+      if (DataRecord.getTotalSize() < FileBuffer->getBufferSize())
+        return dataError(
+            "data record span passed the end of the standalone file");
+      for (auto InternRef : DataRecord.getRefs()) {
+        auto Index = getIndexProxyFromRef(InternRef);
+        if (!Index)
+          return Index.takeError();
+        Refs.push_back(Index->Hash);
+      }
+      StoredData = DataRecord.getData();
+      break;
+    }
+    case TrieRecord::StorageKind::StandaloneLeaf:
+    case TrieRecord::StorageKind::StandaloneLeaf0: {
+      StoredData = arrayRefFromStringRef<char>(FileBuffer->getBuffer());
+      if (D.SK == TrieRecord::StorageKind::StandaloneLeaf0) {
+        if (!FileBuffer->getBuffer().ends_with('\0'))
+          return dataError("standalone file is not zero terminated");
+        StoredData = StoredData.drop_back(1);
+      }
+      break;
+    }
+    }
+
+    SmallVector<uint8_t> ComputedHash;
+    Hasher(Refs, StoredData, ComputedHash);
+    if (I->Hash != ArrayRef(ComputedHash))
+      return dataError("hash mismatch, got \'" + toHex(ComputedHash) +
+                       "\' instead");
+
+    return Error::success();
+  });
+}
+
+/// Dump a human-readable summary of the database to \p OS: the root path,
+/// each index entry's storage kind and offset, and -- for entries stored in
+/// the data pool -- the layout (refs/data/total size) of each pooled record.
+void OnDiskGraphDB::print(raw_ostream &OS) const {
+  OS << "on-disk-root-path: " << RootPath << "\n";
+
+  // Collect data-pool offsets while walking the index so pooled records can
+  // be printed afterwards.
+  struct PoolInfo {
+    uint64_t Offset;
+  };
+  SmallVector<PoolInfo> Pool;
+
+  OS << "\n";
+  OS << "index:\n";
+  Index.print(OS, [&](ArrayRef<char> Data) {
+    assert(Data.size() == sizeof(TrieRecord));
+    assert(isAligned(Align::Of<TrieRecord>(), Data.size()));
+    auto *R = reinterpret_cast<const TrieRecord *>(Data.data());
+    TrieRecord::Data D = R->load();
+    OS << " SK=";
+    switch (D.SK) {
+    case TrieRecord::StorageKind::Unknown:
+      OS << "unknown          ";
+      break;
+    case TrieRecord::StorageKind::DataPool:
+      OS << "datapool         ";
+      Pool.push_back({D.Offset.get()});
+      break;
+    case TrieRecord::StorageKind::Standalone:
+      OS << "standalone-data  ";
+      break;
+    case TrieRecord::StorageKind::StandaloneLeaf:
+      OS << "standalone-leaf  ";
+      break;
+    case TrieRecord::StorageKind::StandaloneLeaf0:
+      OS << "standalone-leaf+0";
+      break;
+    }
+    OS << " Offset=" << (void *)D.Offset.get();
+  });
+  if (Pool.empty())
+    return;
+
+  OS << "\n";
+  OS << "pool:\n";
+  // Print pooled records in increasing address order for readability.
+  llvm::sort(
+      Pool, [](PoolInfo LHS, PoolInfo RHS) { return LHS.Offset < RHS.Offset; });
+  for (PoolInfo PI : Pool) {
+    OS << "- addr=" << (void *)PI.Offset << " ";
+    auto D = DataRecordHandle::getFromDataPool(DataPool, FileOffset(PI.Offset));
+    if (!D) {
+      // Stop at the first unreadable record; the remaining offsets would not
+      // be trustworthy anyway.
+      OS << "error: " << toString(D.takeError());
+      return;
+    }
+
+    OS << "record refs=" << D->getNumRefs() << " data=" << D->getDataSize()
+       << " size=" << D->getTotalSize()
+       << " end=" << (void *)(PI.Offset + D->getTotalSize()) << "\n";
+  }
+}
+
+/// Find-or-insert \p Hash in the index trie. New entries get a
+/// default-constructed TrieRecord (StorageKind::Unknown) so they can be
+/// filled in later by store()/createStandaloneLeaf().
+Expected<OnDiskGraphDB::IndexProxy>
+OnDiskGraphDB::indexHash(ArrayRef<uint8_t> Hash) {
+  auto P = Index.insertLazy(
+      Hash, [](FileOffset TentativeOffset,
+               OnDiskTrieRawHashMap::ValueProxy TentativeValue) {
+        assert(TentativeValue.Data.size() == sizeof(TrieRecord));
+        assert(
+            isAddrAligned(Align::Of<TrieRecord>(), TentativeValue.Data.data()));
+        // Placement-new so the on-disk bytes start in a valid Unknown state.
+        new (TentativeValue.Data.data()) TrieRecord();
+      });
+  if (LLVM_UNLIKELY(!P))
+    return P.takeError();
+
+  assert(*P && "Expected insertion");
+  return getIndexProxyFromPointer(*P);
+}
+
+/// Build an IndexProxy (offset, hash, mutable record) from a trie pointer.
+/// The const_cast is safe because TrieRecord is only mutated through atomic
+/// operations even when reached via a const lookup.
+OnDiskGraphDB::IndexProxy OnDiskGraphDB::getIndexProxyFromPointer(
+    OnDiskTrieRawHashMap::ConstOnDiskPtr P) const {
+  assert(P);
+  assert(P.getOffset());
+  return IndexProxy{P.getOffset(), P->Hash,
+                    *const_cast<TrieRecord *>(
+                        reinterpret_cast<const TrieRecord *>(P->Data.data()))};
+}
+
+/// Get (creating the index entry if needed) an external ObjectID for \p Hash.
+Expected<ObjectID> OnDiskGraphDB::getReference(ArrayRef<uint8_t> Hash) {
+  auto I = indexHash(Hash);
+  if (LLVM_UNLIKELY(!I))
+    return I.takeError();
+  return getExternalReference(*I);
+}
+
+/// Convert an index entry into the external ObjectID handed to clients.
+ObjectID OnDiskGraphDB::getExternalReference(const IndexProxy &I) {
+  return getExternalReference(makeInternalRef(I.Offset));
+}
+
+/// Return an ObjectID for \p Digest only if the object already exists, either
+/// in the primary database or (if configured) in the upstream database.
+/// Returns std::nullopt when the object is unknown everywhere.
+std::optional<ObjectID>
+OnDiskGraphDB::getExistingReference(ArrayRef<uint8_t> Digest) {
+  // Fallback: ask the upstream DB; if it knows the digest, make sure we have
+  // a local index entry so an external reference can be produced.
+  auto tryUpstream =
+      [&](std::optional<IndexProxy> I) -> std::optional<ObjectID> {
+    if (!UpstreamDB)
+      return std::nullopt;
+    std::optional<ObjectID> UpstreamID =
+        UpstreamDB->getExistingReference(Digest);
+    if (LLVM_UNLIKELY(!UpstreamID))
+      return std::nullopt;
+    auto Ref = expectedToOptional(indexHash(Digest));
+    if (!Ref)
+      return std::nullopt;
+    if (!I)
+      I.emplace(*Ref);
+    return getExternalReference(*I);
+  };
+
+  OnDiskTrieRawHashMap::ConstOnDiskPtr P = Index.find(Digest);
+  if (!P)
+    return tryUpstream(std::nullopt);
+  IndexProxy I = getIndexProxyFromPointer(P);
+  TrieRecord::Data Obj = I.Ref.load();
+  // An index entry with Unknown storage means the object was referenced but
+  // never stored locally; check upstream before giving up.
+  if (Obj.SK == TrieRecord::StorageKind::Unknown)
+    return tryUpstream(I);
+  return getExternalReference(makeInternalRef(I.Offset));
+}
+
+/// Recover the index entry for an internal reference (a file offset into the
+/// index trie). Fails if the offset does not point at a valid entry.
+Expected<OnDiskGraphDB::IndexProxy>
+OnDiskGraphDB::getIndexProxyFromRef(InternalRef Ref) const {
+  auto P = Index.recoverFromFileOffset(Ref.getFileOffset());
+  if (LLVM_UNLIKELY(!P))
+    return P.takeError();
+  return getIndexProxyFromPointer(*P);
+}
+
+/// Look up the hash digest stored for an internal reference.
+Expected<ArrayRef<uint8_t>> OnDiskGraphDB::getDigest(InternalRef Ref) const {
+  auto I = getIndexProxyFromRef(Ref);
+  if (!I)
+    return I.takeError();
+  return I->Hash;
+}
+
+/// Digest accessor for an already-resolved index entry.
+ArrayRef<uint8_t> OnDiskGraphDB::getDigest(const IndexProxy &I) const {
+  return I.Hash;
+}
+
+/// Decode an ObjectHandle into the content it refers to. The handle's low bit
+/// is a tag: set => pointer to a StandaloneDataInMemory (big object mapped
+/// from its own file); clear => FileOffset of a record in the data pool.
+static OnDiskContent getContentFromHandle(const OnDiskDataAllocator &DataPool,
+                                          ObjectHandle OH) {
+  // Decode ObjectHandle to locate the stored content.
+  uint64_t Data = OH.getOpaqueData();
+  if (Data & 1) {
+    // Mask off the tag bit to recover the original pointer.
+    const auto *SDIM =
+        reinterpret_cast<const StandaloneDataInMemory *>(Data & (-1ULL << 1));
+    return SDIM->getContent();
+  }
+
+  // Pool offsets come from a previously-validated load(), so failure here
+  // would indicate corruption; cantFail() asserts that invariant.
+  auto DataHandle =
+      cantFail(DataRecordHandle::getFromDataPool(DataPool, FileOffset(Data)));
+  assert(DataHandle.getData().end()[0] == 0 && "Null termination");
+  return OnDiskContent{DataHandle, std::nullopt};
+}
+
+/// Return the raw data bytes of a loaded object, whether it lives in the
+/// data pool (record form) or in a standalone leaf file (raw bytes form).
+ArrayRef<char> OnDiskGraphDB::getObjectData(ObjectHandle Node) const {
+  OnDiskContent Content = getContentFromHandle(DataPool, Node);
+  if (Content.Bytes)
+    return *Content.Bytes;
+  assert(Content.Record && "Expected record or bytes");
+  return Content.Record->getData();
+}
+
+/// Return the references stored with a loaded object. Leaf objects (raw
+/// bytes, no record) have none, so an empty array is returned for them.
+InternalRefArrayRef OnDiskGraphDB::getInternalRefs(ObjectHandle Node) const {
+  if (std::optional<DataRecordHandle> Record =
+          getContentFromHandle(DataPool, Node).Record)
+    return Record->getRefs();
+  return std::nullopt;
+}
+
+/// Load the object for \p ExternalRef, returning std::nullopt if it is not
+/// materialized anywhere. DataPool objects resolve to their pool offset;
+/// standalone objects are mmapped from their file and cached in
+/// StandaloneData. Missing objects are faulted in from upstream if possible.
+Expected<std::optional<ObjectHandle>>
+OnDiskGraphDB::load(ObjectID ExternalRef) {
+  InternalRef Ref = getInternalRef(ExternalRef);
+  auto I = getIndexProxyFromRef(Ref);
+  if (!I)
+    return I.takeError();
+  TrieRecord::Data Object = I->Ref.load();
+
+  if (Object.SK == TrieRecord::StorageKind::Unknown) {
+    if (!UpstreamDB)
+      return std::nullopt;
+    return faultInFromUpstream(ExternalRef);
+  }
+
+  if (Object.SK == TrieRecord::StorageKind::DataPool)
+    return ObjectHandle::fromFileOffset(Object.Offset);
+
+  // Only TrieRecord::StorageKind::Standalone (and variants) need to be
+  // explicitly loaded.
+  //
+  // There's corruption if standalone objects have offsets, or if we get here
+  // for something that isn't standalone.
+  if (Object.Offset)
+    return createCorruptObjectError(getDigest(*I));
+  switch (Object.SK) {
+  case TrieRecord::StorageKind::Unknown:
+  case TrieRecord::StorageKind::DataPool:
+    llvm_unreachable("unexpected storage kind");
+  case TrieRecord::StorageKind::Standalone:
+  case TrieRecord::StorageKind::StandaloneLeaf0:
+  case TrieRecord::StorageKind::StandaloneLeaf:
+    break;
+  }
+
+  // Load it from disk.
+  //
+  // Note: Creation logic guarantees that data that needs null-termination is
+  // suitably 0-padded. Requiring null-termination here would be too expensive
+  // for extremely large objects that happen to be page-aligned.
+  SmallString<256> Path;
+  getStandalonePath(TrieRecord::getStandaloneFilePrefix(Object.SK), *I, Path);
+
+  auto File = sys::fs::openNativeFileForRead(Path);
+  if (!File)
+    return createFileError(Path, File.takeError());
+
+  auto CloseFile = make_scope_exit([&]() { sys::fs::closeFile(*File); });
+
+  sys::fs::file_status Status;
+  // Failure to stat/map a file the index says exists is treated as
+  // corruption rather than a transient file error.
+  if (std::error_code EC = sys::fs::status(*File, Status))
+    return createCorruptObjectError(getDigest(*I));
+
+  std::error_code EC;
+  auto Region = std::make_unique<sys::fs::mapped_file_region>(
+      *File, sys::fs::mapped_file_region::readonly, Status.getSize(), 0, EC);
+  if (EC)
+    return createCorruptObjectError(getDigest(*I));
+
+  // Cache the mapping in the standalone-data map; the returned handle encodes
+  // the cached entry's address.
+  return ObjectHandle::fromMemory(
+      static_cast<StandaloneDataMapTy *>(StandaloneData)
+          ->insert(I->Hash, Object.SK, std::move(Region)));
+}
+
+/// Check whether \p Ref is materialized, faulting it in from upstream first
+/// when it only exists there. Returns true once the object is available in
+/// the primary database.
+Expected<bool> OnDiskGraphDB::isMaterialized(ObjectID Ref) {
+  auto Presence = getObjectPresence(Ref, /*CheckUpstream=*/true);
+  if (!Presence)
+    return Presence.takeError();
+
+  switch (*Presence) {
+  case ObjectPresence::Missing:
+    return false;
+  case ObjectPresence::InPrimaryDB:
+    return true;
+  case ObjectPresence::OnlyInUpstreamDB:
+    if (auto FaultInResult = faultInFromUpstream(Ref); !FaultInResult)
+      return FaultInResult.takeError();
+    return true;
+  }
+  // NOTE(review): all enumerators are handled above, but there is no
+  // llvm_unreachable after the switch; some compilers may emit a
+  // -Wreturn-type warning here -- consider adding one.
+}
+
+/// Classify where (if anywhere) the object behind \p ExternalRef is stored.
+/// Upstream is only consulted when \p CheckUpstream is set and an upstream
+/// database is configured.
+Expected<OnDiskGraphDB::ObjectPresence>
+OnDiskGraphDB::getObjectPresence(ObjectID ExternalRef,
+                                 bool CheckUpstream) const {
+  InternalRef Ref = getInternalRef(ExternalRef);
+  auto I = getIndexProxyFromRef(Ref);
+  if (!I)
+    return I.takeError();
+
+  TrieRecord::Data Object = I->Ref.load();
+  // Any storage kind other than Unknown means the bytes are local.
+  if (Object.SK != TrieRecord::StorageKind::Unknown)
+    return ObjectPresence::InPrimaryDB;
+  if (!CheckUpstream || !UpstreamDB)
+    return ObjectPresence::Missing;
+  std::optional<ObjectID> UpstreamID =
+      UpstreamDB->getExistingReference(getDigest(*I));
+  return UpstreamID.has_value() ? ObjectPresence::OnlyInUpstreamDB
+                                : ObjectPresence::Missing;
+}
+
+/// Wrap an index-trie file offset as an InternalRef.
+InternalRef OnDiskGraphDB::makeInternalRef(FileOffset IndexOffset) {
+  return InternalRef::getFromOffset(IndexOffset);
+}
+
+/// Compute the path of a standalone object file:
+/// <RootPath>/<Prefix><index-offset>.<CASFormatVersion>. The index offset is
+/// unique per object, so it doubles as the file name.
+void OnDiskGraphDB::getStandalonePath(StringRef Prefix, const IndexProxy &I,
+                                      SmallVectorImpl<char> &Path) const {
+  Path.assign(RootPath.begin(), RootPath.end());
+  sys::path::append(Path,
+                    Prefix + Twine(I.Offset.get()) + "." + CASFormatVersion);
+}
+
+/// View the mapped standalone file as OnDiskContent. Leaf kinds are raw
+/// bytes (Leaf0 files carry one extra trailing 0 byte that is stripped);
+/// Standalone kind wraps a full DataRecordHandle.
+OnDiskContent StandaloneDataInMemory::getContent() const {
+  bool Leaf0 = false;
+  bool Leaf = false;
+  switch (SK) {
+  default:
+    llvm_unreachable("Storage kind must be standalone");
+  case TrieRecord::StorageKind::Standalone:
+    break;
+  case TrieRecord::StorageKind::StandaloneLeaf0:
+    Leaf = Leaf0 = true;
+    break;
+  case TrieRecord::StorageKind::StandaloneLeaf:
+    Leaf = true;
+    break;
+  }
+
+  if (Leaf) {
+    StringRef Data(Region->data(), Region->size());
+    // Leaf0 stores an explicit trailing 0; plain leaves rely on page padding.
+    assert(Data.drop_back(Leaf0).end()[0] == 0 &&
+           "Standalone node data missing null termination");
+    return OnDiskContent{std::nullopt,
+                         arrayRefFromStringRef<char>(Data.drop_back(Leaf0))};
+  }
+
+  DataRecordHandle Record = DataRecordHandle::get(Region->data());
+  assert(Record.getData().end()[0] == 0 &&
+         "Standalone object record missing null termination for data");
+  return OnDiskContent{Record, std::nullopt};
+}
+
+/// Create a uniquely-named temp file next to \p FinalPath, preallocate and
+/// resize it to \p Size, and return it mapped read-write. The caller renames
+/// it into place via MappedTempFile::keep() once the contents are written.
+static Expected<MappedTempFile> createTempFile(StringRef FinalPath,
+                                               uint64_t Size) {
+  assert(Size && "Unexpected request for an empty temp file");
+  Expected<TempFile> File = TempFile::create(FinalPath + ".%%%%%%");
+  if (!File)
+    return File.takeError();
+
+  // Preallocate to surface out-of-space errors before mapping.
+  if (Error E = preallocateFileTail(File->FD, 0, Size).takeError())
+    return createFileError(File->TmpName, std::move(E));
+
+  if (auto EC = sys::fs::resize_file_before_mapping_readwrite(File->FD, Size))
+    return createFileError(File->TmpName, EC);
+
+  std::error_code EC;
+  sys::fs::mapped_file_region Map(sys::fs::convertFDToNativeFile(File->FD),
+                                  sys::fs::mapped_file_region::readwrite, Size,
+                                  0, EC);
+  if (EC)
+    return createFileError(File->TmpName, EC);
+  return MappedTempFile(std::move(*File), std::move(Map));
+}
+
+/// Return the system page size, computed once and cached.
+static size_t getPageSize() {
+  static int PageSize = sys::Process::getPageSizeEstimate();
+  return PageSize;
+}
+
+/// Store a large ref-less object in its own file rather than the data pool.
+/// If the data size is an exact multiple of the page size, mmap would provide
+/// no implicit zero padding, so a Leaf0 file with one explicit trailing 0
+/// byte is written instead (keeping the null-termination invariant cheap).
+Error OnDiskGraphDB::createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data) {
+  assert(Data.size() > TrieRecord::MaxEmbeddedSize &&
+         "Expected a bigger file for external content...");
+
+  bool Leaf0 = isAligned(Align(getPageSize()), Data.size());
+  TrieRecord::StorageKind SK = Leaf0 ? TrieRecord::StorageKind::StandaloneLeaf0
+                                     : TrieRecord::StorageKind::StandaloneLeaf;
+
+  SmallString<256> Path;
+  int64_t FileSize = Data.size() + Leaf0;
+  getStandalonePath(TrieRecord::getStandaloneFilePrefix(SK), I, Path);
+
+  // Write the file. Don't reuse this mapped_file_region, which is read/write.
+  // Let load() pull up one that's read-only.
+  Expected<MappedTempFile> File = createTempFile(Path, FileSize);
+  if (!File)
+    return File.takeError();
+  assert(File->size() == (uint64_t)FileSize);
+  llvm::copy(Data, File->data());
+  if (Leaf0)
+    File->data()[Data.size()] = 0;
+  assert(File->data()[Data.size()] == 0);
+  if (Error E = File->keep(Path))
+    return E;
+
+  // Store the object reference.
+  TrieRecord::Data Existing;
+  {
+    TrieRecord::Data Leaf{SK, FileOffset()};
+    if (I.Ref.compare_exchange_strong(Existing, Leaf)) {
+      recordStandaloneSizeIncrease(FileSize);
+      return Error::success();
+    }
+  }
+
+  // If there was a race, confirm that the new value has valid storage.
+  if (Existing.SK == TrieRecord::StorageKind::Unknown)
+    return createCorruptObjectError(getDigest(I));
+
+  return Error::success();
+}
+
+/// Store an object (data + references) under \p ID. Small records go into
+/// the shared data pool; large ref-less data becomes a standalone leaf file;
+/// large records (or pool-exhaustion fallback) become standalone files.
+/// Storing an already-present object is a no-op; concurrent stores of the
+/// same object race benignly via compare_exchange on the TrieRecord.
+Error OnDiskGraphDB::store(ObjectID ID, ArrayRef<ObjectID> Refs,
+                           ArrayRef<char> Data) {
+  auto I = getIndexProxyFromRef(getInternalRef(ID));
+  if (LLVM_UNLIKELY(!I))
+    return I.takeError();
+
+  // Early return in case the node exists.
+  {
+    TrieRecord::Data Existing = I->Ref.load();
+    if (Existing.SK != TrieRecord::StorageKind::Unknown)
+      return Error::success();
+  }
+
+  // Big leaf nodes.
+  if (Refs.empty() && Data.size() > TrieRecord::MaxEmbeddedSize)
+    return createStandaloneLeaf(*I, Data);
+
+  // TODO: Check whether it's worth checking the index for an already existing
+  // object (like storeTreeImpl() does) before building up the
+  // InternalRefVector.
+  InternalRefVector InternalRefs;
+  for (ObjectID Ref : Refs)
+    InternalRefs.push_back(getInternalRef(Ref));
+
+  // Create the object.
+
+  DataRecordHandle::Input Input{InternalRefs, Data};
+
+  // Compute the storage kind, allocate it, and create the record.
+  TrieRecord::StorageKind SK = TrieRecord::StorageKind::Unknown;
+  FileOffset PoolOffset;
+  SmallString<256> Path;
+  std::optional<MappedTempFile> File;
+  std::optional<uint64_t> FileSize;
+  // Allocator for the standalone-file path: maps a temp file of the record's
+  // size and remembers it so the race check below can decide to keep/discard.
+  auto AllocStandaloneFile = [&](size_t Size) -> Expected<char *> {
+    getStandalonePath(TrieRecord::getStandaloneFilePrefix(
+                          TrieRecord::StorageKind::Standalone),
+                      *I, Path);
+    if (Error E = createTempFile(Path, Size).moveInto(File))
+      return std::move(E);
+    assert(File->size() == Size);
+    FileSize = Size;
+    SK = TrieRecord::StorageKind::Standalone;
+    return File->data();
+  };
+  // Main allocator: prefer the data pool for small records; fall back to a
+  // standalone file if the pool reports out-of-memory.
+  auto Alloc = [&](size_t Size) -> Expected<char *> {
+    if (Size <= TrieRecord::MaxEmbeddedSize) {
+      SK = TrieRecord::StorageKind::DataPool;
+      auto P = DataPool.allocate(Size);
+      if (LLVM_UNLIKELY(!P)) {
+        char *NewAlloc = nullptr;
+        auto NewE = handleErrors(
+            P.takeError(), [&](std::unique_ptr<StringError> E) -> Error {
+              if (E->convertToErrorCode() == std::errc::not_enough_memory)
+                return AllocStandaloneFile(Size).moveInto(NewAlloc);
+              return Error(std::move(E));
+            });
+        if (!NewE)
+          return NewAlloc;
+        return std::move(NewE);
+      }
+      PoolOffset = P->getOffset();
+      LLVM_DEBUG({
+        dbgs() << "pool-alloc addr=" << (void *)PoolOffset.get()
+               << " size=" << Size
+               << " end=" << (void *)(PoolOffset.get() + Size) << "\n";
+      });
+      return (*P)->data();
+    }
+    return AllocStandaloneFile(Size);
+  };
+
+  DataRecordHandle Record;
+  if (Error E =
+          DataRecordHandle::createWithError(Alloc, Input).moveInto(Record))
+    return E;
+  assert(Record.getData().end()[0] == 0 && "Expected null-termination");
+  assert(Record.getData() == Input.Data && "Expected initialization");
+  assert(SK != TrieRecord::StorageKind::Unknown);
+  assert(bool(File) != bool(PoolOffset) &&
+         "Expected either a mapped file or a pooled offset");
+
+  // Check for a race before calling MappedTempFile::keep().
+  //
+  // Then decide what to do with the file. Better to discard than overwrite if
+  // another thread/process has already added this.
+  TrieRecord::Data Existing = I->Ref.load();
+  {
+    TrieRecord::Data NewObject{SK, PoolOffset};
+    if (File) {
+      if (Existing.SK == TrieRecord::StorageKind::Unknown) {
+        // Keep the file!
+        if (Error E = File->keep(Path))
+          return E;
+      } else {
+        File.reset();
+      }
+    }
+
+    // If we didn't already see a racing/existing write, then try storing the
+    // new object. If that races, confirm that the new value has valid storage.
+    //
+    // TODO: Find a way to reuse the storage from the new-but-abandoned record
+    // handle.
+    if (Existing.SK == TrieRecord::StorageKind::Unknown) {
+      if (I->Ref.compare_exchange_strong(Existing, NewObject)) {
+        if (FileSize)
+          recordStandaloneSizeIncrease(*FileSize);
+        return Error::success();
+      }
+    }
+  }
+
+  // A racing writer must have published valid storage; Unknown here means
+  // the record is in an inconsistent state.
+  if (Existing.SK == TrieRecord::StorageKind::Unknown)
+    return createCorruptObjectError(getDigest(*I));
+
+  // Load existing object.
+  return Error::success();
+}
+
+/// Add \p SizeIncrease bytes to the persistent standalone-storage counter.
+void OnDiskGraphDB::recordStandaloneSizeIncrease(size_t SizeIncrease) {
+  standaloneStorageSize().fetch_add(SizeIncrease, std::memory_order_relaxed);
+}
+
+/// The standalone-storage byte counter lives in the data pool's user header,
+/// so it persists across processes; view it as an atomic.
+std::atomic<uint64_t> &OnDiskGraphDB::standaloneStorageSize() const {
+  MutableArrayRef<uint8_t> UserHeader = DataPool.getUserHeader();
+  assert(UserHeader.size() == sizeof(std::atomic<uint64_t>));
+  assert(isAddrAligned(Align(8), UserHeader.data()));
+  return *reinterpret_cast<std::atomic<uint64_t> *>(UserHeader.data());
+}
+
+/// Total bytes currently recorded for standalone object files.
+uint64_t OnDiskGraphDB::getStandaloneStorageSize() const {
+  return standaloneStorageSize().load(std::memory_order_relaxed);
+}
+
+/// Total on-disk footprint: index + data pool + standalone files.
+size_t OnDiskGraphDB::getStorageSize() const {
+  return Index.size() + DataPool.size() + getStandaloneStorageSize();
+}
+
+/// Percentage (0-100) of the more-utilized of the two fixed-capacity tables;
+/// whichever of index/data-pool fills first is the binding limit.
+unsigned OnDiskGraphDB::getHardStorageLimitUtilization() const {
+  unsigned IndexPercent = Index.size() * 100ULL / Index.capacity();
+  unsigned DataPercent = DataPool.size() * 100ULL / DataPool.capacity();
+  return std::max(IndexPercent, DataPercent);
+}
+
+/// Open (creating if needed) a graph database rooted at \p AbsPath. Sets up
+/// the index trie and data-pool files, sizing their mappings per platform
+/// limits / environment override, and wires in the optional upstream DB.
+Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open(
+    StringRef AbsPath, StringRef HashName, unsigned HashByteSize,
+    std::unique_ptr<OnDiskGraphDB> UpstreamDB, FaultInPolicy Policy) {
+  if (std::error_code EC = sys::fs::create_directories(AbsPath))
+    return createFileError(AbsPath, EC);
+
+  constexpr uint64_t MB = 1024ull * 1024ull;
+  constexpr uint64_t GB = 1024ull * 1024ull * 1024ull;
+
+  uint64_t MaxIndexSize = 12 * GB;
+  uint64_t MaxDataPoolSize = 24 * GB;
+
+  // Platforms/filesystems that can't handle large sparse mappings get
+  // smaller maximums.
+  if (useSmallMappingSize(AbsPath)) {
+    MaxIndexSize = 1 * GB;
+    MaxDataPoolSize = 2 * GB;
+  }
+
+  // Environment override for testing/tuning.
+  auto CustomSize = getOverriddenMaxMappingSize();
+  if (!CustomSize)
+    return CustomSize.takeError();
+  if (*CustomSize)
+    MaxIndexSize = MaxDataPoolSize = **CustomSize;
+
+  SmallString<256> IndexPath(AbsPath);
+  sys::path::append(IndexPath, IndexFilePrefix + CASFormatVersion);
+  std::optional<OnDiskTrieRawHashMap> Index;
+  if (Error E = OnDiskTrieRawHashMap::create(
+                    IndexPath, IndexTableName + "[" + HashName + "]",
+                    HashByteSize * CHAR_BIT,
+                    /*DataSize=*/sizeof(TrieRecord), MaxIndexSize,
+                    /*MinFileSize=*/MB)
+                    .moveInto(Index))
+    return std::move(E);
+
+  uint32_t UserHeaderSize = sizeof(std::atomic<uint64_t>);
+
+  SmallString<256> DataPoolPath(AbsPath);
+  sys::path::append(DataPoolPath, DataPoolFilePrefix + CASFormatVersion);
+  std::optional<OnDiskDataAllocator> DataPool;
+  // The fault-in policy is baked into the table name so a database created
+  // with one policy cannot be reopened with the other.
+  StringRef PolicyName =
+      Policy == FaultInPolicy::SingleNode ? "single" : "full";
+  if (Error E = OnDiskDataAllocator::create(
+                    DataPoolPath,
+                    DataPoolTableName + "[" + HashName + "]" + PolicyName,
+                    MaxDataPoolSize, /*MinFileSize=*/MB, UserHeaderSize,
+                    [](void *UserHeaderPtr) {
+                      // Zero-initialize the standalone-size counter on first
+                      // creation of the pool file.
+                      new (UserHeaderPtr) std::atomic<uint64_t>(0);
+                    })
+                    .moveInto(DataPool))
+    return std::move(E);
+  if (DataPool->getUserHeader().size() != UserHeaderSize)
+    return createStringError(llvm::errc::argument_out_of_domain,
+                             "unexpected user header in '" + DataPoolPath +
+                                 "'");
+
+  return std::unique_ptr<OnDiskGraphDB>(
+      new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool),
+                        std::move(UpstreamDB), Policy));
+}
+
+/// Private constructor; use OnDiskGraphDB::open(). Takes ownership of the
+/// already-opened index and data pool and allocates the in-memory map for
+/// mmapped standalone objects.
+OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index,
+                             OnDiskDataAllocator DataPool,
+                             std::unique_ptr<OnDiskGraphDB> UpstreamDB,
+                             FaultInPolicy Policy)
+    : Index(std::move(Index)), DataPool(std::move(DataPool)),
+      RootPath(RootPath.str()), UpstreamDB(std::move(UpstreamDB)),
+      FIPolicy(Policy) {
+  /// Lifetime for "big" objects not in DataPool.
+  ///
+  /// NOTE: Could use ThreadSafeTrieRawHashMap here. For now, doing something
+  /// simpler on the assumption there won't be much contention since most data
+  /// is not big. If there is contention, and we've already fixed ObjectProxy
+  /// object handles to be cheap enough to use consistently, the fix might be
+  /// to use better use of them rather than optimizing this map.
+  ///
+  /// FIXME: Figure out the right number of shards, if any.
+  StandaloneData = new StandaloneDataMapTy();
+}
+
+// StandaloneData is stored as void* in the header to avoid exposing the map
+// type; cast back to the concrete type before deleting.
+OnDiskGraphDB::~OnDiskGraphDB() {
+  delete static_cast<StandaloneDataMapTy *>(StandaloneData);
+}
+
+/// Import \p UpstreamNode and its entire (transitive) reference graph from
+/// the upstream database, storing it under \p PrimaryID. Children are stored
+/// before parents (iterative depth-first, post-order) so a crash mid-import
+/// never leaves a stored parent with missing children.
+Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID,
+                                    ObjectHandle UpstreamNode) {
+  // Copies the full CAS tree from upstream. Uses depth-first copying to protect
+  // against the process dying during importing and leaving the database with an
+  // incomplete tree. Note that if the upstream has missing nodes then the tree
+  // will be copied with missing nodes as well, it won't be considered an error.
+
+  struct UpstreamCursor {
+    ObjectHandle Node;
+    size_t RefsCount;
+    object_refs_iterator RefI;
+    object_refs_iterator RefE;
+  };
+  /// Keeps track of the state of visitation for current node and all of its
+  /// parents.
+  SmallVector<UpstreamCursor, 16> CursorStack;
+  /// Keeps track of the currently visited nodes as they are imported into
+  /// primary database, from current node and its parents. When a node is
+  /// entered for visitation it appends its own ID, then appends referenced IDs
+  /// as they get imported. When a node is fully imported it removes the
+  /// referenced IDs from the bottom of the stack which leaves its own ID at the
+  /// bottom, adding to the list of referenced IDs for the parent node.
+  SmallVector<ObjectID, 128> PrimaryNodesStack;
+
+  // Push a node for visitation; a nullopt handle means the node is already
+  // present locally and only its ID needs recording for the parent.
+  auto enqueueNode = [&](ObjectID PrimaryID, std::optional<ObjectHandle> Node) {
+    PrimaryNodesStack.push_back(PrimaryID);
+    if (!Node)
+      return;
+    auto Refs = UpstreamDB->getObjectRefs(*Node);
+    CursorStack.push_back({*Node,
+                           (size_t)std::distance(Refs.begin(), Refs.end()),
+                           Refs.begin(), Refs.end()});
+  };
+
+  enqueueNode(PrimaryID, UpstreamNode);
+
+  while (!CursorStack.empty()) {
+    UpstreamCursor &Cur = CursorStack.back();
+    if (Cur.RefI == Cur.RefE) {
+      // Copy the node data into the primary store.
+      // FIXME: Use hard-link or cloning if the file-system supports it and data
+      // is stored into a separate file.
+
+      // The bottom of \p PrimaryNodesStack contains the primary ID for the
+      // current node plus the list of imported referenced IDs.
+      assert(PrimaryNodesStack.size() >= Cur.RefsCount + 1);
+      ObjectID PrimaryID = *(PrimaryNodesStack.end() - Cur.RefsCount - 1);
+      auto PrimaryRefs = ArrayRef(PrimaryNodesStack)
+                             .slice(PrimaryNodesStack.size() - Cur.RefsCount);
+      auto Data = UpstreamDB->getObjectData(Cur.Node);
+      if (Error E = store(PrimaryID, PrimaryRefs, Data))
+        return E;
+      // Remove the current node and its IDs from the stack.
+      PrimaryNodesStack.truncate(PrimaryNodesStack.size() - Cur.RefsCount);
+      CursorStack.pop_back();
+      continue;
+    }
+
+    // Advance to the next child of the current node.
+    ObjectID UpstreamID = *(Cur.RefI++);
+    auto PrimaryID = getReference(UpstreamDB->getDigest(UpstreamID));
+    if (LLVM_UNLIKELY(!PrimaryID))
+      return PrimaryID.takeError();
+    if (containsObject(*PrimaryID, /*CheckUpstream=*/false)) {
+      // This \p ObjectID already exists in the primary. Either it was imported
+      // via \p importFullTree or the client created it, in which case the
+      // client takes responsibility for how it was formed.
+      enqueueNode(*PrimaryID, std::nullopt);
+      continue;
+    }
+    Expected<std::optional<ObjectHandle>> UpstreamNode =
+        UpstreamDB->load(UpstreamID);
+    if (!UpstreamNode)
+      return UpstreamNode.takeError();
+    enqueueNode(*PrimaryID, *UpstreamNode);
+  }
+
+  // Only the root's own ID should remain once traversal completes.
+  assert(PrimaryNodesStack.size() == 1);
+  assert(PrimaryNodesStack.front() == PrimaryID);
+  return Error::success();
+}
+
+/// Import just \p UpstreamNode (data plus translated reference IDs) from the
+/// upstream database under \p PrimaryID; referenced objects are NOT copied.
+Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID,
+                                      ObjectHandle UpstreamNode) {
+  // Copies only a single node, it doesn't copy the referenced nodes.
+
+  // Copy the node data into the primary store.
+  // FIXME: Use hard-link or cloning if the file-system supports it and data is
+  // stored into a separate file.
+
+  auto Data = UpstreamDB->getObjectData(UpstreamNode);
+  auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode);
+  // Translate each upstream reference into a primary-DB ObjectID (creating
+  // index entries for digests not yet seen locally).
+  SmallVector<ObjectID, 64> Refs;
+  Refs.reserve(std::distance(UpstreamRefs.begin(), UpstreamRefs.end()));
+  for (ObjectID UpstreamRef : UpstreamRefs) {
+    auto Ref = getReference(UpstreamDB->getDigest(UpstreamRef));
+    if (LLVM_UNLIKELY(!Ref))
+      return Ref.takeError();
+    Refs.push_back(*Ref);
+  }
+
+  return store(PrimaryID, Refs, Data);
+}
+
+/// Pull the object for \p PrimaryID from the upstream database according to
+/// the configured fault-in policy (single node vs. full tree), then load it
+/// from the primary database. Returns std::nullopt if upstream lacks it too.
+Expected<std::optional<ObjectHandle>>
+OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) {
+  assert(UpstreamDB);
+
+  // Resolve the same digest in the upstream database's ID space.
+  auto UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID));
+  if (LLVM_UNLIKELY(!UpstreamID))
+    return UpstreamID.takeError();
+
+  Expected<std::optional<ObjectHandle>> UpstreamNode =
+      UpstreamDB->load(*UpstreamID);
+  if (!UpstreamNode)
+    return UpstreamNode.takeError();
+  if (!*UpstreamNode)
+    return std::nullopt;
+
+  if (Error E = FIPolicy == FaultInPolicy::SingleNode
+                    ? importSingleNode(PrimaryID, **UpstreamNode)
+                    : importFullTree(PrimaryID, **UpstreamNode))
+    return std::move(E);
+  // Re-load locally so the caller gets a primary-DB handle.
+  return load(PrimaryID);
+}

diff  --git a/llvm/lib/CAS/OnDiskKeyValueDB.cpp b/llvm/lib/CAS/OnDiskKeyValueDB.cpp
new file mode 100644
index 0000000000000..21860717da3bf
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskKeyValueDB.cpp
@@ -0,0 +1,113 @@
+//===- OnDiskKeyValueDB.cpp -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements OnDiskKeyValueDB, an ondisk key value database.
+///
+/// The KeyValue database file is named `actions.<version>` inside the CAS
+/// directory. The database stores a mapping between a fixed-sized key and a
+/// fixed-sized value, where the size of key and value can be configured when
+/// opening the database.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "OnDiskCommon.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Path.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+static constexpr StringLiteral ActionCacheFile = "actions.";
+
+/// Associate \p Value with \p Key. The database is write-once per key: if the
+/// key already has a value, the existing bytes are kept and returned (callers
+/// can compare the result against \p Value to detect a prior entry).
+Expected<ArrayRef<char>> OnDiskKeyValueDB::put(ArrayRef<uint8_t> Key,
+                                               ArrayRef<char> Value) {
+  if (LLVM_UNLIKELY(Value.size() != ValueSize))
+    return createStringError(errc::invalid_argument,
+                             "expected value size of " + itostr(ValueSize) +
+                                 ", got: " + itostr(Value.size()));
+  // The lazy callback only runs if the key is not already present.
+  auto ActionP = Cache.insertLazy(
+      Key, [&](FileOffset TentativeOffset,
+               OnDiskTrieRawHashMap::ValueProxy TentativeValue) {
+        assert(TentativeValue.Data.size() == ValueSize);
+        llvm::copy(Value, TentativeValue.Data.data());
+      });
+  if (LLVM_UNLIKELY(!ActionP))
+    return ActionP.takeError();
+  return (*ActionP)->Data;
+}
+
+/// Look up \p Key; returns std::nullopt when no value has been stored for it.
+Expected<std::optional<ArrayRef<char>>>
+OnDiskKeyValueDB::get(ArrayRef<uint8_t> Key) {
+  // Check the result cache.
+  OnDiskTrieRawHashMap::ConstOnDiskPtr ActionP = Cache.find(Key);
+  if (!ActionP)
+    return std::nullopt;
+  assert(isAddrAligned(Align(8), ActionP->Data.data()));
+  return ActionP->Data;
+}
+
+/// Open (creating directories/files as needed) a key-value database rooted at
+/// \p Path. The backing file is named "actions.<version>"; key and value
+/// sizes are fixed for the lifetime of the database.
+Expected<std::unique_ptr<OnDiskKeyValueDB>>
+OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize,
+                       StringRef ValueName, size_t ValueSize) {
+  if (std::error_code EC = sys::fs::create_directories(Path))
+    return createFileError(Path, EC);
+
+  SmallString<256> CachePath(Path);
+  sys::path::append(CachePath, ActionCacheFile + CASFormatVersion);
+  constexpr uint64_t MB = 1024ull * 1024ull;
+  constexpr uint64_t GB = 1024ull * 1024ull * 1024ull;
+
+  // Default to a 1GB mapping unless overridden (e.g. smaller for tests).
+  uint64_t MaxFileSize = GB;
+  auto CustomSize = getOverriddenMaxMappingSize();
+  if (!CustomSize)
+    return CustomSize.takeError();
+  if (*CustomSize)
+    MaxFileSize = **CustomSize;
+
+  // KeySize is in bytes; the trie takes the number of key bits.
+  std::optional<OnDiskTrieRawHashMap> ActionCache;
+  if (Error E = OnDiskTrieRawHashMap::create(
+                    CachePath,
+                    "llvm.actioncache[" + HashName + "->" + ValueName + "]",
+                    KeySize * 8,
+                    /*DataSize=*/ValueSize, MaxFileSize, /*MinFileSize=*/MB)
+                    .moveInto(ActionCache))
+    return std::move(E);
+
+  return std::unique_ptr<OnDiskKeyValueDB>(
+      new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache)));
+}
+
+/// Walk every record and check its size and alignment; when \p CheckValue is
+/// provided, it is also invoked per record for content-specific validation.
+Error OnDiskKeyValueDB::validate(CheckValueT CheckValue) const {
+  return Cache.validate(
+      [&](FileOffset Offset,
+          OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error {
+        auto formatError = [&](Twine Msg) {
+          // Use the full 64-bit offset: casting to `unsigned` would truncate
+          // diagnostics for offsets past 4GiB (mapping size is configurable).
+          return createStringError(
+              llvm::errc::illegal_byte_sequence,
+              "bad cache value at 0x" +
+                  utohexstr(static_cast<uint64_t>(Offset.get()),
+                            /*LowerCase=*/true) +
+                  ": " + Msg.str());
+        };
+
+        if (Record.Data.size() != ValueSize)
+          return formatError("wrong cache value size");
+        if (!isAddrAligned(Align(8), Record.Data.data()))
+          return formatError("wrong cache value alignment");
+        if (CheckValue)
+          return CheckValue(Offset, Record.Data);
+        return Error::success();
+      });
+}

diff  --git a/llvm/unittests/CAS/CASTestConfig.cpp b/llvm/unittests/CAS/CASTestConfig.cpp
index 29e2db48db5c0..91d0970367ac3 100644
--- a/llvm/unittests/CAS/CASTestConfig.cpp
+++ b/llvm/unittests/CAS/CASTestConfig.cpp
@@ -19,3 +19,17 @@ static CASTestingEnv createInMemory(int I) {
 
 INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest,
                          ::testing::Values(createInMemory));
+
+#if LLVM_ENABLE_ONDISK_CAS
+namespace llvm::cas::ondisk {
+// Defined in the CAS library; overrides the default database mapping size.
+extern void setMaxMappingSize(uint64_t Size);
+} // namespace llvm::cas::ondisk
+
+// Cap the on-disk CAS mapping at 100MB for tests. std::call_once guarantees
+// the override is applied at most once per test process.
+void setMaxOnDiskCASMappingSize() {
+  static std::once_flag Flag;
+  std::call_once(
+      Flag, [] { llvm::cas::ondisk::setMaxMappingSize(100 * 1024 * 1024); });
+}
+#else
+// On-disk CAS disabled: nothing to configure.
+void setMaxOnDiskCASMappingSize() {}
+#endif /* LLVM_ENABLE_ONDISK_CAS */

diff  --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h
index 8093a0b0864f9..c08968b95b9cc 100644
--- a/llvm/unittests/CAS/CASTestConfig.h
+++ b/llvm/unittests/CAS/CASTestConfig.h
@@ -18,6 +18,17 @@ struct CASTestingEnv {
   std::unique_ptr<llvm::cas::ActionCache> Cache;
 };
 
+void setMaxOnDiskCASMappingSize();
+
+// Test fixture for on-disk database tests.
+class OnDiskCASTest : public ::testing::Test {
+protected:
+  void SetUp() override {
+    // Use a smaller database size for testing to conserve disk space.
+    setMaxOnDiskCASMappingSize();
+  }
+};
+
 class CASTest
     : public testing::TestWithParam<std::function<CASTestingEnv(int)>> {
 protected:

diff  --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt
index ee40e6c9879a1..da469f7fccb5a 100644
--- a/llvm/unittests/CAS/CMakeLists.txt
+++ b/llvm/unittests/CAS/CMakeLists.txt
@@ -1,3 +1,19 @@
+set(ONDISK_CAS_TEST_SOURCES
+  OnDiskGraphDBTest.cpp
+  OnDiskDataAllocatorTest.cpp
+  OnDiskKeyValueDBTest.cpp
+  OnDiskTrieRawHashMapTest.cpp
+  ProgramTest.cpp
+  )
+
+set(LLVM_OPTIONAL_SOURCES
+  ${ONDISK_CAS_TEST_SOURCES}
+  )
+
+if (NOT LLVM_ENABLE_ONDISK_CAS)
+  unset(ONDISK_CAS_TEST_SOURCES)
+endif()
+
 set(LLVM_LINK_COMPONENTS
   Support
   CAS
@@ -8,9 +24,8 @@ add_llvm_unittest(CASTests
   ActionCacheTest.cpp
   CASTestConfig.cpp
   ObjectStoreTest.cpp
-  OnDiskDataAllocatorTest.cpp
-  OnDiskTrieRawHashMapTest.cpp
-  ProgramTest.cpp
+
+  ${ONDISK_CAS_TEST_SOURCES}
   )
 
 target_link_libraries(CASTests PRIVATE LLVMTestingSupport)

diff  --git a/llvm/unittests/CAS/OnDiskCommonUtils.h b/llvm/unittests/CAS/OnDiskCommonUtils.h
new file mode 100644
index 0000000000000..57c8c228867fd
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskCommonUtils.h
@@ -0,0 +1,76 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Helper functions to test OnDiskCASDatabases.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/BuiltinObjectHasher.h"
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "llvm/Support/BLAKE3.h"
+#include "llvm/Testing/Support/Error.h"
+
+namespace llvm::unittest::cas {
+
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+using HasherT = BLAKE3;
+using HashType = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>()));
+using ValueType = std::array<char, 20>;
+
+/// Hash \p Data together with the hashes of its references, using the same
+/// scheme as BuiltinObjectHasher.
+inline HashType digest(StringRef Data, ArrayRef<ArrayRef<uint8_t>> RefHashes) {
+  return BuiltinObjectHasher<HasherT>::hashObject(
+      RefHashes, arrayRefFromStringRef<char>(Data));
+}
+
+/// Compute the ObjectID that \p Data with \p Refs has in \p DB, without
+/// storing the object itself.
+inline ObjectID digest(OnDiskGraphDB &DB, StringRef Data,
+                       ArrayRef<ObjectID> Refs) {
+  SmallVector<ArrayRef<uint8_t>, 8> RefHashes;
+  for (ObjectID Ref : Refs)
+    RefHashes.push_back(DB.getDigest(Ref));
+  HashType Digest = digest(Data, RefHashes);
+  std::optional<ObjectID> ID;
+  EXPECT_THAT_ERROR(DB.getReference(Digest).moveInto(ID), Succeeded());
+  return *ID;
+}
+
+/// Hash raw \p Data with no references.
+inline HashType digest(StringRef Data) {
+  return HasherT::hash(arrayRefFromStringRef(Data));
+}
+
+/// Build a fixed-size value from \p S, truncating to sizeof(ValueType).
+/// Zero-initialize so that short inputs (the common case in these tests)
+/// produce a deterministic tail instead of indeterminate bytes, which the
+/// tests would otherwise store and compare.
+inline ValueType valueFromString(StringRef S) {
+  ValueType Val = {};
+  llvm::copy(S.substr(0, sizeof(Val)), Val.data());
+  return Val;
+}
+
+/// Store \p Data with \p Refs into \p DB and return the resulting ObjectID.
+inline Expected<ObjectID> store(OnDiskGraphDB &DB, StringRef Data,
+                                ArrayRef<ObjectID> Refs) {
+  ObjectID ID = digest(DB, Data, Refs);
+  if (Error E = DB.store(ID, Refs, arrayRefFromStringRef<char>(Data)))
+    return std::move(E);
+  return ID;
+}
+
+/// Recursively print the object graph rooted at \p ID to \p OS, indenting two
+/// spaces per level. Objects missing from the DB are silently skipped.
+inline Error printTree(OnDiskGraphDB &DB, ObjectID ID, raw_ostream &OS,
+                       unsigned Indent = 0) {
+  std::optional<ondisk::ObjectHandle> Obj;
+  if (Error E = DB.load(ID).moveInto(Obj))
+    return E;
+  if (!Obj)
+    return Error::success();
+  OS.indent(Indent) << toStringRef(DB.getObjectData(*Obj)) << '\n';
+  for (ObjectID Ref : DB.getObjectRefs(*Obj)) {
+    if (Error E = printTree(DB, Ref, OS, Indent + 2))
+      return E;
+  }
+  return Error::success();
+}
+
+} // namespace llvm::unittest::cas

diff  --git a/llvm/unittests/CAS/OnDiskGraphDBTest.cpp b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp
new file mode 100644
index 0000000000000..58f5dcc625a09
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp
@@ -0,0 +1,310 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CASTestConfig.h"
+#include "OnDiskCommonUtils.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+using namespace llvm::unittest::cas;
+
+// Basic round-trip, lookup, and storage-size accounting for OnDiskGraphDB.
+TEST_F(OnDiskCASTest, OnDiskGraphDBTest) {
+  unittest::TempDir Temp("ondiskcas", /*Unique=*/true);
+  std::unique_ptr<OnDiskGraphDB> DB;
+  ASSERT_THAT_ERROR(
+      OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType)).moveInto(DB),
+      Succeeded());
+
+  // Local shorthands that bind the shared helpers to this DB instance.
+  auto digest = [&DB](StringRef Data, ArrayRef<ObjectID> Refs) -> ObjectID {
+    return ::digest(*DB, Data, Refs);
+  };
+
+  auto store = [&](StringRef Data,
+                   ArrayRef<ObjectID> Refs) -> Expected<ObjectID> {
+    return ::store(*DB, Data, Refs);
+  };
+
+  // Round-trip a small object.
+  std::optional<ObjectID> ID1;
+  ASSERT_THAT_ERROR(store("hello", {}).moveInto(ID1), Succeeded());
+
+  std::optional<ondisk::ObjectHandle> Obj1;
+  ASSERT_THAT_ERROR(DB->load(*ID1).moveInto(Obj1), Succeeded());
+  ASSERT_TRUE(Obj1.has_value());
+  EXPECT_EQ(toStringRef(DB->getObjectData(*Obj1)), "hello");
+
+  // Digest-to-reference lookup must return the same ID.
+  ArrayRef<uint8_t> Digest1 = DB->getDigest(*ID1);
+  std::optional<ObjectID> ID2;
+  ASSERT_THAT_ERROR(DB->getReference(Digest1).moveInto(ID2), Succeeded());
+  EXPECT_EQ(ID1, ID2);
+
+  // A digested-but-not-stored object is not contained and loads as nullopt.
+  ObjectID ID3 = digest("world", {});
+  EXPECT_FALSE(DB->containsObject(ID3));
+  std::optional<ondisk::ObjectHandle> Obj2;
+  ASSERT_THAT_ERROR(DB->load(ID3).moveInto(Obj2), Succeeded());
+  EXPECT_FALSE(Obj2.has_value());
+
+  ASSERT_THAT_ERROR(DB->store(ID3, {}, arrayRefFromStringRef<char>("world")),
+                    Succeeded());
+  EXPECT_TRUE(DB->containsObject(ID3));
+  ASSERT_THAT_ERROR(DB->load(ID3).moveInto(Obj2), Succeeded());
+  ASSERT_TRUE(Obj2.has_value());
+  EXPECT_EQ(toStringRef(DB->getObjectData(*Obj2)), "world");
+
+  size_t LargeDataSize = 256LL * 1024LL; // 256K.
+  // The precise size number is not important, we mainly check that the large
+  // object will be properly accounted for.
+  EXPECT_TRUE(DB->getStorageSize() > 10 &&
+              DB->getStorageSize() < LargeDataSize);
+
+  SmallString<16> Buffer;
+  Buffer.resize(LargeDataSize);
+  ASSERT_THAT_ERROR(store(Buffer, {}).moveInto(ID1), Succeeded());
+  size_t StorageSize = DB->getStorageSize();
+  EXPECT_TRUE(StorageSize > LargeDataSize);
+
+  // Close & re-open the DB and check that it reports the same storage size.
+  DB.reset();
+  ASSERT_THAT_ERROR(
+      OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType)).moveInto(DB),
+      Succeeded());
+  EXPECT_EQ(DB->getStorageSize(), StorageSize);
+}
+
+// Fault-in with the SingleNode policy: loading an object copies only that
+// node (not its referenced nodes) from the upstream into the primary store.
+TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInSingleNode) {
+  unittest::TempDir TempUpstream("ondiskcas-upstream", /*Unique=*/true);
+  std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+  ASSERT_THAT_ERROR(
+      OnDiskGraphDB::open(TempUpstream.path(), "blake3", sizeof(HashType))
+          .moveInto(UpstreamDB),
+      Succeeded());
+  // Populate the upstream with "world" -> {"hello", "another"}.
+  {
+    std::optional<ObjectID> ID1;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "hello", {}).moveInto(ID1),
+                      Succeeded());
+    std::optional<ObjectID> ID2;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "another", {}).moveInto(ID2),
+                      Succeeded());
+    std::optional<ObjectID> ID3;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "world", {*ID1, *ID2}).moveInto(ID3),
+                      Succeeded());
+  }
+
+  // Open the primary chained to the upstream with single-node fault-in.
+  unittest::TempDir Temp("ondiskcas", /*Unique=*/true);
+  std::unique_ptr<OnDiskGraphDB> DB;
+  ASSERT_THAT_ERROR(
+      OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType),
+                          std::move(UpstreamDB),
+                          OnDiskGraphDB::FaultInPolicy::SingleNode)
+          .moveInto(DB),
+      Succeeded());
+
+  ObjectID ID1 = digest(*DB, "hello", {});
+  ObjectID ID2 = digest(*DB, "another", {});
+  ObjectID ID3 = digest(*DB, "world", {ID1, ID2});
+  // Note: "world" with no refs hashes differently from ID3 and was never
+  // stored anywhere.
+  ObjectID ID4 = digest(*DB, "world", {});
+
+  // containsObject consults the upstream for objects not yet imported.
+  EXPECT_TRUE(DB->containsObject(ID1));
+  EXPECT_TRUE(DB->containsObject(ID2));
+  EXPECT_TRUE(DB->containsObject(ID3));
+  EXPECT_FALSE(DB->containsObject(ID4));
+
+  EXPECT_TRUE(DB->getExistingReference(digest("hello", {})).has_value());
+  EXPECT_TRUE(DB->getExistingReference(DB->getDigest(ID3)).has_value());
+  EXPECT_FALSE(DB->getExistingReference(digest("world", {})).has_value());
+
+  {
+    std::optional<ondisk::ObjectHandle> Obj;
+    ASSERT_THAT_ERROR(DB->load(ID1).moveInto(Obj), Succeeded());
+    ASSERT_TRUE(Obj.has_value());
+    EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "hello");
+    auto Refs = DB->getObjectRefs(*Obj);
+    EXPECT_TRUE(Refs.empty());
+  }
+  {
+    std::optional<ondisk::ObjectHandle> Obj;
+    ASSERT_THAT_ERROR(DB->load(ID3).moveInto(Obj), Succeeded());
+    ASSERT_TRUE(Obj.has_value());
+    EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "world");
+    auto Refs = DB->getObjectRefs(*Obj);
+    ASSERT_EQ(std::distance(Refs.begin(), Refs.end()), 2);
+    EXPECT_EQ(Refs.begin()[0], ID1);
+    EXPECT_EQ(Refs.begin()[1], ID2);
+  }
+  {
+    std::optional<ondisk::ObjectHandle> Obj;
+    ASSERT_THAT_ERROR(DB->load(ID4).moveInto(Obj), Succeeded());
+    EXPECT_FALSE(Obj.has_value());
+  }
+
+  // Re-open the primary without chaining, to verify the data were copied from
+  // the upstream. Only the loaded nodes ("hello", "world") were imported;
+  // "another" was referenced but never loaded, so it was not copied.
+  ASSERT_THAT_ERROR(
+      OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType),
+                          /*UpstreamDB=*/nullptr,
+                          OnDiskGraphDB::FaultInPolicy::SingleNode)
+          .moveInto(DB),
+      Succeeded());
+  ID1 = digest(*DB, "hello", {});
+  ID2 = digest(*DB, "another", {});
+  ID3 = digest(*DB, "world", {ID1, ID2});
+  EXPECT_TRUE(DB->containsObject(ID1));
+  EXPECT_FALSE(DB->containsObject(ID2));
+  EXPECT_TRUE(DB->containsObject(ID3));
+  {
+    std::optional<ondisk::ObjectHandle> Obj;
+    ASSERT_THAT_ERROR(DB->load(ID1).moveInto(Obj), Succeeded());
+    ASSERT_TRUE(Obj.has_value());
+    EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "hello");
+    auto Refs = DB->getObjectRefs(*Obj);
+    EXPECT_TRUE(Refs.empty());
+  }
+}
+
+// Fault-in with the FullTree policy: loading the root copies its entire
+// transitive closure from the upstream into the primary store.
+TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInFullTree) {
+  unittest::TempDir TempUpstream("ondiskcas-upstream", /*Unique=*/true);
+  std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+  ASSERT_THAT_ERROR(
+      OnDiskGraphDB::open(TempUpstream.path(), "blake3", sizeof(HashType))
+          .moveInto(UpstreamDB),
+      Succeeded());
+  // Build a DAG in the upstream; node "12" is shared by both "1" and "2".
+  HashType RootHash;
+  {
+    std::optional<ObjectID> ID11;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "11", {}).moveInto(ID11), Succeeded());
+    std::optional<ObjectID> ID121;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "121", {}).moveInto(ID121),
+                      Succeeded());
+    std::optional<ObjectID> ID12;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "12", {*ID121}).moveInto(ID12),
+                      Succeeded());
+    std::optional<ObjectID> ID1;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "1", {*ID11, *ID12}).moveInto(ID1),
+                      Succeeded());
+    std::optional<ObjectID> ID21;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "21", {}).moveInto(ID21), Succeeded());
+    std::optional<ObjectID> ID22;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "22", {}).moveInto(ID22), Succeeded());
+    std::optional<ObjectID> ID2;
+    ASSERT_THAT_ERROR(
+        store(*UpstreamDB, "2", {*ID12, *ID21, *ID22}).moveInto(ID2),
+        Succeeded());
+    std::optional<ObjectID> IDRoot;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "root", {*ID1, *ID2}).moveInto(IDRoot),
+                      Succeeded());
+    // Keep the root digest so it can be looked up via the primary DB below.
+    ArrayRef<uint8_t> Digest = UpstreamDB->getDigest(*IDRoot);
+    ASSERT_EQ(Digest.size(), RootHash.size());
+    llvm::copy(Digest, RootHash.data());
+  }
+
+  unittest::TempDir Temp("ondiskcas", /*Unique=*/true);
+  std::unique_ptr<OnDiskGraphDB> DB;
+  ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType),
+                                        std::move(UpstreamDB),
+                                        OnDiskGraphDB::FaultInPolicy::FullTree)
+                        .moveInto(DB),
+                    Succeeded());
+
+  // A single load of the root faults in the whole tree.
+  {
+    std::optional<ObjectID> IDRoot;
+    ASSERT_THAT_ERROR(DB->getReference(RootHash).moveInto(IDRoot), Succeeded());
+    std::optional<ondisk::ObjectHandle> Obj;
+    ASSERT_THAT_ERROR(DB->load(*IDRoot).moveInto(Obj), Succeeded());
+    ASSERT_TRUE(Obj.has_value());
+    EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "root");
+    auto Refs = DB->getObjectRefs(*Obj);
+    ASSERT_EQ(std::distance(Refs.begin(), Refs.end()), 2);
+  }
+
+  // Re-open the primary without chaining, to verify the data were copied from
+  // the upstream.
+  ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType),
+                                        /*UpstreamDB=*/nullptr,
+                                        OnDiskGraphDB::FaultInPolicy::FullTree)
+                        .moveInto(DB),
+                    Succeeded());
+
+  // The shared "12"/"121" subtree is printed under both "1" and "2".
+  std::optional<ObjectID> IDRoot;
+  ASSERT_THAT_ERROR(DB->getReference(RootHash).moveInto(IDRoot), Succeeded());
+  std::string PrintedTree;
+  raw_string_ostream OS(PrintedTree);
+  ASSERT_THAT_ERROR(printTree(*DB, *IDRoot, OS), Succeeded());
+  StringRef Expected = R"(root
+  1
+    11
+    12
+      121
+  2
+    12
+      121
+    21
+    22
+)";
+  EXPECT_EQ(PrintedTree, Expected);
+}
+
+// Re-opening a database with a different fault-in policy than it was created
+// with must fail, in either direction.
+TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInPolicyConflict) {
+  auto tryFaultInPolicyConflict = [](OnDiskGraphDB::FaultInPolicy Policy1,
+                                     OnDiskGraphDB::FaultInPolicy Policy2) {
+    unittest::TempDir TempUpstream("ondiskcas-upstream", /*Unique=*/true);
+    std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+    ASSERT_THAT_ERROR(
+        OnDiskGraphDB::open(TempUpstream.path(), "blake3", sizeof(HashType))
+            .moveInto(UpstreamDB),
+        Succeeded());
+
+    unittest::TempDir Temp("ondiskcas", /*Unique=*/true);
+    std::unique_ptr<OnDiskGraphDB> DB;
+    ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3",
+                                          sizeof(HashType),
+                                          std::move(UpstreamDB), Policy1)
+                          .moveInto(DB),
+                      Succeeded());
+    DB.reset();
+    // NOTE(review): UpstreamDB was moved from above, so this second open
+    // receives a null upstream; the conflict is still expected to be detected
+    // from the state recorded on disk — confirm that is the intent.
+    ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3",
+                                          sizeof(HashType),
+                                          std::move(UpstreamDB), Policy2)
+                          .moveInto(DB),
+                      Failed());
+  };
+  // Open as 'single', then as 'full'.
+  tryFaultInPolicyConflict(OnDiskGraphDB::FaultInPolicy::SingleNode,
+                           OnDiskGraphDB::FaultInPolicy::FullTree);
+  // Open as 'full', then as 'single'.
+  tryFaultInPolicyConflict(OnDiskGraphDB::FaultInPolicy::FullTree,
+                           OnDiskGraphDB::FaultInPolicy::SingleNode);
+}
+
+#if defined(EXPENSIVE_CHECKS)
+// Fill the database until it approaches the configured mapping limit and
+// check that utilization is reported accordingly. Guarded by
+// EXPENSIVE_CHECKS since it writes ~256K objects.
+TEST_F(OnDiskCASTest, OnDiskGraphDBSpaceLimit) {
+  // NOTE(review): the fixture's SetUp already calls this; the explicit call
+  // here appears redundant (it is idempotent via std::call_once).
+  setMaxOnDiskCASMappingSize();
+  unittest::TempDir Temp("ondiskcas", /*Unique=*/true);
+  std::unique_ptr<OnDiskGraphDB> DB;
+  ASSERT_THAT_ERROR(
+      OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType)).moveInto(DB),
+      Succeeded());
+
+  std::optional<ObjectID> ID;
+  std::string Data(500, '0');
+  // Each stored object references the previous one, forming a chain.
+  auto storeSmallObject = [&]() {
+    SmallVector<ObjectID, 1> Refs;
+    if (ID)
+      Refs.push_back(*ID);
+    ASSERT_THAT_ERROR(store(*DB, Data, Refs).moveInto(ID), Succeeded());
+  };
+
+  // Insert enough small elements to overflow the data pool.
+  for (unsigned I = 0; I < 1024 * 256; ++I)
+    storeSmallObject();
+
+  EXPECT_GE(DB->getHardStorageLimitUtilization(), 99U);
+}
+#endif

diff  --git a/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp b/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp
new file mode 100644
index 0000000000000..89c03b890a488
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp
@@ -0,0 +1,77 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "CASTestConfig.h"
+#include "OnDiskCommonUtils.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+using namespace llvm::unittest::cas;
+
+// Exercises put/get write-once semantics, validation, and storage growth of
+// OnDiskKeyValueDB.
+TEST_F(OnDiskCASTest, OnDiskKeyValueDBTest) {
+  unittest::TempDir Temp("ondiskkv", /*Unique=*/true);
+  std::unique_ptr<OnDiskKeyValueDB> DB;
+  ASSERT_THAT_ERROR(OnDiskKeyValueDB::open(Temp.path(), "blake3",
+                                           sizeof(HashType), "test",
+                                           sizeof(ValueType))
+                        .moveInto(DB),
+                    Succeeded());
+  // Missing key returns nullopt, not an error.
+  {
+    std::optional<ArrayRef<char>> Val;
+    ASSERT_THAT_ERROR(DB->get(digest("hello")).moveInto(Val), Succeeded());
+    EXPECT_FALSE(Val.has_value());
+  }
+
+  // First put stores the value; a second put with a different value keeps and
+  // returns the original (write-once per key).
+  ValueType ValW = valueFromString("world");
+  ArrayRef<char> Val;
+  ASSERT_THAT_ERROR(DB->put(digest("hello"), ValW).moveInto(Val), Succeeded());
+  EXPECT_EQ(Val, ArrayRef(ValW));
+  ASSERT_THAT_ERROR(
+      DB->put(digest("hello"), valueFromString("other")).moveInto(Val),
+      Succeeded());
+  EXPECT_EQ(Val, ArrayRef(ValW));
+
+  {
+    std::optional<ArrayRef<char>> Val;
+    ASSERT_THAT_ERROR(DB->get(digest("hello")).moveInto(Val), Succeeded());
+    EXPECT_TRUE(Val.has_value());
+    EXPECT_EQ(*Val, ArrayRef(ValW));
+  }
+
+  // Validate
+  {
+    auto ValidateFunc = [](FileOffset Offset, ArrayRef<char> Data) -> Error {
+      EXPECT_EQ(Data.size(), sizeof(ValueType));
+      return Error::success();
+    };
+    ASSERT_THAT_ERROR(DB->validate(ValidateFunc), Succeeded());
+  }
+
+  // Size
+  {
+    size_t InitSize = DB->getStorageSize();
+    unsigned InitPercent = DB->getHardStorageLimitUtilization();
+
+    // Insert a lot of entries.
+    for (unsigned I = 0; I < 1024 * 100; ++I) {
+      std::string Index = Twine(I).str();
+      ArrayRef<char> Val;
+      ASSERT_THAT_ERROR(
+          DB->put(digest(Index), valueFromString(Index)).moveInto(Val),
+          Succeeded());
+    }
+
+    // Storage size and utilization must both grow.
+    EXPECT_GT(DB->getStorageSize(), InitSize);
+    EXPECT_GT(DB->getHardStorageLimitUtilization(), InitPercent);
+  }
+}


        


More information about the llvm-commits mailing list