[llvm] [CAS] LLVMCAS implementation (PR #68448)
Steven Wu via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 9 14:30:05 PDT 2024
https://github.com/cachemeifyoucan updated https://github.com/llvm/llvm-project/pull/68448
>From f5b8ff65ab25398cc73ca8bf48223542a6215f8b Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu at apple.com>
Date: Thu, 5 Oct 2023 13:02:40 -0700
Subject: [PATCH 01/11] [ADT] Add TrieRawHashMap
Implement TrieRawHashMap which stores objects into a Trie based on the
hash of the object.
User needs to supply the hashing function and guarantees the uniqueness of
the hash for the objects to be inserted. Hash collision is not supported.
llvm/include/llvm/ADT/TrieRawHashMap.h | 398 ++++++++++++++++++
llvm/lib/Support/CMakeLists.txt | 1 +
llvm/lib/Support/TrieHashIndexGenerator.h | 89 ++++
llvm/lib/Support/TrieRawHashMap.cpp | 483 ++++++++++++++++++++++
llvm/unittests/ADT/CMakeLists.txt | 1 +
llvm/unittests/ADT/TrieRawHashMapTest.cpp | 342 +++++++++++++++
6 files changed, 1314 insertions(+)
create mode 100644 llvm/include/llvm/ADT/TrieRawHashMap.h
create mode 100644 llvm/lib/Support/TrieHashIndexGenerator.h
create mode 100644 llvm/lib/Support/TrieRawHashMap.cpp
create mode 100644 llvm/unittests/ADT/TrieRawHashMapTest.cpp
diff --git a/llvm/include/llvm/ADT/TrieRawHashMap.h b/llvm/include/llvm/ADT/TrieRawHashMap.h
new file mode 100644
index 00000000000000..baa08e214ce6fd
--- /dev/null
+++ b/llvm/include/llvm/ADT/TrieRawHashMap.h
@@ -0,0 +1,398 @@
+//===- TrieRawHashMap.h -----------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
+#include <atomic>
+#include <optional>
+namespace llvm {
+class raw_ostream;
+/// TrieRawHashMap - is a lock-free thread-safe trie that can be used to
+/// store/index data based on a hash value. It can be customized to work with
+/// any hash algorithm or store any data.
+/// Data structure:
+/// Data node stored in the Trie contains both hash and data:
+/// struct {
+/// HashT Hash;
+/// DataT Data;
+/// };
+/// Data is stored/indexed via a prefix tree, where each node in the tree can be
+/// either the root, a sub-trie or a data node. Assuming a 4-bit hash and two
+/// data objects {0001, A} and {0100, B}, it can be stored in a trie
+/// (assuming Root has 2 bits, SubTrie has 1 bit):
+/// +--------+
+/// |Root[00]| -> {0001, A}
+/// | [01]| -> {0100, B}
+/// | [10]| (empty)
+/// | [11]| (empty)
+/// +--------+
+/// Inserting a new object {0010, C} will result in:
+/// +--------+ +----------+
+/// |Root[00]| -> |SubTrie[0]| -> {0001, A}
+/// | | | [1]| -> {0010, C}
+/// | | +----------+
+/// | [01]| -> {0100, B}
+/// | [10]| (empty)
+/// | [11]| (empty)
+/// +--------+
+/// Note object A is sunk down to a sub-trie during the insertion. All the
+/// nodes are inserted through compare-exchange to keep the trie thread-safe
+/// and lock-free.
+/// To find an object in the trie, walk the tree with prefix of the hash until
+/// the data node is found. Then the hash is compared with the hash stored in
+/// the data node to see if it is the same object.
+/// Hash collision is not allowed so it is recommended to use trie with a
+/// "strong" hashing algorithm. A well-distributed hash can also result in
+/// better performance and memory usage.
+/// It currently does not support iteration and deletion.
+/// Base class for a lock-free thread-safe hash-mapped trie.
+class ThreadSafeTrieRawHashMapBase {
+ static constexpr size_t TrieContentBaseSize = 4;
+ static constexpr size_t DefaultNumRootBits = 6;
+ static constexpr size_t DefaultNumSubtrieBits = 4;
+ template <class T> struct AllocValueType { // Layout model of one content allocation; only its size/align/offset are used below.
+ char Base[TrieContentBaseSize]; // Room for the node header that precedes the value.
+ std::aligned_union_t<sizeof(T), T> Content; // Suitably aligned raw storage for a T.
+ };
+ template <class T>
+ static constexpr size_t DefaultContentAllocSize = sizeof(AllocValueType<T>);
+ template <class T>
+ static constexpr size_t DefaultContentAllocAlign = alignof(AllocValueType<T>);
+ template <class T>
+ static constexpr size_t DefaultContentOffset =
+ offsetof(AllocValueType<T>, Content);
+ void operator delete(void *Ptr) { ::free(Ptr); }
+ LLVM_DUMP_METHOD void dump() const;
+ void print(raw_ostream &OS) const;
+ /// Result of a lookup. Suitable for an insertion hint. Maybe could be
+ /// expanded into an iterator of sorts, but likely not useful (visiting
+ /// everything in the trie should probably be done some way other than
+ /// through an iterator pattern).
+ class PointerBase { // Holds a raw pointer, an insertion hint, or nothing; the field I tells which.
+ protected:
+ void *get() const { return I == -2u ? P : nullptr; } // P is exposed only when I carries the -2u tag.
+ public:
+ PointerBase() noexcept = default; // Defaults (P = nullptr, I = -1u): get() is null and isHint() is false.
+ PointerBase(PointerBase &&) = default;
+ PointerBase(const PointerBase &) = default;
+ PointerBase &operator=(PointerBase &&) = default;
+ PointerBase &operator=(const PointerBase &) = default;
+ private:
+ friend class ThreadSafeTrieRawHashMapBase;
+ explicit PointerBase(void *Content) : P(Content), I(-2u) {} // Tags I with -2u so get() returns the pointer.
+ PointerBase(void *P, unsigned I, unsigned B) : P(P), I(I), B(B) {} // Stores all three fields verbatim.
+ bool isHint() const { return I != -1u && I != -2u; } // True when I is neither sentinel value.
+ void *P = nullptr; // Raw pointer; meaning depends on I.
+ unsigned I = -1u; // -1u: empty, -2u: P is returned by get(), anything else: hint.
+ unsigned B = 0; // Extra value stored with a hint; insert() forwards it as the start bit.
+ };
+ /// Find the stored content with hash.
+ PointerBase find(ArrayRef<uint8_t> Hash) const;
+ /// Insert and return the stored content.
+ PointerBase
+ insert(PointerBase Hint, ArrayRef<uint8_t> Hash,
+ function_ref<const uint8_t *(void *Mem, ArrayRef<uint8_t> Hash)>
+ Constructor);
+ ThreadSafeTrieRawHashMapBase() = delete;
+ ThreadSafeTrieRawHashMapBase(
+ size_t ContentAllocSize, size_t ContentAllocAlign, size_t ContentOffset,
+ std::optional<size_t> NumRootBits = std::nullopt,
+ std::optional<size_t> NumSubtrieBits = std::nullopt);
+ /// Destructor, which asserts if there's anything to do. Subclasses should
+ /// call \a destroyImpl().
+ ///
+ /// \pre \a destroyImpl() was already called.
+ ~ThreadSafeTrieRawHashMapBase();
+ void destroyImpl(function_ref<void(void *ValueMem)> Destructor);
+ ThreadSafeTrieRawHashMapBase(ThreadSafeTrieRawHashMapBase &&RHS);
+ // Move assignment can be implemented in a thread-safe way if NumRootBits and
+ // NumSubtrieBits are stored inside the Root.
+ ThreadSafeTrieRawHashMapBase &
+ operator=(ThreadSafeTrieRawHashMapBase &&RHS) = delete;
+ // No copy.
+ ThreadSafeTrieRawHashMapBase(const ThreadSafeTrieRawHashMapBase &) = delete;
+ ThreadSafeTrieRawHashMapBase &
+ operator=(const ThreadSafeTrieRawHashMapBase &) = delete;
+ // Debug functions. Implementation details and not guaranteed to be
+ // thread-safe.
+ PointerBase getRoot() const;
+ unsigned getStartBit(PointerBase P) const;
+ unsigned getNumBits(PointerBase P) const;
+ unsigned getNumSlotUsed(PointerBase P) const;
+ std::string getTriePrefixAsString(PointerBase P) const;
+ unsigned getNumTries() const;
+ // Visit next trie in the allocation chain.
+ PointerBase getNextTrie(PointerBase P) const;
+ friend class TrieRawHashMapTestHelper;
+ const unsigned short ContentAllocSize;
+ const unsigned short ContentAllocAlign;
+ const unsigned short ContentOffset;
+ unsigned short NumRootBits;
+ unsigned short NumSubtrieBits;
+ struct ImplType;
+ // ImplPtr is owned by ThreadSafeTrieRawHashMapBase and needs to be freed in
+ // destroyImpl.
+ std::atomic<ImplType *> ImplPtr;
+ ImplType &getOrCreateImpl();
+ ImplType *getImpl() const;
+/// Lock-free thread-safe hash-mapped trie.
+template <class T, size_t NumHashBytes>
+class ThreadSafeTrieRawHashMap : public ThreadSafeTrieRawHashMapBase {
+ using HashT = std::array<uint8_t, NumHashBytes>;
+ class LazyValueConstructor;
+ struct value_type { // A hash together with the user data stored under it.
+ const HashT Hash; // Fixed at construction.
+ T Data;
+ value_type(value_type &&) = default;
+ value_type(const value_type &) = default;
+ value_type(ArrayRef<uint8_t> Hash, const T &Data) // Copies Data.
+ : Hash(makeHash(Hash)), Data(Data) {}
+ value_type(ArrayRef<uint8_t> Hash, T &&Data) // Moves Data.
+ : Hash(makeHash(Hash)), Data(std::move(Data)) {}
+ private:
+ friend class LazyValueConstructor;
+ struct EmplaceTag {}; // Tag that selects the forwarding constructor below.
+ template <class... ArgsT>
+ value_type(ArrayRef<uint8_t> Hash, EmplaceTag, ArgsT &&...Args) // Builds Data in place from Args.
+ : Hash(makeHash(Hash)), Data(std::forward<ArgsT>(Args)...) {}
+ static HashT makeHash(ArrayRef<uint8_t> HashRef) { // Copies the bytes of HashRef into a HashT.
+ HashT Hash;
+ std::copy(HashRef.begin(), HashRef.end(), Hash.data()); // NOTE(review): no size check here; presumably HashRef.size() == NumHashBytes - confirm with callers.
+ return Hash;
+ }
+ };
+ using ThreadSafeTrieRawHashMapBase::operator delete;
+ using HashType = HashT;
+ using ThreadSafeTrieRawHashMapBase::dump;
+ using ThreadSafeTrieRawHashMapBase::print;
+ template <class ValueT> class PointerImpl : PointerBase { // Typed view over PointerBase.
+ friend class ThreadSafeTrieRawHashMap;
+ ValueT *get() const { // Typed form of PointerBase::get(); null when the base yields null.
+ if (void *B = PointerBase::get())
+ return reinterpret_cast<ValueT *>(B);
+ return nullptr;
+ }
+ public:
+ ValueT &operator*() const { // Requires a non-null pointer (asserted).
+ assert(get());
+ return *get();
+ }
+ ValueT *operator->() const { // Requires a non-null pointer (asserted).
+ assert(get());
+ return get();
+ }
+ explicit operator bool() const { return get(); } // True when get() is non-null.
+ PointerImpl() = default;
+ PointerImpl(PointerImpl &&) = default;
+ PointerImpl(const PointerImpl &) = default;
+ PointerImpl &operator=(PointerImpl &&) = default;
+ PointerImpl &operator=(const PointerImpl &) = default;
+ protected:
+ PointerImpl(PointerBase Result) : PointerBase(Result) {} // Adopts an untyped result from the base class.
+ };
+ class pointer;
+ class const_pointer;
+ class pointer : public PointerImpl<value_type> { // Mutable handle to a stored value_type.
+ friend class ThreadSafeTrieRawHashMap;
+ friend class const_pointer;
+ public:
+ pointer() = default;
+ pointer(pointer &&) = default;
+ pointer(const pointer &) = default;
+ pointer &operator=(pointer &&) = default;
+ pointer &operator=(const pointer &) = default;
+ private:
+ pointer(PointerBase Result) : pointer::PointerImpl(Result) {} // Private: reachable only by the friends declared above.
+ };
+ class const_pointer : public PointerImpl<const value_type> { // Read-only handle to a stored value_type.
+ friend class ThreadSafeTrieRawHashMap;
+ public:
+ const_pointer() = default;
+ const_pointer(const_pointer &&) = default;
+ const_pointer(const const_pointer &) = default;
+ const_pointer &operator=(const_pointer &&) = default;
+ const_pointer &operator=(const const_pointer &) = default;
+ const_pointer(const pointer &P) : const_pointer::PointerImpl(P) {} // Implicit conversion from the mutable handle.
+ private:
+ const_pointer(PointerBase Result) : const_pointer::PointerImpl(Result) {} // Private: reachable only by the friend declared above.
+ };
+ class LazyValueConstructor { // One-shot helper that placement-constructs a value_type into Mem.
+ public:
+ value_type &operator()(T &&RHS) { // Construct by moving RHS into the new value.
+ assert(Mem && "Constructor already called, or moved away");
+ return assign(::new (Mem) value_type(Hash, std::move(RHS)));
+ }
+ value_type &operator()(const T &RHS) { // Construct by copying RHS into the new value.
+ assert(Mem && "Constructor already called, or moved away");
+ return assign(::new (Mem) value_type(Hash, RHS));
+ }
+ template <class... ArgsT> value_type &emplace(ArgsT &&...Args) { // Construct the data in place from Args.
+ assert(Mem && "Constructor already called, or moved away");
+ return assign(::new (Mem)
+ value_type(Hash, typename value_type::EmplaceTag{},
+ std::forward<ArgsT>(Args)...));
+ }
+ LazyValueConstructor(LazyValueConstructor &&RHS)
+ : Mem(RHS.Mem), Result(RHS.Result), Hash(RHS.Hash) {
+ RHS.Mem = nullptr; // Moved away, cannot call.
+ }
+ ~LazyValueConstructor() { assert(!Mem && "Constructor never called!"); } // Mem must have been consumed or moved away.
+ private:
+ value_type &assign(value_type *V) { // Marks the constructor as used and publishes V through Result.
+ Mem = nullptr;
+ Result = V;
+ return *V;
+ }
+ friend class ThreadSafeTrieRawHashMap;
+ LazyValueConstructor() = delete;
+ LazyValueConstructor(void *Mem, value_type *&Result, ArrayRef<uint8_t> Hash)
+ : Mem(Mem), Result(Result), Hash(Hash) {
+ assert(Hash.size() == sizeof(HashT) && "Invalid hash");
+ assert(Mem && "Invalid memory for construction");
+ }
+ void *Mem; // Destination storage; null once used or moved from.
+ value_type *&Result; // Caller-owned slot that receives the constructed object.
+ ArrayRef<uint8_t> Hash; // Hash bytes to store in the new value.
+ };
+ /// Insert with a hint. Default-constructed hint will work, but it's
+ /// recommended to start with a lookup to avoid overhead in object creation
+ /// if it already exists.
+ pointer insertLazy(const_pointer Hint, ArrayRef<uint8_t> Hash,
+ function_ref<void(LazyValueConstructor)> OnConstruct) {
+ return pointer(ThreadSafeTrieRawHashMapBase::insert( // Base insert() calls the lambda with raw memory for the value.
+ Hint, Hash, [&](void *Mem, ArrayRef<uint8_t> Hash) {
+ value_type *Result = nullptr; // Filled in through the reference held by LazyValueConstructor.
+ OnConstruct(LazyValueConstructor(Mem, Result, Hash));
+ return Result->Hash.data(); // Address of the stored hash. NOTE(review): relies on OnConstruct having constructed the value.
+ }));
+ }
+ pointer insertLazy(ArrayRef<uint8_t> Hash, // Overload without a hint.
+ function_ref<void(LazyValueConstructor)> OnConstruct) {
+ return insertLazy(const_pointer(), Hash, OnConstruct); // Forwards with a default-constructed hint.
+ }
+ pointer insert(const_pointer Hint, value_type &&HashedData) { // Inserts under HashedData.Hash, moving the data if construction happens.
+ return insertLazy(Hint, HashedData.Hash, [&](LazyValueConstructor C) {
+ C(std::move(HashedData.Data)); // Runs only if insertLazy invokes the callback.
+ });
+ }
+ pointer insert(const_pointer Hint, const value_type &HashedData) { // Inserts under HashedData.Hash, copying the data if construction happens.
+ return insertLazy(Hint, HashedData.Hash,
+ [&](LazyValueConstructor C) { C(HashedData.Data); });
+ }
+ pointer find(ArrayRef<uint8_t> Hash) { // Looks up Hash; the result may be a hint rather than a value.
+ assert(Hash.size() == std::tuple_size<HashT>::value); // Hash must be exactly NumHashBytes long.
+ return ThreadSafeTrieRawHashMapBase::find(Hash);
+ }
+ const_pointer find(ArrayRef<uint8_t> Hash) const { // Const overload; same lookup, read-only handle.
+ assert(Hash.size() == std::tuple_size<HashT>::value); // Hash must be exactly NumHashBytes long.
+ return ThreadSafeTrieRawHashMapBase::find(Hash);
+ }
+ ThreadSafeTrieRawHashMap(std::optional<size_t> NumRootBits = std::nullopt, // nullopt leaves the choice to the base class.
+ std::optional<size_t> NumSubtrieBits = std::nullopt)
+ : ThreadSafeTrieRawHashMapBase(DefaultContentAllocSize<value_type>, // Allocation geometry derived from AllocValueType<value_type>.
+ DefaultContentAllocAlign<value_type>,
+ DefaultContentOffset<value_type>,
+ NumRootBits, NumSubtrieBits) {}
+ ~ThreadSafeTrieRawHashMap() { // Must call destroyImpl(); the base destructor asserts that it was called.
+ if constexpr (std::is_trivially_destructible<value_type>::value)
+ this->destroyImpl(nullptr); // No per-value destructor needed.
+ else
+ this->destroyImpl(
+ [](void *P) { static_cast<value_type *>(P)->~value_type(); }); // Runs the value destructor on each stored value.
+ }
+ // Move constructor okay.
+ ThreadSafeTrieRawHashMap(ThreadSafeTrieRawHashMap &&) = default;
+ // No move assignment or any copy.
+ ThreadSafeTrieRawHashMap &operator=(ThreadSafeTrieRawHashMap &&) = delete;
+ ThreadSafeTrieRawHashMap(const ThreadSafeTrieRawHashMap &) = delete;
+ ThreadSafeTrieRawHashMap &
+ operator=(const ThreadSafeTrieRawHashMap &) = delete;
+} // namespace llvm
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 97188b0672f032..0625d375bad487 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -248,6 +248,7 @@ add_llvm_component_library(LLVMSupport
+ TrieRawHashMap.cpp
diff --git a/llvm/lib/Support/TrieHashIndexGenerator.h b/llvm/lib/Support/TrieHashIndexGenerator.h
new file mode 100644
index 00000000000000..c9e9b70e10d3c7
--- /dev/null
+++ b/llvm/lib/Support/TrieHashIndexGenerator.h
@@ -0,0 +1,89 @@
+//===- TrieHashIndexGenerator.h ---------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/ArrayRef.h"
+#include <optional>
+namespace llvm {
+struct IndexGenerator {
+ size_t NumRootBits;
+ size_t NumSubtrieBits;
+ ArrayRef<uint8_t> Bytes;
+ std::optional<size_t> StartBit = std::nullopt;
+ size_t getNumBits() const { // Bit width of the current level, clamped to the bits left in Bytes.
+ assert(StartBit); // Only valid after next() or hint() has set StartBit.
+ size_t TotalNumBits = Bytes.size() * 8;
+ assert(*StartBit <= TotalNumBits);
+ return std::min(*StartBit ? NumSubtrieBits : NumRootBits, // StartBit == 0 means the root level.
+ TotalNumBits - *StartBit);
+ }
+ size_t next() { // Advances one level and returns the slot index for that level.
+ size_t Index;
+ if (!StartBit) { // First call: start at bit 0 with the root width.
+ StartBit = 0;
+ Index = getIndex(Bytes, *StartBit, NumRootBits);
+ } else {
+ *StartBit += *StartBit ? NumSubtrieBits : NumRootBits; // Skip the bits consumed by the level just left.
+ assert((*StartBit - NumRootBits) % NumSubtrieBits == 0); // Must land on a subtrie boundary.
+ Index = getIndex(Bytes, *StartBit, NumSubtrieBits);
+ }
+ return Index;
+ }
+ size_t hint(unsigned Index, unsigned Bit) { // Positions the generator at Bit and echoes Index back.
+ assert(Index >= 0); // NOTE(review): Index is unsigned, so this check is always true.
+ assert(Bit < Bytes.size() * 8);
+ assert(Bit == 0 || (Bit - NumRootBits) % NumSubtrieBits == 0); // Bit must be 0 or a subtrie boundary.
+ StartBit = Bit;
+ return Index;
+ }
+ size_t getCollidingBits(ArrayRef<uint8_t> CollidingBits) const { // Index that another hash yields at the current StartBit.
+ assert(StartBit);
+ return getIndex(CollidingBits, *StartBit, NumSubtrieBits); // Always uses the subtrie width.
+ }
+ static size_t getIndex(ArrayRef<uint8_t> Bytes, size_t StartBit, // Reads up to NumBits bits of Bytes starting at StartBit.
+ size_t NumBits) {
+ assert(StartBit < Bytes.size() * 8);
+ Bytes = Bytes.drop_front(StartBit / 8u); // Skip the whole bytes before StartBit.
+ StartBit %= 8u; // Now the bit offset inside the first remaining byte.
+ size_t Index = 0;
+ for (uint8_t Byte : Bytes) {
+ size_t ByteStart = 0, ByteEnd = 8;
+ if (StartBit) { // Only the first byte can start mid-byte.
+ ByteStart = StartBit;
+ Byte &= (1u << (8 - StartBit)) - 1u; // Clear the high bits that precede StartBit.
+ StartBit = 0;
+ }
+ size_t CurrentNumBits = ByteEnd - ByteStart; // Bits available in this byte.
+ if (CurrentNumBits > NumBits) { // More available than wanted: keep only the upper NumBits of them.
+ Byte >>= CurrentNumBits - NumBits;
+ CurrentNumBits = NumBits;
+ }
+ Index <<= CurrentNumBits; // Append this byte's bits below those gathered so far.
+ Index |= Byte & ((1u << CurrentNumBits) - 1u);
+ assert(NumBits >= CurrentNumBits);
+ NumBits -= CurrentNumBits;
+ if (!NumBits) // All requested bits gathered.
+ break;
+ }
+ return Index; // May hold fewer than the requested bits if Bytes ran out.
+ }
+} // namespace llvm
diff --git a/llvm/lib/Support/TrieRawHashMap.cpp b/llvm/lib/Support/TrieRawHashMap.cpp
new file mode 100644
index 00000000000000..af4cd8b57aed21
--- /dev/null
+++ b/llvm/lib/Support/TrieRawHashMap.cpp
@@ -0,0 +1,483 @@
+//===- TrieRawHashMap.cpp -------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/TrieRawHashMap.h"
+#include "TrieHashIndexGenerator.h"
+#include "llvm/ADT/LazyAtomicPointer.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ThreadSafeAllocator.h"
+#include "llvm/Support/raw_ostream.h"
+#include <memory>
+using namespace llvm;
+namespace {
+struct TrieNode {
+ const bool IsSubtrie = false;
+ TrieNode(bool IsSubtrie) : IsSubtrie(IsSubtrie) {}
+ static void *operator new(size_t Size) { return ::malloc(Size); }
+ void operator delete(void *Ptr) { ::free(Ptr); }
+struct TrieContent final : public TrieNode {
+ const uint8_t ContentOffset;
+ const uint8_t HashSize;
+ const uint8_t HashOffset;
+ void *getValuePointer() const { // Address located ContentOffset bytes past the start of this node.
+ auto Content = reinterpret_cast<const uint8_t *>(this) + ContentOffset;
+ return const_cast<uint8_t *>(Content); // Drops const so the result can be returned as void *.
+ }
+ ArrayRef<uint8_t> getHash() const { // View of HashSize bytes located HashOffset bytes past this node.
+ auto *Begin = reinterpret_cast<const uint8_t *>(this) + HashOffset;
+ return ArrayRef(Begin, Begin + HashSize);
+ }
+ TrieContent(size_t ContentOffset, size_t HashSize, size_t HashOffset) // NOTE(review): size_t arguments are narrowed into uint8_t members.
+ : TrieNode(/*IsSubtrie=*/false), ContentOffset(ContentOffset),
+ HashSize(HashSize), HashOffset(HashOffset) {}
+static_assert(sizeof(TrieContent) ==
+ ThreadSafeTrieRawHashMapBase::TrieContentBaseSize,
+ "Check header assumption!");
+class TrieSubtrie final : public TrieNode {
+ TrieNode *get(size_t I) const { return Slots[I].load(); }
+ TrieSubtrie *
+ sink(size_t I, TrieContent &Content, size_t NumSubtrieBits, size_t NewI,
+ function_ref<TrieSubtrie *(std::unique_ptr<TrieSubtrie>)> Saver);
+ static std::unique_ptr<TrieSubtrie> create(size_t StartBit, size_t NumBits);
+ explicit TrieSubtrie(size_t StartBit, size_t NumBits);
+ // FIXME: Use a bitset to speed up access:
+ //
+ // std::array<std::atomic<uint64_t>, NumSlots/64> IsSet;
+ //
+ // This will avoid needing to visit sparsely filled slots in
+ // \a ThreadSafeTrieRawHashMapBase::destroyImpl() when there's a non-trivial
+ // destructor.
+ //
+ // It would also greatly speed up iteration, if we add that some day, and
+ // allow get() to return one level sooner.
+ //
+ // This would be the algorithm for updating IsSet (after updating Slots):
+ //
+ // std::atomic<uint64_t> &Bits = IsSet[I.High];
+ // const uint64_t NewBit = 1ULL << I.Low;
+ // uint64_t Old = 0;
+ // while (!Bits.compare_exchange_weak(Old, Old | NewBit))
+ // ;
+ // For debugging.
+ unsigned StartBit = 0;
+ unsigned NumBits = 0;
+ friend class llvm::ThreadSafeTrieRawHashMapBase;
+ /// Linked list for ownership of tries. The pointer is owned by TrieSubtrie.
+ std::atomic<TrieSubtrie *> Next;
+ /// The (co-allocated) slots of the subtrie.
+ MutableArrayRef<LazyAtomicPointer<TrieNode>> Slots;
+} // end namespace
+namespace llvm {
+template <> struct isa_impl<TrieContent, TrieNode> {
+ static inline bool doit(const TrieNode &TN) { return !TN.IsSubtrie; }
+template <> struct isa_impl<TrieSubtrie, TrieNode> {
+ static inline bool doit(const TrieNode &TN) { return TN.IsSubtrie; }
+} // end namespace llvm
+static size_t getTrieTailSize(size_t StartBit, size_t NumBits) {
+ assert(NumBits < 20 && "Tries should have fewer than ~1M slots");
+ return sizeof(TrieNode *) * (1u << NumBits);
+std::unique_ptr<TrieSubtrie> TrieSubtrie::create(size_t StartBit,
+ size_t NumBits) {
+ size_t Size = sizeof(TrieSubtrie) + getTrieTailSize(StartBit, NumBits);
+ void *Memory = ::malloc(Size);
+ TrieSubtrie *S = ::new (Memory) TrieSubtrie(StartBit, NumBits);
+ return std::unique_ptr<TrieSubtrie>(S);
+TrieSubtrie::TrieSubtrie(size_t StartBit, size_t NumBits)
+ : TrieNode(true), StartBit(StartBit), NumBits(NumBits), Next(nullptr),
+ Slots(reinterpret_cast<LazyAtomicPointer<TrieNode> *>(
+ reinterpret_cast<char *>(this) + sizeof(TrieSubtrie)),
+ (1u << NumBits)) {
+ for (auto *I = Slots.begin(), *E = Slots.end(); I != E; ++I)
+ new (I) LazyAtomicPointer<TrieNode>(nullptr);
+ static_assert(
+ std::is_trivially_destructible<LazyAtomicPointer<TrieNode>>::value,
+ "Expected no work in destructor for TrieNode");
+TrieSubtrie *TrieSubtrie::sink(
+ size_t I, TrieContent &Content, size_t NumSubtrieBits, size_t NewI,
+ function_ref<TrieSubtrie *(std::unique_ptr<TrieSubtrie>)> Saver) {
+ assert(NumSubtrieBits > 0);
+ std::unique_ptr<TrieSubtrie> S = create(StartBit + NumBits, NumSubtrieBits);
+ assert(NewI < S->Slots.size());
+ S->Slots[NewI].store(&Content);
+ TrieNode *ExistingNode = &Content;
+ assert(I < Slots.size());
+ if (Slots[I].compare_exchange_strong(ExistingNode, S.get()))
+ return Saver(std::move(S));
+ // Another thread created a subtrie already. Return it and let "S" be
+ // destructed.
+ return cast<TrieSubtrie>(ExistingNode);
+struct ThreadSafeTrieRawHashMapBase::ImplType {
+ static std::unique_ptr<ImplType> create(size_t StartBit, size_t NumBits) { // Allocates ImplType plus room for the root's slots in one block.
+ size_t Size = sizeof(ImplType) + getTrieTailSize(StartBit, NumBits); // Object plus tail storage for the slots.
+ void *Memory = ::malloc(Size); // NOTE(review): the result is not checked for null before the placement-new.
+ ImplType* Impl = ::new (Memory) ImplType(StartBit, NumBits);
+ return std::unique_ptr<ImplType>(Impl); // Released through the class operator delete, which calls ::free.
+ }
+ TrieSubtrie *save(std::unique_ptr<TrieSubtrie> S) { // Links S into the list headed by Root.Next and returns it.
+ assert(!S->Next && "Expected S to a freshly-constructed leaf");
+ TrieSubtrie *CurrentHead = nullptr;
+ // Add ownership of "S" to front of the list, so that Root -> S ->
+ // Root.Next. This works by repeatedly setting S->Next to a candidate value
+ // of Root.Next (initially nullptr), then setting Root.Next to S once the
+ // candidate matches reality.
+ while (!Root.Next.compare_exchange_weak(CurrentHead, S.get())) // A failed exchange refreshes CurrentHead with the actual head.
+ S->Next.exchange(CurrentHead);
+ // Ownership transferred to subtrie.
+ return S.release();
+ }
+ static void *operator new(size_t Size) { return ::malloc(Size); }
+ void operator delete(void *Ptr) { ::free(Ptr); }
+ /// FIXME: This should take a function that allocates and constructs the
+ /// content lazily (taking the hash as a separate parameter), in case of
+ /// collision.
+ ThreadSafeAllocator<BumpPtrAllocator> ContentAlloc;
+ TrieSubtrie Root; // Must be last! Tail-allocated.
+ ImplType(size_t StartBit, size_t NumBits) : Root(StartBit, NumBits) {}
+ThreadSafeTrieRawHashMapBase::ImplType &
+ThreadSafeTrieRawHashMapBase::getOrCreateImpl() {
+ if (ImplType *Impl = ImplPtr.load())
+ return *Impl;
+ // Create a new ImplType and store it if another thread doesn't do so first.
+ // If another thread wins this one is destroyed locally.
+ std::unique_ptr<ImplType> Impl = ImplType::create(0, NumRootBits);
+ ImplType *ExistingImpl = nullptr;
+ if (ImplPtr.compare_exchange_strong(ExistingImpl, Impl.get()))
+ return *Impl.release();
+ return *ExistingImpl;
+ThreadSafeTrieRawHashMapBase::find(ArrayRef<uint8_t> Hash) const {
+ assert(!Hash.empty() && "Uninitialized hash");
+ ImplType *Impl = ImplPtr.load();
+ if (!Impl)
+ return PointerBase();
+ TrieSubtrie *S = &Impl->Root;
+ IndexGenerator IndexGen{NumRootBits, NumSubtrieBits, Hash};
+ size_t Index = IndexGen.next();
+ while (true) {
+ // Try to set the content.
+ TrieNode *Existing = S->get(Index);
+ if (!Existing)
+ return PointerBase(S, Index, *IndexGen.StartBit);
+ // Check for an exact match.
+ if (auto *ExistingContent = dyn_cast<TrieContent>(Existing))
+ return ExistingContent->getHash() == Hash
+ ? PointerBase(ExistingContent->getValuePointer())
+ : PointerBase(S, Index, *IndexGen.StartBit);
+ Index = IndexGen.next();
+ S = cast<TrieSubtrie>(Existing);
+ }
+ThreadSafeTrieRawHashMapBase::PointerBase ThreadSafeTrieRawHashMapBase::insert(
+ PointerBase Hint, ArrayRef<uint8_t> Hash,
+ function_ref<const uint8_t *(void *Mem, ArrayRef<uint8_t> Hash)>
+ Constructor) {
+ assert(!Hash.empty() && "Uninitialized hash");
+ ImplType &Impl = getOrCreateImpl();
+ TrieSubtrie *S = &Impl.Root;
+ IndexGenerator IndexGen{NumRootBits, NumSubtrieBits, Hash};
+ size_t Index;
+ if (Hint.isHint()) {
+ S = static_cast<TrieSubtrie *>(Hint.P);
+ Index = IndexGen.hint(Hint.I, Hint.B);
+ } else {
+ Index = IndexGen.next();
+ }
+ while (true) {
+ // Load the node from the slot, allocating and calling the constructor if
+ // the slot is empty.
+ bool Generated = false;
+ TrieNode &Existing = S->Slots[Index].loadOrGenerate([&]() {
+ Generated = true;
+ // Construct the value itself at the tail.
+ uint8_t *Memory = reinterpret_cast<uint8_t *>(
+ Impl.ContentAlloc.Allocate(ContentAllocSize, ContentAllocAlign));
+ const uint8_t *HashStorage = Constructor(Memory + ContentOffset, Hash);
+ // Construct the TrieContent header, passing in the offset to the hash.
+ TrieContent *Content = ::new (Memory)
+ TrieContent(ContentOffset, Hash.size(), HashStorage - Memory);
+ assert(Hash == Content->getHash() && "Hash not properly initialized");
+ return Content;
+ });
+ // If we just generated it, return it!
+ if (Generated)
+ return PointerBase(cast<TrieContent>(Existing).getValuePointer());
+ if (auto *ST = dyn_cast<TrieSubtrie>(&Existing)) {
+ S = ST;
+ Index = IndexGen.next();
+ continue;
+ }
+ // Return the existing content if it's an exact match!
+ auto &ExistingContent = cast<TrieContent>(Existing);
+ if (ExistingContent.getHash() == Hash)
+ return PointerBase(ExistingContent.getValuePointer());
+ // Sink the existing content as long as the indexes match.
+ while (true) {
+ size_t NextIndex = IndexGen.next();
+ size_t NewIndexForExistingContent =
+ IndexGen.getCollidingBits(ExistingContent.getHash());
+ S = S->sink(Index, ExistingContent, IndexGen.getNumBits(),
+ NewIndexForExistingContent,
+ [&Impl](std::unique_ptr<TrieSubtrie> S) {
+ return Impl.save(std::move(S));
+ });
+ Index = NextIndex;
+ // Found the difference.
+ if (NextIndex != NewIndexForExistingContent)
+ break;
+ }
+ }
+ size_t ContentAllocSize, size_t ContentAllocAlign, size_t ContentOffset,
+ std::optional<size_t> NumRootBits, std::optional<size_t> NumSubtrieBits)
+ : ContentAllocSize(ContentAllocSize), ContentAllocAlign(ContentAllocAlign),
+ ContentOffset(ContentOffset),
+ NumRootBits(NumRootBits ? *NumRootBits : DefaultNumRootBits),
+ NumSubtrieBits(NumSubtrieBits ? *NumSubtrieBits : DefaultNumSubtrieBits),
+ ImplPtr(nullptr) {
+ assert((!NumRootBits || *NumRootBits < 20) &&
+ "Root should have fewer than ~1M slots");
+ assert((!NumSubtrieBits || *NumSubtrieBits < 10) &&
+ "Subtries should have fewer than ~1K slots");
+ ThreadSafeTrieRawHashMapBase &&RHS)
+ : ContentAllocSize(RHS.ContentAllocSize),
+ ContentAllocAlign(RHS.ContentAllocAlign),
+ ContentOffset(RHS.ContentOffset), NumRootBits(RHS.NumRootBits),
+ NumSubtrieBits(RHS.NumSubtrieBits) {
+ // Steal the root from RHS.
+ ImplPtr = RHS.ImplPtr.exchange(nullptr);
+ThreadSafeTrieRawHashMapBase::~ThreadSafeTrieRawHashMapBase() {
+ assert(!ImplPtr.load() && "Expected subclass to call destroyImpl()");
+void ThreadSafeTrieRawHashMapBase::destroyImpl(
+ function_ref<void(void *)> Destructor) {
+ std::unique_ptr<ImplType> Impl(ImplPtr.exchange(nullptr));
+ if (!Impl)
+ return;
+ // Destroy content nodes throughout trie. Avoid destroying any subtries since
+ // we need TrieNode::classof() to find the content nodes.
+ //
+ // FIXME: Once we have bitsets (see FIXME in TrieSubtrie class), use them
+ // to facilitate sparse iteration here.
+ if (Destructor)
+ for (TrieSubtrie *Trie = &Impl->Root; Trie; Trie = Trie->Next.load())
+ for (auto &Slot : Trie->Slots)
+ if (auto *Content = dyn_cast_or_null<TrieContent>(Slot.load()))
+ Destructor(Content->getValuePointer());
+ // Destroy the subtries. Incidentally, this destroys them in the reverse order
+ // of saving.
+ TrieSubtrie *Trie = Impl->Root.Next;
+ while (Trie) {
+ TrieSubtrie *Next = Trie->Next.exchange(nullptr);
+ delete Trie;
+ Trie = Next;
+ }
+ThreadSafeTrieRawHashMapBase::getRoot() const {
+ ImplType *Impl = ImplPtr.load();
+ if (!Impl)
+ return PointerBase();
+ return PointerBase(&Impl->Root);
+unsigned ThreadSafeTrieRawHashMapBase::getStartBit(
+ ThreadSafeTrieRawHashMapBase::PointerBase P) const {
+ assert(!P.isHint() && "Not a valid trie");
+ if (!P.P)
+ return 0;
+ if (auto *S = dyn_cast<TrieSubtrie>((TrieNode *)P.P))
+ return S->StartBit;
+ return 0;
+unsigned ThreadSafeTrieRawHashMapBase::getNumBits(
+ ThreadSafeTrieRawHashMapBase::PointerBase P) const {
+ assert(!P.isHint() && "Not a valid trie");
+ if (!P.P)
+ return 0;
+ if (auto *S = dyn_cast<TrieSubtrie>((TrieNode *)P.P))
+ return S->NumBits;
+ return 0;
+unsigned ThreadSafeTrieRawHashMapBase::getNumSlotUsed(
+ ThreadSafeTrieRawHashMapBase::PointerBase P) const {
+ assert(!P.isHint() && "Not a valid trie");
+ if (!P.P)
+ return 0;
+ auto *S = dyn_cast<TrieSubtrie>((TrieNode *)P.P);
+ if (!S)
+ return 0;
+ unsigned Num = 0;
+ for (unsigned I = 0, E = S->Slots.size(); I < E; ++I)
+ if (auto *E = S->Slots[I].load())
+ ++Num;
+ return Num;
+std::string ThreadSafeTrieRawHashMapBase::getTriePrefixAsString(
+ ThreadSafeTrieRawHashMapBase::PointerBase P) const {
+ assert(!P.isHint() && "Not a valid trie");
+ if (!P.P)
+ return "";
+ auto *S = dyn_cast<TrieSubtrie>((TrieNode *)P.P);
+ if (!S || !S->IsSubtrie)
+ return "";
+ // Find a TrieContent node which has hash stored. Depth search following the
+ // first used slot until a TrieContent node is found.
+ TrieSubtrie *Current = S;
+ TrieContent *Node = nullptr;
+ while (Current) {
+ TrieSubtrie *Next = nullptr;
+ // find first used slot in the trie.
+ for (unsigned I = 0, E = Current->Slots.size(); I < E; ++I) {
+ auto *S = Current->get(I);
+ if (!S)
+ continue;
+ if (auto *Content = dyn_cast<TrieContent>(S))
+ Node = Content;
+ else if (auto *Sub = dyn_cast<TrieSubtrie>(S))
+ Next = Sub;
+ break;
+ }
+ // Found the node.
+ if (Node)
+ break;
+ // Continue to the next level if the node is not found.
+ Current = Next;
+ }
+ assert(Node && "malformed trie, cannot find TrieContent on leaf node");
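+ // The prefix for the current trie is the first `StartBit` of the content
+ // stored underneath this subtrie.
+ // NOTE(review): StartBit is unsigned, so the expression below wraps around
+ // when StartBit < 7; confirm that this is the intended behavior.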
+ std::string Str;
+ raw_string_ostream SS(Str);
+ unsigned StartFullBytes = (S->StartBit + 1) / 8 - 1;
+ SS << toHex(toStringRef(Node->getHash()).take_front(StartFullBytes),
+ /*LowerCase=*/true);
+ // For the part of the prefix that doesn't fill a byte, print raw bit values.
+ std::string Bits;
+ for (unsigned I = StartFullBytes * 8, E = S->StartBit; I < E; ++I) {
+ unsigned Index = I / 8;
+ unsigned Offset = 7 - I % 8;
+ Bits.push_back('0' + ((Node->getHash()[Index] >> Offset) & 1));
+ }
+ if (!Bits.empty())
+ SS << "[" << Bits << "]";
+ return SS.str();
+unsigned ThreadSafeTrieRawHashMapBase::getNumTries() const {
+ ImplType *Impl = ImplPtr.load();
+ if (!Impl)
+ return 0;
+ unsigned Num = 0;
+ for (TrieSubtrie *Trie = &Impl->Root; Trie; Trie = Trie->Next.load())
+ ++Num;
+ return Num;
+ ThreadSafeTrieRawHashMapBase::PointerBase P) const {
+ assert(!P.isHint() && "Not a valid trie");
+ if (!P.P)
+ return PointerBase();
+ auto *S = dyn_cast<TrieSubtrie>((TrieNode *)P.P);
+ if (!S)
+ return PointerBase();
+ if (auto *E = S->Next.load())
+ return PointerBase(E);
+ return PointerBase();
diff --git a/llvm/unittests/ADT/CMakeLists.txt b/llvm/unittests/ADT/CMakeLists.txt
index 745e4d9fb74a4a..b0077d5b54a3ee 100644
--- a/llvm/unittests/ADT/CMakeLists.txt
+++ b/llvm/unittests/ADT/CMakeLists.txt
@@ -86,6 +86,7 @@ add_llvm_unittest(ADTTests
+ TrieRawHashMapTest.cpp
diff --git a/llvm/unittests/ADT/TrieRawHashMapTest.cpp b/llvm/unittests/ADT/TrieRawHashMapTest.cpp
new file mode 100644
index 00000000000000..bd3610666ec941
--- /dev/null
+++ b/llvm/unittests/ADT/TrieRawHashMapTest.cpp
@@ -0,0 +1,342 @@
+//===- TrieRawHashMapTest.cpp ---------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/TrieRawHashMap.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/SHA1.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+namespace llvm {
+class TrieRawHashMapTestHelper {
+ TrieRawHashMapTestHelper() = default;
+ void setTrie(ThreadSafeTrieRawHashMapBase *T) { Trie = T; }
+ ThreadSafeTrieRawHashMapBase::PointerBase getRoot() const {
+ return Trie->getRoot();
+ }
+ unsigned getStartBit(ThreadSafeTrieRawHashMapBase::PointerBase P) const {
+ return Trie->getStartBit(P);
+ }
+ unsigned getNumBits(ThreadSafeTrieRawHashMapBase::PointerBase P) const {
+ return Trie->getNumBits(P);
+ }
+ unsigned getNumSlotUsed(ThreadSafeTrieRawHashMapBase::PointerBase P) const {
+ return Trie->getNumSlotUsed(P);
+ }
+ unsigned getNumTries() const { return Trie->getNumTries(); }
+ std::string
+ getTriePrefixAsString(ThreadSafeTrieRawHashMapBase::PointerBase P) const {
+ return Trie->getTriePrefixAsString(P);
+ }
+ ThreadSafeTrieRawHashMapBase::PointerBase
+ getNextTrie(ThreadSafeTrieRawHashMapBase::PointerBase P) const {
+ return Trie->getNextTrie(P);
+ }
+ ThreadSafeTrieRawHashMapBase *Trie = nullptr;
+} // namespace llvm
+namespace {
+template <typename DataType, size_t HashSize>
+class SimpleTrieHashMapTest : public TrieRawHashMapTestHelper,
+ public ::testing::Test {
+ using NumType = DataType;
+ using HashType = std::array<uint8_t, HashSize>;
+ using TrieType = ThreadSafeTrieRawHashMap<DataType, sizeof(HashType)>;
+ TrieType &createTrie(size_t RootBits, size_t SubtrieBits) {
+ auto &Ret = Trie.emplace(RootBits, SubtrieBits);
+ TrieRawHashMapTestHelper::setTrie(&Ret);
+ return Ret;
+ }
+ void destroyTrie() { Trie.reset(); }
+ ~SimpleTrieHashMapTest() {
+ if (Trie)
+ Trie.reset();
+ }
+  /// Build a HashType holding the big-endian byte encoding of \p Num.
+  ///
+  /// Use the number itself as hash to test the pathological case.
+  static HashType hash(uint64_t Num) {
+    // The memcpy below reads sizeof(HashType) bytes out of a uint64_t; a wider
+    // hash type would read past the end of HashN.
+    static_assert(sizeof(HashType) <= sizeof(uint64_t),
+                  "hash() can only fill hashes of up to 8 bytes");
+    uint64_t HashN = llvm::support::endian::byte_swap(Num, llvm::support::big);
+    HashType Hash;
+    memcpy(&Hash[0], &HashN, sizeof(HashType));
+    return Hash;
+  };
+ std::optional<TrieType> Trie;
+using SmallNodeTrieTest = SimpleTrieHashMapTest<uint64_t, sizeof(uint64_t)>;
+TEST_F(SmallNodeTrieTest, TrieAllocation) {
+ NumType Numbers[] = {
+ 0x0, std::numeric_limits<NumType>::max(), 0x1, 0x2,
+ 0x3, std::numeric_limits<NumType>::max() - 1u,
+ };
+ unsigned ExpectedTries[] = {
+ 1, // Allocate Root.
+ 1, // Both on the root.
+ 64, // 0 and 1 sinks all the way down.
+ 64, // no new allocation needed.
+ 65, // need a new node between 2 and 3.
+ 65 + 63, // 63 new allocation to sink two big numbers all the way.
+ };
+ const char *ExpectedPrefix[] = {
+ "", // Root.
+ "", // Root.
+ "00000000000000[0000000]",
+ "00000000000000[0000000]",
+ "00000000000000[0000001]",
+ "ffffffffffffff[1111111]",
+ };
+ // Use root and subtrie sizes of 1 so this gets sunk quite deep.
+ auto &Trie = createTrie(/*RootBits=*/1, /*SubtrieBits=*/1);
+ for (unsigned I = 0; I < 6; ++I) {
+ // Lookup first to exercise hint code for deep tries.
+ TrieType::pointer Lookup = Trie.find(hash(Numbers[I]));
+ Trie.insert(Lookup, TrieType::value_type(hash(Numbers[I]), Numbers[I]));
+ EXPECT_EQ(getNumTries(), ExpectedTries[I]);
+ EXPECT_EQ(getTriePrefixAsString(getNextTrie(getRoot())), ExpectedPrefix[I]);
+ }
+TEST_F(SmallNodeTrieTest, TrieStructure) {
+ NumType Numbers[] = {
+ // Three numbers that will nest deeply to test (1) sinking subtries and
+ // (2) deep, non-trivial hints.
+ std::numeric_limits<NumType>::max(),
+ std::numeric_limits<NumType>::max() - 2u,
+ std::numeric_limits<NumType>::max() - 3u,
+ // One number to stay at the top-level.
+ 0x37,
+ };
+ // Use root and subtrie sizes of 1 so this gets sunk quite deep.
+ auto &Trie = createTrie(/*RootBits=*/1, /*SubtrieBits=*/1);
+ for (NumType N : Numbers) {
+ // Lookup first to exercise hint code for deep tries.
+ TrieType::pointer Lookup = Trie.find(hash(N));
+ Trie.insert(Lookup, TrieType::value_type(hash(N), N));
+ }
+ for (NumType N : Numbers) {
+ TrieType::pointer Lookup = Trie.find(hash(N));
+ EXPECT_TRUE(Lookup);
+ if (!Lookup)
+ continue;
+ EXPECT_EQ(hash(N), Lookup->Hash);
+ EXPECT_EQ(N, Lookup->Data);
+ // Confirm a subsequent insertion fails to overwrite by trying to insert a
+ // bad value.
+ auto Result = Trie.insert(Lookup, TrieType::value_type(hash(N), N - 1));
+ EXPECT_EQ(N, Result->Data);
+ }
+ // Check the trie so we can confirm the structure is correct. Each subtrie
+ // should have 2 slots. The root's index=0 should have the content for
+ // 0x37 directly, and index=1 should be a linked-list of subtries, finally
+ // ending with content for (max-2) and (max-3).
+ //
+ // Note: This structure is not exhaustive (too expensive to update tests),
+ // but it does test that the dump format is somewhat readable and that the
+ // basic structure is correct.
+ //
+ // Note: This test requires that the trie reads bytes starting from index 0
+ // of the array of uint8_t, and then reads each byte's bits from high to low.
+ // Check the Trie.
+ // We should have allocated a total of 64 subtries for a 64-bit hash.
+ ASSERT_EQ(getNumTries(), 64u);
+ // Check the root trie. Two slots and both are used.
+ ASSERT_EQ(getNumSlotUsed(getRoot()), 2u);
+ // Check last subtrie.
+ // Last allocated trie is the next node in the allocation chain.
+ auto LastAlloctedSubTrie = getNextTrie(getRoot());
+ ASSERT_EQ(getTriePrefixAsString(LastAlloctedSubTrie),
+ "ffffffffffffff[1111110]");
+ ASSERT_EQ(getStartBit(LastAlloctedSubTrie), 63u);
+ ASSERT_EQ(getNumBits(LastAlloctedSubTrie), 1u);
+ ASSERT_EQ(getNumSlotUsed(LastAlloctedSubTrie), 2u);
+TEST_F(SmallNodeTrieTest, TrieStructureSmallFinalSubtrie) {
+ NumType Numbers[] = {
+ // Three numbers that will nest deeply to test (1) sinking subtries and
+ // (2) deep, non-trivial hints.
+ std::numeric_limits<NumType>::max(),
+ std::numeric_limits<NumType>::max() - 2u,
+ std::numeric_limits<NumType>::max() - 3u,
+ // One number to stay at the top-level.
+ 0x37,
+ };
+ // Use subtrie size of 5 to avoid hitting 64 evenly, making the final subtrie
+ // small.
+ auto &Trie = createTrie(/*RootBits=*/8, /*SubtrieBits=*/5);
+ for (NumType N : Numbers) {
+ // Lookup first to exercise hint code for deep tries.
+ TrieType::pointer Lookup = Trie.find(hash(N));
+ Trie.insert(Lookup, TrieType::value_type(hash(N), N));
+ }
+ for (NumType N : Numbers) {
+ TrieType::pointer Lookup = Trie.find(hash(N));
+ EXPECT_TRUE(Lookup);
+ if (!Lookup)
+ continue;
+ EXPECT_EQ(hash(N), Lookup->Hash);
+ EXPECT_EQ(N, Lookup->Data);
+ // Confirm a subsequent insertion fails to overwrite by trying to insert a
+ // bad value.
+ auto Result = Trie.insert(Lookup, TrieType::value_type(hash(N), N - 1));
+ EXPECT_EQ(N, Result->Data);
+ }
+ // Check the trie so we can confirm the structure is correct. The root
+ // should have 2^8=256 slots, most subtries should have 2^5=32 slots, and the
+ // deepest subtrie should have 2^1=2 slots (since (64-8)mod(5)=1).
+ // The root's index=0 should have the content for 0x37 directly, and
+ // index=255 should lead to a chain of nested subtries, finally ending with
+ // content for (max-2) and (max-3).
+ //
+ // Note: This structure is not exhaustive (too expensive to update tests),
+ // but it does test that the dump format is somewhat readable and that the
+ // basic structure is correct.
+ //
+ // Note: This test requires that the trie reads bytes starting from index 0
+ // of the array of uint8_t, and then reads each byte's bits from high to low.
+ // Check the Trie.
+ // 64 bit hash = 8 + 5 * 11 + 1, so 1 root, 11 5-bit subtries and 1 last-level
+ // subtrie, 13 total.
+ ASSERT_EQ(getNumTries(), 13u);
+ // Check the root trie. It has 256 slots, of which two are used.
+ ASSERT_EQ(getNumSlotUsed(getRoot()), 2u);
+ // Check last subtrie.
+ // Last allocated trie is the next node in the allocation chain.
+ auto LastAlloctedSubTrie = getNextTrie(getRoot());
+ ASSERT_EQ(getTriePrefixAsString(LastAlloctedSubTrie),
+ "ffffffffffffff[1111110]");
+ ASSERT_EQ(getStartBit(LastAlloctedSubTrie), 63u);
+ ASSERT_EQ(getNumBits(LastAlloctedSubTrie), 1u);
+ ASSERT_EQ(getNumSlotUsed(LastAlloctedSubTrie), 2u);
+TEST_F(SmallNodeTrieTest, TrieDestructionLoop) {
+ // Test destroying large Trie. Make sure there is no recursion that can
+ // overflow the stack.
+ // Limit the tries to 2 slots (1 bit) to generate subtries at a higher rate.
+ auto &Trie = createTrie(/*NumRootBits=*/1, /*NumSubtrieBits=*/1);
+ // Fill them up. Pick a MaxN high enough to cause a stack overflow in debug
+ // builds.
+ static constexpr uint64_t MaxN = 100000;
+ for (uint64_t N = 0; N != MaxN; ++N) {
+ HashType Hash = hash(N);
+ Trie.insert(TrieType::pointer(), TrieType::value_type(Hash, NumType{N}));
+ }
+ // Destroy tries. If destruction is recursive and MaxN is high enough, these
+ // will both fail.
+ destroyTrie();
+struct NumWithDestructorT {
+ uint64_t Num;
+ ~NumWithDestructorT() {}
+using NodeWithDestructorTrieTest =
+ SimpleTrieHashMapTest<NumWithDestructorT, sizeof(uint64_t)>;
+TEST_F(NodeWithDestructorTrieTest, TrieDestructionLoop) {
+ // Test destroying large Trie. Make sure there is no recursion that can
+ // overflow the stack.
+ // Limit the tries to 2 slots (1 bit) to generate subtries at a higher rate.
+ auto &Trie = createTrie(/*NumRootBits=*/1, /*NumSubtrieBits=*/1);
+ // Fill them up. Pick a MaxN high enough to cause a stack overflow in debug
+ // builds.
+ static constexpr uint64_t MaxN = 100000;
+ for (uint64_t N = 0; N != MaxN; ++N) {
+ HashType Hash = hash(N);
+ Trie.insert(TrieType::pointer(), TrieType::value_type(Hash, NumType{N}));
+ }
+ // Destroy tries. If destruction is recursive and MaxN is high enough, these
+ // will both fail.
+ destroyTrie();
+using NumStrNodeTrieTest = SimpleTrieHashMapTest<std::string, sizeof(uint64_t)>;
+TEST_F(NumStrNodeTrieTest, TrieInsertLazy) {
+ for (unsigned RootBits : {2, 3, 6, 10}) {
+ for (unsigned SubtrieBits : {2, 3, 4}) {
+ auto &Trie = createTrie(RootBits, SubtrieBits);
+ for (int I = 0, E = 1000; I != E; ++I) {
+ TrieType::pointer Lookup;
+ HashType H = hash(I);
+ if (I & 1)
+ Lookup = Trie.find(H);
+ auto insertNum = [&](uint64_t Num) {
+ std::string S = Twine(I).str();
+ auto Hash = hash(Num);
+ return Trie.insertLazy(
+ Hash, [&](TrieType::LazyValueConstructor C) { C(std::move(S)); });
+ };
+ auto S1 = insertNum(I);
+ // The address of the Data should be the same.
+ EXPECT_EQ(&S1->Data, &insertNum(I)->Data);
+ auto insertStr = [&](std::string S) {
+ int Num = std::stoi(S);
+ return insertNum(Num);
+ };
+ std::string S2 = S1->Data;
+ // The address of the Data should be the same.
+ EXPECT_EQ(&S1->Data, &insertStr(S2)->Data);
+ }
+ for (int I = 0, E = 1000; I != E; ++I) {
+ std::string S = Twine(I).str();
+ TrieType::pointer Lookup = Trie.find(hash(I));
+ EXPECT_TRUE(Lookup);
+ if (!Lookup)
+ continue;
+ EXPECT_EQ(S, Lookup->Data);
+ }
+ }
+ }
+} // end anonymous namespace
>From 2b4d1de3a274cbe94e1522e0f31a70d734ed2cdc Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu at apple.com>
Date: Thu, 5 Oct 2023 13:05:05 -0700
Subject: [PATCH 02/11] [CAS] Add LLVMCAS library with InMemoryCAS
Add llvm::cas::ObjectStore abstraction and InMemoryCAS as an in-memory
CAS object store implementation.
The ObjectStore models its objects as:
* Content: An array of bytes for the data to be stored.
* Refs: An array of references to other objects in the ObjectStore.
And each CAS Object can be identified with a unique ID/Hash.
ObjectStore supports following general action:
* Expected<ID> store(Content, ArrayRef<Ref>)
* Expected<Ref> get(ID)
It also introduces following types to interact with a CAS ObjectStore:
* CASID: Hash representation for a CAS object with its context to help
print/compare CASIDs.
* ObjectRef: A light-weight ref for an object in the ObjectStore. It is
implementation defined so it can be optimized for
read/store/references depending on the implementation.
* ObjectHandle: A CAS internal light-weight handle to a loaded object in the
ObjectStore. Underlying data for the object is guaranteed to be
available and no error handling is required to access data. This is
not exposed to the users of CAS from ObjectStore APIs.
* ObjectProxy: A proxy for the users of CAS to interact with the data
inside CAS Object. It bundles an ObjectHandle and an ObjectStore.
Differential Revision: https://reviews.llvm.org/D133716
llvm/docs/ContentAddressableStorage.md | 120 ++++++++
llvm/docs/Reference.rst | 5 +
llvm/include/llvm/CAS/CASID.h | 156 +++++++++++
llvm/include/llvm/CAS/CASReference.h | 207 ++++++++++++++
llvm/include/llvm/CAS/ObjectStore.h | 361 +++++++++++++++++++++++++
llvm/lib/CAS/BuiltinCAS.cpp | 108 ++++++++
llvm/lib/CAS/BuiltinCAS.h | 98 +++++++
llvm/lib/CAS/BuiltinObjectHasher.h | 73 +++++
llvm/lib/CAS/CMakeLists.txt | 8 +
llvm/lib/CAS/InMemoryCAS.cpp | 321 ++++++++++++++++++++++
llvm/lib/CAS/ObjectStore.cpp | 259 ++++++++++++++++++
llvm/lib/CMakeLists.txt | 1 +
llvm/unittests/CAS/CASTestConfig.cpp | 22 ++
llvm/unittests/CAS/CASTestConfig.h | 36 +++
llvm/unittests/CAS/CMakeLists.txt | 12 +
llvm/unittests/CAS/ObjectStoreTest.cpp | 280 +++++++++++++++++++
llvm/unittests/CMakeLists.txt | 1 +
17 files changed, 2068 insertions(+)
create mode 100644 llvm/docs/ContentAddressableStorage.md
create mode 100644 llvm/include/llvm/CAS/CASID.h
create mode 100644 llvm/include/llvm/CAS/CASReference.h
create mode 100644 llvm/include/llvm/CAS/ObjectStore.h
create mode 100644 llvm/lib/CAS/BuiltinCAS.cpp
create mode 100644 llvm/lib/CAS/BuiltinCAS.h
create mode 100644 llvm/lib/CAS/BuiltinObjectHasher.h
create mode 100644 llvm/lib/CAS/CMakeLists.txt
create mode 100644 llvm/lib/CAS/InMemoryCAS.cpp
create mode 100644 llvm/lib/CAS/ObjectStore.cpp
create mode 100644 llvm/unittests/CAS/CASTestConfig.cpp
create mode 100644 llvm/unittests/CAS/CASTestConfig.h
create mode 100644 llvm/unittests/CAS/CMakeLists.txt
create mode 100644 llvm/unittests/CAS/ObjectStoreTest.cpp
diff --git a/llvm/docs/ContentAddressableStorage.md b/llvm/docs/ContentAddressableStorage.md
new file mode 100644
index 00000000000000..4f2d9a6a3a9185
--- /dev/null
+++ b/llvm/docs/ContentAddressableStorage.md
@@ -0,0 +1,120 @@
+# Content Addressable Storage
+## Introduction to CAS
+Content Addressable Storage, or `CAS`, is a storage system that assigns
+unique addresses to the data stored. It is very useful for data deduplication
+and creating unique identifiers.
+Unlike other kinds of storage systems, such as file systems, CAS is immutable.
+It is more reliable to model a computation when representing the inputs and
+outputs of the computation using objects stored in CAS.
+The basic unit of the CAS library is a CASObject, which contains:
+* Data: arbitrary data
+* References: references to other CASObject
+It can be conceptually modeled as something like:
+struct CASObject {
+ ArrayRef<char> Data;
+ ArrayRef<CASObject*> Refs;
+Such abstraction can allow simple composition of CASObjects into a DAG to
+represent complicated data structure while still allowing data deduplication.
+Note you can compare two DAGs by just comparing the CASObject hash of two
+root nodes.
+## LLVM CAS Library User Guide
+The CAS-like storage provided in LLVM is `llvm::cas::ObjectStore`.
+To reference a CASObject, there are few different abstractions provided
+with different trade-offs:
+### ObjectRef
+`ObjectRef` is a lightweight reference to a CASObject stored in the CAS.
+This is the most commonly used abstraction and it is cheap to copy/pass
+along. It has following properties:
+* `ObjectRef` is only meaningful within the `ObjectStore` that created the ref.
+`ObjectRef`s created by different `ObjectStore`s cannot be cross-referenced or
+compared.
+* `ObjectRef` doesn't guarantee the existence of the CASObject it points to. An
+explicit load is required before accessing the data stored in CASObject.
+This load can also fail, for reasons like but not limited to: object does
+not exist, corrupted CAS storage, operation timeout, etc.
+* If two `ObjectRef`s are equal, it is guaranteed that the objects they point to
+(if they exist) are identical. If they are not equal, the underlying objects are
+guaranteed to be not the same.
+### ObjectProxy
+`ObjectProxy` represents a loaded CASObject. With an `ObjectProxy`, the
+underlying stored data and references can be accessed without the need
+of error handling. The class APIs also provide convenient methods to
+access underlying data. The lifetime of the underlying data is equal to
+the lifetime of the instance of `ObjectStore` unless explicitly copied.
+### CASID
+`CASID` is the hash identifier for CASObjects. It owns the underlying
+storage for hash value so it can be expensive to copy and compare depending
+on the hash algorithm. `CASID` is generally only useful in rare situations
+like printing raw hash value or exchanging hash values between different
+CAS instances with the same hashing schema.
+### ObjectStore
+`ObjectStore` is the CAS-like object storage. It provides API to save
+and load CASObjects, for example:
+ObjectRef A, B, C;
+Expected<ObjectRef> Stored = ObjectStore.store("data", {A, B});
+Expected<ObjectProxy> Loaded = ObjectStore.getProxy(C);
+It also provides APIs to convert between `ObjectRef`, `ObjectProxy` and
+`CASID`.
+## CAS Library Implementation Guide
+The LLVM ObjectStore APIs are designed so that it is easy to add
+customized CAS implementations that are interchangeable with the builtin
+CAS implementations.
+To add your own implementation, you just need to add a subclass to
+`llvm::cas::ObjectStore` and implement all its pure virtual methods.
+To be interchangeable with LLVM ObjectStore, the new CAS implementation
+needs to conform to following contracts:
+* Different CASObject stored in the ObjectStore needs to have a different hash
+and result in a different `ObjectRef`. Vice versa, same CASObject should have
+same hash and same `ObjectRef`. Note two different CASObjects with identical
+data but different references are considered different objects.
+* `ObjectRef`s are comparable within the same `ObjectStore` instance, and can
+be used to determine the equality of the underlying CASObjects.
+* The loaded objects from the ObjectStore need to have the lifetime to be at
+least as long as the ObjectStore itself.
+If not specified, the behavior can be implementation defined. For example,
+`ObjectRef` can be used to point to a loaded CASObject so
+`ObjectStore` never fails to load. It is also legal to use a stricter model
+than required. For example, an `ObjectRef` that can be used to compare
+objects between different `ObjectStore` instances is legal but user
+of the ObjectStore should not depend on this behavior.
+For CAS library implementers, there is also an `ObjectHandle` class that
+is an internal representation of a loaded CASObject reference.
+`ObjectProxy` is just a pair of `ObjectHandle` and `ObjectStore`, because
+just like `ObjectRef`, `ObjectHandle` is only useful when paired with
+the ObjectStore that knows about the loaded CASObject.
diff --git a/llvm/docs/Reference.rst b/llvm/docs/Reference.rst
index df61628b06c7db..da1afb360ed6de 100644
--- a/llvm/docs/Reference.rst
+++ b/llvm/docs/Reference.rst
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
@@ -15,6 +16,7 @@ LLVM and API reference documentation.
+ ContentAddressableStorage
@@ -232,3 +234,6 @@ Additional Topics
A description of uniformity analysis in the presence of irreducible
control flow, and its implementation.
+ A reference guide for using LLVM's CAS library.
diff --git a/llvm/include/llvm/CAS/CASID.h b/llvm/include/llvm/CAS/CASID.h
new file mode 100644
index 00000000000000..5f9110a15819ad
--- /dev/null
+++ b/llvm/include/llvm/CAS/CASID.h
@@ -0,0 +1,156 @@
+//===- llvm/CAS/CASID.h -----------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+namespace llvm {
+class raw_ostream;
+namespace cas {
+class CASID;
+/// Context for CAS identifiers.
+class CASContext {
+ virtual void anchor();
+ virtual ~CASContext() = default;
+ /// Get an identifier for the schema used by this CAS context. Two CAS
+ /// instances should return the same identifier if and only if their
+ /// CASIDs are safe to compare by hash. This is used when comparing \a CASID
+ /// values that come from different contexts.
+ virtual StringRef getHashSchemaIdentifier() const = 0;
+ /// Print \p ID to \p OS.
+ virtual void printIDImpl(raw_ostream &OS, const CASID &ID) const = 0;
+ friend class CASID;
+/// Unique identifier for a CAS object.
+/// Locally, stores an internal CAS identifier that's specific to a single CAS
+/// instance. It's guaranteed not to change across the view of that CAS, but
+/// might change between runs.
+/// It also has a \a CASContext pointer to allow comparison of these
+/// identifiers. If two CASIDs are from the same CASContext, they can be
+/// compared directly. If they are not, then \a
+/// CASContext::getHashSchemaIdentifier() is compared to see if they can be
+/// compared by hash, in which case the result of \a getHash() is compared.
+class CASID {
+ void dump() const;
+ void print(raw_ostream &OS) const {
+ return getContext().printIDImpl(OS, *this);
+ }
+ friend raw_ostream &operator<<(raw_ostream &OS, const CASID &ID) {
+ ID.print(OS);
+ return OS;
+ }
+ std::string toString() const;
+ ArrayRef<uint8_t> getHash() const {
+ return arrayRefFromStringRef<uint8_t>(Hash);
+ }
+ friend bool operator==(const CASID &LHS, const CASID &RHS) {
+ if (LHS.Context == RHS.Context)
+ return LHS.Hash == RHS.Hash;
+ // EmptyKey or TombstoneKey.
+ if (!LHS.Context || !RHS.Context)
+ return false;
+ // CASIDs are equal when they have the same hash schema and same hash value.
+ return LHS.Context->getHashSchemaIdentifier() ==
+ RHS.Context->getHashSchemaIdentifier() &&
+ LHS.Hash == RHS.Hash;
+ }
+ friend bool operator!=(const CASID &LHS, const CASID &RHS) {
+ return !(LHS == RHS);
+ }
+ friend hash_code hash_value(const CASID &ID) {
+ ArrayRef<uint8_t> Hash = ID.getHash();
+ return hash_combine_range(Hash.begin(), Hash.end());
+ }
+ const CASContext &getContext() const {
+ assert(Context && "Tombstone or empty key for DenseMap?");
+ return *Context;
+ }
+ static CASID getDenseMapEmptyKey() {
+ return CASID(nullptr, DenseMapInfo<StringRef>::getEmptyKey());
+ }
+ static CASID getDenseMapTombstoneKey() {
+ return CASID(nullptr, DenseMapInfo<StringRef>::getTombstoneKey());
+ }
+ CASID() = delete;
+ static CASID create(const CASContext *Context, StringRef Hash) {
+ return CASID(Context, Hash);
+ }
+ CASID(const CASContext *Context, StringRef Hash)
+ : Context(Context), Hash(Hash) {}
+ const CASContext *Context;
+ SmallString<32> Hash;
+/// This is used to workaround the issue of MSVC needing default-constructible
+/// types for \c std::promise/future.
+template <typename T> struct AsyncValue {
+ Expected<std::optional<T>> take() { return std::move(Value); }
+ AsyncValue() : Value(std::nullopt) {}
+ AsyncValue(Error &&E) : Value(std::move(E)) {}
+ AsyncValue(T &&V) : Value(std::move(V)) {}
+ AsyncValue(std::nullopt_t) : Value(std::nullopt) {}
+ AsyncValue(Expected<std::optional<T>> &&Obj) : Value(std::move(Obj)) {}
+ Expected<std::optional<T>> Value;
+} // namespace cas
+template <> struct DenseMapInfo<cas::CASID> {
+ static cas::CASID getEmptyKey() { return cas::CASID::getDenseMapEmptyKey(); }
+ static cas::CASID getTombstoneKey() {
+ return cas::CASID::getDenseMapTombstoneKey();
+ }
+  /// Hash a \a cas::CASID for use as a DenseMap key.
+  ///
+  /// The key is taken by reference: a CASID stores its hash bytes inline, so
+  /// passing it by value would copy them on every call.
+  static unsigned getHashValue(const cas::CASID &ID) {
+    return static_cast<unsigned>(hash_value(ID));
+  }
+  /// Compare two keys using \a cas::CASID's equality operator.
+  ///
+  /// Both keys are taken by reference to avoid copying their hash storage.
+  static bool isEqual(const cas::CASID &LHS, const cas::CASID &RHS) {
+    return LHS == RHS;
+  }
+} // namespace llvm
+#endif // LLVM_CAS_CASID_H
diff --git a/llvm/include/llvm/CAS/CASReference.h b/llvm/include/llvm/CAS/CASReference.h
new file mode 100644
index 00000000000000..1f435cf306c4ca
--- /dev/null
+++ b/llvm/include/llvm/CAS/CASReference.h
@@ -0,0 +1,207 @@
+//===- llvm/CAS/CASReference.h ----------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/StringRef.h"
+namespace llvm {
+class raw_ostream;
+namespace cas {
+class ObjectStore;
+class ObjectHandle;
+class ObjectRef;
+/// Base class for references to things in \a ObjectStore.
+class ReferenceBase {
+ struct DenseMapEmptyTag {};
+ struct DenseMapTombstoneTag {};
+ static constexpr uint64_t getDenseMapEmptyRef() { return -1ULL; }
+ static constexpr uint64_t getDenseMapTombstoneRef() { return -2ULL; }
+ /// Get an internal reference.
+ uint64_t getInternalRef(const ObjectStore &ExpectedCAS) const {
+ assert(CAS == &ExpectedCAS && "Extracting reference for the wrong CAS");
+ return InternalRef;
+ }
+ unsigned getDenseMapHash() const {
+ return (unsigned)llvm::hash_value(InternalRef);
+ }
+ bool isDenseMapEmpty() const { return InternalRef == getDenseMapEmptyRef(); }
+ bool isDenseMapTombstone() const {
+ return InternalRef == getDenseMapTombstoneRef();
+ }
+ bool isDenseMapSentinel() const {
+ return isDenseMapEmpty() || isDenseMapTombstone();
+ }
+ void print(raw_ostream &OS, const ObjectHandle &This) const;
+ void print(raw_ostream &OS, const ObjectRef &This) const;
+ bool hasSameInternalRef(const ReferenceBase &RHS) const {
+ assert(
+ (isDenseMapSentinel() || RHS.isDenseMapSentinel() || CAS == RHS.CAS) &&
+ "Cannot compare across CAS instances");
+ return InternalRef == RHS.InternalRef;
+ }
+ friend class ObjectStore;
+ ReferenceBase(const ObjectStore *CAS, uint64_t InternalRef, bool IsHandle)
+ : InternalRef(InternalRef) {
+ this->CAS = CAS;
+ assert(InternalRef != getDenseMapEmptyRef() && "Reserved for DenseMapInfo");
+ assert(InternalRef != getDenseMapTombstoneRef() &&
+ "Reserved for DenseMapInfo");
+ }
+ explicit ReferenceBase(DenseMapEmptyTag)
+ : InternalRef(getDenseMapEmptyRef()) {}
+ explicit ReferenceBase(DenseMapTombstoneTag)
+ : InternalRef(getDenseMapTombstoneRef()) {}
+ uint64_t InternalRef;
+ const ObjectStore *CAS = nullptr;
+/// Reference to an object in a \a ObjectStore instance.
+/// If you have an ObjectRef, you know the object exists, and you can point at
+/// it from new nodes with \a ObjectStore::store(), but you don't know anything
+/// about it. "Loading" the object is a separate step that may not have
+/// happened yet, and which can fail (due to filesystem corruption) or
+/// introduce latency (if downloading from a remote store).
+/// \a ObjectStore::store() takes a list of these, and these are returned by \a
+/// ObjectStore::forEachRef() and \a ObjectStore::readRef(), which are accessors
+/// for nodes, and \a ObjectStore::getReference().
+/// \a ObjectStore::load() will load the referenced object, and returns \a
+/// ObjectHandle, a variant that knows what kind of entity it is. \a
+/// ObjectStore::getReferenceKind() can inspect the type of reference without
+/// asking for unloaded objects to be loaded.
+/// This is a wrapper around a \c uint64_t (and a \a ObjectStore instance when
+/// assertions are on). If necessary, it can be deconstructed and reconstructed
+/// using \a ReferenceBase::getInternalRef() and \a
+/// ObjectRef::getFromInternalRef(), but clients aren't expected to need to do
+/// this. These both require the right \a ObjectStore instance.
+class ObjectRef : public ReferenceBase {
+ struct DenseMapTag {};
+ friend bool operator==(const ObjectRef &LHS, const ObjectRef &RHS) {
+ return LHS.hasSameInternalRef(RHS);
+ }
+ friend bool operator!=(const ObjectRef &LHS, const ObjectRef &RHS) {
+ return !(LHS == RHS);
+ }
+ /// Allow a reference to be recreated after it's deconstructed.
+ static ObjectRef getFromInternalRef(const ObjectStore &CAS,
+ uint64_t InternalRef) {
+ return ObjectRef(CAS, InternalRef);
+ }
+ static ObjectRef getDenseMapEmptyKey() {
+ return ObjectRef(DenseMapEmptyTag{});
+ }
+ static ObjectRef getDenseMapTombstoneKey() {
+ return ObjectRef(DenseMapTombstoneTag{});
+ }
+ /// Print internal ref and/or CASID. Only suitable for debugging.
+ void print(raw_ostream &OS) const { return ReferenceBase::print(OS, *this); }
+ LLVM_DUMP_METHOD void dump() const;
+ friend class ObjectStore;
+ friend class ReferenceBase;
+ using ReferenceBase::ReferenceBase;
+  /// Construct a reference to \p InternalRef inside \p CAS.
+  ///
+  /// The values reserved for DenseMapInfo are already rejected by the
+  /// \a ReferenceBase constructor, so they are not checked again here.
+  ObjectRef(const ObjectStore &CAS, uint64_t InternalRef)
+      : ReferenceBase(&CAS, InternalRef, /*IsHandle=*/false) {}
+ explicit ObjectRef(DenseMapEmptyTag T) : ReferenceBase(T) {}
+ explicit ObjectRef(DenseMapTombstoneTag T) : ReferenceBase(T) {}
+ explicit ObjectRef(ReferenceBase) = delete;
+/// Handle to a loaded object in a \a ObjectStore instance.
+/// ObjectHandle encapsulates a *loaded* object in the CAS. You need one
+/// of these to inspect the content of an object: to look at its stored
+/// data and references.
+class ObjectHandle : public ReferenceBase {
+ friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS) {
+ return LHS.hasSameInternalRef(RHS);
+ }
+ friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS) {
+ return !(LHS == RHS);
+ }
+ /// Print internal ref and/or CASID. Only suitable for debugging.
+ void print(raw_ostream &OS) const { return ReferenceBase::print(OS, *this); }
+ LLVM_DUMP_METHOD void dump() const;
+ friend class ObjectStore;
+ friend class ReferenceBase;
+ using ReferenceBase::ReferenceBase;
+ explicit ObjectHandle(ReferenceBase) = delete;
+ ObjectHandle(const ObjectStore &CAS, uint64_t InternalRef)
+ : ReferenceBase(&CAS, InternalRef, /*IsHandle=*/true) {}
+} // namespace cas
+template <> struct DenseMapInfo<cas::ObjectRef> {
+ static cas::ObjectRef getEmptyKey() {
+ return cas::ObjectRef::getDenseMapEmptyKey();
+ }
+ static cas::ObjectRef getTombstoneKey() {
+ return cas::ObjectRef::getDenseMapTombstoneKey();
+ }
+ static unsigned getHashValue(cas::ObjectRef Ref) {
+ return Ref.getDenseMapHash();
+ }
+ static bool isEqual(cas::ObjectRef LHS, cas::ObjectRef RHS) {
+ return LHS == RHS;
+ }
+} // namespace llvm
diff --git a/llvm/include/llvm/CAS/ObjectStore.h b/llvm/include/llvm/CAS/ObjectStore.h
new file mode 100644
index 00000000000000..16c133198f13df
--- /dev/null
+++ b/llvm/include/llvm/CAS/ObjectStore.h
@@ -0,0 +1,361 @@
+//===- llvm/CAS/ObjectStore.h -----------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/CASID.h"
+#include "llvm/CAS/CASReference.h"
+#include "llvm/CAS/TreeEntry.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include <cstddef>
+#include <future>
+namespace llvm {
+class MemoryBuffer;
+template <typename T> class unique_function;
+namespace cas {
+class ObjectStore;
+class ObjectProxy;
+using AsyncProxyValue = AsyncValue<ObjectProxy>;
+/// Content-addressable storage for objects.
+/// Conceptually, objects are stored in a "unique set".
+/// - Objects are immutable ("value objects") that are defined by their
+/// content. They are implicitly deduplicated by content.
+/// - Each object has a unique identifier (UID) that's derived from its content,
+/// called a \a CASID.
+/// - This UID is a fixed-size (strong) hash of the transitive content of a
+/// CAS object.
+/// - It's comparable between any two CAS instances that have the same \a
+/// CASIDContext::getHashSchemaIdentifier().
+/// - The UID can be printed (e.g., \a CASID::toString()) and it can be parsed
+/// by the same or a different CAS instance with \a
+/// ObjectStore::parseID().
+/// - An object can be looked up by content or by UID.
+/// - \a store() is a "get-or-create" method, writing an object if it
+/// doesn't exist yet, and returning a ref to it in any case.
+/// - \a loadObject(const CASID&) looks up an object by its UID.
+/// - Objects can reference other objects, forming an arbitrary DAG.
+/// The \a ObjectStore interface has a few ways of referencing objects:
+/// - \a ObjectRef encapsulates a reference to something in the CAS. It is an
+/// opaque type that references an object inside a specific CAS. It is
+/// implementation defined if the underlying object exists or not for an
+/// ObjectRef, and it can be used to speed up CAS lookup as an implementation
+/// detail. However, you don't know anything about the underlying objects.
+/// "Loading" the object is a separate step that may not have happened
+/// yet, and which can fail (e.g. due to filesystem corruption) or introduce
+/// latency (if downloading from a remote store).
+/// - \a ObjectHandle encapsulates a *loaded* object in the CAS. You need one of
+/// these to inspect the content of an object: to look at its stored
+/// data and references. This is internal to the CAS implementation and not
+/// available from the CAS public APIs.
+/// - \a CASID: the UID for an object in the CAS, obtained through \a
+/// ObjectStore::getID() or \a ObjectStore::parseID(). This is a valid CAS
+/// identifier, but may reference an object that is unknown to this CAS
+/// instance.
+/// - \a ObjectProxy pairs an ObjectHandle (subclass) with a ObjectStore, and
+/// wraps access APIs to avoid having to pass extra parameters. It is the
+/// object used for accessing underlying data and refs by CAS users.
+/// There are a few options for accessing content of objects, with different
+/// lifetime tradeoffs:
+/// - \a getData() accesses data without exposing lifetime at all.
+/// - \a getMemoryBuffer() returns a \a MemoryBuffer whose lifetime
+/// is independent of the CAS (it can live longer).
+/// - \a getDataString() returns a StringRef whose lifetime is guaranteed to
+/// last as long as the \a ObjectStore.
+/// - \a readRef() and \a forEachRef() iterate through the references in an
+/// object. There is no lifetime assumption.
+/// Both ObjectRef and ObjectHandle are lightweight, wrapping a `uint64_t`.
+/// Doing anything with them requires a ObjectStore. As a convenience:
+class ObjectStore {
+ friend class ObjectProxy;
+ void anchor();
+ /// Get a \p CASID from a \p ID, which should have been generated by \a
+ /// CASID::print(). This succeeds as long as \a validateID() would pass. The
+ /// object may be unknown to this CAS instance.
+ ///
+ /// TODO: Remove, and update callers to use \a validateID() or \a
+ /// extractHashFromID().
+ virtual Expected<CASID> parseID(StringRef ID) = 0;
+ /// Store object into ObjectStore.
+ virtual Expected<ObjectRef> store(ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) = 0;
+ /// Get an ID for \p Ref.
+ virtual CASID getID(ObjectRef Ref) const = 0;
+ /// Get an existing reference to the object called \p ID.
+ ///
+ /// Returns \c std::nullopt if the object is not stored in this CAS.
+ virtual std::optional<ObjectRef> getReference(const CASID &ID) const = 0;
+ /// \returns true if the object is directly available from the local CAS, for
+ /// implementations that have this kind of distinction.
+ virtual Expected<bool> isMaterialized(ObjectRef Ref) const = 0;
+ /// Validate the underlying object referred by CASID.
+ virtual Error validate(const CASID &ID) = 0;
+ /// Load the object referenced by \p Ref.
+ ///
+ /// Errors if the object cannot be loaded.
+ /// \returns \c std::nullopt if the object is missing from the CAS.
+ virtual Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) = 0;
+ /// Asynchronous version of \c loadIfExists.
+ virtual void loadIfExistsAsync(
+ ObjectRef Ref,
+ unique_function<void(Expected<std::optional<ObjectHandle>>)> Callback);
+ /// Like \c loadIfExists but returns an error if the object is missing.
+ Expected<ObjectHandle> load(ObjectRef Ref);
+ /// Get the size of some data.
+ virtual uint64_t getDataSize(ObjectHandle Node) const = 0;
+ /// Methods for handling objects.
+ virtual Error forEachRef(ObjectHandle Node,
+ function_ref<Error(ObjectRef)> Callback) const = 0;
+ virtual ObjectRef readRef(ObjectHandle Node, size_t I) const = 0;
+ virtual size_t getNumRefs(ObjectHandle Node) const = 0;
+ virtual ArrayRef<char> getData(ObjectHandle Node,
+ bool RequiresNullTerminator = false) const = 0;
+ /// Get ObjectRef from open file.
+ virtual Expected<ObjectRef>
+ storeFromOpenFileImpl(sys::fs::file_t FD,
+ std::optional<sys::fs::file_status> Status);
+ /// Get a lifetime-extended StringRef viewing the data stored in \p Node.
+ ///
+ /// Depending on the CAS implementation, this may involve in-memory storage
+ /// overhead.
+ StringRef getDataString(ObjectHandle Node) {
+   ArrayRef<char> Bytes = getData(Node);
+   return toStringRef(Bytes);
+ }
+ /// Get a lifetime-extended MemoryBuffer pointing at \p Data.
+ ///
+ /// Depending on the CAS implementation, this may involve in-memory storage
+ /// overhead.
+ std::unique_ptr<MemoryBuffer>
+ getMemoryBuffer(ObjectHandle Node, StringRef Name = "",
+ bool RequiresNullTerminator = true);
+ /// Read all the refs from object in a SmallVector.
+ virtual void readRefs(ObjectHandle Node,
+ SmallVectorImpl<ObjectRef> &Refs) const;
+ /// Allow ObjectStore implementations to create internal handles.
+ HandleKind make##HandleKind(uint64_t InternalRef) const { \
+ return HandleKind(*this, InternalRef); \
+ }
+ /// Helper functions to store object and returns a ObjectProxy.
+ Expected<ObjectProxy> createProxy(ArrayRef<ObjectRef> Refs, StringRef Data);
+ /// Store object from StringRef.
+ Expected<ObjectRef> storeFromString(ArrayRef<ObjectRef> Refs,
+ StringRef String) {
+ return store(Refs, arrayRefFromStringRef<char>(String));
+ }
+ /// Store the content of the open file \p FD as an object. This entry point
+ /// simply forwards to \a storeFromOpenFileImpl(). Does not take ownership of
+ /// \p FD; the caller is responsible for closing it.
+ ///
+ /// NOTE(review): the previous wording referred to \a storeNode(), which is
+ /// not declared in this interface -- confirm the intended name.
+ ///
+ /// If \p Status is passed in, it is to be treated as a hint. Implementations
+ /// must protect against the file size potentially growing after the status
+ /// was taken (i.e., they cannot assume that an mmap will be null-terminated
+ /// where \p Status implies).
+ ///
+ /// \returns an \a ObjectRef for the stored object, or an error.
+ Expected<ObjectRef>
+ storeFromOpenFile(sys::fs::file_t FD,
+ std::optional<sys::fs::file_status> Status = std::nullopt) {
+ return storeFromOpenFileImpl(FD, Status);
+ }
+ static Error createUnknownObjectError(const CASID &ID);
+ /// Create ObjectProxy from CASID. If the object doesn't exist, get an error.
+ Expected<ObjectProxy> getProxy(const CASID &ID);
+ /// Create ObjectProxy from ObjectRef. If the object can't be loaded, get an
+ /// error.
+ Expected<ObjectProxy> getProxy(ObjectRef Ref);
+ /// \returns \c std::nullopt if the object is missing from the CAS.
+ Expected<std::optional<ObjectProxy>> getProxyIfExists(ObjectRef Ref);
+ /// Asynchronous version of \c getProxyIfExists.
+ std::future<AsyncProxyValue> getProxyFuture(ObjectRef Ref);
+ /// Asynchronous version of \c getProxyIfExists using a callback.
+ void getProxyAsync(
+ ObjectRef Ref,
+ unique_function<void(Expected<std::optional<ObjectProxy>>)> Callback);
+ /// Write the data of \p Node to \p OS, starting at byte \p Offset and
+ /// emitting at most \p MaxBytes bytes (the default means "everything from
+ /// \p Offset to the end").
+ ///
+ /// \p Offset may be equal to the data size, in which case nothing is
+ /// written; in particular reading an empty object from offset 0 is valid.
+ /// An offset past the end is a programming error.
+ ///
+ /// \returns the number of bytes written to \p OS.
+ uint64_t readData(ObjectHandle Node, raw_ostream &OS, uint64_t Offset = 0,
+                   uint64_t MaxBytes = -1ULL) const {
+   ArrayRef<char> Data = getData(Node);
+   // `<=` rather than `<`: a read positioned exactly at the end (which
+   // includes every read of an empty object) is a legitimate zero-byte read.
+   assert(Offset <= Data.size() && "Expected valid offset");
+   Data = Data.drop_front(Offset).take_front(MaxBytes);
+   OS << toStringRef(Data);
+   return Data.size();
+ }
+ /// Validate the whole node tree.
+ Error validateTree(ObjectRef Ref);
+ /// Print the ObjectStore internals for debugging purpose.
+ virtual void print(raw_ostream &) const {}
+ void dump() const;
+ /// Get CASContext
+ const CASContext &getContext() const { return Context; }
+ virtual ~ObjectStore() = default;
+ ObjectStore(const CASContext &Context) : Context(Context) {}
+ const CASContext &Context;
+/// Reference to an abstract hierarchical node, with data and references.
+/// Reference is passed by value and is expected to be valid as long as the \a
+/// ObjectStore is.
+class ObjectProxy {
+ const ObjectStore &getCAS() const { return *CAS; }
+ ObjectStore &getCAS() { return *CAS; }
+ CASID getID() const { return CAS->getID(Ref); }
+ ObjectRef getRef() const { return Ref; }
+ size_t getNumReferences() const { return CAS->getNumRefs(H); }
+ ObjectRef getReference(size_t I) const { return CAS->readRef(H, I); }
+ operator CASID() const { return getID(); }
+ /// Get the CASID of the \p I-th reference of this object.
+ CASID getReferenceID(size_t I) const {
+   // ObjectStore::getID() returns a CASID by value, so there is no "missing
+   // ID" state to guard against; wrapping it in an optional and asserting on
+   // it could never fire.
+   return getCAS().getID(getReference(I));
+ }
+ /// Visit each reference in order, returning an error from \p Callback to
+ /// stop early.
+ Error forEachReference(function_ref<Error(ObjectRef)> Callback) const {
+ return CAS->forEachRef(H, Callback);
+ }
+ std::unique_ptr<MemoryBuffer>
+ getMemoryBuffer(StringRef Name = "",
+ bool RequiresNullTerminator = true) const;
+ /// Get the content of the node. Valid as long as the CAS is valid.
+ StringRef getData() const { return CAS->getDataString(H); }
+ friend bool operator==(const ObjectProxy &Proxy, ObjectRef Ref) {
+ return Proxy.getRef() == Ref;
+ }
+ friend bool operator==(ObjectRef Ref, const ObjectProxy &Proxy) {
+ return Proxy.getRef() == Ref;
+ }
+ /// Inequality between a proxy and a bare ObjectRef, in either operand
+ /// order: true when the proxy's ref differs from \p Ref.
+ friend bool operator!=(const ObjectProxy &Proxy, ObjectRef Ref) {
+   const bool SameRef = Proxy.getRef() == Ref;
+   return !SameRef;
+ }
+ friend bool operator!=(ObjectRef Ref, const ObjectProxy &Proxy) {
+   const bool SameRef = Proxy.getRef() == Ref;
+   return !SameRef;
+ }
+ ObjectProxy() = delete;
+ static ObjectProxy load(ObjectStore &CAS, ObjectRef Ref, ObjectHandle Node) {
+ return ObjectProxy(CAS, Ref, Node);
+ }
+ ObjectProxy(ObjectStore &CAS, ObjectRef Ref, ObjectHandle H)
+ : CAS(&CAS), Ref(Ref), H(H) {}
+ ObjectStore *CAS;
+ ObjectRef Ref;
+ ObjectHandle H;
+std::unique_ptr<ObjectStore> createInMemoryCAS();
+/// \returns true if \c LLVM_ENABLE_ONDISK_CAS configuration was enabled.
+bool isOnDiskCASEnabled();
+/// Gets or creates a persistent on-disk path at \p Path.
+/// Deprecated: if \p Path resolves to \a getDefaultOnDiskCASStableID(),
+/// automatically opens \a getDefaultOnDiskCASPath() instead.
+/// FIXME: Remove the special behaviour for getDefaultOnDiskCASStableID(). The
+/// client should handle this logic, if/when desired.
+Expected<std::unique_ptr<ObjectStore>> createOnDiskCAS(const Twine &Path);
+/// Set \p Path to a reasonable default on-disk path for a persistent CAS for
+/// the current user.
+void getDefaultOnDiskCASPath(SmallVectorImpl<char> &Path);
+/// Get a reasonable default on-disk path for a persistent CAS for the current
+/// user.
+std::string getDefaultOnDiskCASPath();
+/// Create ObjectStore from a string identifier.
+/// Currently the string identifier is using URL scheme with following supported
+/// schemes:
+/// * InMemory CAS: mem://
+/// * OnDisk CAS: file://${PATH_TO_ONDISK_CAS}
+/// * PlugIn CAS: plugin://${PATH_TO_PLUGIN}?${OPT1}=${VAL1}&${OPT2}=${VAL2}..
+/// If no URL scheme is used, it defaults to following (but might change in
+/// future)
+/// * empty string: Error!
+/// * "auto": default OnDiskCAS location
+/// * Other: path to OnDiskCAS.
+/// For the plugin scheme, use argument "ondisk-path=${PATH}" to choose the
+/// on-disk directory that the plugin should use, otherwise the default
+/// OnDiskCAS location will be used.
+/// FIXME: Need to implement proper URL encoding scheme that allows "%".
+Expected<std::shared_ptr<ObjectStore>> createCASFromIdentifier(StringRef Path);
+/// Register a URL scheme to CAS Identifier.
+using ObjectStoreCreateFuncTy =
+ Expected<std::shared_ptr<ObjectStore>>(const Twine &);
+void registerCASURLScheme(StringRef Prefix, ObjectStoreCreateFuncTy *Func);
+} // namespace cas
+} // namespace llvm
diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp
new file mode 100644
index 00000000000000..41e273cee1ba77
--- /dev/null
+++ b/llvm/lib/CAS/BuiltinCAS.cpp
@@ -0,0 +1,108 @@
+//===- BuiltinCAS.cpp -------------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "BuiltinCAS.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CAS/BuiltinObjectHasher.h"
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Process.h"
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::builtin;
+static StringRef getCASIDPrefix() { return "llvmcas://"; }
+void BuiltinCASContext::anchor() {}
+/// Parse a printed CAS ID of the form "llvmcas://<hex digest>" into the raw
+/// hash bytes.
+///
+/// \returns the decoded digest, or an invalid_argument error when the prefix
+/// is missing, the hex string does not have exactly two characters per hash
+/// byte, or it contains non-hex characters.
+Expected<HashType> BuiltinCASContext::parseID(StringRef Reference) {
+ // Strip the scheme prefix; from here on \p Reference is only the hex part,
+ // which is also what the later error messages print.
+ if (!Reference.consume_front(getCASIDPrefix()))
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "invalid cas-id '" + Reference + "'");
+ // FIXME: Allow shortened references?
+ // Each hash byte is written as two hex characters.
+ if (Reference.size() != 2 * sizeof(HashType))
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "wrong size for cas-id hash '" + Reference + "'");
+ std::string Binary;
+ if (!tryGetFromHex(Reference, Binary))
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "invalid hash in cas-id '" + Reference + "'");
+ assert(Binary.size() == sizeof(HashType));
+ // Copy the decoded bytes into the fixed-size digest that is returned.
+ HashType Digest;
+ llvm::copy(Binary, Digest.data());
+ return Digest;
+Expected<CASID> BuiltinCAS::parseID(StringRef Reference) {
+ Expected<HashType> Digest = BuiltinCASContext::parseID(Reference);
+ if (!Digest)
+ return Digest.takeError();
+ return CASID::create(&getContext(), toStringRef(*Digest));
+void BuiltinCASContext::printID(ArrayRef<uint8_t> Digest, raw_ostream &OS) {
+ SmallString<64> Hash;
+ toHex(Digest, /*LowerCase=*/true, Hash);
+ OS << getCASIDPrefix() << Hash;
+void BuiltinCASContext::printIDImpl(raw_ostream &OS, const CASID &ID) const {
+ BuiltinCASContext::printID(ID.getHash(), OS);
+const BuiltinCASContext &BuiltinCASContext::getDefaultContext() {
+ static BuiltinCASContext DefaultContext;
+ return DefaultContext;
+Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) {
+ return storeImpl(BuiltinObjectHasher<HasherT>::hashObject(*this, Refs, Data),
+ Refs, Data);
+/// Check that the object named by \p ID is intact by re-hashing its
+/// references and data and comparing the result against the hash in \p ID.
+///
+/// \returns an "unknown object" error if this CAS has no reference for
+/// \p ID, the load error if the object cannot be loaded, a "corrupt object"
+/// error if the recomputed hash differs, and success otherwise.
+Error BuiltinCAS::validate(const CASID &ID) {
+ auto Ref = getReference(ID);
+ if (!Ref)
+ return createUnknownObjectError(ID);
+ auto Handle = load(*Ref);
+ if (!Handle)
+ return Handle.takeError();
+ auto Proxy = ObjectProxy::load(*this, *Ref, *Handle);
+ // Gather the references in order; they are an input of the object hash.
+ SmallVector<ObjectRef> Refs;
+ if (auto E = Proxy.forEachReference([&](ObjectRef Ref) -> Error {
+ Refs.push_back(Ref);
+ return Error::success();
+ }))
+ return E;
+ ArrayRef<char> Data(Proxy.getData().data(), Proxy.getData().size());
+ // Recompute the hash the same way store() does and compare with the ID.
+ auto Hash = BuiltinObjectHasher<HasherT>::hashObject(*this, Refs, Data);
+ if (!ID.getHash().equals(Hash))
+ return createCorruptObjectError(ID);
+ return Error::success();
+cas::builtin::createBuiltinUnifiedOnDiskCache(StringRef Path) {
+ return ondisk::UnifiedOnDiskCache::open(Path, /*SizeLimit=*/std::nullopt,
+ BuiltinCASContext::getHashName(),
+ sizeof(HashType));
+ return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled");
diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h
new file mode 100644
index 00000000000000..75f3c92105bbea
--- /dev/null
+++ b/llvm/lib/CAS/BuiltinCAS.h
@@ -0,0 +1,98 @@
+//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/BuiltinCASContext.h"
+#include "llvm/CAS/ObjectStore.h"
+#include <cstddef>
+namespace llvm {
+namespace cas {
+class ActionCache;
+namespace ondisk {
+class UnifiedOnDiskCache;
+namespace builtin {
+class BuiltinCAS : public ObjectStore {
+ BuiltinCAS() : ObjectStore(BuiltinCASContext::getDefaultContext()) {}
+ Expected<CASID> parseID(StringRef Reference) final;
+ Expected<ObjectRef> store(ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) final;
+ virtual Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash,
+ ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) = 0;
+ virtual Expected<ObjectRef>
+ storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash,
+ sys::fs::mapped_file_region Map) {
+ return storeImpl(ComputedHash, std::nullopt,
+ ArrayRef(Map.data(), Map.size()));
+ }
+ /// Both builtin CAS implementations provide lifetime for free, so this can
+ /// be const, and readData() and getDataSize() can be implemented on top of
+ /// it.
+ virtual ArrayRef<char> getDataConst(ObjectHandle Node) const = 0;
+ ArrayRef<char> getData(ObjectHandle Node,
+ bool RequiresNullTerminator) const final {
+ // BuiltinCAS Objects are always null terminated.
+ return getDataConst(Node);
+ }
+ uint64_t getDataSize(ObjectHandle Node) const final {
+ return getDataConst(Node).size();
+ }
+ Error createUnknownObjectError(const CASID &ID) const {
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "unknown object '" + ID.toString() + "'");
+ }
+ Error createCorruptObjectError(const CASID &ID) const {
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "corrupt object '" + ID.toString() + "'");
+ }
+ Error createCorruptStorageError() const {
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "corrupt storage");
+ }
+ Error validate(const CASID &ID) final;
+/// Create a \p UnifiedOnDiskCache instance that uses \p BLAKE3 hashing.
+createBuiltinUnifiedOnDiskCache(StringRef Path);
+/// \param UniDB A \p UnifiedOnDiskCache instance from \p
+/// createBuiltinUnifiedOnDiskCache.
+std::unique_ptr<ObjectStore> createObjectStoreFromUnifiedOnDiskCache(
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB);
+/// \param UniDB A \p UnifiedOnDiskCache instance from \p
+/// createBuiltinUnifiedOnDiskCache.
+std::unique_ptr<ActionCache> createActionCacheFromUnifiedOnDiskCache(
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB);
+// FIXME: Proxy not portable. Maybe also error-prone?
+constexpr StringLiteral DefaultDirProxy = "/^llvm::cas::builtin::default";
+constexpr StringLiteral DefaultDir = "llvm.cas.builtin.default";
+} // end namespace builtin
+} // end namespace cas
+} // end namespace llvm
diff --git a/llvm/lib/CAS/BuiltinObjectHasher.h b/llvm/lib/CAS/BuiltinObjectHasher.h
new file mode 100644
index 00000000000000..e9d7f7d887515f
--- /dev/null
+++ b/llvm/lib/CAS/BuiltinObjectHasher.h
@@ -0,0 +1,73 @@
+//===- BuiltinObjectHasher.h ------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Support/Endian.h"
+namespace llvm {
+namespace cas {
+template <class HasherT> class BuiltinObjectHasher {
+ using HashT = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>()));
+ /// Hash an object made of \p Refs and \p Data. A fresh hasher is fed, in
+ /// this order: the number of refs, the ID of every ref (obtained through
+ /// \p CAS), and finally the size-prefixed data bytes.
+ static HashT hashObject(const ObjectStore &CAS, ArrayRef<ObjectRef> Refs,
+                         ArrayRef<char> Data) {
+   BuiltinObjectHasher Builder;
+   Builder.updateSize(Refs.size());
+   for (size_t Idx = 0, NumRefs = Refs.size(); Idx != NumRefs; ++Idx)
+     Builder.updateRef(CAS, Refs[Idx]);
+   Builder.updateArray(Data);
+   return Builder.finish();
+ }
+ HashT finish() { return Hasher.final(); }
+ void updateRef(const ObjectStore &CAS, ObjectRef Ref) {
+ updateID(CAS.getID(Ref));
+ }
+ void updateID(const CASID &ID) {
+ // NOTE: Does not hash the size of the hash. That's a CAS implementation
+ // detail that shouldn't leak into the UUID for an object.
+ ArrayRef<uint8_t> Hash = ID.getHash();
+ assert(Hash.size() == sizeof(HashT) &&
+ "Expected object ref to match the hash size");
+ Hasher.update(Hash);
+ }
+ void updateArray(ArrayRef<uint8_t> Bytes) {
+ updateSize(Bytes.size());
+ Hasher.update(Bytes);
+ }
+ void updateArray(ArrayRef<char> Bytes) {
+ updateArray(ArrayRef(reinterpret_cast<const uint8_t *>(Bytes.data()),
+ Bytes.size()));
+ }
+ /// Feed \p Size to the hasher as sizeof(uint64_t) raw bytes, after
+ /// converting it to little-endian byte order.
+ // NOTE(review): presumably the conversion is there so that the resulting
+ // hash does not depend on host endianness -- confirm.
+ void updateSize(uint64_t Size) {
+ Size = support::endian::byte_swap(Size, support::endianness::little);
+ Hasher.update(
+ ArrayRef(reinterpret_cast<const uint8_t *>(&Size), sizeof(Size)));
+ }
+ BuiltinObjectHasher() = default;
+ ~BuiltinObjectHasher() = default;
+ HasherT Hasher;
+} // namespace cas
+} // namespace llvm
diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
new file mode 100644
index 00000000000000..a486ab66ae4266
--- /dev/null
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -0,0 +1,8 @@
+ BuiltinCAS.cpp
+ InMemoryCAS.cpp
+ ObjectStore.cpp
diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp
new file mode 100644
index 00000000000000..378e822d284207
--- /dev/null
+++ b/llvm/lib/CAS/InMemoryCAS.cpp
@@ -0,0 +1,321 @@
+//===- InMemoryCAS.cpp ------------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "BuiltinCAS.h"
+#include "llvm/ADT/LazyAtomicPointer.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/ADT/TrieRawHashMap.h"
+#include "llvm/CAS/BuiltinObjectHasher.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/ThreadSafeAllocator.h"
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::builtin;
+namespace {
+class InMemoryObject;
+/// Index of referenced IDs (map: Hash -> InMemoryObject*). Uses
+/// LazyAtomicPointer to coordinate creation of objects.
+using InMemoryIndexT =
+ ThreadSafeTrieRawHashMap<LazyAtomicPointer<const InMemoryObject>,
+ sizeof(HashType)>;
+/// Values in \a InMemoryIndexT. \a InMemoryObject's point at this to access
+/// their hash.
+using InMemoryIndexValueT = InMemoryIndexT::value_type;
+class InMemoryObject {
+ enum class Kind {
+ /// Node with refs and data.
+ RefNode,
+ /// Node with refs and data co-allocated.
+ InlineNode,
+ Max = InlineNode,
+ };
+ Kind getKind() const { return IndexAndKind.getInt(); }
+ const InMemoryIndexValueT &getIndex() const {
+ assert(IndexAndKind.getPointer());
+ return *IndexAndKind.getPointer();
+ }
+ ArrayRef<uint8_t> getHash() const { return getIndex().Hash; }
+ InMemoryObject() = delete;
+ InMemoryObject(InMemoryObject &&) = delete;
+ InMemoryObject(const InMemoryObject &) = delete;
+ InMemoryObject(Kind K, const InMemoryIndexValueT &I) : IndexAndKind(&I, K) {}
+ enum Counts : int {
+ NumKindBits = 2,
+ };
+ PointerIntPair<const InMemoryIndexValueT *, NumKindBits, Kind> IndexAndKind;
+ static_assert((1U << NumKindBits) <= alignof(InMemoryIndexValueT),
+ "Kind will clobber pointer");
+ static_assert(((int)Kind::Max >> NumKindBits) == 0, "Kind will be truncated");
+ inline ArrayRef<char> getData() const;
+ inline ArrayRef<const InMemoryObject *> getRefs() const;
+class InMemoryRefObject : public InMemoryObject {
+ static constexpr Kind KindValue = Kind::RefNode;
+ static bool classof(const InMemoryObject *O) {
+ return O->getKind() == KindValue;
+ }
+ ArrayRef<const InMemoryObject *> getRefsImpl() const { return Refs; }
+ ArrayRef<const InMemoryObject *> getRefs() const { return Refs; }
+ ArrayRef<char> getDataImpl() const { return Data; }
+ ArrayRef<char> getData() const { return Data; }
+ static InMemoryRefObject &create(function_ref<void *(size_t Size)> Allocate,
+ const InMemoryIndexValueT &I,
+ ArrayRef<const InMemoryObject *> Refs,
+ ArrayRef<char> Data) {
+ void *Mem = Allocate(sizeof(InMemoryRefObject));
+ return *new (Mem) InMemoryRefObject(I, Refs, Data);
+ }
+ /// Construct a node that points at externally stored \p Refs and \p Data.
+ /// Both are kept as ArrayRefs (nothing is copied), so the storage behind
+ /// them has to remain valid for as long as this node is used.
+ ///
+ /// Asserts that this object and \p Data are 8-byte aligned and that the
+ /// byte just past the end of \p Data is a null terminator.
+ InMemoryRefObject(const InMemoryIndexValueT &I,
+ ArrayRef<const InMemoryObject *> Refs, ArrayRef<char> Data)
+ : InMemoryObject(KindValue, I), Refs(Refs), Data(Data) {
+ assert(isAddrAligned(Align(8), this) && "Expected 8-byte alignment");
+ assert(isAddrAligned(Align(8), Data.data()) && "Expected 8-byte alignment");
+ // Deliberately reads one element past the ArrayRef: the terminator lives
+ // just outside the range that Data reports.
+ // NOTE(review): this dereferences Data.end() even when Data is empty;
+ // confirm every caller passes a buffer with a readable terminator.
+ assert(*Data.end() == 0 && "Expected null-termination");
+ }
+ ArrayRef<const InMemoryObject *> Refs;
+ ArrayRef<char> Data;
+class InMemoryInlineObject : public InMemoryObject {
+ static constexpr Kind KindValue = Kind::InlineNode;
+ static bool classof(const InMemoryObject *O) {
+ return O->getKind() == KindValue;
+ }
+ /// The references of this node (same as getRefsImpl()).
+ ArrayRef<const InMemoryObject *> getRefs() const { return getRefsImpl(); }
+ /// The references are not a separate allocation: NumRefs pointers are read
+ /// from the memory that immediately follows this object (this + 1).
+ ArrayRef<const InMemoryObject *> getRefsImpl() const {
+ return ArrayRef(reinterpret_cast<const InMemoryObject *const *>(this + 1),
+ NumRefs);
+ }
+ /// The data bytes of this node (same as getDataImpl()).
+ ArrayRef<char> getData() const { return getDataImpl(); }
+ /// The data is read from the memory directly after the last reference
+ /// pointer; DataSize bytes are reported.
+ ArrayRef<char> getDataImpl() const {
+ ArrayRef<const InMemoryObject *> Refs = getRefs();
+ return ArrayRef(reinterpret_cast<const char *>(Refs.data() + Refs.size()),
+ DataSize);
+ }
+ /// Allocate (through \p Allocate) and construct an inline node for index
+ /// entry \p I that carries its own copy of \p Refs and \p Data.
+ ///
+ /// The single allocation holds the object header, then one pointer per
+ /// reference, then the data bytes plus one byte for the null terminator
+ /// written by the constructor.
+ static InMemoryInlineObject &
+ create(function_ref<void *(size_t Size)> Allocate,
+        const InMemoryIndexValueT &I, ArrayRef<const InMemoryObject *> Refs,
+        ArrayRef<char> Data) {
+   // Size the reference area with the element type that is actually stored
+   // there (const InMemoryObject *) rather than with uintptr_t.
+   void *Mem = Allocate(sizeof(InMemoryInlineObject) +
+                        sizeof(const InMemoryObject *) * Refs.size() +
+                        Data.size() + 1);
+   return *new (Mem) InMemoryInlineObject(I, Refs, Data);
+ }
+ /// Construct an inline node in memory that has room, directly after this
+ /// object, for Refs.size() pointers followed by Data.size() + 1 bytes.
+ /// Copies \p Refs, then \p Data, then appends a null terminator.
+ ///
+ /// The counts are stored in 32-bit fields; inputs that do not fit are a
+ /// programming error and are rejected by assertion instead of being
+ /// silently truncated.
+ InMemoryInlineObject(const InMemoryIndexValueT &I,
+                      ArrayRef<const InMemoryObject *> Refs,
+                      ArrayRef<char> Data)
+     : InMemoryObject(KindValue, I), NumRefs(Refs.size()),
+       DataSize(Data.size()) {
+   // A mismatch means the size_t value was truncated by the uint32_t field,
+   // which would make getRefs()/getData() report the wrong extents.
+   assert(NumRefs == Refs.size() && "Too many refs for an inline node");
+   assert(DataSize == Data.size() && "Data too large for an inline node");
+   auto *BeginRefs = reinterpret_cast<const InMemoryObject **>(this + 1);
+   llvm::copy(Refs, BeginRefs);
+   auto *BeginData = reinterpret_cast<char *>(BeginRefs + NumRefs);
+   llvm::copy(Data, BeginData);
+   BeginData[Data.size()] = 0;
+ }
+ uint32_t NumRefs;
+ uint32_t DataSize;
+/// In-memory CAS database and action cache (the latter should be separated).
+class InMemoryCAS : public BuiltinCAS {
+ Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash,
+ ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) final;
+ Expected<ObjectRef>
+ storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash,
+ sys::fs::mapped_file_region Map) override;
+ CASID getID(const InMemoryIndexValueT &I) const {
+ StringRef Hash = toStringRef(I.Hash);
+ return CASID::create(&getContext(), Hash);
+ }
+ CASID getID(const InMemoryObject &O) const { return getID(O.getIndex()); }
+ ObjectHandle getObjectHandle(const InMemoryObject &Node) const {
+ assert(!(reinterpret_cast<uintptr_t>(&Node) & 0x1ULL));
+ return makeObjectHandle(reinterpret_cast<uintptr_t>(&Node));
+ }
+ Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) override {
+ return getObjectHandle(asInMemoryObject(Ref));
+ }
+ InMemoryIndexValueT &indexHash(ArrayRef<uint8_t> Hash) {
+ return *Index.insertLazy(
+ Hash, [](auto ValueConstructor) { ValueConstructor.emplace(nullptr); });
+ }
+ /// Look up the object stored under the hash of \p ID.
+ ///
+ /// \returns the value stored in the index slot for that hash, or nullptr
+ /// when the index has no slot for it.
+ ///
+ /// TODO: Consider callers to actually do an insert and to return a handle to
+ /// the slot in the trie.
+ const InMemoryObject *getInMemoryObject(CASID ID) const {
+   assert(ID.getContext().getHashSchemaIdentifier() ==
+              getContext().getHashSchemaIdentifier() &&
+          "Expected ID from same hash schema");
+   InMemoryIndexT::const_pointer Slot = Index.find(ID.getHash());
+   if (!Slot)
+     return nullptr;
+   return Slot->Data;
+ }
+ const InMemoryObject &getInMemoryObject(ObjectHandle OH) const {
+ return *reinterpret_cast<const InMemoryObject *>(
+ (uintptr_t)OH.getInternalRef(*this));
+ }
+ const InMemoryObject &asInMemoryObject(ReferenceBase Ref) const {
+ uintptr_t P = Ref.getInternalRef(*this);
+ return *reinterpret_cast<const InMemoryObject *>(P);
+ }
+ ObjectRef toReference(const InMemoryObject &O) const {
+ return makeObjectRef(reinterpret_cast<uintptr_t>(&O));
+ }
+ CASID getID(ObjectRef Ref) const final { return getIDImpl(Ref); }
+ CASID getIDImpl(ReferenceBase Ref) const {
+ return getID(asInMemoryObject(Ref));
+ }
+ std::optional<ObjectRef> getReference(const CASID &ID) const final {
+ if (const InMemoryObject *Object = getInMemoryObject(ID))
+ return toReference(*Object);
+ return std::nullopt;
+ }
+ Expected<bool> isMaterialized(ObjectRef Ref) const final { return true; }
+ /// Return the data bytes of the in-memory object behind \p Node.
+ ArrayRef<char> getDataConst(ObjectHandle Node) const final {
+   const InMemoryObject &Object = asInMemoryObject(Node);
+   return Object.getData();
+ }
+ InMemoryCAS() = default;
+ size_t getNumRefs(ObjectHandle Node) const final {
+ return getInMemoryObject(Node).getRefs().size();
+ }
+ ObjectRef readRef(ObjectHandle Node, size_t I) const final {
+ return toReference(*getInMemoryObject(Node).getRefs()[I]);
+ }
+ Error forEachRef(ObjectHandle Node,
+ function_ref<Error(ObjectRef)> Callback) const final;
+ /// Index of referenced IDs (map: Hash -> InMemoryObject*). Mapped to nullptr
+ /// as a convenient way to store hashes.
+ ///
+ /// - Insert nullptr on lookups.
+ /// - InMemoryObject points back to here.
+ InMemoryIndexT Index;
+ ThreadSafeAllocator<BumpPtrAllocator> Objects;
+ ThreadSafeAllocator<SpecificBumpPtrAllocator<sys::fs::mapped_file_region>>
+ MemoryMaps;
+} // end anonymous namespace
+ArrayRef<char> InMemoryObject::getData() const {
+ if (auto *Derived = dyn_cast<InMemoryRefObject>(this))
+ return Derived->getDataImpl();
+ return cast<InMemoryInlineObject>(this)->getDataImpl();
+ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const {
+ if (auto *Derived = dyn_cast<InMemoryRefObject>(this))
+ return Derived->getRefsImpl();
+ return cast<InMemoryInlineObject>(this)->getRefsImpl();
+/// Store the content of the mapped file region \p Map, with no references,
+/// under \p ComputedHash. The node created here points directly at the
+/// mapped bytes instead of copying them.
+InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash,
+ sys::fs::mapped_file_region Map) {
+ // Look up the hash in the index, initializing to nullptr if it's new.
+ ArrayRef<char> Data(Map.data(), Map.size());
+ auto &I = indexHash(ComputedHash);
+ // Load or generate.
+ // NOTE(review): presumably loadOrGenerate() only runs Generator when the
+ // slot holds no object yet -- confirm against LazyAtomicPointer.
+ auto Allocator = [&](size_t Size) -> void * {
+ return Objects.Allocate(Size, alignof(InMemoryObject));
+ };
+ auto Generator = [&]() -> const InMemoryObject * {
+ return &InMemoryRefObject::create(Allocator, I, std::nullopt, Data);
+ };
+ const InMemoryObject &Node =
+ cast<InMemoryObject>(I.Data.loadOrGenerate(Generator));
+ // Save Map if the winning node uses it.
+ // The node in the slot may be a different one; Map is moved into
+ // MemoryMaps only when that node's data pointer is the mapping itself.
+ if (auto *RefNode = dyn_cast<InMemoryRefObject>(&Node))
+ if (RefNode->getData().data() == Map.data())
+ new (MemoryMaps.Allocate(1)) sys::fs::mapped_file_region(std::move(Map));
+ return toReference(Node);
+/// Store an object with \p Refs and \p Data under \p ComputedHash and return
+/// a reference to the node found in the index slot afterwards. New nodes are
+/// created as InMemoryInlineObject, which copies the references and the data
+/// into its own allocation.
+Expected<ObjectRef> InMemoryCAS::storeImpl(ArrayRef<uint8_t> ComputedHash,
+ ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) {
+ // Look up the hash in the index, initializing to nullptr if it's new.
+ auto &I = indexHash(ComputedHash);
+ // Create the node.
+ // Translate the public ObjectRefs into the internal object pointers that
+ // nodes store.
+ SmallVector<const InMemoryObject *> InternalRefs;
+ for (ObjectRef Ref : Refs)
+ InternalRefs.push_back(&asInMemoryObject(Ref));
+ auto Allocator = [&](size_t Size) -> void * {
+ return Objects.Allocate(Size, alignof(InMemoryObject));
+ };
+ auto Generator = [&]() -> const InMemoryObject * {
+ return &InMemoryInlineObject::create(Allocator, I, InternalRefs, Data);
+ };
+ // NOTE(review): presumably loadOrGenerate() only runs Generator when the
+ // slot holds no object yet -- confirm against LazyAtomicPointer.
+ return toReference(cast<InMemoryObject>(I.Data.loadOrGenerate(Generator)));
+Error InMemoryCAS::forEachRef(ObjectHandle Handle,
+ function_ref<Error(ObjectRef)> Callback) const {
+ auto &Node = getInMemoryObject(Handle);
+ for (const InMemoryObject *Ref : Node.getRefs())
+ if (Error E = Callback(toReference(*Ref)))
+ return E;
+ return Error::success();
+std::unique_ptr<ObjectStore> cas::createInMemoryCAS() {
+ return std::make_unique<InMemoryCAS>();
diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp
new file mode 100644
index 00000000000000..a0e8d5541acd9b
--- /dev/null
+++ b/llvm/lib/CAS/ObjectStore.cpp
@@ -0,0 +1,259 @@
+//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/ObjectStore.h"
+#include "BuiltinCAS.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/SmallVectorMemoryBuffer.h"
+using namespace llvm;
+using namespace llvm::cas;
+void CASContext::anchor() {}
+void ObjectStore::anchor() {}
+LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void ObjectStore::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); }
+std::string CASID::toString() const {
+ std::string S;
+ raw_string_ostream(S) << *this;
+ return S;
+static void printReferenceBase(raw_ostream &OS, StringRef Kind,
+ uint64_t InternalRef, std::optional<CASID> ID) {
+ OS << Kind << "=" << InternalRef;
+ if (ID)
+ OS << "[" << *ID << "]";
+void ReferenceBase::print(raw_ostream &OS, const ObjectHandle &This) const {
+ assert(this == &This);
+ printReferenceBase(OS, "object-handle", InternalRef, std::nullopt);
+void ReferenceBase::print(raw_ostream &OS, const ObjectRef &This) const {
+ assert(this == &This);
+ std::optional<CASID> ID;
+ if (CAS)
+ ID = CAS->getID(This);
+ printReferenceBase(OS, "object-ref", InternalRef, ID);
+void ObjectStore::loadIfExistsAsync(
+ ObjectRef Ref,
+ unique_function<void(Expected<std::optional<ObjectHandle>>)> Callback) {
+ // The default implementation is synchronous.
+ Callback(loadIfExists(Ref));
+Expected<ObjectHandle> ObjectStore::load(ObjectRef Ref) {
+ std::optional<ObjectHandle> Handle;
+ if (Error E = loadIfExists(Ref).moveInto(Handle))
+ return std::move(E);
+ if (!Handle)
+ return createStringError(errc::invalid_argument,
+ "missing object '" + getID(Ref).toString() + "'");
+ return *Handle;
+ObjectStore::getMemoryBuffer(ObjectHandle Node, StringRef Name,
+ bool RequiresNullTerminator) {
+ return MemoryBuffer::getMemBuffer(
+ toStringRef(getData(Node, RequiresNullTerminator)), Name,
+ RequiresNullTerminator);
+void ObjectStore::readRefs(ObjectHandle Node,
+ SmallVectorImpl<ObjectRef> &Refs) const {
+ consumeError(forEachRef(Node, [&Refs](ObjectRef Ref) -> Error {
+ Refs.push_back(Ref);
+ return Error::success();
+ }));
+Expected<ObjectProxy> ObjectStore::getProxy(const CASID &ID) {
+ std::optional<ObjectRef> Ref = getReference(ID);
+ if (!Ref)
+ return createUnknownObjectError(ID);
+ return getProxy(*Ref);
+Expected<ObjectProxy> ObjectStore::getProxy(ObjectRef Ref) {
+ std::optional<ObjectHandle> H;
+ if (Error E = load(Ref).moveInto(H))
+ return std::move(E);
+ return ObjectProxy::load(*this, Ref, *H);
+ObjectStore::getProxyIfExists(ObjectRef Ref) {
+ std::optional<ObjectHandle> H;
+ if (Error E = loadIfExists(Ref).moveInto(H))
+ return std::move(E);
+ if (!H)
+ return std::nullopt;
+ return ObjectProxy::load(*this, Ref, *H);
+std::future<AsyncProxyValue> ObjectStore::getProxyFuture(ObjectRef Ref) {
+ std::promise<AsyncProxyValue> Promise;
+ auto Future = Promise.get_future();
+ getProxyAsync(Ref, [Promise = std::move(Promise)](
+ Expected<std::optional<ObjectProxy>> Obj) mutable {
+ Promise.set_value(std::move(Obj));
+ });
+ return Future;
+void ObjectStore::getProxyAsync(
+ ObjectRef Ref,
+ unique_function<void(Expected<std::optional<ObjectProxy>>)> Callback) {
+ // FIXME: there is potential for use-after-free for the 'this' pointer.
+ // Either we should always allocate shared pointers for \c ObjectStore objects
+ // and pass \c shared_from_this() or expect that the caller will not release
+ // the \c ObjectStore before the callback returns.
+ return loadIfExistsAsync(
+ Ref, [this, Ref, Callback = std::move(Callback)](
+ Expected<std::optional<ObjectHandle>> H) mutable {
+ if (!H)
+ Callback(H.takeError());
+ else if (!*H)
+ Callback(std::nullopt);
+ else
+ Callback(ObjectProxy::load(*this, Ref, **H));
+ });
+Error ObjectStore::createUnknownObjectError(const CASID &ID) {
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "unknown object '" + ID.toString() + "'");
+Expected<ObjectProxy> ObjectStore::createProxy(ArrayRef<ObjectRef> Refs,
+ StringRef Data) {
+ Expected<ObjectRef> Ref = store(Refs, arrayRefFromStringRef<char>(Data));
+ if (!Ref)
+ return Ref.takeError();
+ return getProxy(*Ref);
+ObjectStore::storeFromOpenFileImpl(sys::fs::file_t FD,
+ std::optional<sys::fs::file_status> Status) {
+ // Copy the file into an immutable memory buffer and call \c store on that.
+ // Using \c mmap would be unsafe because there's a race window between when we
+ // get the digest hash for the \c mmap contents and when we store the data; if
+ // the file changes in-between we will create an invalid object.
+ // FIXME: For the on-disk CAS implementation use cloning to store it as a
+ // standalone file if the file-system supports it and the file is large.
+ constexpr size_t ChunkSize = 4 * 4096;
+ SmallString<0> Data;
+ Data.reserve(ChunkSize * 2);
+ if (Error E = sys::fs::readNativeFileToEOF(FD, Data, ChunkSize))
+ return std::move(E);
+ return store(std::nullopt, ArrayRef(Data.data(), Data.size()));
+Error ObjectStore::validateTree(ObjectRef Root) {
+ SmallDenseSet<ObjectRef> ValidatedRefs;
+ SmallVector<ObjectRef, 16> RefsToValidate;
+ RefsToValidate.push_back(Root);
+ while (!RefsToValidate.empty()) {
+ ObjectRef Ref = RefsToValidate.pop_back_val();
+ auto [I, Inserted] = ValidatedRefs.insert(Ref);
+ if (!Inserted)
+ continue; // already validated.
+ if (Error E = validate(getID(Ref)))
+ return E;
+ Expected<ObjectHandle> Obj = load(Ref);
+ if (!Obj)
+ return Obj.takeError();
+ if (Error E = forEachRef(*Obj, [&RefsToValidate](ObjectRef R) -> Error {
+ RefsToValidate.push_back(R);
+ return Error::success();
+ }))
+ return E;
+ }
+ return Error::success();
+ObjectProxy::getMemoryBuffer(StringRef Name,
+ bool RequiresNullTerminator) const {
+ return CAS->getMemoryBuffer(H, Name, RequiresNullTerminator);
+static Expected<std::shared_ptr<ObjectStore>>
+createOnDiskCASImpl(const Twine &Path) {
+ return createOnDiskCAS(Path);
+static Expected<std::shared_ptr<ObjectStore>>
+createInMemoryCASImpl(const Twine &) {
+ return createInMemoryCAS();
+static ManagedStatic<StringMap<ObjectStoreCreateFuncTy *>> RegisteredScheme;
+static StringMap<ObjectStoreCreateFuncTy *> &getRegisteredScheme() {
+ if (!RegisteredScheme.isConstructed()) {
+ RegisteredScheme->insert({"mem://", &createInMemoryCASImpl});
+ RegisteredScheme->insert({"file://", &createOnDiskCASImpl});
+ }
+ return *RegisteredScheme;
+cas::createCASFromIdentifier(StringRef Path) {
+ for (auto &Scheme : getRegisteredScheme()) {
+ if (Path.consume_front(Scheme.getKey()))
+ return Scheme.getValue()(Path);
+ }
+ if (Path.empty())
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "No CAS identifier is provided");
+ // FIXME: some current default behavior.
+ SmallString<256> PathBuf;
+ if (Path == "auto") {
+ getDefaultOnDiskCASPath(PathBuf);
+ Path = PathBuf;
+ }
+ // Fallback is to create UnifiedOnDiskCache.
+ auto UniDB = builtin::createBuiltinUnifiedOnDiskCache(Path);
+ if (!UniDB)
+ return UniDB.takeError();
+ return builtin::createObjectStoreFromUnifiedOnDiskCache(std::move(*UniDB));
+void cas::registerCASURLScheme(StringRef Prefix,
+ ObjectStoreCreateFuncTy *Func) {
+ getRegisteredScheme().insert({Prefix, Func});
diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt
index 503c77cb13bd07..b06f4ffd83ff5a 100644
--- a/llvm/lib/CMakeLists.txt
+++ b/llvm/lib/CMakeLists.txt
@@ -9,6 +9,7 @@ add_subdirectory(FileCheck)
diff --git a/llvm/unittests/CAS/CASTestConfig.cpp b/llvm/unittests/CAS/CASTestConfig.cpp
new file mode 100644
index 00000000000000..bb06ee5573134f
--- /dev/null
+++ b/llvm/unittests/CAS/CASTestConfig.cpp
@@ -0,0 +1,22 @@
+//===- CASTestConfig.cpp --------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "CASTestConfig.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+using namespace llvm::cas;
+CASTestingEnv createInMemory(int I) {
+ std::unique_ptr<ObjectStore> CAS = createInMemoryCAS();
+ return CASTestingEnv{std::move(CAS)};
+ ::testing::Values(createInMemory));
diff --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h
new file mode 100644
index 00000000000000..c787e800396543
--- /dev/null
+++ b/llvm/unittests/CAS/CASTestConfig.h
@@ -0,0 +1,36 @@
+//===- CASTestConfig.h ----------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+struct CASTestingEnv {
+ std::unique_ptr<llvm::cas::ObjectStore> CAS;
+class CASTest
+ : public testing::TestWithParam<std::function<CASTestingEnv(int)>> {
+ std::optional<int> NextCASIndex;
+ std::unique_ptr<llvm::cas::ObjectStore> createObjectStore() {
+ auto TD = GetParam()(++(*NextCASIndex));
+ return std::move(TD.CAS);
+ }
+ void SetUp() { NextCASIndex = 0; }
+ void TearDown() { NextCASIndex = std::nullopt; }
diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt
new file mode 100644
index 00000000000000..39a2100c4909ee
--- /dev/null
+++ b/llvm/unittests/CAS/CMakeLists.txt
@@ -0,0 +1,12 @@
+ Support
+ TestingSupport
+ )
+ CASTestConfig.cpp
+ ObjectStoreTest.cpp
+ )
+target_link_libraries(CASTests PRIVATE LLVMTestingSupport)
diff --git a/llvm/unittests/CAS/ObjectStoreTest.cpp b/llvm/unittests/CAS/ObjectStoreTest.cpp
new file mode 100644
index 00000000000000..fb29d76cff46f5
--- /dev/null
+++ b/llvm/unittests/CAS/ObjectStoreTest.cpp
@@ -0,0 +1,280 @@
+//===- ObjectStoreTest.cpp ------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+#include "CASTestConfig.h"
+using namespace llvm;
+using namespace llvm::cas;
+TEST_P(CASTest, PrintIDs) {
+ std::unique_ptr<ObjectStore> CAS = createObjectStore();
+ std::optional<CASID> ID1, ID2;
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1),
+ Succeeded());
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2),
+ Succeeded());
+ std::string PrintedID1 = ID1->toString();
+ std::string PrintedID2 = ID2->toString();
+ EXPECT_NE(PrintedID1, PrintedID2);
+ std::optional<CASID> ParsedID1, ParsedID2;
+ ASSERT_THAT_ERROR(CAS->parseID(PrintedID1).moveInto(ParsedID1), Succeeded());
+ ASSERT_THAT_ERROR(CAS->parseID(PrintedID2).moveInto(ParsedID2), Succeeded());
+ EXPECT_EQ(ID1, ParsedID1);
+ EXPECT_EQ(ID2, ParsedID2);
+TEST_P(CASTest, Blobs) {
+ std::unique_ptr<ObjectStore> CAS1 = createObjectStore();
+ StringRef ContentStrings[] = {
+ "word",
+ "some longer text std::string's local memory",
+ R"(multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text)",
+ };
+ SmallVector<CASID> IDs;
+ for (StringRef Content : ContentStrings) {
+ // Use StringRef::str() to create a temporary std::string. This could cause
+ // problems if the CAS is storing references to the input string instead of
+ // copying it.
+ std::optional<ObjectProxy> Blob;
+ ASSERT_THAT_ERROR(CAS1->createProxy(std::nullopt, Content).moveInto(Blob),
+ Succeeded());
+ IDs.push_back(Blob->getID());
+ // Check basic printing of IDs.
+ EXPECT_EQ(IDs.back().toString(), IDs.back().toString());
+ if (IDs.size() > 2)
+ EXPECT_NE(IDs.front().toString(), IDs.back().toString());
+ }
+ // Check that the blobs give the same IDs later.
+ for (int I = 0, E = IDs.size(); I != E; ++I) {
+ std::optional<ObjectProxy> Blob;
+ CAS1->createProxy(std::nullopt, ContentStrings[I]).moveInto(Blob),
+ Succeeded());
+ EXPECT_EQ(IDs[I], Blob->getID());
+ }
+ // Run validation on all CASIDs.
+ for (int I = 0, E = IDs.size(); I != E; ++I)
+ ASSERT_THAT_ERROR(CAS1->validate(IDs[I]), Succeeded());
+ // Check that the blobs can be retrieved multiple times.
+ for (int I = 0, E = IDs.size(); I != E; ++I) {
+ for (int J = 0, JE = 3; J != JE; ++J) {
+ std::optional<ObjectProxy> Buffer;
+ ASSERT_THAT_ERROR(CAS1->getProxy(IDs[I]).moveInto(Buffer), Succeeded());
+ EXPECT_EQ(ContentStrings[I], Buffer->getData());
+ }
+ }
+ // Confirm these blobs don't exist in a fresh CAS instance.
+ std::unique_ptr<ObjectStore> CAS2 = createObjectStore();
+ for (int I = 0, E = IDs.size(); I != E; ++I) {
+ std::optional<ObjectProxy> Proxy;
+ EXPECT_THAT_ERROR(CAS2->getProxy(IDs[I]).moveInto(Proxy), Failed());
+ }
+ // Insert into the second CAS and confirm the IDs are stable. Getting them
+ // should work now.
+ for (int I = IDs.size(), E = 0; I != E; --I) {
+ auto &ID = IDs[I - 1];
+ auto &Content = ContentStrings[I - 1];
+ std::optional<ObjectProxy> Blob;
+ ASSERT_THAT_ERROR(CAS2->createProxy(std::nullopt, Content).moveInto(Blob),
+ Succeeded());
+ EXPECT_EQ(ID, Blob->getID());
+ std::optional<ObjectProxy> Buffer;
+ ASSERT_THAT_ERROR(CAS2->getProxy(ID).moveInto(Buffer), Succeeded());
+ EXPECT_EQ(Content, Buffer->getData());
+ }
+TEST_P(CASTest, BlobsBig) {
+ // A little bit of validation that bigger blobs are okay. Climb up to 1MB.
+ std::unique_ptr<ObjectStore> CAS = createObjectStore();
+ SmallString<256> String1 = StringRef("a few words");
+ SmallString<256> String2 = StringRef("others");
+ while (String1.size() < 1024U * 1024U) {
+ std::optional<CASID> ID1;
+ std::optional<CASID> ID2;
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String1).moveInto(ID1),
+ Succeeded());
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String1).moveInto(ID2),
+ Succeeded());
+ ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded());
+ ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded());
+ String1.append(String2);
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String2).moveInto(ID1),
+ Succeeded());
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String2).moveInto(ID2),
+ Succeeded());
+ ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded());
+ ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded());
+ String2.append(String1);
+ }
+ // Specifically check near 1MB for objects large enough they're likely to be
+ // stored externally in an on-disk CAS and will be near a page boundary.
+ SmallString<0> Storage;
+ const size_t InterestingSize = 1024U * 1024ULL;
+ const size_t SizeE = InterestingSize + 2;
+ if (Storage.size() < SizeE)
+ Storage.resize(SizeE, '\01');
+ for (size_t Size = InterestingSize - 2; Size != SizeE; ++Size) {
+ StringRef Data(Storage.data(), Size);
+ std::optional<ObjectProxy> Blob;
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, Data).moveInto(Blob),
+ Succeeded());
+ ASSERT_EQ(Data, Blob->getData());
+ ASSERT_EQ(0, Blob->getData().end()[0]);
+ }
+TEST_P(CASTest, LeafNodes) {
+ std::unique_ptr<ObjectStore> CAS1 = createObjectStore();
+ StringRef ContentStrings[] = {
+ "word",
+ "some longer text std::string's local memory",
+ R"(multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text)",
+ };
+ SmallVector<ObjectRef> Nodes;
+ SmallVector<CASID> IDs;
+ for (StringRef Content : ContentStrings) {
+ // Use StringRef::str() to create a temporary std::string. This could cause
+ // problems if the CAS is storing references to the input string instead of
+ // copying it.
+ std::optional<ObjectRef> Node;
+ CAS1->store(std::nullopt, arrayRefFromStringRef<char>(Content))
+ .moveInto(Node),
+ Succeeded());
+ Nodes.push_back(*Node);
+ // Check basic printing of IDs.
+ IDs.push_back(CAS1->getID(*Node));
+ EXPECT_EQ(IDs.back().toString(), IDs.back().toString());
+ EXPECT_EQ(Nodes.front(), Nodes.front());
+ EXPECT_EQ(Nodes.back(), Nodes.back());
+ EXPECT_EQ(IDs.front(), IDs.front());
+ EXPECT_EQ(IDs.back(), IDs.back());
+ if (Nodes.size() <= 1)
+ continue;
+ EXPECT_NE(Nodes.front(), Nodes.back());
+ EXPECT_NE(IDs.front(), IDs.back());
+ }
+ // Check that the blobs give the same IDs later.
+ for (int I = 0, E = IDs.size(); I != E; ++I) {
+ std::optional<ObjectRef> Node;
+ ASSERT_THAT_ERROR(CAS1->store(std::nullopt, arrayRefFromStringRef<char>(
+ ContentStrings[I]))
+ .moveInto(Node),
+ Succeeded());
+ EXPECT_EQ(IDs[I], CAS1->getID(*Node));
+ }
+ // Check that the blobs can be retrieved multiple times.
+ for (int I = 0, E = IDs.size(); I != E; ++I) {
+ for (int J = 0, JE = 3; J != JE; ++J) {
+ std::optional<ObjectProxy> Object;
+ ASSERT_THAT_ERROR(CAS1->getProxy(IDs[I]).moveInto(Object), Succeeded());
+ ASSERT_TRUE(Object);
+ EXPECT_EQ(ContentStrings[I], Object->getData());
+ }
+ }
+ // Confirm these blobs don't exist in a fresh CAS instance.
+ std::unique_ptr<ObjectStore> CAS2 = createObjectStore();
+ for (int I = 0, E = IDs.size(); I != E; ++I) {
+ std::optional<ObjectProxy> Object;
+ EXPECT_THAT_ERROR(CAS2->getProxy(IDs[I]).moveInto(Object), Failed());
+ }
+ // Insert into the second CAS and confirm the IDs are stable. Getting them
+ // should work now.
+ for (int I = IDs.size(), E = 0; I != E; --I) {
+ auto &ID = IDs[I - 1];
+ auto &Content = ContentStrings[I - 1];
+ std::optional<ObjectRef> Node;
+ CAS2->store(std::nullopt, arrayRefFromStringRef<char>(Content))
+ .moveInto(Node),
+ Succeeded());
+ EXPECT_EQ(ID, CAS2->getID(*Node));
+ std::optional<ObjectProxy> Object;
+ ASSERT_THAT_ERROR(CAS2->getProxy(ID).moveInto(Object), Succeeded());
+ ASSERT_TRUE(Object);
+ EXPECT_EQ(Content, Object->getData());
+ }
+TEST_P(CASTest, NodesBig) {
+ std::unique_ptr<ObjectStore> CAS = createObjectStore();
+ // Specifically check near 1MB for objects large enough they're likely to be
+ // stored externally in an on-disk CAS, and such that one of them will be
+ // near a page boundary.
+ SmallString<0> Storage;
+ constexpr size_t InterestingSize = 1024U * 1024ULL;
+ constexpr size_t WordSize = sizeof(void *);
+ // Start much smaller to account for headers.
+ constexpr size_t SizeB = InterestingSize - 8 * WordSize;
+ constexpr size_t SizeE = InterestingSize + 1;
+ if (Storage.size() < SizeE)
+ Storage.resize(SizeE, '\01');
+ SmallVector<ObjectRef, 4> CreatedNodes;
+ // Avoid checking every size because this is an expensive test. Just check
+ // for data that is 8B-word-aligned, and one less. Also append the created
+ // nodes as references in the next block to check that references are created
+ // correctly.
+ for (size_t Size = SizeB; Size < SizeE; Size += WordSize) {
+ for (bool IsAligned : {false, true}) {
+ StringRef Data(Storage.data(), Size - (IsAligned ? 0 : 1));
+ std::optional<ObjectProxy> Node;
+ ASSERT_THAT_ERROR(CAS->createProxy(CreatedNodes, Data).moveInto(Node),
+ Succeeded());
+ ASSERT_EQ(Data, Node->getData());
+ ASSERT_EQ(0, Node->getData().end()[0]);
+ ASSERT_EQ(Node->getNumReferences(), CreatedNodes.size());
+ CreatedNodes.emplace_back(Node->getRef());
+ }
+ }
+ for (auto ID : CreatedNodes)
+ ASSERT_THAT_ERROR(CAS->validate(CAS->getID(ID)), Succeeded());
diff --git a/llvm/unittests/CMakeLists.txt b/llvm/unittests/CMakeLists.txt
index 911ede701982f6..f6a8acacfa4bce 100644
--- a/llvm/unittests/CMakeLists.txt
+++ b/llvm/unittests/CMakeLists.txt
@@ -20,6 +20,7 @@ add_subdirectory(AsmParser)
>From 6bce89dae513b6b2c4f65cc725e8103634b29f9e Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu at apple.com>
Date: Mon, 9 Oct 2023 09:07:44 -0700
Subject: [PATCH 03/11] [FileSystem] Allow exclusive file lock
Add a parameter to the file lock API to allow an exclusive file lock. Both Unix
and Windows support locking a file exclusively for writing by one process,
and LLVM OnDiskCAS uses an exclusive file lock to coordinate CAS creation.
llvm/include/llvm/Support/FileSystem.h | 8 ++++++--
llvm/lib/Support/Unix/Path.inc | 11 +++++++----
llvm/lib/Support/Windows/Path.inc | 12 ++++++++----
3 files changed, 21 insertions(+), 10 deletions(-)
diff --git a/llvm/include/llvm/Support/FileSystem.h b/llvm/include/llvm/Support/FileSystem.h
index 9cf53360b4e966..38ad0e712b32ed 100644
--- a/llvm/include/llvm/Support/FileSystem.h
+++ b/llvm/include/llvm/Support/FileSystem.h
@@ -1184,12 +1184,16 @@ openNativeFileForRead(const Twine &Name, OpenFlags Flags = OF_None,
/// descriptor.
tryLockFile(int FD,
- std::chrono::milliseconds Timeout = std::chrono::milliseconds(0));
+ std::chrono::milliseconds Timeout = std::chrono::milliseconds(0),
+ bool Exclusive = true);
/// Lock the file.
/// This function acts as @ref tryLockFile but it waits infinitely.
-std::error_code lockFile(int FD);
+/// \param FD file descriptor to use for locking.
+/// \param Exclusive if \p true use exclusive/writer lock, otherwise use
+/// shared/reader lock.
+std::error_code lockFile(int FD, bool Exclusive = true);
/// Unlock the file.
diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index 44097bad7b46ed..9f6f15bbd05f2d 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -1223,13 +1223,14 @@ Expected<size_t> readNativeFileSlice(file_t FD, MutableArrayRef<char> Buf,
return NumRead;
-std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) {
+std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout,
+ bool Exclusive) {
auto Start = std::chrono::steady_clock::now();
auto End = Start + Timeout;
do {
struct flock Lock;
memset(&Lock, 0, sizeof(Lock));
- Lock.l_type = F_WRLCK;
+ Lock.l_type = Exclusive ? F_WRLCK : F_RDLCK;
Lock.l_whence = SEEK_SET;
Lock.l_start = 0;
Lock.l_len = 0;
@@ -1238,15 +1239,17 @@ std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) {
int Error = errno;
if (Error != EACCES && Error != EAGAIN)
return std::error_code(Error, std::generic_category());
+ if (Timeout.count() == 0)
+ break;
} while (std::chrono::steady_clock::now() < End);
return make_error_code(errc::no_lock_available);
-std::error_code lockFile(int FD) {
+std::error_code lockFile(int FD, bool Exclusive) {
struct flock Lock;
memset(&Lock, 0, sizeof(Lock));
- Lock.l_type = F_WRLCK;
+ Lock.l_type = Exclusive ? F_WRLCK : F_RDLCK;
Lock.l_whence = SEEK_SET;
Lock.l_start = 0;
Lock.l_len = 0;
diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc
index c4bd5e24723517..07ee3d96be5ec6 100644
--- a/llvm/lib/Support/Windows/Path.inc
+++ b/llvm/lib/Support/Windows/Path.inc
@@ -1327,8 +1327,10 @@ Expected<size_t> readNativeFileSlice(file_t FileHandle,
return readNativeFileImpl(FileHandle, Buf, &Overlapped);
-std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) {
+std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout,
+ bool Exclusive) {
+ DWORD Flags = Exclusive ? LOCKFILE_EXCLUSIVE_LOCK : 0;
file_t File = convertFDToNativeFile(FD);
auto Start = std::chrono::steady_clock::now();
@@ -1338,6 +1340,8 @@ std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) {
return std::error_code();
DWORD Error = ::GetLastError();
+ if (Timeout.count() == 0)
+ break;
@@ -1346,8 +1350,8 @@ std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) {
return mapWindowsError(ERROR_LOCK_VIOLATION);
-std::error_code lockFile(int FD) {
+std::error_code lockFile(int FD, bool Exclusive) {
+ DWORD Flags = Exclusive ? LOCKFILE_EXCLUSIVE_LOCK : 0;
file_t File = convertFDToNativeFile(FD);
if (::LockFileEx(File, Flags, 0, MAXDWORD, MAXDWORD, &OV))
>From 36d055c87f283c94b1fb33a41bab21d933f15b10 Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu at apple.com>
Date: Mon, 9 Oct 2023 09:10:51 -0700
Subject: [PATCH 04/11] [CAS] Add implementation for current OnDisk CAS +
Add the current downstream CAS API and implementation, which includes the
OnDiskCAS implementation, different levels of abstraction for CAS, and
various utilities.
llvm/CMakeLists.txt | 7 +
llvm/include/llvm/CAS/ActionCache.h | 149 ++
llvm/include/llvm/CAS/BuiltinCASContext.h | 88 +
llvm/include/llvm/CAS/BuiltinObjectHasher.h | 82 +
.../llvm/CAS/BuiltinUnifiedCASDatabases.h | 26 +
llvm/include/llvm/CAS/CASNodeSchema.h | 74 +
.../llvm/CAS/HierarchicalTreeBuilder.h | 86 +
.../llvm/CAS/MappedFileRegionBumpPtr.h | 126 ++
llvm/include/llvm/CAS/ObjectStore.h | 11 +-
llvm/include/llvm/CAS/OnDiskGraphDB.h | 406 +++++
llvm/include/llvm/CAS/OnDiskHashMappedTrie.h | 391 +++++
llvm/include/llvm/CAS/OnDiskKeyValueDB.h | 63 +
llvm/include/llvm/CAS/TreeEntry.h | 71 +
llvm/include/llvm/CAS/TreeSchema.h | 125 ++
llvm/include/llvm/CAS/UnifiedOnDiskCache.h | 140 ++
llvm/lib/CAS/ActionCache.cpp | 60 +
llvm/lib/CAS/ActionCaches.cpp | 242 +++
llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp | 25 +
llvm/lib/CAS/CASNodeSchema.cpp | 23 +
llvm/lib/CAS/CMakeLists.txt | 21 +
llvm/lib/CAS/HashMappedTrieIndexGenerator.h | 90 +
llvm/lib/CAS/HierarchicalTreeBuilder.cpp | 266 +++
llvm/lib/CAS/MappedFileRegionBumpPtr.cpp | 284 ++++
llvm/lib/CAS/ObjectStore.cpp | 37 +-
llvm/lib/CAS/OnDiskCAS.cpp | 205 +++
llvm/lib/CAS/OnDiskCommon.cpp | 26 +
llvm/lib/CAS/OnDiskCommon.h | 24 +
llvm/lib/CAS/OnDiskGraphDB.cpp | 1508 +++++++++++++++++
llvm/lib/CAS/OnDiskHashMappedTrie.cpp | 1356 +++++++++++++++
llvm/lib/CAS/OnDiskKeyValueDB.cpp | 78 +
llvm/lib/CAS/TreeEntry.cpp | 47 +
llvm/lib/CAS/TreeSchema.cpp | 231 +++
llvm/lib/CAS/UnifiedOnDiskCache.cpp | 339 ++++
llvm/tools/llvm-cas/CMakeLists.txt | 8 +
llvm/tools/llvm-cas/llvm-cas.cpp | 449 +++++
35 files changed, 7139 insertions(+), 25 deletions(-)
create mode 100644 llvm/include/llvm/CAS/ActionCache.h
create mode 100644 llvm/include/llvm/CAS/BuiltinCASContext.h
create mode 100644 llvm/include/llvm/CAS/BuiltinObjectHasher.h
create mode 100644 llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h
create mode 100644 llvm/include/llvm/CAS/CASNodeSchema.h
create mode 100644 llvm/include/llvm/CAS/HierarchicalTreeBuilder.h
create mode 100644 llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h
create mode 100644 llvm/include/llvm/CAS/OnDiskGraphDB.h
create mode 100644 llvm/include/llvm/CAS/OnDiskHashMappedTrie.h
create mode 100644 llvm/include/llvm/CAS/OnDiskKeyValueDB.h
create mode 100644 llvm/include/llvm/CAS/TreeEntry.h
create mode 100644 llvm/include/llvm/CAS/TreeSchema.h
create mode 100644 llvm/include/llvm/CAS/UnifiedOnDiskCache.h
create mode 100644 llvm/lib/CAS/ActionCache.cpp
create mode 100644 llvm/lib/CAS/ActionCaches.cpp
create mode 100644 llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp
create mode 100644 llvm/lib/CAS/CASNodeSchema.cpp
create mode 100644 llvm/lib/CAS/HashMappedTrieIndexGenerator.h
create mode 100644 llvm/lib/CAS/HierarchicalTreeBuilder.cpp
create mode 100644 llvm/lib/CAS/MappedFileRegionBumpPtr.cpp
create mode 100644 llvm/lib/CAS/OnDiskCAS.cpp
create mode 100644 llvm/lib/CAS/OnDiskCommon.cpp
create mode 100644 llvm/lib/CAS/OnDiskCommon.h
create mode 100644 llvm/lib/CAS/OnDiskGraphDB.cpp
create mode 100644 llvm/lib/CAS/OnDiskHashMappedTrie.cpp
create mode 100644 llvm/lib/CAS/OnDiskKeyValueDB.cpp
create mode 100644 llvm/lib/CAS/TreeEntry.cpp
create mode 100644 llvm/lib/CAS/TreeSchema.cpp
create mode 100644 llvm/lib/CAS/UnifiedOnDiskCache.cpp
create mode 100644 llvm/tools/llvm-cas/CMakeLists.txt
create mode 100644 llvm/tools/llvm-cas/llvm-cas.cpp
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 741c95f3a7d02a..66a6f5c0bac72d 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -831,6 +831,13 @@ option (LLVM_ENABLE_SPHINX "Use Sphinx to generate llvm documentation." OFF)
option (LLVM_ENABLE_OCAMLDOC "Build OCaml bindings documentation." ON)
option (LLVM_ENABLE_BINDINGS "Build bindings." ON)
CACHE STRING "Doxygen-generated HTML documentation install directory")
diff --git a/llvm/include/llvm/CAS/ActionCache.h b/llvm/include/llvm/CAS/ActionCache.h
new file mode 100644
index 00000000000000..134c586fa0a9a7
--- /dev/null
+++ b/llvm/include/llvm/CAS/ActionCache.h
@@ -0,0 +1,149 @@
+//===- llvm/CAS/ActionCache.h -----------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/CASID.h"
+#include "llvm/CAS/CASReference.h"
+#include "llvm/Support/Error.h"
+#include <future>
+namespace llvm::cas {
+class ObjectStore;
+class CASID;
+class ObjectProxy;
+/// A key for caching an operation.
+/// It is implemented as a bag of bytes and provides a convenient constructor
+/// for CAS types.
+class CacheKey {
+ StringRef getKey() const { return Key; }
+ // TODO: Support a CacheKey that is any array of bytes rather than only a
+ // CASID. To do that, ActionCache needs to be able to rehash the key into the
+ // index; a `getOrCompute` method could then be used to avoid multiple calls
+ // to the hash function.
+ CacheKey(const CASID &ID);
+ CacheKey(const ObjectProxy &Proxy);
+ CacheKey(const ObjectStore &CAS, const ObjectRef &Ref);
+ std::string Key;
+using AsyncCASIDValue = AsyncValue<CASID>;
+/// This is used to workaround the issue of MSVC needing default-constructible
+/// types for \c std::promise/future.
+struct AsyncErrorValue {
+ Error take() { return std::move(Value); }
+ AsyncErrorValue() : Value(Error::success()) {}
+ AsyncErrorValue(Error &&E) : Value(std::move(E)) {}
+ Error Value;
+/// A cache from a key describing an action to the result of doing it.
+/// Actions are expected to be pure (collision is an error).
+class ActionCache {
+ virtual void anchor();
+ /// Get a previously computed result for \p ActionKey.
+ ///
+ /// \param Globally if true it is a hint to the underlying implementation that
+ /// the lookup is profitable to be done on a distributed caching level, not
+ /// just locally. The implementation is free to ignore this flag.
+ Expected<std::optional<CASID>> get(const CacheKey &ActionKey,
+ bool Globally = false) const {
+ return getImpl(arrayRefFromStringRef(ActionKey.getKey()), Globally);
+ }
+ /// Asynchronous version of \c get.
+ std::future<AsyncCASIDValue> getFuture(const CacheKey &ActionKey,
+ bool Globally = false) const;
+ /// Asynchronous version of \c get.
+ void getAsync(
+ const CacheKey &ActionKey, bool Globally,
+ unique_function<void(Expected<std::optional<CASID>>)> Callback) const {
+ return getImplAsync(arrayRefFromStringRef(ActionKey.getKey()), Globally,
+ std::move(Callback));
+ }
+ /// Cache \p Result for the \p ActionKey computation.
+ ///
+ /// \param Globally if true it is a hint to the underlying implementation that
+ /// the association is profitable to be done on a distributed caching level,
+ /// not just locally. The implementation is free to ignore this flag.
+ Error put(const CacheKey &ActionKey, const CASID &Result,
+ bool Globally = false) {
+ assert(Result.getContext().getHashSchemaIdentifier() ==
+ getContext().getHashSchemaIdentifier() &&
+ "Hash schema mismatch");
+ return putImpl(arrayRefFromStringRef(ActionKey.getKey()), Result, Globally);
+ }
+ /// Asynchronous version of \c put.
+ std::future<AsyncErrorValue> putFuture(const CacheKey &ActionKey,
+ const CASID &Result,
+ bool Globally = false);
+ /// Asynchronous version of \c put.
+ void putAsync(const CacheKey &ActionKey, const CASID &Result, bool Globally,
+ unique_function<void(Error)> Callback) {
+ assert(Result.getContext().getHashSchemaIdentifier() ==
+ getContext().getHashSchemaIdentifier() &&
+ "Hash schema mismatch");
+ return putImplAsync(arrayRefFromStringRef(ActionKey.getKey()), Result,
+ Globally, std::move(Callback));
+ }
+ virtual ~ActionCache() = default;
+ virtual Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ResolvedKey,
+ bool Globally) const = 0;
+ virtual void getImplAsync(
+ ArrayRef<uint8_t> ResolvedKey, bool Globally,
+ unique_function<void(Expected<std::optional<CASID>>)> Callback) const;
+ virtual Error putImpl(ArrayRef<uint8_t> ResolvedKey, const CASID &Result,
+ bool Globally) = 0;
+ virtual void putImplAsync(ArrayRef<uint8_t> ResolvedKey, const CASID &Result,
+ bool Globally,
+ unique_function<void(Error)> Callback);
+ ActionCache(const CASContext &Context) : Context(Context) {}
+ const CASContext &getContext() const { return Context; }
+ const CASContext &Context;
+/// Create an action cache in memory.
+std::unique_ptr<ActionCache> createInMemoryActionCache();
+/// Get a reasonable default on-disk path for a persistent ActionCache for the
+/// current user.
+std::string getDefaultOnDiskActionCachePath();
+/// Create an action cache on disk.
+Expected<std::unique_ptr<ActionCache>> createOnDiskActionCache(StringRef Path);
+} // end namespace llvm::cas
diff --git a/llvm/include/llvm/CAS/BuiltinCASContext.h b/llvm/include/llvm/CAS/BuiltinCASContext.h
new file mode 100644
index 00000000000000..ebc4ca8bd1f2e9
--- /dev/null
+++ b/llvm/include/llvm/CAS/BuiltinCASContext.h
@@ -0,0 +1,88 @@
+//===- BuiltinCASContext.h --------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/CASID.h"
+#include "llvm/Support/BLAKE3.h"
+#include "llvm/Support/Error.h"
+namespace llvm::cas::builtin {
+/// Current hash type for the builtin CAS.
+/// FIXME: This should be configurable via an enum to allow configuring the hash
+/// function. The enum should be sent into \a createInMemoryCAS() and \a
+/// createOnDiskCAS().
+/// This is important (at least) for future-proofing, when we want to make new
+/// CAS instances use BLAKE7, but still know how to read/write BLAKE3.
+/// Even just for BLAKE3, it would be useful to have these values:
+/// BLAKE3 => 32B hash from BLAKE3
+/// BLAKE3_16B => 16B hash from BLAKE3 (truncated)
+/// ... where BLAKE3_16 uses \a TruncatedBLAKE3<16>.
+/// Motivation for a truncated hash is that it's cheaper to store. It's not
+/// clear if we always (or ever) need the full 32B, and for an ephemeral
+/// in-memory CAS, we almost certainly don't need it.
+/// Note that the cost is linear in the number of objects for the builtin CAS,
+/// since we're using internal offsets and/or pointers as an optimization.
+/// However, it's possible we'll want to hook up a local builtin CAS to, e.g.,
+/// a distributed generic hash map to use as an ActionCache. In that scenario,
+/// the transitive closure of the structured objects that are the results of
+/// the cached actions would need to be serialized into the map, something
+/// like:
+/// "action:<schema>:<key>" -> "0123"
+/// "object:<schema>:0123" -> "3,4567,89AB,CDEF,9,some data"
+/// "object:<schema>:4567" -> ...
+/// "object:<schema>:89AB" -> ...
+/// "object:<schema>:CDEF" -> ...
+/// These references would be full cost.
+using HasherT = BLAKE3;
+using HashType = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>()));
+class BuiltinCASContext : public CASContext {
+ void printIDImpl(raw_ostream &OS, const CASID &ID) const final;
+ void anchor() override;
+ /// Get the name of the hash for any table identifiers.
+ ///
+ /// FIXME: This should be configurable via an enum, with at least the
+ /// following values:
+ ///
+ /// "BLAKE3" => 32B hash from BLAKE3
+ /// "BLAKE3.16" => 16B hash from BLAKE3 (truncated)
+ ///
+ /// Enum can be sent into \a createInMemoryCAS() and \a createOnDiskCAS().
+ static StringRef getHashName() { return "BLAKE3"; }
+ /// Identifier of the hash schema, formed as "llvm.cas.builtin.v2[" followed
+ /// by \a getHashName() and "]".
+ StringRef getHashSchemaIdentifier() const final {
+ // Built once and kept in a function-local static, so the returned
+ // StringRef refers to storage that outlives the call.
+ static const std::string ID =
+ ("llvm.cas.builtin.v2[" + getHashName() + "]").str();
+ return ID;
+ }
+ static const BuiltinCASContext &getDefaultContext();
+ BuiltinCASContext() = default;
+ static Expected<HashType> parseID(StringRef PrintedDigest);
+ static void printID(ArrayRef<uint8_t> Digest, raw_ostream &OS);
+} // namespace llvm::cas::builtin
diff --git a/llvm/include/llvm/CAS/BuiltinObjectHasher.h b/llvm/include/llvm/CAS/BuiltinObjectHasher.h
new file mode 100644
index 00000000000000..ac95edf6de7f1b
--- /dev/null
+++ b/llvm/include/llvm/CAS/BuiltinObjectHasher.h
@@ -0,0 +1,82 @@
+//===- BuiltinObjectHasher.h ------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Support/Endian.h"
+namespace llvm::cas {
+template <class HasherT> class BuiltinObjectHasher {
+ using HashT = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>()));
+ /// Hash an object made of \p Refs and \p Data. The hasher is fed, in order:
+ /// the number of references, the ID of each reference as reported by
+ /// \p CAS, and finally \p Data prefixed with its size.
+ static HashT hashObject(const ObjectStore &CAS, ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) {
+ BuiltinObjectHasher H;
+ H.updateSize(Refs.size());
+ for (const ObjectRef &Ref : Refs)
+ H.updateRef(CAS, Ref);
+ H.updateArray(Data);
+ return H.finish();
+ }
+ /// Same as the \c ObjectStore overload, but each reference is supplied
+ /// directly as its hash bytes, so no store is needed to resolve it. The
+ /// hasher is fed the reference count, each hash, then \p Data prefixed with
+ /// its size.
+ static HashT hashObject(ArrayRef<ArrayRef<uint8_t>> Refs,
+ ArrayRef<char> Data) {
+ BuiltinObjectHasher H;
+ H.updateSize(Refs.size());
+ for (const ArrayRef<uint8_t> &Ref : Refs)
+ H.updateID(Ref);
+ H.updateArray(Data);
+ return H.finish();
+ }
+ HashT finish() { return Hasher.final(); }
+ void updateRef(const ObjectStore &CAS, ObjectRef Ref) {
+ updateID(CAS.getID(Ref));
+ }
+ void updateID(const CASID &ID) { updateID(ID.getHash()); }
+ void updateID(ArrayRef<uint8_t> Hash) {
+ // NOTE: Does not hash the size of the hash. That's a CAS implementation
+ // detail that shouldn't leak into the UUID for an object.
+ assert(Hash.size() == sizeof(HashT) &&
+ "Expected object ref to match the hash size");
+ Hasher.update(Hash);
+ }
+ void updateArray(ArrayRef<uint8_t> Bytes) {
+ updateSize(Bytes.size());
+ Hasher.update(Bytes);
+ }
+ /// Convenience overload: reinterpret \p Bytes as unsigned bytes and feed
+ /// them through the \c uint8_t overload (size first, then contents).
+ void updateArray(ArrayRef<char> Bytes) {
+   const auto *Begin = reinterpret_cast<const uint8_t *>(Bytes.data());
+   updateArray(ArrayRef<uint8_t>(Begin, Bytes.size()));
+ }
+ /// Feed \p Size to the hasher as eight bytes. The value is converted to
+ /// little-endian first, so the bytes hashed do not depend on host byte
+ /// order.
+ void updateSize(uint64_t Size) {
+   const uint64_t LittleEndianSize =
+       support::endian::byte_swap(Size, support::endianness::little);
+   const auto *Raw = reinterpret_cast<const uint8_t *>(&LittleEndianSize);
+   Hasher.update(ArrayRef<uint8_t>(Raw, sizeof(LittleEndianSize)));
+ }
+ BuiltinObjectHasher() = default;
+ ~BuiltinObjectHasher() = default;
+ HasherT Hasher;
+} // namespace llvm::cas
diff --git a/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h
new file mode 100644
index 00000000000000..969d097b6cecac
--- /dev/null
+++ b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h
@@ -0,0 +1,26 @@
+//===- BuiltinUnifiedCASDatabases.h -----------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/Support/Error.h"
+namespace llvm::cas {
+class ActionCache;
+class ObjectStore;
+/// Create on-disk \c ObjectStore and \c ActionCache instances based on
+/// \c ondisk::UnifiedOnDiskCache, with built-in hashing.
+Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>>
+createOnDiskUnifiedCASDatabases(StringRef Path);
+} // namespace llvm::cas
diff --git a/llvm/include/llvm/CAS/CASNodeSchema.h b/llvm/include/llvm/CAS/CASNodeSchema.h
new file mode 100644
index 00000000000000..490337857cae0f
--- /dev/null
+++ b/llvm/include/llvm/CAS/CASNodeSchema.h
@@ -0,0 +1,74 @@
+//===- llvm/CAS/CASNodeSchema.h ---------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/CASReference.h"
+#include "llvm/Support/ExtensibleRTTI.h"
+namespace llvm::cas {
+class ObjectProxy;
+/// A base class for schemas built on top of CAS nodes.
+/// TODO: Build a FilesystemSchema on top of this for reimplementing Trees on
+/// top of the CAS.
+class NodeSchema : public RTTIExtends<NodeSchema, RTTIRoot> {
+ void anchor() override;
+ static char ID;
+ /// Check if \a Node is a root (entry node) for the schema. This is a strong
+ /// check, since it requires that the first reference matches a complete
+ /// type-id DAG.
+ virtual bool isRootNode(const cas::ObjectProxy &Node) const = 0;
+ virtual bool isNode(const cas::ObjectProxy &Node) const = 0;
+ cas::ObjectStore &CAS;
+ NodeSchema(cas::ObjectStore &CAS) : CAS(CAS) {}
+ virtual ~NodeSchema() = default;
+/// Creates all the schemas and can be used to retrieve a particular schema
+/// based on a CAS root node. A client should aim to create and maximize re-use
+/// of an instance of this object.
+class SchemaPool {
+ /// Look up the schema for the provided root node. Returns \a nullptr if no
+ /// schema was found or it's not actually a root node. The returned \p
+ /// NodeSchema pointer is owned by the \p SchemaPool instance, therefore it
+ /// cannot be used beyond the \p SchemaPool instance's lifetime.
+ ///
+ /// Thread-safe.
+ NodeSchema *getSchemaForRoot(cas::ObjectProxy Node) const;
+ /// Add a schema to the pool. Ownership of \p S moves into the pool's
+ /// \c Schemas list.
+ /// NOTE(review): no synchronization is visible here; presumably schemas
+ /// must be added before concurrent \c getSchemaForRoot calls begin. Confirm.
+ void addSchema(std::unique_ptr<NodeSchema> S) {
+ Schemas.push_back(std::move(S));
+ }
+ cas::ObjectStore &getCAS() const { return CAS; }
+ explicit SchemaPool(cas::ObjectStore &CAS) : CAS(CAS) {}
+ cas::ObjectStore &CAS;
+ SmallVector<std::unique_ptr<NodeSchema>> Schemas;
+} // namespace llvm::cas
diff --git a/llvm/include/llvm/CAS/HierarchicalTreeBuilder.h b/llvm/include/llvm/CAS/HierarchicalTreeBuilder.h
new file mode 100644
index 00000000000000..11ab3ec8629eb2
--- /dev/null
+++ b/llvm/include/llvm/CAS/HierarchicalTreeBuilder.h
@@ -0,0 +1,86 @@
+//===- llvm/CAS/HierarchicalTreeBuilder.h -----------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/CASReference.h"
+#include "llvm/CAS/TreeEntry.h"
+#include "llvm/CAS/TreeSchema.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cstddef>
+namespace llvm::cas {
+class ObjectStore;
+/// Structure to facilitate building full tree hierarchies.
+class HierarchicalTreeBuilder {
+ /// One pending entry: a path, its kind, and optionally the object it refers
+ /// to. The path is copied into an owned string.
+ struct HierarchicalEntry {
+ public:
+ /// Path of the entry; the view refers to this entry's own storage.
+ StringRef getPath() const { return Path; }
+ /// Referenced object, if any.
+ std::optional<ObjectRef> getRef() const { return Ref; }
+ TreeEntry::EntryKind getKind() const { return Kind; }
+ /// \p Ref may only be omitted when \p Kind is \c TreeEntry::Tree.
+ HierarchicalEntry(std::optional<ObjectRef> Ref, TreeEntry::EntryKind Kind,
+ StringRef Path)
+ : Ref(Ref), Kind(Kind), Path(Path.str()) {
+ assert(Ref || Kind == TreeEntry::Tree);
+ }
+ private:
+ std::optional<ObjectRef> Ref;
+ TreeEntry::EntryKind Kind;
+ std::string Path;
+ };
+ /// Preallocate space for small trees, common when creating cache keys.
+ SmallVector<HierarchicalEntry, 8> Entries;
+ SmallVector<HierarchicalEntry, 0> TreeContents;
+ void pushImpl(std::optional<ObjectRef> Ref, TreeEntry::EntryKind Kind,
+ const Twine &Path);
+ /// Record \p Ref as an entry of kind \p Kind at \p Path. \p Path is
+ /// expected to start at the top level; callers working relative to some
+ /// directory should prepend it themselves.
+ ///
+ /// Paths must be POSIX-style. Each ".." component is squashed by removing
+ /// its parent. Paths that go through symlinks will not work and should be
+ /// resolved ahead of time.
+ void push(ObjectRef Ref, TreeEntry::EntryKind Kind, const Twine &Path) {
+   pushImpl(Ref, Kind, Path);
+ }
+ /// Record a directory at \p Path so that it exists in the result even when
+ /// no file is ever pushed beneath it.
+ void pushDirectory(const Twine &Path) {
+   pushImpl(std::nullopt, TreeEntry::Tree, Path);
+ }
+ /// Add a directory with specific contents. It is functionally equivalent to:
+ /// * Calling pushDirectory() for every tree
+ /// * Calling push() for every non-tree
+ ///
+ /// Allows merging the contents of multiple directories.
+ void pushTreeContent(ObjectRef Ref, const Twine &Path);
+ /// Drop all pending entries, from both \c Entries and \c TreeContents, so
+ /// the builder can be reused from an empty state.
+ void clear() {
+   Entries.clear();
+   // Also clear TreeContents so a later create() cannot pick up stale tree
+   // contents.
+   TreeContents.clear();
+ }
+ /// Recursively create the trees implied by calls to \a push() and return an
+ /// \a ObjectProxy for the top-level tree.
+ Expected<ObjectProxy> create(ObjectStore &CAS);
+} // namespace llvm::cas
diff --git a/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h b/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h
new file mode 100644
index 00000000000000..ac97158d48ddb5
--- /dev/null
+++ b/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h
@@ -0,0 +1,126 @@
+//===- MappedFileRegionBumpPtr.h --------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/FileSystem.h"
+#include <atomic>
+namespace llvm::cas {
+/// Allocator for an owned mapped file region that supports thread-safe and
+/// process-safe bump pointer allocation.
+/// This allocator is designed to create a sparse file when supported by the
+/// filesystem's \c ftruncate so that it can be used with a large maximum size.
+/// It will also attempt to shrink the underlying file down to its current
+/// allocation size when the last concurrent mapping is closed.
+/// Process-safe. Uses file locks when resizing the file during initialization
+/// and destruction.
+/// Thread-safe, assuming all threads use the same instance to talk to a given
+/// file/mapping. Unsafe to have multiple instances talking to the same file
+/// in the same process since file locks will misbehave. Clients should
+/// coordinate (somehow).
+/// \note Currently we allocate the whole file without sparseness on Windows.
+/// Provides 8-byte alignment for all allocations.
+class MappedFileRegionBumpPtr {
+ using RegionT = sys::fs::mapped_file_region;
+ /// Create a \c MappedFileRegionBumpPtr.
+ ///
+ /// \param Path the path to open the mapped region.
+ /// \param Capacity the maximum size for the mapped file region.
+ /// \param BumpPtrOffset the offset at which to store the bump pointer.
+ /// \param NewFileConstructor is for constructing new files. It has exclusive
+ /// access to the file. Must call \c initializeBumpPtr.
+ static Expected<MappedFileRegionBumpPtr>
+ create(const Twine &Path, uint64_t Capacity, int64_t BumpPtrOffset,
+ function_ref<Error(MappedFileRegionBumpPtr &)> NewFileConstructor);
+ /// Create a \c MappedFileRegionBumpPtr, shared across the process via a
+ /// singleton map.
+ ///
+ /// FIXME: Singleton map should be based on sys::fs::UniqueID, but currently
+ /// it is just based on \p Path.
+ ///
+ /// \param Path the path to open the mapped region.
+ /// \param Capacity the maximum size for the mapped file region.
+ /// \param BumpPtrOffset the offset at which to store the bump pointer.
+ /// \param NewFileConstructor is for constructing new files. It has exclusive
+ /// access to the file. Must call \c initializeBumpPtr.
+ static Expected<std::shared_ptr<MappedFileRegionBumpPtr>> createShared(
+ const Twine &Path, uint64_t Capacity, int64_t BumpPtrOffset,
+ function_ref<Error(MappedFileRegionBumpPtr &)> NewFileConstructor);
+ /// Finish initializing the bump pointer. Must be called by
+ /// \c NewFileConstructor.
+ void initializeBumpPtr(int64_t BumpPtrOffset);
+ /// Minimum alignment for allocations, currently hardcoded to 8B.
+ static constexpr Align getAlign() {
+ // Trick Align into giving us '8' as a constexpr.
+ struct alignas(8) T {};
+ static_assert(alignof(T) == 8, "Tautology failed?");
+ return Align::Of<T>();
+ }
+ /// Allocate at least \p AllocSize. Rounds up to \a getAlign().
+ char *allocate(uint64_t AllocSize) {
+ return data() + allocateOffset(AllocSize);
+ }
+ /// Allocate, returning the offset from \a data() instead of a pointer.
+ int64_t allocateOffset(uint64_t AllocSize);
+ char *data() const { return Region.data(); }
+ uint64_t size() const { return *BumpPtr; }
+ uint64_t capacity() const { return Region.size(); }
+ RegionT &getRegion() { return Region; }
+ ~MappedFileRegionBumpPtr() { destroyImpl(); }
+ MappedFileRegionBumpPtr() = default;
+ MappedFileRegionBumpPtr(MappedFileRegionBumpPtr &&RHS) { moveImpl(RHS); }
+ /// Move-assign from \p RHS: release whatever this object currently holds,
+ /// then take over the state of \p RHS. Self-assignment is a no-op.
+ MappedFileRegionBumpPtr &operator=(MappedFileRegionBumpPtr &&RHS) {
+   // Without this guard a self-move would run destroyImpl() on resources the
+   // object still owns and then swap it with itself, leaving nothing behind.
+   if (this != &RHS) {
+     destroyImpl();
+     moveImpl(RHS);
+   }
+   return *this;
+ }
+ MappedFileRegionBumpPtr(const MappedFileRegionBumpPtr &) = delete;
+ MappedFileRegionBumpPtr &operator=(const MappedFileRegionBumpPtr &) = delete;
+ void destroyImpl();
+ /// Exchange every member with \p RHS. Because this swaps rather than
+ /// overwrites, \p RHS ends up holding whatever state this object had before
+ /// the call.
+ void moveImpl(MappedFileRegionBumpPtr &RHS) {
+ std::swap(Region, RHS.Region);
+ std::swap(BumpPtr, RHS.BumpPtr);
+ std::swap(Path, RHS.Path);
+ std::swap(FD, RHS.FD);
+ std::swap(SharedLockFD, RHS.SharedLockFD);
+ }
+ RegionT Region;
+ std::atomic<int64_t> *BumpPtr = nullptr;
+ std::string Path;
+ std::optional<int> FD;
+ std::optional<int> SharedLockFD;
+} // namespace llvm::cas
diff --git a/llvm/include/llvm/CAS/ObjectStore.h b/llvm/include/llvm/CAS/ObjectStore.h
index 16c133198f13df..d8977f2d13c709 100644
--- a/llvm/include/llvm/CAS/ObjectStore.h
+++ b/llvm/include/llvm/CAS/ObjectStore.h
@@ -338,17 +338,14 @@ std::string getDefaultOnDiskCASPath();
/// schemes:
/// * InMemory CAS: mem://
/// * OnDisk CAS: file://${PATH_TO_ONDISK_CAS}
-/// * PlugIn CAS: plugin://${PATH_TO_PLUGIN}?${OPT1}=${VAL1}&${OPT2}=${VAL2}..
-/// If no URL scheme is used, it defaults to following (but might change in
-/// future)
-/// * empty string: Error!
-/// * "auto": default OnDiskCAS location
-/// * Other: path to OnDiskCAS.
/// For the plugin scheme, use argument "ondisk-path=${PATH}" to choose the
/// on-disk directory that the plugin should use, otherwise the default
/// OnDiskCAS location will be used.
/// FIXME: Need to implement proper URL encoding scheme that allows "%".
-Expected<std::shared_ptr<ObjectStore>> createCASFromIdentifier(StringRef Path);
+Expected<std::shared_ptr<ObjectStore>> createCASFromIdentifier(StringRef Id);
+/// Check if a string is a CAS identifier.
+bool isRegisteredCASIdentifier(StringRef Config);
/// Register a URL scheme to CAS Identifier.
using ObjectStoreCreateFuncTy =
diff --git a/llvm/include/llvm/CAS/OnDiskGraphDB.h b/llvm/include/llvm/CAS/OnDiskGraphDB.h
new file mode 100644
index 00000000000000..cfcaeed9fc85a0
--- /dev/null
+++ b/llvm/include/llvm/CAS/OnDiskGraphDB.h
@@ -0,0 +1,406 @@
+//===- OnDiskGraphDB.h ------------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/CAS/OnDiskHashMappedTrie.h"
+namespace llvm::cas::ondisk {
+/// 8B reference.
+class InternalRef {
+ FileOffset getFileOffset() const { return FileOffset(getRawOffset()); }
+ uint64_t getRawData() const { return Data; }
+ uint64_t getRawOffset() const { return Data; }
+ static InternalRef getFromRawData(uint64_t Data) { return InternalRef(Data); }
+ static InternalRef getFromOffset(FileOffset Offset) {
+ return InternalRef(Offset.get());
+ }
+ friend bool operator==(InternalRef LHS, InternalRef RHS) {
+ return LHS.Data == RHS.Data;
+ }
+ InternalRef(FileOffset Offset) : Data((uint64_t)Offset.get()) {}
+ InternalRef(uint64_t Data) : Data(Data) {}
+ uint64_t Data;
+/// 4B reference.
+class InternalRef4B {
+ FileOffset getFileOffset() const { return FileOffset(Data); }
+ uint32_t getRawData() const { return Data; }
+ /// Narrow \p Ref to a 4B reference. Yields \c std::nullopt when the raw
+ /// offset does not fit in 32 bits.
+ static std::optional<InternalRef4B> tryToShrink(InternalRef Ref) {
+   const uint64_t Raw = Ref.getRawOffset();
+   if (Raw <= UINT32_MAX)
+     return InternalRef4B(Raw);
+   return std::nullopt;
+ }
+ operator InternalRef() const {
+ return InternalRef::getFromOffset(getFileOffset());
+ }
+ friend class InternalRef;
+ InternalRef4B(uint32_t Data) : Data(Data) {}
+ uint32_t Data;
+/// Array of internal node references.
+class InternalRefArrayRef {
+ size_t size() const { return Size; }
+ bool empty() const { return !Size; }
+ class iterator
+ : public iterator_facade_base<iterator, std::random_access_iterator_tag,
+ const InternalRef> {
+ public:
+ bool operator==(const iterator &RHS) const { return I == RHS.I; }
+ InternalRef operator*() const {
+ if (auto *Ref = dyn_cast<const InternalRef *>(I))
+ return *Ref;
+ return InternalRef(*I.get<const InternalRef4B *>());
+ }
+ /// Order two iterators by position. Both sides must point into the same
+ /// kind of array (8B or 4B references); this is asserted.
+ bool operator<(const iterator &RHS) const {
+   assert(I.is<const InternalRef *>() == RHS.I.is<const InternalRef *>());
+   if (auto *Ref = dyn_cast<const InternalRef *>(I))
+     return Ref < RHS.I.get<const InternalRef *>();
+   // Compare the 4B pointers directly. A pointer difference would convert to
+   // `true` for any two distinct positions, including when LHS > RHS.
+   return I.get<const InternalRef4B *>() < RHS.I.get<const InternalRef4B *>();
+ }
+ ptrdiff_t operator-(const iterator &RHS) const {
+ assert(I.is<const InternalRef *>() == RHS.I.is<const InternalRef *>());
+ if (auto *Ref = dyn_cast<const InternalRef *>(I))
+ return Ref - RHS.I.get<const InternalRef *>();
+ return I.get<const InternalRef4B *>() -
+ RHS.I.get<const InternalRef4B *>();
+ }
+ iterator &operator+=(ptrdiff_t N) {
+ if (auto *Ref = dyn_cast<const InternalRef *>(I))
+ I = Ref + N;
+ else
+ I = I.get<const InternalRef4B *>() + N;
+ return *this;
+ }
+ iterator &operator-=(ptrdiff_t N) {
+ if (auto *Ref = dyn_cast<const InternalRef *>(I))
+ I = Ref - N;
+ else
+ I = I.get<const InternalRef4B *>() - N;
+ return *this;
+ }
+ InternalRef operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
+ iterator() = default;
+ uint64_t getOpaqueData() const { return uintptr_t(I.getOpaqueValue()); }
+ static iterator fromOpaqueData(uint64_t Opaque) {
+ return iterator(
+ PointerUnion<const InternalRef *,
+ const InternalRef4B *>::getFromOpaqueValue((void *)
+ Opaque));
+ }
+ private:
+ friend class InternalRefArrayRef;
+ explicit iterator(
+ PointerUnion<const InternalRef *, const InternalRef4B *> I)
+ : I(I) {}
+ PointerUnion<const InternalRef *, const InternalRef4B *> I;
+ };
+ bool operator==(const InternalRefArrayRef &RHS) const {
+ return size() == RHS.size() && std::equal(begin(), end(), RHS.begin());
+ }
+ iterator begin() const { return iterator(Begin); }
+ iterator end() const { return begin() + Size; }
+ /// Array accessor.
+ InternalRef operator[](ptrdiff_t N) const { return begin()[N]; }
+ bool is4B() const { return Begin.is<const InternalRef4B *>(); }
+ bool is8B() const { return Begin.is<const InternalRef *>(); }
+ /// View the referenced array as raw bytes. The byte length is the element
+ /// count times the element width (4B or 8B references).
+ ArrayRef<uint8_t> getBuffer() const {
+   if (is4B()) {
+     const auto *Narrow = Begin.get<const InternalRef4B *>();
+     return ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(Narrow),
+                              sizeof(InternalRef4B) * Size);
+   }
+   const auto *Wide = Begin.get<const InternalRef *>();
+   return ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(Wide),
+                            sizeof(InternalRef) * Size);
+ }
+ InternalRefArrayRef(std::nullopt_t = std::nullopt) {
+ // This is useful so that all the casts in the \p iterator functions can
+ // operate without needing to check for a null value.
+ static InternalRef PlaceHolder = InternalRef::getFromRawData(0);
+ Begin = &PlaceHolder;
+ }
+ InternalRefArrayRef(ArrayRef<InternalRef> Refs)
+ : Begin(Refs.begin()), Size(Refs.size()) {}
+ InternalRefArrayRef(ArrayRef<InternalRef4B> Refs)
+ : Begin(Refs.begin()), Size(Refs.size()) {}
+ PointerUnion<const InternalRef *, const InternalRef4B *> Begin;
+ size_t Size = 0;
+struct OnDiskContent;
+/// Reference to a node. The node's data may not be stored in the database.
+/// An \p ObjectID instance can only be used with the \p OnDiskGraphDB instance
+/// it came from. \p ObjectIDs from different \p OnDiskGraphDB instances are not
+/// comparable.
+class ObjectID {
+ uint64_t getOpaqueData() const { return Opaque; }
+ static ObjectID fromOpaqueData(uint64_t Opaque) { return ObjectID(Opaque); }
+ friend bool operator==(const ObjectID &LHS, const ObjectID &RHS) {
+ return LHS.Opaque == RHS.Opaque;
+ }
+ friend bool operator!=(const ObjectID &LHS, const ObjectID &RHS) {
+ return !(LHS == RHS);
+ }
+ explicit ObjectID(uint64_t Opaque) : Opaque(Opaque) {}
+ uint64_t Opaque;
+/// Handle for a loaded node object.
+class ObjectHandle {
+ uint64_t getOpaqueData() const { return Opaque; }
+ static ObjectHandle fromOpaqueData(uint64_t Opaque) {
+ return ObjectHandle(Opaque);
+ }
+ friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS) {
+ return LHS.Opaque == RHS.Opaque;
+ }
+ friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS) {
+ return !(LHS == RHS);
+ }
+ explicit ObjectHandle(uint64_t Opaque) : Opaque(Opaque) {}
+ uint64_t Opaque;
+class object_refs_iterator
+ : public iterator_facade_base<object_refs_iterator,
+ std::random_access_iterator_tag, ObjectID> {
+ bool operator==(const object_refs_iterator &RHS) const { return I == RHS.I; }
+ ObjectID operator*() const {
+ return ObjectID::fromOpaqueData((*I).getRawData());
+ }
+ bool operator<(const object_refs_iterator &RHS) const { return I < RHS.I; }
+ ptrdiff_t operator-(const object_refs_iterator &RHS) const {
+ return I - RHS.I;
+ }
+ object_refs_iterator &operator+=(ptrdiff_t N) {
+ I += N;
+ return *this;
+ }
+ object_refs_iterator &operator-=(ptrdiff_t N) {
+ I -= N;
+ return *this;
+ }
+ ObjectID operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
+ object_refs_iterator() = default;
+ object_refs_iterator(InternalRefArrayRef::iterator I) : I(I) {}
+ uint64_t getOpaqueData() const { return I.getOpaqueData(); }
+ static object_refs_iterator fromOpaqueData(uint64_t Opaque) {
+ return InternalRefArrayRef::iterator::fromOpaqueData(Opaque);
+ }
+ InternalRefArrayRef::iterator I;
+using object_refs_range = llvm::iterator_range<object_refs_iterator>;
+/// On-disk CAS nodes database, independent of a particular hashing algorithm.
+class OnDiskGraphDB {
+ /// Associate data & references with a particular object ID. If there is
+ /// already a record for this object the operation is a no-op.
+ ///
+ /// \param ID the object ID to associate the data & references with.
+ /// \param Refs references
+ /// \param Data data buffer.
+ Error store(ObjectID ID, ArrayRef<ObjectID> Refs, ArrayRef<char> Data);
+ /// \returns \p nullopt if the object associated with \p Ref does not exist.
+ Expected<std::optional<ObjectHandle>> load(ObjectID Ref);
+ /// \returns the hash bytes digest for the object reference.
+ ArrayRef<uint8_t> getDigest(ObjectID Ref) const {
+ return getDigest(getInternalRef(Ref));
+ }
+ /// Form a reference for the provided hash. The reference can be used as part
+ /// of a CAS object even if it's not associated with an object yet.
+ ObjectID getReference(ArrayRef<uint8_t> Hash);
+ /// Get an existing reference to the object \p Digest.
+ ///
+ /// Returns \p nullopt if the object is not stored in this CAS.
+ std::optional<ObjectID> getExistingReference(ArrayRef<uint8_t> Digest);
+ /// \returns true if the object associated with \p Ref is stored in the CAS.
+ bool containsObject(ObjectID Ref) const {
+ return containsObject(Ref, /*CheckUpstream=*/true);
+ }
+ /// \returns the data part of the provided object handle.
+ ArrayRef<char> getObjectData(ObjectHandle Node) const;
+ object_refs_range getObjectRefs(ObjectHandle Node) const {
+ InternalRefArrayRef Refs = getInternalRefs(Node);
+ return make_range(Refs.begin(), Refs.end());
+ }
+ /// \returns Total size of stored objects.
+ ///
+ /// NOTE: There's a possibility that the returned size is not including a
+ /// large object if the process crashed right at the point of inserting it.
+ size_t getStorageSize() const;
+ void print(raw_ostream &OS) const;
+ /// How to fault-in nodes if an upstream database is used.
+ enum class FaultInPolicy {
+ /// Copy only the requested node.
+ SingleNode,
+ /// Copy the entire graph of a node.
+ FullTree,
+ };
+ /// Open the on-disk store from a directory.
+ ///
+ /// \param Path directory for the on-disk store. The directory will be created
+ /// if it doesn't exist.
+ /// \param HashName Identifier name for the hashing algorithm that is going to
+ /// be used.
+ /// \param HashByteSize Size for the object digest hash bytes.
+ /// \param UpstreamDB Optional on-disk store to be used for faulting-in nodes
+ /// if they don't exist in the primary store. The upstream store is only used
+ /// for reading nodes, new nodes are only written to the primary store.
+ /// \param Policy If \p UpstreamDB is provided, controls how nodes are copied
+ /// to primary store. This is recorded at creation time and subsequent opens
+ /// need to pass the same policy otherwise the \p open will fail.
+ static Expected<std::unique_ptr<OnDiskGraphDB>>
+ open(StringRef Path, StringRef HashName, unsigned HashByteSize,
+ std::unique_ptr<OnDiskGraphDB> UpstreamDB = nullptr,
+ FaultInPolicy Policy = FaultInPolicy::FullTree);
+ ~OnDiskGraphDB();
+ struct IndexProxy;
+ class TempFile;
+ class MappedTempFile;
+ bool containsObject(ObjectID Ref, bool CheckUpstream) const;
+ /// When \p load is called for a node that doesn't exist, this function tries
+ /// to load it from the upstream store and copy it to the primary one.
+ Expected<std::optional<ObjectHandle>> faultInFromUpstream(ObjectID PrimaryID);
+ Error importFullTree(ObjectID PrimaryID, ObjectHandle UpstreamNode);
+ Error importSingleNode(ObjectID PrimaryID, ObjectHandle UpstreamNode);
+ IndexProxy indexHash(ArrayRef<uint8_t> Hash);
+ Error createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data);
+ Expected<MappedTempFile> createTempFile(StringRef FinalPath, uint64_t Size);
+ OnDiskContent getContentFromHandle(ObjectHandle H) const;
+ static InternalRef getInternalRef(ObjectID Ref) {
+ return InternalRef::getFromRawData(Ref.getOpaqueData());
+ }
+ static ObjectID getExternalReference(InternalRef Ref) {
+ return ObjectID::fromOpaqueData(Ref.getRawData());
+ }
+ static ObjectID getExternalReference(const IndexProxy &I);
+ void getStandalonePath(StringRef FileSuffix, const IndexProxy &I,
+ SmallVectorImpl<char> &Path) const;
+ ArrayRef<uint8_t> getDigest(InternalRef Ref) const;
+ ArrayRef<uint8_t> getDigest(const IndexProxy &I) const;
+ IndexProxy getIndexProxyFromRef(InternalRef Ref) const;
+ static InternalRef makeInternalRef(FileOffset IndexOffset);
+ IndexProxy
+ getIndexProxyFromPointer(OnDiskHashMappedTrie::const_pointer P) const;
+ InternalRefArrayRef getInternalRefs(ObjectHandle Node) const;
+ void recordStandaloneSizeIncrease(size_t SizeIncrease);
+ std::atomic<uint64_t> &getStandaloneStorageSize();
+ uint64_t getStandaloneStorageSize() const;
+ OnDiskGraphDB(StringRef RootPath, OnDiskHashMappedTrie Index,
+ OnDiskDataAllocator DataPool,
+ std::unique_ptr<OnDiskGraphDB> UpstreamDB,
+ FaultInPolicy Policy);
+ /// Mapping from hash to object reference.
+ ///
+ /// Data type is TrieRecord.
+ OnDiskHashMappedTrie Index;
+ /// Storage for most objects.
+ ///
+ /// Data type is DataRecordHandle.
+ OnDiskDataAllocator DataPool;
+ void *StandaloneData; // a StandaloneDataMap.
+ std::string RootPath;
+ /// Optional on-disk store to be used for faulting-in nodes.
+ std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+ FaultInPolicy FIPolicy;
+} // namespace llvm::cas::ondisk
diff --git a/llvm/include/llvm/CAS/OnDiskHashMappedTrie.h b/llvm/include/llvm/CAS/OnDiskHashMappedTrie.h
new file mode 100644
index 00000000000000..410e5d955eec2b
--- /dev/null
+++ b/llvm/include/llvm/CAS/OnDiskHashMappedTrie.h
@@ -0,0 +1,391 @@
+//===- OnDiskHashMappedTrie.h -----------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/FileSystem.h"
+#include <atomic>
+#include <mutex>
+#include <optional>
+namespace llvm {
+class MemoryBuffer;
+class raw_ostream;
+namespace cas {
+/// Thin wrapper around a signed 64-bit offset. A default-constructed instance
+/// holds 0, which converts to \c false.
+class FileOffset {
+ /// \returns the raw offset value.
+ int64_t get() const { return Offset; }
+ /// True when the stored offset is non-zero.
+ explicit operator bool() const { return Offset; }
+ FileOffset() = default;
+ /// Wraps \p Offset; asserts that it is not negative.
+ explicit FileOffset(int64_t Offset) : Offset(Offset) { assert(Offset >= 0); }
+ int64_t Offset = 0;
+/// On-disk hash-mapped trie. Thread-safe / lock-free.
+/// This is an on-disk, (mostly) thread-safe key-value store that is (mostly)
+/// lock-free. The keys are fixed length, and are expected to be binary hashes
+/// with a normal distribution.
+/// - Thread-safety is achieved through the use of atomics within a shared
+/// memory mapping. Atomic access does not work on networked filesystems.
+/// - Filesystem locks are used, but only sparingly:
+/// - during initialization, for creating / opening an existing store;
+/// - for the lifetime of the instance, a shared/reader lock is held
+/// - during destruction, if there are no concurrent readers, to shrink the
+/// files to their minimum size.
+/// - Path is used as a directory:
+/// - "index" stores the root trie and subtries.
+/// - "data" stores (most of) the entries, like a bump-ptr-allocator.
+/// - Large entries are stored externally in a file named by the key.
+/// - Code is system-dependent (Windows not yet implemented), and binary format
+/// itself is not portable. These are not artifacts that can/should be moved
+/// between different systems; they are only appropriate for local storage.
+/// FIXME: Add support for storing top-level metadata or identifiers that can
+/// be created / read during initialization.
+/// FIXME: Implement for Windows. See comment next to implementation of \a
+/// OnDiskHashMappedTrie::MappedFileInfo::open().
+class OnDiskHashMappedTrie {
+ LLVM_DUMP_METHOD void dump() const;
+ void
+ print(raw_ostream &OS,
+ function_ref<void(ArrayRef<char>)> PrintRecordData = nullptr) const;
+ /// Read-only view of a record: the hash bytes plus the data bytes. Both
+ /// members are non-owning \c ArrayRef views.
+ struct ConstValueProxy {
+ ConstValueProxy() = default;
+ ConstValueProxy(ArrayRef<uint8_t> Hash, ArrayRef<char> Data)
+ : Hash(Hash), Data(Data) {}
+ /// Convenience overload that views the characters of \p Data.
+ ConstValueProxy(ArrayRef<uint8_t> Hash, StringRef Data)
+ : Hash(Hash), Data(Data.begin(), Data.size()) {}
+ ArrayRef<uint8_t> Hash;
+ ArrayRef<char> Data;
+ };
+ /// View of a record whose data bytes are writable; the hash stays read-only.
+ struct ValueProxy {
+ /// Implicitly degrades to the read-only view of the same hash and data.
+ operator ConstValueProxy() const { return ConstValueProxy(Hash, Data); }
+ ValueProxy() = default;
+ ValueProxy(ArrayRef<uint8_t> Hash, MutableArrayRef<char> Data)
+ : Hash(Hash), Data(Data) {}
+ ArrayRef<uint8_t> Hash;
+ MutableArrayRef<char> Data;
+ };
+ /// A (pointer, 16-bit, 16-bit) triple that can be smuggled through the
+ /// fields of a value proxy and recovered again later.
+ struct HintT {
+ /// Encode the hint in a \a ValueProxy: \c P becomes the data pointer, \c I
+ /// the data size, and \c B the size of a hash range whose pointer is null.
+ explicit operator ValueProxy() const {
+ ValueProxy Value;
+ Value.Data = MutableArrayRef<char>(
+ const_cast<char *>(reinterpret_cast<const char *>(P)), I);
+ Value.Hash = ArrayRef<uint8_t>(nullptr, B);
+ return Value;
+ }
+ /// Decode a hint previously encoded by the conversion operator above.
+ explicit HintT(ConstValueProxy Value)
+ : P(Value.Data.data()), I(Value.Data.size()), B(Value.Hash.size()) {
+ // Spot-check that this really was a hint.
+ assert(Value.Data.size() <= UINT16_MAX);
+ assert(Value.Hash.size() <= UINT16_MAX);
+ assert(Value.Hash.data() == nullptr);
+ }
+ HintT(const void *P, uint16_t I, uint16_t B) : P(P), I(I), B(B) {}
+ // PointerImpl::getHint() compares P against the address of the trie.
+ const void *P = nullptr;
+ uint16_t I = 0;
+ uint16_t B = 0;
+ };
+ /// Common implementation of \a pointer and \a const_pointer. Stores either
+ /// a value proxy or an encoded \a HintT, together with a file offset that is
+ /// split into a low 32-bit part and a high 16-bit part.
+ template <class ProxyT> class PointerImpl {
+ public:
+ /// Reassemble the file offset from its two stored parts.
+ FileOffset getOffset() const {
+ return FileOffset(OffsetLow32 | (uint64_t)OffsetHigh16 << 32);
+ }
+ /// True only when this holds a value; hints and null pointers are false.
+ explicit operator bool() const { return IsValue; }
+ const ProxyT &operator*() const {
+ assert(IsValue);
+ return ValueOrHint;
+ }
+ const ProxyT *operator->() const {
+ assert(IsValue);
+ return &ValueOrHint;
+ }
+ PointerImpl() = default;
+ protected:
+ /// Construct a pointer to a value located at \p Offset.
+ PointerImpl(FileOffset Offset, ProxyT Value)
+ : PointerImpl(Value, Offset, /*IsHint=*/false, /*IsValue=*/true) {}
+ /// Construct a hint-only pointer; \p H is encoded into the proxy storage.
+ explicit PointerImpl(FileOffset Offset, HintT H)
+ : PointerImpl(ValueProxy(H), Offset, /*IsHint=*/true,
+ /*IsValue=*/false) {}
+ PointerImpl(ProxyT ValueOrHint, FileOffset Offset, bool IsHint,
+ bool IsValue)
+ : ValueOrHint(ValueOrHint), OffsetLow32((uint64_t)Offset.get()),
+ OffsetHigh16((uint64_t)Offset.get() >> 32), IsHint(IsHint),
+ IsValue(IsValue) {
+ checkOffset(Offset);
+ }
+ /// Asserts that \p Offset is strictly positive and fits in 48 bits.
+ static void checkOffset(FileOffset Offset) {
+ assert(Offset.get() > 0);
+ assert((uint64_t)Offset.get() < (1LL << 48));
+ }
+ /// \returns the decoded hint if this is a hint whose pointer field is the
+ /// address of \p This, otherwise \c std::nullopt.
+ std::optional<HintT> getHint(const OnDiskHashMappedTrie &This) const {
+ if (!IsHint)
+ return std::nullopt;
+ HintT H(ValueOrHint);
+ assert(H.P == &This && "Expected hint to be for This");
+ // Repeat the check so that builds without assertions reject a foreign hint.
+ if (H.P != &This)
+ return std::nullopt;
+ return H;
+ }
+ ProxyT ValueOrHint;
+ uint32_t OffsetLow32 = 0;
+ uint16_t OffsetHigh16 = 0;
+ bool IsHint = false;
+ bool IsValue = false;
+ };
+ class pointer;
+ /// Pointer-like handle exposing a record through a \a ConstValueProxy. The
+ /// non-default constructors are inherited from \a PointerImpl, where they are
+ /// protected; \a pointer and \a OnDiskHashMappedTrie are friends.
+ class const_pointer : public PointerImpl<ConstValueProxy> {
+ public:
+ const_pointer() = default;
+ private:
+ friend class pointer;
+ friend class OnDiskHashMappedTrie;
+ using const_pointer::PointerImpl::PointerImpl;
+ };
+ /// Pointer-like handle exposing a record through a \a ValueProxy, i.e. with
+ /// writable data. The non-default constructors are inherited from
+ /// \a PointerImpl, where they are protected; \a OnDiskHashMappedTrie is a
+ /// friend.
+ class pointer : public PointerImpl<ValueProxy> {
+ public:
+   /// Implicit conversion to the read-only pointer type.
+   ///
+   /// A null pointer carries a zero offset, which the offset-taking
+   /// constructor rejects through \c checkOffset(). Convert it to a
+   /// default-constructed \a const_pointer instead of tripping that assertion.
+   operator const_pointer() const {
+     if (!getOffset())
+       return const_pointer();
+     return const_pointer(ValueOrHint, getOffset(), IsHint, IsValue);
+   }
+   pointer() = default;
+ private:
+   friend class OnDiskHashMappedTrie;
+   using pointer::PointerImpl::PointerImpl;
+ };
+ /// Turn \p CP into its mutable counterpart. A hint stays a hint, a null
+ /// pointer stays null, and a value gets a writable view of the same bytes.
+ pointer getMutablePointer(const_pointer CP) {
+   std::optional<HintT> Hint = CP.getHint(*this);
+   if (Hint)
+     return pointer(CP.getOffset(), *Hint);
+   if (!CP)
+     return pointer();
+   // Cast away constness of the data bytes; the hash view stays read-only.
+   MutableArrayRef<char> Writable(const_cast<char *>(CP->Data.data()),
+                                  CP->Data.size());
+   return pointer(CP.getOffset(), ValueProxy(CP->Hash, Writable));
+ }
+ const_pointer find(ArrayRef<uint8_t> Hash) const;
+ /// Mutable lookup: runs the const \a find() and converts its result.
+ pointer find(ArrayRef<uint8_t> Hash) {
+   const OnDiskHashMappedTrie &Self = *this;
+   return getMutablePointer(Self.find(Hash));
+ }
+ const_pointer recoverFromHashPointer(const uint8_t *HashBegin) const;
+ /// Mutable variant: runs the const \a recoverFromHashPointer() and converts
+ /// its result.
+ pointer recoverFromHashPointer(const uint8_t *HashBegin) {
+   const OnDiskHashMappedTrie &Self = *this;
+   return getMutablePointer(Self.recoverFromHashPointer(HashBegin));
+ }
+ const_pointer recoverFromFileOffset(FileOffset Offset) const;
+ /// Mutable variant: runs the const \a recoverFromFileOffset() and converts
+ /// its result.
+ pointer recoverFromFileOffset(FileOffset Offset) {
+   const OnDiskHashMappedTrie &Self = *this;
+   return getMutablePointer(Self.recoverFromFileOffset(Offset));
+ }
+ using LazyInsertOnConstructCB =
+ function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue)>;
+ using LazyInsertOnLeakCB =
+ function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue,
+ FileOffset FinalOffset, ValueProxy FinalValue)>;
+ /// Insert lazily.
+ ///
+ /// \p OnConstruct is called when ready to insert a value, after allocating
+ /// space for the data. It is called at most once.
+ ///
+ /// \p OnLeak is called only if \p OnConstruct has been called and a race
+ /// occurred before insertion, causing the tentative offset and data to be
+ /// abandoned. This allows clients to clean up other results or update any
+ /// references.
+ ///
+ /// NOTE: Does *not* guarantee that \p OnConstruct is only called on success.
+ /// The in-memory \a HashMappedTrie uses LazyAtomicPointer to synchronize
+ /// simultaneous writes, but that seems dangerous to use in a memory-mapped
+ /// file in case a process crashes in the busy state.
+ pointer insertLazy(const_pointer Hint, ArrayRef<uint8_t> Hash,
+ LazyInsertOnConstructCB OnConstruct = nullptr,
+ LazyInsertOnLeakCB OnLeak = nullptr);
+ /// Hint-less overload: forwards to the hinted \a insertLazy() with a
+ /// default-constructed hint.
+ pointer insertLazy(ArrayRef<uint8_t> Hash,
+                    LazyInsertOnConstructCB OnConstruct = nullptr,
+                    LazyInsertOnLeakCB OnLeak = nullptr) {
+   const_pointer NoHint;
+   return insertLazy(NoHint, Hash, OnConstruct, OnLeak);
+ }
+ /// Insert \p Value under its hash, using \p Hint as the starting point.
+ /// The data bytes are copied into the allocated record through the
+ /// on-construct callback of \a insertLazy().
+ pointer insert(const_pointer Hint, const ConstValueProxy &Value) {
+   auto CopyPayload = [&Value](FileOffset, ValueProxy Allocated) {
+     assert(Allocated.Hash == Value.Hash);
+     assert(Allocated.Data.size() == Value.Data.size());
+     llvm::copy(Value.Data, Allocated.Data.begin());
+   };
+   return insertLazy(Hint, Value.Hash, CopyPayload);
+ }
+ /// Hint-less overload: forwards to the hinted \a insert() with a
+ /// default-constructed hint.
+ pointer insert(const ConstValueProxy &Value) {
+   const_pointer NoHint;
+   return insert(NoHint, Value);
+ }
+ size_t size() const;
+ /// Gets or creates a file at \p Path with a hash-mapped trie named \p
+ /// TrieName. The hash size is \p NumHashBits (in bits) and the records store
+ /// data of size \p DataSize (in bytes).
+ ///
+ /// \p MaxFileSize controls the maximum file size to support, limiting the
+ /// size of the \a mapped_file_region. \p NewFileInitialSize is the starting
+ /// size if a new file is created.
+ ///
+ /// \p NewTableNumRootBits and \p NewTableNumSubtrieBits are hints to
+ /// configure the trie, if it doesn't already exist.
+ ///
+ /// \pre NumHashBits is a multiple of 8 (byte-aligned).
+ ///
+ /// TODO: Expose the internal DatabaseFile abstraction and add support for
+ /// adding more tables to a single file.
+ ///
+ /// FIXME: Rename to getOrCreate().
+ static Expected<OnDiskHashMappedTrie>
+ create(const Twine &Path, const Twine &TrieName, size_t NumHashBits,
+ uint64_t DataSize, uint64_t MaxFileSize,
+ std::optional<uint64_t> NewFileInitialSize,
+ std::optional<size_t> NewTableNumRootBits = std::nullopt,
+ std::optional<size_t> NewTableNumSubtrieBits = std::nullopt);
+ OnDiskHashMappedTrie(OnDiskHashMappedTrie &&RHS);
+ OnDiskHashMappedTrie &operator=(OnDiskHashMappedTrie &&RHS);
+ ~OnDiskHashMappedTrie();
+ struct ImplType;
+ explicit OnDiskHashMappedTrie(std::unique_ptr<ImplType> Impl);
+ std::unique_ptr<ImplType> Impl;
+/// Sink for data. Stores variable length data with 8-byte alignment. Does not
+/// track size of data, which is assumed to be known from context, or embedded.
+/// Uses 0-padding but does not guarantee 0-termination.
+class OnDiskDataAllocator {
+ using ValueProxy = MutableArrayRef<char>;
+ /// An iterator-like return value for data insertion. Maybe it should be
+ /// called \c iterator, but it has no increment.
+ /// Pairs a file offset with a mutable view of the bytes. A pointer whose
+ /// offset is zero is null and must not be dereferenced.
+ class pointer {
+ public:
+ FileOffset getOffset() const { return Offset; }
+ /// True when the offset is non-zero.
+ explicit operator bool() const { return bool(getOffset()); }
+ const ValueProxy &operator*() const {
+ assert(Offset && "Null dereference");
+ return Value;
+ }
+ const ValueProxy *operator->() const {
+ assert(Offset && "Null dereference");
+ return &Value;
+ }
+ pointer() = default;
+ private:
+ // Only the allocator itself can create non-null pointers.
+ friend class OnDiskDataAllocator;
+ pointer(FileOffset Offset, ValueProxy Value)
+ : Offset(Offset), Value(Value) {}
+ FileOffset Offset;
+ ValueProxy Value;
+ };
+ // Look up the data stored at the given offset.
+ const char *beginData(FileOffset Offset) const;
+ /// Mutable variant: runs the const \a beginData() and casts away constness.
+ char *beginData(FileOffset Offset) {
+   const OnDiskDataAllocator &Self = *this;
+   return const_cast<char *>(Self.beginData(Offset));
+ }
+ pointer allocate(size_t Size);
+ /// Allocate room for \p Data, copy the bytes in, and return the pointer to
+ /// the new allocation.
+ pointer save(ArrayRef<char> Data) {
+   pointer Allocated = allocate(Data.size());
+   char *Dest = (*Allocated).begin();
+   llvm::copy(Data, Dest);
+   return Allocated;
+ }
+ /// String overload: stores the characters of \p Data via the \c ArrayRef
+ /// overload.
+ pointer save(StringRef Data) {
+   ArrayRef<char> Bytes(Data.begin(), Data.size());
+   return save(Bytes);
+ }
+ /// \returns the buffer that was allocated at \p create time, with size
+ /// \p UserHeaderSize.
+ MutableArrayRef<uint8_t> getUserHeader();
+ size_t size() const;
+ static Expected<OnDiskDataAllocator>
+ create(const Twine &Path, const Twine &TableName, uint64_t MaxFileSize,
+ std::optional<uint64_t> NewFileInitialSize,
+ uint32_t UserHeaderSize = 0,
+ function_ref<void(void *)> UserHeaderInit = nullptr);
+ OnDiskDataAllocator(OnDiskDataAllocator &&RHS);
+ OnDiskDataAllocator &operator=(OnDiskDataAllocator &&RHS);
+ // No copy. Just call \a create() again.
+ OnDiskDataAllocator(const OnDiskDataAllocator &) = delete;
+ OnDiskDataAllocator &operator=(const OnDiskDataAllocator &) = delete;
+ ~OnDiskDataAllocator();
+ struct ImplType;
+ explicit OnDiskDataAllocator(std::unique_ptr<ImplType> Impl);
+ std::unique_ptr<ImplType> Impl;
+} // namespace cas
+} // namespace llvm
diff --git a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
new file mode 100644
index 00000000000000..a6b4b12491c4dd
--- /dev/null
+++ b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
@@ -0,0 +1,63 @@
+//===- OnDiskKeyValueDB.h ---------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/OnDiskHashMappedTrie.h"
+namespace llvm::cas::ondisk {
+/// An on-disk key-value data store with the following properties:
+/// * Keys are fixed length binary hashes with expected normal distribution.
+/// * Values are buffers of the same size, specified at creation time.
+/// * The value of a key cannot be changed once it is set.
+/// * The value buffers returned from a key lookup have 8-byte alignment.
+class OnDiskKeyValueDB {
+ /// Associate a value with a key.
+ ///
+ /// \param Key the hash bytes for the key
+ /// \param Value the value bytes, same size as \p ValueSize parameter of
+ /// \p open call.
+ ///
+ /// \returns the value associated with the \p Key. It may be different than
+ /// \p Value if another value is already associated with this key.
+ Expected<ArrayRef<char>> put(ArrayRef<uint8_t> Key, ArrayRef<char> Value);
+ /// \returns the value associated with the \p Key, or \p std::nullopt if the
+ /// key does not exist.
+ Expected<std::optional<ArrayRef<char>>> get(ArrayRef<uint8_t> Key);
+ /// \returns Total size of stored data.
+ size_t getStorageSize() const { return Cache.size(); }
+ /// Open the on-disk store from a directory.
+ ///
+ /// \param Path directory for the on-disk store. The directory will be created
+ /// if it doesn't exist.
+ /// \param HashName Identifier name for the hashing algorithm that is going to
+ /// be used.
+ /// \param KeySize Size for the key hash bytes.
+ /// \param ValueName Identifier name for the values.
+ /// \param ValueSize Size for the value bytes.
+ static Expected<std::unique_ptr<OnDiskKeyValueDB>>
+ open(StringRef Path, StringRef HashName, unsigned KeySize,
+ StringRef ValueName, size_t ValueSize);
+ OnDiskKeyValueDB(size_t ValueSize, OnDiskHashMappedTrie Cache)
+ : ValueSize(ValueSize), Cache(std::move(Cache)) {}
+ const size_t ValueSize;
+ OnDiskHashMappedTrie Cache;
+} // namespace llvm::cas::ondisk
diff --git a/llvm/include/llvm/CAS/TreeEntry.h b/llvm/include/llvm/CAS/TreeEntry.h
new file mode 100644
index 00000000000000..ab50986cf482d2
--- /dev/null
+++ b/llvm/include/llvm/CAS/TreeEntry.h
@@ -0,0 +1,71 @@
+//===- llvm/CAS/TreeEntry.h -------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/CASReference.h"
+namespace llvm::cas {
+class ObjectStore;
+class TreeEntry {
+ /// The kind of filesystem object a tree entry refers to.
+ enum EntryKind {
+ Regular, ///< A file.
+ Executable, ///< A file that's executable.
+ Symlink, ///< A symbolic link.
+ Tree, ///< A filesystem tree.
+ };
+ EntryKind getKind() const { return Kind; }
+ bool isRegular() const { return Kind == Regular; }
+ bool isExecutable() const { return Kind == Executable; }
+ bool isFile() const { return isRegular() || isExecutable(); }
+ bool isSymlink() const { return Kind == Symlink; }
+ bool isTree() const { return Kind == Tree; }
+ ObjectRef getRef() const { return Ref; }
+ /// Two entries are equal when both their kind and their reference match.
+ friend bool operator==(const TreeEntry &LHS, const TreeEntry &RHS) {
+   if (LHS.Kind != RHS.Kind)
+     return false;
+   return LHS.Ref == RHS.Ref;
+ }
+ TreeEntry(ObjectRef Ref, EntryKind Kind) : Kind(Kind), Ref(Ref) {}
+ EntryKind Kind;
+ ObjectRef Ref;
+class NamedTreeEntry : public TreeEntry {
+ StringRef getName() const { return Name; }
+ /// Named entries are equal when the underlying \a TreeEntry parts compare
+ /// equal and the names match.
+ friend bool operator==(const NamedTreeEntry &LHS, const NamedTreeEntry &RHS) {
+   const TreeEntry &Base = LHS;
+   return Base == RHS && LHS.Name == RHS.Name;
+ }
+ /// Orders entries by name only; kind and reference do not take part, so two
+ /// entries that are not \c == can still be equivalent under this ordering.
+ friend bool operator<(const NamedTreeEntry &LHS, const NamedTreeEntry &RHS) {
+ return LHS.Name < RHS.Name;
+ }
+ NamedTreeEntry(ObjectRef Ref, EntryKind Kind, StringRef Name)
+ : TreeEntry(Ref, Kind), Name(Name) {}
+ void print(raw_ostream &OS, ObjectStore &CAS) const;
+ StringRef Name;
+} // namespace llvm::cas
diff --git a/llvm/include/llvm/CAS/TreeSchema.h b/llvm/include/llvm/CAS/TreeSchema.h
new file mode 100644
index 00000000000000..ff796bc8c215f9
--- /dev/null
+++ b/llvm/include/llvm/CAS/TreeSchema.h
@@ -0,0 +1,125 @@
+//===- llvm/CAS/TreeSchema.h ------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/CASNodeSchema.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/CAS/TreeEntry.h"
+namespace llvm::cas {
+class TreeProxy;
+class TreeSchema : public RTTIExtends<TreeSchema, NodeSchema> {
+ void anchor() override;
+ static char ID;
+ bool isRootNode(const ObjectProxy &Node) const final {
+ return false; // TreeSchema doesn't have a root node.
+ }
+ bool isNode(const ObjectProxy &Node) const final;
+ TreeSchema(ObjectStore &CAS);
+ size_t getNumTreeEntries(TreeProxy Tree) const;
+ Error
+ forEachTreeEntry(TreeProxy Tree,
+ function_ref<Error(const NamedTreeEntry &)> Callback) const;
+ /// Visit each file entry in order, returning an error from \p Callback to
+ /// stop early.
+ ///
+ /// The \p NamedTreeEntry, that the \p Callback receives, points to a name
+ /// string that may not live beyond the return of the callback function.
+ ///
+ /// Passes the \p TreeProxy if the entry is a \p TreeEntry::Tree,
+ /// otherwise passes \p std::nullopt.
+ Error walkFileTreeRecursively(
+ ObjectStore &CAS, ObjectRef Root,
+ function_ref<Error(const NamedTreeEntry &, std::optional<TreeProxy>)>
+ Callback);
+ std::optional<size_t> lookupTreeEntry(TreeProxy Tree, StringRef Name) const;
+ NamedTreeEntry loadTreeEntry(TreeProxy Tree, size_t I) const;
+ Expected<TreeProxy> load(ObjectRef Object) const;
+ Expected<TreeProxy> load(ObjectProxy Object) const;
+ Expected<TreeProxy> create(ArrayRef<NamedTreeEntry> Entries = std::nullopt);
+ static constexpr StringLiteral SchemaName = "llvm::cas::schema::tree::v1";
+ std::optional<ObjectRef> TreeKindRef;
+ friend class TreeProxy;
+ ObjectRef getKindRef() const;
+class TreeProxy : public ObjectProxy {
+ static Expected<TreeProxy> get(const TreeSchema &Schema,
+ Expected<ObjectProxy> Ref);
+ static Expected<TreeProxy> create(TreeSchema &Schema,
+ ArrayRef<NamedTreeEntry> Entries);
+ const TreeSchema &getSchema() const { return *Schema; }
+ bool operator==(const TreeProxy &RHS) const {
+ return Schema == RHS.Schema && cas::CASID(*this) == cas::CASID(RHS);
+ }
+ Error
+ forEachEntry(function_ref<Error(const NamedTreeEntry &)> Callback) const {
+ return Schema->forEachTreeEntry(*this, Callback);
+ }
+ bool empty() const { return size() == 0; }
+ size_t size() const { return Schema->getNumTreeEntries(*this); }
+ /// Find the entry called \p Name in this tree.
+ ///
+ /// \returns the entry, or \c std::nullopt when the schema reports no index
+ /// for \p Name.
+ std::optional<NamedTreeEntry> lookup(StringRef Name) const {
+   std::optional<size_t> Index = Schema->lookupTreeEntry(*this, Name);
+   if (!Index)
+     return std::nullopt;
+   return get(*Index);
+ }
+ StringRef getName(size_t I) const;
+ NamedTreeEntry get(size_t I) const { return Schema->loadTreeEntry(*this, I); }
+ TreeProxy() = delete;
+ TreeProxy(const TreeSchema &Schema, const ObjectProxy &Node)
+ : ObjectProxy(Node), Schema(&Schema) {}
+ /// Helper for building a \a TreeProxy: obtained from \a startNode() and
+ /// completed by \a build(). The constructor is private, so \a startNode() is
+ /// the way to get an instance.
+ class Builder {
+ public:
+ static Expected<Builder> startNode(TreeSchema &Schema);
+ Expected<TreeProxy> build(ArrayRef<NamedTreeEntry> Entries);
+ private:
+ Builder(const TreeSchema &Schema) : Schema(&Schema) {}
+ const TreeSchema *Schema;
+ public:
+ // Publicly accessible scratch storage.
+ // NOTE(review): presumably filled in by build() -- confirm in the .cpp.
+ SmallString<256> Data;
+ SmallVector<ObjectRef, 16> Refs;
+ };
+ const TreeSchema *Schema;
+} // namespace llvm::cas
diff --git a/llvm/include/llvm/CAS/UnifiedOnDiskCache.h b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h
new file mode 100644
index 00000000000000..9c076cdf5fd6e3
--- /dev/null
+++ b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h
@@ -0,0 +1,140 @@
+//===- UnifiedOnDiskCache.h -------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/OnDiskGraphDB.h"
+namespace llvm::cas::ondisk {
+class OnDiskKeyValueDB;
+/// A unified CAS nodes and key-value database, using on-disk storage for both.
+/// It manages storage growth and provides APIs for garbage collection.
+/// High-level properties:
+/// * While \p UnifiedOnDiskCache is open on a directory, by any process, the
+/// storage size in that directory will keep growing unrestricted. For data to
+/// become eligible for garbage-collection there should be no open instances
+/// of \p UnifiedOnDiskCache for that directory, by any process.
+/// * Garbage-collection needs to be triggered explicitly by the client. It can
+/// be triggered on a directory concurrently, at any time and by any process,
+/// without affecting any active readers/writers, in the same process or other
+/// processes.
+/// Usage patterns should be that an instance of \p UnifiedOnDiskCache is open
+/// for a limited period of time, e.g. for the duration of a build operation.
+/// For long-living processes that need periodic access to a
+/// \p UnifiedOnDiskCache, the client should devise a scheme where access is
+/// performed within some defined period. For example, if a service is designed
+/// to continuously wait for requests that access a \p UnifiedOnDiskCache, it
+/// could keep the instance alive while new requests are coming in but close it
+/// after a time period in which there are no new requests.
+class UnifiedOnDiskCache {
+ /// The \p OnDiskGraphDB instance for the open directory.
+ OnDiskGraphDB &getGraphDB() { return *PrimaryGraphDB; }
+ /// Associate an \p ObjectID, of the \p OnDiskGraphDB instance, with a key.
+ ///
+ /// \param Key the hash bytes for the key.
+ /// \param Value the \p ObjectID value.
+ ///
+ /// \returns the \p ObjectID associated with the \p Key. It may be different
+ /// than \p Value if another value was already associated with this key.
+ Expected<ObjectID> KVPut(ArrayRef<uint8_t> Key, ObjectID Value);
+ /// Associate an \p ObjectID, of the \p OnDiskGraphDB instance, with a key.
+ /// An \p ObjectID as a key is equivalent to its digest bytes.
+ ///
+ /// \param Key the \p ObjectID for the key.
+ /// \param Value the \p ObjectID value.
+ ///
+ /// \returns the \p ObjectID associated with the \p Key. It may be different
+ /// than \p Value if another value was already associated with this key.
+ Expected<ObjectID> KVPut(ObjectID Key, ObjectID Value);
+ /// \returns the \p ObjectID, of the \p OnDiskGraphDB instance, associated
+ /// with the \p Key, or \p std::nullopt if the key does not exist.
+ Expected<std::optional<ObjectID>> KVGet(ArrayRef<uint8_t> Key);
+ /// Open a \p UnifiedOnDiskCache instance for a directory.
+ ///
+ /// \param Path directory for the on-disk database. The directory will be
+ /// created if it doesn't exist.
+ /// \param SizeLimit Optional size for limiting growth. This has an effect for
+ /// when the instance is closed.
+ /// \param HashName Identifier name for the hashing algorithm that is going to
+ /// be used.
+ /// \param HashByteSize Size for the object digest hash bytes.
+ /// \param FaultInPolicy Controls how nodes are copied to primary store. This
+ /// is recorded at creation time and subsequent opens need to pass the same
+ /// policy otherwise the \p open will fail.
+ static Expected<std::unique_ptr<UnifiedOnDiskCache>>
+ open(StringRef Path, std::optional<uint64_t> SizeLimit, StringRef HashName,
+ unsigned HashByteSize,
+ OnDiskGraphDB::FaultInPolicy FaultInPolicy =
+ OnDiskGraphDB::FaultInPolicy::FullTree);
+ /// This is called implicitly at destruction time, so it is not required for a
+ /// client to call this. After calling \p close the only method that is valid
+ /// to call is \p needsGarbaseCollection.
+ ///
+ /// \param CheckSizeLimit if true it will check whether the primary store has
+ /// exceeded its intended size limit. If false the check is skipped even if a
+ /// \p SizeLimit was passed to the \p open call.
+ Error close(bool CheckSizeLimit = true);
+ /// \returns whether the primary store has exceeded the intended size limit.
+ /// This can return false even if the overall size of the opened directory is
+ /// over the \p SizeLimit passed to \p open. To know whether garbage
+ /// collection needs to be triggered or not, call \p needsGarbaseCollection.
+ bool hasExceededSizeLimit() const;
+ /// \returns whether there are unused data that can be deleted using a
+ /// \p collectGarbage call.
+ bool needsGarbaseCollection() const { return NeedsGarbageCollection; }
+ /// Remove any unused data from the directory at \p Path. If there are no such
+ /// data the operation is a no-op.
+ ///
+ /// This can be called concurrently, regardless of whether there is an open
+ /// \p UnifiedOnDiskCache instance or not; it has no effect on readers/writers
+ /// in the same process or other processes.
+ ///
+ /// It is recommended that garbage-collection is triggered concurrently in the
+ /// background, so that it has minimal effect on the workload of the process.
+ static Error collectGarbage(StringRef Path);
+ ~UnifiedOnDiskCache();
+ UnifiedOnDiskCache();
+ Expected<std::optional<ObjectID>>
+ faultInFromUpstreamKV(ArrayRef<uint8_t> Key);
+ std::string RootPath;
+ std::optional<uint64_t> SizeLimit;
+ int LockFD = -1;
+ /// Flag reported by \a needsGarbaseCollection(). Explicitly initialized,
+ /// like the neighbouring members: before C++20 a default-constructed
+ /// \c std::atomic holds an indeterminate value.
+ std::atomic<bool> NeedsGarbageCollection{false};
+ std::string PrimaryDBDir;
+ OnDiskGraphDB *UpstreamGraphDB = nullptr;
+ std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB;
+ std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB;
+ std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB;
+} // namespace llvm::cas::ondisk
diff --git a/llvm/lib/CAS/ActionCache.cpp b/llvm/lib/CAS/ActionCache.cpp
new file mode 100644
index 00000000000000..ded1fc4879fc0b
--- /dev/null
+++ b/llvm/lib/CAS/ActionCache.cpp
@@ -0,0 +1,60 @@
+//===- ActionCache.cpp ------------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/CASID.h"
+#include "llvm/CAS/ObjectStore.h"
+using namespace llvm;
+using namespace llvm::cas;
+void ActionCache::anchor() {}
+CacheKey::CacheKey(const CASID &ID) : Key(toStringRef(ID.getHash()).str()) {}
+CacheKey::CacheKey(const ObjectProxy &Proxy)
+ : CacheKey(Proxy.getCAS(), Proxy.getRef()) {}
+// Build the key from the hash bytes of the ID that \p CAS reports for \p Ref,
+// using the same explicit string conversion as the CASID constructor.
+CacheKey::CacheKey(const ObjectStore &CAS, const ObjectRef &Ref)
+    : Key(toStringRef(CAS.getID(Ref).getHash()).str()) {}
+// Future-returning wrapper around getAsync(): the callback handed to
+// getAsync() fulfils a promise whose future is returned to the caller.
+std::future<AsyncCASIDValue> ActionCache::getFuture(const CacheKey &ActionKey,
+ bool Globally) const {
+ std::promise<AsyncCASIDValue> Promise;
+ // Take the future before the promise is moved into the callback below.
+ auto Future = Promise.get_future();
+ getAsync(ActionKey, Globally,
+ [Promise =
+ std::move(Promise)](Expected<std::optional<CASID>> ID) mutable {
+ Promise.set_value(std::move(ID));
+ });
+ return Future;
+// Future-returning wrapper around putAsync(): the callback handed to
+// putAsync() fulfils a promise whose future is returned to the caller.
+std::future<AsyncErrorValue> ActionCache::putFuture(const CacheKey &ActionKey,
+ const CASID &Result,
+ bool Globally) {
+ std::promise<AsyncErrorValue> Promise;
+ // Take the future before the promise is moved into the callback below.
+ auto Future = Promise.get_future();
+ putAsync(ActionKey, Result, Globally,
+ [Promise = std::move(Promise)](Error E) mutable {
+ Promise.set_value(std::move(E));
+ });
+ return Future;
+void ActionCache::getImplAsync(
+    ArrayRef<uint8_t> ResolvedKey, bool Globally,
+    unique_function<void(Expected<std::optional<CASID>>)> Callback) const {
+  // Default behaviour: run the synchronous lookup inline and pass its result
+  // straight to the callback.
+  Callback(getImpl(ResolvedKey, Globally));
+void ActionCache::putImplAsync(ArrayRef<uint8_t> ResolvedKey,
+                               const CASID &Result, bool Globally,
+                               unique_function<void(Error)> Callback) {
+  // Default behaviour: run the synchronous store inline and pass its result
+  // straight to the callback.
+  Callback(putImpl(ResolvedKey, Result, Globally));
diff --git a/llvm/lib/CAS/ActionCaches.cpp b/llvm/lib/CAS/ActionCaches.cpp
new file mode 100644
index 00000000000000..ff4f3c637a46e1
--- /dev/null
+++ b/llvm/lib/CAS/ActionCaches.cpp
@@ -0,0 +1,242 @@
+//===- ActionCaches.cpp -----------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "BuiltinCAS.h"
+#include "llvm/ADT/TrieRawHashMap.h"
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "llvm/CAS/OnDiskHashMappedTrie.h"
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/BLAKE3.h"
+#include "llvm/Support/Path.h"
+#define DEBUG_TYPE "action-caches"
+using namespace llvm;
+using namespace llvm::cas;
+namespace {
+using HasherT = BLAKE3;
+using HashType = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>()));
+/// Value stored in the action caches: a fixed array of \p Size bytes.
+template <size_t Size> class CacheEntry {
+ CacheEntry() = default;
+ // Copies the bytes of Hash into Value. NOTE(review): no length check is
+ // visible here, so Hash is presumably never longer than Size — confirm.
+ CacheEntry(ArrayRef<uint8_t> Hash) { llvm::copy(Hash, Value.data()); }
+ // Copy construction duplicates the stored bytes.
+ CacheEntry(const CacheEntry &Entry) { llvm::copy(Entry.Value, Value.data()); }
+ // View of the stored bytes.
+ ArrayRef<uint8_t> getValue() const { return Value; }
+ std::array<uint8_t, Size> Value;
+/// ActionCache implementation whose entries live in a ThreadSafeTrieRawHashMap
+/// member (Cache). It is constructed with the default builtin CAS context.
+class InMemoryActionCache final : public ActionCache {
+ InMemoryActionCache()
+ : ActionCache(builtin::BuiltinCASContext::getDefaultContext()) {}
+ Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result,
+ bool Globally) final;
+ Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey,
+ bool Globally) const final;
+ // Stored value and trie key are both sized by sizeof(HashType).
+ using DataT = CacheEntry<sizeof(HashType)>;
+ using InMemoryCacheT = ThreadSafeTrieRawHashMap<DataT, sizeof(HashType)>;
+ InMemoryCacheT Cache;
+/// ActionCache implementation that stores entries in an
+/// ondisk::OnDiskKeyValueDB owned by this object. Instances are produced by
+/// the static create() factory.
+class OnDiskActionCache final : public ActionCache {
+ Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result,
+ bool Globally) final;
+ Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey,
+ bool Globally) const final;
+ static Expected<std::unique_ptr<OnDiskActionCache>> create(StringRef Path);
+ // Hash name handed to OnDiskKeyValueDB::open() by create().
+ static StringRef getHashName() { return "BLAKE3"; }
+ OnDiskActionCache(std::unique_ptr<ondisk::OnDiskKeyValueDB> DB);
+ std::unique_ptr<ondisk::OnDiskKeyValueDB> DB;
+ using DataT = CacheEntry<sizeof(HashType)>;
+/// ActionCache implementation layered on a shared ondisk::UnifiedOnDiskCache;
+/// the cache object is held through a shared_ptr.
+class UnifiedOnDiskActionCache final : public ActionCache {
+ Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result,
+ bool Globally) final;
+ Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey,
+ bool Globally) const final;
+ UnifiedOnDiskActionCache(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB);
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB;
+} // end namespace
+/// Render \p Hash as a lowercase hexadecimal std::string.
+static std::string hashToString(ArrayRef<uint8_t> Hash) {
+ SmallString<64> Str;
+ toHex(Hash, /*LowerCase=*/true, Str);
+ return Str.str().str();
+/// Build the invalid_argument error reported when a put finds a different
+/// value already stored under \p Key.
+///
+/// \param Key            printable form of the action key.
+/// \param Context        CAS context used to turn \p ExistingOutput into a
+///                       CASID for printing.
+/// \param Output         the value the caller tried to store.
+/// \param ExistingOutput raw hash bytes of the value already in the cache.
+static Error createResultCachePoisonedError(StringRef Key,
+ const CASContext &Context,
+ CASID Output,
+ ArrayRef<uint8_t> ExistingOutput) {
+ std::string Existing =
+ CASID::create(&Context, toStringRef(ExistingOutput)).toString();
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "cache poisoned for '" + Key + "' (new='" +
+ Output.toString() + "' vs. existing '" +
+ Existing + "')");
+InMemoryActionCache::getImpl(ArrayRef<uint8_t> Key, bool /*Globally*/) const {
+ // Look Key up in the in-memory trie. A miss is reported as std::nullopt, not
+ // as an error; a hit is rebuilt into a CASID from the stored bytes.
+ auto Result = Cache.find(Key);
+ if (!Result)
+ return std::nullopt;
+ return CASID::create(&getContext(), toStringRef(Result->Data.getValue()));
+/// Associate \p Key with the hash of \p Result in the in-memory trie.
+///
+/// Succeeds when the entry found under \p Key after the insert holds the same
+/// bytes as \p Result; otherwise returns a "cache poisoned" error.
+Error InMemoryActionCache::putImpl(ArrayRef<uint8_t> Key, const CASID &Result,
+ bool /*Globally*/) {
+ DataT Expected(Result.getHash());
+ // NOTE(review): insertLazy presumably runs the lambda only when Key has no
+ // entry yet and otherwise returns the existing entry — confirm against
+ // ThreadSafeTrieRawHashMap.
+ const InMemoryCacheT::value_type &Cached = *Cache.insertLazy(
+ Key, [&](auto ValueConstructor) { ValueConstructor.emplace(Expected); });
+ const DataT &Observed = Cached.Data;
+ if (Expected.getValue() == Observed.getValue())
+ return Error::success();
+ return createResultCachePoisonedError(hashToString(Key), getContext(), Result,
+ Observed.getValue());
+static constexpr StringLiteral DefaultName = "actioncache";
+namespace llvm {
+namespace cas {
+/// Compute the default on-disk action cache location: the system cache
+/// directory with builtin::DefaultDir and DefaultName appended. Aborts via
+/// report_fatal_error if the cache directory cannot be determined.
+std::string getDefaultOnDiskActionCachePath() {
+ SmallString<128> Path;
+ if (!llvm::sys::path::cache_directory(Path))
+ report_fatal_error("cannot get default cache directory");
+ llvm::sys::path::append(Path, builtin::DefaultDir, DefaultName);
+ return Path.str().str();
+/// Create a fresh InMemoryActionCache and return it as an ActionCache.
+std::unique_ptr<ActionCache> createInMemoryActionCache() {
+ return std::make_unique<InMemoryActionCache>();
+} // namespace cas
+} // namespace llvm
+ std::unique_ptr<ondisk::OnDiskKeyValueDB> DB)
+ : ActionCache(builtin::BuiltinCASContext::getDefaultContext()),
+ DB(std::move(DB)) {}
+OnDiskActionCache::create(StringRef AbsPath) {
+ // Open the key-value database at AbsPath and wrap it in an OnDiskActionCache;
+ // any error from open() is returned to the caller.
+ // NOTE(review): getHashName() is passed for two separate arguments of open()
+ // — confirm against OnDiskKeyValueDB::open's parameter list.
+ std::unique_ptr<ondisk::OnDiskKeyValueDB> DB;
+ if (Error E = ondisk::OnDiskKeyValueDB::open(AbsPath, getHashName(),
+ sizeof(HashType), getHashName(),
+ sizeof(DataT))
+ .moveInto(DB))
+ return std::move(E);
+ // Plain new rather than make_unique: the constructor is not reachable from
+ // make_unique (presumably private — the access specifier is not visible here).
+ return std::unique_ptr<OnDiskActionCache>(
+ new OnDiskActionCache(std::move(DB)));
+OnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, bool /*Globally*/) const {
+ // Query the on-disk database. Database errors are propagated, a missing key
+ // yields std::nullopt, and a hit is rebuilt into a CASID from the stored
+ // bytes.
+ std::optional<ArrayRef<char>> Val;
+ if (Error E = DB->get(Key).moveInto(Val))
+ return std::move(E);
+ if (!Val)
+ return std::nullopt;
+ return CASID::create(&getContext(), toStringRef(*Val));
+/// Store the hash of \p Result under \p Key in the on-disk database.
+///
+/// Database errors are propagated. If the value reported back by put() differs
+/// from the one being stored, a "cache poisoned" error is returned.
+Error OnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, const CASID &Result,
+ bool /*Globally*/) {
+ auto ResultHash = Result.getHash();
+ // Reinterpret the hash bytes as chars, the element type DB->put() is given.
+ ArrayRef Expected((const char *)ResultHash.data(), ResultHash.size());
+ ArrayRef<char> Observed;
+ if (Error E = DB->put(Key, Expected).moveInto(Observed))
+ return E;
+ if (Expected == Observed)
+ return Error::success();
+ return createResultCachePoisonedError(
+ hashToString(Key), getContext(), Result,
+ ArrayRef((const uint8_t *)Observed.data(), Observed.size()));
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB)
+ : ActionCache(builtin::BuiltinCASContext::getDefaultContext()),
+ UniDB(std::move(UniDB)) {}
+UnifiedOnDiskActionCache::getImpl(ArrayRef<uint8_t> Key,
+ bool /*Globally*/) const {
+ // Look Key up through the unified cache's key-value interface. Errors are
+ // propagated and a miss yields std::nullopt. A hit is an ObjectID, whose
+ // digest from the graph database is turned into a CASID.
+ std::optional<ondisk::ObjectID> Val;
+ if (Error E = UniDB->KVGet(Key).moveInto(Val))
+ return std::move(E);
+ if (!Val)
+ return std::nullopt;
+ return CASID::create(&getContext(),
+ toStringRef(UniDB->getGraphDB().getDigest(*Val)));
+/// Store \p Result under \p Key through the unified on-disk cache.
+///
+/// The result hash is first mapped to an ObjectID in the graph database, and
+/// that ID is written with KVPut(). Errors are propagated; a mismatch between
+/// the written and the reported ID yields a "cache poisoned" error.
+Error UnifiedOnDiskActionCache::putImpl(ArrayRef<uint8_t> Key,
+ const CASID &Result,
+ bool /*Globally*/) {
+ ondisk::ObjectID Expected =
+ UniDB->getGraphDB().getReference(Result.getHash());
+ std::optional<ondisk::ObjectID> Observed;
+ if (Error E = UniDB->KVPut(Key, Expected).moveInto(Observed))
+ return E;
+ if (Expected == Observed)
+ return Error::success();
+ // NOTE(review): Observed is dereferenced without a has_value() check; this
+ // assumes a successful KVPut always reports an ID — TODO confirm.
+ return createResultCachePoisonedError(
+ hashToString(Key), getContext(), Result,
+ UniDB->getGraphDB().getDigest(*Observed));
+cas::createOnDiskActionCache(StringRef Path) {
+ return OnDiskActionCache::create(Path);
+ return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled");
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) {
+ return std::make_unique<UnifiedOnDiskActionCache>(std::move(UniDB));
diff --git a/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp
new file mode 100644
index 00000000000000..87073cf2b4f230
--- /dev/null
+++ b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp
@@ -0,0 +1,25 @@
+//===- BuiltinUnifiedCASDatabases.cpp ---------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
+#include "BuiltinCAS.h"
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+using namespace llvm;
+using namespace llvm::cas;
+/// Open the unified on-disk cache at \p Path and return an ObjectStore and an
+/// ActionCache that both sit on top of that same cache object. Any error from
+/// opening the cache is returned instead.
+Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>>
+cas::createOnDiskUnifiedCASDatabases(StringRef Path) {
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB;
+ if (Error E = builtin::createBuiltinUnifiedOnDiskCache(Path).moveInto(UniDB))
+ return std::move(E);
+ // The store receives a copy of the shared_ptr; the action cache takes over
+ // the local one.
+ auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB);
+ auto AC = builtin::createActionCacheFromUnifiedOnDiskCache(std::move(UniDB));
+ return std::make_pair(std::move(CAS), std::move(AC));
diff --git a/llvm/lib/CAS/CASNodeSchema.cpp b/llvm/lib/CAS/CASNodeSchema.cpp
new file mode 100644
index 00000000000000..0ef47f7cc33ef6
--- /dev/null
+++ b/llvm/lib/CAS/CASNodeSchema.cpp
@@ -0,0 +1,23 @@
+//===- CASNodeSchema.cpp --------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/CASNodeSchema.h"
+#include "llvm/CAS/ObjectStore.h"
+using namespace llvm;
+using namespace llvm::cas;
+char NodeSchema::ID = 0;
+void NodeSchema::anchor() {}
+/// Return the first schema in Schemas whose isRootNode() accepts \p Node, or
+/// nullptr when none does.
+NodeSchema *SchemaPool::getSchemaForRoot(cas::ObjectProxy Node) const {
+ for (auto &Schema : Schemas)
+ if (Schema->isRootNode(Node))
+ return Schema.get();
+ return nullptr;
diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
index a486ab66ae4266..b3f9ddc8e315c2 100644
--- a/llvm/lib/CAS/CMakeLists.txt
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -1,8 +1,29 @@
+ add_definitions(-DLLVM_ENABLE_ONDISK_CAS=1)
+ ActionCache.cpp
+ ActionCaches.cpp
+ BuiltinUnifiedCASDatabases.cpp
+ CASNodeSchema.cpp
+ HierarchicalTreeBuilder.cpp
+ MappedFileRegionBumpPtr.cpp
+ OnDiskCAS.cpp
+ OnDiskCommon.cpp
+ OnDiskGraphDB.cpp
+ OnDiskHashMappedTrie.cpp
+ OnDiskKeyValueDB.cpp
+ TreeEntry.cpp
+ TreeSchema.cpp
+ UnifiedOnDiskCache.cpp
+ Support
diff --git a/llvm/lib/CAS/HashMappedTrieIndexGenerator.h b/llvm/lib/CAS/HashMappedTrieIndexGenerator.h
new file mode 100644
index 00000000000000..e021d51a2b3060
--- /dev/null
+++ b/llvm/lib/CAS/HashMappedTrieIndexGenerator.h
@@ -0,0 +1,90 @@
+//===- HashMappedTrieIndexGenerator.h ---------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/ArrayRef.h"
+namespace llvm {
+namespace cas {
+/// Steps through a hash (\c Bytes) one level at a time and yields the index
+/// for each level. The first level consumes \c NumRootBits bits; every later
+/// level consumes \c NumSubtrieBits bits.
+struct IndexGenerator {
+ size_t NumRootBits;
+ size_t NumSubtrieBits;
+ ArrayRef<uint8_t> Bytes;
+ /// Bit offset into \c Bytes of the current level. Unset until next() or
+ /// hint() has been called.
+ std::optional<size_t> StartBit = std::nullopt;
+ /// Number of bits that make up the index at the current level, clamped to
+ /// the bits remaining in \c Bytes. Requires \c StartBit to be set.
+ size_t getNumBits() const {
+   assert(StartBit);
+   const size_t TotalNumBits = Bytes.size() * 8;
+   assert(*StartBit <= TotalNumBits);
+   const size_t LevelBits = *StartBit ? NumSubtrieBits : NumRootBits;
+   const size_t RemainingBits = TotalNumBits - *StartBit;
+   return std::min(LevelBits, RemainingBits);
+ }
+ /// Advance to the next level and return its index. The first call starts at
+ /// bit 0 and reads \c NumRootBits bits; later calls skip past the level just
+ /// consumed and read \c NumSubtrieBits bits.
+ size_t next() {
+   if (!StartBit) {
+     StartBit = 0;
+     return getIndex(Bytes, *StartBit, NumRootBits);
+   }
+   // A zero offset means the root level was the one just consumed.
+   *StartBit += *StartBit ? NumSubtrieBits : NumRootBits;
+   assert((*StartBit - NumRootBits) % NumSubtrieBits == 0);
+   return getIndex(Bytes, *StartBit, NumSubtrieBits);
+ }
+ /// Reposition the generator at bit offset \p Bit and return \p Index
+ /// unchanged. \p Bit must lie inside \c Bytes and on a level boundary.
+ size_t hint(unsigned Index, unsigned Bit) {
+   // Index is unsigned, so only Bit needs validating.
+   assert(Bit < Bytes.size() * 8);
+   assert(Bit == 0 || (Bit - NumRootBits) % NumSubtrieBits == 0);
+   StartBit = Bit;
+   return Index;
+ }
+ /// Index that \p CollidingBits would have at the current level, read with
+ /// \c NumSubtrieBits bits. Requires \c StartBit to be set.
+ size_t getCollidingBits(ArrayRef<uint8_t> CollidingBits) const {
+   assert(StartBit);
+   return getIndex(CollidingBits, *StartBit, NumSubtrieBits);
+ }
+ /// Extract up to \p NumBits bits from \p Bytes, starting at bit offset
+ /// \p StartBit, and return them as an integer. Bits are taken from the most
+ /// significant end of each byte first. If \p Bytes ends early, only the
+ /// available bits are returned.
+ static size_t getIndex(ArrayRef<uint8_t> Bytes, size_t StartBit,
+                        size_t NumBits) {
+   assert(StartBit < Bytes.size() * 8);
+   // Skip whole bytes; StartBit then becomes the offset inside the first byte.
+   Bytes = Bytes.drop_front(StartBit / 8u);
+   StartBit %= 8u;
+   size_t Index = 0;
+   for (uint8_t Byte : Bytes) {
+     size_t ByteStart = 0, ByteEnd = 8;
+     if (StartBit) {
+       // First byte only: clear the high bits that precede the start offset.
+       ByteStart = StartBit;
+       Byte &= (1u << (8 - StartBit)) - 1u;
+       StartBit = 0;
+     }
+     size_t CurrentNumBits = ByteEnd - ByteStart;
+     if (CurrentNumBits > NumBits) {
+       // More bits available than wanted: keep only the leading NumBits.
+       Byte >>= CurrentNumBits - NumBits;
+       CurrentNumBits = NumBits;
+     }
+     Index <<= CurrentNumBits;
+     Index |= Byte & ((1u << CurrentNumBits) - 1u);
+     assert(NumBits >= CurrentNumBits);
+     NumBits -= CurrentNumBits;
+     if (!NumBits)
+       break;
+   }
+   return Index;
+ }
+} // namespace cas
+} // namespace llvm
diff --git a/llvm/lib/CAS/HierarchicalTreeBuilder.cpp b/llvm/lib/CAS/HierarchicalTreeBuilder.cpp
new file mode 100644
index 00000000000000..0590cee17a1781
--- /dev/null
+++ b/llvm/lib/CAS/HierarchicalTreeBuilder.cpp
@@ -0,0 +1,266 @@
+//===- HierarchicalTreeBuilder.cpp ------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/HierarchicalTreeBuilder.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Path.h"
+using namespace llvm;
+using namespace llvm::cas;
+/// Rewrite \p Path in place into canonical form and return a StringRef that
+/// points into \p Path's own storage.
+///
+/// Critical to canonicalize components so that paths come up next to each
+/// other when sorted. The steps are: make the path absolute, remove "." and
+/// ".." components (POSIX style), collapse runs of '/' and drop a trailing
+/// '/', then re-append a single '/' when the result is empty or \p Kind is
+/// TreeEntry::Tree.
+static StringRef canonicalize(SmallVectorImpl<char> &Path,
+ TreeEntry::EntryKind Kind) {
+ // Make absolute.
+ if (Path.empty() || Path.front() != '/')
+ Path.insert(Path.begin(), '/');
+ // FIXME: consider rejecting ".." instead of removing them.
+ sys::path::remove_dots(Path, /*remove_dot_dot=*/true,
+ sys::path::Style::posix);
+ // Canonicalize slashes.
+ // Compacts in place: NewEnd is the write cursor and never runs ahead of the
+ // read index I. A '/' is only written once a non-slash character follows it.
+ bool PendingSlash = false;
+ char *NewEnd = Path.begin();
+ for (int I = 0, E = Path.size(); I != E; ++I) {
+ if (Path[I] == '/') {
+ PendingSlash = true;
+ continue;
+ }
+ if (PendingSlash)
+ *NewEnd++ = '/';
+ PendingSlash = false;
+ *NewEnd++ = Path[I];
+ }
+ Path.erase(NewEnd, Path.end());
+ // For correct sorting, all explicit trees need to end with a '/'.
+ if (Path.empty() || Kind == TreeEntry::Tree)
+ Path.push_back('/');
+ return StringRef(Path.begin(), Path.size());
+/// Record an entry of kind \p Kind at \p Path, with optional reference
+/// \p Ref, in Entries. The path is canonicalized before it is stored.
+void HierarchicalTreeBuilder::pushImpl(std::optional<ObjectRef> Ref,
+ TreeEntry::EntryKind Kind,
+ const Twine &Path) {
+ SmallVector<char, 256> CanonicalPath;
+ Path.toVector(CanonicalPath);
+ // NOTE(review): canonicalize() returns a view into the local CanonicalPath
+ // buffer; this presumably relies on the entry copying the string — confirm.
+ Entries.emplace_back(Ref, Kind, canonicalize(CanonicalPath, Kind));
+/// Queue the tree \p Ref in TreeContents under \p Path. The path is
+/// canonicalized as a tree path; create() later expands the queued items.
+void HierarchicalTreeBuilder::pushTreeContent(ObjectRef Ref,
+ const Twine &Path) {
+ SmallVector<char, 256> CanonicalPath;
+ Path.toVector(CanonicalPath);
+ TreeEntry::EntryKind Kind = TreeEntry::Tree;
+ TreeContents.emplace_back(Ref, Kind, canonicalize(CanonicalPath, Kind));
+/// Turn everything pushed so far into a tree object stored in \p CAS.
+///
+/// Steps, in order: expand each queued TreeContents item into individual
+/// entries, sort the entries by path, fold them into an in-memory node tree
+/// while rejecting conflicting duplicates, then create tree objects bottom-up
+/// through TreeSchema and return a proxy for the root.
+///
+/// \returns the root tree proxy, or an invalid_argument error for conflicting
+/// paths, or any error produced while walking or creating trees.
+Expected<ObjectProxy> HierarchicalTreeBuilder::create(ObjectStore &CAS) {
+ // FIXME: It is inefficient expanding the whole tree recursively like this,
+ // use a more efficient algorithm to merge contents.
+ TreeSchema Schema(CAS);
+ for (const auto &TreeContent : TreeContents) {
+ StringRef Path = TreeContent.getPath();
+ Error E = Schema.walkFileTreeRecursively(
+ CAS, *TreeContent.getRef(),
+ [&](const NamedTreeEntry &Entry,
+ std::optional<TreeProxy> Tree) -> Error {
+ // Non-tree entries are pushed directly under the content's path.
+ if (Entry.getKind() != TreeEntry::Tree) {
+ pushImpl(Entry.getRef(), Entry.getKind(), Path + Entry.getName());
+ return Error::success();
+ }
+ // Only empty trees are pushed explicitly; non-empty ones are implied by
+ // the entries inside them.
+ if (Tree->empty())
+ pushDirectory(Path + Entry.getName());
+ return Error::success();
+ });
+ if (E)
+ return std::move(E);
+ }
+ TreeContents.clear();
+ if (Entries.empty())
+ return Schema.create();
+ std::stable_sort(
+ Entries.begin(), Entries.end(),
+ [](const HierarchicalEntry &LHS, const HierarchicalEntry &RHS) {
+ // Lexicographically smaller paths first.
+ if (int Compare = LHS.getPath().compare(RHS.getPath()))
+ return Compare < 0;
+ // Nodes with IDs first (only trees may have a missing Ref).
+ return bool(LHS.getRef()) > bool(RHS.getRef());
+ });
+ // Compile into trees.
+ struct Tree;
+ // A named child: linked to its siblings through Next and to its parent tree.
+ struct Node {
+ Node *Next = nullptr;
+ Tree *Parent = nullptr;
+ std::optional<ObjectRef> Ref;
+ TreeEntry::EntryKind Kind;
+ StringRef Name;
+ bool isTree() const { return Kind == TreeEntry::Tree; }
+ };
+ // A node that has children; First heads the child list.
+ struct Tree : Node {
+ Node *First = nullptr;
+ bool Visited = false;
+ };
+ BumpPtrAllocator Alloc;
+ Tree Root;
+ const HierarchicalEntry *PrevEntry = nullptr;
+ for (const HierarchicalEntry &Entry : Entries) {
+ // Check for duplicates.
+ if (PrevEntry && PrevEntry->getPath() == Entry.getPath()) {
+ // Error if it's not identical.
+ //
+ // FIXME: Maybe we should allow clobbering / merging / etc., but for now
+ // just error.
+ if (Entry.getKind() != PrevEntry->getKind())
+ return createStringError(
+ std::make_error_code(std::errc::invalid_argument),
+ "duplicate path '" + Entry.getPath() + "' with different kind");
+ if (!Entry.getRef()) {
+ assert(Entry.getKind() == TreeEntry::Tree);
+ continue;
+ }
+ assert(PrevEntry->getRef());
+ if (*Entry.getRef() != *PrevEntry->getRef())
+ return createStringError(
+ std::make_error_code(std::errc::invalid_argument),
+ "duplicate path '" + Entry.getPath() + "' with different ID");
+ // Skip the duplicate.
+ continue;
+ }
+ PrevEntry = &Entry;
+ Tree *Current = &Root;
+ StringRef Path = Entry.getPath();
+ {
+ bool Consumed = Path.consume_front("/");
+ (void)Consumed;
+ assert(Consumed && "Expected canonical POSIX absolute paths");
+ }
+ // Peel one component per iteration; Path holds what is left after Name.
+ for (auto Slash = Path.find('/'); !Path.empty(); Slash = Path.find('/')) {
+ StringRef Name;
+ if (Slash == StringRef::npos) {
+ Name = Path;
+ Path = "";
+ } else {
+ Name = Path.take_front(Slash);
+ Path = Path.drop_front(Slash + 1);
+ }
+ // If the tree Current already has a ref, then it's fixed and we can't
+ // add anything to it.
+ if (Current->Ref)
+ return createStringError(
+ std::make_error_code(std::errc::invalid_argument),
+ "cannot add '" + Entry.getPath() + "' under fixed tree");
+ // Need to canonicalize first, or else the sorting trick doesn't work.
+ assert(Name != "");
+ assert(Name != "/");
+ assert(Name != ".");
+ assert(Name != "..");
+ // Check if it's the first node (sorting ahead of time means it's either
+ // the first node, or it doesn't exist yet). Also, check for conflicts
+ // between implied trees and other nodes, such as a blob "/a" and an
+ // implied tree from "/a/b".
+ if (Current->First && Name == Current->First->Name) {
+ if (Path == "" && Entry.getKind() == TreeEntry::Tree) {
+ // Tree already exists. Sort order should ensure a fixed tree comes
+ // first.
+ assert(!Entry.getRef() ||
+ (Current->Ref && *Current->Ref == *Entry.getRef()));
+ break;
+ }
+ if (Current->First->Kind == TreeEntry::Tree) {
+ // Navigate deeper.
+ Current = static_cast<Tree *>(Current->First);
+ continue;
+ }
+ // This is reachable if there are two entries "/duplicate" and
+ // "/duplicate/suffix".
+ return createStringError(
+ std::make_error_code(std::errc::invalid_argument),
+ "duplicate path '" +
+ Entry.getPath().take_front(Name.end() -
+ Entry.getPath().begin()) +
+ "'");
+ }
+ // Doesn't exist yet.
+ // A final non-tree component becomes a plain Node; anything else (an
+ // intermediate component or an explicit tree) becomes a Tree.
+ Node *New;
+ Tree *Next = nullptr;
+ if (Path == "" && Entry.getKind() != TreeEntry::Tree) {
+ New = new (Alloc.Allocate<Node>()) Node();
+ } else {
+ Next = new (Alloc.Allocate<Tree>()) Tree();
+ New = Next;
+ }
+ // New children are linked at the head of the parent's child list.
+ New->Parent = Current;
+ New->Next = Current->First;
+ New->Name = Name;
+ if (Path == "") {
+ New->Kind = Entry.getKind();
+ New->Ref = Entry.getRef();
+ } else {
+ New->Kind = TreeEntry::Tree;
+ }
+ Current->First = New;
+ Current = Next;
+ }
+ }
+ // Create the trees bottom up. Pre-allocate space for 8 entries, since many
+ // trees are fairly small when building cache keys.
+ // Note: this local Entries shadows the builder's Entries used above.
+ SmallVector<NamedTreeEntry, 8> Entries;
+ SmallVector<Tree *> Worklist = {&Root};
+ while (!Worklist.empty()) {
+ Tree *T = Worklist.back();
+ if (!T->Visited) {
+ // First visit: queue the children that still lack a Ref and leave T on the
+ // worklist so that it is built after them.
+ assert(!T->Ref && "Trees with fixed content shouldn't be visited");
+ for (Node *N = T->First; N; N = N->Next) {
+ if (!N->Ref) {
+ assert(N->Kind == TreeEntry::Tree);
+ Worklist.push_back(static_cast<Tree *>(N));
+ }
+ }
+ T->Visited = true;
+ continue;
+ }
+ // Second visit: every child has a Ref by now, so T itself can be created.
+ Worklist.pop_back();
+ for (Node *N = T->First; N; N = N->Next)
+ Entries.emplace_back(*N->Ref, N->Kind, N->Name);
+ Expected<TreeProxy> ExpectedTree = Schema.create(Entries);
+ Entries.clear();
+ if (!ExpectedTree)
+ return ExpectedTree.takeError();
+ T->Ref = ExpectedTree->getRef();
+ }
+ // NOTE(review): cantFail() already unwraps the Expected, so the `if (Obj)`
+ // guard below presumably always holds — confirm the intent.
+ Expected<ObjectProxy> Obj = cantFail(CAS.getProxy(*Root.Ref));
+#ifndef NDEBUG
+ if (Obj) {
+ if (Error E = CAS.validateTree(Obj->getRef()))
+ return std::move(E);
+ }
+ return Obj;
diff --git a/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp b/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp
new file mode 100644
index 00000000000000..157871f2dab716
--- /dev/null
+++ b/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp
@@ -0,0 +1,284 @@
+//===- MappedFileRegionBumpPtr.cpp ------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+/// \file
+/// A bump pointer allocator, backed by a memory-mapped file.
+/// The effect we want is:
+/// 1. If it doesn't exist, create the file with an initial size.
+/// 2. Reserve virtual memory large enough for the max file size.
+/// 3. Map the file into memory in the reserved region.
+/// 4. Increase the file size and update the mapping when necessary.
+/// However, updating the mapping is challenging when it needs to work portably,
+/// and across multiple processes without locking for every read. Our current
+/// implementation strategy is:
+/// 1. Use \c ftruncate (\c sys::fs::resize_file) to grow the file to its max
+/// size (typically several GB). Many modern filesystems will create a sparse
+/// file, so that the trailing unused pages do not take space on disk.
+/// 2. Call \c mmap (\c sys::fs::mapped_file_region)
+/// 3. [Automatic as part of 2.]
+/// 4. [Automatic as part of 2.]
+/// Additionally, we attempt to resize the file to its actual data size when
+/// closing the mapping, if this is the only concurrent instance. This is done
+/// using file locks. Shrinking the file mitigates problems with having large
+/// files: on filesystems without sparse files it avoids unnecessary space use;
+/// it also avoids allocating the full size if another process copies the file,
+/// which typically loses sparseness. These mitigations only work while the file
+/// is not in use.
+/// FIXME: we assume that all concurrent users of the file will use the same
+/// value for Capacity. Otherwise a process with a larger capacity can write
+/// data that is "out of bounds" for processes with smaller capacity. Currently
+/// this is true in the CAS.
+/// To support resizing, we use two separate file locks:
+/// 1. We use a shared reader lock on a ".shared" file until destruction.
+/// 2. We use a lock on the main file during initialization - shared to check
+/// the status, upgraded to exclusive to resize/initialize the file.
+/// Then during destruction we attempt to get exclusive access on (1), which
+/// requires no concurrent readers. If so, we shrink the file. Using two
+/// separate locks simplifies the implementation and enables it to work on
+/// platforms (e.g. Windows) where a shared/reader lock prevents writing.
+#include "llvm/CAS/MappedFileRegionBumpPtr.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/StringMap.h"
+#include <mutex>
+using namespace llvm;
+using namespace llvm::cas;
+namespace {
+/// Scoped lock on an already-open file descriptor. Whatever lock is held when
+/// the object is destroyed is released by the destructor, and any error from
+/// that release is discarded.
+struct FileLockRAII {
+ std::string Path;
+ int FD;
+ enum LockKind { Shared, Exclusive };
+ /// Kind of lock currently held, or unset when no lock is held.
+ std::optional<LockKind> Locked;
+ FileLockRAII(std::string Path, int FD) : Path(std::move(Path)), FD(FD) {}
+ ~FileLockRAII() { consumeError(unlock()); }
+ /// Lock the descriptor with kind \p LK. On failure a file error naming
+ /// \c Path is returned and \c Locked is left untouched.
+ Error lock(LockKind LK) {
+   const bool WantExclusive = (LK == Exclusive);
+   std::error_code EC = sys::fs::lockFile(FD, WantExclusive);
+   if (EC)
+     return createFileError(Path, EC);
+   Locked = LK;
+   return Error::success();
+ }
+ /// Release the lock if one is held; otherwise do nothing. \c Locked is
+ /// cleared before the release is attempted, so a failed release is not
+ /// retried by the destructor.
+ Error unlock() {
+   if (!Locked)
+     return Error::success();
+   Locked = std::nullopt;
+   std::error_code EC = sys::fs::unlockFile(FD);
+   if (EC)
+     return createFileError(Path, EC);
+   return Error::success();
+ }
+} // end anonymous namespace
+/// Open (creating if needed) the file at \p Path, grow it to \p Capacity when
+/// it is smaller, and map it read-write.
+///
+/// \param Path               main data file; a sibling "<Path>.shared" file is
+///                           opened as the long-lived shared lock.
+/// \param Capacity           requested file/mapping size. When the file is
+///                           already at least this large, its current size is
+///                           used instead.
+/// \param BumpPtrOffset      offset of the bump pointer inside the mapping,
+///                           passed to initializeBumpPtr() for existing files.
+/// \param NewFileConstructor run on the result when the file was empty.
+/// \returns the mapped region, or a file error naming the file whose
+/// operation failed.
+Expected<MappedFileRegionBumpPtr> MappedFileRegionBumpPtr::create(
+    const Twine &Path, uint64_t Capacity, int64_t BumpPtrOffset,
+    function_ref<Error(MappedFileRegionBumpPtr &)> NewFileConstructor) {
+  MappedFileRegionBumpPtr Result;
+  Result.Path = Path.str();
+  // Open the main file.
+  int FD;
+  if (std::error_code EC = sys::fs::openFileForReadWrite(
+          Result.Path, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+    return createFileError(Path, EC);
+  Result.FD = FD;
+  // Open the shared lock file. See file comment for details of locking scheme.
+  SmallString<128> SharedLockPath(Result.Path);
+  SharedLockPath.append(".shared");
+  int SharedLockFD;
+  if (std::error_code EC = sys::fs::openFileForReadWrite(
+          SharedLockPath, SharedLockFD, sys::fs::CD_OpenAlways,
+          sys::fs::OF_None))
+    return createFileError(SharedLockPath, EC);
+  Result.SharedLockFD = SharedLockFD;
+  // Take shared/reader lock that will be held until we close the file; unlocked
+  // by destroyImpl. The lock is on the ".shared" file, so report that path on
+  // failure.
+  if (std::error_code EC = sys::fs::lockFile(SharedLockFD, /*Exclusive=*/false))
+    return createFileError(SharedLockPath, EC);
+  // Take shared/reader lock for initialization.
+  FileLockRAII InitLock(Result.Path, FD);
+  if (Error E = InitLock.lock(FileLockRAII::Shared))
+    return std::move(E);
+  sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+  sys::fs::file_status Status;
+  if (std::error_code EC = sys::fs::status(File, Status))
+    return createFileError(Result.Path, EC);
+  if (Status.getSize() < Capacity) {
+    // Lock the file exclusively so only one process will do the initialization.
+    if (Error E = InitLock.unlock())
+      return std::move(E);
+    if (Error E = InitLock.lock(FileLockRAII::Exclusive))
+      return std::move(E);
+    // Retrieve the current size now that we have exclusive access.
+    if (std::error_code EC = sys::fs::status(File, Status))
+      return createFileError(Result.Path, EC);
+  }
+  // At this point either the file is still under-sized, or we have the size for
+  // the completely initialized file.
+  if (Status.getSize() < Capacity) {
+    // We are initializing the file; it may be empty, or may have been shrunk
+    // during a previous close.
+    // FIXME: Detect a case where someone opened it with a smaller capacity.
+    // FIXME: On Windows we should use FSCTL_SET_SPARSE and FSCTL_SET_ZERO_DATA
+    // to make this a sparse region, if supported.
+    if (std::error_code EC = sys::fs::resize_file(FD, Capacity))
+      return createFileError(Result.Path, EC);
+  } else {
+    // Someone else initialized it.
+    Capacity = Status.getSize();
+  }
+  // Create the mapped region.
+  {
+    std::error_code EC;
+    sys::fs::mapped_file_region Map(
+        File, sys::fs::mapped_file_region::readwrite, Capacity, 0, EC);
+    if (EC)
+      return createFileError(Result.Path, EC);
+    Result.Region = std::move(Map);
+  }
+  // Status still holds the size from before any resize above, so zero here
+  // means the file had no content when we inspected it.
+  if (Status.getSize() == 0) {
+    // We are creating a new file; run the constructor.
+    if (Error E = NewFileConstructor(Result))
+      return std::move(E);
+  } else {
+    Result.initializeBumpPtr(BumpPtrOffset);
+  }
+  return Result;
+ const Twine &PathTwine, uint64_t Capacity, int64_t BumpPtrOffset,
+ function_ref<Error(MappedFileRegionBumpPtr &)> NewFileConstructor) {
+ struct MapNode {
+ std::mutex Mutex;
+ std::weak_ptr<MappedFileRegionBumpPtr> MFR;
+ };
+ static std::mutex Mutex;
+ // FIXME: Map should be by sys::fs::UniqueID instead of by path. Here's how
+ // it should work:
+ //
+ // 1. Open the file.
+ // 2. Stat the file descriptor to get the UniqueID.
+ // 3. Check the map.
+ // 4. If new, pass the open file descriptor to a helper extracted from
+ // MappedFileRegionBumpPtr::create().
+ static StringMap<MapNode> Regions;
+ SmallString<128> PathStorage;
+ const StringRef Path = PathTwine.toStringRef(PathStorage);
+ MapNode *Node;
+ {
+ std::lock_guard<std::mutex> Lock(Mutex);
+ Node = &Regions[Path];
+ }
+ if (std::shared_ptr<MappedFileRegionBumpPtr> MFR = Node->MFR.lock())
+ return MFR;
+ // Construct a new region. Use a fine-grained lock to allow other regions to
+ // be opened concurrently.
+ std::lock_guard<std::mutex> Lock(Node->Mutex);
+ // Open / create / initialize files on disk.
+ Expected<MappedFileRegionBumpPtr> ExpectedMFR =
+ MappedFileRegionBumpPtr::create(Path, Capacity, BumpPtrOffset,
+ NewFileConstructor);
+ if (!ExpectedMFR)
+ return ExpectedMFR.takeError();
+ auto SharedMFR =
+ std::make_shared<MappedFileRegionBumpPtr>(std::move(*ExpectedMFR));
+ // Success.
+ Node->MFR = SharedMFR;
+ return std::move(SharedMFR);
+/// Release the file resources held by this object: drop the shared lock,
+/// shrink the file to size() if the lock file can be locked via tryLockFile(),
+/// and close both descriptors. Does nothing when no main file descriptor is
+/// set. Errors are ignored throughout.
+void MappedFileRegionBumpPtr::destroyImpl() {
+ if (!FD)
+ return;
+ // Drop the shared lock indicating we are no longer accessing the file.
+ if (SharedLockFD)
+ (void)sys::fs::unlockFile(*SharedLockFD);
+ // Attempt to truncate the file if we can get exclusive access. Ignore any
+ // errors.
+ if (BumpPtr) {
+ assert(SharedLockFD && "Must have shared lock file open");
+ if (sys::fs::tryLockFile(*SharedLockFD) == std::error_code()) {
+ assert(size() <= capacity());
+ (void)sys::fs::resize_file(*FD, size());
+ (void)sys::fs::unlockFile(*SharedLockFD);
+ }
+ }
+ // Helper: close the descriptor if set, then clear the optional.
+ auto Close = [](std::optional<int> &FD) {
+ if (FD) {
+ sys::fs::file_t File = sys::fs::convertFDToNativeFile(*FD);
+ sys::fs::closeFile(File);
+ FD = std::nullopt;
+ }
+ };
+ // Close the file and shared lock.
+ Close(FD);
+ Close(SharedLockFD);
+/// Point BumpPtr at the atomic counter stored \p BumpPtrOffset bytes into the
+/// mapped data, and set that counter if it is still zero.
+///
+/// A zero counter is set to the offset just past the counter itself. A
+/// non-zero counter is left alone and, in asserts builds, checked to be at
+/// least that offset.
+void MappedFileRegionBumpPtr::initializeBumpPtr(int64_t BumpPtrOffset) {
+ assert(capacity() < (uint64_t)INT64_MAX && "capacity must fit in int64_t");
+ int64_t BumpPtrEndOffset = BumpPtrOffset + sizeof(decltype(*BumpPtr));
+ assert(BumpPtrEndOffset <= (int64_t)capacity() &&
+ "Expected end offset to be pre-allocated");
+ assert(isAligned(Align::Of<decltype(*BumpPtr)>(), BumpPtrOffset) &&
+ "Expected end offset to be aligned");
+ BumpPtr = reinterpret_cast<decltype(BumpPtr)>(data() + BumpPtrOffset);
+ // Only replaces the value when it is exactly 0; on failure ExistingValue
+ // receives the value that was found.
+ int64_t ExistingValue = 0;
+ if (!BumpPtr->compare_exchange_strong(ExistingValue, BumpPtrEndOffset))
+ assert(ExistingValue >= BumpPtrEndOffset &&
+ "Expected 0, or past the end of the BumpPtr itself");
+/// Reserve \p AllocSize bytes (rounded up to getAlign()) by atomically
+/// advancing the bump pointer.
+///
+/// \returns the offset of the start of the reserved range.
+/// Aborts via report_fatal_error when the reservation would exceed capacity().
+int64_t MappedFileRegionBumpPtr::allocateOffset(uint64_t AllocSize) {
+  AllocSize = alignTo(AllocSize, getAlign());
+  int64_t OldEnd = BumpPtr->fetch_add(AllocSize);
+  int64_t NewEnd = OldEnd + AllocSize;
+  if (LLVM_UNLIKELY(NewEnd > (int64_t)capacity())) {
+    // Try to return the allocation: if the pointer still sits at the end of our
+    // reservation (nobody allocated after us), move it back to where it was.
+    (void)BumpPtr->compare_exchange_strong(NewEnd, OldEnd);
+    report_fatal_error(
+        errorCodeToError(std::make_error_code(std::errc::not_enough_memory)));
+  }
+  return OldEnd;
diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp
index a0e8d5541acd9b..a9c5c53c1fcfea 100644
--- a/llvm/lib/CAS/ObjectStore.cpp
+++ b/llvm/lib/CAS/ObjectStore.cpp
@@ -210,7 +210,14 @@ ObjectProxy::getMemoryBuffer(StringRef Name,
static Expected<std::shared_ptr<ObjectStore>>
createOnDiskCASImpl(const Twine &Path) {
- return createOnDiskCAS(Path);
+ std::string CASPath = Path.str();
+ // If path is empty, use default ondisk CAS path.
+ if (CASPath.empty())
+ CASPath = getDefaultOnDiskCASPath();
+ auto UniDB = builtin::createBuiltinUnifiedOnDiskCache(CASPath);
+ if (!UniDB)
+ return UniDB.takeError();
+ return builtin::createObjectStoreFromUnifiedOnDiskCache(std::move(*UniDB));
static Expected<std::shared_ptr<ObjectStore>>
@@ -229,28 +236,22 @@ static StringMap<ObjectStoreCreateFuncTy *> &getRegisteredScheme() {
-cas::createCASFromIdentifier(StringRef Path) {
+cas::createCASFromIdentifier(StringRef Id) {
for (auto &Scheme : getRegisteredScheme()) {
- if (Path.consume_front(Scheme.getKey()))
- return Scheme.getValue()(Path);
+ if (Id.consume_front(Scheme.getKey()))
+ return Scheme.getValue()(Id);
- if (Path.empty())
- return createStringError(std::make_error_code(std::errc::invalid_argument),
- "No CAS identifier is provided");
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "Unknown CAS identifier is provided");
- // FIXME: some current default behavior.
- SmallString<256> PathBuf;
- if (Path == "auto") {
- getDefaultOnDiskCASPath(PathBuf);
- Path = PathBuf;
+bool cas::isRegisteredCASIdentifier(StringRef Id) {
+ for (auto &Scheme : getRegisteredScheme()) {
+ if (Id.consume_front(Scheme.getKey()))
+ return true;
- // Fallback is to create UnifiedOnDiskCache.
- auto UniDB = builtin::createBuiltinUnifiedOnDiskCache(Path);
- if (!UniDB)
- return UniDB.takeError();
- return builtin::createObjectStoreFromUnifiedOnDiskCache(std::move(*UniDB));
+ return false;
void cas::registerCASURLScheme(StringRef Prefix,
diff --git a/llvm/lib/CAS/OnDiskCAS.cpp b/llvm/lib/CAS/OnDiskCAS.cpp
new file mode 100644
index 00000000000000..85ea5cae7318ed
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskCAS.cpp
@@ -0,0 +1,205 @@
+//===- OnDiskCAS.cpp --------------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "BuiltinCAS.h"
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "llvm/Support/Path.h"
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::builtin;
+namespace {
+class OnDiskCAS : public BuiltinCAS { // BuiltinCAS backend that forwards every operation to an ondisk::OnDiskGraphDB.
+ Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash,
+ ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) final;
+ Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) final;
+ CASID getID(ObjectRef Ref) const final;
+ std::optional<ObjectRef> getReference(const CASID &ID) const final;
+ Expected<bool> isMaterialized(ObjectRef Ref) const final;
+ ArrayRef<char> getDataConst(ObjectHandle Node) const final;
+ void print(raw_ostream &OS) const final;
+ static Expected<std::unique_ptr<OnDiskCAS>> open(StringRef Path); // Opens a graph database at Path that the new OnDiskCAS owns.
+ OnDiskCAS(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB_) // Shares UniDB_ and uses the graph DB it exposes; OwnedDB stays null.
+ : UniDB(std::move(UniDB_)), DB(&UniDB->getGraphDB()) {}
+ ObjectHandle convertHandle(ondisk::ObjectHandle Node) const { // ondisk handle -> CAS handle, carrying the same opaque value.
+ return makeObjectHandle(Node.getOpaqueData());
+ }
+ ondisk::ObjectHandle convertHandle(ObjectHandle Node) const { // CAS handle -> ondisk handle.
+ return ondisk::ObjectHandle::fromOpaqueData(Node.getInternalRef(*this));
+ }
+ ObjectRef convertRef(ondisk::ObjectID Ref) const { // ondisk ID -> CAS ref, carrying the same opaque value.
+ return makeObjectRef(Ref.getOpaqueData());
+ }
+ ondisk::ObjectID convertRef(ObjectRef Ref) const { // CAS ref -> ondisk ID.
+ return ondisk::ObjectID::fromOpaqueData(Ref.getInternalRef(*this));
+ }
+ size_t getNumRefs(ObjectHandle Node) const final {
+ auto RefsRange = DB->getObjectRefs(convertHandle(Node));
+ return std::distance(RefsRange.begin(), RefsRange.end()); // Count is derived from the iterator range.
+ }
+ ObjectRef readRef(ObjectHandle Node, size_t I) const final {
+ auto RefsRange = DB->getObjectRefs(convertHandle(Node));
+ return convertRef(RefsRange.begin()[I]); // No bounds check on I here.
+ }
+ Error forEachRef(ObjectHandle Node,
+ function_ref<Error(ObjectRef)> Callback) const final;
+ OnDiskCAS(std::unique_ptr<ondisk::OnDiskGraphDB> DB_) // Takes sole ownership of DB_; UniDB stays null.
+ : OwnedDB(std::move(DB_)), DB(OwnedDB.get()) {}
+ std::unique_ptr<ondisk::OnDiskGraphDB> OwnedDB; // Set only by the unique_ptr constructor.
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; // Set only by the shared_ptr constructor.
+ ondisk::OnDiskGraphDB *DB; // Database actually used; points into OwnedDB or UniDB.
+} // end anonymous namespace
+void OnDiskCAS::print(raw_ostream &OS) const {
+  // Printing is delegated entirely to the underlying graph database.
+  DB->print(OS);
+}
+CASID OnDiskCAS::getID(ObjectRef Ref) const {
+  // Fetch the digest recorded for this reference and wrap it in a CASID that
+  // is bound to this CAS's context.
+  const ondisk::ObjectID InternalID = convertRef(Ref);
+  return CASID::create(&getContext(), toStringRef(DB->getDigest(InternalID)));
+std::optional<ObjectRef> OnDiskCAS::getReference(const CASID &ID) const {
+  // Only report a reference when the database already knows this hash.
+  if (std::optional<ondisk::ObjectID> Existing =
+          DB->getExistingReference(ID.getHash()))
+    return convertRef(*Existing);
+  return std::nullopt;
+Expected<bool> OnDiskCAS::isMaterialized(ObjectRef ExternalRef) const { // Forwards to OnDiskGraphDB::containsObject for the converted ID.
+ return DB->containsObject(convertRef(ExternalRef));
+ArrayRef<char> OnDiskCAS::getDataConst(ObjectHandle Node) const { // Returns the object's data bytes as reported by the graph DB.
+ return DB->getObjectData(convertHandle(Node));
+OnDiskCAS::loadIfExists(ObjectRef ExternalRef) {
+  // Ask the graph DB for the object; three outcomes are possible: an error,
+  // "not present" (empty optional), or a handle to convert.
+  Expected<std::optional<ondisk::ObjectHandle>> Loaded =
+      DB->load(convertRef(ExternalRef));
+  if (!Loaded)
+    return Loaded.takeError();
+  std::optional<ondisk::ObjectHandle> &MaybeHandle = *Loaded;
+  if (!MaybeHandle.has_value())
+    return std::nullopt;
+  return convertHandle(*MaybeHandle);
+Expected<ObjectRef> OnDiskCAS::storeImpl(ArrayRef<uint8_t> ComputedHash,
+                                         ArrayRef<ObjectRef> Refs,
+                                         ArrayRef<char> Data) {
+  // Translate every CAS-level reference into the database's own ID type.
+  SmallVector<ondisk::ObjectID, 64> RefIDs;
+  RefIDs.reserve(Refs.size());
+  for (size_t Idx = 0, End = Refs.size(); Idx != End; ++Idx)
+    RefIDs.push_back(convertRef(Refs[Idx]));
+
+  // Obtain the ID for the precomputed hash, then store the node under it.
+  ondisk::ObjectID NewID = DB->getReference(ComputedHash);
+  Error StoreErr = DB->store(NewID, RefIDs, Data);
+  if (StoreErr)
+    return std::move(StoreErr);
+  return convertRef(NewID);
+Error OnDiskCAS::forEachRef(ObjectHandle Node,
+                            function_ref<Error(ObjectRef)> Callback) const {
+  // Visit the node's references in order, stopping at the first error the
+  // callback reports.
+  auto Refs = DB->getObjectRefs(convertHandle(Node));
+  for (auto It = Refs.begin(), End = Refs.end(); It != End; ++It) {
+    ondisk::ObjectID InternalID = *It;
+    if (Error E = Callback(convertRef(InternalID)))
+      return E;
+  }
+  return Error::success();
+Expected<std::unique_ptr<OnDiskCAS>> OnDiskCAS::open(StringRef AbsPath) { // Opens the graph DB at AbsPath and wraps it in a new OnDiskCAS.
+ Expected<std::unique_ptr<ondisk::OnDiskGraphDB>> DB =
+ ondisk::OnDiskGraphDB::open(AbsPath, BuiltinCASContext::getHashName(),
+ sizeof(HashType)); // Hash name and hash size come from the builtin CAS.
+ if (!DB)
+ return DB.takeError();
+ return std::unique_ptr<OnDiskCAS>(new OnDiskCAS(std::move(*DB))); // The new OnDiskCAS takes ownership of the opened DB.
+bool cas::isOnDiskCASEnabled() { // Reports whether on-disk CAS support is available.
+ return true;
+ return false; // NOTE(review): unreachable as shown; presumably the two returns sit in separate preprocessor branches that are not visible here -- confirm.
+static constexpr StringLiteral DefaultName = "cas"; // Final path component of both the default CAS path and its stable ID.
+static void getDefaultOnDiskCASStableID(SmallVectorImpl<char> &Path) { // Overwrites Path with DefaultDirProxy/DefaultDir/DefaultName.
+ Path.assign(DefaultDirProxy.begin(), DefaultDirProxy.end()); // Replaces any previous contents of Path.
+ llvm::sys::path::append(Path, DefaultDir, DefaultName);
+static std::string getDefaultOnDiskCASStableID() { // Convenience overload returning the stable ID as an owned string.
+ SmallString<128> Path;
+ getDefaultOnDiskCASStableID(Path);
+ return Path.str().str();
+Expected<std::unique_ptr<ObjectStore>> cas::createOnDiskCAS(const Twine &Path) { // Opens an OnDiskCAS at Path after making the path absolute.
+ // FIXME: An absolute path isn't really good enough. Should open a directory
+ // and use openat() for files underneath.
+ SmallString<256> AbsPath;
+ Path.toVector(AbsPath);
+ sys::fs::make_absolute(AbsPath); // NOTE(review): any status returned by make_absolute is not inspected here -- confirm that is intended.
+ // FIXME: Remove this and update clients to do this logic.
+ if (AbsPath == getDefaultOnDiskCASStableID()) // The stable ID is a placeholder that maps to the real default path.
+ AbsPath = StringRef(getDefaultOnDiskCASPath());
+ return OnDiskCAS::open(AbsPath);
+ return createStringError(inconvertibleErrorCode(), "OnDiskCAS is disabled"); // NOTE(review): unreachable as shown; presumably the disabled-build branch of a preprocessor conditional not visible here -- confirm.
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { // NOTE(review): the start of this signature is not visible in this excerpt.
+ return std::make_unique<OnDiskCAS>(std::move(UniDB)); // Builds an OnDiskCAS that shares ownership of UniDB.
+void cas::getDefaultOnDiskCASPath(SmallVectorImpl<char> &Path) { // Fills Path with <cache directory>/DefaultDir/DefaultName.
+ // FIXME: Should this return 'Error' instead of hard-failing?
+ if (!llvm::sys::path::cache_directory(Path))
+ report_fatal_error("cannot get default cache directory"); // Fatal error if no cache directory is available.
+ llvm::sys::path::append(Path, DefaultDir, DefaultName);
+std::string cas::getDefaultOnDiskCASPath() { // Convenience overload returning the default path as an owned string.
+ SmallString<128> Path;
+ getDefaultOnDiskCASPath(Path); // Reports a fatal error if no cache directory is available.
+ return Path.str().str();
diff --git a/llvm/lib/CAS/OnDiskCommon.cpp b/llvm/lib/CAS/OnDiskCommon.cpp
new file mode 100644
index 00000000000000..718d8992379a8f
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskCommon.cpp
@@ -0,0 +1,26 @@
+//===- OnDiskCommon.cpp ---------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "OnDiskCommon.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+using namespace llvm;
+Expected<std::optional<uint64_t>> cas::ondisk::getOverriddenMaxMappingSize() {
+  constexpr const char *EnvVar = "LLVM_CAS_MAX_MAPPING_SIZE";
+  // An unset variable means "no override".
+  if (const char *Raw = getenv(EnvVar)) {
+    uint64_t Parsed;
+    // getAsInteger returns false on success.
+    if (!StringRef(Raw).getAsInteger(/*auto*/ 0, Parsed))
+      return Parsed;
+    return createStringError(inconvertibleErrorCode(),
+                             "invalid value for %s: expected integer", EnvVar);
+  }
+  return std::nullopt;
diff --git a/llvm/lib/CAS/OnDiskCommon.h b/llvm/lib/CAS/OnDiskCommon.h
new file mode 100644
index 00000000000000..7394e45dc4e3de
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskCommon.h
@@ -0,0 +1,24 @@
+//===- OnDiskCommon.h -------------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/Support/Error.h"
+#include <optional>
+namespace llvm::cas::ondisk {
+/// Retrieves an overridden maximum mapping size for CAS files, if any, by
+/// checking LLVM_CAS_MAX_MAPPING_SIZE in the environment. Returns std::nullopt
+/// if it is unset, or an error if its value cannot be parsed as an integer.
+Expected<std::optional<uint64_t>> getOverriddenMaxMappingSize();
+} // namespace llvm::cas::ondisk
diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp
new file mode 100644
index 00000000000000..4486bdb2863661
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskGraphDB.cpp
@@ -0,0 +1,1508 @@
+//===- OnDiskGraphDB.cpp ----------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// On-disk CAS nodes database, independent of a particular hashing algorithm.
+// Here's a top-level description of the current layout (could expose or make
+// this configurable in the future).
+// Files, each with a prefix set by \a FilePrefix:
+// - db/<prefix>.index: a file for the "index" table, named by \a
+// IndexTableName and managed by \a HashMappedTrie. The contents are 8B
+// that are accessed atomically, describing the object kind and where/how
+// it's stored (including an optional file offset). See \a TrieRecord for
+// more details.
+// - db/<prefix>.data: a file for the "data" table, named by \a
+// DataPoolTableName and managed by \a DataStore. New objects within
+// TrieRecord::MaxEmbeddedSize are inserted here as \a
+// TrieRecord::StorageKind::DataPool.
+// - db/<prefix>.<offset>.data: a file storing an object outside the main
+// "data" table, named by its offset into the "index" table, with the
+// format of \a TrieRecord::StorageKind::Standalone.
+// - db/<prefix>.<offset>.leaf: a file storing a leaf node outside the
+// main "data" table, named by its offset into the "index" table, with
+// the format of \a TrieRecord::StorageKind::StandaloneLeaf.
+// - db/<prefix>.<offset>.leaf+0: a file storing a leaf object outside the
+// main "data" table, named by its offset into the "index" table, with
+// the format of \a TrieRecord::StorageKind::StandaloneLeaf0.
+// The "index" and "data" tables could be stored in a single file
+// (using a root record that points at the two types of stores), but splitting
+// the files seems more convenient for now.
+// ObjectID: this is a pointer to Trie record
+// ObjectHandle: this is a pointer to Data record
+// Eventually: consider creating a StringPool for strings instead of using
+// RecordDataStore table.
+// - Lookup by prefix tree
+// - Store by suffix tree
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "OnDiskCommon.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#define DEBUG_TYPE "on-disk-cas"
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+static constexpr StringLiteral IndexTableName = "llvm.cas.index"; // Name of the "index" table.
+static constexpr StringLiteral DataPoolTableName = "llvm.cas.data"; // Name of the "data" table.
+static constexpr StringLiteral IndexFile = "index";
+static constexpr StringLiteral DataPoolFile = "data";
+static constexpr StringLiteral FilePrefix = "v8."; // Presumably the "vX." in the StorageKind comments -- confirm.
+static constexpr StringLiteral FileSuffixData = ".data"; // Suffix for StorageKind::Standalone files.
+static constexpr StringLiteral FileSuffixLeaf = ".leaf"; // Suffix for StorageKind::StandaloneLeaf files.
+static constexpr StringLiteral FileSuffixLeaf0 = ".leaf+0"; // Suffix for StorageKind::StandaloneLeaf0 files.
+static Error createCorruptObjectError(ArrayRef<uint8_t> ID) { // Builds an invalid_argument error naming the object by its hex-encoded ID.
+ return createStringError(llvm::errc::invalid_argument,
+ "corrupt object '" + toHex(ID) + "'");
+namespace {
+/// Trie record data: 8B, atomic<uint64_t>
+/// - 1-byte: StorageKind (most significant byte of the packed value)
+/// - 7-bytes: DataStoreOffset (offset into referenced file)
+class TrieRecord {
+ enum class StorageKind : uint8_t {
+ /// Unknown object.
+ Unknown = 0,
+ /// vX.data: main pool, full DataStore record.
+ DataPool = 1,
+ /// vX.<TrieRecordOffset>.data: standalone, with a full DataStore record.
+ Standalone = 10,
+ /// vX.<TrieRecordOffset>.leaf: standalone, just the data. File contents
+ /// exactly the data content and file size matches the data size. No refs.
+ StandaloneLeaf = 11,
+ /// vX.<TrieRecordOffset>.leaf+0: standalone, just the data plus an
+ /// extra null character ('\0'). File size is 1 bigger than the data size.
+ /// No refs.
+ StandaloneLeaf0 = 12,
+ };
+ static StringRef getStandaloneFileSuffix(StorageKind SK) { // Maps a standalone storage kind to its file suffix.
+ switch (SK) {
+ default:
+ llvm_unreachable("Expected standalone storage kind"); // Unknown and DataPool have no standalone file.
+ case TrieRecord::StorageKind::Standalone:
+ return FileSuffixData;
+ case TrieRecord::StorageKind::StandaloneLeaf0:
+ return FileSuffixLeaf0;
+ case TrieRecord::StorageKind::StandaloneLeaf:
+ return FileSuffixLeaf;
+ }
+ }
+ enum Limits : int64_t {
+ // Saves files bigger than 64KB standalone instead of embedding them.
+ MaxEmbeddedSize = 64LL * 1024LL - 1,
+ };
+ struct Data { // Unpacked form of the 8-byte record.
+ StorageKind SK = StorageKind::Unknown;
+ FileOffset Offset;
+ };
+ static uint64_t pack(Data D) { // Packs SK into the top byte and Offset into the low 56 bits.
+ assert(D.Offset.get() < (int64_t)(1ULL << 56)); // Offset must fit in 7 bytes.
+ uint64_t Packed = uint64_t(D.SK) << 56 | D.Offset.get();
+ assert(D.SK != StorageKind::Unknown || Packed == 0); // Unknown records must pack to all-zero.
+#ifndef NDEBUG
+ Data RoundTrip = unpack(Packed);
+ assert(D.SK == RoundTrip.SK);
+ assert(D.Offset.get() == RoundTrip.Offset.get());
+ return Packed;
+ }
+ static Data unpack(uint64_t Packed) { // Inverse of pack(); zero yields a default (Unknown) Data.
+ Data D;
+ if (!Packed)
+ return D;
+ D.SK = (StorageKind)(Packed >> 56);
+ D.Offset = FileOffset(Packed & (UINT64_MAX >> 8)); // Mask off the StorageKind byte.
+ return D;
+ }
+ TrieRecord() : Storage(0) {}
+ Data load() const { return unpack(Storage); }
+ bool compare_exchange_strong(Data &Existing, Data New); // On failure, Existing is updated to the stored value.
+ std::atomic<uint64_t> Storage; // Packed StorageKind + offset.
+/// DataStore record data: 4B + size? + refs? + data + 0
+/// - 4-bytes: Header
+/// - {0,4,8}-bytes: DataSize (may be packed in Header)
+/// - {0,4,8}-bytes: NumRefs (may be packed in Header)
+/// - NumRefs*{4,8}-bytes: Refs[] (end-ptr is 8-byte aligned)
+/// - <data>
+/// - 1-byte: 0-term
+struct DataRecordHandle {
+ /// NumRefs storage: 4B, 2B, 1B, or 0B (no refs). Or, 8B, for alignment
+ /// convenience to avoid computing padding later.
+ enum class NumRefsFlags : uint8_t {
+ Uses0B = 0U,
+ Uses1B = 1U,
+ Uses2B = 2U,
+ Uses4B = 3U,
+ Uses8B = 4U,
+ Max = Uses8B,
+ };
+ /// DataSize storage: 8B, 4B, 2B, or 1B.
+ enum class DataSizeFlags {
+ Uses1B = 0U,
+ Uses2B = 1U,
+ Uses4B = 2U,
+ Uses8B = 3U,
+ Max = Uses8B,
+ };
+ /// Kind of ref stored in Refs[]: InternalRef or InternalRef4B.
+ enum class RefKindFlags {
+ InternalRef = 0U,
+ InternalRef4B = 1U,
+ Max = InternalRef4B,
+ };
+ enum Counts : int { // Bit positions and widths of the three fields inside the LayoutFlags byte.
+ NumRefsShift = 0,
+ NumRefsBits = 3,
+ DataSizeShift = NumRefsShift + NumRefsBits,
+ DataSizeBits = 2,
+ RefKindShift = DataSizeShift + DataSizeBits,
+ RefKindBits = 1,
+ };
+ static_assert(((UINT32_MAX << NumRefsBits) & (uint32_t)NumRefsFlags::Max) == // Largest enumerator must fit in NumRefsBits.
+ 0,
+ "Not enough bits");
+ static_assert(((UINT32_MAX << DataSizeBits) & (uint32_t)DataSizeFlags::Max) == // Largest enumerator must fit in DataSizeBits.
+ 0,
+ "Not enough bits");
+ static_assert(((UINT32_MAX << RefKindBits) & (uint32_t)RefKindFlags::Max) == // Largest enumerator must fit in RefKindBits.
+ 0,
+ "Not enough bits");
+ struct LayoutFlags { // How DataSize, NumRefs and Refs[] are encoded for one record.
+ NumRefsFlags NumRefs;
+ DataSizeFlags DataSize;
+ RefKindFlags RefKind;
+ static uint64_t pack(LayoutFlags LF) { // Combines the three flags into one value using the Counts shifts.
+ unsigned Packed = ((unsigned)LF.NumRefs << NumRefsShift) |
+ ((unsigned)LF.DataSize << DataSizeShift) |
+ ((unsigned)LF.RefKind << RefKindShift);
+#ifndef NDEBUG
+ LayoutFlags RoundTrip = unpack(Packed);
+ assert(LF.NumRefs == RoundTrip.NumRefs);
+ assert(LF.DataSize == RoundTrip.DataSize);
+ assert(LF.RefKind == RoundTrip.RefKind);
+ return Packed;
+ }
+ static LayoutFlags unpack(uint64_t Storage) { // Inverse of pack().
+ assert(Storage <= UINT8_MAX && "Expect storage to fit in a byte");
+ LayoutFlags LF;
+ LF.NumRefs =
+ (NumRefsFlags)((Storage >> NumRefsShift) & ((1U << NumRefsBits) - 1));
+ LF.DataSize = (DataSizeFlags)((Storage >> DataSizeShift) &
+ ((1U << DataSizeBits) - 1));
+ LF.RefKind =
+ (RefKindFlags)((Storage >> RefKindShift) & ((1U << RefKindBits) - 1));
+ return LF;
+ }
+ };
+ /// Header layout:
+ /// - 1-byte: LayoutFlags
+ /// - 1-byte: 1B size field
+ /// - {0,2}-bytes: 2B size field
+ struct Header {
+ using PackTy = uint32_t;
+ PackTy Packed;
+ static constexpr unsigned LayoutFlagsShift =
+ (sizeof(PackTy) - 1) * CHAR_BIT; // LayoutFlags occupy the most significant byte of Packed.
+ };
+ struct Input { // What a record is built from: its references and its data bytes.
+ InternalRefArrayRef Refs;
+ ArrayRef<char> Data;
+ };
+ LayoutFlags getLayoutFlags() const {
+ return LayoutFlags::unpack(H->Packed >> Header::LayoutFlagsShift);
+ }
+ uint64_t getDataSize() const;
+ void skipDataSize(LayoutFlags LF, int64_t &RelOffset) const; // Advances RelOffset past an out-of-header DataSize field, if any.
+ uint32_t getNumRefs() const;
+ void skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const; // Advances RelOffset past an out-of-header NumRefs field, if any.
+ int64_t getRefsRelOffset() const; // Offset of Refs[] from the start of the header.
+ int64_t getDataRelOffset() const; // Offset of the data bytes from the start of the header.
+ static uint64_t getTotalSize(uint64_t DataRelOffset, uint64_t DataSize) {
+ return DataRelOffset + DataSize + 1; // +1 for the trailing null terminator.
+ }
+ uint64_t getTotalSize() const {
+ return getDataRelOffset() + getDataSize() + 1; // +1 for the trailing null terminator.
+ }
+ struct Layout { // Layout computed from an Input before any memory is allocated.
+ explicit Layout(const Input &I);
+ LayoutFlags Flags{};
+ uint64_t DataSize = 0;
+ uint32_t NumRefs = 0;
+ int64_t RefsRelOffset = 0;
+ int64_t DataRelOffset = 0;
+ uint64_t getTotalSize() const {
+ return DataRecordHandle::getTotalSize(DataRelOffset, DataSize);
+ }
+ };
+ InternalRefArrayRef getRefs() const { // View of Refs[], typed according to the RefKind flag.
+ assert(H && "Expected valid handle");
+ auto *BeginByte = reinterpret_cast<const char *>(H) + getRefsRelOffset();
+ size_t Size = getNumRefs();
+ if (!Size)
+ return InternalRefArrayRef();
+ if (getLayoutFlags().RefKind == RefKindFlags::InternalRef4B)
+ return ArrayRef(reinterpret_cast<const InternalRef4B *>(BeginByte), Size);
+ return ArrayRef(reinterpret_cast<const InternalRef *>(BeginByte), Size);
+ }
+ ArrayRef<char> getData() const { // View of the data bytes, excluding the null terminator.
+ assert(H && "Expected valid handle");
+ return ArrayRef(reinterpret_cast<const char *>(H) + getDataRelOffset(),
+ getDataSize());
+ }
+ static DataRecordHandle create(function_ref<char *(size_t Size)> Alloc, // Alloc receives the total record size.
+ const Input &I);
+ static Expected<DataRecordHandle>
+ createWithError(function_ref<Expected<char *>(size_t Size)> Alloc, // As create(), but the allocator may fail.
+ const Input &I);
+ static DataRecordHandle construct(char *Mem, const Input &I); // Writes a record for I directly into Mem.
+ static DataRecordHandle get(const char *Mem) { // Reinterprets Mem as an existing record; nothing is validated.
+ return DataRecordHandle(
+ *reinterpret_cast<const DataRecordHandle::Header *>(Mem));
+ }
+ explicit operator bool() const { return H; }
+ const Header &getHeader() const { return *H; }
+ DataRecordHandle() = default;
+ explicit DataRecordHandle(const Header &H) : H(&H) {}
+ static DataRecordHandle constructImpl(char *Mem, const Input &I,
+ const Layout &L);
+ const Header *H = nullptr; // Start of the record; null for a default-constructed handle.
+class StandaloneDataInMemory { // A standalone object's file contents together with the storage kind describing them.
+ OnDiskContent getContent() const;
+ /// FIXME: Should be mapped_file_region instead of MemoryBuffer to drop a
+ /// layer of indirection.
+ std::unique_ptr<MemoryBuffer> Region;
+ TrieRecord::StorageKind SK;
+ StandaloneDataInMemory(std::unique_ptr<MemoryBuffer> Region,
+ TrieRecord::StorageKind SK)
+ : Region(std::move(Region)), SK(SK) {
+#ifndef NDEBUG
+ bool IsStandalone = false; // Debug-only check that SK is one of the three standalone kinds.
+ switch (SK) {
+ case TrieRecord::StorageKind::Standalone:
+ case TrieRecord::StorageKind::StandaloneLeaf:
+ case TrieRecord::StorageKind::StandaloneLeaf0:
+ IsStandalone = true;
+ break;
+ default:
+ break;
+ }
+ assert(IsStandalone);
+ }
+/// Container for "big" objects mapped in separately. Sharded by the first hash byte; each shard has its own mutex.
+template <size_t NumShards> class StandaloneDataMap {
+ static_assert(isPowerOf2_64(NumShards), "Expected power of 2");
+ const StandaloneDataInMemory &insert(ArrayRef<uint8_t> Hash,
+ TrieRecord::StorageKind SK,
+ std::unique_ptr<MemoryBuffer> Buffer);
+ const StandaloneDataInMemory *lookup(ArrayRef<uint8_t> Hash) const; // Returns nullptr when nothing is registered for Hash.
+ bool count(ArrayRef<uint8_t> Hash) const { return bool(lookup(Hash)); }
+ struct Shard {
+ /// Needs to store a std::unique_ptr for a stable address identity.
+ DenseMap<const uint8_t *, std::unique_ptr<StandaloneDataInMemory>> Map; // Keyed by the hash's data pointer, not its bytes.
+ mutable std::mutex Mutex; // mutable so that const lookups can lock.
+ };
+ Shard &getShard(ArrayRef<uint8_t> Hash) { // Non-const overload forwards to the const one.
+ return const_cast<Shard &>(
+ const_cast<const StandaloneDataMap *>(this)->getShard(Hash));
+ }
+ const Shard &getShard(ArrayRef<uint8_t> Hash) const {
+ static_assert(NumShards <= 256, "Expected only 8 bits of shard");
+ return Shards[Hash[0] % NumShards]; // Reads Hash[0], so Hash must not be empty.
+ }
+ Shard Shards[NumShards];
+using StandaloneDataMapTy = StandaloneDataMap<16>; // 16 shards.
+struct InternalHandle { // Either a data-pool file offset or a pointer to standalone in-memory data.
+ FileOffset getAsFileOffset() const { return *DataOffset; } // Dereferences DataOffset without checking that it is set.
+ uint64_t getRawData() const { // Encodes the handle in 64 bits; the low bit distinguishes offset (0) from pointer (1).
+ if (DataOffset) {
+ uint64_t Raw = DataOffset->get();
+ assert(!(Raw & 0x1)); // The low bit must be free to serve as the tag.
+ return Raw;
+ }
+ uint64_t Raw = reinterpret_cast<uintptr_t>(SDIM);
+ assert(!(Raw & 0x1)); // The low bit must be free to serve as the tag.
+ return Raw | 1;
+ }
+ explicit InternalHandle(FileOffset DataOffset) : DataOffset(DataOffset) {}
+ explicit InternalHandle(uint64_t DataOffset) : DataOffset(DataOffset) {}
+ explicit InternalHandle(const StandaloneDataInMemory &SDIM) : SDIM(&SDIM) {}
+ std::optional<FileOffset> DataOffset; // Set by the two offset constructors.
+ const StandaloneDataInMemory *SDIM = nullptr; // Set by the StandaloneDataInMemory constructor.
+class InternalRefVector { // Collects refs as 4-byte entries until one does not fit, then switches to full-size entries.
+ void push_back(InternalRef Ref) {
+ if (NeedsFull)
+ return FullRefs.push_back(Ref);
+ if (std::optional<InternalRef4B> Small = InternalRef4B::tryToShrink(Ref))
+ return SmallRefs.push_back(*Small);
+ NeedsFull = true; // First ref that cannot shrink: move everything gathered so far to FullRefs.
+ assert(FullRefs.empty());
+ FullRefs.reserve(SmallRefs.size() + 1);
+ for (InternalRef4B Small : SmallRefs)
+ FullRefs.push_back(Small);
+ FullRefs.push_back(Ref);
+ SmallRefs.clear();
+ }
+ operator InternalRefArrayRef() const { // Views whichever of the two vectors is in use.
+ assert(SmallRefs.empty() || FullRefs.empty());
+ return NeedsFull ? InternalRefArrayRef(FullRefs)
+ : InternalRefArrayRef(SmallRefs);
+ }
+ bool NeedsFull = false;
+ SmallVector<InternalRef4B> SmallRefs;
+ SmallVector<InternalRef> FullRefs;
+} // namespace
+/// Proxy for any on-disk object or raw data.
+struct ondisk::OnDiskContent {
+ std::optional<DataRecordHandle> Record; // Presumably set when the content is a full DataStore record -- confirm.
+ std::optional<ArrayRef<char>> Bytes; // Presumably set when the content is raw leaf data -- confirm.
+Expected<DataRecordHandle> DataRecordHandle::createWithError(
+    function_ref<Expected<char *>(size_t Size)> Alloc, const Input &I) {
+  // Work out the layout first so the allocator is asked for exactly the
+  // number of bytes the record needs.
+  Layout L(I);
+  Expected<char *> Mem = Alloc(L.getTotalSize());
+  if (!Mem)
+    return Mem.takeError();
+  return constructImpl(*Mem, I, L);
+DataRecordHandle::create(function_ref<char *(size_t Size)> Alloc, // Alloc is called once with the record's total size.
+ const Input &I) {
+ Layout L(I);
+ return constructImpl(Alloc(L.getTotalSize()), I, L); // The pointer returned by Alloc is used without a null check.
+/// Proxy for an on-disk index record.
+struct OnDiskGraphDB::IndexProxy {
+ FileOffset Offset; // Presumably the record's offset in the index table -- confirm.
+ ArrayRef<uint8_t> Hash;
+ TrieRecord &Ref; // The record itself, held by reference.
+template <size_t N>
+const StandaloneDataInMemory &
+StandaloneDataMap<N>::insert(ArrayRef<uint8_t> Hash, TrieRecord::StorageKind SK,
+ std::unique_ptr<MemoryBuffer> Buffer) {
+ auto &S = getShard(Hash);
+ std::lock_guard<std::mutex> Lock(S.Mutex); // Held for the rest of the function.
+ auto &V = S.Map[Hash.data()]; // Keyed by the hash's data pointer; creates an empty slot if absent.
+ if (!V)
+ V = std::make_unique<StandaloneDataInMemory>(std::move(Buffer), SK); // Only the first insert for a key consumes Buffer.
+ return *V; // An existing entry is returned unchanged.
+template <size_t N>
+const StandaloneDataInMemory *
+StandaloneDataMap<N>::lookup(ArrayRef<uint8_t> Hash) const {
+  // Entries are keyed by the hash's data pointer; the shard's mutex guards the
+  // map for the duration of the search.
+  const Shard &Bucket = getShard(Hash);
+  std::lock_guard<std::mutex> Guard(Bucket.Mutex);
+  auto Found = Bucket.Map.find(Hash.data());
+  if (Found != Bucket.Map.end())
+    return &*Found->second;
+  return nullptr;
+/// Copy of \a sys::fs::TempFile that skips RemoveOnSignal, which is too
+/// expensive to register/unregister at this rate.
+///
+/// The destructor calls \a discard(), so a file that still has a name or an
+/// open descriptor when the object dies is closed and removed.
+///
+/// FIXME: Add a TempFileManager that maintains a thread-safe list of open temp
+/// files and has a signal handler registered that removes them all.
+class OnDiskGraphDB::TempFile {
+  bool Done = false;
+  TempFile(StringRef Name, int FD) : TmpName(std::string(Name)), FD(FD) {}
+  /// This creates a temporary file with createUniqueFile.
+  static Expected<TempFile> create(const Twine &Model);
+  TempFile(TempFile &&Other) { *this = std::move(Other); }
+  /// Takes over \p Other's file name and descriptor, leaving \p Other with
+  /// nothing to close or remove.
+  TempFile &operator=(TempFile &&Other) {
+    TmpName = std::move(Other.TmpName);
+    FD = Other.FD;
+    // A moved-from std::string is only guaranteed to be "valid but
+    // unspecified". Clear it explicitly so that Other's destructor (which
+    // runs discard()) can never remove the file this object now owns.
+    Other.TmpName.clear();
+    Other.Done = true;
+    Other.FD = -1;
+    return *this;
+  }
+  // Name of the temporary file.
+  std::string TmpName;
+  // The open file descriptor.
+  int FD = -1;
+  // Keep this with the given name.
+  Error keep(const Twine &Name);
+  // Close the descriptor (if open) and remove the file (if still named).
+  Error discard();
+  // Cleans up anything keep()/discard() has not already released; any error
+  // from that cleanup is dropped.
+  ~TempFile() { consumeError(discard()); }
+class OnDiskGraphDB::MappedTempFile { // A TempFile paired with a memory mapping of its contents.
+ char *data() const { return Map.data(); }
+ size_t size() const { return Map.size(); }
+ Error discard() { // Unmaps, then discards the temp file.
+ assert(Map && "Map already destroyed");
+ Map.unmap(); // Unmap before the file is closed and removed.
+ return Temp.discard();
+ }
+ Error keep(const Twine &Name) { // Unmaps, then keeps the temp file as Name.
+ assert(Map && "Map already destroyed");
+ Map.unmap(); // Unmap before the file is renamed and closed.
+ return Temp.keep(Name);
+ }
+ MappedTempFile(TempFile Temp, sys::fs::mapped_file_region Map)
+ : Temp(std::move(Temp)), Map(std::move(Map)) {}
+ TempFile Temp;
+ sys::fs::mapped_file_region Map;
+/// Closes the descriptor (if still open) and removes the temporary file (if it
+/// still has a name). Both steps are always attempted; a close failure is
+/// reported in preference to a remove failure. Calling this again after a
+/// fully successful call does nothing.
+Error OnDiskGraphDB::TempFile::discard() {
+  Done = true;
+  std::error_code CloseEC;
+  if (FD != -1) {
+    sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+    CloseEC = sys::fs::closeFile(File);
+  }
+  // Forget the descriptor even if closing failed: the destructor calls
+  // discard() again, and a second close of the same number could hit an
+  // unrelated descriptor that has since been opened.
+  FD = -1;
+  // Always try to close and remove.
+  std::error_code RemoveEC;
+  if (!TmpName.empty()) {
+    RemoveEC = sys::fs::remove(TmpName);
+    if (!RemoveEC)
+      TmpName = "";
+  }
+  return errorCodeToError(CloseEC ? CloseEC : RemoveEC);
+/// Renames the temporary file to \p Name and closes its descriptor. Must be
+/// called at most once and not after discard(). A close failure is reported in
+/// preference to a rename failure. If the rename fails the temporary name is
+/// kept, so the destructor's discard() still removes the file.
+Error OnDiskGraphDB::TempFile::keep(const Twine &Name) {
+  assert(!Done);
+  Done = true;
+  // Always try to close and rename.
+  std::error_code RenameEC = sys::fs::rename(TmpName, Name);
+  if (!RenameEC)
+    TmpName = "";
+  sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+  std::error_code CloseEC = sys::fs::closeFile(File);
+  // Forget the descriptor even if closing failed; otherwise the destructor's
+  // discard() would attempt to close the same descriptor number again.
+  FD = -1;
+  if (CloseEC)
+    return errorCodeToError(CloseEC);
+  return errorCodeToError(RenameEC);
+OnDiskGraphDB::TempFile::create(const Twine &Model) {
+  // createUniqueFile both chooses the final name and opens the file.
+  SmallString<128> UniquePath;
+  int Descriptor;
+  std::error_code EC = sys::fs::createUniqueFile(Model, Descriptor, UniquePath);
+  if (EC)
+    return errorCodeToError(EC);
+  TempFile Created(UniquePath, Descriptor);
+  return std::move(Created);
+bool TrieRecord::compare_exchange_strong(Data &Existing, Data New) {
+  // Operate on the packed 64-bit form. On failure, hand the value that was
+  // actually stored back to the caller through Existing.
+  uint64_t Observed = pack(Existing);
+  const uint64_t Desired = pack(New);
+  const bool Swapped = Storage.compare_exchange_strong(Observed, Desired);
+  if (!Swapped)
+    Existing = unpack(Observed);
+  return Swapped;
+DataRecordHandle DataRecordHandle::construct(char *Mem, const Input &I) { // Computes the layout for I and writes the record into Mem.
+ return constructImpl(Mem, I, Layout(I));
+DataRecordHandle DataRecordHandle::constructImpl(char *Mem, const Input &I, // Writes the record for I into Mem following layout L.
+ const Layout &L) {
+ char *Next = Mem + sizeof(Header); // Write cursor for everything that follows the header.
+ // Fill in Packed and set other data, then come back to construct the header.
+ Header::PackTy Packed = 0;
+ Packed |= LayoutFlags::pack(L.Flags) << Header::LayoutFlagsShift;
+ // Construct DataSize: packed into the header (1B/2B) or written after it (4B/8B).
+ switch (L.Flags.DataSize) {
+ case DataSizeFlags::Uses1B:
+ assert(I.Data.size() <= UINT8_MAX);
+ Packed |= (Header::PackTy)I.Data.size()
+ << ((sizeof(Packed) - 2) * CHAR_BIT); // Header's 1-byte slot.
+ break;
+ case DataSizeFlags::Uses2B:
+ assert(I.Data.size() <= UINT16_MAX);
+ Packed |= (Header::PackTy)I.Data.size()
+ << ((sizeof(Packed) - 4) * CHAR_BIT); // Header's 2-byte slot (shift of zero).
+ break;
+ case DataSizeFlags::Uses4B:
+ support::endian::write32le(Next, I.Data.size());
+ Next += 4;
+ break;
+ case DataSizeFlags::Uses8B:
+ support::endian::write64le(Next, I.Data.size());
+ Next += 8;
+ break;
+ }
+ // Construct NumRefs.
+ //
+ // NOTE: May be writing NumRefs even if there are zero refs in order to fix
+ // alignment.
+ switch (L.Flags.NumRefs) {
+ case NumRefsFlags::Uses0B:
+ break;
+ case NumRefsFlags::Uses1B:
+ assert(I.Refs.size() <= UINT8_MAX);
+ Packed |= (Header::PackTy)I.Refs.size()
+ << ((sizeof(Packed) - 2) * CHAR_BIT); // Same 1-byte slot as DataSize; Layout gives it to only one of them.
+ break;
+ case NumRefsFlags::Uses2B:
+ assert(I.Refs.size() <= UINT16_MAX);
+ Packed |= (Header::PackTy)I.Refs.size()
+ << ((sizeof(Packed) - 4) * CHAR_BIT); // Same 2-byte slot as DataSize; Layout gives it to only one of them.
+ break;
+ case NumRefsFlags::Uses4B:
+ support::endian::write32le(Next, I.Refs.size());
+ Next += 4;
+ break;
+ case NumRefsFlags::Uses8B:
+ support::endian::write64le(Next, I.Refs.size());
+ Next += 8;
+ break;
+ }
+ // Construct Refs[] by copying the input's raw ref buffer.
+ if (!I.Refs.empty()) {
+ assert((L.Flags.RefKind == RefKindFlags::InternalRef4B) == I.Refs.is4B());
+ ArrayRef<uint8_t> RefsBuffer = I.Refs.getBuffer();
+ llvm::copy(RefsBuffer, Next);
+ Next += RefsBuffer.size();
+ }
+ // Construct Data and the trailing null.
+ assert(isAddrAligned(Align(8), Next)); // Layout arranges for the data to start 8-byte aligned.
+ llvm::copy(I.Data, Next);
+ Next[I.Data.size()] = 0;
+ // Construct the header itself and return.
+ Header *H = new (Mem) Header{Packed};
+ DataRecordHandle Record(*H);
+ assert(Record.getData() == I.Data); // Debug-only read-back of what was just written.
+ assert(Record.getNumRefs() == I.Refs.size());
+ assert(Record.getRefs() == I.Refs);
+ assert(Record.getLayoutFlags().DataSize == L.Flags.DataSize);
+ assert(Record.getLayoutFlags().NumRefs == L.Flags.NumRefs);
+ assert(Record.getLayoutFlags().RefKind == L.Flags.RefKind);
+ return Record;
+/// Computes how a record for \p I is laid out: which encoding each size field
+/// uses, and the offsets (relative to the header) of Refs[] and of the data.
+///
+/// DataSize and NumRefs each take the smallest slot still available: the
+/// header's 1-byte or 2-byte slot (each usable once), otherwise a 4- or
+/// 8-byte field after the header. One field may then be widened by 4 bytes so
+/// that the data always starts 8-byte aligned without explicit padding.
+DataRecordHandle::Layout::Layout(const Input &I) {
+  // Start initial relative offsets right after the Header.
+  uint64_t RelOffset = sizeof(Header);
+  // Initialize the easy stuff.
+  DataSize = I.Data.size();
+  // NumRefs is stored as 32 bits; a larger count cannot be represented.
+  assert(I.Refs.size() <= UINT32_MAX && "Too many refs for a data record");
+  NumRefs = I.Refs.size();
+  // Check refs size.
+  Flags.RefKind =
+      I.Refs.is4B() ? RefKindFlags::InternalRef4B : RefKindFlags::InternalRef;
+  // Find the smallest slot available for DataSize.
+  bool Has1B = true;
+  bool Has2B = true;
+  if (DataSize <= UINT8_MAX && Has1B) {
+    Flags.DataSize = DataSizeFlags::Uses1B;
+    Has1B = false;
+  } else if (DataSize <= UINT16_MAX && Has2B) {
+    Flags.DataSize = DataSizeFlags::Uses2B;
+    Has2B = false;
+  } else if (DataSize <= UINT32_MAX) {
+    Flags.DataSize = DataSizeFlags::Uses4B;
+    RelOffset += 4;
+  } else {
+    Flags.DataSize = DataSizeFlags::Uses8B;
+    RelOffset += 8;
+  }
+  // Find the smallest slot available for NumRefs. Never sets NumRefs8B here.
+  if (!NumRefs) {
+    Flags.NumRefs = NumRefsFlags::Uses0B;
+  } else if (NumRefs <= UINT8_MAX && Has1B) {
+    Flags.NumRefs = NumRefsFlags::Uses1B;
+    Has1B = false;
+  } else if (NumRefs <= UINT16_MAX && Has2B) {
+    Flags.NumRefs = NumRefsFlags::Uses2B;
+    Has2B = false;
+  } else {
+    Flags.NumRefs = NumRefsFlags::Uses4B;
+    RelOffset += 4;
+  }
+  // Helper to "upgrade" either DataSize or NumRefs by 4B to avoid complicated
+  // padding rules when reading and writing. This also bumps RelOffset.
+  //
+  // The value for NumRefs is strictly limited to UINT32_MAX, but it can be
+  // stored as 8B. This means we can *always* find a size to grow.
+  //
+  // NOTE: Only call this once.
+  auto GrowSizeFieldsBy4B = [&]() {
+    assert(isAligned(Align(4), RelOffset));
+    RelOffset += 4;
+    assert(Flags.NumRefs != NumRefsFlags::Uses8B &&
+           "Expected to be able to grow NumRefs8B");
+    // First try to grow DataSize. NumRefs will not (yet) be 8B, and if
+    // DataSize is upgraded to 8B it'll already be aligned.
+    //
+    // Failing that, grow NumRefs.
+    if (Flags.DataSize < DataSizeFlags::Uses4B)
+      Flags.DataSize = DataSizeFlags::Uses4B; // DataSize: Packed => 4B.
+    else if (Flags.DataSize < DataSizeFlags::Uses8B)
+      Flags.DataSize = DataSizeFlags::Uses8B; // DataSize: 4B => 8B.
+    else if (Flags.NumRefs < NumRefsFlags::Uses4B)
+      Flags.NumRefs = NumRefsFlags::Uses4B; // NumRefs: Packed => 4B.
+    else
+      Flags.NumRefs = NumRefsFlags::Uses8B; // NumRefs: 4B => 8B.
+  };
+  assert(isAligned(Align(4), RelOffset));
+  if (Flags.RefKind == RefKindFlags::InternalRef) {
+    // List of 8B refs should be 8B-aligned. Grow one of the sizes to get this
+    // without padding.
+    if (!isAligned(Align(8), RelOffset))
+      GrowSizeFieldsBy4B();
+    assert(isAligned(Align(8), RelOffset));
+    RefsRelOffset = RelOffset;
+    // Widen before multiplying: NumRefs is 32-bit, so 8 * NumRefs evaluated in
+    // 32-bit arithmetic would wrap for very large ref counts.
+    RelOffset += uint64_t(8) * NumRefs;
+  } else {
+    // The array of 4B refs doesn't need 8B alignment, but the data will need
+    // to be 8B-aligned. Detect this now, and, if necessary, shift everything
+    // by 4B by growing one of the sizes.
+    // If we remove the need for 8B-alignment for data there is <1% savings in
+    // disk storage for a clang build using MCCAS but the 8B-alignment may be
+    // useful in the future so keep it for now.
+    // Widen before multiplying, for the same reason as the 8B case above.
+    uint64_t RefListSize = uint64_t(4) * NumRefs;
+    if (!isAligned(Align(8), RelOffset + RefListSize))
+      GrowSizeFieldsBy4B();
+    RefsRelOffset = RelOffset;
+    RelOffset += RefListSize;
+  }
+  assert(isAligned(Align(8), RelOffset));
+  DataRelOffset = RelOffset;
+uint64_t DataRecordHandle::getDataSize() const { // Decodes DataSize from the header slot or from the field after the header.
+ int64_t RelOffset = sizeof(Header);
+ auto *DataSizePtr = reinterpret_cast<const char *>(H) + RelOffset; // An out-of-header DataSize field starts right after the header.
+ switch (getLayoutFlags().DataSize) {
+ case DataSizeFlags::Uses1B:
+ return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX;
+ case DataSizeFlags::Uses2B:
+ return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) & // NOTE(review): right-hand operand is missing in this excerpt (presumably UINT16_MAX) -- confirm.
+ case DataSizeFlags::Uses4B:
+ return support::endian::read32le(DataSizePtr);
+ case DataSizeFlags::Uses8B:
+ return support::endian::read64le(DataSizePtr);
+ }
+/// Advance \p RelOffset past the out-of-line DataSize field, if the layout has
+/// one: 4 bytes for the 4B encoding, 8 bytes for the 8B encoding, and nothing
+/// for the smaller encodings.
+void DataRecordHandle::skipDataSize(LayoutFlags LF, int64_t &RelOffset) const {
+  if (LF.DataSize < DataSizeFlags::Uses4B)
+    return;
+  RelOffset += LF.DataSize >= DataSizeFlags::Uses8B ? 8 : 4;
+/// Decode the number of references recorded for this record.
+///
+/// Returns 0 for the 0B encoding. The 1-byte and 2-byte encodings are
+/// extracted from the packed header word; the 4- and 8-byte encodings are read
+/// little-endian from the field that follows the header and any out-of-line
+/// DataSize field.
+uint32_t DataRecordHandle::getNumRefs() const {
+  LayoutFlags LF = getLayoutFlags();
+  int64_t RelOffset = sizeof(Header);
+  skipDataSize(LF, RelOffset);
+  auto *NumRefsPtr = reinterpret_cast<const char *>(H) + RelOffset;
+  switch (LF.NumRefs) {
+  case NumRefsFlags::Uses0B:
+    return 0;
+  case NumRefsFlags::Uses1B:
+    return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX;
+  case NumRefsFlags::Uses2B:
+    // Mask down to the 2-byte field, mirroring the UINT8_MAX mask used for the
+    // 1-byte encoding above.
+    return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) &
+           UINT16_MAX;
+  case NumRefsFlags::Uses4B:
+    return support::endian::read32le(NumRefsPtr);
+  case NumRefsFlags::Uses8B:
+    return support::endian::read64le(NumRefsPtr);
+  }
+/// Advance \p RelOffset past the out-of-line NumRefs field, if the layout has
+/// one: 4 bytes for the 4B encoding, 8 bytes for the 8B encoding, and nothing
+/// for the smaller encodings.
+void DataRecordHandle::skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const {
+  if (LF.NumRefs < NumRefsFlags::Uses4B)
+    return;
+  RelOffset += LF.NumRefs >= NumRefsFlags::Uses8B ? 8 : 4;
+/// Offset of the reference list, relative to the start of the record: the
+/// header size plus whatever the out-of-line DataSize and NumRefs fields
+/// occupy.
+int64_t DataRecordHandle::getRefsRelOffset() const {
+  LayoutFlags Flags = getLayoutFlags();
+  int64_t Offset = sizeof(Header);
+  skipDataSize(Flags, Offset);
+  skipNumRefs(Flags, Offset);
+  return Offset;
+/// Offset of the data payload, relative to the start of the record: the
+/// header, the out-of-line size fields, and then the reference list (4 bytes
+/// per ref for InternalRef4B, 8 bytes otherwise).
+int64_t DataRecordHandle::getDataRelOffset() const {
+  LayoutFlags Flags = getLayoutFlags();
+  int64_t Offset = sizeof(Header);
+  skipDataSize(Flags, Offset);
+  skipNumRefs(Flags, Offset);
+  uint32_t BytesPerRef = Flags.RefKind == RefKindFlags::InternalRef4B ? 4 : 8;
+  uint32_t RefListBytes = BytesPerRef * getNumRefs();
+  return Offset + RefListBytes;
+/// Dump a human-readable summary of the database to \p OS.
+///
+/// Prints the root path, then the storage kind and offset of every index
+/// entry, and finally (if any entry lives in the data pool) the pool records
+/// sorted by offset with their ref count, data size, total size and end.
+void OnDiskGraphDB::print(raw_ostream &OS) const {
+  OS << "on-disk-root-path: " << RootPath << "\n";
+  // Offsets of data-pool records, collected while walking the index.
+  struct PoolInfo {
+    int64_t Offset;
+  };
+  SmallVector<PoolInfo> Pool;
+  OS << "\n";
+  OS << "index:\n";
+  Index.print(OS, [&](ArrayRef<char> Data) {
+    assert(Data.size() == sizeof(TrieRecord));
+    // Check the address rather than the size: the buffer is reinterpreted as
+    // a TrieRecord below. This is the same check indexHash() performs.
+    assert(isAddrAligned(Align::Of<TrieRecord>(), Data.data()));
+    auto *R = reinterpret_cast<const TrieRecord *>(Data.data());
+    TrieRecord::Data D = R->load();
+    OS << " SK=";
+    switch (D.SK) {
+    case TrieRecord::StorageKind::Unknown:
+      OS << "unknown ";
+      break;
+    case TrieRecord::StorageKind::DataPool:
+      OS << "datapool ";
+      // Remember pooled records so they can be listed after the index.
+      Pool.push_back({D.Offset.get()});
+      break;
+    case TrieRecord::StorageKind::Standalone:
+      OS << "standalone-data ";
+      break;
+    case TrieRecord::StorageKind::StandaloneLeaf:
+      OS << "standalone-leaf ";
+      break;
+    case TrieRecord::StorageKind::StandaloneLeaf0:
+      OS << "standalone-leaf+0";
+      break;
+    }
+    OS << " Offset=" << (void *)D.Offset.get();
+  });
+  if (Pool.empty())
+    return;
+  OS << "\n";
+  OS << "pool:\n";
+  llvm::sort(
+      Pool, [](PoolInfo LHS, PoolInfo RHS) { return LHS.Offset < RHS.Offset; });
+  for (PoolInfo PI : Pool) {
+    OS << "- addr=" << (void *)PI.Offset << " ";
+    DataRecordHandle D =
+        DataRecordHandle::get(DataPool.beginData(FileOffset(PI.Offset)));
+    OS << "record refs=" << D.getNumRefs() << " data=" << D.getDataSize()
+       << " size=" << D.getTotalSize()
+       << " end=" << (void *)(PI.Offset + D.getTotalSize()) << "\n";
+  }
+/// Look up \p Hash in the index through insertLazy() and return a proxy for
+/// the resulting entry. The callback constructs a fresh TrieRecord in the
+/// tentative value storage.
+OnDiskGraphDB::IndexProxy OnDiskGraphDB::indexHash(ArrayRef<uint8_t> Hash) {
+ OnDiskHashMappedTrie::pointer P = Index.insertLazy(
+ Hash, [](FileOffset TentativeOffset,
+ OnDiskHashMappedTrie::ValueProxy TentativeValue) {
+ // The value storage must be exactly one suitably aligned TrieRecord
+ // before the record is constructed in place.
+ assert(TentativeValue.Data.size() == sizeof(TrieRecord));
+ assert(
+ isAddrAligned(Align::Of<TrieRecord>(), TentativeValue.Data.data()));
+ new (TentativeValue.Data.data()) TrieRecord();
+ });
+ assert(P && "Expected insertion");
+ return getIndexProxyFromPointer(P);
+/// Build an IndexProxy (entry offset, hash, and TrieRecord reference) from an
+/// index pointer. \p P must be non-null and have a non-zero offset.
+OnDiskGraphDB::IndexProxy OnDiskGraphDB::getIndexProxyFromPointer(
+ OnDiskHashMappedTrie::const_pointer P) const {
+ assert(P);
+ assert(P.getOffset());
+ // The const is cast away so that callers can update the record through the
+ // proxy even though the pointer handed in is a const_pointer.
+ return IndexProxy{P.getOffset(), P->Hash,
+ *const_cast<TrieRecord *>(
+ reinterpret_cast<const TrieRecord *>(P->Data.data()))};
+/// Index \p Hash (see indexHash()) and return the external reference for the
+/// resulting entry.
+ObjectID OnDiskGraphDB::getReference(ArrayRef<uint8_t> Hash) {
+ IndexProxy I = indexHash(Hash);
+ return getExternalReference(I);
+/// Convert an index entry into an ObjectID by way of the InternalRef made from
+/// the entry's offset.
+ObjectID OnDiskGraphDB::getExternalReference(const IndexProxy &I) {
+ return getExternalReference(makeInternalRef(I.Offset));
+/// Return a reference for \p Digest if an object with that digest is stored
+/// in this database or is found by the upstream database; otherwise return
+/// std::nullopt.
+OnDiskGraphDB::getExistingReference(ArrayRef<uint8_t> Digest) {
+ // Fallback for when this database has no stored object for the digest: ask
+ // the upstream database (if there is one) and, on a hit, hand back a local
+ // reference, indexing the digest locally first if no entry was passed in.
+ auto tryUpstream =
+ [&](std::optional<IndexProxy> I) -> std::optional<ObjectID> {
+ if (!UpstreamDB)
+ return std::nullopt;
+ std::optional<ObjectID> UpstreamID =
+ UpstreamDB->getExistingReference(Digest);
+ if (!UpstreamID)
+ return std::nullopt;
+ if (!I)
+ I.emplace(indexHash(Digest));
+ return getExternalReference(*I);
+ };
+ OnDiskHashMappedTrie::const_pointer P = Index.find(Digest);
+ if (!P)
+ return tryUpstream(std::nullopt);
+ IndexProxy I = getIndexProxyFromPointer(P);
+ TrieRecord::Data Obj = I.Ref.load();
+ // An index entry with unknown storage kind has no object stored locally.
+ if (Obj.SK == TrieRecord::StorageKind::Unknown)
+ return tryUpstream(I);
+ return getExternalReference(makeInternalRef(I.Offset));
+/// Resolve an internal reference back to its index entry.
+///
+/// The reference's file offset is handed to the index for recovery; a
+/// reference that does not resolve to an entry is treated as database
+/// corruption and aborts via report_fatal_error().
+OnDiskGraphDB::getIndexProxyFromRef(InternalRef Ref) const {
+  OnDiskHashMappedTrie::const_pointer P =
+      Index.recoverFromFileOffset(Ref.getFileOffset());
+  // Only an unresolvable reference is fatal; a valid one must reach the
+  // return below.
+  if (!P)
+    report_fatal_error("OnDiskCAS: corrupt internal reference");
+  return getIndexProxyFromPointer(P);
+/// Return the hash stored in the index entry that \p Ref resolves to.
+ArrayRef<uint8_t> OnDiskGraphDB::getDigest(InternalRef Ref) const {
+ IndexProxy I = getIndexProxyFromRef(Ref);
+ return I.Hash;
+/// Return the hash carried by the index proxy \p I.
+ArrayRef<uint8_t> OnDiskGraphDB::getDigest(const IndexProxy &I) const {
+ return I.Hash;
+/// Return the data payload of \p Node: the raw bytes when the content is
+/// stored as bytes, otherwise the data section of its record.
+ArrayRef<char> OnDiskGraphDB::getObjectData(ObjectHandle Node) const {
+  OnDiskContent Stored = getContentFromHandle(Node);
+  if (!Stored.Bytes) {
+    assert(Stored.Record && "Expected record or bytes");
+    return Stored.Record->getData();
+  }
+  return *Stored.Bytes;
+/// Return the references of \p Node, or an empty result (built from
+/// std::nullopt) when the content has no record.
+InternalRefArrayRef OnDiskGraphDB::getInternalRefs(ObjectHandle Node) const {
+  std::optional<DataRecordHandle> Record = getContentFromHandle(Node).Record;
+  if (!Record)
+    return std::nullopt;
+  return Record->getRefs();
+/// Load the object named by \p ExternalRef.
+///
+/// When the object is unknown locally, returns std::nullopt if there is no
+/// upstream database and otherwise defers to faultInFromUpstream(). Data-pool
+/// objects are returned directly from their recorded offset; standalone
+/// objects are read from their file and registered in StandaloneData. A
+/// standalone record that carries an offset, or whose file cannot be read, is
+/// reported as a corrupt-object error.
+OnDiskGraphDB::load(ObjectID ExternalRef) {
+ InternalRef Ref = getInternalRef(ExternalRef);
+ IndexProxy I = getIndexProxyFromRef(Ref);
+ TrieRecord::Data Object = I.Ref.load();
+ if (Object.SK == TrieRecord::StorageKind::Unknown) {
+ if (!UpstreamDB)
+ return std::nullopt;
+ return faultInFromUpstream(ExternalRef);
+ }
+ // Wrap an internal handle's raw data as the opaque handle given to clients.
+ auto toObjectHandle = [](InternalHandle H) -> ObjectHandle {
+ return ObjectHandle::fromOpaqueData(H.getRawData());
+ };
+ if (Object.SK == TrieRecord::StorageKind::DataPool)
+ return toObjectHandle(InternalHandle(Object.Offset));
+ // Only TrieRecord::StorageKind::Standalone (and variants) need to be
+ // explicitly loaded.
+ //
+ // There's corruption if standalone objects have offsets, or if we get here
+ // for something that isn't standalone.
+ if (Object.Offset)
+ return createCorruptObjectError(getDigest(I));
+ switch (Object.SK) {
+ case TrieRecord::StorageKind::Unknown:
+ case TrieRecord::StorageKind::DataPool:
+ llvm_unreachable("unexpected storage kind");
+ case TrieRecord::StorageKind::Standalone:
+ case TrieRecord::StorageKind::StandaloneLeaf0:
+ case TrieRecord::StorageKind::StandaloneLeaf:
+ break;
+ }
+ // Load it from disk.
+ //
+ // Note: Creation logic guarantees that data that needs null-termination is
+ // suitably 0-padded. Requiring null-termination here would be too expensive
+ // for extremely large objects that happen to be page-aligned.
+ SmallString<256> Path;
+ getStandalonePath(TrieRecord::getStandaloneFileSuffix(Object.SK), I, Path);
+ ErrorOr<std::unique_ptr<MemoryBuffer>> OwnedBuffer = MemoryBuffer::getFile(
+ Path, /*IsText=*/false, /*RequiresNullTerminator=*/false);
+ if (!OwnedBuffer)
+ return createCorruptObjectError(getDigest(I));
+ // Hand the buffer to the standalone-data map and return a handle to the
+ // entry that insert() yields.
+ return toObjectHandle(
+ InternalHandle(static_cast<StandaloneDataMapTy *>(StandaloneData)
+ ->insert(I.Hash, Object.SK, std::move(*OwnedBuffer))));
+/// Report whether the object named by \p ExternalRef is available.
+///
+/// An object with a known storage kind is present locally. Otherwise, if
+/// \p CheckUpstream is set and an upstream database exists, the answer is
+/// whether the upstream can produce a reference for the object's digest.
+bool OnDiskGraphDB::containsObject(ObjectID ExternalRef,
+                                   bool CheckUpstream) const {
+  IndexProxy Entry = getIndexProxyFromRef(getInternalRef(ExternalRef));
+  TrieRecord::Data Stored = Entry.Ref.load();
+  const bool StoredLocally = Stored.SK != TrieRecord::StorageKind::Unknown;
+  if (StoredLocally)
+    return true;
+  if (!(CheckUpstream && UpstreamDB))
+    return false;
+  return UpstreamDB->getExistingReference(getDigest(Entry)).has_value();
+/// Make an InternalRef from the file offset of an index entry.
+InternalRef OnDiskGraphDB::makeInternalRef(FileOffset IndexOffset) {
+ return InternalRef::getFromOffset(IndexOffset);
+/// Overwrite \p Path with the location of the standalone file for index entry
+/// \p I: RootPath joined with FilePrefix + the entry's offset + \p Suffix.
+void OnDiskGraphDB::getStandalonePath(StringRef Suffix, const IndexProxy &I,
+ SmallVectorImpl<char> &Path) const {
+ Path.assign(RootPath.begin(), RootPath.end());
+ sys::path::append(Path, FilePrefix + Twine(I.Offset.get()) + Suffix);
+/// Resolve an opaque object handle to its content: either the content of an
+/// in-memory standalone entry, or a record located in the data pool.
+OnDiskContent OnDiskGraphDB::getContentFromHandle(ObjectHandle OH) const {
+ // Decode the opaque value. A set low bit marks a StandaloneDataInMemory
+ // pointer (stored with that bit added); otherwise the value is passed to
+ // InternalHandle as-is and later used as a data-pool file offset.
+ auto getInternalHandle = [](ObjectHandle Handle) -> InternalHandle {
+ uint64_t Data = Handle.getOpaqueData();
+ if (Data & 1)
+ return InternalHandle(*reinterpret_cast<const StandaloneDataInMemory *>(
+ Data & (-1ULL << 1)));
+ return InternalHandle(Data);
+ };
+ InternalHandle Handle = getInternalHandle(OH);
+ if (Handle.SDIM)
+ return Handle.SDIM->getContent();
+ auto DataHandle =
+ DataRecordHandle::get(DataPool.beginData(Handle.getAsFileOffset()));
+ assert(DataHandle.getData().end()[0] == 0 && "Null termination");
+ return OnDiskContent{DataHandle, std::nullopt};
+/// Interpret the standalone buffer according to its storage kind.
+///
+/// Leaf kinds expose the buffer as raw bytes (minus one trailing byte for
+/// StandaloneLeaf0); the Standalone kind is decoded as a data record.
+OnDiskContent StandaloneDataInMemory::getContent() const {
+  // Bytes to drop from the end of a leaf buffer; stays unset for records.
+  std::optional<size_t> LeafPadding;
+  switch (SK) {
+  default:
+    llvm_unreachable("Storage kind must be standalone");
+  case TrieRecord::StorageKind::Standalone:
+    break;
+  case TrieRecord::StorageKind::StandaloneLeaf0:
+    LeafPadding = 1;
+    break;
+  case TrieRecord::StorageKind::StandaloneLeaf:
+    LeafPadding = 0;
+    break;
+  }
+  if (!LeafPadding) {
+    DataRecordHandle Record = DataRecordHandle::get(Region->getBuffer().data());
+    assert(Record.getData().end()[0] == 0 &&
+           "Standalone object record missing null termination for data");
+    return OnDiskContent{Record, std::nullopt};
+  }
+  assert(Region->getBuffer().drop_back(*LeafPadding).end()[0] == 0 &&
+         "Standalone node data missing null termination");
+  return OnDiskContent{
+      std::nullopt,
+      arrayRefFromStringRef<char>(Region->getBuffer().drop_back(*LeafPadding))};
+/// Create a temporary file from the model FinalPath + ".%%%%%%", resize it to
+/// \p Size bytes, and map it read/write. Returns the file together with its
+/// mapping, or an error from any of those steps. \p Size must be non-zero.
+OnDiskGraphDB::createTempFile(StringRef FinalPath, uint64_t Size) {
+ assert(Size && "Unexpected request for an empty temp file");
+ Expected<TempFile> File = TempFile::create(FinalPath + ".%%%%%%");
+ if (!File)
+ return File.takeError();
+ if (auto EC = sys::fs::resize_file_before_mapping_readwrite(File->FD, Size))
+ return createFileError(File->TmpName, EC);
+ std::error_code EC;
+ sys::fs::mapped_file_region Map(sys::fs::convertFDToNativeFile(File->FD),
+ sys::fs::mapped_file_region::readwrite, Size,
+ 0, EC);
+ if (EC)
+ return createFileError(File->TmpName, EC);
+ return MappedTempFile(std::move(*File), std::move(Map));
+/// Return the process page-size estimate, queried once and kept in a
+/// function-local static.
+static size_t getPageSize() {
+ static int PageSize = sys::Process::getPageSizeEstimate();
+ return PageSize;
+/// Store \p Data as a standalone leaf file for index entry \p I.
+///
+/// When the data size is a multiple of the page size, the file gets one extra
+/// zero byte and the StandaloneLeaf0 kind; otherwise it is written as-is with
+/// the StandaloneLeaf kind. After the file is kept at its final path, the
+/// record is published with a compare-exchange; losing that race is fine as
+/// long as the winner stored a known storage kind.
+Error OnDiskGraphDB::createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data) {
+ assert(Data.size() > TrieRecord::MaxEmbeddedSize &&
+ "Expected a bigger file for external content...");
+ bool Leaf0 = isAligned(Align(getPageSize()), Data.size());
+ TrieRecord::StorageKind SK = Leaf0 ? TrieRecord::StorageKind::StandaloneLeaf0
+ : TrieRecord::StorageKind::StandaloneLeaf;
+ SmallString<256> Path;
+ int64_t FileSize = Data.size() + Leaf0;
+ getStandalonePath(TrieRecord::getStandaloneFileSuffix(SK), I, Path);
+ // Write the file. Don't reuse this mapped_file_region, which is read/write.
+ // Let load() pull up one that's read-only.
+ Expected<MappedTempFile> File = createTempFile(Path, FileSize);
+ if (!File)
+ return File.takeError();
+ assert(File->size() == (uint64_t)FileSize);
+ llvm::copy(Data, File->data());
+ if (Leaf0)
+ File->data()[Data.size()] = 0;
+ assert(File->data()[Data.size()] == 0);
+ if (Error E = File->keep(Path))
+ return E;
+ // Store the object reference.
+ // The exchange compares against a default-constructed record value.
+ TrieRecord::Data Existing;
+ {
+ TrieRecord::Data Leaf{SK, FileOffset()};
+ if (I.Ref.compare_exchange_strong(Existing, Leaf)) {
+ recordStandaloneSizeIncrease(FileSize);
+ return Error::success();
+ }
+ }
+ // If there was a race, confirm that the new value has valid storage.
+ if (Existing.SK == TrieRecord::StorageKind::Unknown)
+ return createCorruptObjectError(getDigest(I));
+ return Error::success();
+/// Store an object with references \p Refs and payload \p Data under \p ID.
+///
+/// Returns success immediately if the entry already has a known storage kind.
+/// Leaves (no refs) larger than TrieRecord::MaxEmbeddedSize go through
+/// createStandaloneLeaf(). Everything else is built as a data record, placed
+/// in the data pool when the record size is at most MaxEmbeddedSize and in a
+/// standalone temp file otherwise, and then published with a
+/// compare-exchange on the index entry.
+Error OnDiskGraphDB::store(ObjectID ID, ArrayRef<ObjectID> Refs,
+ ArrayRef<char> Data) {
+ IndexProxy I = getIndexProxyFromRef(getInternalRef(ID));
+ // Early return in case the node exists.
+ {
+ TrieRecord::Data Existing = I.Ref.load();
+ if (Existing.SK != TrieRecord::StorageKind::Unknown)
+ return Error::success();
+ }
+ // Big leaf nodes.
+ if (Refs.empty() && Data.size() > TrieRecord::MaxEmbeddedSize)
+ return createStandaloneLeaf(I, Data);
+ // TODO: Check whether it's worth checking the index for an already existing
+ // object (like storeTreeImpl() does) before building up the
+ // InternalRefVector.
+ InternalRefVector InternalRefs;
+ for (ObjectID Ref : Refs)
+ InternalRefs.push_back(getInternalRef(Ref));
+ // Create the object.
+ DataRecordHandle::Input Input{InternalRefs, Data};
+ // Compute the storage kind, allocate it, and create the record.
+ TrieRecord::StorageKind SK = TrieRecord::StorageKind::Unknown;
+ FileOffset PoolOffset;
+ SmallString<256> Path;
+ std::optional<MappedTempFile> File;
+ std::optional<uint64_t> FileSize;
+ // Allocation callback for the record: sets SK and either PoolOffset (pool
+ // allocation) or File/FileSize/Path (standalone temp file) as a side effect.
+ auto Alloc = [&](size_t Size) -> Expected<char *> {
+ if (Size <= TrieRecord::MaxEmbeddedSize) {
+ SK = TrieRecord::StorageKind::DataPool;
+ OnDiskDataAllocator::pointer P = DataPool.allocate(Size);
+ PoolOffset = P.getOffset();
+ // NOTE(review): the `});` below has no visible opener in this excerpt;
+ // presumably a debug-only wrapper around this trace -- confirm against
+ // the original patch.
+ dbgs() << "pool-alloc addr=" << (void *)PoolOffset.get()
+ << " size=" << Size
+ << " end=" << (void *)(PoolOffset.get() + Size) << "\n";
+ });
+ return P->data();
+ }
+ SK = TrieRecord::StorageKind::Standalone;
+ getStandalonePath(TrieRecord::getStandaloneFileSuffix(SK), I, Path);
+ if (Error E = createTempFile(Path, Size).moveInto(File))
+ return std::move(E);
+ assert(File->size() == Size);
+ FileSize = Size;
+ return File->data();
+ };
+ DataRecordHandle Record;
+ if (Error E =
+ DataRecordHandle::createWithError(Alloc, Input).moveInto(Record))
+ return E;
+ assert(Record.getData().end()[0] == 0 && "Expected null-termination");
+ assert(Record.getData() == Input.Data && "Expected initialization");
+ assert(SK != TrieRecord::StorageKind::Unknown);
+ assert(bool(File) != bool(PoolOffset) &&
+ "Expected either a mapped file or a pooled offset");
+ // Check for a race before calling MappedTempFile::keep().
+ //
+ // Then decide what to do with the file. Better to discard than overwrite if
+ // another thread/process has already added this.
+ TrieRecord::Data Existing = I.Ref.load();
+ {
+ TrieRecord::Data NewObject{SK, PoolOffset};
+ if (File) {
+ if (Existing.SK == TrieRecord::StorageKind::Unknown) {
+ // Keep the file!
+ if (Error E = File->keep(Path))
+ return E;
+ } else {
+ File.reset();
+ }
+ }
+ // If we didn't already see a racing/existing write, then try storing the
+ // new object. If that races, confirm that the new value has valid storage.
+ //
+ // TODO: Find a way to reuse the storage from the new-but-abandoned record
+ // handle.
+ if (Existing.SK == TrieRecord::StorageKind::Unknown) {
+ if (I.Ref.compare_exchange_strong(Existing, NewObject)) {
+ // The standalone size counter is only bumped for file-backed records.
+ if (FileSize)
+ recordStandaloneSizeIncrease(*FileSize);
+ return Error::success();
+ }
+ }
+ }
+ if (Existing.SK == TrieRecord::StorageKind::Unknown)
+ return createCorruptObjectError(getDigest(I));
+ // Load existing object.
+ return Error::success();
+/// Add \p SizeIncrease to the standalone-storage size counter (relaxed
+/// ordering).
+void OnDiskGraphDB::recordStandaloneSizeIncrease(size_t SizeIncrease) {
+ getStandaloneStorageSize().fetch_add(SizeIncrease, std::memory_order_relaxed);
+/// Return the standalone-storage size counter, which occupies the data pool's
+/// user header. The header must be exactly one 8-byte-aligned
+/// std::atomic<uint64_t>.
+std::atomic<uint64_t> &OnDiskGraphDB::getStandaloneStorageSize() {
+ MutableArrayRef<uint8_t> UserHeader = DataPool.getUserHeader();
+ assert(UserHeader.size() == sizeof(std::atomic<uint64_t>));
+ assert(isAddrAligned(Align(8), UserHeader.data()));
+ return *reinterpret_cast<std::atomic<uint64_t> *>(UserHeader.data());
+/// Read the current value of the standalone-storage size counter (relaxed
+/// ordering). Casts away const only to reach the non-const accessor; the
+/// counter is loaded, not modified.
+uint64_t OnDiskGraphDB::getStandaloneStorageSize() const {
+ return const_cast<OnDiskGraphDB *>(this)->getStandaloneStorageSize().load(
+ std::memory_order_relaxed);
+/// Total storage size: index size + data pool size + standalone-storage size
+/// counter.
+size_t OnDiskGraphDB::getStorageSize() const {
+ return Index.size() + DataPool.size() + getStandaloneStorageSize();
+/// Open (creating directories and files as needed) the database rooted at
+/// \p AbsPath.
+///
+/// Creates the index trie and the data pool under \p AbsPath, with table names
+/// that embed \p HashName (and, for the data pool, the fault-in policy name).
+/// The mapping size limits default to 8 GB for the index and 16 GB for the
+/// data pool unless getOverriddenMaxMappingSize() supplies a value, which then
+/// applies to both. Returns an error if any step fails or if the data pool's
+/// user header does not have the expected size.
+Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open(
+ StringRef AbsPath, StringRef HashName, unsigned HashByteSize,
+ std::unique_ptr<OnDiskGraphDB> UpstreamDB, FaultInPolicy Policy) {
+ if (std::error_code EC = sys::fs::create_directories(AbsPath))
+ return createFileError(AbsPath, EC);
+ const StringRef Slash = sys::path::get_separator();
+ constexpr uint64_t MB = 1024ull * 1024ull;
+ constexpr uint64_t GB = 1024ull * 1024ull * 1024ull;
+ uint64_t MaxIndexSize = 8 * GB;
+ uint64_t MaxDataPoolSize = 16 * GB;
+ auto CustomSize = getOverriddenMaxMappingSize();
+ if (!CustomSize)
+ return CustomSize.takeError();
+ if (*CustomSize)
+ MaxIndexSize = MaxDataPoolSize = **CustomSize;
+ std::optional<OnDiskHashMappedTrie> Index;
+ if (Error E =
+ OnDiskHashMappedTrie::create(
+ AbsPath + Slash + FilePrefix + IndexFile,
+ IndexTableName + "[" + HashName + "]", HashByteSize * CHAR_BIT,
+ /*DataSize=*/sizeof(TrieRecord), MaxIndexSize, /*MinFileSize=*/MB)
+ .moveInto(Index))
+ return std::move(E);
+ // The data pool's user header holds the standalone-storage size counter,
+ // initialized to zero by the callback below.
+ uint32_t UserHeaderSize = sizeof(std::atomic<uint64_t>);
+ std::optional<OnDiskDataAllocator> DataPool;
+ StringRef PolicyName =
+ Policy == FaultInPolicy::SingleNode ? "single" : "full";
+ if (Error E = OnDiskDataAllocator::create(
+ AbsPath + Slash + FilePrefix + DataPoolFile,
+ DataPoolTableName + "[" + HashName + "]" + PolicyName,
+ MaxDataPoolSize, /*MinFileSize=*/MB, UserHeaderSize,
+ [](void *UserHeaderPtr) {
+ new (UserHeaderPtr) std::atomic<uint64_t>(0);
+ })
+ .moveInto(DataPool))
+ return std::move(E);
+ if (DataPool->getUserHeader().size() != UserHeaderSize)
+ return createStringError(llvm::errc::argument_out_of_domain,
+ "unexpected user header in '" + AbsPath + Slash +
+ FilePrefix + DataPoolFile + "'");
+ return std::unique_ptr<OnDiskGraphDB>(
+ new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool),
+ std::move(UpstreamDB), Policy));
+/// Take ownership of the already-opened index, data pool and optional upstream
+/// database, record the root path and fault-in policy, and allocate the map
+/// that tracks standalone objects.
+OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskHashMappedTrie Index,
+ OnDiskDataAllocator DataPool,
+ std::unique_ptr<OnDiskGraphDB> UpstreamDB,
+ FaultInPolicy Policy)
+ : Index(std::move(Index)), DataPool(std::move(DataPool)),
+ RootPath(RootPath.str()), UpstreamDB(std::move(UpstreamDB)),
+ FIPolicy(Policy) {
+ /// Lifetime for "big" objects not in DataPool.
+ ///
+ /// NOTE: Could use ThreadSafeHashMappedTrie here. For now, doing something
+ /// simpler on the assumption there won't be much contention since most data
+ /// is not big. If there is contention, and we've already fixed ObjectProxy
+ /// object handles to be cheap enough to use consistently, the fix might be
+ /// to make better use of them rather than optimizing this map.
+ ///
+ /// FIXME: Figure out the right number of shards, if any.
+ StandaloneData = new StandaloneDataMapTy();
+/// Free the standalone-data map allocated by the constructor. The member is
+/// cast back to StandaloneDataMapTy before deletion.
+OnDiskGraphDB::~OnDiskGraphDB() {
+ delete static_cast<StandaloneDataMapTy *>(StandaloneData);
+/// Import \p UpstreamNode and everything it references from the upstream
+/// database, storing the root under \p PrimaryID. Returns the first error
+/// from loading an upstream node or storing into this database.
+Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID,
+ ObjectHandle UpstreamNode) {
+ // Copies the full CAS tree from upstream. Uses depth-first copying to protect
+ // against the process dying during importing and leaving the database with an
+ // incomplete tree. Note that if the upstream has missing nodes then the tree
+ // will be copied with missing nodes as well, it won't be considered an error.
+ struct UpstreamCursor {
+ ObjectHandle Node;
+ size_t RefsCount;
+ object_refs_iterator RefI;
+ object_refs_iterator RefE;
+ };
+ /// Keeps track of the state of visitation for current node and all of its
+ /// parents.
+ SmallVector<UpstreamCursor, 16> CursorStack;
+ /// Keeps track of the currently visited nodes as they are imported into
+ /// primary database, from current node and its parents. When a node is
+ /// entered for visitation it appends its own ID, then appends referenced IDs
+ /// as they get imported. When a node is fully imported it removes the
+ /// referenced IDs from the bottom of the stack which leaves its own ID at the
+ /// bottom, adding to the list of referenced IDs for the parent node.
+ SmallVector<ObjectID, 128> PrimaryNodesStack;
+ // Push the primary ID; when an upstream node is supplied, also push a cursor
+ // over its references so that they get visited.
+ auto enqueueNode = [&](ObjectID PrimaryID, std::optional<ObjectHandle> Node) {
+ PrimaryNodesStack.push_back(PrimaryID);
+ if (!Node)
+ return;
+ auto Refs = UpstreamDB->getObjectRefs(*Node);
+ CursorStack.push_back({*Node,
+ (size_t)std::distance(Refs.begin(), Refs.end()),
+ Refs.begin(), Refs.end()});
+ };
+ enqueueNode(PrimaryID, UpstreamNode);
+ while (!CursorStack.empty()) {
+ UpstreamCursor &Cur = CursorStack.back();
+ if (Cur.RefI == Cur.RefE) {
+ // Copy the node data into the primary store.
+ // FIXME: Use hard-link or cloning if the file-system supports it and data
+ // is stored into a separate file.
+ // The bottom of \p PrimaryNodesStack contains the primary ID for the
+ // current node plus the list of imported referenced IDs.
+ assert(PrimaryNodesStack.size() >= Cur.RefsCount + 1);
+ ObjectID PrimaryID = *(PrimaryNodesStack.end() - Cur.RefsCount - 1);
+ auto PrimaryRefs = ArrayRef(PrimaryNodesStack)
+ .slice(PrimaryNodesStack.size() - Cur.RefsCount);
+ auto Data = UpstreamDB->getObjectData(Cur.Node);
+ if (Error E = store(PrimaryID, PrimaryRefs, Data))
+ return E;
+ // Remove the current node and its IDs from the stack.
+ PrimaryNodesStack.truncate(PrimaryNodesStack.size() - Cur.RefsCount);
+ CursorStack.pop_back();
+ continue;
+ }
+ // Otherwise advance to the next reference of the current node.
+ ObjectID UpstreamID = *(Cur.RefI++);
+ ObjectID PrimaryID = getReference(UpstreamDB->getDigest(UpstreamID));
+ if (containsObject(PrimaryID, /*CheckUpstream=*/false)) {
+ // This \p ObjectID already exists in the primary. Either it was imported
+ // via \p importFullTree or the client created it, in which case the
+ // client takes responsibility for how it was formed.
+ enqueueNode(PrimaryID, std::nullopt);
+ continue;
+ }
+ Expected<std::optional<ObjectHandle>> UpstreamNode =
+ UpstreamDB->load(UpstreamID);
+ if (!UpstreamNode)
+ return UpstreamNode.takeError();
+ enqueueNode(PrimaryID, *UpstreamNode);
+ }
+ assert(PrimaryNodesStack.size() == 1);
+ assert(PrimaryNodesStack.front() == PrimaryID);
+ return Error::success();
+/// Import only \p UpstreamNode from the upstream database and store it under
+/// \p PrimaryID. Each upstream reference is translated to a local reference by
+/// digest; the referenced objects themselves are not copied.
+Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID,
+ ObjectHandle UpstreamNode) {
+ // Copies only a single node, it doesn't copy the referenced nodes.
+ // Copy the node data into the primary store.
+ // FIXME: Use hard-link or cloning if the file-system supports it and data is
+ // stored into a separate file.
+ auto Data = UpstreamDB->getObjectData(UpstreamNode);
+ auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode);
+ SmallVector<ObjectID, 64> Refs;
+ Refs.reserve(std::distance(UpstreamRefs.begin(), UpstreamRefs.end()));
+ for (ObjectID UpstreamRef : UpstreamRefs)
+ Refs.push_back(getReference(UpstreamDB->getDigest(UpstreamRef)));
+ return store(PrimaryID, Refs, Data);
+/// Fetch the object for \p PrimaryID from the upstream database, import it
+/// according to the fault-in policy (single node or full tree), and then load
+/// it from this database. Returns std::nullopt if the upstream load yields no
+/// object, or the error from the upstream load or the import. Requires an
+/// upstream database.
+OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) {
+ assert(UpstreamDB);
+ ObjectID UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID));
+ Expected<std::optional<ObjectHandle>> UpstreamNode =
+ UpstreamDB->load(UpstreamID);
+ if (!UpstreamNode)
+ return UpstreamNode.takeError();
+ if (!*UpstreamNode)
+ return std::nullopt;
+ if (Error E = FIPolicy == FaultInPolicy::SingleNode
+ ? importSingleNode(PrimaryID, **UpstreamNode)
+ : importFullTree(PrimaryID, **UpstreamNode))
+ return std::move(E);
+ return load(PrimaryID);
diff --git a/llvm/lib/CAS/OnDiskHashMappedTrie.cpp b/llvm/lib/CAS/OnDiskHashMappedTrie.cpp
new file mode 100644
index 00000000000000..09fef70ee7fc06
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskHashMappedTrie.cpp
@@ -0,0 +1,1356 @@
+//===- OnDiskHashMappedTrie.cpp -------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/OnDiskHashMappedTrie.h"
+#include "HashMappedTrieIndexGenerator.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CAS/MappedFileRegionBumpPtr.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+using namespace llvm::cas;
+static_assert(sizeof(size_t) == sizeof(uint64_t), "64-bit only");
+static_assert(sizeof(std::atomic<int64_t>) == sizeof(uint64_t),
+ "Requires lock-free 64-bit atomics");
+// Generic database data structures.
+namespace {
+using MappedFileRegion = MappedFileRegionBumpPtr::RegionT;
+/// Generic handle for a table.
+/// Probably we want some table kinds for pointing at multiple tables.
+/// - Probably a tree or trie type makes sense.
+/// - Or a deque. Linear search is okay as long as there aren't many tables in
+/// a file.
+/// Generic table header layout:
+/// - 2-bytes: TableKind
+/// - 2-bytes: TableNameSize
+/// - 4-bytes: TableNameRelOffset (relative to header)
+class TableHandle {
+ // Discriminator stored at the start of every table header.
+ enum class TableKind : uint16_t {
+ HashMappedTrie = 1,
+ DataAllocator = 2,
+ };
+ // Generic header shared by all table kinds.
+ struct Header {
+ TableKind Kind;
+ uint16_t NameSize;
+ int32_t NameRelOffset; // Relative to Header.
+ };
+ // True when the handle points at a header.
+ explicit operator bool() const { return H; }
+ const Header &getHeader() const { return *H; }
+ MappedFileRegion &getRegion() const { return *Region; }
+ // Compile-time check that T::Header starts with a member named
+ // GenericHeader of type TableHandle::Header.
+ template <class T> static void check() {
+ static_assert(
+ std::is_same<decltype(T::Header::GenericHeader), Header>::value,
+ "T::GenericHeader should be of type TableHandle::Header");
+ static_assert(offsetof(typename T::Header, GenericHeader) == 0,
+ "T::GenericHeader must be the head of T::Header");
+ }
+ // Kind test and conversions to a concrete table handle type T. dyn_cast()
+ // returns a default-constructed T on kind mismatch; cast() asserts the kind.
+ template <class T> bool is() const { return T::Kind == H->Kind; }
+ template <class T> T dyn_cast() const {
+ check<T>();
+ if (is<T>())
+ return T(*Region, *reinterpret_cast<typename T::Header *>(H));
+ return T();
+ }
+ template <class T> T cast() const {
+ assert(is<T>());
+ return dyn_cast<T>();
+ }
+ // The table name: NameSize bytes located NameRelOffset bytes from the header.
+ StringRef getName() const {
+ auto *Begin = reinterpret_cast<const char *>(H) + H->NameRelOffset;
+ return StringRef(Begin, H->NameSize);
+ }
+ TableHandle() = default;
+ TableHandle(MappedFileRegion &Region, Header &H) : Region(&Region), H(&H) {}
+ // Locate the header at HeaderOffset bytes from the start of the region.
+ TableHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+ : TableHandle(Region,
+ *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) {
+ }
+ MappedFileRegion *Region = nullptr;
+ Header *H = nullptr;
+/// Encapsulate a database file, which:
+/// - Sets/checks magic.
+/// - Sets/checks version.
+/// - Points at an arbitrary root table (can be changed later using a lock-free
+/// algorithm).
+/// - Sets up a BumpPtr for allocation.
+/// Top-level layout:
+/// - 8-bytes: Magic
+/// - 8-bytes: Version
+/// - 8-bytes: RootTable (16-bits: Kind; 48-bits: Offset)
+/// - 8-bytes: BumpPtr
+class DatabaseFile {
+ // Values expected in the Magic and Version header fields.
+ static constexpr uint64_t getMagic() { return 0x00FFDA7ABA53FF00ULL; }
+ static constexpr uint64_t getVersion() { return 1ULL; }
+ // Header at the start of the mapped file.
+ struct Header {
+ uint64_t Magic;
+ uint64_t Version;
+ std::atomic<int64_t> RootTableOffset;
+ std::atomic<int64_t> BumpPtr;
+ };
+ const Header &getHeader() { return *H; }
+ MappedFileRegionBumpPtr &getAlloc() { return Alloc; }
+ MappedFileRegion &getRegion() { return Alloc.getRegion(); }
+ /// Add a table.
+ ///
+ /// TODO: Allow lazy construction via getOrCreate()-style API.
+ void addTable(TableHandle Table);
+ /// Find a table. May return null.
+ std::optional<TableHandle> findTable(StringRef Name);
+ // Get or create the database file at Path; NewDBConstructor is handed to the
+ // new-file path of create().
+ static Expected<DatabaseFile>
+ create(const Twine &Path, uint64_t Capacity,
+ function_ref<Error(DatabaseFile &)> NewDBConstructor);
+ size_t size() const { return Alloc.size(); }
+ // Validate the mapped region and wrap it, sharing ownership of Alloc.
+ static Expected<DatabaseFile>
+ get(std::shared_ptr<MappedFileRegionBumpPtr> Alloc) {
+ if (Error E = validate(Alloc->getRegion()))
+ return std::move(E);
+ return DatabaseFile(std::move(Alloc));
+ }
+ static Error validate(MappedFileRegion &Region);
+ // Non-owning: only references Alloc.
+ DatabaseFile(MappedFileRegionBumpPtr &Alloc)
+ : H(reinterpret_cast<Header *>(Alloc.data())), Alloc(Alloc) {}
+ // Owning: additionally keeps the shared allocator alive via OwnedAlloc.
+ DatabaseFile(std::shared_ptr<MappedFileRegionBumpPtr> Alloc)
+ : DatabaseFile(*Alloc) {
+ OwnedAlloc = std::move(Alloc);
+ }
+ Header *H = nullptr;
+ MappedFileRegionBumpPtr &Alloc;
+ std::shared_ptr<MappedFileRegionBumpPtr> OwnedAlloc;
+} // end anonymous namespace
+/// Get or create the database file at \p Path with the given \p Capacity.
+/// For a new file, the header is written (magic, version, zeroed root offset
+/// and bump pointer), the bump pointer is initialized, and
+/// \p NewDBConstructor is run on the fresh database. The result is validated
+/// by DatabaseFile::get().
+DatabaseFile::create(const Twine &Path, uint64_t Capacity,
+ function_ref<Error(DatabaseFile &)> NewDBConstructor) {
+ // Constructor for if the file doesn't exist.
+ auto NewFileConstructor = [&](MappedFileRegionBumpPtr &Alloc) -> Error {
+ assert(Alloc.capacity() >= sizeof(Header));
+ (void)new (Alloc.data()) Header{getMagic(), getVersion(), {0}, {0}};
+ Alloc.initializeBumpPtr(offsetof(Header, BumpPtr));
+ DatabaseFile DB(Alloc);
+ return NewDBConstructor(DB);
+ };
+ // Get or create the file.
+ std::shared_ptr<MappedFileRegionBumpPtr> Alloc;
+ if (Error E = MappedFileRegionBumpPtr::createShared(Path, Capacity,
+ offsetof(Header, BumpPtr),
+ NewFileConstructor)
+ .moveInto(Alloc))
+ return std::move(E);
+ return DatabaseFile::get(std::move(Alloc));
+/// Install \p Table as the root table.
+///
+/// Succeeds if no root is set yet (root offset is 0) or if \p Table already is
+/// the root. Any other existing root is a fatal error, with a message that
+/// distinguishes a name collision from an attempt to add a second table.
+void DatabaseFile::addTable(TableHandle Table) {
+ assert(Table);
+ assert(&Table.getRegion() == &getRegion());
+ int64_t ExistingRootOffset = 0;
+ // Offset of the table's header from the start of the mapped region.
+ const int64_t NewOffset =
+ reinterpret_cast<const char *>(&Table.getHeader()) - getRegion().data();
+ if (H->RootTableOffset.compare_exchange_strong(ExistingRootOffset, NewOffset))
+ return;
+ // Silently ignore attempts to set the root to itself.
+ if (ExistingRootOffset == NewOffset)
+ return;
+ // FIXME: Fix the API so that having the same name is not an error. Instead,
+ // the colliding table should just be used as-is and the client can decide
+ // what to do with the new one.
+ //
+ // TODO: Add support for creating a chain or tree of tables (more than one at
+ // all!) to avoid this error.
+ TableHandle Root(getRegion(), ExistingRootOffset);
+ if (Root.getName() == Table.getName())
+ report_fatal_error(
+ createStringError(make_error_code(std::errc::not_supported),
+ "table name collision '" + Table.getName() + "'"));
+ else
+ report_fatal_error(
+ createStringError(make_error_code(std::errc::not_supported),
+ "cannot add new table '" + Table.getName() +
+ "'"
+ " to existing root '" +
+ Root.getName() + "'"));
+/// Return the root table if one is set and its name equals \p Name;
+/// std::nullopt otherwise.
+std::optional<TableHandle> DatabaseFile::findTable(StringRef Name) {
+  const int64_t RootOffset = H->RootTableOffset.load();
+  if (RootOffset != 0) {
+    TableHandle Root(getRegion(), RootOffset);
+    if (Root.getName() == Name)
+      return Root;
+  }
+  // TODO: Once multiple tables are supported, need to walk to find them.
+  return std::nullopt;
+/// Check that \p Region starts with a plausible database header: large enough
+/// to hold one, matching magic and version, and a bump pointer at or beyond
+/// the end of the header. Returns an invalid_argument error naming the first
+/// failed check.
+Error DatabaseFile::validate(MappedFileRegion &Region) {
+ if (Region.size() < sizeof(Header))
+ return createStringError(std::errc::invalid_argument,
+ "database: missing header");
+ // Check the magic and version.
+ auto *H = reinterpret_cast<Header *>(Region.data());
+ if (H->Magic != getMagic())
+ return createStringError(std::errc::invalid_argument,
+ "database: bad magic");
+ if (H->Version != getVersion())
+ return createStringError(std::errc::invalid_argument,
+ "database: wrong version");
+ // Check the bump-ptr, which should point past the header.
+ if (H->BumpPtr.load() < (int64_t)sizeof(Header))
+ return createStringError(std::errc::invalid_argument,
+ "database: corrupt bump-ptr");
+ return Error::success();
+// HashMappedTrie data structures.
+namespace {
+class SubtrieHandle;
+class SubtrieSlotValue {
+ explicit operator bool() const { return !isEmpty(); }
+ bool isEmpty() const { return !Offset; }
+ bool isData() const { return Offset > 0; }
+ bool isSubtrie() const { return Offset < 0; }
+ int64_t asData() const {
+ assert(isData());
+ return Offset;
+ }
+ int64_t asSubtrie() const {
+ assert(isSubtrie());
+ return -Offset;
+ }
+ FileOffset asSubtrieFileOffset() const { return FileOffset(asSubtrie()); }
+ FileOffset asDataFileOffset() const { return FileOffset(asData()); }
+ int64_t getRawOffset() const { return Offset; }
+ static SubtrieSlotValue getDataOffset(int64_t Offset) {
+ return SubtrieSlotValue(Offset);
+ }
+ static SubtrieSlotValue getSubtrieOffset(int64_t Offset) {
+ return SubtrieSlotValue(-Offset);
+ }
+ static SubtrieSlotValue getDataOffset(FileOffset Offset) {
+ return getDataOffset(Offset.get());
+ }
+  /// Build a slot value that refers to the subtrie stored at \p Offset.
+  ///
+  /// Subtries are encoded as negative raw offsets (see isSubtrie()), so this
+  /// must forward to the int64_t getSubtrieOffset() overload. Forwarding to
+  /// getDataOffset() would tag the offset as a data record and make a later
+  /// asSubtrie() call assert.
+  static SubtrieSlotValue getSubtrieOffset(FileOffset Offset) {
+    return getSubtrieOffset(Offset.get());
+  }
+ static SubtrieSlotValue getFromSlot(std::atomic<int64_t> &Slot) {
+ return SubtrieSlotValue(Slot.load());
+ }
+ SubtrieSlotValue() = default;
+ friend class SubtrieHandle;
+ explicit SubtrieSlotValue(int64_t Offset) : Offset(Offset) {}
+ int64_t Offset = 0;
+class HashMappedTrieHandle;
+/// Subtrie layout:
+/// - 2-bytes: StartBit
+/// - 1-byte: NumBits=lg(num-slots)
+/// - 1-byte: NumUnusedBits=lg(num-slots-unused)
+/// - 4-bytes: 0-pad
+/// - <slots>
+class SubtrieHandle {
+ struct Header {
+ /// The bit this subtrie starts on.
+ uint16_t StartBit;
+ /// The number of bits this subtrie handles. It has 2^NumBits slots.
+ uint8_t NumBits;
+ /// The number of extra bits this allocation *could* handle, due to
+ /// over-allocation. It has 2^NumUnusedBits unused slots.
+ uint8_t NumUnusedBits;
+ /// 0-pad to 8B.
+ uint32_t ZeroPad4B;
+ };
+ /// Slot storage:
+ /// - zero: Empty
+ /// - positive: RecordOffset
+ /// - negative: SubtrieOffset
+ using SlotT = std::atomic<int64_t>;
+ static int64_t getSlotsSize(uint32_t NumBits) {
+ return sizeof(int64_t) * (1u << NumBits);
+ }
+ static int64_t getSize(uint32_t NumBits) {
+ return sizeof(SubtrieHandle::Header) + getSlotsSize(NumBits);
+ }
+ int64_t getSize() const { return getSize(H->NumBits); }
+ SubtrieSlotValue load(size_t I) const {
+ return SubtrieSlotValue(Slots[I].load());
+ }
+ void store(size_t I, SubtrieSlotValue V) {
+ return Slots[I].store(V.getRawOffset());
+ }
+ void printHash(raw_ostream &OS, ArrayRef<uint8_t> Bytes) const;
+ void print(raw_ostream &OS, HashMappedTrieHandle Trie,
+ SmallVectorImpl<int64_t> &Records,
+ std::optional<std::string> Prefix = std::nullopt) const;
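+ /// Atomically replace slot \p I with \p New if it still holds \p Expected.
+ /// Returns true on success. On failure returns false and updates
+ /// \p Expected to the value currently stored in the slot.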
+ bool compare_exchange_strong(size_t I, SubtrieSlotValue &Expected,
+ SubtrieSlotValue New) {
+ return Slots[I].compare_exchange_strong(Expected.Offset, New.Offset);
+ }
+ /// Sink \p V from \p I in this subtrie down to \p NewI in a new subtrie with
+ /// \p NumSubtrieBits.
+ ///
+ /// \p UnusedSubtrie maintains a 1-item "free" list of unused subtries. If a
+ /// new subtrie is created that isn't used because of a lost race, then it
+ /// should be returned as an out parameter to be passed back in the future.
+ /// If it's already valid, it should be used instead of allocating a new one.
+ ///
+ /// Returns the subtrie that now lives at \p I.
+ SubtrieHandle sink(size_t I, SubtrieSlotValue V,
+ MappedFileRegionBumpPtr &Alloc, size_t NumSubtrieBits,
+ SubtrieHandle &UnusedSubtrie, size_t NewI);
+ /// Only safe if the subtrie is empty.
+ void reinitialize(uint32_t StartBit, uint32_t NumBits);
+ SubtrieSlotValue getOffset() const {
+ return SubtrieSlotValue::getSubtrieOffset(
+ reinterpret_cast<const char *>(H) - Region->data());
+ }
+ FileOffset getFileOffset() const { return getOffset().asSubtrieFileOffset(); }
+ explicit operator bool() const { return H; }
+ Header &getHeader() const { return *H; }
+ uint32_t getStartBit() const { return H->StartBit; }
+ uint32_t getNumBits() const { return H->NumBits; }
+ uint32_t getNumUnusedBits() const { return H->NumUnusedBits; }
+ static SubtrieHandle create(MappedFileRegionBumpPtr &Alloc, uint32_t StartBit,
+ uint32_t NumBits, uint32_t NumUnusedBits = 0);
+ static SubtrieHandle getFromFileOffset(MappedFileRegion &Region,
+ FileOffset Offset) {
+ return SubtrieHandle(Region, SubtrieSlotValue::getSubtrieOffset(Offset));
+ }
+ SubtrieHandle() = default;
+ SubtrieHandle(MappedFileRegion &Region, Header &H)
+ : Region(&Region), H(&H), Slots(getSlots(H)) {}
+ SubtrieHandle(MappedFileRegion &Region, SubtrieSlotValue Offset)
+ : SubtrieHandle(Region, *reinterpret_cast<Header *>(
+ Region.data() + Offset.asSubtrie())) {}
+ MappedFileRegion *Region = nullptr;
+ Header *H = nullptr;
+ MutableArrayRef<SlotT> Slots;
+ static MutableArrayRef<SlotT> getSlots(Header &H) {
+ return MutableArrayRef(reinterpret_cast<SlotT *>(&H + 1), 1u << H.NumBits);
+ }
+/// Handle for a HashMappedTrie table.
+/// HashMappedTrie table layout:
+/// - [8-bytes: Generic table header]
+/// - 1-byte: NumSubtrieBits
+/// - 1-byte: Flags (not used yet)
+/// - 2-bytes: NumHashBits
+/// - 4-bytes: RecordDataSize (in bytes)
+/// - 8-bytes: RootTrieOffset
+/// - 8-bytes: AllocatorOffset (reserved for implementing free lists)
+/// - <name> '\0'
+/// Record layout:
+/// - <data>
+/// - <hash>
+class HashMappedTrieHandle {
+ static constexpr TableHandle::TableKind Kind =
+ TableHandle::TableKind::HashMappedTrie;
+ struct Header {
+ TableHandle::Header GenericHeader;
+ uint8_t NumSubtrieBits;
+ uint8_t Flags; // None used yet.
+ uint16_t NumHashBits;
+ uint32_t RecordDataSize;
+ std::atomic<int64_t> RootTrieOffset;
+ std::atomic<int64_t> AllocatorOffset;
+ };
+ operator TableHandle() const {
+ if (!H)
+ return TableHandle();
+ return TableHandle(*Region, H->GenericHeader);
+ }
+ struct RecordData {
+ OnDiskHashMappedTrie::ValueProxy Proxy;
+ SubtrieSlotValue Offset;
+ FileOffset getFileOffset() const { return Offset.asDataFileOffset(); }
+ };
+ enum Limits : size_t {
+ /// Seems like 65528 hash bits ought to be enough.
+ MaxNumHashBytes = UINT16_MAX >> 3,
+ MaxNumHashBits = MaxNumHashBytes << 3,
+ /// 2^16 bits in a trie is 65536 slots. This restricts us to a 16-bit
+ /// index. This many slots is suspiciously large anyway.
+ MaxNumRootBits = 16,
+ /// 2^10 bits in a trie is 1024 slots. This many slots seems suspiciously
+ /// large for subtries.
+ MaxNumSubtrieBits = 10,
+ };
+ static constexpr size_t getNumHashBytes(size_t NumHashBits) {
+ assert(NumHashBits % 8 == 0);
+ return NumHashBits / 8;
+ }
+ static constexpr size_t getRecordSize(size_t RecordDataSize,
+ size_t NumHashBits) {
+ return RecordDataSize + getNumHashBytes(NumHashBits);
+ }
+ RecordData getRecord(SubtrieSlotValue Offset);
+ RecordData createRecord(MappedFileRegionBumpPtr &Alloc,
+ ArrayRef<uint8_t> Hash);
+ explicit operator bool() const { return H; }
+ const Header &getHeader() const { return *H; }
+ SubtrieHandle getRoot() const;
+ SubtrieHandle getOrCreateRoot(MappedFileRegionBumpPtr &Alloc);
+ MappedFileRegion &getRegion() const { return *Region; }
+ size_t getFlags() const { return H->Flags; }
+ uint64_t getNumSubtrieBits() const { return H->NumSubtrieBits; }
+ uint64_t getNumHashBits() const { return H->NumHashBits; }
+ size_t getNumHashBytes() const { return getNumHashBytes(H->NumHashBits); }
+ size_t getRecordDataSize() const { return H->RecordDataSize; }
+ size_t getRecordSize() const {
+ return getRecordSize(H->RecordDataSize, H->NumHashBits);
+ }
+ IndexGenerator getIndexGen(SubtrieHandle Root, ArrayRef<uint8_t> Hash) {
+ assert(Root.getStartBit() == 0);
+ assert(getNumHashBytes() == Hash.size());
+ assert(getNumHashBits() == Hash.size() * 8);
+ return IndexGenerator{Root.getNumBits(), getNumSubtrieBits(), Hash};
+ }
+ static HashMappedTrieHandle
+ create(MappedFileRegionBumpPtr &Alloc, StringRef Name,
+ std::optional<uint64_t> NumRootBits, uint64_t NumSubtrieBits,
+ uint64_t NumHashBits, uint64_t RecordDataSize);
+ void
+ print(raw_ostream &OS,
+ function_ref<void(ArrayRef<char>)> PrintRecordData = nullptr) const;
+ HashMappedTrieHandle() = default;
+ HashMappedTrieHandle(MappedFileRegion &Region, Header &H)
+ : Region(&Region), H(&H) {}
+ HashMappedTrieHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+ : HashMappedTrieHandle(
+ Region, *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) {
+ }
+ MappedFileRegion *Region = nullptr;
+ Header *H = nullptr;
+} // end anonymous namespace
+struct OnDiskHashMappedTrie::ImplType {
+ DatabaseFile File;
+ HashMappedTrieHandle Trie;
+/// Allocate a subtrie from \p Alloc that handles \p NumBits hash bits starting
+/// at \p StartBit. The allocation is sized for \p NumBits + \p NumUnusedBits
+/// bits; the 2^NumBits active slots are initialized to zero (empty).
+SubtrieHandle SubtrieHandle::create(MappedFileRegionBumpPtr &Alloc,
+                                    uint32_t StartBit, uint32_t NumBits,
+                                    uint32_t NumUnusedBits) {
+  // The header stores these values in narrow fields; make sure they fit.
+  assert(StartBit <= HashMappedTrieHandle::MaxNumHashBits);
+  assert(NumBits <= UINT8_MAX);
+  assert(NumUnusedBits <= UINT8_MAX);
+  const uint32_t TotalBits = NumBits + NumUnusedBits;
+  assert(TotalBits <= HashMappedTrieHandle::MaxNumRootBits);
+  // Size the allocation for the used and the spare slots together.
+  void *Storage = Alloc.allocate(getSize(TotalBits));
+  auto *NewHeader = new (Storage) SubtrieHandle::Header{
+      (uint16_t)StartBit, (uint8_t)NumBits, (uint8_t)NumUnusedBits,
+      /*ZeroPad4B=*/0};
+  SubtrieHandle Result(Alloc.getRegion(), *NewHeader);
+  // Placement-construct every active slot as empty.
+  for (SlotT &Slot : Result.Slots)
+    new (&Slot) SlotT(0);
+  return Result;
+/// Return a handle to the root subtrie, or a default-constructed (invalid)
+/// handle while the root offset stored in the header is still zero.
+SubtrieHandle HashMappedTrieHandle::getRoot() const {
+  const int64_t RootOffset = H->RootTrieOffset.load();
+  if (!RootOffset)
+    return SubtrieHandle();
+  return SubtrieHandle(getRegion(),
+                       SubtrieSlotValue::getSubtrieOffset(RootOffset));
+HashMappedTrieHandle::getOrCreateRoot(MappedFileRegionBumpPtr &Alloc) { // Return the root subtrie, creating and publishing one if none exists.
+ assert(&Alloc.getRegion() == &getRegion());
+ if (SubtrieHandle Root = getRoot()) // Fast path: a root offset is already stored.
+ return Root;
+ int64_t Race = 0; // Expected value for the exchange; overwritten with the stored offset on failure.
+ SubtrieHandle LazyRoot = SubtrieHandle::create(Alloc, 0, H->NumSubtrieBits);
+ if (H->RootTrieOffset.compare_exchange_strong(
+ Race, LazyRoot.getOffset().asSubtrie()))
+ return LazyRoot;
+ // The exchange failed, so a root was stored concurrently. Return that root.
+ //
+ // TODO: Avoid leaking the lazy root by storing it in an allocator.
+ return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Race));
+HashMappedTrieHandle::create(MappedFileRegionBumpPtr &Alloc, StringRef Name,
+                             std::optional<uint64_t> NumRootBits,
+                             uint64_t NumSubtrieBits, uint64_t NumHashBits,
+                             uint64_t RecordDataSize) {
+  // Writes a new table header followed by the null-terminated \p Name into
+  // space taken from \p Alloc, and creates a root subtrie when \p NumRootBits
+  // is provided.
+  //
+  // Every value below is narrowed into a fixed-width header field, so check
+  // the ranges before any space is taken from the allocator.
+  assert(Name.size() <= UINT16_MAX && "Expected smaller table name");
+  assert(NumSubtrieBits <= UINT8_MAX && "Expected valid subtrie bits");
+  assert(NumHashBits <= UINT16_MAX && "Expected valid hash size");
+  assert(RecordDataSize <= UINT32_MAX && "Expected smaller record data size");
+  // Allocate.
+  intptr_t Offset = Alloc.allocateOffset(sizeof(Header) + Name.size() + 1);
+  // Construct the header and the name.
+  auto *H = new (Alloc.getRegion().data() + Offset)
+      Header{{TableHandle::TableKind::HashMappedTrie, (uint16_t)Name.size(),
+              (uint32_t)sizeof(Header)},
+             (uint8_t)NumSubtrieBits,
+             /*Flags=*/0,
+             (uint16_t)NumHashBits,
+             (uint32_t)RecordDataSize,
+             /*RootTrieOffset=*/{0},
+             /*AllocatorOffset=*/{0}};
+  // The name is stored directly behind the header.
+  char *NameStorage = reinterpret_cast<char *>(H + 1);
+  llvm::copy(Name, NameStorage);
+  NameStorage[Name.size()] = 0;
+  // Construct a root trie, if requested.
+  HashMappedTrieHandle Trie(Alloc.getRegion(), *H);
+  if (NumRootBits)
+    H->RootTrieOffset =
+        SubtrieHandle::create(Alloc, 0, *NumRootBits).getOffset().asSubtrie();
+  return Trie;
+HashMappedTrieHandle::getRecord(SubtrieSlotValue Offset) {
+  // A record is the data bytes immediately followed by the hash bytes; build
+  // views over both parts of the mapped storage without copying.
+  char *DataBegin = Region->data() + Offset.asData();
+  OnDiskHashMappedTrie::ValueProxy View;
+  View.Data = MutableArrayRef(DataBegin, getRecordDataSize());
+  const auto *HashBegin = reinterpret_cast<const uint8_t *>(View.Data.end());
+  View.Hash = ArrayRef(HashBegin, getNumHashBytes());
+  return RecordData{View, Offset};
+HashMappedTrieHandle::createRecord(MappedFileRegionBumpPtr &Alloc,
+                                   ArrayRef<uint8_t> Hash) {
+  assert(&Alloc.getRegion() == Region);
+  assert(Hash.size() == getNumHashBytes());
+  // Reserve room for data + hash, then wrap the fresh storage in a record.
+  const auto NewOffset = Alloc.allocateOffset(getRecordSize());
+  RecordData NewRecord = getRecord(SubtrieSlotValue::getDataOffset(NewOffset));
+  // Only the hash is written here; the data bytes are not touched.
+  uint8_t *HashStorage = const_cast<uint8_t *>(NewRecord.Proxy.Hash.begin());
+  llvm::copy(Hash, HashStorage);
+  return NewRecord;
+    const uint8_t *HashBeginPtr) const {
+  // Record hashes occur immediately after data, so the record begins exactly
+  // getRecordDataSize() bytes before the hash. (Subtracting the full record
+  // size would additionally step back by the hash length and land before the
+  // record.) Compute the beginning of the record and check for overflow.
+  const uintptr_t HashBegin = reinterpret_cast<uintptr_t>(HashBeginPtr);
+  const uintptr_t RecordBegin = HashBegin - Impl->Trie.getRecordDataSize();
+  if (HashBegin < RecordBegin)
+    return const_pointer();
+  // Check that it'll be a positive offset.
+  const uintptr_t FileBegin =
+      reinterpret_cast<uintptr_t>(Impl->File.getRegion().data());
+  if (RecordBegin < FileBegin)
+    return const_pointer();
+  // Good enough to form an offset. Continue checking there.
+  return recoverFromFileOffset(FileOffset(RecordBegin - FileBegin));
+OnDiskHashMappedTrie::recoverFromFileOffset(FileOffset Offset) const {
+  // Reject offsets that are not aligned to the bump allocator's alignment.
+  const bool IsAligned =
+      isAligned(MappedFileRegionBumpPtr::getAlign(), Offset.get());
+  if (!IsAligned)
+    return const_pointer();
+  // Reject records that would extend past the size reported by the allocator.
+  //
+  // Note: There's no potential overflow when using \c uint64_t because Offset
+  // is in \c [0,INT64_MAX] and the record size is in \c [0,UINT32_MAX].
+  assert(Offset.get() >= 0 && "Expected FileOffset constructor guarantee this");
+  const uint64_t RecordEnd =
+      (uint64_t)Offset.get() + Impl->Trie.getRecordSize();
+  if (RecordEnd > Impl->File.getAlloc().size())
+    return const_pointer();
+  // Looks okay...
+  HashMappedTrieHandle::RecordData Recovered =
+      Impl->Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset));
+  return const_pointer(Recovered.getFileOffset(), Recovered.Proxy);
+OnDiskHashMappedTrie::find(ArrayRef<uint8_t> Hash) const { // Read-only lookup of the record stored for \p Hash.
+ HashMappedTrieHandle Trie = Impl->Trie;
+ assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash");
+ SubtrieHandle S = Trie.getRoot();
+ if (!S) // No root subtrie exists, so there is nothing to find.
+ return const_pointer();
+ IndexGenerator IndexGen = Trie.getIndexGen(S, Hash);
+ size_t Index = IndexGen.next();
+ for (;;) {
+ // Load the slot that the current hash bits select in this subtrie.
+ SubtrieSlotValue V = S.load(Index);
+ if (!V) // Empty slot: no match; the result carries a hint for this subtrie and slot.
+ return const_pointer(S.getFileOffset(),
+ HintT(this, Index, *IndexGen.StartBit));
+ // Check for an exact match.
+ if (V.isData()) {
+ HashMappedTrieHandle::RecordData D = Trie.getRecord(V);
+ return D.Proxy.Hash == Hash
+ ? const_pointer(D.getFileOffset(), D.Proxy)
+ : const_pointer(S.getFileOffset(),
+ HintT(this, Index, *IndexGen.StartBit));
+ }
+ Index = IndexGen.next(); // The slot holds a subtrie: descend one level.
+ S = SubtrieHandle(Trie.getRegion(), V);
+ }
+/// Rewrite the header's StartBit and NumBits in place. Only safe if the subtrie is empty.
+void SubtrieHandle::reinitialize(uint32_t StartBit, uint32_t NumBits) {
+ assert(StartBit > H->StartBit); // The new start bit must be strictly greater than the current one.
+ assert(NumBits <= H->NumBits); // Must not ask for more bits than the header currently records.
+ // Ideally would also assert that all slots are empty, but that's expensive.
+ H->StartBit = StartBit;
+ H->NumBits = NumBits;
+OnDiskHashMappedTrie::insertLazy(const_pointer Hint, ArrayRef<uint8_t> Hash,
+ LazyInsertOnConstructCB OnConstruct,
+ LazyInsertOnLeakCB OnLeak) { // Insert a record for \p Hash unless one with the same hash is already stored.
+ HashMappedTrieHandle Trie = Impl->Trie;
+ assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash");
+ MappedFileRegionBumpPtr &Alloc = Impl->File.getAlloc();
+ SubtrieHandle S = Trie.getOrCreateRoot(Alloc);
+ IndexGenerator IndexGen = Trie.getIndexGen(S, Hash);
+ size_t Index;
+ if (std::optional<HintT> H = Hint.getHint(*this)) { // Start from the subtrie and slot recorded in the hint.
+ S = SubtrieHandle::getFromFileOffset(Trie.getRegion(), Hint.getOffset());
+ Index = IndexGen.hint(H->I, H->B);
+ } else {
+ Index = IndexGen.next();
+ }
+ // FIXME: Add non-assertion based checks for data corruption that would
+ // otherwise cause infinite loops in release builds, instead calling
+ // report_fatal_error().
+ //
+ // Two loops are possible:
+ // - All bits used up in the IndexGenerator because subtries are somehow
+ // linked in a cycle. Could confirm that each subtrie's start-bit
+ // follows from the start-bit and num-bits of its parent. Could also check
+ // that the generator doesn't run out of bits.
+ // - Existing data matches tail of Hash but not the head (stored in an
+ // invalid spot). Probably a cheap way to check this too, but needs
+ // thought.
+ std::optional<HashMappedTrieHandle::RecordData> NewRecord; // Created at most once, on the first empty slot seen.
+ SubtrieHandle UnusedSubtrie; // Spare subtrie handed back by sink() after a lost race.
+ for (;;) {
+ SubtrieSlotValue Existing = S.load(Index);
+ // Try to set it, if it's empty.
+ if (!Existing) {
+ if (!NewRecord) {
+ NewRecord = Trie.createRecord(Alloc, Hash);
+ if (OnConstruct)
+ OnConstruct(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy);
+ }
+ if (S.compare_exchange_strong(Index, Existing, NewRecord->Offset))
+ return pointer(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy);
+ // Race means that Existing is no longer empty; fall through...
+ }
+ if (Existing.isSubtrie()) { // Descend into the child subtrie and retry there.
+ S = SubtrieHandle(Trie.getRegion(), Existing);
+ Index = IndexGen.next();
+ continue;
+ }
+ // Check for an exact match.
+ HashMappedTrieHandle::RecordData ExistingRecord = Trie.getRecord(Existing);
+ if (ExistingRecord.Proxy.Hash == Hash) {
+ if (NewRecord && OnLeak) // A record was already created above but will not be stored.
+ OnLeak(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy,
+ ExistingRecord.Offset.asDataFileOffset(), ExistingRecord.Proxy);
+ return pointer(ExistingRecord.Offset.asDataFileOffset(),
+ ExistingRecord.Proxy);
+ }
+ // Sink the existing content as long as the indexes match.
+ for (;;) {
+ size_t NextIndex = IndexGen.next();
+ size_t NewIndexForExistingContent =
+ IndexGen.getCollidingBits(ExistingRecord.Proxy.Hash);
+ S = S.sink(Index, Existing, Alloc, IndexGen.getNumBits(), UnusedSubtrie,
+ NewIndexForExistingContent);
+ Index = NextIndex;
+ // Found the difference.
+ if (NextIndex != NewIndexForExistingContent)
+ break;
+ }
+ }
+SubtrieHandle SubtrieHandle::sink(size_t I, SubtrieSlotValue V,
+ MappedFileRegionBumpPtr &Alloc,
+ size_t NumSubtrieBits,
+ SubtrieHandle &UnusedSubtrie, size_t NewI) { // Replace slot I (holding V) with a subtrie that stores V at NewI.
+ SubtrieHandle NewS;
+ if (UnusedSubtrie) {
+ // Steal UnusedSubtrie and initialize it.
+ std::swap(NewS, UnusedSubtrie);
+ NewS.reinitialize(getStartBit() + getNumBits(), NumSubtrieBits);
+ } else {
+ // Allocate a new, empty subtrie.
+ NewS = SubtrieHandle::create(Alloc, getStartBit() + getNumBits(),
+ NumSubtrieBits);
+ }
+ NewS.store(NewI, V); // Fill the new subtrie before attempting to publish it.
+ if (compare_exchange_strong(I, V, NewS.getOffset())) // On failure V is updated to the slot's current value.
+ return NewS; // Success!
+ // Raced.
+ assert(V.isSubtrie() && "Expected racing sink() to add a subtrie");
+ // Wipe out the new slot so NewS can be reused and set the out parameter.
+ NewS.store(NewI, SubtrieSlotValue());
+ UnusedSubtrie = NewS;
+ // Return the subtrie added by the concurrent sink() call.
+ return SubtrieHandle(Alloc.getRegion(), V);
+void OnDiskHashMappedTrie::print(
+ raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const {
+ Impl->Trie.print(OS, PrintRecordData);
+/// Print \p Digit as a single character: values below 10 map to '0'..'9',
+/// larger values continue from 'a'.
+static void printHexDigit(raw_ostream &OS, uint8_t Digit) {
+  OS << (Digit < 10 ? char(Digit + '0') : char(Digit - 10 + 'a'));
+/// Print the bits [StartBit, StartBit + NumBits) of \p Bytes as hexadecimal
+/// digits. Both bounds must fall on 4-bit boundaries.
+static void printHexDigits(raw_ostream &OS, ArrayRef<uint8_t> Bytes,
+                           size_t StartBit, size_t NumBits) {
+  assert(StartBit % 4 == 0);
+  assert(NumBits % 4 == 0);
+  const size_t EndBit = StartBit + NumBits;
+  for (size_t Bit = StartBit; Bit != EndBit; Bit += 4) {
+    const uint8_t Byte = Bytes[Bit / 8];
+    // Byte-aligned positions select the high nibble, the others the low one.
+    const bool IsHighNibble = Bit % 8 == 0;
+    printHexDigit(OS, IsHighNibble ? Byte >> 4 : Byte & 0xf);
+  }
+/// Dump the table parameters, then the subtrie structure, then every record
+/// sorted by offset. Record data is printed through \p PrintRecordData when
+/// provided and as hexadecimal bytes otherwise.
+void HashMappedTrieHandle::print(
+    raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const {
+  OS << "hash-num-bits=" << getNumHashBits()
+     << " hash-size=" << getNumHashBytes()
+     << " record-data-size=" << getRecordDataSize() << "\n";
+  // Print the trie structure first, collecting record offsets along the way.
+  SmallVector<int64_t> RecordOffsets;
+  if (SubtrieHandle Root = getRoot())
+    Root.print(OS, *this, RecordOffsets);
+  if (RecordOffsets.empty())
+    return;
+  llvm::sort(RecordOffsets);
+  OS << "records\n";
+  for (int64_t RecordOffset : RecordOffsets) {
+    OS << "- addr=" << (void *)RecordOffset << " ";
+    // getRecord() is a non-const member, so call it on a copy of this handle.
+    HashMappedTrieHandle Self = *this;
+    HashMappedTrieHandle::RecordData Record =
+        Self.getRecord(SubtrieSlotValue::getDataOffset(RecordOffset));
+    if (!PrintRecordData) {
+      OS << "bytes=";
+      ArrayRef<uint8_t> Bytes(
+          reinterpret_cast<const uint8_t *>(Record.Proxy.Data.data()),
+          Record.Proxy.Data.size());
+      printHexDigits(OS, Bytes, 0, Bytes.size() * 8);
+    } else {
+      PrintRecordData(Record.Proxy.Data);
+    }
+    OS << "\n";
+  }
+/// Print the bits [StartBit, StartBit + NumBits) of \p Bytes as '0'/'1'
+/// characters, counting bits from the most significant bit of each byte.
+static void printBits(raw_ostream &OS, ArrayRef<uint8_t> Bytes, size_t StartBit,
+                      size_t NumBits) {
+  assert(StartBit + NumBits <= Bytes.size() * 8u);
+  const size_t EndBit = StartBit + NumBits;
+  for (size_t Bit = StartBit; Bit != EndBit; ++Bit) {
+    // Bit 0 of a byte is its top bit, so shift by the distance to the bottom.
+    const size_t Shift = 7 - Bit % 8;
+    const bool IsSet = (Bytes[Bit / 8] >> Shift) & 0x1;
+    OS << (IsSet ? '1' : '0');
+  }
+/// Print the hash in \p Bytes with the bits this subtrie indexes on (widened
+/// to 4-bit boundaries) shown in binary inside brackets and everything before
+/// and after shown in hexadecimal.
+void SubtrieHandle::printHash(raw_ostream &OS, ArrayRef<uint8_t> Bytes) const {
+  // afb[1c:00*01110*0]def
+  const size_t BinaryBegin = getStartBit() & ~0x3u;
+  const size_t RangeEnd = getStartBit() + getNumBits();
+  const size_t BinaryEnd = (RangeEnd + 3u) & ~0x3u;
+  const size_t TotalBits = Bytes.size() * 8u;
+  printHexDigits(OS, Bytes, 0, BinaryBegin);
+  OS << "[";
+  printBits(OS, Bytes, BinaryBegin, BinaryEnd - BinaryBegin);
+  OS << "]";
+  printHexDigits(OS, Bytes, BinaryEnd, TotalBits - BinaryEnd);
+/// Append \p Index to \p Prefix as binary digits, most significant first. One
+/// digit is produced for each doubling needed to reach \p NumSlots from 1.
+static void appendIndexBits(std::string &Prefix, size_t Index,
+                            size_t NumSlots) {
+  // Collect the digits least-significant first, then append them reversed.
+  std::string LsbFirst;
+  for (size_t Span = 1u; Span < NumSlots; Span <<= 1, Index >>= 1)
+    LsbFirst.push_back('0' + (Index & 0x1));
+  Prefix.append(LsbFirst.rbegin(), LsbFirst.rend());
+/// Print a prefix given as a string of binary digits: each full group of four
+/// digits becomes one hexadecimal character, and any remaining digits are
+/// printed verbatim inside brackets.
+static void printPrefix(raw_ostream &OS, StringRef Prefix) {
+  for (; Prefix.size() >= 4; Prefix = Prefix.drop_front(4)) {
+    uint8_t Nibble;
+    const bool Failed = Prefix.take_front(4).getAsInteger(2, Nibble);
+    assert(!Failed);
+    (void)Failed;
+    printHexDigit(OS, Nibble);
+  }
+  if (Prefix.empty())
+    return;
+  OS << "[" << Prefix << "]";
+void SubtrieHandle::print(raw_ostream &OS, HashMappedTrieHandle Trie,
+ SmallVectorImpl<int64_t> &Records,
+ std::optional<std::string> Prefix) const { // Print this subtrie's occupied slots, then recurse into child subtries.
+ if (!Prefix) { // No prefix was passed: label this subtrie as the root.
+ OS << "root";
+ Prefix.emplace();
+ } else {
+ OS << "subtrie=";
+ printPrefix(OS, *Prefix);
+ }
+ OS << " addr="
+ << (void *)(reinterpret_cast<const char *>(H) - Region->data());
+ const size_t NumSlots = Slots.size();
+ OS << " num-slots=" << NumSlots << "\n";
+ SmallVector<SubtrieHandle> Subs; // Child subtries, printed after this level's slots.
+ SmallVector<std::string> Prefixes; // Prefix for each entry of Subs, kept at the same index.
+ for (size_t I = 0, E = NumSlots; I != E; ++I) {
+ SubtrieSlotValue Slot = load(I);
+ if (!Slot) // Empty slots are not printed.
+ continue;
+ OS << "- index=";
+ for (size_t Pad : {10, 100, 1000}) // Left-pad the index with zeros according to the slot count.
+ if (I < Pad && NumSlots >= Pad)
+ OS << "0";
+ OS << I << " ";
+ if (Slot.isSubtrie()) {
+ SubtrieHandle S(*Region, Slot);
+ std::string SubtriePrefix = *Prefix;
+ appendIndexBits(SubtriePrefix, I, NumSlots);
+ OS << "addr=" << (void *)Slot.asSubtrie();
+ OS << " subtrie=";
+ printPrefix(OS, SubtriePrefix);
+ OS << "\n";
+ Subs.push_back(S);
+ Prefixes.push_back(SubtriePrefix);
+ continue;
+ }
+ Records.push_back(Slot.asData()); // Collect the record offset for the caller.
+ HashMappedTrieHandle::RecordData Record = Trie.getRecord(Slot);
+ OS << "addr=" << (void *)Record.getFileOffset().get();
+ OS << " content=";
+ printHash(OS, Record.Proxy.Hash);
+ OS << "\n";
+ }
+ for (size_t I = 0, E = Subs.size(); I != E; ++I)
+ Subs[I].print(OS, Trie, Records, Prefixes[I]);
+LLVM_DUMP_METHOD void OnDiskHashMappedTrie::dump() const { print(dbgs()); }
+static Error createTableConfigError(std::errc ErrC, StringRef Path,
+ StringRef TableName, const Twine &Msg) { // Builds an Error with code ErrC and message "<Path>[<TableName>]: <Msg>".
+ return createStringError(make_error_code(ErrC),
+ Path + "[" + TableName + "]: " + Msg);
+/// Resolve a table parameter: return \p Default when \p Value is absent,
+/// \p Value when it does not exceed \p Max, and an argument_out_of_domain
+/// error naming \p Label otherwise. At least one of \p Value and \p Default
+/// must be set.
+static Expected<size_t> checkParameter(StringRef Label, size_t Max,
+                                       std::optional<size_t> Value,
+                                       std::optional<size_t> Default,
+                                       StringRef Path, StringRef TableName) {
+  assert(Value || Default);
+  assert(!Default || *Default <= Max);
+  // Fall back to the default when the caller supplied nothing.
+  if (!Value)
+    return *Default;
+  // An explicit value is accepted only up to Max.
+  if (*Value > Max)
+    return createTableConfigError(
+        std::errc::argument_out_of_domain, Path, TableName,
+        "invalid " + Label + ": " + Twine(*Value) + " (max: " + Twine(Max) + ")");
+  return *Value;
+/// Compare a property of an existing table against the requested value and
+/// return an invalid_argument error describing both when they differ.
+static Error checkTable(StringRef Label, size_t Expected, size_t Observed,
+                        StringRef Path, StringRef TrieName) {
+  if (Expected != Observed)
+    return createTableConfigError(std::errc::invalid_argument, Path, TrieName,
+                                  "mismatched " + Label +
+                                      " (expected: " + Twine(Expected) +
+                                      ", observed: " + Twine(Observed) + ")");
+  return Error::success();
+size_t OnDiskHashMappedTrie::size() const { return Impl->File.size(); }
+OnDiskHashMappedTrie::create(const Twine &PathTwine, const Twine &TrieNameTwine,
+                             size_t NumHashBits, uint64_t DataSize,
+                             uint64_t MaxFileSize,
+                             std::optional<uint64_t> NewFileInitialSize,
+                             std::optional<size_t> NewTableNumRootBits,
+                             std::optional<size_t> NewTableNumSubtrieBits) {
+  // Opens the trie table TrieNameTwine inside the database file PathTwine.
+  // When the file does not exist, NewDBConstructor below populates it with a
+  // fresh table; afterwards the table found in the file must match
+  // NumHashBits and DataSize. NewFileInitialSize is not consulted here.
+  SmallString<128> PathStorage;
+  StringRef Path = PathTwine.toStringRef(PathStorage);
+  SmallString<128> TrieNameStorage;
+  StringRef TrieName = TrieNameTwine.toStringRef(TrieNameStorage);
+  constexpr size_t DefaultNumRootBits = 10;
+  constexpr size_t DefaultNumSubtrieBits = 6;
+  size_t NumRootBits;
+  if (Error E = checkParameter(
+                    "root bits", HashMappedTrieHandle::MaxNumRootBits,
+                    NewTableNumRootBits, DefaultNumRootBits, Path, TrieName)
+                    .moveInto(NumRootBits))
+    return std::move(E);
+  size_t NumSubtrieBits;
+  if (Error E = checkParameter("subtrie bits",
+                               HashMappedTrieHandle::MaxNumSubtrieBits,
+                               NewTableNumSubtrieBits, DefaultNumSubtrieBits,
+                               Path, TrieName)
+                    .moveInto(NumSubtrieBits))
+    return std::move(E);
+  // The hash size must be within range and a whole number of bytes. Both
+  // conditions are reported as errors rather than asserted, so a bad
+  // configuration yields the same recoverable Error in every build mode.
+  if (Error E =
+          checkParameter("hash size", HashMappedTrieHandle::MaxNumHashBits,
+                         NumHashBits, std::nullopt, Path, TrieName)
+              .takeError())
+    return std::move(E);
+  size_t NumHashBytes = NumHashBits >> 3;
+  if (NumHashBits != NumHashBytes << 3)
+    return createTableConfigError(
+        std::errc::argument_out_of_domain, Path, TrieName,
+        "invalid hash size: " + Twine(NumHashBits) + " (not byte-aligned)");
+  // Constructor for if the file doesn't exist.
+  auto NewDBConstructor = [&](DatabaseFile &DB) -> Error {
+    HashMappedTrieHandle Trie =
+        HashMappedTrieHandle::create(DB.getAlloc(), TrieName, NumRootBits,
+                                     NumSubtrieBits, NumHashBits, DataSize);
+    DB.addTable(Trie);
+    return Error::success();
+  };
+  // Get or create the file.
+  Expected<DatabaseFile> File =
+      DatabaseFile::create(Path, MaxFileSize, NewDBConstructor);
+  if (!File)
+    return File.takeError();
+  // Find the trie and validate it.
+  //
+  // TODO: Add support for creating/adding a table to an existing file.
+  std::optional<TableHandle> Table = File->findTable(TrieName);
+  if (!Table)
+    return createTableConfigError(std::errc::argument_out_of_domain, Path,
+                                  TrieName, "table not found");
+  if (Error E = checkTable("table kind", (size_t)HashMappedTrieHandle::Kind,
+                           (size_t)Table->getHeader().Kind, Path, TrieName))
+    return std::move(E);
+  auto Trie = Table->cast<HashMappedTrieHandle>();
+  assert(Trie && "Already checked the kind");
+  // Check the hash and data size.
+  if (Error E = checkTable("hash size", NumHashBits, Trie.getNumHashBits(),
+                           Path, TrieName))
+    return std::move(E);
+  if (Error E = checkTable("data size", DataSize, Trie.getRecordDataSize(),
+                           Path, TrieName))
+    return std::move(E);
+  // No flags supported right now. Either corrupt, or coming from a future
+  // writer.
+  if (size_t Flags = Trie.getFlags())
+    return createTableConfigError(std::errc::invalid_argument, Path, TrieName,
+                                  "unsupported flags: " + Twine(Flags));
+  // Success.
+  OnDiskHashMappedTrie::ImplType Impl{DatabaseFile(std::move(*File)), Trie};
+  return OnDiskHashMappedTrie(std::make_unique<ImplType>(std::move(Impl)));
+// DataAllocator data structures.
+namespace {
+/// DataAllocator table layout:
+/// - [8-bytes: Generic table header]
+/// - 8-bytes: AllocatorOffset (reserved for implementing free lists)
+/// - 8-bytes: Size for user data header
+/// - <user data buffer>
+/// Record layout:
+/// - <data>
+class DataAllocatorHandle {
+ static constexpr TableHandle::TableKind Kind =
+ TableHandle::TableKind::DataAllocator;
+ struct Header {
+ TableHandle::Header GenericHeader;
+ std::atomic<int64_t> AllocatorOffset;
+ const uint64_t UserHeaderSize;
+ };
+ operator TableHandle() const {
+ if (!H)
+ return TableHandle();
+ return TableHandle(*Region, H->GenericHeader);
+ }
+ MutableArrayRef<char> allocate(MappedFileRegionBumpPtr &Alloc,
+ size_t DataSize) {
+ assert(&Alloc.getRegion() == Region);
+ return MutableArrayRef(Alloc.allocate(DataSize), DataSize);
+ }
+ explicit operator bool() const { return H; }
+ const Header &getHeader() const { return *H; }
+ MappedFileRegion &getRegion() const { return *Region; }
+ MutableArrayRef<uint8_t> getUserHeader() {
+ return MutableArrayRef(reinterpret_cast<uint8_t *>(H + 1),
+ H->UserHeaderSize);
+ }
+ static DataAllocatorHandle create(MappedFileRegionBumpPtr &Alloc,
+ StringRef Name, uint32_t UserHeaderSize);
+ DataAllocatorHandle() = default;
+ DataAllocatorHandle(MappedFileRegion &Region, Header &H)
+ : Region(&Region), H(&H) {}
+ DataAllocatorHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+ : DataAllocatorHandle(
+ Region, *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) {
+ }
+ MappedFileRegion *Region = nullptr;
+ Header *H = nullptr;
+} // end anonymous namespace
+struct OnDiskDataAllocator::ImplType {
+ DatabaseFile File;
+ DataAllocatorHandle Store;
+/// Write a new DataAllocator table into space taken from \p Alloc. The layout
+/// is the Header, \p UserHeaderSize zero-filled bytes, then the
+/// null-terminated \p Name.
+DataAllocatorHandle DataAllocatorHandle::create(MappedFileRegionBumpPtr &Alloc,
+                                                StringRef Name,
+                                                uint32_t UserHeaderSize) {
+  // The name starts right after the fixed header and the user header.
+  const size_t NameOffset = sizeof(Header) + UserHeaderSize;
+  // Allocate.
+  intptr_t Offset = Alloc.allocateOffset(NameOffset + Name.size() + 1);
+  // Construct the header and the name.
+  assert(Name.size() <= UINT16_MAX && "Expected smaller table name");
+  auto *NewHeader = new (Alloc.getRegion().data() + Offset)
+      Header{{TableHandle::TableKind::DataAllocator, (uint16_t)Name.size(),
+              (int32_t)NameOffset},
+             /*AllocatorOffset=*/{0},
+             /*UserHeaderSize=*/UserHeaderSize};
+  char *UserHeader = reinterpret_cast<char *>(NewHeader + 1);
+  memset(UserHeader, 0, UserHeaderSize);
+  char *NameStorage = UserHeader + UserHeaderSize;
+  llvm::copy(Name, NameStorage);
+  NameStorage[Name.size()] = 0;
+  return DataAllocatorHandle(Alloc.getRegion(), *NewHeader);
+Expected<OnDiskDataAllocator> OnDiskDataAllocator::create(
+ const Twine &PathTwine, const Twine &TableNameTwine, uint64_t MaxFileSize,
+ std::optional<uint64_t> NewFileInitialSize, uint32_t UserHeaderSize,
+ function_ref<void(void *)> UserHeaderInit) { // Open the allocator table in the file at PathTwine, populating a new file if needed.
+ assert(!UserHeaderSize || UserHeaderInit); // A non-empty user header requires an initializer.
+ SmallString<128> PathStorage;
+ StringRef Path = PathTwine.toStringRef(PathStorage);
+ SmallString<128> TableNameStorage;
+ StringRef TableName = TableNameTwine.toStringRef(TableNameStorage);
+ // Constructor for if the file doesn't exist.
+ auto NewDBConstructor = [&](DatabaseFile &DB) -> Error {
+ DataAllocatorHandle Store =
+ DataAllocatorHandle::create(DB.getAlloc(), TableName, UserHeaderSize);
+ DB.addTable(Store);
+ if (UserHeaderSize) // Let the caller fill in the zero-initialized user header.
+ UserHeaderInit(Store.getUserHeader().data());
+ return Error::success();
+ };
+ // Get or create the file.
+ Expected<DatabaseFile> File =
+ DatabaseFile::create(Path, MaxFileSize, NewDBConstructor);
+ if (!File)
+ return File.takeError();
+ // Find the table and validate it.
+ //
+ // TODO: Add support for creating/adding a table to an existing file.
+ std::optional<TableHandle> Table = File->findTable(TableName);
+ if (!Table)
+ return createTableConfigError(std::errc::argument_out_of_domain, Path,
+ TableName, "table not found");
+ if (Error E = checkTable("table kind", (size_t)DataAllocatorHandle::Kind,
+ (size_t)Table->getHeader().Kind, Path, TableName))
+ return std::move(E);
+ auto Store = Table->cast<DataAllocatorHandle>();
+ assert(Store && "Already checked the kind");
+ // Success.
+ OnDiskDataAllocator::ImplType Impl{DatabaseFile(std::move(*File)), Store};
+ return OnDiskDataAllocator(std::make_unique<ImplType>(std::move(Impl)));
+/// Allocate \p Size bytes from the file's bump allocator and return them
+/// together with their offset from the start of the mapped region.
+OnDiskDataAllocator::pointer OnDiskDataAllocator::allocate(size_t Size) {
+  MutableArrayRef<char> Storage =
+      Impl->Store.allocate(Impl->File.getAlloc(), Size);
+  // Identify the allocation by its byte distance from the region's start.
+  const auto Distance = Storage.data() - Impl->Store.getRegion().data();
+  return pointer(FileOffset(Distance), Storage);
+const char *OnDiskDataAllocator::beginData(FileOffset Offset) const {
+ assert(Offset);
+ assert(Impl);
+ assert(Offset.get() < (int64_t)Impl->File.getAlloc().size());
+ return Impl->File.getRegion().data() + Offset.get();
+MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() {
+ return Impl->Store.getUserHeader();
+size_t OnDiskDataAllocator::size() const { return Impl->File.size(); }
+OnDiskDataAllocator::OnDiskDataAllocator(std::unique_ptr<ImplType> Impl)
+ : Impl(std::move(Impl)) {}
+struct OnDiskHashMappedTrie::ImplType {};
+OnDiskHashMappedTrie::create(const Twine &PathTwine, const Twine &TrieNameTwine,
+ size_t NumHashBits, uint64_t DataSize,
+ uint64_t MaxFileSize,
+ std::optional<uint64_t> NewFileInitialSize,
+ std::optional<size_t> NewTableNumRootBits,
+ std::optional<size_t> NewTableNumSubtrieBits) {
+ report_fatal_error("not supported");
+OnDiskHashMappedTrie::insertLazy(const_pointer Hint, ArrayRef<uint8_t> Hash,
+ LazyInsertOnConstructCB OnConstruct,
+ LazyInsertOnLeakCB OnLeak) {
+ report_fatal_error("not supported");
+OnDiskHashMappedTrie::recoverFromFileOffset(FileOffset Offset) const {
+ report_fatal_error("not supported");
+OnDiskHashMappedTrie::find(ArrayRef<uint8_t> Hash) const {
+ report_fatal_error("not supported");
+void OnDiskHashMappedTrie::print(
+ raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const {
+ report_fatal_error("not supported");
+size_t OnDiskHashMappedTrie::size() const {
+ report_fatal_error("not supported");
+struct OnDiskDataAllocator::ImplType {};
+Expected<OnDiskDataAllocator> OnDiskDataAllocator::create(
+ const Twine &Path, const Twine &TableName, uint64_t MaxFileSize,
+ std::optional<uint64_t> NewFileInitialSize, uint32_t UserHeaderSize,
+ function_ref<void(void *)> UserHeaderInit) {
+ report_fatal_error("not supported");
+OnDiskDataAllocator::pointer OnDiskDataAllocator::allocate(size_t Size) {
+ report_fatal_error("not supported");
+const char *OnDiskDataAllocator::beginData(FileOffset Offset) const {
+ report_fatal_error("not supported");
+MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() {
+ report_fatal_error("not supported");
+size_t OnDiskDataAllocator::size() const {
+ report_fatal_error("not supported");
+OnDiskHashMappedTrie::OnDiskHashMappedTrie(std::unique_ptr<ImplType> Impl)
+ : Impl(std::move(Impl)) {}
+OnDiskHashMappedTrie::OnDiskHashMappedTrie(OnDiskHashMappedTrie &&RHS) =
+ default;
+OnDiskHashMappedTrie &
+OnDiskHashMappedTrie::operator=(OnDiskHashMappedTrie &&RHS) = default;
+OnDiskHashMappedTrie::~OnDiskHashMappedTrie() = default;
+OnDiskDataAllocator::OnDiskDataAllocator(OnDiskDataAllocator &&RHS) = default;
+OnDiskDataAllocator &
+OnDiskDataAllocator::operator=(OnDiskDataAllocator &&RHS) = default;
+OnDiskDataAllocator::~OnDiskDataAllocator() = default;
diff --git a/llvm/lib/CAS/OnDiskKeyValueDB.cpp b/llvm/lib/CAS/OnDiskKeyValueDB.cpp
new file mode 100644
index 00000000000000..b5d24a93e218be
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskKeyValueDB.cpp
@@ -0,0 +1,78 @@
+//===- OnDiskKeyValueDB.cpp -------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "OnDiskCommon.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Path.h"
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+// Pieces of the table file name: OnDiskKeyValueDB::open() appends
+// FilePrefix + ActionCacheFile (i.e. "v3.actions") to the database directory.
+static constexpr StringLiteral ActionCacheFile = "actions";
+static constexpr StringLiteral FilePrefix = "v3.";
+/// Associate \p Value with \p Key.
+///
+/// Fails with errc::invalid_argument when \p Value is not exactly ValueSize
+/// bytes long. Otherwise returns the data of the trie record for \p Key as it
+/// is after the insertLazy call.
+// NOTE(review): presumably an already-present record is left untouched and its
+// existing data is returned -- confirm against OnDiskHashMappedTrie::insertLazy.
+Expected<ArrayRef<char>> OnDiskKeyValueDB::put(ArrayRef<uint8_t> Key,
+ ArrayRef<char> Value) {
+ if (LLVM_UNLIKELY(Value.size() != ValueSize))
+ return createStringError(errc::invalid_argument,
+ "expected value size of " + itostr(ValueSize) +
+ ", got: " + itostr(Value.size()));
+ // Restates the run-time check above.
+ assert(Value.size() == ValueSize);
+ OnDiskHashMappedTrie::pointer ActionP = Cache.insertLazy(
+ Key, [&](FileOffset TentativeOffset,
+ OnDiskHashMappedTrie::ValueProxy TentativeValue) {
+ // Fill the tentative record with the caller's bytes.
+ assert(TentativeValue.Data.size() == ValueSize);
+ llvm::copy(Value, TentativeValue.Data.data());
+ });
+ return ActionP->Data;
+/// Look up \p Key. Yields std::nullopt when the trie has no record for it,
+/// otherwise the record's data.
+OnDiskKeyValueDB::get(ArrayRef<uint8_t> Key) {
+ // Check the result cache.
+ OnDiskHashMappedTrie::const_pointer ActionP = Cache.find(Key);
+ if (!ActionP)
+ return std::nullopt;
+ // Stored data is expected to start on an 8-byte boundary.
+ assert(isAddrAligned(Align(8), ActionP->Data.data()));
+ return ActionP->Data;
+/// Open (creating directories as needed) the key-value table stored in the
+/// directory \p Path.
+///
+/// \param HashName  used, together with \p ValueName, to build the table name
+///                  "llvm.actioncache[<HashName>-><ValueName>]".
+/// \param KeySize   key size in bytes (passed to the trie as KeySize * 8).
+/// \param ValueSize fixed size in bytes of every stored value.
+/// Returns an error if the directory cannot be created, the mapping-size
+/// override cannot be read, or the trie cannot be created.
+OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize,
+ StringRef ValueName, size_t ValueSize) {
+ if (std::error_code EC = sys::fs::create_directories(Path))
+ return createFileError(Path, EC);
+ SmallString<256> CachePath(Path);
+ sys::path::append(CachePath, FilePrefix + ActionCacheFile);
+ constexpr uint64_t MB = 1024ull * 1024ull;
+ constexpr uint64_t GB = 1024ull * 1024ull * 1024ull;
+ // Default maximum file size is 1 GB unless an override is in effect.
+ uint64_t MaxFileSize = GB;
+ auto CustomSize = getOverriddenMaxMappingSize();
+ if (!CustomSize)
+ return CustomSize.takeError();
+ if (*CustomSize)
+ MaxFileSize = **CustomSize;
+ std::optional<OnDiskHashMappedTrie> ActionCache;
+ if (Error E = OnDiskHashMappedTrie::create(
+ CachePath,
+ "llvm.actioncache[" + HashName + "->" + ValueName + "]",
+ KeySize * 8,
+ /*DataSize=*/ValueSize, MaxFileSize, /*MinFileSize=*/MB)
+ .moveInto(ActionCache))
+ return std::move(E);
+ return std::unique_ptr<OnDiskKeyValueDB>(
+ new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache)));
diff --git a/llvm/lib/CAS/TreeEntry.cpp b/llvm/lib/CAS/TreeEntry.cpp
new file mode 100644
index 00000000000000..712ae40be15d6f
--- /dev/null
+++ b/llvm/lib/CAS/TreeEntry.cpp
@@ -0,0 +1,47 @@
+//===- TreeEntry.cpp --------------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/BinaryFormat/Magic.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/CAS/TreeSchema.h"
+#include "llvm/Support/MemoryBufferRef.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/StringSaver.h"
+using namespace llvm;
+using namespace llvm::cas;
+/// Write the four-letter tag for \p Kind to \p OS: "file", "exec", "syml" or
+/// "tree".
+static void printTreeEntryKind(raw_ostream &OS, TreeEntry::EntryKind Kind) {
+ switch (Kind) {
+ case TreeEntry::Regular:
+ OS << "file";
+ break;
+ case TreeEntry::Executable:
+ OS << "exec";
+ break;
+ case TreeEntry::Symlink:
+ OS << "syml";
+ break;
+ case TreeEntry::Tree:
+ OS << "tree";
+ break;
+ }
+/// Print one line describing this entry: "<kind> <id> <name>", followed by "/"
+/// for trees or " -> <target>" for symlinks, where the target is the data of
+/// the referenced object loaded from \p CAS.
+void cas::NamedTreeEntry::print(raw_ostream &OS, ObjectStore &CAS) const {
+ printTreeEntryKind(OS, getKind());
+ OS << " " << CAS.getID(getRef()) << " " << Name;
+ if (getKind() == TreeEntry::Tree)
+ OS << "/";
+ if (getKind() == TreeEntry::Symlink) {
+ // cantFail: a failure to load the symlink target aborts.
+ ObjectProxy Target = cantFail(CAS.getProxy(getRef()));
+ OS << " -> ";
+ OS << Target.getData();
+ }
+ OS << "\n";
diff --git a/llvm/lib/CAS/TreeSchema.cpp b/llvm/lib/CAS/TreeSchema.cpp
new file mode 100644
index 00000000000000..91f3d8e3d25403
--- /dev/null
+++ b/llvm/lib/CAS/TreeSchema.cpp
@@ -0,0 +1,231 @@
+//===- TreeSchema.cpp -----------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/TreeSchema.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/MemoryBufferRef.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/StringSaver.h"
+using namespace llvm;
+using namespace llvm::cas;
+// Out-of-line definitions of TreeSchema's static members.
+char TreeSchema::ID = 0;
+constexpr StringLiteral TreeSchema::SchemaName;
+// NOTE(review): presumably the usual out-of-line virtual method that anchors
+// the vtable to this file -- confirm against the class declaration.
+void TreeSchema::anchor() {}
+/// \returns true if \p Node has at least one reference and its first reference
+/// equals the schema's kind reference.
+bool TreeSchema::isNode(const ObjectProxy &Node) const {
+ // A tree node always carries the kind reference as reference 0.
+ if (Node.getNumReferences() < 1)
+ return false;
+ // Only the reference itself is compared; the referenced object is not
+ // loaded.
+ auto FirstRef = Node.getReference(0);
+ return FirstRef == getKindRef();
+/// Stores the schema name as an object (with no references) in \p CAS and
+/// keeps the resulting reference as the kind marker; a failing store aborts
+/// via cantFail.
+TreeSchema::TreeSchema(cas::ObjectStore &CAS) : TreeSchema::RTTIExtends(CAS) {
+ TreeKindRef = cantFail(CAS.storeFromString(std::nullopt, SchemaName));
+/// \returns the kind-marker reference created by the constructor.
+ObjectRef TreeSchema::getKindRef() const { return *TreeKindRef; }
+/// \returns the number of entries in \p Tree: all references except the
+/// leading kind reference.
+size_t TreeSchema::getNumTreeEntries(TreeProxy Tree) const {
+ return Tree.getNumReferences() - 1;
+/// Invoke \p Callback on every entry of \p Tree in index order, stopping at
+/// and returning the first error the callback produces.
+Error TreeSchema::forEachTreeEntry(
+ TreeProxy Tree,
+ function_ref<Error(const NamedTreeEntry &)> Callback) const {
+ for (size_t I = 0, IE = getNumTreeEntries(Tree); I != IE; ++I)
+ if (Error E = Callback(loadTreeEntry(Tree, I)))
+ return E;
+ return Error::success();
+/// Walk the tree rooted at \p Root depth-first.
+///
+/// \p Callback is invoked once per entry. Non-tree entries are passed with
+/// std::nullopt; tree entries are passed together with their loaded TreeProxy,
+/// before any of their children are visited. Entry names handed to the
+/// callback are posix-style paths built up from the root name "/".
+/// The walk stops at, and returns, the first error from loading a tree or from
+/// the callback.
+// NOTE(review): the \p CAS parameter is not used by the visible body; trees
+// are loaded through this schema's load().
+Error TreeSchema::walkFileTreeRecursively(
+ ObjectStore &CAS, ObjectRef Root,
+ function_ref<Error(const NamedTreeEntry &, std::optional<TreeProxy>)>
+ Callback) {
+ // Saver owns the path strings referenced by the entries on the stack.
+ BumpPtrAllocator Alloc;
+ StringSaver Saver(Alloc);
+ SmallString<128> PathStorage;
+ SmallVector<NamedTreeEntry> Stack;
+ Stack.emplace_back(Root, TreeEntry::Tree, "/");
+ while (!Stack.empty()) {
+ if (Stack.back().getKind() != TreeEntry::Tree) {
+ if (Error E = Callback(Stack.pop_back_val(), std::nullopt))
+ return E;
+ continue;
+ }
+ NamedTreeEntry Parent = Stack.pop_back_val();
+ Expected<TreeProxy> ExpTree = load(Parent.getRef());
+ if (Error E = ExpTree.takeError())
+ return E;
+ TreeProxy Tree = *ExpTree;
+ if (Error E = Callback(Parent, Tree))
+ return E;
+ // Push children last-to-first so that they are popped in index order.
+ // The index is a size_t, matching Tree.size(), rather than a narrowed int.
+ for (size_t I = Tree.size(); I != 0; --I) {
+ std::optional<NamedTreeEntry> Child = Tree.get(I - 1);
+ assert(Child && "Expected no corruption");
+ PathStorage = Parent.getName();
+ sys::path::append(PathStorage, sys::path::Style::posix, Child->getName());
+ Stack.emplace_back(Child->getRef(), Child->getKind(),
+ Saver.save(StringRef(PathStorage)));
+ }
+ }
+ return Error::success();
+/// Decode entry \p I of \p Tree into a NamedTreeEntry (reference, kind, name).
+/// No bounds checking is performed on \p I.
+NamedTreeEntry TreeSchema::loadTreeEntry(TreeProxy Tree, size_t I) const {
+ // Load entry from TreeNode.
+ // The kind bytes follow the name offset table of (size + 1) uint32_t values.
+ TreeEntry::EntryKind Kind =
+ (TreeEntry::EntryKind)
+ Tree.getData()[I + (Tree.size() + 1) * sizeof(uint32_t)];
+ StringRef Name = Tree.getName(I);
+ // Reference 0 is the kind marker, so entry I lives at reference I + 1.
+ auto ObjectRef = Tree.getReference(I + 1);
+ return {ObjectRef, Kind, Name};
+/// Find the index of the entry named \p Name in \p Tree.
+///
+/// \returns the entry index, or std::nullopt if the tree is empty or has no
+/// entry with that name.
+// NOTE(review): the binary search assumes names are stored in ascending
+// StringRef::compare order -- presumably guaranteed by the sort in
+// TreeProxy::Builder::build; confirm against NamedTreeEntry::operator<.
+std::optional<size_t> TreeSchema::lookupTreeEntry(TreeProxy Tree,
+ StringRef Name) const {
+ size_t NumNames = Tree.size();
+ if (!NumNames)
+ return std::nullopt;
+ // Start with a binary search, if there are enough entries.
+ //
+ // FIXME: Should just use std::lower_bound, but we need the actual iterators
+ // to know the index in the NameCache...
+ const size_t MaxLinearSearchSize = 4;
+ size_t Last = NumNames;
+ size_t First = 0;
+ while (Last - First > MaxLinearSearchSize) {
+ auto I = First + (Last - First) / 2;
+ StringRef NameI = Tree.getName(I);
+ // Branch on the sign of the comparison only: a switch over exactly
+ // -1/0/1 would leave the range unchanged, and so never terminate, for any
+ // other negative or positive result.
+ int Cmp = Name.compare(NameI);
+ if (Cmp == 0)
+ return I;
+ if (Cmp < 0)
+ Last = I;
+ else
+ First = I + 1;
+ }
+ // Use a linear search for small trees.
+ for (; First != Last; ++First)
+ if (Name == Tree.getName(First))
+ return First;
+ return std::nullopt;
+/// Load the object referenced by \p Object from the CAS and view it as a
+/// tree. Propagates the load error, or fails if the object is not a tree.
+Expected<TreeProxy> TreeSchema::load(ObjectRef Object) const {
+ auto TreeNode = CAS.getProxy(Object);
+ if (!TreeNode)
+ return TreeNode.takeError();
+ return load(*TreeNode);
+/// View an already-loaded \p Object as a tree; fails with "not a tree object"
+/// when isNode() rejects it.
+Expected<TreeProxy> TreeSchema::load(ObjectProxy Object) const {
+ if (!isNode(Object))
+ return createStringError(inconvertibleErrorCode(), "not a tree object");
+ return TreeProxy::get(*this, Object);
+/// Build and store a tree object from \p Entries (see TreeProxy::create).
+Expected<TreeProxy> TreeSchema::create(ArrayRef<NamedTreeEntry> Entries) {
+ return TreeProxy::create(*this, Entries);
+/// Wrap \p Ref in a TreeProxy for \p Schema, forwarding the error if \p Ref
+/// holds one. No check is made here that the object is actually a tree.
+Expected<TreeProxy> TreeProxy::get(const TreeSchema &Schema,
+ Expected<ObjectProxy> Ref) {
+ if (!Ref)
+ return Ref.takeError();
+ return TreeProxy(Schema, *Ref);
+/// Create a tree object holding \p Entries by running a Builder to completion.
+Expected<TreeProxy> TreeProxy::create(TreeSchema &Schema,
+ ArrayRef<NamedTreeEntry> Entries) {
+ auto B = Builder::startNode(Schema);
+ if (!B)
+ return B.takeError();
+ return B->build(Entries);
+/// \returns the name of entry \p I, sliced out of the object's data using the
+/// little-endian uint32_t offset table at the start of the data: the name
+/// spans [offset[I], offset[I + 1]). No bounds checking is performed.
+StringRef TreeProxy::getName(size_t I) const {
+ uint32_t StartIdx =
+ support::endian::read32le(getData().data() + sizeof(uint32_t) * I);
+ uint32_t EndIdx =
+ support::endian::read32le(getData().data() + sizeof(uint32_t) * (I + 1));
+ return StringRef(getData().data() + StartIdx, EndIdx - StartIdx);
+/// Start building a tree node for \p Schema. The builder's reference list is
+/// seeded with the schema's kind reference, which therefore becomes
+/// reference 0 of the final object.
+Expected<TreeProxy::Builder> TreeProxy::Builder::startNode(TreeSchema &Schema) {
+ Builder B(Schema);
+ B.Refs.push_back(Schema.getKindRef());
+ return std::move(B);
+/// Encode \p Entries into the builder's data blob and reference list, store
+/// the result in the schema's CAS and return it as a TreeProxy.
+TreeProxy::Builder::build(ArrayRef<NamedTreeEntry> Entries) {
+ // Ensure a stable order for tree entries and ignore name collisions.
+ SmallVector<NamedTreeEntry> Sorted(Entries.begin(), Entries.end());
+ std::stable_sort(Sorted.begin(), Sorted.end());
+ Sorted.erase(std::unique(Sorted.begin(), Sorted.end()), Sorted.end());
+ raw_svector_ostream OS(Data);
+ support::endian::Writer Writer(OS, support::endianness::little);
+ // Encode the entries in the Data. The layout of the tree schema object is:
+ // * Name offset table: The offset in the data blob at which each name
+ // string starts. It has N + 1 entries and you can find the name of n-th
+ // entry at offset[n] -> offset[n+1]. Each offset is encoded as
+ // little-endian uint32_t.
+ // * Kind: uint8_t for each entry.
+ // * Object: ObjectRef for each entry is at n + 1 refs for the object (with
+ // the first one being the tree kind ID).
+ // * Names: the name strings, concatenated without separators.
+ // Write Name.
+ // The start of the string table index.
+ uint32_t StrIdx =
+ sizeof(uint8_t) * Sorted.size() + sizeof(uint32_t) * (Sorted.size() + 1);
+ for (auto &Entry : Sorted) {
+ Writer.write(StrIdx);
+ StrIdx += Entry.getName().size();
+ // Append refs.
+ Refs.push_back(Entry.getRef());
+ }
+ // Write the end index for the last string.
+ Writer.write(StrIdx);
+ // Write Kind.
+ for (auto &Entry : Sorted)
+ Writer.write((uint8_t)Entry.getKind());
+ // Write names in the end of the block.
+ for (auto &Entry : Sorted)
+ OS << Entry.getName();
+ return TreeProxy::get(*Schema, Schema->CAS.createProxy(Refs, Data));
diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp
new file mode 100644
index 00000000000000..b5b136d2a1c5e8
--- /dev/null
+++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp
@@ -0,0 +1,339 @@
+//===- UnifiedOnDiskCache.cpp -----------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Encapsulates \p OnDiskGraphDB and \p OnDiskKeyValueDB instances within one
+// directory while also restricting storage growth with a scheme of chaining the
+// two most recent directories (primary & upstream), where the primary
+// "faults-in" data from the upstream one. When the primary (most recent)
+// directory exceeds its intended limit a new empty directory becomes the
+// primary one.
+// Within the top-level directory (the path that \p UnifiedOnDiskCache::open
+// receives) there are directories named like this:
+// 'v<version>.<x>'
+// 'v<version>.<x+1>'
+// 'v<version>.<x+2>'
+// ...
+// 'version' is the version integer for this \p UnifiedOnDiskCache's scheme and
+// the part after the dot is an increasing integer. The primary directory is the
+// one with the highest integer and the upstream one is the directory before it.
+// For example, if the sub-directories contained are:
+// 'v1.5', 'v1.6', 'v1.7', 'v1.8'
+// Then the primary one is 'v1.8', the upstream one is 'v1.7', and the rest are
+// unused directories that can be safely deleted at any time and by any process.
+// Contained within the top-level directory is a file named "lock" which is used
+// for processes to take shared or exclusive locks for the contents of the top
+// directory. While a \p UnifiedOnDiskCache is open it keeps a shared lock for
+// the top-level directory; when it closes, if the primary sub-directory
+// exceeded its limit, it attempts to get an exclusive lock in order to create a
+// new empty primary directory; if it can't get the exclusive lock it gives up
+// and lets the next \p UnifiedOnDiskCache instance that closes to attempt
+// again.
+// The downside of this scheme is that while \p UnifiedOnDiskCache is open on a
+// directory, by any process, the storage size in that directory will keep
+// growing unrestricted. But the major benefit is that garbage-collection can be
+// triggered on a directory concurrently, at any time and by any process,
+// without affecting any active readers/writers in the same process or other
+// processes.
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+/// FIXME: When the version of \p DBDirPrefix is bumped up we need to figure out
+/// how to handle the leftover sub-directories of the previous version, within
+/// the \p UnifiedOnDiskCache::collectGarbage function.
+static constexpr StringLiteral DBDirPrefix = "v1.";
+/// Convenience overload: uses the digest of \p Key in the primary graph
+/// database as the key bytes.
+Expected<ObjectID> UnifiedOnDiskCache::KVPut(ObjectID Key, ObjectID Value) {
+ return KVPut(PrimaryGraphDB->getDigest(Key), Value);
+/// Store \p Value under \p Key in the primary key-value database.
+///
+/// The value is serialized as the 64-bit little-endian opaque data of the
+/// ObjectID. \returns the ObjectID decoded from the bytes the database reports
+/// back for \p Key, or the database's error.
+Expected<ObjectID> UnifiedOnDiskCache::KVPut(ArrayRef<uint8_t> Key,
+ ObjectID Value) {
+ static_assert(sizeof(Value.getOpaqueData()) == sizeof(uint64_t),
+ "unexpected return opaque type");
+ std::array<char, sizeof(uint64_t)> ValBytes;
+ support::endian::write64le(ValBytes.data(), Value.getOpaqueData());
+ Expected<ArrayRef<char>> Existing = PrimaryKVDB->put(Key, ValBytes);
+ if (!Existing)
+ return Existing.takeError();
+ assert(Existing->size() == sizeof(uint64_t));
+ return ObjectID::fromOpaqueData(support::endian::read64le(Existing->data()));
+/// Look up \p Key in the primary key-value database. On a miss, fall back to
+/// faulting the value in from the upstream database when one is open;
+/// otherwise yield std::nullopt.
+UnifiedOnDiskCache::KVGet(ArrayRef<uint8_t> Key) {
+ std::optional<ArrayRef<char>> Value;
+ if (Error E = PrimaryKVDB->get(Key).moveInto(Value))
+ return std::move(E);
+ if (!Value) {
+ if (UpstreamKVDB)
+ return faultInFromUpstreamKV(Key);
+ return std::nullopt;
+ }
+ // Values are stored as a 64-bit little-endian ObjectID payload.
+ assert(Value->size() == sizeof(uint64_t));
+ return ObjectID::fromOpaqueData(support::endian::read64le(Value->data()));
+/// Look up \p Key in the upstream key-value database and, when present, copy
+/// the association into the primary database. Requires both upstream
+/// databases to be open. Yields std::nullopt when upstream has no value.
+UnifiedOnDiskCache::faultInFromUpstreamKV(ArrayRef<uint8_t> Key) {
+ assert(UpstreamGraphDB);
+ assert(UpstreamKVDB);
+ std::optional<ArrayRef<char>> UpstreamValue;
+ if (Error E = UpstreamKVDB->get(Key).moveInto(UpstreamValue))
+ return std::move(E);
+ if (!UpstreamValue)
+ return std::nullopt;
+ // The value is the \p ObjectID in the context of the upstream
+ // \p OnDiskGraphDB instance. Translate it to the context of the primary
+ // \p OnDiskGraphDB instance.
+ assert(UpstreamValue->size() == sizeof(uint64_t));
+ ObjectID UpstreamID = ObjectID::fromOpaqueData(
+ support::endian::read64le(UpstreamValue->data()));
+ // The translation goes through the object's digest.
+ ObjectID PrimaryID =
+ PrimaryGraphDB->getReference(UpstreamGraphDB->getDigest(UpstreamID));
+ return KVPut(Key, PrimaryID);
+/// Collect the names of all 'v<version>.<x>' sub-directories of \p Path into
+/// \p DBDirs, sorted with ascending order of the integer after the dot.
+///
+/// Entries that are not directories, or whose name does not start with
+/// DBDirPrefix, are ignored. \returns an error if a matching directory has a
+/// non-integer suffix or if iterating \p Path fails.
+static Error getAllDBDirs(StringRef Path,
+ SmallVectorImpl<std::string> &DBDirs) {
+ struct DBDir {
+ uint64_t Order;
+ std::string Name;
+ };
+ SmallVector<DBDir, 6> FoundDBDirs;
+ std::error_code EC;
+ for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE;
+ DirI.increment(EC)) {
+ if (DirI->type() != sys::fs::file_type::directory_file)
+ continue;
+ StringRef SubDir = sys::path::filename(DirI->path());
+ if (!SubDir.startswith(DBDirPrefix))
+ continue;
+ uint64_t Order;
+ if (SubDir.substr(DBDirPrefix.size()).getAsInteger(10, Order))
+ return createStringError(inconvertibleErrorCode(),
+ "unexpected directory " + DirI->path());
+ FoundDBDirs.push_back({Order, std::string(SubDir)});
+ }
+ if (EC)
+ return createFileError(Path, EC);
+ // The comparator must be a strict weak ordering: use '<', not '<=', so that
+ // an element never compares less than itself or than an equal element.
+ llvm::sort(FoundDBDirs, [](const DBDir &LHS, const DBDir &RHS) -> bool {
+ return LHS.Order < RHS.Order;
+ });
+ for (DBDir &Dir : FoundDBDirs)
+ DBDirs.push_back(std::move(Dir.Name));
+ return Error::success();
+/// Given a sub-directory named 'v<version>.<x>', write the
+/// 'v<version>.<x+1>' name to \p OS. \p DBDir must start with DBDirPrefix and
+/// have an integer suffix (both are only checked by assertions).
+static void getNextDBDirName(StringRef DBDir, llvm::raw_ostream &OS) {
+ assert(DBDir.startswith(DBDirPrefix));
+ uint64_t Count;
+ bool Failed = DBDir.substr(DBDirPrefix.size()).getAsInteger(10, Count);
+ assert(!Failed);
+ // Silence the unused-variable warning when assertions are compiled out.
+ (void)Failed;
+ OS << DBDirPrefix << Count + 1;
+/// Open (creating it if necessary) the unified cache rooted at \p RootPath.
+///
+/// Takes a shared lock on "<RootPath>/lock", discovers the 'v<version>.<x>'
+/// sub-directories, opens the most recent one as the primary databases and,
+/// when present, the one before it as the upstream databases.
+///
+/// \param SizeLimit     stored for later use by hasExceededSizeLimit().
+/// \param HashName      forwarded to the graph and key-value databases.
+/// \param HashByteSize  forwarded to the graph and key-value databases.
+/// \param FaultInPolicy forwarded to the graph databases.
+/// \returns the opened cache, or the first error encountered. On error the
+/// lock file descriptor is closed again.
+UnifiedOnDiskCache::open(StringRef RootPath, std::optional<uint64_t> SizeLimit,
+ StringRef HashName, unsigned HashByteSize,
+ OnDiskGraphDB::FaultInPolicy FaultInPolicy) {
+ if (std::error_code EC = sys::fs::create_directories(RootPath))
+ return createFileError(RootPath, EC);
+ SmallString<256> PathBuf(RootPath);
+ sys::path::append(PathBuf, "lock");
+ int LockFD = -1;
+ if (std::error_code EC = sys::fs::openFileForReadWrite(
+ PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+ return createFileError(PathBuf, EC);
+ assert(LockFD != -1);
+ // Until ownership of the descriptor is handed to the UnifiedOnDiskCache
+ // instance at the end of this function, close it on every early return so
+ // that a failed open() does not leak it. Closing the descriptor is expected
+ // to drop the lock taken through it as well.
+ auto CloseLockOnError = make_scope_exit([&]() {
+ sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+ sys::fs::closeFile(LockFile);
+ });
+ // Locking the directory using shared lock, which will prevent other processes
+ // from creating a new chain (essentially while a \p UnifiedOnDiskCache
+ // instance holds a shared lock the storage for the primary directory will
+ // grow unrestricted).
+ if (std::error_code EC = sys::fs::lockFile(LockFD, /*Exclusive=*/false))
+ return createFileError(PathBuf, EC);
+ SmallVector<std::string, 4> DBDirs;
+ if (Error E = getAllDBDirs(RootPath, DBDirs))
+ return std::move(E);
+ // A fresh root has no database directory yet; start with index 1.
+ if (DBDirs.empty())
+ DBDirs.push_back((Twine(DBDirPrefix) + "1").str());
+ assert(!DBDirs.empty());
+ /// If there is only one directory open databases on it. If there are 2 or
+ /// more directories, get the most recent directories and chain them, with the
+ /// most recent being the primary one. The remaining directories are unused
+ /// data that can be garbage-collected.
+ std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB;
+ std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB;
+ if (DBDirs.size() > 1) {
+ StringRef UpstreamDir = *(DBDirs.end() - 2);
+ PathBuf = RootPath;
+ sys::path::append(PathBuf, UpstreamDir);
+ if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize,
+ /*UpstreamDB=*/nullptr, FaultInPolicy)
+ .moveInto(UpstreamGraphDB))
+ return std::move(E);
+ if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize,
+ /*ValueName=*/"objectid",
+ /*ValueSize=*/sizeof(uint64_t))
+ .moveInto(UpstreamKVDB))
+ return std::move(E);
+ }
+ // Keep a non-owning pointer: ownership of the upstream graph database moves
+ // into the primary graph database below.
+ OnDiskGraphDB *UpstreamGraphDBPtr = UpstreamGraphDB.get();
+ StringRef PrimaryDir = *(DBDirs.end() - 1);
+ PathBuf = RootPath;
+ sys::path::append(PathBuf, PrimaryDir);
+ std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB;
+ if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize,
+ std::move(UpstreamGraphDB), FaultInPolicy)
+ .moveInto(PrimaryGraphDB))
+ return std::move(E);
+ std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB;
+ // \p UnifiedOnDiskCache does manual chaining for key-value requests,
+ // including an extra translation step of the value during fault-in.
+ if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize,
+ /*ValueName=*/"objectid",
+ /*ValueSize=*/sizeof(uint64_t))
+ .moveInto(PrimaryKVDB))
+ return std::move(E);
+ auto UniDB = std::unique_ptr<UnifiedOnDiskCache>(new UnifiedOnDiskCache());
+ UniDB->RootPath = RootPath;
+ UniDB->SizeLimit = SizeLimit;
+ UniDB->LockFD = LockFD;
+ // The instance now owns the descriptor and closes it in close().
+ CloseLockOnError.release();
+ UniDB->NeedsGarbageCollection = DBDirs.size() > 2;
+ UniDB->PrimaryDBDir = PrimaryDir;
+ UniDB->UpstreamGraphDB = UpstreamGraphDBPtr;
+ UniDB->PrimaryGraphDB = std::move(PrimaryGraphDB);
+ UniDB->UpstreamKVDB = std::move(UpstreamKVDB);
+ UniDB->PrimaryKVDB = std::move(PrimaryKVDB);
+ return std::move(UniDB);
+/// \returns true when a size limit is set and the combined storage size of the
+/// primary graph and key-value databases is greater than half of that limit.
+bool UnifiedOnDiskCache::hasExceededSizeLimit() const {
+ if (!SizeLimit)
+ return false;
+ // We allow each of the directories in the chain to reach up to half the
+ // intended size limit. Check whether the primary directory has exceeded half
+ // the limit or not, in order to decide whether we need to start a new chain.
+ //
+ // We could check the size limit against the sum of sizes of both the primary
+ // and upstream directories but then if the upstream is significantly larger
+ // than the intended limit, it would trigger a new chain to be created before
+ // the primary has reached its own limit. Essentially in such situation we
+ // prefer reclaiming the storage later in order to have more consistent cache
+ // hits behavior.
+ return (*SizeLimit / 2) <
+ (PrimaryGraphDB->getStorageSize() + PrimaryKVDB->getStorageSize());
+/// Close the databases, release the shared directory lock and close the lock
+/// file. Calling it again after a successful or failed close is a no-op.
+///
+/// \param CheckSizeLimit when true and the primary directory exceeded its
+/// limit, try to take an exclusive lock and create the next primary
+/// directory; when the exclusive lock is not available this is silently
+/// skipped.
+/// \returns an error if unlocking, locking (other than "no lock available") or
+/// creating the new directory fails.
+Error UnifiedOnDiskCache::close(bool CheckSizeLimit) {
+ if (LockFD == -1)
+ return Error::success(); // already closed.
+ // Runs on every exit path below: closes the lock file and marks this
+ // instance as closed.
+ auto _1 = make_scope_exit([&]() {
+ assert(LockFD >= 0);
+ sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+ sys::fs::closeFile(LockFile);
+ LockFD = -1;
+ });
+ // Sample the size before the databases are released.
+ bool ExceededSizeLimit = CheckSizeLimit ? hasExceededSizeLimit() : false;
+ PrimaryKVDB.reset();
+ UpstreamKVDB.reset();
+ PrimaryGraphDB.reset();
+ UpstreamGraphDB = nullptr;
+ if (std::error_code EC = sys::fs::unlockFile(LockFD))
+ return createFileError(RootPath, EC);
+ if (!ExceededSizeLimit)
+ return Error::success();
+ // The primary directory exceeded its intended size limit. Try to get an
+ // exclusive lock in order to create a new primary directory for next time
+ // this \p UnifiedOnDiskCache path is opened.
+ if (std::error_code EC = sys::fs::tryLockFile(
+ LockFD, std::chrono::milliseconds(0), /*Exclusive=*/true)) {
+ if (EC == errc::no_lock_available)
+ return Error::success(); // couldn't get exclusive lock, give up.
+ return createFileError(RootPath, EC);
+ }
+ // Declared after _1, so the exclusive lock is released before the file is
+ // closed.
+ auto _2 = make_scope_exit([&]() { sys::fs::unlockFile(LockFD); });
+ // Managed to get an exclusive lock which means there are no other open
+ // \p UnifiedOnDiskCache instances for the same path, so we can safely start a
+ // new primary directory. To start a new primary directory we just have to
+ // create a new empty directory with the next consecutive index; since this is
+ // an atomic operation we will leave the top-level directory in a consistent
+ // state even if the process dies during this code-path.
+ SmallString<256> PathBuf(RootPath);
+ raw_svector_ostream OS(PathBuf);
+ OS << sys::path::get_separator();
+ getNextDBDirName(PrimaryDBDir, OS);
+ if (std::error_code EC = sys::fs::create_directory(PathBuf))
+ return createFileError(PathBuf, EC);
+ NeedsGarbageCollection = true;
+ return Error::success();
+UnifiedOnDiskCache::UnifiedOnDiskCache() = default;
+// Closes the cache on destruction; any error from close() is discarded.
+UnifiedOnDiskCache::~UnifiedOnDiskCache() { consumeError(close()); }
+/// Delete every database sub-directory of \p Path except the two most recent
+/// ones (primary and upstream). Does nothing when there are two or fewer.
+/// \returns the first directory-listing or removal error.
+Error UnifiedOnDiskCache::collectGarbage(StringRef Path) {
+ SmallVector<std::string, 4> DBDirs;
+ if (Error E = getAllDBDirs(Path, DBDirs))
+ return E;
+ if (DBDirs.size() <= 2)
+ return Error::success(); // no unused directories.
+ // FIXME: When the version of \p DBDirPrefix is bumped up we need to figure
+ // out how to handle the leftover sub-directories of the previous version.
+ SmallString<256> PathBuf(Path);
+ for (StringRef UnusedSubDir : ArrayRef(DBDirs).drop_back(2)) {
+ sys::path::append(PathBuf, UnusedSubDir);
+ if (std::error_code EC = sys::fs::remove_directories(PathBuf))
+ return createFileError(PathBuf, EC);
+ // Restore PathBuf to the root path for the next iteration.
+ sys::path::remove_filename(PathBuf);
+ }
+ return Error::success();
diff --git a/llvm/tools/llvm-cas/CMakeLists.txt b/llvm/tools/llvm-cas/CMakeLists.txt
new file mode 100644
index 00000000000000..6093a906b503a7
--- /dev/null
+++ b/llvm/tools/llvm-cas/CMakeLists.txt
@@ -0,0 +1,8 @@
+ Support
+ )
+ llvm-cas.cpp
+ )
diff --git a/llvm/tools/llvm-cas/llvm-cas.cpp b/llvm/tools/llvm-cas/llvm-cas.cpp
new file mode 100644
index 00000000000000..fb1bd3df961cfe
--- /dev/null
+++ b/llvm/tools/llvm-cas/llvm-cas.cpp
@@ -0,0 +1,449 @@
+//===- llvm-cas.cpp - CAS tool --------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/CAS/TreeSchema.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/StringSaver.h"
+#include "llvm/Support/raw_ostream.h"
+#include <memory>
+#include <system_error>
+using namespace llvm;
+using namespace llvm::cas;
+// --all-trees: consulted by listTreeRecursively() to print non-empty trees
+// too.
+static cl::opt<bool> AllTrees(
+ "all-trees",
+ cl::desc("Print all trees, not just empty ones, for ls-tree-recursive"));
+// Positional arguments: the object IDs (or references) a command operates on.
+static cl::list<std::string> Inputs(cl::Positional, cl::desc("Input object"));
+static int dump(ObjectStore &CAS);
+static int listTree(ObjectStore &CAS, const CASID &ID);
+static int listTreeRecursively(ObjectStore &CAS, const CASID &ID);
+static int listObjectReferences(ObjectStore &CAS, const CASID &ID);
+static int catBlob(ObjectStore &CAS, const CASID &ID);
+static int catNodeData(ObjectStore &CAS, const CASID &ID);
+static int printKind(ObjectStore &CAS, const CASID &ID);
+static int makeBlob(ObjectStore &CAS, StringRef DataPath);
+static int makeNode(ObjectStore &CAS, ArrayRef<std::string> References,
+ StringRef DataPath);
+static int diffGraphs(ObjectStore &CAS, const CASID &LHS, const CASID &RHS);
+static int traverseGraph(ObjectStore &CAS, const CASID &ID);
+static int import(ObjectStore &CAS, ObjectStore &UpstreamCAS,
+ ArrayRef<std::string> Objects);
+static int putCacheKey(ObjectStore &CAS, ActionCache &AC,
+ ArrayRef<std::string> Objects);
+static int getCacheResult(ObjectStore &CAS, ActionCache &AC, const CASID &ID);
+/// Entry point of the llvm-cas tool: parses the command line, opens the
+/// on-disk CAS (and action cache) named by --cas, then dispatches to the
+/// handler of the selected command. Usage errors terminate the process
+/// through ExitOnError.
+int main(int Argc, char **Argv) {
+ InitLLVM X(Argc, Argv);
+ cl::opt<std::string> CASPath("cas", cl::desc("Path to CAS on disk."),
+ cl::value_desc("path"));
+ cl::opt<std::string> CASPluginPath("fcas-plugin-path",
+ cl::desc("Path to plugin CAS library"),
+ cl::value_desc("path"));
+ cl::list<std::string> CASPluginOpts("fcas-plugin-option",
+ cl::desc("Plugin CAS Options"));
+ cl::opt<std::string> UpstreamCASPath(
+ "upstream-cas", cl::desc("Path to another CAS."), cl::value_desc("path"));
+ cl::opt<std::string> DataPath("data",
+ cl::desc("Path to data or '-' for stdin."),
+ cl::value_desc("path"));
+ enum CommandKind {
+ Invalid,
+ Dump,
+ PrintKind,
+ CatBlob,
+ CatNodeData,
+ DiffGraphs,
+ TraverseGraph,
+ MakeBlob,
+ MakeNode,
+ ListTree,
+ ListTreeRecursive,
+ ListObjectReferences,
+ Import,
+ PutCacheKey,
+ GetCacheResult,
+ };
+ cl::opt<CommandKind> Command(
+ cl::desc("choose command action:"),
+ cl::values(
+ clEnumValN(Dump, "dump", "dump internal contents"),
+ clEnumValN(PrintKind, "print-kind", "print kind"),
+ clEnumValN(CatBlob, "cat-blob", "cat blob"),
+ clEnumValN(CatNodeData, "cat-node-data", "cat node data"),
+ clEnumValN(DiffGraphs, "diff-graphs", "diff graphs"),
+ clEnumValN(TraverseGraph, "traverse-graph", "traverse graph"),
+ clEnumValN(MakeBlob, "make-blob", "make blob"),
+ clEnumValN(MakeNode, "make-node", "make node"),
+ clEnumValN(ListTree, "ls-tree", "list tree"),
+ clEnumValN(ListTreeRecursive, "ls-tree-recursive",
+ "list tree recursive"),
+ clEnumValN(ListObjectReferences, "ls-node-refs", "list node refs"),
+ clEnumValN(Import, "import", "import objects from another CAS"),
+ clEnumValN(PutCacheKey, "put-cache-key",
+ "set a value for a cache key"),
+ clEnumValN(GetCacheResult, "get-cache-result",
+ "get the result value from a cache key")),
+ cl::init(CommandKind::Invalid));
+ cl::ParseCommandLineOptions(Argc, Argv, "llvm-cas CAS tool\n");
+ ExitOnError ExitOnErr("llvm-cas: ");
+ if (Command == CommandKind::Invalid)
+ ExitOnErr(createStringError(inconvertibleErrorCode(),
+ "no command action is specified"));
+ // FIXME: Consider creating an in-memory CAS.
+ if (CASPath.empty())
+ ExitOnErr(
+ createStringError(inconvertibleErrorCode(), "missing --cas=<path>"));
+ std::shared_ptr<ObjectStore> CAS;
+ std::shared_ptr<ActionCache> AC;
+ std::tie(CAS, AC) = ExitOnErr(createOnDiskUnifiedCASDatabases(CASPath));
+ assert(CAS);
+ std::shared_ptr<ObjectStore> UpstreamCAS;
+ if (!UpstreamCASPath.empty())
+ UpstreamCAS = ExitOnErr(createCASFromIdentifier(UpstreamCASPath));
+ // Commands that do not take a single <object> argument are handled first.
+ if (Command == Dump)
+ return dump(*CAS);
+ if (Command == MakeBlob)
+ return makeBlob(*CAS, DataPath);
+ if (Command == MakeNode)
+ return makeNode(*CAS, Inputs, DataPath);
+ if (Command == DiffGraphs) {
+ // The banner ends in ": " like every other banner in this tool so that it
+ // is separated from the error text.
+ ExitOnError CommandErr("llvm-cas: diff-graphs: ");
+ if (Inputs.size() != 2)
+ CommandErr(
+ createStringError(inconvertibleErrorCode(), "expected 2 objects"));
+ CASID LHS = ExitOnErr(CAS->parseID(Inputs[0]));
+ CASID RHS = ExitOnErr(CAS->parseID(Inputs[1]));
+ return diffGraphs(*CAS, LHS, RHS);
+ }
+ if (Inputs.empty())
+ ExitOnErr(createStringError(inconvertibleErrorCode(),
+ "missing <object> to operate on"));
+ if (Command == Import) {
+ if (!UpstreamCAS)
+ ExitOnErr(createStringError(inconvertibleErrorCode(),
+ "missing '-upstream-cas'"));
+ return import(*CAS, *UpstreamCAS, Inputs);
+ }
+ // Both cache commands dereference AC below, so reject a missing cache here.
+ if (Command == PutCacheKey || Command == GetCacheResult) {
+ if (!AC)
+ ExitOnErr(createStringError(inconvertibleErrorCode(),
+ "no action-cache available"));
+ }
+ if (Command == PutCacheKey)
+ return putCacheKey(*CAS, *AC, Inputs);
+ // Remaining commands need exactly one CAS object.
+ if (Inputs.size() > 1)
+ ExitOnErr(createStringError(inconvertibleErrorCode(),
+ "too many <object>s, expected 1"));
+ CASID ID = ExitOnErr(CAS->parseID(Inputs.front()));
+ if (Command == GetCacheResult)
+ return getCacheResult(*CAS, *AC, ID);
+ if (Command == TraverseGraph)
+ return traverseGraph(*CAS, ID);
+ if (Command == ListTree)
+ return listTree(*CAS, ID);
+ if (Command == ListTreeRecursive)
+ return listTreeRecursively(*CAS, ID);
+ if (Command == ListObjectReferences)
+ return listObjectReferences(*CAS, ID);
+ if (Command == CatNodeData)
+ return catNodeData(*CAS, ID);
+ if (Command == PrintKind)
+ return printKind(*CAS, ID);
+ // Every other command has been dispatched above.
+ assert(Command == CatBlob);
+ return catBlob(*CAS, ID);
+/// ls-tree: load object \p ID as a tree and print each of its entries to
+/// stdout. Exits the process if the object cannot be loaded or is not a tree.
+int listTree(ObjectStore &CAS, const CASID &ID) {
+ ExitOnError ExitOnErr("llvm-cas: ls-tree: ");
+ TreeSchema Schema(CAS);
+ ObjectProxy TreeN = ExitOnErr(CAS.getProxy(ID));
+ TreeProxy Tree = ExitOnErr(Schema.load(TreeN));
+ ExitOnErr(Tree.forEachEntry([&](const NamedTreeEntry &Entry) {
+ Entry.print(llvm::outs(), CAS);
+ return Error::success();
+ }));
+ return 0;
+/// ls-tree-recursive: walk the tree rooted at \p ID and print every non-tree
+/// entry to stdout. Tree entries are printed only when they are empty or when
+/// --all-trees is given.
+int listTreeRecursively(ObjectStore &CAS, const CASID &ID) {
+ ExitOnError ExitOnErr("llvm-cas: ls-tree-recursively: ");
+ TreeSchema Schema(CAS);
+ ObjectProxy TreeN = ExitOnErr(CAS.getProxy(ID));
+ ExitOnErr(Schema.walkFileTreeRecursively(
+ CAS, TreeN.getRef(),
+ [&](const NamedTreeEntry &Entry, std::optional<TreeProxy> Tree) -> Error {
+ if (Entry.getKind() != TreeEntry::Tree) {
+ Entry.print(llvm::outs(), CAS);
+ return Error::success();
+ }
+ // Tree is dereferenced unchecked: the walker is expected to pass the
+ // loaded proxy for every Tree entry.
+ if (Tree->empty() || AllTrees)
+ Entry.print(llvm::outs(), CAS);
+ return Error::success();
+ }));
+ return 0;
+/// cat-blob: identical to cat-node-data; prints the object's data to stdout.
+int catBlob(ObjectStore &CAS, const CASID &ID) { return catNodeData(CAS, ID); }
+/// Opens the input named by \p DataPath. "-" reads stdin; any other non-empty
+/// value is opened as a file. An empty path yields a "--data missing" error.
+static Expected<std::unique_ptr<MemoryBuffer>> openBuffer(StringRef DataPath) {
+  if (DataPath.empty())
+    return createStringError(inconvertibleErrorCode(), "--data missing");
+  if (DataPath == "-")
+    return errorOrToExpected(llvm::MemoryBuffer::getSTDIN());
+  return errorOrToExpected(llvm::MemoryBuffer::getFile(DataPath));
+}
+/// Prints the store through ObjectStore::print() to stdout and returns 0.
+int dump(ObjectStore &CAS) {
+  // Nothing here can produce an llvm::Error, so no ExitOnError is needed.
+  CAS.print(llvm::outs());
+  return 0;
+}
+/// Reads the input named by \p DataPath, stores it in \p CAS as an object
+/// with no references, and prints the new object's ID on its own line.
+int makeBlob(ObjectStore &CAS, StringRef DataPath) {
+  ExitOnError ExitOnErr("llvm-cas: make-blob: ");
+  std::unique_ptr<MemoryBuffer> Input = ExitOnErr(openBuffer(DataPath));
+  StringRef Contents = Input->getBuffer();
+  ObjectProxy Blob = ExitOnErr(CAS.createProxy(std::nullopt, Contents));
+  llvm::outs() << Blob.getID() << "\n";
+  return 0;
+}
+/// Loads object \p ID and writes its data to stdout. Exits with a
+/// "cat-node-data" diagnostic if the object cannot be loaded.
+int catNodeData(ObjectStore &CAS, const CASID &ID) {
+  ExitOnError ExitOnErr("llvm-cas: cat-node-data: ");
+  ObjectProxy Node = ExitOnErr(CAS.getProxy(ID));
+  llvm::outs() << Node.getData();
+  return 0;
+}
+/// Returns "tree" when TreeSchema recognises \p Object as one of its nodes,
+/// and "object" otherwise.
+static StringRef getKindString(ObjectStore &CAS, ObjectProxy Object) {
+  TreeSchema Schema(CAS);
+  return Schema.isNode(Object) ? "tree" : "object";
+}
+/// Loads object \p ID and prints its kind string ("tree" or "object")
+/// followed by a newline.
+int printKind(ObjectStore &CAS, const CASID &ID) {
+  ExitOnError ExitOnErr("llvm-cas: print-kind: ");
+  StringRef Kind = getKindString(CAS, ExitOnErr(CAS.getProxy(ID)));
+  llvm::outs() << Kind << "\n";
+  return 0;
+}
+/// Loads object \p ID and prints the ID of each object it references, one
+/// per line.
+int listObjectReferences(ObjectStore &CAS, const CASID &ID) {
+  ExitOnError ExitOnErr("llvm-cas: ls-node-refs: ");
+  ObjectProxy Node = ExitOnErr(CAS.getProxy(ID));
+  auto PrintRef = [&](ObjectRef Ref) -> Error {
+    llvm::outs() << CAS.getID(Ref) << "\n";
+    return Error::success();
+  };
+  ExitOnErr(Node.forEachReference(PrintRef));
+  return 0;
+}
+/// Creates a node in \p CAS whose data comes from \p DataPath and whose
+/// references are the objects named in \p Objects, then prints the node's ID.
+/// Each failure stage (data, ref, node creation) exits with its own banner.
+static int makeNode(ObjectStore &CAS, ArrayRef<std::string> Objects,
+                    StringRef DataPath) {
+  ExitOnError DataErr("llvm-cas: make-node: data: ");
+  std::unique_ptr<MemoryBuffer> Data = DataErr(openBuffer(DataPath));
+
+  ExitOnError RefErr("llvm-cas: make-node: ref: ");
+  SmallVector<ObjectRef> Refs;
+  for (StringRef Name : Objects) {
+    CASID ID = RefErr(CAS.parseID(Name));
+    std::optional<ObjectRef> Ref = CAS.getReference(ID);
+    // An ID that parses but has no reference in this store is an error.
+    if (!Ref)
+      RefErr(createStringError(inconvertibleErrorCode(),
+                               "unknown object '" + Name + "'"));
+    Refs.push_back(*Ref);
+  }
+
+  ExitOnError ExitOnErr("llvm-cas: make-node: ");
+  ObjectProxy Node = ExitOnErr(CAS.createProxy(Refs, Data->getBuffer()));
+  llvm::outs() << Node.getID() << "\n";
+  return 0;
+}
+namespace {
+/// Result of a graph traversal: which objects were reached, and in what
+/// order they finished.
+struct GraphInfo {
+ // IDs in the order traverseObjectGraph() finished them.
+ SmallVector<cas::CASID> PostOrder;
+ // Every ID that was queued during the traversal.
+ DenseSet<cas::CASID> Seen;
+} // namespace
+/// Walks everything reachable from \p TopLevel and returns the set of visited
+/// IDs together with the order in which they were finished. Tree nodes are
+/// expanded through their entries, all other objects through their raw
+/// references. Exits via ExitOnError if an object cannot be loaded.
+static GraphInfo traverseObjectGraph(ObjectStore &CAS, const CASID &TopLevel) {
+ ExitOnError ExitOnErr("llvm-cas: traverse-node-graph: ");
+ GraphInfo Info;
+ // Each entry is (ID, whether its children have already been queued).
+ SmallVector<std::pair<CASID, bool>> Worklist;
+ // Queue an ID only the first time it is seen.
+ auto push = [&](CASID ID) {
+ if (Info.Seen.insert(ID).second)
+ Worklist.push_back({ID, false});
+ };
+ push(TopLevel);
+ while (!Worklist.empty()) {
+ // Second time at the back of the worklist: record the ID and pop it.
+ if (Worklist.back().second) {
+ Info.PostOrder.push_back(Worklist.pop_back_val().first);
+ continue;
+ }
+ Worklist.back().second = true;
+ // Copy the ID: push() below appends to Worklist.
+ CASID ID = Worklist.back().first;
+ ObjectProxy Object = ExitOnErr(CAS.getProxy(ID));
+ TreeSchema Schema(CAS);
+ if (Schema.isNode(Object)) {
+ TreeProxy Tree = ExitOnErr(Schema.load(Object));
+ ExitOnErr(Tree.forEachEntry([&](const NamedTreeEntry &Entry) {
+ push(CAS.getID(Entry.getRef()));
+ return Error::success();
+ }));
+ continue;
+ }
+ ExitOnErr(Object.forEachReference([&](ObjectRef Ref) {
+ push(CAS.getID(Ref));
+ return Error::success();
+ }));
+ }
+ return Info;
+/// Prints one line for every ID in \p New's post-order that \p Baseline did
+/// not visit. Each line is \p NewName, the kind string padded to four
+/// columns, and the ID.
+static void printDiffs(ObjectStore &CAS, const GraphInfo &Baseline,
+                       const GraphInfo &New, StringRef NewName) {
+  ExitOnError ExitOnErr("llvm-cas: diff-graphs: ");
+  for (cas::CASID ID : New.PostOrder) {
+    const bool InBaseline = Baseline.Seen.count(ID) != 0;
+    if (InBaseline)
+      continue;
+    ObjectProxy Object = ExitOnErr(CAS.getProxy(ID));
+    StringRef Kind = getKindString(CAS, Object);
+    outs() << llvm::formatv("{0}{1,-4} {2}\n", NewName, Kind, ID);
+  }
+}
+/// Prints the objects reachable from only one of \p LHS and \p RHS. Objects
+/// reachable only from \p LHS are prefixed with "- ", those reachable only
+/// from \p RHS with "+ ". Identical roots print nothing.
+int diffGraphs(ObjectStore &CAS, const CASID &LHS, const CASID &RHS) {
+  if (LHS == RHS)
+    return 0;
+  // The helpers below carry their own ExitOnError, so none is declared here.
+  GraphInfo LHSInfo = traverseObjectGraph(CAS, LHS);
+  GraphInfo RHSInfo = traverseObjectGraph(CAS, RHS);
+  printDiffs(CAS, RHSInfo, LHSInfo, "- ");
+  printDiffs(CAS, LHSInfo, RHSInfo, "+ ");
+  return 0;
+}
+/// Prints every object reachable from \p ID in post-order, one per line.
+int traverseGraph(ObjectStore &CAS, const CASID &ID) {
+  GraphInfo Info = traverseObjectGraph(CAS, ID);
+  // Diffing against an empty baseline prints every visited object.
+  printDiffs(CAS, GraphInfo{}, Info, "");
+  return 0;
+}
+/// Copies object \p ID, and recursively every object it references, from
+/// \p UpstreamCAS into \p CAS. Returns the reference for the object in
+/// \p CAS. Recursion stops at any ID for which CAS.getReference() already
+/// yields a reference. Exits via ExitOnError on failure.
+static ObjectRef importNode(ObjectStore &CAS, ObjectStore &UpstreamCAS,
+ const CASID &ID) {
+ ExitOnError ExitOnErr("llvm-cas: import: ");
+ std::optional<ObjectRef> PrimaryRef = CAS.getReference(ID);
+ if (PrimaryRef)
+ return *PrimaryRef; // object is present.
+ ObjectProxy UpstreamObj = ExitOnErr(UpstreamCAS.getProxy(ID));
+ // Import the referenced objects first so their refs in CAS are available
+ // when this object is stored.
+ SmallVector<ObjectRef> Refs;
+ ExitOnErr(UpstreamObj.forEachReference([&](ObjectRef UpstreamRef) -> Error {
+ ObjectRef Ref =
+ importNode(CAS, UpstreamCAS, UpstreamCAS.getID(UpstreamRef));
+ Refs.push_back(Ref);
+ return Error::success();
+ }));
+ return ExitOnErr(CAS.storeFromString(Refs, UpstreamObj.getData()));
+/// Parses each string in \p Objects as an ID and imports that object from
+/// \p UpstreamCAS into \p CAS through importNode().
+static int import(ObjectStore &CAS, ObjectStore &UpstreamCAS,
+                  ArrayRef<std::string> Objects) {
+  ExitOnError ExitOnErr("llvm-cas: import: ");
+  for (StringRef Name : Objects)
+    importNode(CAS, UpstreamCAS, ExitOnErr(CAS.parseID(Name)));
+  return 0;
+}
+/// Treats \p Objects as consecutive (key, result) ID pairs and records each
+/// pair in \p AC. An odd number of inputs is rejected before anything is
+/// written.
+static int putCacheKey(ObjectStore &CAS, ActionCache &AC,
+                       ArrayRef<std::string> Objects) {
+  ExitOnError ExitOnErr("llvm-cas: put-cache-key: ");
+  if (Objects.size() % 2 != 0)
+    ExitOnErr(createStringError(inconvertibleErrorCode(),
+                                "expected pairs of inputs"));
+  // The size is even here, so Objects[I + 1] is always in range.
+  for (size_t I = 0, E = Objects.size(); I < E; I += 2) {
+    CASID Key = ExitOnErr(CAS.parseID(Objects[I]));
+    CASID Result = ExitOnErr(CAS.parseID(Objects[I + 1]));
+    ExitOnErr(AC.put(Key, Result));
+  }
+  return 0;
+}
+/// Looks up \p ID in \p AC. On a hit prints the cached result and returns 0;
+/// on a miss prints "result not found" and returns 1.
+static int getCacheResult(ObjectStore &CAS, ActionCache &AC, const CASID &ID) {
+  ExitOnError ExitOnErr("llvm-cas: get-cache-result: ");
+  auto Result = ExitOnErr(AC.get(ID));
+  if (Result) {
+    outs() << *Result << "\n";
+    return 0;
+  }
+  outs() << "result not found\n";
+  return 1;
+}
>From 1252de966a7b3ac4a34c68367c6e52d56ad44da8 Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu at apple.com>
Date: Mon, 9 Oct 2023 11:01:53 -0700
Subject: [PATCH 05/11] [UnitTest] Add unit-test for LLVMCAS
llvm/unittests/CAS/ActionCacheTest.cpp | 152 ++++++++++
llvm/unittests/CAS/CASTestConfig.cpp | 33 +-
llvm/unittests/CAS/CASTestConfig.h | 34 ++-
llvm/unittests/CAS/CMakeLists.txt | 12 +
.../CAS/HierarchicalTreeBuilderTest.cpp | 210 +++++++++++++
llvm/unittests/CAS/ObjectStoreTest.cpp | 203 ++++++++++++-
llvm/unittests/CAS/OnDiskCommonUtils.h | 69 +++++
llvm/unittests/CAS/OnDiskGraphDBTest.cpp | 284 ++++++++++++++++++
.../CAS/OnDiskHashMappedTrieTest.cpp | 146 +++++++++
llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp | 54 ++++
llvm/unittests/CAS/TreeSchemaTest.cpp | 266 ++++++++++++++++
llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp | 182 +++++++++++
12 files changed, 1626 insertions(+), 19 deletions(-)
create mode 100644 llvm/unittests/CAS/ActionCacheTest.cpp
create mode 100644 llvm/unittests/CAS/HierarchicalTreeBuilderTest.cpp
create mode 100644 llvm/unittests/CAS/OnDiskCommonUtils.h
create mode 100644 llvm/unittests/CAS/OnDiskGraphDBTest.cpp
create mode 100644 llvm/unittests/CAS/OnDiskHashMappedTrieTest.cpp
create mode 100644 llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp
create mode 100644 llvm/unittests/CAS/TreeSchemaTest.cpp
create mode 100644 llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp
diff --git a/llvm/unittests/CAS/ActionCacheTest.cpp b/llvm/unittests/CAS/ActionCacheTest.cpp
new file mode 100644
index 00000000000000..1f5fdaa9003e23
--- /dev/null
+++ b/llvm/unittests/CAS/ActionCacheTest.cpp
@@ -0,0 +1,152 @@
+//===- ActionCacheTest.cpp ------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/ActionCache.h"
+#include "CASTestConfig.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+using namespace llvm::cas;
+// A key stored with put() must be returned by get(), and the returned ID must
+// resolve to the same object in the CAS.
+TEST_P(CASTest, ActionCacheHit) {
+  std::shared_ptr<ObjectStore> CAS = createObjectStore();
+  std::unique_ptr<ActionCache> Cache = createActionCache();
+  std::optional<ObjectProxy> ID;
+  ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID),
+                    Succeeded());
+  std::optional<CASID> ResultID;
+  ASSERT_THAT_ERROR(Cache->put(*ID, *ID), Succeeded());
+  ASSERT_THAT_ERROR(Cache->get(*ID).moveInto(ResultID), Succeeded());
+  // get() succeeds on a miss too; fail cleanly instead of dereferencing an
+  // empty optional.
+  ASSERT_TRUE(ResultID);
+  std::optional<ObjectRef> Result = CAS->getReference(*ResultID);
+  ASSERT_TRUE(Result);
+  ASSERT_EQ(*ID, *Result);
+}
+TEST_P(CASTest, ActionCacheMiss) {
+ std::shared_ptr<ObjectStore> CAS = createObjectStore();
+ std::unique_ptr<ActionCache> Cache = createActionCache();
+ std::optional<ObjectProxy> ID1, ID2;
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1),
+ Succeeded());
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2),
+ Succeeded());
+ ASSERT_THAT_ERROR(Cache->put(*ID1, *ID2), Succeeded());
+ // This is a cache miss for looking up a key doesn't exist.
+ std::optional<CASID> Result1;
+ ASSERT_THAT_ERROR(Cache->get(*ID2).moveInto(Result1), Succeeded());
+ ASSERT_FALSE(Result1);
+ ASSERT_THAT_ERROR(Cache->put(*ID2, *ID1), Succeeded());
+ // Cache hit after adding the value.
+ std::optional<CASID> Result2;
+ ASSERT_THAT_ERROR(Cache->get(*ID2).moveInto(Result2), Succeeded());
+ ASSERT_TRUE(Result2);
+ std::optional<ObjectRef> Ref = CAS->getReference(*Result2);
+ ASSERT_EQ(*ID1, *Ref);
+// A key may be written repeatedly only with the value it already maps to.
+TEST_P(CASTest, ActionCacheRewrite) {
+ std::shared_ptr<ObjectStore> CAS = createObjectStore();
+ std::unique_ptr<ActionCache> Cache = createActionCache();
+ std::optional<ObjectProxy> ID1, ID2;
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1),
+ Succeeded());
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2),
+ Succeeded());
+ ASSERT_THAT_ERROR(Cache->put(*ID1, *ID1), Succeeded());
+ // Writing to the same key with different value is error.
+ ASSERT_THAT_ERROR(Cache->put(*ID1, *ID2), Failed());
+ // Writing the same value multiple times to the same key is fine.
+ ASSERT_THAT_ERROR(Cache->put(*ID1, *ID1), Succeeded());
+// An on-disk action cache can hand back an ID that the CAS being queried does
+// not contain: the cache directory is shared while the object stores are not.
+TEST(OnDiskActionCache, ActionCacheResultInvalid) {
+  unittest::TempDir Temp("on-disk-cache", /*Unique=*/true);
+  std::unique_ptr<ObjectStore> CAS1 = createInMemoryCAS();
+  std::unique_ptr<ObjectStore> CAS2 = createInMemoryCAS();
+  std::optional<ObjectProxy> ID1, ID2, ID3;
+  ASSERT_THAT_ERROR(CAS1->createProxy(std::nullopt, "1").moveInto(ID1),
+                    Succeeded());
+  ASSERT_THAT_ERROR(CAS1->createProxy(std::nullopt, "2").moveInto(ID2),
+                    Succeeded());
+  // Same content as ID1, but created in CAS2.
+  ASSERT_THAT_ERROR(CAS2->createProxy(std::nullopt, "1").moveInto(ID3),
+                    Succeeded());
+  std::unique_ptr<ActionCache> Cache1 =
+      cantFail(createOnDiskActionCache(Temp.path()));
+  // Test put and get.
+  ASSERT_THAT_ERROR(Cache1->put(*ID1, *ID2), Succeeded());
+  std::optional<CASID> Result;
+  ASSERT_THAT_ERROR(Cache1->get(*ID1).moveInto(Result), Succeeded());
+  ASSERT_TRUE(Result);
+  // Open a second on-disk action cache at the same location; it is queried
+  // below with IDs from the other object store (CAS2).
+  std::unique_ptr<ActionCache> Cache2 =
+      cantFail(createOnDiskActionCache(Temp.path()));
+  // Load a key that points to an object CAS2 does not have.
+  std::optional<CASID> Result2;
+  // Get will work but the resulting CASID doesn't exist in ObjectStore.
+  ASSERT_THAT_ERROR(Cache2->get(*ID3).moveInto(Result2), Succeeded());
+  // The lookup is expected to hit; check before dereferencing.
+  ASSERT_TRUE(Result2);
+  ASSERT_FALSE(CAS2->getReference(*Result2));
+  // Writing a different value for the existing key is an error.
+  ASSERT_THAT_ERROR(Cache2->put(*ID3, *ID3), Failed());
+}
+// Exercises the asynchronous ActionCache entry points: the future-returning
+// putFuture/getFuture and the callback-taking putAsync/getAsync.
+// NOTE(review): the lookups only check that no error is reported; whether
+// ResultID holds a value is never asserted.
+TEST_P(CASTest, ActionCacheAsync) {
+ std::shared_ptr<ObjectStore> CAS = createObjectStore();
+ std::unique_ptr<ActionCache> Cache = createActionCache();
+ // Future-based put followed by get on the same key.
+ {
+ std::optional<ObjectProxy> ID;
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID),
+ Succeeded());
+ auto PutFuture = Cache->putFuture(*ID, *ID);
+ ASSERT_THAT_ERROR(PutFuture.get().take(), Succeeded());
+ auto GetFuture = Cache->getFuture(*ID);
+ std::optional<CASID> ResultID;
+ ASSERT_THAT_ERROR(GetFuture.get().take().moveInto(ResultID), Succeeded());
+ }
+ std::optional<ObjectProxy> ID2;
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2),
+ Succeeded());
+ // Callback-based put: the callback forwards the result through a promise so
+ // the test can block on it.
+ {
+ std::promise<AsyncErrorValue> Promise;
+ auto Future = Promise.get_future();
+ Cache->putAsync(*ID2, *ID2, false,
+ [Promise = std::move(Promise)](Error E) mutable {
+ Promise.set_value(std::move(E));
+ });
+ ASSERT_THAT_ERROR(Future.get().take(), Succeeded());
+ }
+ // Callback-based get on the key written above.
+ {
+ std::promise<AsyncCASIDValue> Promise;
+ auto Future = Promise.get_future();
+ Cache->getAsync(*ID2, false,
+ [Promise = std::move(Promise)](
+ Expected<std::optional<CASID>> Value) mutable {
+ Promise.set_value(std::move(Value));
+ });
+ std::optional<CASID> ResultID;
+ ASSERT_THAT_ERROR(Future.get().take().moveInto(ResultID), Succeeded());
+ }
diff --git a/llvm/unittests/CAS/CASTestConfig.cpp b/llvm/unittests/CAS/CASTestConfig.cpp
index bb06ee5573134f..4747ea9d8aa028 100644
--- a/llvm/unittests/CAS/CASTestConfig.cpp
+++ b/llvm/unittests/CAS/CASTestConfig.cpp
@@ -7,16 +7,45 @@
#include "CASTestConfig.h"
+#include "llvm/CAS/ActionCache.h"
#include "llvm/CAS/ObjectStore.h"
#include "gtest/gtest.h"
+#include <mutex>
using namespace llvm;
using namespace llvm::cas;
-CASTestingEnv createInMemory(int I) {
+TestingAndDir createInMemory(int I) {
std::unique_ptr<ObjectStore> CAS = createInMemoryCAS();
- return CASTestingEnv{std::move(CAS)};
+ std::unique_ptr<ActionCache> Cache = createInMemoryActionCache();
+ return TestingAndDir{std::move(CAS), std::move(Cache), std::nullopt};
+/// Sets LLVM_CAS_MAX_MAPPING_SIZE to 100 MiB for the test process unless the
+/// variable is already set. Marked as a constructor function so it runs at
+/// load time.
+__attribute__((constructor)) static void configureCASTestEnv() {
+ // Restrict the size of the on-disk CAS for tests. This allows testing in
+ // constrained environments (e.g. small TMPDIR). It also prevents leaving
+ // behind large files on file systems that do not support sparse files if a
+ // test crashes before resizing the file.
+ static std::once_flag Flag;
+ std::call_once(Flag, [] {
+ size_t Limit = 100 * 1024 * 1024;
+ std::string LimitStr = std::to_string(Limit);
+ // overwrite=false keeps any value already present in the environment.
+ setenv("LLVM_CAS_MAX_MAPPING_SIZE", LimitStr.c_str(), /*overwrite=*/false);
+ });
+/// Creates an on-disk object store and an on-disk action cache in one fresh
+/// temporary directory and returns both together with the directory handle.
+/// The index argument \p I is not used.
+TestingAndDir createOnDisk(int I) {
+ unittest::TempDir Temp("on-disk-cas", /*Unique=*/true);
+ std::unique_ptr<ObjectStore> CAS;
+ EXPECT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded());
+ std::unique_ptr<ActionCache> Cache;
+ EXPECT_THAT_ERROR(createOnDiskActionCache(Temp.path()).moveInto(Cache),
+ Succeeded());
+ // Temp is moved into the result so it stays alive with the returned value.
+ return TestingAndDir{std::move(CAS), std::move(Cache), std::move(Temp)};
+INSTANTIATE_TEST_SUITE_P(OnDiskCAS, CASTest, ::testing::Values(createOnDisk));
diff --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h
index c787e800396543..3917fd6378d34c 100644
--- a/llvm/unittests/CAS/CASTestConfig.h
+++ b/llvm/unittests/CAS/CASTestConfig.h
@@ -6,31 +6,47 @@
+#include "llvm/CAS/ActionCache.h"
#include "llvm/CAS/ObjectStore.h"
-#include "llvm/Config/llvm-config.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Testing/Support/Error.h"
#include "llvm/Testing/Support/SupportHelpers.h"
#include "gtest/gtest.h"
+#include <memory>
-struct CASTestingEnv {
- std::unique_ptr<llvm::cas::ObjectStore> CAS;
+struct TestingAndDir {
+ std::shared_ptr<llvm::cas::ObjectStore> CAS;
+ std::unique_ptr<llvm::cas::ActionCache> Cache;
+ std::optional<llvm::unittest::TempDir> Temp;
class CASTest
- : public testing::TestWithParam<std::function<CASTestingEnv(int)>> {
+ : public testing::TestWithParam<std::function<TestingAndDir(int)>> {
std::optional<int> NextCASIndex;
- std::unique_ptr<llvm::cas::ObjectStore> createObjectStore() {
+ llvm::SmallVector<llvm::unittest::TempDir> Dirs;
+ std::shared_ptr<llvm::cas::ObjectStore> createObjectStore() {
auto TD = GetParam()(++(*NextCASIndex));
+ if (TD.Temp)
+ Dirs.push_back(std::move(*TD.Temp));
return std::move(TD.CAS);
+ std::unique_ptr<llvm::cas::ActionCache> createActionCache() {
+ auto TD = GetParam()(++(*NextCASIndex));
+ if (TD.Temp)
+ Dirs.push_back(std::move(*TD.Temp));
+ return std::move(TD.Cache);
+ }
void SetUp() { NextCASIndex = 0; }
- void TearDown() { NextCASIndex = std::nullopt; }
+ void TearDown() {
+ NextCASIndex = std::nullopt;
+ Dirs.clear();
+ }
diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt
index 39a2100c4909ee..8a82d8b8df3bfb 100644
--- a/llvm/unittests/CAS/CMakeLists.txt
+++ b/llvm/unittests/CAS/CMakeLists.txt
@@ -1,3 +1,7 @@
+ add_definitions(-DLLVM_ENABLE_ONDISK_CAS=1)
@@ -5,8 +9,16 @@ set(LLVM_LINK_COMPONENTS
+ ActionCacheTest.cpp
+ HierarchicalTreeBuilderTest.cpp
+ OnDiskCommonUtils.h
+ OnDiskGraphDBTest.cpp
+ OnDiskHashMappedTrieTest.cpp
+ OnDiskKeyValueDBTest.cpp
+ TreeSchemaTest.cpp
+ UnifiedOnDiskCacheTest.cpp
target_link_libraries(CASTests PRIVATE LLVMTestingSupport)
diff --git a/llvm/unittests/CAS/HierarchicalTreeBuilderTest.cpp b/llvm/unittests/CAS/HierarchicalTreeBuilderTest.cpp
new file mode 100644
index 00000000000000..ec8bfafe31f99e
--- /dev/null
+++ b/llvm/unittests/CAS/HierarchicalTreeBuilderTest.cpp
@@ -0,0 +1,210 @@
+//===- HierarchicalTreeBuilderTest.cpp ------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/HierarchicalTreeBuilder.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+#include <memory>
+using namespace llvm;
+using namespace llvm::cas;
+/// Looks up the file at posix path \p Name beneath \p Root and returns its
+/// contents, or null when no visited tree named like the parent directory has
+/// an entry with that file name. A walk error is reported through
+/// EXPECT_THAT_ERROR.
+static std::unique_ptr<MemoryBuffer> getBufferForName(ObjectStore &CAS,
+                                                      TreeSchema &Tree,
+                                                      ObjectRef Root,
+                                                      StringRef Name) {
+  StringRef Filename = sys::path::filename(Name, sys::path::Style::posix);
+  StringRef Dirname = sys::path::parent_path(Name, sys::path::Style::posix);
+  std::unique_ptr<MemoryBuffer> Buffer;
+  auto Visit = [&](const NamedTreeEntry &Entry,
+                   std::optional<TreeProxy> Proxy) -> Error {
+    // Only trees whose name matches the parent directory are of interest.
+    if (!Proxy || Entry.getName() != Dirname)
+      return Error::success();
+    auto File = Proxy->lookup(Filename);
+    if (!File)
+      return Error::success();
+    auto Loaded = CAS.getProxy(File->getRef());
+    if (!Loaded)
+      return Loaded.takeError();
+    Buffer = Loaded->getMemoryBuffer();
+    return Error::success();
+  };
+  EXPECT_THAT_ERROR(Tree.walkFileTreeRecursively(CAS, Root, Visit),
+                    Succeeded());
+  return Buffer;
+}
+// Files pushed directly under the root can be read back by name.
+TEST(HierarchicalTreeBuilderTest, Flat) {
+  std::unique_ptr<ObjectStore> CAS = createInMemoryCAS();
+  auto make = [&](StringRef Content) {
+    return *expectedToOptional(CAS->storeFromString(std::nullopt, Content));
+  };
+  HierarchicalTreeBuilder Builder;
+  Builder.push(make("1"), TreeEntry::Regular, "/file1");
+  Builder.push(make("1"), TreeEntry::Regular, "/1");
+  Builder.push(make("2"), TreeEntry::Regular, "/2");
+  std::optional<ObjectProxy> Root;
+  ASSERT_THAT_ERROR(Builder.create(*CAS).moveInto(Root), Succeeded());
+  TreeSchema Tree(*CAS);
+  ASSERT_TRUE(Tree.isNode(*Root));
+  std::unique_ptr<MemoryBuffer> F1 =
+      getBufferForName(*CAS, Tree, Root->getRef(), "/1");
+  std::unique_ptr<MemoryBuffer> F2 =
+      getBufferForName(*CAS, Tree, Root->getRef(), "/2");
+  std::unique_ptr<MemoryBuffer> Ffile1 =
+      getBufferForName(*CAS, Tree, Root->getRef(), "/file1");
+  // getBufferForName() returns null when the file is not found; check every
+  // buffer before dereferencing it.
+  ASSERT_TRUE(F1);
+  ASSERT_TRUE(F2);
+  ASSERT_TRUE(Ffile1);
+  EXPECT_EQ("1", F1->getBuffer());
+  EXPECT_EQ("2", F2->getBuffer());
+  EXPECT_EQ("1", Ffile1->getBuffer());
+}
+// Files pushed at nested paths can be read back, and a path that was never
+// pushed yields no buffer.
+TEST(HierarchicalTreeBuilderTest, Nested) {
+  std::unique_ptr<ObjectStore> CAS = createInMemoryCAS();
+  auto make = [&](StringRef Content) {
+    return *expectedToOptional(CAS->storeFromString(std::nullopt, Content));
+  };
+  HierarchicalTreeBuilder Builder;
+  Builder.push(make("blob2"), TreeEntry::Regular, "/d2");
+  Builder.push(make("blob1"), TreeEntry::Regular, "/t1/d1");
+  Builder.push(make("blob3"), TreeEntry::Regular, "/t3/d3");
+  Builder.push(make("blob1"), TreeEntry::Regular, "/t3/t1nested/d1");
+  Builder.push(make("blob1"), TreeEntry::Regular, "/t3/t2/d1also");
+  Builder.push(make("blob2"), TreeEntry::Regular, "/t3/t2/d2");
+  std::optional<ObjectProxy> Root;
+  ASSERT_THAT_ERROR(Builder.create(*CAS).moveInto(Root), Succeeded());
+  TreeSchema Tree(*CAS);
+  ASSERT_TRUE(Tree.isNode(*Root));
+  std::unique_ptr<MemoryBuffer> F1 =
+      getBufferForName(*CAS, Tree, Root->getRef(), "/1");
+  std::unique_ptr<MemoryBuffer> T1D1 =
+      getBufferForName(*CAS, Tree, Root->getRef(), "/t1/d1");
+  std::unique_ptr<MemoryBuffer> T1NestedD1 =
+      getBufferForName(*CAS, Tree, Root->getRef(), "/t3/t1nested/d1");
+  std::unique_ptr<MemoryBuffer> T3T2D1Also =
+      getBufferForName(*CAS, Tree, Root->getRef(), "/t3/t2/d1also");
+  std::unique_ptr<MemoryBuffer> T3TD3 =
+      getBufferForName(*CAS, Tree, Root->getRef(), "/t3/d3");
+  // Nothing named "1" was pushed, so that lookup must come back empty.
+  EXPECT_FALSE(F1);
+  // Check every buffer before dereferencing it.
+  ASSERT_TRUE(T1D1);
+  ASSERT_TRUE(T1NestedD1);
+  ASSERT_TRUE(T3T2D1Also);
+  ASSERT_TRUE(T3TD3);
+  EXPECT_EQ("blob1", T1D1->getBuffer());
+  EXPECT_EQ("blob1", T1NestedD1->getBuffer());
+  EXPECT_EQ("blob1", T3T2D1Also->getBuffer());
+  EXPECT_EQ("blob3", T3TD3->getBuffer());
+}
+// Several trees pushed into one builder merge into a single tree; pushing
+// the same tree twice is accepted, and a tree can be pushed under a
+// sub-path.
+TEST(HierarchicalTreeBuilderTest, MergeDirectories) {
+  std::unique_ptr<ObjectStore> CAS = createInMemoryCAS();
+  auto make = [&](StringRef Content) {
+    return *expectedToOptional(CAS->storeFromString(std::nullopt, Content));
+  };
+  // Builds a one-file tree. On failure the ASSERT only leaves this lambda,
+  // so callers must check \p Root afterwards.
+  auto createRoot = [&](StringRef Blob, StringRef Path,
+                        std::optional<ObjectRef> &Root) {
+    HierarchicalTreeBuilder Builder;
+    Builder.push(make(Blob), TreeEntry::Regular, Path);
+    std::optional<ObjectProxy> H;
+    ASSERT_THAT_ERROR(Builder.create(*CAS).moveInto(H), Succeeded());
+    Root = CAS->getReference(*H);
+  };
+  std::optional<ObjectRef> Root1;
+  createRoot("blob1", "/t1/d1", Root1);
+  ASSERT_TRUE(Root1);
+  std::optional<ObjectRef> Root2;
+  createRoot("blob2", "/t1/d2", Root2);
+  ASSERT_TRUE(Root2);
+  std::optional<ObjectRef> Root3;
+  createRoot("blob3", "/t1/nested/d1", Root3);
+  ASSERT_TRUE(Root3);
+  HierarchicalTreeBuilder Builder;
+  Builder.pushTreeContent(*Root1, "/");
+  Builder.pushTreeContent(*Root2, "");
+  Builder.pushTreeContent(*Root3, "/");
+  Builder.pushTreeContent(*Root1, "");
+  Builder.pushTreeContent(*Root1, "other1/nest");
+  std::optional<ObjectProxy> Root;
+  ASSERT_THAT_ERROR(Builder.create(*CAS).moveInto(Root), Succeeded());
+  TreeSchema Tree(*CAS);
+  ASSERT_TRUE(Tree.isNode(*Root));
+  std::unique_ptr<MemoryBuffer> T1D1 =
+      getBufferForName(*CAS, Tree, Root->getRef(), "/t1/d1");
+  std::unique_ptr<MemoryBuffer> T1D2 =
+      getBufferForName(*CAS, Tree, Root->getRef(), "/t1/d2");
+  std::unique_ptr<MemoryBuffer> T1NestedD1 =
+      getBufferForName(*CAS, Tree, Root->getRef(), "/t1/nested/d1");
+  std::unique_ptr<MemoryBuffer> OtherT1D1 =
+      getBufferForName(*CAS, Tree, Root->getRef(), "/other1/nest/t1/d1");
+  // Check every buffer before dereferencing it.
+  ASSERT_TRUE(T1D1);
+  ASSERT_TRUE(T1D2);
+  ASSERT_TRUE(T1NestedD1);
+  ASSERT_TRUE(OtherT1D1);
+  EXPECT_EQ("blob1", T1D1->getBuffer());
+  EXPECT_EQ("blob2", T1D2->getBuffer());
+  EXPECT_EQ("blob3", T1NestedD1->getBuffer());
+  EXPECT_EQ("blob1", OtherT1D1->getBuffer());
+}
+// Merging trees that disagree about a path must fail: once when the same
+// path holds two different blobs, and once when a path is a file in one tree
+// and a directory in the other.
+TEST(HierarchicalTreeBuilderTest, MergeDirectoriesConflict) {
+  std::unique_ptr<ObjectStore> CAS = createInMemoryCAS();
+  auto make = [&](StringRef Content) {
+    return *expectedToOptional(CAS->storeFromString(std::nullopt, Content));
+  };
+  // Builds a one-file tree. On failure the ASSERT only leaves this lambda,
+  // so callers must check \p Root afterwards.
+  auto createRoot = [&](StringRef Blob, StringRef Path,
+                        std::optional<ObjectProxy> &Root) {
+    HierarchicalTreeBuilder Builder;
+    Builder.push(make(Blob), TreeEntry::Regular, Path);
+    ASSERT_THAT_ERROR(Builder.create(*CAS).moveInto(Root), Succeeded());
+  };
+  std::optional<ObjectProxy> Root1;
+  createRoot("blob1", "/t1/d1", Root1);
+  ASSERT_TRUE(Root1);
+  std::optional<ObjectProxy> Root2;
+  createRoot("blob2", "/t1/d1", Root2);
+  ASSERT_TRUE(Root2);
+  std::optional<ObjectProxy> Root3;
+  createRoot("blob3", "/t1/d1/nested", Root3);
+  ASSERT_TRUE(Root3);
+  {
+    HierarchicalTreeBuilder Builder;
+    Builder.pushTreeContent(Root1->getRef(), "");
+    Builder.pushTreeContent(Root2->getRef(), "");
+    std::optional<ObjectProxy> Root;
+    EXPECT_THAT_ERROR(
+        Builder.create(*CAS).moveInto(Root),
+        FailedWithMessage("duplicate path '/t1/d1' with different ID"));
+  }
+  {
+    HierarchicalTreeBuilder Builder;
+    Builder.pushTreeContent(Root1->getRef(), "");
+    Builder.pushTreeContent(Root3->getRef(), "");
+    std::optional<ObjectProxy> Root;
+    EXPECT_THAT_ERROR(Builder.create(*CAS).moveInto(Root),
+                      FailedWithMessage("duplicate path '/t1/d1'"));
+  }
+}
diff --git a/llvm/unittests/CAS/ObjectStoreTest.cpp b/llvm/unittests/CAS/ObjectStoreTest.cpp
index fb29d76cff46f5..de443032c34243 100644
--- a/llvm/unittests/CAS/ObjectStoreTest.cpp
+++ b/llvm/unittests/CAS/ObjectStoreTest.cpp
@@ -7,8 +7,9 @@
#include "llvm/CAS/ObjectStore.h"
-#include "llvm/Config/llvm-config.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/ThreadPool.h"
#include "llvm/Testing/Support/Error.h"
#include "llvm/Testing/Support/SupportHelpers.h"
#include "gtest/gtest.h"
@@ -19,7 +20,7 @@ using namespace llvm;
using namespace llvm::cas;
TEST_P(CASTest, PrintIDs) {
- std::unique_ptr<ObjectStore> CAS = createObjectStore();
+ std::shared_ptr<ObjectStore> CAS = createObjectStore();
std::optional<CASID> ID1, ID2;
ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1),
@@ -39,7 +40,7 @@ TEST_P(CASTest, PrintIDs) {
TEST_P(CASTest, Blobs) {
- std::unique_ptr<ObjectStore> CAS1 = createObjectStore();
+ std::shared_ptr<ObjectStore> CAS1 = createObjectStore();
StringRef ContentStrings[] = {
"some longer text std::string's local memory",
@@ -90,7 +91,7 @@ multiline text multiline text multiline text multiline text multiline text)",
// Confirm these blobs don't exist in a fresh CAS instance.
- std::unique_ptr<ObjectStore> CAS2 = createObjectStore();
+ std::shared_ptr<ObjectStore> CAS2 = createObjectStore();
for (int I = 0, E = IDs.size(); I != E; ++I) {
std::optional<ObjectProxy> Proxy;
EXPECT_THAT_ERROR(CAS2->getProxy(IDs[I]).moveInto(Proxy), Failed());
@@ -114,7 +115,7 @@ multiline text multiline text multiline text multiline text multiline text)",
TEST_P(CASTest, BlobsBig) {
// A little bit of validation that bigger blobs are okay. Climb up to 1MB.
- std::unique_ptr<ObjectStore> CAS = createObjectStore();
+ std::shared_ptr<ObjectStore> CAS = createObjectStore();
SmallString<256> String1 = StringRef("a few words");
SmallString<256> String2 = StringRef("others");
while (String1.size() < 1024U * 1024U) {
@@ -157,7 +158,7 @@ TEST_P(CASTest, BlobsBig) {
TEST_P(CASTest, LeafNodes) {
- std::unique_ptr<ObjectStore> CAS1 = createObjectStore();
+ std::shared_ptr<ObjectStore> CAS1 = createObjectStore();
StringRef ContentStrings[] = {
"some longer text std::string's local memory",
@@ -216,7 +217,7 @@ multiline text multiline text multiline text multiline text multiline text)",
// Confirm these blobs don't exist in a fresh CAS instance.
- std::unique_ptr<ObjectStore> CAS2 = createObjectStore();
+ std::shared_ptr<ObjectStore> CAS2 = createObjectStore();
for (int I = 0, E = IDs.size(); I != E; ++I) {
std::optional<ObjectProxy> Object;
EXPECT_THAT_ERROR(CAS2->getProxy(IDs[I]).moveInto(Object), Failed());
@@ -242,7 +243,7 @@ multiline text multiline text multiline text multiline text multiline text)",
TEST_P(CASTest, NodesBig) {
- std::unique_ptr<ObjectStore> CAS = createObjectStore();
+ std::shared_ptr<ObjectStore> CAS = createObjectStore();
// Specifically check near 1MB for objects large enough they're likely to be
// stored externally in an on-disk CAS, and such that one of them will be
@@ -278,3 +279,189 @@ TEST_P(CASTest, NodesBig) {
for (auto ID : CreatedNodes)
ASSERT_THAT_ERROR(CAS->validate(CAS->getID(ID)), Succeeded());
+/// Common test functionality for creating blobs in parallel. You can vary which
+/// cas instances are the same or different, and the size of the created blobs.
+///
+/// For each of 100 random blobs, two producer tasks store the blob (through
+/// \p Write1 and \p Write2) and two consumer tasks wait for its ID and read it
+/// back (through \p Read1 and \p Read2).
+static void testBlobsParallel(ObjectStore &Read1, ObjectStore &Read2,
+                              ObjectStore &Write1, ObjectStore &Write2,
+                              uint64_t BlobSize) {
+  // Pass the name as a string so the failure trace is readable.
+  SCOPED_TRACE("testBlobsParallel");
+  unsigned BlobCount = 100;
+  std::vector<std::string> Blobs;
+  Blobs.reserve(BlobCount);
+  for (unsigned I = 0; I < BlobCount; ++I) {
+    std::string Blob;
+    Blob.reserve(BlobSize);
+    // Fill with random bytes, then trim the overshoot to the exact size.
+    while (Blob.size() < BlobSize) {
+      auto R = sys::Process::GetRandomNumber();
+      Blob.append(reinterpret_cast<const char *>(&R), sizeof(R));
+    }
+    assert(Blob.size() >= BlobSize);
+    Blob.resize(BlobSize);
+    Blobs.push_back(std::move(Blob));
+  }
+  // CreatedNodes[I] stays empty until a producer has stored blob I; the
+  // tombstone key marks a producer failure.
+  std::mutex NodesMtx;
+  std::vector<std::optional<CASID>> CreatedNodes(BlobCount);
+  auto Producer = [&](unsigned I, ObjectStore *CAS) {
+    std::optional<ObjectProxy> Node;
+    EXPECT_THAT_ERROR(CAS->createProxy({}, Blobs[I]).moveInto(Node),
+                      Succeeded());
+    {
+      std::lock_guard<std::mutex> L(NodesMtx);
+      CreatedNodes[I] = Node ? Node->getID() : CASID::getDenseMapTombstoneKey();
+    }
+  };
+  auto Consumer = [&](unsigned I, ObjectStore *CAS) {
+    std::optional<CASID> ID;
+    while (!ID) {
+      // Busy wait.
+      std::lock_guard<std::mutex> L(NodesMtx);
+      ID = CreatedNodes[I];
+    }
+    if (ID == CASID::getDenseMapTombstoneKey())
+      // Producer failed; already reported.
+      return;
+    std::optional<ObjectProxy> Node;
+    ASSERT_THAT_ERROR(CAS->getProxy(*ID).moveInto(Node), Succeeded());
+    EXPECT_EQ(Node->getData(), Blobs[I]);
+  };
+  // NOTE(review): consumers are queued ahead of their producers and spin until
+  // one has run; presumably the pool always has enough threads for a producer
+  // to make progress - confirm for very small thread counts.
+  ThreadPool Threads;
+  for (unsigned I = 0; I < BlobCount; ++I) {
+    Threads.async(Consumer, I, &Read1);
+    Threads.async(Consumer, I, &Read2);
+    Threads.async(Producer, I, &Write1);
+    Threads.async(Producer, I, &Write2);
+  }
+  Threads.wait();
+}
+/// Runs testBlobsParallel() with a single store used for every reader and
+/// writer role.
+static void testBlobsParallel1(ObjectStore &CAS, uint64_t BlobSize) {
+  // Pass the name as a string so the failure trace is readable.
+  SCOPED_TRACE("testBlobsParallel1");
+  testBlobsParallel(CAS, CAS, CAS, CAS, BlobSize);
+}
+// Parallel create/read of 1 KiB blobs against a single store.
+TEST_P(CASTest, BlobsParallel) {
+ std::shared_ptr<ObjectStore> CAS = createObjectStore();
+ uint64_t Size = 1ULL * 1024;
+ ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size));
+// Parallel create/read of 100 KiB blobs against a single store.
+TEST_P(CASTest, BlobsBigParallel) {
+ std::shared_ptr<ObjectStore> CAS = createObjectStore();
+ // 100k is large enough to be standalone files in our on-disk cas.
+ uint64_t Size = 100ULL * 1024;
+ ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size));
+// Parallel create/read of 1 KiB blobs through four store instances that are
+// opened on the same directory via one real path and three links to it.
+TEST(OnDiskCASTest, BlobsParallelMultiCAS) {
+ // This test intentionally uses symlinked paths to the same CAS to subvert the
+ // shared memory mappings that would normally be created within a single
+ // process. This breaks the lock file guarantees, so we must be careful not
+ // to create or destroy the CAS objects concurrently, which is when the locks
+ // are normally important.
+ unittest::TempDir Temp("on-disk-cas", /*Unique=*/true);
+ ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")),
+ std::error_code());
+ // The link target is given relative to the link's own directory.
+ ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")),
+ std::error_code());
+ ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")),
+ std::error_code());
+ ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")),
+ std::error_code());
+ // Stores are opened one after another, never concurrently.
+ std::unique_ptr<ObjectStore> CAS1, CAS2, CAS3, CAS4;
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1),
+ Succeeded());
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2),
+ Succeeded());
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3),
+ Succeeded());
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4),
+ Succeeded());
+ uint64_t Size = 1ULL * 1024;
+ ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size));
+// Same setup as BlobsParallelMultiCAS, but with 100 KiB blobs.
+TEST(OnDiskCASTest, BlobsBigParallelMultiCAS) {
+ // See comment in BlobsParallelMultiCAS.
+ unittest::TempDir Temp("on-disk-cas", /*Unique=*/true);
+ ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")),
+ std::error_code());
+ ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")),
+ std::error_code());
+ ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")),
+ std::error_code());
+ ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")),
+ std::error_code());
+ // Stores are opened one after another, never concurrently.
+ std::unique_ptr<ObjectStore> CAS1, CAS2, CAS3, CAS4;
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1),
+ Succeeded());
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2),
+ Succeeded());
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3),
+ Succeeded());
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4),
+ Succeeded());
+ // 100k is large enough to be standalone files in our on-disk cas.
+ uint64_t Size = 100ULL * 1024;
+ ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size));
+#ifndef _WIN32 // FIXME: resize support on Windows.
+// Checks that the ".index" and ".data" files have the full mapping size while
+// the store is open and a smaller size after it is closed, both for a fresh
+// store and for one reopened on an existing directory.
+TEST(OnDiskCASTest, DiskSize) {
+ unittest::TempDir Temp("on-disk-cas", /*Unique=*/true);
+ std::unique_ptr<ObjectStore> CAS;
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded());
+ // NOTE(review): presumably matches the LLVM_CAS_MAX_MAPPING_SIZE limit the
+ // test configuration sets - confirm.
+ uint64_t MaxSize = 100 * 1024 * 1024;
+ // Check that we map the files to the correct size.
+ auto CheckFileSizes = [&](bool Mapped) {
+ bool FoundIndex = false, FoundData = false;
+ std::error_code EC;
+ // Scan the top level of the directory; iteration stops on the first error.
+ for (sys::fs::directory_iterator I(Temp.path(), EC), E; I != E && !EC;
+ I.increment(EC)) {
+ if (StringRef(I->path()).endswith(".index")) {
+ FoundIndex = true;
+ ASSERT_TRUE(I->status());
+ if (Mapped)
+ EXPECT_EQ(I->status()->getSize(), MaxSize);
+ else
+ EXPECT_LT(I->status()->getSize(), MaxSize);
+ }
+ if (StringRef(I->path()).endswith(".data")) {
+ FoundData = true;
+ ASSERT_TRUE(I->status());
+ if (Mapped)
+ EXPECT_EQ(I->status()->getSize(), MaxSize);
+ else
+ EXPECT_LT(I->status()->getSize(), MaxSize);
+ }
+ }
+ // Both file kinds must have been seen for the size checks to mean anything.
+ ASSERT_TRUE(FoundIndex);
+ ASSERT_TRUE(FoundData);
+ };
+ // Check that we have the full mapping size when the CAS is open.
+ CheckFileSizes(/*Mapped=*/true);
+ CAS.reset();
+ // Check that the CAS is shrunk to a smaller size.
+ CheckFileSizes(/*Mapped=*/false);
+ // Repeat the checks when starting from an existing CAS.
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded());
+ CheckFileSizes(/*Mapped=*/true);
+ CAS.reset();
+ CheckFileSizes(/*Mapped=*/false);
diff --git a/llvm/unittests/CAS/OnDiskCommonUtils.h b/llvm/unittests/CAS/OnDiskCommonUtils.h
new file mode 100644
index 00000000000000..3978c6b054e8d3
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskCommonUtils.h
@@ -0,0 +1,69 @@
+//===- llvm/unittest/CAS/OnDiskCommonUtils.h --------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/BuiltinObjectHasher.h"
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "llvm/Support/BLAKE3.h"
+namespace llvm::unittest::cas {
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+using HasherT = BLAKE3;
+using HashType = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>()));
+using ValueType = std::array<char, 20>;
+/// Computes the digest of an object whose payload is \p Data and whose
+/// references have the digests \p RefHashes, via BuiltinObjectHasher<HasherT>.
+inline HashType digest(StringRef Data, ArrayRef<ArrayRef<uint8_t>> RefHashes) {
+  ArrayRef<char> Payload = arrayRefFromStringRef<char>(Data);
+  return BuiltinObjectHasher<HasherT>::hashObject(RefHashes, Payload);
+}
+/// Computes the ObjectID that \p DB assigns to an object with payload \p Data
+/// and references \p Refs: the digest of every reference is fetched from
+/// \p DB, the object digest is derived from them, and the digest is mapped
+/// back to an ID with OnDiskGraphDB::getReference().
+inline ObjectID digest(OnDiskGraphDB &DB, StringRef Data,
+                       ArrayRef<ObjectID> Refs) {
+  SmallVector<ArrayRef<uint8_t>, 8> Digests;
+  Digests.reserve(Refs.size());
+  for (ObjectID Child : Refs)
+    Digests.push_back(DB.getDigest(Child));
+  HashType ObjectDigest = digest(Data, Digests);
+  return DB.getReference(ObjectDigest);
+}
+/// Hashes the raw bytes of \p Data with HasherT (no references involved).
+inline HashType digest(StringRef Data) {
+  ArrayRef<uint8_t> Bytes = arrayRefFromStringRef(Data);
+  return HasherT::hash(Bytes);
+}
+/// Builds a fixed-size ValueType from \p S. At most sizeof(ValueType) bytes
+/// of \p S are copied to the front of the value; the remaining bytes are
+/// zero.
+inline ValueType valueFromString(StringRef S) {
+  // Value-initialize so the bytes past the copied prefix are zero rather than
+  // indeterminate; the whole array is later stored and compared.
+  ValueType Val{};
+  llvm::copy(S.substr(0, sizeof(Val)), Val.data());
+  return Val;
+}
+/// Stores an object with payload \p Data and references \p Refs into \p DB.
+/// \returns the ID computed by digest(DB, Data, Refs), or the error reported
+/// by OnDiskGraphDB::store().
+inline Expected<ObjectID> store(OnDiskGraphDB &DB, StringRef Data,
+                                ArrayRef<ObjectID> Refs) {
+  ObjectID NewID = digest(DB, Data, Refs);
+  Error StoreErr = DB.store(NewID, Refs, arrayRefFromStringRef<char>(Data));
+  if (StoreErr)
+    return std::move(StoreErr);
+  return NewID;
+}
+/// Prints the object graph rooted at \p ID to \p OS, one object per line.
+/// Each object's data is indented by \p Indent spaces and its references are
+/// printed recursively with two more spaces. An object that load() reports as
+/// absent prints nothing. \returns the first load error encountered, if any.
+inline Error printTree(OnDiskGraphDB &DB, ObjectID ID, raw_ostream &OS,
+                       unsigned Indent = 0) {
+  std::optional<ondisk::ObjectHandle> Node;
+  if (Error LoadErr = DB.load(ID).moveInto(Node))
+    return LoadErr;
+  if (Node) {
+    OS.indent(Indent) << toStringRef(DB.getObjectData(*Node)) << '\n';
+    const unsigned ChildIndent = Indent + 2;
+    for (ObjectID Child : DB.getObjectRefs(*Node)) {
+      if (Error ChildErr = printTree(DB, Child, OS, ChildIndent))
+        return ChildErr;
+    }
+  }
+  return Error::success();
+}
+} // namespace llvm::unittest::cas
diff --git a/llvm/unittests/CAS/OnDiskGraphDBTest.cpp b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp
new file mode 100644
index 00000000000000..57ea061e4f43a7
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp
@@ -0,0 +1,284 @@
+//===- llvm/unittest/CAS/OnDiskGraphDBTest.cpp ----------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "OnDiskCommonUtils.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+using namespace llvm::unittest::cas;
+// Basic store/load round trip on a single OnDiskGraphDB, followed by a check
+// that the reported storage size accounts for a large object and is the same
+// after closing and re-opening the database.
+TEST(OnDiskGraphDBTest, Basic) {
+  unittest::TempDir Temp("ondiskcas", /*Unique=*/true);
+  std::unique_ptr<OnDiskGraphDB> DB;
+  ASSERT_THAT_ERROR(
+      OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType)).moveInto(DB),
+      Succeeded());
+  // Local shorthands that bind the shared helpers to this DB.
+  auto digest = [&DB](StringRef Data, ArrayRef<ObjectID> Refs) -> ObjectID {
+    return ::digest(*DB, Data, Refs);
+  };
+  auto store = [&](StringRef Data,
+                   ArrayRef<ObjectID> Refs) -> Expected<ObjectID> {
+    return ::store(*DB, Data, Refs);
+  };
+  std::optional<ObjectID> ID1;
+  ASSERT_THAT_ERROR(store("hello", {}).moveInto(ID1), Succeeded());
+  std::optional<ondisk::ObjectHandle> Obj1;
+  ASSERT_THAT_ERROR(DB->load(*ID1).moveInto(Obj1), Succeeded());
+  ASSERT_TRUE(Obj1.has_value());
+  EXPECT_EQ(toStringRef(DB->getObjectData(*Obj1)), "hello");
+  // Mapping an ID to its digest and back must give the same ID.
+  ArrayRef<uint8_t> Digest1 = DB->getDigest(*ID1);
+  ObjectID ID2 = DB->getReference(Digest1);
+  EXPECT_EQ(*ID1, ID2);
+  // An ID can be obtained for data that has not been stored yet.
+  ObjectID ID3 = digest("world", {});
+  EXPECT_FALSE(DB->containsObject(ID3));
+  std::optional<ondisk::ObjectHandle> Obj2;
+  ASSERT_THAT_ERROR(DB->load(ID3).moveInto(Obj2), Succeeded());
+  EXPECT_FALSE(Obj2.has_value());
+  ASSERT_THAT_ERROR(DB->store(ID3, {}, arrayRefFromStringRef<char>("world")),
+                    Succeeded());
+  EXPECT_TRUE(DB->containsObject(ID3));
+  ASSERT_THAT_ERROR(DB->load(ID3).moveInto(Obj2), Succeeded());
+  ASSERT_TRUE(Obj2.has_value());
+  EXPECT_EQ(toStringRef(DB->getObjectData(*Obj2)), "world");
+  size_t LargeDataSize = 256LL * 1024LL; // 256K.
+  // The precise size number is not important, we mainly check that the large
+  // object will be properly accounted for.
+  EXPECT_TRUE(DB->getStorageSize() > 10 &&
+              DB->getStorageSize() < LargeDataSize);
+  SmallString<16> Buffer;
+  Buffer.resize(LargeDataSize);
+  ASSERT_THAT_ERROR(store(Buffer, {}).moveInto(ID1), Succeeded());
+  size_t StorageSize = DB->getStorageSize();
+  EXPECT_TRUE(StorageSize > LargeDataSize);
+  // Close & re-open the DB and check that it reports the same storage size.
+  DB.reset();
+  ASSERT_THAT_ERROR(
+      OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType)).moveInto(DB),
+      Succeeded());
+  EXPECT_EQ(DB->getStorageSize(), StorageSize);
+}
+// Chains a primary DB to an upstream DB with FaultInPolicy::SingleNode. The
+// test expects objects stored upstream to be visible through the primary,
+// and expects that after re-opening the primary without the upstream only
+// the objects that were actually loaded ("hello" and "world") remain.
+TEST(OnDiskGraphDBTest, FaultInSingleNode) {
+  unittest::TempDir TempUpstream("ondiskcas-upstream", /*Unique=*/true);
+  std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+  ASSERT_THAT_ERROR(
+      OnDiskGraphDB::open(TempUpstream.path(), "blake3", sizeof(HashType))
+          .moveInto(UpstreamDB),
+      Succeeded());
+  // Populate the upstream: "world" references "hello" and "another".
+  {
+    std::optional<ObjectID> ID1;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "hello", {}).moveInto(ID1),
+                      Succeeded());
+    std::optional<ObjectID> ID2;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "another", {}).moveInto(ID2),
+                      Succeeded());
+    std::optional<ObjectID> ID3;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "world", {*ID1, *ID2}).moveInto(ID3),
+                      Succeeded());
+  }
+  unittest::TempDir Temp("ondiskcas", /*Unique=*/true);
+  std::unique_ptr<OnDiskGraphDB> DB;
+  ASSERT_THAT_ERROR(
+      OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType),
+                          std::move(UpstreamDB),
+                          OnDiskGraphDB::FaultInPolicy::SingleNode)
+          .moveInto(DB),
+      Succeeded());
+  ObjectID ID1 = digest(*DB, "hello", {});
+  ObjectID ID2 = digest(*DB, "another", {});
+  ObjectID ID3 = digest(*DB, "world", {ID1, ID2});
+  // Same data as ID3 but without references: never stored anywhere.
+  ObjectID ID4 = digest(*DB, "world", {});
+  EXPECT_TRUE(DB->containsObject(ID1));
+  EXPECT_TRUE(DB->containsObject(ID2));
+  EXPECT_TRUE(DB->containsObject(ID3));
+  EXPECT_FALSE(DB->containsObject(ID4));
+  EXPECT_TRUE(DB->getExistingReference(digest("hello", {})).has_value());
+  EXPECT_TRUE(DB->getExistingReference(DB->getDigest(ID3)).has_value());
+  EXPECT_FALSE(DB->getExistingReference(digest("world", {})).has_value());
+  {
+    std::optional<ondisk::ObjectHandle> Obj;
+    ASSERT_THAT_ERROR(DB->load(ID1).moveInto(Obj), Succeeded());
+    ASSERT_TRUE(Obj.has_value());
+    EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "hello");
+    auto Refs = DB->getObjectRefs(*Obj);
+    EXPECT_TRUE(Refs.empty());
+  }
+  {
+    std::optional<ondisk::ObjectHandle> Obj;
+    ASSERT_THAT_ERROR(DB->load(ID3).moveInto(Obj), Succeeded());
+    ASSERT_TRUE(Obj.has_value());
+    EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "world");
+    auto Refs = DB->getObjectRefs(*Obj);
+    ASSERT_EQ(std::distance(Refs.begin(), Refs.end()), 2);
+    EXPECT_EQ(Refs.begin()[0], ID1);
+    EXPECT_EQ(Refs.begin()[1], ID2);
+  }
+  {
+    std::optional<ondisk::ObjectHandle> Obj;
+    ASSERT_THAT_ERROR(DB->load(ID4).moveInto(Obj), Succeeded());
+    EXPECT_FALSE(Obj.has_value());
+  }
+  // Re-open the primary without chaining, to verify the data were copied from
+  // the upstream.
+  ASSERT_THAT_ERROR(
+      OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType),
+                          /*UpstreamDB=*/nullptr,
+                          OnDiskGraphDB::FaultInPolicy::SingleNode)
+          .moveInto(DB),
+      Succeeded());
+  ID1 = digest(*DB, "hello", {});
+  ID2 = digest(*DB, "another", {});
+  ID3 = digest(*DB, "world", {ID1, ID2});
+  EXPECT_TRUE(DB->containsObject(ID1));
+  // "another" was only ever referenced, never loaded, so it is expected to be
+  // missing from the primary.
+  EXPECT_FALSE(DB->containsObject(ID2));
+  EXPECT_TRUE(DB->containsObject(ID3));
+  {
+    std::optional<ondisk::ObjectHandle> Obj;
+    ASSERT_THAT_ERROR(DB->load(ID1).moveInto(Obj), Succeeded());
+    ASSERT_TRUE(Obj.has_value());
+    EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "hello");
+    auto Refs = DB->getObjectRefs(*Obj);
+    EXPECT_TRUE(Refs.empty());
+  }
+}
+// Chains a primary DB to an upstream DB with FaultInPolicy::FullTree, loads
+// only the root through the primary, and then expects the whole tree to be
+// printable from the primary after re-opening it without the upstream.
+TEST(OnDiskGraphDBTest, FaultInFullTree) {
+  unittest::TempDir TempUpstream("ondiskcas-upstream", /*Unique=*/true);
+  std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+  ASSERT_THAT_ERROR(
+      OnDiskGraphDB::open(TempUpstream.path(), "blake3", sizeof(HashType))
+          .moveInto(UpstreamDB),
+      Succeeded());
+  HashType RootHash;
+  // Build root -> {1 -> {11, 12 -> {121}}, 2 -> {12, 21, 22}} upstream and
+  // remember the digest of the root.
+  {
+    std::optional<ObjectID> ID11;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "11", {}).moveInto(ID11), Succeeded());
+    std::optional<ObjectID> ID121;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "121", {}).moveInto(ID121),
+                      Succeeded());
+    std::optional<ObjectID> ID12;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "12", {*ID121}).moveInto(ID12),
+                      Succeeded());
+    std::optional<ObjectID> ID1;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "1", {*ID11, *ID12}).moveInto(ID1),
+                      Succeeded());
+    std::optional<ObjectID> ID21;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "21", {}).moveInto(ID21), Succeeded());
+    std::optional<ObjectID> ID22;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "22", {}).moveInto(ID22), Succeeded());
+    std::optional<ObjectID> ID2;
+    ASSERT_THAT_ERROR(
+        store(*UpstreamDB, "2", {*ID12, *ID21, *ID22}).moveInto(ID2),
+        Succeeded());
+    std::optional<ObjectID> IDRoot;
+    ASSERT_THAT_ERROR(store(*UpstreamDB, "root", {*ID1, *ID2}).moveInto(IDRoot),
+                      Succeeded());
+    ArrayRef<uint8_t> Digest = UpstreamDB->getDigest(*IDRoot);
+    ASSERT_EQ(Digest.size(), RootHash.size());
+    llvm::copy(Digest, RootHash.data());
+  }
+  unittest::TempDir Temp("ondiskcas", /*Unique=*/true);
+  std::unique_ptr<OnDiskGraphDB> DB;
+  ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType),
+                                        std::move(UpstreamDB),
+                                        OnDiskGraphDB::FaultInPolicy::FullTree)
+                        .moveInto(DB),
+                    Succeeded());
+  // Load only the root through the primary.
+  {
+    ObjectID IDRoot = DB->getReference(RootHash);
+    std::optional<ondisk::ObjectHandle> Obj;
+    ASSERT_THAT_ERROR(DB->load(IDRoot).moveInto(Obj), Succeeded());
+    ASSERT_TRUE(Obj.has_value());
+    EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "root");
+    auto Refs = DB->getObjectRefs(*Obj);
+    ASSERT_EQ(std::distance(Refs.begin(), Refs.end()), 2);
+  }
+  // Re-open the primary without chaining, to verify the data were copied from
+  // the upstream.
+  ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType),
+                                        /*UpstreamDB=*/nullptr,
+                                        OnDiskGraphDB::FaultInPolicy::FullTree)
+                        .moveInto(DB),
+                    Succeeded());
+  ObjectID IDRoot = DB->getReference(RootHash);
+  std::string PrintedTree;
+  raw_string_ostream OS(PrintedTree);
+  ASSERT_THAT_ERROR(printTree(*DB, IDRoot, OS), Succeeded());
+  // printTree() indents every level by two more spaces and ends every line,
+  // including the last, with a newline.
+  StringRef Expected = R"(root
+  1
+    11
+    12
+      121
+  2
+    12
+      121
+    21
+    22
+)";
+  EXPECT_EQ(PrintedTree, Expected);
+}
+// Opening an existing primary DB with a fault-in policy different from the
+// one it was first opened with is expected to fail, in either direction.
+TEST(OnDiskGraphDBTest, FaultInPolicyConflict) {
+  auto tryFaultInPolicyConflict = [](OnDiskGraphDB::FaultInPolicy Policy1,
+                                     OnDiskGraphDB::FaultInPolicy Policy2) {
+    unittest::TempDir TempUpstream("ondiskcas-upstream", /*Unique=*/true);
+    std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+    ASSERT_THAT_ERROR(
+        OnDiskGraphDB::open(TempUpstream.path(), "blake3", sizeof(HashType))
+            .moveInto(UpstreamDB),
+        Succeeded());
+    unittest::TempDir Temp("ondiskcas", /*Unique=*/true);
+    std::unique_ptr<OnDiskGraphDB> DB;
+    ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3",
+                                          sizeof(HashType),
+                                          std::move(UpstreamDB), Policy1)
+                          .moveInto(DB),
+                      Succeeded());
+    DB.reset();
+    // NOTE(review): UpstreamDB was already moved from by the first open()
+    // above, so this second call presumably receives a null upstream --
+    // confirm that the policy conflict is still what makes it fail.
+    ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3",
+                                          sizeof(HashType),
+                                          std::move(UpstreamDB), Policy2)
+                          .moveInto(DB),
+                      Failed());
+  };
+  // Open as 'single', then as 'full'.
+  tryFaultInPolicyConflict(OnDiskGraphDB::FaultInPolicy::SingleNode,
+                           OnDiskGraphDB::FaultInPolicy::FullTree);
+  // Open as 'full', then as 'single'.
+  tryFaultInPolicyConflict(OnDiskGraphDB::FaultInPolicy::FullTree,
+                           OnDiskGraphDB::FaultInPolicy::SingleNode);
+}
diff --git a/llvm/unittests/CAS/OnDiskHashMappedTrieTest.cpp b/llvm/unittests/CAS/OnDiskHashMappedTrieTest.cpp
new file mode 100644
index 00000000000000..b8e5bf632dde25
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskHashMappedTrieTest.cpp
@@ -0,0 +1,146 @@
+//===- OnDiskHashMappedTrieTest.cpp ---------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/OnDiskHashMappedTrie.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+using namespace llvm::cas;
+namespace {
+// Inserts records into an on-disk trie and checks that they can be found
+// again through the inserting instance, through a second instance created
+// from the same path, and through a file offset, for several hash sizes.
+TEST(OnDiskHashMappedTrieTest, Insertion) {
+ unittest::TempDir Temp("on-disk-hash-mapped-trie", /*Unique=*/true);
+ // Create tries with various sizes of hash and with data.
+ //
+ // NOTE: The check related to \a recoverFromFileOffset() catches a potential
+ // off-by-one bounds-checking bug when the trie record size (data + hash) add
+ // up to a multiple of 8B. Iterate through a few different hash sizes to
+ // check it both ways.
+ constexpr size_t MB = 1024u * 1024u;
+ constexpr size_t DataSize = 8; // Multiple of 8B.
+ for (size_t NumHashBytes : {1, 2, 4, 8}) {
+ size_t NumHashBits = NumHashBytes * 8;
+ // Every call uses the same path (one per hash size), so all instances
+ // created within one loop iteration refer to the same on-disk trie.
+ auto createTrie = [&]() {
+ return OnDiskHashMappedTrie::create(
+ Temp.path((Twine(NumHashBytes) + "B").str()), "index",
+ /*NumHashBits=*/NumHashBits, DataSize, /*MaxFileSize=*/MB,
+ /*NewInitialFileSize=*/std::nullopt);
+ };
+ std::optional<OnDiskHashMappedTrie> Trie1;
+ ASSERT_THAT_ERROR(createTrie().moveInto(Trie1), Succeeded());
+ std::optional<OnDiskHashMappedTrie> Trie2;
+ ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded());
+ // Two hashes that differ in their first byte, truncated to the hash size
+ // under test.
+ uint8_t Hash0Bytes[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+ uint8_t Hash1Bytes[8] = {1, 0, 0, 0, 0, 0, 0, 0};
+ auto Hash0 = ArrayRef(Hash0Bytes).take_front(NumHashBytes);
+ auto Hash1 = ArrayRef(Hash1Bytes).take_front(NumHashBytes);
+ // Payloads; each must be exactly DataSize bytes long.
+ constexpr StringLiteral Data0v1Bytes = "data0.v1";
+ constexpr StringLiteral Data0v2Bytes = "data0.v2";
+ constexpr StringLiteral Data1Bytes = "data1...";
+ static_assert(Data0v1Bytes.size() == DataSize, "math error");
+ static_assert(Data0v2Bytes.size() == DataSize, "math error");
+ static_assert(Data1Bytes.size() == DataSize, "math error");
+ ArrayRef<char> Data0v1 = ArrayRef(Data0v1Bytes.data(), Data0v1Bytes.size());
+ ArrayRef<char> Data0v2 = ArrayRef(Data0v2Bytes.data(), Data0v2Bytes.size());
+ ArrayRef<char> Data1 = ArrayRef(Data1Bytes.data(), Data1Bytes.size());
+ // Lookup when trie is empty.
+ EXPECT_FALSE(Trie1->find(Hash0));
+ // Insert.
+ // Offset and Data remember where the first record lives so later steps can
+ // compare offsets and overwrite the stored bytes.
+ std::optional<FileOffset> Offset;
+ std::optional<MutableArrayRef<char>> Data;
+ {
+ auto Insertion = Trie1->insert({Hash0, Data0v1});
+ ASSERT_TRUE(Insertion);
+ EXPECT_EQ(Hash0, Insertion->Hash);
+ EXPECT_EQ(Data0v1, Insertion->Data);
+ EXPECT_TRUE(isAddrAligned(Align(8), Insertion->Data.data()));
+ Offset = Insertion.getOffset();
+ Data = Insertion->Data;
+ }
+ // Find.
+ {
+ auto Lookup = Trie1->find(Hash0);
+ ASSERT_TRUE(Lookup);
+ EXPECT_EQ(Hash0, Lookup->Hash);
+ EXPECT_EQ(Data0v1, Lookup->Data);
+ EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+ }
+ // Find in a different instance of the same on-disk trie that existed
+ // before the insertion.
+ {
+ auto Lookup = Trie2->find(Hash0);
+ ASSERT_TRUE(Lookup);
+ EXPECT_EQ(Hash0, Lookup->Hash);
+ EXPECT_EQ(Data0v1, Lookup->Data);
+ EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+ }
+ // Create a new instance and check that too.
+ Trie2.reset();
+ ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded());
+ {
+ auto Lookup = Trie2->find(Hash0);
+ ASSERT_TRUE(Lookup);
+ EXPECT_EQ(Hash0, Lookup->Hash);
+ EXPECT_EQ(Data0v1, Lookup->Data);
+ EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+ }
+ // Change the data.
+ // The bytes are written through the reference obtained from Trie1 and are
+ // then expected to be visible through Trie2.
+ llvm::copy(Data0v2, Data->data());
+ {
+ auto Lookup = Trie2->find(Hash0);
+ ASSERT_TRUE(Lookup);
+ EXPECT_EQ(Hash0, Lookup->Hash);
+ EXPECT_EQ(Data0v2, Lookup->Data);
+ EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+ }
+ // Find different hash.
+ EXPECT_FALSE(Trie1->find(Hash1));
+ EXPECT_FALSE(Trie2->find(Hash1));
+ // Recover from an offset.
+ {
+ auto Recovered = Trie1->recoverFromFileOffset(*Offset);
+ ASSERT_TRUE(Recovered);
+ EXPECT_EQ(Offset->get(), Recovered.getOffset().get());
+ EXPECT_EQ(Hash0, Recovered->Hash);
+ EXPECT_EQ(Data0v2, Recovered->Data);
+ }
+ // Insert another thing.
+ {
+ auto Insertion = Trie1->insert({Hash1, Data1});
+ ASSERT_TRUE(Insertion);
+ EXPECT_EQ(Hash1, Insertion->Hash);
+ EXPECT_EQ(Data1, Insertion->Data);
+ EXPECT_TRUE(isAddrAligned(Align(8), Insertion->Data.data()));
+ EXPECT_NE(Offset->get(), Insertion.getOffset().get());
+ }
+ }
+} // namespace
diff --git a/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp b/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp
new file mode 100644
index 00000000000000..3edc5e77f64fb6
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp
@@ -0,0 +1,54 @@
+//===- llvm/unittest/CAS/OnDiskKeyValueDBTest.cpp -------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "OnDiskCommonUtils.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+using namespace llvm::unittest::cas;
+// Basic get/put on an OnDiskKeyValueDB. The test expects a lookup of a
+// missing key to yield no value, and expects a second put() for an existing
+// key to return the value stored first.
+TEST(OnDiskKeyValueDBTest, Basic) {
+  unittest::TempDir Temp("ondiskkv", /*Unique=*/true);
+  std::unique_ptr<OnDiskKeyValueDB> DB;
+  ASSERT_THAT_ERROR(OnDiskKeyValueDB::open(Temp.path(), "blake3",
+                                           sizeof(HashType), "test",
+                                           sizeof(ValueType))
+                        .moveInto(DB),
+                    Succeeded());
+  {
+    std::optional<ArrayRef<char>> Val;
+    ASSERT_THAT_ERROR(DB->get(digest("hello")).moveInto(Val), Succeeded());
+    EXPECT_FALSE(Val.has_value());
+  }
+  ValueType ValW = valueFromString("world");
+  ArrayRef<char> Val;
+  ASSERT_THAT_ERROR(DB->put(digest("hello"), ValW).moveInto(Val), Succeeded());
+  EXPECT_EQ(Val, ArrayRef(ValW));
+  // Putting a different value under the same key still yields "world".
+  ASSERT_THAT_ERROR(
+      DB->put(digest("hello"), valueFromString("other")).moveInto(Val),
+      Succeeded());
+  EXPECT_EQ(Val, ArrayRef(ValW));
+  {
+    std::optional<ArrayRef<char>> Val;
+    ASSERT_THAT_ERROR(DB->get(digest("hello")).moveInto(Val), Succeeded());
+    EXPECT_TRUE(Val.has_value());
+    EXPECT_EQ(*Val, ArrayRef(ValW));
+  }
+}
diff --git a/llvm/unittests/CAS/TreeSchemaTest.cpp b/llvm/unittests/CAS/TreeSchemaTest.cpp
new file mode 100644
index 00000000000000..1ec27526153939
--- /dev/null
+++ b/llvm/unittests/CAS/TreeSchemaTest.cpp
@@ -0,0 +1,266 @@
+//===- TreeSchemaTest.cpp -------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/TreeSchema.h"
+#include "llvm/CAS/HierarchicalTreeBuilder.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+using namespace llvm::cas;
+// Creates flat and nested trees through TreeSchema in two in-memory CAS
+// instances and checks that IDs are stable across instances, that trees
+// validate, and that the stored entries come back in sorted order.
+TEST(TreeSchemaTest, Trees) {
+  std::unique_ptr<ObjectStore> CAS1 = createInMemoryCAS();
+  std::unique_ptr<ObjectStore> CAS2 = createInMemoryCAS();
+  // Stores the same blob in both CAS instances and returns the CAS1 ref.
+  auto createBlobInBoth = [&](StringRef Content) {
+    std::optional<ObjectRef> H1, H2;
+    EXPECT_THAT_ERROR(CAS1->storeFromString(std::nullopt, Content).moveInto(H1),
+                      Succeeded());
+    EXPECT_THAT_ERROR(CAS2->storeFromString(std::nullopt, Content).moveInto(H2),
+                      Succeeded());
+    EXPECT_EQ(CAS1->getID(*H1), CAS2->getID(*H2));
+    return *H1;
+  };
+  ObjectRef Blob1 = createBlobInBoth("blob1");
+  ObjectRef Blob2 = createBlobInBoth("blob2");
+  ObjectRef Blob3 = createBlobInBoth("blob3");
+  SmallVector<SmallVector<NamedTreeEntry, 0>, 0> FlatTreeEntries = {
+      {},
+      {NamedTreeEntry(Blob1, TreeEntry::Regular, "regular")},
+      {NamedTreeEntry(Blob2, TreeEntry::Executable, "executable")},
+      {NamedTreeEntry(Blob3, TreeEntry::Symlink, "symlink")},
+      {
+          NamedTreeEntry(Blob1, TreeEntry::Regular, "various"),
+          NamedTreeEntry(Blob1, TreeEntry::Regular, "names"),
+          NamedTreeEntry(Blob1, TreeEntry::Regular, "that"),
+          NamedTreeEntry(Blob1, TreeEntry::Regular, "do"),
+          NamedTreeEntry(Blob1, TreeEntry::Regular, "not"),
+          NamedTreeEntry(Blob1, TreeEntry::Regular, "conflict"),
+          NamedTreeEntry(Blob1, TreeEntry::Regular, "but have spaces and..."),
+          NamedTreeEntry(Blob1, TreeEntry::Regular,
+                         "`~,!@#$%^&*()-+=[]{}\\<>'\""),
+      },
+  };
+  SmallVector<ObjectRef> FlatRefs;
+  SmallVector<CASID> FlatIDs;
+  TreeSchema Schema1(*CAS1);
+  for (ArrayRef<NamedTreeEntry> Entries : FlatTreeEntries) {
+    std::optional<TreeProxy> H;
+    ASSERT_THAT_ERROR(Schema1.create(Entries).moveInto(H), Succeeded());
+    FlatIDs.push_back(H->getID());
+    FlatRefs.push_back(H->getRef());
+  }
+  // Confirm we get the same IDs the second time and that the trees can be
+  // visited (the entries themselves will be checked later).
+  for (int I = 0, E = FlatIDs.size(); I != E; ++I) {
+    std::optional<TreeProxy> H;
+    ASSERT_THAT_ERROR(Schema1.create(FlatTreeEntries[I]).moveInto(H),
+                      Succeeded());
+    EXPECT_EQ(FlatRefs[I], CAS1->getReference(*H));
+    std::optional<TreeProxy> Tree;
+    ASSERT_THAT_ERROR(TreeProxy::get(Schema1, *H).moveInto(Tree), Succeeded());
+    EXPECT_EQ(FlatTreeEntries[I].size(), Tree->size());
+    size_t NumCalls = 0;
+    EXPECT_THAT_ERROR(Tree->forEachEntry([&NumCalls](const NamedTreeEntry &E) {
+      ++NumCalls;
+      return Error::success();
+    }),
+                      Succeeded());
+    EXPECT_EQ(FlatTreeEntries[I].size(), NumCalls);
+  }
+  // Run validation.
+  for (int I = 1, E = FlatIDs.size(); I != E; ++I)
+    ASSERT_THAT_ERROR(CAS1->validate(FlatIDs[I]), Succeeded());
+  // Confirm these trees don't exist in a fresh CAS instance. Skip the first
+  // tree, which is empty and could be implicitly in some CAS.
+  for (int I = 1, E = FlatIDs.size(); I != E; ++I)
+    EXPECT_FALSE(CAS2->getReference(FlatIDs[I]));
+  // Insert into the other CAS and confirm the IDs are stable.
+  for (int I = FlatIDs.size(), E = 0; I != E; --I) {
+    for (ObjectStore *CAS : {&*CAS1, &*CAS2}) {
+      TreeSchema Schema(*CAS);
+      auto &ID = FlatIDs[I - 1];
+      // Make a copy of the original entries and sort them.
+      SmallVector<NamedTreeEntry> NewEntries;
+      for (const NamedTreeEntry &Entry : FlatTreeEntries[I - 1]) {
+        std::optional<ObjectRef> NewRef =
+            CAS->getReference(CAS1->getID(Entry.getRef()));
+        NewEntries.emplace_back(*NewRef, Entry.getKind(), Entry.getName());
+      }
+      llvm::sort(NewEntries);
+      // Confirm we get the same tree out of CAS2.
+      {
+        std::optional<TreeProxy> Tree;
+        ASSERT_THAT_ERROR(Schema.create(NewEntries).moveInto(Tree),
+                          Succeeded());
+        EXPECT_EQ(ID, Tree->getID());
+      }
+      // Check that the correct entries come back.
+      std::optional<ObjectRef> Ref = CAS->getReference(ID);
+      std::optional<TreeProxy> Tree;
+      ASSERT_THAT_ERROR(Schema.load(*Ref).moveInto(Tree), Succeeded());
+      for (int I = 0, E = NewEntries.size(); I != E; ++I)
+        EXPECT_EQ(NewEntries[I], Tree->get(I));
+    }
+  }
+  // Create some nested trees.
+  SmallVector<ObjectRef> NestedTrees = FlatRefs;
+  for (int I = 0, E = FlatTreeEntries.size() * 3; I != E; ++I) {
+    // Copy one of the flat entries and add some trees.
+    auto OriginalEntries =
+        ArrayRef(FlatTreeEntries[I % FlatTreeEntries.size()]);
+    SmallVector<NamedTreeEntry> Entries(OriginalEntries.begin(),
+                                        OriginalEntries.end());
+    std::string Name = ("tree" + Twine(I)).str();
+    Entries.emplace_back(*CAS1->getReference(FlatIDs[(I + 4) % FlatIDs.size()]),
+                         TreeEntry::Tree, Name);
+    std::optional<std::string> Name1, Name2;
+    if (NestedTrees.size() >= 2) {
+      int Nested1 = I % NestedTrees.size();
+      int Nested2 = (I * 3 + 2) % NestedTrees.size();
+      if (Nested2 == Nested1)
+        Nested2 = (Nested1 + 1) % NestedTrees.size();
+      ASSERT_NE(Nested1, Nested2);
+      Name1.emplace(("tree" + Twine(I) + "-" + Twine(Nested1)).str());
+      Name2.emplace(("tree" + Twine(I) + "-" + Twine(Nested2)).str());
+      // Index with the collision-adjusted values so that the two entries
+      // really reference distinct subtrees and match the names built above.
+      Entries.emplace_back(NestedTrees[Nested1], TreeEntry::Tree, *Name1);
+      Entries.emplace_back(NestedTrees[Nested2], TreeEntry::Tree, *Name2);
+    }
+    std::optional<CASID> ID;
+    {
+      std::optional<TreeProxy> Tree;
+      ASSERT_THAT_ERROR(Schema1.create(Entries).moveInto(Tree), Succeeded());
+      ID = Tree->getID();
+    }
+    llvm::sort(Entries);
+    for (ObjectStore *CAS : {&*CAS1, &*CAS2}) {
+      // Make a copy of the original entries and sort them.
+      SmallVector<NamedTreeEntry> NewEntries;
+      for (const NamedTreeEntry &Entry : Entries) {
+        std::optional<ObjectRef> NewRef =
+            CAS->getReference(CAS1->getID(Entry.getRef()));
+        NewEntries.emplace_back(*NewRef, Entry.getKind(), Entry.getName());
+      }
+      llvm::sort(NewEntries);
+      TreeSchema Schema(*CAS);
+      std::optional<TreeProxy> Tree;
+      ASSERT_THAT_ERROR(Schema.create(NewEntries).moveInto(Tree), Succeeded());
+      ASSERT_EQ(*ID, Tree->getID());
+      ASSERT_THAT_ERROR(CAS->validate(*ID), Succeeded());
+      Tree.reset();
+      std::optional<ObjectRef> Ref = CAS->getReference(*ID);
+      ASSERT_THAT_ERROR(Schema.load(*Ref).moveInto(Tree), Succeeded());
+      for (int I = 0, E = NewEntries.size(); I != E; ++I)
+        EXPECT_EQ(NewEntries[I], Tree->get(I));
+    }
+  }
+}
+// Builds a tree from an unsorted entry list in which the name "f" occurs
+// twice, expects the tree to hold six entries, and looks each name up.
+TEST(TreeSchemaTest, Lookup) {
+  std::unique_ptr<ObjectStore> CAS = createInMemoryCAS();
+  std::optional<ObjectRef> Node;
+  EXPECT_THAT_ERROR(CAS->storeFromString(std::nullopt, "blob").moveInto(Node),
+                    Succeeded());
+  ObjectRef Blob = *Node;
+  // Same order as before: e, b, f, a, c, f, d.
+  SmallVector<NamedTreeEntry> FlatTreeEntries;
+  for (StringRef EntryName : {"e", "b", "f", "a", "c", "f", "d"})
+    FlatTreeEntries.emplace_back(Blob, TreeEntry::Regular, EntryName);
+  std::optional<TreeProxy> Tree;
+  TreeSchema Schema(*CAS);
+  ASSERT_THAT_ERROR(Schema.create(FlatTreeEntries).moveInto(Tree), Succeeded());
+  ASSERT_EQ(Tree->size(), (size_t)6);
+  // Asserts that a lookup by name succeeds and returns the matching entry.
+  auto CheckEntry = [&](StringRef Name) {
+    auto Found = Tree->lookup(Name);
+    ASSERT_TRUE(Found);
+    ASSERT_EQ(Found->getName(), Name);
+  };
+  for (StringRef Present : {"a", "b", "c", "d", "e", "f"})
+    CheckEntry(Present);
+  ASSERT_FALSE(Tree->lookup("h"));
+}
+// Builds a small hierarchy with HierarchicalTreeBuilder and checks that
+// TreeSchema::walkFileTreeRecursively() visits exactly the expected entries,
+// in the expected order, flagging directories with a TreeProxy.
+TEST(TreeSchemaTest, walkFileTreeRecursively) {
+  std::unique_ptr<ObjectStore> CAS = createInMemoryCAS();
+  auto make = [&](StringRef Content) {
+    return cantFail(CAS->storeFromString(std::nullopt, Content));
+  };
+  HierarchicalTreeBuilder Builder;
+  Builder.push(make("blob2"), TreeEntry::Regular, "/d2");
+  Builder.push(make("blob1"), TreeEntry::Regular, "/t1/d1");
+  Builder.push(make("blob3"), TreeEntry::Regular, "/t3/d3");
+  Builder.push(make("blob1"), TreeEntry::Regular, "/t3/t1nested/d1");
+  std::optional<ObjectProxy> Root;
+  ASSERT_THAT_ERROR(Builder.create(*CAS).moveInto(Root), Succeeded());
+  // Expected visit order; the flag says whether the entry is a tree.
+  std::pair<std::string, bool> ExpectedEntries[] = {
+      {"/", true},
+      {"/d2", false},
+      {"/t1", true},
+      {"/t1/d1", false},
+      {"/t3", true},
+      {"/t3/d3", false},
+      {"/t3/t1nested", true},
+      {"/t3/t1nested/d1", false},
+  };
+  auto RemainingEntries = ArrayRef(ExpectedEntries);
+  TreeSchema Schema(*CAS);
+  Error E = Schema.walkFileTreeRecursively(
+      *CAS, Root->getRef(),
+      [&](const NamedTreeEntry &Entry, std::optional<TreeProxy> Tree) -> Error {
+        if (RemainingEntries.empty())
+          return createStringError(inconvertibleErrorCode(),
+                                   "unexpected entry: '" + Entry.getName() +
+                                       "'");
+        auto ExpectedEntry = RemainingEntries.front();
+        RemainingEntries = RemainingEntries.drop_front();
+        EXPECT_EQ(ExpectedEntry.first, Entry.getName());
+        EXPECT_EQ(ExpectedEntry.second, Tree.has_value());
+        return Error::success();
+      });
+  EXPECT_THAT_ERROR(std::move(E), Succeeded());
+  // Every expected entry must have been visited; without this check a walk
+  // that stops early would go unnoticed.
+  EXPECT_TRUE(RemainingEntries.empty());
+}
diff --git a/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp
new file mode 100644
index 00000000000000..eafa07ff0f4d27
--- /dev/null
+++ b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp
@@ -0,0 +1,182 @@
+//===- llvm/unittest/CAS/UnifiedOnDiskCacheTest.cpp -----------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "OnDiskCommonUtils.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+using namespace llvm::unittest::cas;
+/// Visits all the files of a directory recursively and returns the sum of their
+/// sizes.
+static Expected<size_t> countFileSizes(StringRef Path) {
+  size_t Total = 0;
+  std::error_code EC;
+  sys::fs::directory_iterator It(Path, EC), End;
+  for (; !EC && It != End; It.increment(EC)) {
+    if (It->type() == sys::fs::file_type::directory_file) {
+      // Descend into sub-directories and add up their contents.
+      Expected<size_t> Nested = countFileSizes(It->path());
+      if (!Nested)
+        return Nested.takeError();
+      Total += *Nested;
+    } else {
+      // Anything that is not a directory contributes its own size.
+      ErrorOr<sys::fs::basic_file_status> Status = It->status();
+      if (!Status)
+        return createFileError(It->path(), Status.getError());
+      Total += Status->getSize();
+    }
+  }
+  // Report a failure to open or advance the directory iterator.
+  if (EC)
+    return createFileError(Path, EC);
+  return Total;
+}
+// End-to-end test of UnifiedOnDiskCache: stores a small tree plus two
+// key/value associations, fills the cache past its size limit twice, and
+// checks what is still reachable after collectGarbage().
+TEST(UnifiedOnDiskCacheTest, Basic) {
+  unittest::TempDir Temp("ondisk-unified", /*Unique=*/true);
+  std::unique_ptr<UnifiedOnDiskCache> UniDB;
+  // Drops the current instance and opens the cache directory again.
+  auto reopenDB = [&]() {
+    UniDB.reset();
+    const uint64_t SizeLimit = 1024ull * 64;
+    ASSERT_THAT_ERROR(UnifiedOnDiskCache::open(Temp.path(), SizeLimit, "blake3",
+                                               sizeof(HashType))
+                          .moveInto(UniDB),
+                      Succeeded());
+  };
+  // A fatal failure inside the lambda only leaves the lambda; stop the test
+  // here as well so that a null UniDB is never dereferenced below.
+  ASSERT_NO_FATAL_FAILURE(reopenDB());
+  HashType RootHash;
+  HashType OtherHash;
+  HashType Key1Hash;
+  HashType Key2Hash;
+  // Store root -> {1, 2} and a separate "other" object, then associate key1
+  // with the root and key2 with "1".
+  {
+    OnDiskGraphDB &DB = UniDB->getGraphDB();
+    std::optional<ObjectID> ID1;
+    ASSERT_THAT_ERROR(store(DB, "1", {}).moveInto(ID1), Succeeded());
+    std::optional<ObjectID> ID2;
+    ASSERT_THAT_ERROR(store(DB, "2", {}).moveInto(ID2), Succeeded());
+    std::optional<ObjectID> IDRoot;
+    ASSERT_THAT_ERROR(store(DB, "root", {*ID1, *ID2}).moveInto(IDRoot),
+                      Succeeded());
+    ArrayRef<uint8_t> Digest = DB.getDigest(*IDRoot);
+    ASSERT_EQ(Digest.size(), RootHash.size());
+    llvm::copy(Digest, RootHash.data());
+    std::optional<ObjectID> IDOther;
+    ASSERT_THAT_ERROR(store(DB, "other", {}).moveInto(IDOther), Succeeded());
+    Digest = DB.getDigest(*IDOther);
+    ASSERT_EQ(Digest.size(), OtherHash.size());
+    llvm::copy(Digest, OtherHash.data());
+    Key1Hash = digest("key1");
+    std::optional<ObjectID> Val;
+    ASSERT_THAT_ERROR(UniDB->KVPut(Key1Hash, *IDRoot).moveInto(Val),
+                      Succeeded());
+    EXPECT_EQ(IDRoot, Val);
+    Key2Hash = digest("key2");
+    ASSERT_THAT_ERROR(
+        UniDB->KVPut(DB.getReference(Key2Hash), *ID1).moveInto(Val),
+        Succeeded());
+  }
+  // Prints the tree rooted at Digest and compares it with ExpectedTree.
+  auto checkTree = [&](const HashType &Digest, StringRef ExpectedTree) {
+    OnDiskGraphDB &DB = UniDB->getGraphDB();
+    ObjectID ID = DB.getReference(Digest);
+    std::string PrintedTree;
+    raw_string_ostream OS(PrintedTree);
+    ASSERT_THAT_ERROR(printTree(DB, ID, OS), Succeeded());
+    EXPECT_EQ(PrintedTree, ExpectedTree);
+  };
+  auto checkRootTree = [&]() {
+    return checkTree(RootHash, "root\n  1\n  2\n");
+  };
+  // Looks Key up in the key/value store and compares the data of the object
+  // it maps to with ExpectedData.
+  auto checkKey = [&](const HashType &Key, StringRef ExpectedData) {
+    OnDiskGraphDB &DB = UniDB->getGraphDB();
+    std::optional<ObjectID> Val;
+    ASSERT_THAT_ERROR(UniDB->KVGet(Key).moveInto(Val), Succeeded());
+    ASSERT_TRUE(Val.has_value());
+    std::optional<ondisk::ObjectHandle> Obj;
+    ASSERT_THAT_ERROR(DB.load(*Val).moveInto(Obj), Succeeded());
+    EXPECT_EQ(toStringRef(DB.getObjectData(*Obj)), ExpectedData);
+  };
+  checkRootTree();
+  checkTree(OtherHash, "other\n");
+  checkKey(Key1Hash, "root");
+  checkKey(Key2Hash, "1");
+  // Stores a unique object of roughly 1KB; Index makes the content distinct.
+  auto storeBigObject = [&](unsigned Index) {
+    SmallString<1000> Buf;
+    Buf.append(970, 'a');
+    raw_svector_ostream(Buf) << Index;
+    std::optional<ObjectID> ID;
+    ASSERT_THAT_ERROR(store(UniDB->getGraphDB(), Buf, {}).moveInto(ID),
+                      Succeeded());
+  };
+  unsigned Index = 0;
+  // Abort on a failed store instead of looping forever on a cache that can
+  // no longer grow.
+  while (!UniDB->hasExceededSizeLimit()) {
+    ASSERT_NO_FATAL_FAILURE(storeBigObject(Index++));
+  }
+  ASSERT_NO_FATAL_FAILURE(reopenDB());
+  EXPECT_FALSE(UniDB->hasExceededSizeLimit());
+  EXPECT_FALSE(UniDB->needsGarbaseCollection());
+  // Only the root tree and key1 are touched from here on.
+  checkRootTree();
+  checkKey(Key1Hash, "root");
+  while (!UniDB->hasExceededSizeLimit()) {
+    ASSERT_NO_FATAL_FAILURE(storeBigObject(Index++));
+  }
+  ASSERT_THAT_ERROR(UniDB->close(), Succeeded());
+  EXPECT_TRUE(UniDB->needsGarbaseCollection());
+  ASSERT_NO_FATAL_FAILURE(reopenDB());
+  EXPECT_TRUE(UniDB->needsGarbaseCollection());
+  std::optional<size_t> DirSizeBefore;
+  ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeBefore),
+                    Succeeded());
+  ASSERT_THAT_ERROR(UnifiedOnDiskCache::collectGarbage(Temp.path()),
+                    Succeeded());
+  std::optional<size_t> DirSizeAfter;
+  ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeAfter),
+                    Succeeded());
+  EXPECT_LT(*DirSizeAfter, *DirSizeBefore);
+  ASSERT_NO_FATAL_FAILURE(reopenDB());
+  EXPECT_FALSE(UniDB->needsGarbaseCollection());
+  checkRootTree();
+  checkKey(Key1Hash, "root");
+  // 'Other' tree and 'Key2' got garbage-collected.
+  {
+    OnDiskGraphDB &DB = UniDB->getGraphDB();
+    EXPECT_FALSE(DB.containsObject(DB.getReference(OtherHash)));
+    std::optional<ObjectID> Val;
+    ASSERT_THAT_ERROR(UniDB->KVGet(Key2Hash).moveInto(Val), Succeeded());
+    EXPECT_FALSE(Val.has_value());
+  }
+}
>From 453033c60d61e4ec544b12f791f63c41870285cc Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu at apple.com>
Date: Mon, 9 Oct 2023 11:14:04 -0700
Subject: [PATCH 06/11] [CAS] Add llvm-cas test
llvm/test/CMakeLists.txt | 2 +
llvm/test/lit.cfg.py | 8 ++++
llvm/test/lit.site.cfg.py.in | 1 +
llvm/test/tools/llvm-cas/Inputs/oneline | 1 +
.../tools/llvm-cas/Inputs/oneline-nonewline | 1 +
llvm/test/tools/llvm-cas/cache.test | 14 +++++++
llvm/test/tools/llvm-cas/lit.local.cfg | 2 +
llvm/test/tools/llvm-cas/make-blob.test | 42 +++++++++++++++++++
llvm/test/tools/llvm-cas/make-node.test | 40 ++++++++++++++++++
llvm/test/tools/llvm-cas/print-id.test | 13 ++++++
10 files changed, 124 insertions(+)
create mode 100644 llvm/test/tools/llvm-cas/Inputs/oneline
create mode 100644 llvm/test/tools/llvm-cas/Inputs/oneline-nonewline
create mode 100644 llvm/test/tools/llvm-cas/cache.test
create mode 100644 llvm/test/tools/llvm-cas/lit.local.cfg
create mode 100644 llvm/test/tools/llvm-cas/make-blob.test
create mode 100644 llvm/test/tools/llvm-cas/make-node.test
create mode 100644 llvm/test/tools/llvm-cas/print-id.test
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index c66075434f1583..fdf059cbd1f645 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -23,6 +23,7 @@ llvm_canonicalize_cmake_booleans(
@@ -73,6 +74,7 @@ set(LLVM_TEST_DEPENDS
+ llvm-cas
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 5a03a85386e0aa..4ca73a3c02dc73 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -615,9 +615,17 @@ def have_ld64_plugin_support():
if config.expensive_checks:
+if config.have_ondisk_cas:
+ config.available_features.add("ondisk_cas")
if "MemoryWithOrigins" in config.llvm_use_sanitizer:
+# Restrict the size of the on-disk CAS for tests. This allows testing in
+# constrained environments (e.g. small TMPDIR). It also prevents leaving
+# behind large files on file systems that do not support sparse files if a test
+# crashes before resizing the file.
+config.environment["LLVM_CAS_MAX_MAPPING_SIZE"] = "%d" % (100 * 1024 * 1024)
# Some tools support an environment variable "OBJECT_MODE" on AIX OS, which
# controls the kind of objects they will support. If there is no "OBJECT_MODE"
diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in
index 0968f6214772d0..3cf951be5f71e0 100644
--- a/llvm/test/lit.site.cfg.py.in
+++ b/llvm/test/lit.site.cfg.py.in
@@ -59,6 +59,7 @@ config.llvm_raevict_model_autogenerated = @LLVM_RAEVICT_MODEL_AUTOGENERATED@
config.expensive_checks = @LLVM_ENABLE_EXPENSIVE_CHECKS@
config.reverse_iteration = @LLVM_ENABLE_REVERSE_ITERATION@
config.dxil_tests = @LLVM_INCLUDE_DXIL_TESTS@
+config.have_ondisk_cas = @LLVM_ENABLE_ONDISK_CAS@
config.have_llvm_driver = @LLVM_TOOL_LLVM_DRIVER_BUILD@
config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@
config.have_vc_rev = @LLVM_APPEND_VC_REV@
diff --git a/llvm/test/tools/llvm-cas/Inputs/oneline b/llvm/test/tools/llvm-cas/Inputs/oneline
new file mode 100644
index 00000000000000..d95f3ad14dee63
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/Inputs/oneline
@@ -0,0 +1 @@
diff --git a/llvm/test/tools/llvm-cas/Inputs/oneline-nonewline b/llvm/test/tools/llvm-cas/Inputs/oneline-nonewline
new file mode 100644
index 00000000000000..6b584e8ece562e
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/Inputs/oneline-nonewline
@@ -0,0 +1 @@
\ No newline at end of file
diff --git a/llvm/test/tools/llvm-cas/cache.test b/llvm/test/tools/llvm-cas/cache.test
new file mode 100644
index 00000000000000..f0ce69190d4182
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/cache.test
@@ -0,0 +1,14 @@
+RUN: rm -rf %t %t.cas
+RUN: mkdir %t
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data /dev/null > %t/empty.casid
+RUN: echo "abc" | \
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data - >%t/abc.casid
+RUN: llvm-cas --cas %t/cas --put-cache-key @%t/abc.casid @%t/empty.casid
+RUN: llvm-cas --cas %t/cas --get-cache-result @%t/abc.casid > %t/empty2.casid
+RUN: diff %t/empty.casid %t/empty2.casid
+RUN: not llvm-cas --cas %t/cas --get-cache-result @%t/empty.casid
diff --git a/llvm/test/tools/llvm-cas/lit.local.cfg b/llvm/test/tools/llvm-cas/lit.local.cfg
new file mode 100644
index 00000000000000..379945b68925df
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/lit.local.cfg
@@ -0,0 +1,2 @@
+if not config.have_ondisk_cas:
+ config.unsupported = True
diff --git a/llvm/test/tools/llvm-cas/make-blob.test b/llvm/test/tools/llvm-cas/make-blob.test
new file mode 100644
index 00000000000000..10c64732ceb901
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/make-blob.test
@@ -0,0 +1,42 @@
+RUN: rm -rf %t %t.cas
+RUN: mkdir %t
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data - </dev/null >%t/empty.casid
+RUN: sed -e 's,^.,CHECK: ,' <%t/empty.casid >%t/empty.check
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data /dev/null | FileCheck %t/empty.check
+RUN: echo "abc" | \
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data - >%t/abc.casid
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data %S/Inputs/oneline >%t/oneline.casid
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data %S/Inputs/oneline-nonewline >%t/oneline-nonewline.casid
+RUN: llvm-cas --cas %t.cas --cat-blob @%t/empty.casid |\
+RUN: FileCheck %s -check-prefix CHECK-EMPTY -allow-empty
+RUN: llvm-cas --cas %t.cas --print-kind @%t/empty.casid |\
+RUN: FileCheck %s -check-prefix CHECK-KIND
+CHECK-KIND: object
+RUN: llvm-cas --cas %t.cas --cat-blob @%t/abc.casid |\
+RUN: FileCheck %s -check-prefix CHECK-ABC
+RUN: llvm-cas --cas %t.cas --print-kind @%t/abc.casid |\
+RUN: FileCheck %s -check-prefix CHECK-KIND
+RUN: llvm-cas --cas %t.cas --cat-blob @%t/oneline-nonewline.casid |\
+RUN: FileCheck %s -check-prefix CHECK-ONELINE
+RUN: llvm-cas --cas %t.cas --cat-blob @%t/oneline.casid |\
+RUN: FileCheck %s -check-prefix CHECK-ONELINE
+# Double-check newlines.
+RUN: llvm-cas --cas %t.cas --cat-blob @%t/oneline-nonewline.casid \
+RUN: >%t/oneline-nonewline
+RUN: diff %S/Inputs/oneline-nonewline %t/oneline-nonewline
+RUN: llvm-cas --cas %t.cas --cat-blob @%t/oneline.casid \
+RUN: >%t/oneline
+RUN: diff %S/Inputs/oneline %t/oneline
diff --git a/llvm/test/tools/llvm-cas/make-node.test b/llvm/test/tools/llvm-cas/make-node.test
new file mode 100644
index 00000000000000..876afd89c69621
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/make-node.test
@@ -0,0 +1,40 @@
+RUN: rm -rf %t
+RUN: mkdir %t
+# Make some empty objects.
+RUN: llvm-cas --cas %t/cas --make-node \
+RUN: --data - </dev/null >%t/empty.casid
+RUN: llvm-cas --cas %t/cas --cat-node-data @%t/empty.casid |\
+RUN: FileCheck %s -check-prefix CHECK-EMPTY -allow-empty
+RUN: llvm-cas --cas %t/cas --ls-node-refs @%t/empty.casid |\
+RUN: FileCheck %s -check-prefix CHECK-EMPTY -allow-empty
+RUN: llvm-cas --cas %t/cas --print-kind @%t/empty.casid |\
+RUN: FileCheck %s -check-prefix CHECK-NO-KIND
+### FIXME: Node ObjectKind with no reference is Blob kind in BuiltinCAS.
+CHECK-NO-KIND: object
+# Make a complex object, which references existing ones. Reference a blob and
+# other objects, and reference one of them twice to be sure they don't get
+# deduped.
+RUN: llvm-cas --cas %t/cas --make-blob --data /dev/null \
+RUN: >%t/empty-blob.casid
+RUN: cat %t/empty.casid %t/empty.casid %t/empty-blob.casid \
+RUN: >%t/complex.refs
+RUN: cat %t/complex.refs | sed -e 's,^.,CHECK: ,' > %t/complex.check
+RUN: llvm-cas --cas %t/cas --make-node \
+RUN: --data %S/Inputs/oneline @%t/complex.refs \
+RUN: >%t/complex.casid
+RUN: llvm-cas --cas %t/cas --print-kind \
+RUN: @%t/complex.casid | FileCheck %s -check-prefix COMPLEX-KIND
+RUN: llvm-cas --cas %t/cas --cat-node-data \
+RUN: @%t/complex.casid | FileCheck %s -check-prefix COMPLEX-DATA
+RUN: llvm-cas --cas %t/cas --ls-node-refs @%t/complex.casid |\
+RUN: FileCheck %t/complex.check
+COMPLEX-DATA: content
+RUN: not llvm-cas --cas %t/cas --ls-tree @%t/complex.casid 2>&1 | FileCheck %s --check-prefix=CHECK-WRONG-TYPE
+CHECK-WRONG-TYPE: llvm-cas: ls-tree: not a tree object
diff --git a/llvm/test/tools/llvm-cas/print-id.test b/llvm/test/tools/llvm-cas/print-id.test
new file mode 100644
index 00000000000000..d0d1be498524e1
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/print-id.test
@@ -0,0 +1,13 @@
+RUN: rm -rf %t
+RUN: mkdir %t
+RUN: llvm-cas --cas %t/cas --make-blob --data %s > %t/id
+# Confirm that the ID has the right prefix, is well-formed, and that there's
+# nothing else on the line.
+RUN: FileCheck %s --match-full-lines --strict-whitespace <%t/id
+# Confirm that there's a newline after.
+RUN: wc -l <%t/id | FileCheck %s -check-prefix=NEWLINE
>From 6e124421430dbf544de07dde751bcf5d0a409512 Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu at apple.com>
Date: Tue, 10 Oct 2023 13:04:33 -0700
Subject: [PATCH 07/11] [CAS] Add support for PluginCAS
Allow loading external CAS implementations via PluginCAS. This patch adds:
* C APIs that can be implemented by a plugin, from which LLVM can load the
dylib to use an external CAS implementation.
* A PluginCAS that vends an external CAS implementation as the
llvm ObjectStore and ActionCache classes.
* A libCASPluginTest dylib that provides an example external CAS
implementation wrapping LLVM CAS for testing purposes.
* A unified way to load external CAS implementations.
llvm/include/llvm-c/CAS/PluginAPI_functions.h | 307 ++++++++++
llvm/include/llvm-c/CAS/PluginAPI_types.h | 118 ++++
llvm/include/llvm/CAS/CASRegistry.h | 47 ++
llvm/include/llvm/CAS/ObjectStore.h | 19 -
llvm/include/llvm/CAS/PluginCAS.h | 28 +
llvm/lib/CAS/CASRegistry.cpp | 103 ++++
llvm/lib/CAS/CMakeLists.txt | 2 +
llvm/lib/CAS/ObjectStore.cpp | 54 --
llvm/lib/CAS/PluginAPI.h | 97 +++
llvm/lib/CAS/PluginAPI_functions.def | 31 +
llvm/lib/CAS/PluginCAS.cpp | 523 ++++++++++++++++
llvm/test/CMakeLists.txt | 1 +
llvm/tools/libCASPluginTest/CMakeLists.txt | 12 +
.../libCASPluginTest/libCASPluginTest.cpp | 572 ++++++++++++++++++
.../libCASPluginTest/libCASPluginTest.exports | 26 +
llvm/tools/llvm-cas/llvm-cas.cpp | 10 +-
llvm/unittests/CAS/ActionCacheTest.cpp | 8 +-
llvm/unittests/CAS/CASTestConfig.cpp | 37 ++
llvm/unittests/CAS/CASTestConfig.h | 8 +-
llvm/unittests/CAS/CMakeLists.txt | 5 +
llvm/unittests/CAS/PluginCASTest.cpp | 93 +++
21 files changed, 2019 insertions(+), 82 deletions(-)
create mode 100644 llvm/include/llvm-c/CAS/PluginAPI_functions.h
create mode 100644 llvm/include/llvm-c/CAS/PluginAPI_types.h
create mode 100644 llvm/include/llvm/CAS/CASRegistry.h
create mode 100644 llvm/include/llvm/CAS/PluginCAS.h
create mode 100644 llvm/lib/CAS/CASRegistry.cpp
create mode 100644 llvm/lib/CAS/PluginAPI.h
create mode 100644 llvm/lib/CAS/PluginAPI_functions.def
create mode 100644 llvm/lib/CAS/PluginCAS.cpp
create mode 100644 llvm/tools/libCASPluginTest/CMakeLists.txt
create mode 100644 llvm/tools/libCASPluginTest/libCASPluginTest.cpp
create mode 100644 llvm/tools/libCASPluginTest/libCASPluginTest.exports
create mode 100644 llvm/unittests/CAS/PluginCASTest.cpp
diff --git a/llvm/include/llvm-c/CAS/PluginAPI_functions.h b/llvm/include/llvm-c/CAS/PluginAPI_functions.h
new file mode 100644
index 00000000000000..8a7c9a64597ca1
--- /dev/null
+++ b/llvm/include/llvm-c/CAS/PluginAPI_functions.h
@@ -0,0 +1,307 @@
+/*===-- llvm-c/CAS/PluginAPI_functions.h - CAS Plugin Functions Interface -===*\
+|* *|
+|* Part of the LLVM Project, under the Apache License v2.0 with LLVM *|
+|* Exceptions. *|
+|* See https://llvm.org/LICENSE.txt for license information. *|
+|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception *|
+|* *|
+|* *|
+|* The functions for the LLVM CAS plugin API. Intended for assisting *|
+|* implementations of the API. *|
+|* The API is experimental and subject to change. *|
+|* *|
+#include "llvm-c/CAS/PluginAPI_types.h"
+#include "llvm-c/ExternC.h"
+#ifdef _WIN32
+#define LLCAS_PUBLIC __declspec(dllexport)
+ * Returns the \c LLCAS_VERSION_MAJOR and \c LLCAS_VERSION_MINOR values that the
+ * plugin was compiled with.
+ * Intended for assisting compatibility with different versions.
+ */
+LLCAS_PUBLIC void llcas_get_plugin_version(unsigned *major, unsigned *minor);
+ * Releases memory of C string pointers provided by other functions.
+ */
+LLCAS_PUBLIC void llcas_string_dispose(char *);
+ * Options object to configure creation of \c llcas_cas_t. After passing to
+ * \c llcas_cas_create, its memory can be released via
+ * \c llcas_cas_options_dispose.
+ */
+LLCAS_PUBLIC llcas_cas_options_t llcas_cas_options_create(void);
+LLCAS_PUBLIC void llcas_cas_options_dispose(llcas_cas_options_t);
+ * Receives the \c LLCAS_VERSION_MAJOR and \c LLCAS_VERSION_MINOR values that
+ * the client was compiled with.
+ * Intended for assisting compatibility with different versions.
+ */
+LLCAS_PUBLIC void llcas_cas_options_set_client_version(llcas_cas_options_t,
+ unsigned major,
+ unsigned minor);
+ * Receives a local file-system path that the plugin should use for any on-disk
+ * resources/caches.
+ */
+LLCAS_PUBLIC void llcas_cas_options_set_ondisk_path(llcas_cas_options_t,
+ const char *path);
+ * Receives a name/value strings pair, for the plugin to set as a custom option
+ * it supports. These are usually passed through as invocation options and are
+ * opaque to the client.
+ *
+ * \param error optional pointer to receive an error message if an error
+ * occurred. If set, the memory it points to needs to be released via
+ * \c llcas_string_dispose.
+ * \returns true if there was an error, false otherwise.
+ */
+LLCAS_PUBLIC bool llcas_cas_options_set_option(llcas_cas_options_t,
+ const char *name,
+ const char *value, char **error);
+ * Creates a new \c llcas_cas_t object. The objects returned from the other
+ * functions are only valid to use while the \c llcas_cas_t object that they
+ * came from is still valid.
+ *
+ * \param error optional pointer to receive an error message if an error
+ * occurred. If set, the memory it points to needs to be released via
+ * \c llcas_string_dispose.
+ * \returns \c NULL if there was an error.
+ */
+LLCAS_PUBLIC llcas_cas_t llcas_cas_create(llcas_cas_options_t, char **error);
+ * Releases memory of \c llcas_cas_t. After calling this it is invalid to keep
+ * using objects that originated from this \c llcas_cas_t instance.
+ */
+LLCAS_PUBLIC void llcas_cas_dispose(llcas_cas_t);
+ * \returns the hash schema name that the plugin is using. The string memory it
+ * points to needs to be released via \c llcas_string_dispose.
+ */
+LLCAS_PUBLIC char *llcas_cas_get_hash_schema_name(llcas_cas_t);
+ * Parses the printed digest and returns the digest hash bytes.
+ *
+ * \param printed_digest a C string that was previously provided by
+ * \c llcas_digest_print.
+ * \param bytes pointer to a buffer for writing the digest bytes. Can be \c NULL
+ * if \p bytes_size is 0.
+ * \param bytes_size the size of the buffer.
+ * \param error optional pointer to receive an error message if an error
+ * occurred. If set, the memory it points to needs to be released via
+ * \c llcas_string_dispose.
+ * \returns 0 if there was an error. If \p bytes_size is smaller than the
+ * required size to fit the digest bytes, returns the required buffer size
+ * without writing to \c bytes. Otherwise writes the digest bytes to \p bytes
+ * and returns the number of written bytes.
+ */
+LLCAS_PUBLIC unsigned llcas_digest_parse(llcas_cas_t,
+ const char *printed_digest,
+ uint8_t *bytes, size_t bytes_size,
+ char **error);
+ * Returns a string for the given digest bytes that can be passed to
+ * \c llcas_digest_parse.
+ *
+ * \param printed_id pointer to receive the printed digest string. The memory it
+ * points to needs to be released via \c llcas_string_dispose.
+ * \param error optional pointer to receive an error message if an error
+ * occurred. If set, the memory it points to needs to be released via
+ * \c llcas_string_dispose.
+ * \returns true if there was an error, false otherwise.
+ */
+LLCAS_PUBLIC bool llcas_digest_print(llcas_cas_t, llcas_digest_t,
+ char **printed_id, char **error);
+ * Provides the \c llcas_objectid_t value for the given \c llcas_digest_t.
+ *
+ * \param digest the digest bytes that the returned \c llcas_objectid_t
+ * represents.
+ * \param p_id pointer to store the returned \c llcas_objectid_t object.
+ * \param error optional pointer to receive an error message if an error
+ * occurred. If set, the memory it points to needs to be released via
+ * \c llcas_string_dispose.
+ * \returns true if there was an error, false otherwise.
+ */
+LLCAS_PUBLIC bool llcas_cas_get_objectid(llcas_cas_t, llcas_digest_t digest,
+ llcas_objectid_t *p_id, char **error);
+ * \returns the \c llcas_digest_t value for the given \c llcas_objectid_t.
+ * The memory that the buffer points to is valid for the lifetime of the
+ * \c llcas_cas_t object.
+ */
+LLCAS_PUBLIC llcas_digest_t llcas_objectid_get_digest(llcas_cas_t,
+ llcas_objectid_t);
+ * Checks whether a \c llcas_objectid_t points to an existing object.
+ *
+ * \param globally For CAS implementations that distinguish between local CAS
+ * and remote/distributed CAS, \p globally set to false indicates that the
+ * lookup will be restricted to the local CAS, returning "not found" even if the
+ * object might exist in the remote CAS.
+ * \param error optional pointer to receive an error message if an error
+ * occurred. If set, the memory it points to needs to be released via
+ * \c llcas_string_dispose.
+ * \returns one of \c llcas_lookup_result_t.
+ */
+LLCAS_PUBLIC llcas_lookup_result_t llcas_cas_contains_object(llcas_cas_t,
+ llcas_objectid_t,
+ bool globally,
+ char **error);
+ * Loads the object that \c llcas_objectid_t points to.
+ *
+ * \param error optional pointer to receive an error message if an error
+ * occurred. If set, the memory it points to needs to be released via
+ * \c llcas_string_dispose.
+ * \returns one of \c llcas_lookup_result_t.
+ */
+LLCAS_PUBLIC llcas_lookup_result_t llcas_cas_load_object(
+ llcas_cas_t, llcas_objectid_t, llcas_loaded_object_t *, char **error);
+ * Like \c llcas_cas_load_object but loading happens via a callback function.
+ * Whether the call is asynchronous or not depends on the implementation.
+ *
+ * \param ctx_cb pointer to pass to the callback function.
+ */
+LLCAS_PUBLIC void llcas_cas_load_object_async(llcas_cas_t, llcas_objectid_t,
+ void *ctx_cb,
+ llcas_cas_load_object_cb);
+ * Stores the object with the provided data buffer and \c llcas_objectid_t
+ * references, and provides its associated \c llcas_objectid_t.
+ *
+ * \param refs pointer to array of \c llcas_objectid_t. Can be \c NULL if
+ * \p refs_count is 0.
+ * \param refs_count number of \c llcas_objectid_t objects in the array.
+ * \param p_id pointer to store the returned \c llcas_objectid_t object.
+ * \param error optional pointer to receive an error message if an error
+ * occurred. If set, the memory it points to needs to be released via
+ * \c llcas_string_dispose.
+ * \returns true if there was an error, false otherwise.
+ */
+LLCAS_PUBLIC bool llcas_cas_store_object(llcas_cas_t, llcas_data_t,
+ const llcas_objectid_t *refs,
+ size_t refs_count,
+ llcas_objectid_t *p_id, char **error);
+ * \returns the data buffer of the provided \c llcas_loaded_object_t. The buffer
+ * pointer must be 8-byte aligned and \c NULL terminated. The memory that the
+ * buffer points to is valid for the lifetime of the \c llcas_cas_t object.
+ */
+LLCAS_PUBLIC llcas_data_t llcas_loaded_object_get_data(llcas_cas_t,
+ llcas_loaded_object_t);
+ * \returns the references of the provided \c llcas_loaded_object_t.
+ */
+LLCAS_PUBLIC llcas_object_refs_t
+ llcas_loaded_object_get_refs(llcas_cas_t, llcas_loaded_object_t);
+ * \returns the number of references in the provided \c llcas_object_refs_t.
+ */
+LLCAS_PUBLIC size_t llcas_object_refs_get_count(llcas_cas_t,
+ llcas_object_refs_t);
+ * \returns the \c llcas_objectid_t of the reference at \p index. It is invalid
+ * to pass an index that is out of the range of references.
+ */
+LLCAS_PUBLIC llcas_objectid_t llcas_object_refs_get_id(llcas_cas_t,
+ llcas_object_refs_t,
+ size_t index);
+ * Retrieves the \c llcas_objectid_t value associated with a \p key.
+ *
+ * \param p_value pointer to store the returned \c llcas_objectid_t object.
+ * \param globally if true it is a hint to the underlying implementation that
+ * the lookup is profitable to be done on a distributed caching level, not just
+ * locally. The implementation is free to ignore this flag.
+ * \param error optional pointer to receive an error message if an error
+ * occurred. If set, the memory it points to needs to be released via
+ * \c llcas_string_dispose.
+ * \returns one of \c llcas_lookup_result_t.
+ */
+LLCAS_PUBLIC llcas_lookup_result_t llcas_actioncache_get_for_digest(
+ llcas_cas_t, llcas_digest_t key, llcas_objectid_t *p_value, bool globally,
+ char **error);
+ * Like \c llcas_actioncache_get_for_digest but result is provided to a callback
+ * function. Whether the call is asynchronous or not depends on the
+ * implementation.
+ *
+ * \param ctx_cb pointer to pass to the callback function.
+ */
+/* Returns nothing: the lookup result is delivered through the
+ * llcas_actioncache_get_cb callback, which receives \p ctx_cb. Exported with
+ * LLCAS_PUBLIC like llcas_cas_load_object_async. */
+LLCAS_PUBLIC void
+llcas_actioncache_get_for_digest_async(llcas_cas_t, llcas_digest_t key,
+                                       bool globally, void *ctx_cb,
+                                       llcas_actioncache_get_cb);
+ * Associates a \c llcas_objectid_t \p value with a \p key. It is invalid to set
+ * a different \p value to the same \p key.
+ *
+ * \param globally if true it is a hint to the underlying implementation that
+ * the association is profitable to be done on a distributed caching level, not
+ * just locally. The implementation is free to ignore this flag.
+ * \param error optional pointer to receive an error message if an error
+ * occurred. If set, the memory it points to needs to be released via
+ * \c llcas_string_dispose.
+ * \returns true if there was an error, false otherwise.
+ */
+LLCAS_PUBLIC bool llcas_actioncache_put_for_digest(llcas_cas_t,
+ llcas_digest_t key,
+ llcas_objectid_t value,
+ bool globally, char **error);
+ * Like \c llcas_actioncache_put_for_digest but result is provided to a callback
+ * function. Whether the call is asynchronous or not depends on the
+ * implementation.
+ *
+ * \param ctx_cb pointer to pass to the callback function.
+ */
+/* Returns nothing: success or failure is delivered through the
+ * llcas_actioncache_put_cb callback, which receives \p ctx_cb. Exported with
+ * LLCAS_PUBLIC like llcas_cas_load_object_async. */
+LLCAS_PUBLIC void
+llcas_actioncache_put_for_digest_async(llcas_cas_t, llcas_digest_t key,
+                                       llcas_objectid_t value, bool globally,
+                                       void *ctx_cb, llcas_actioncache_put_cb);
diff --git a/llvm/include/llvm-c/CAS/PluginAPI_types.h b/llvm/include/llvm-c/CAS/PluginAPI_types.h
new file mode 100644
index 00000000000000..fdade74fcebcc6
--- /dev/null
+++ b/llvm/include/llvm-c/CAS/PluginAPI_types.h
@@ -0,0 +1,118 @@
+/*===-- llvm-c/CAS/PluginAPI_types.h - CAS Plugin Types Interface -*- C -*-===*\
+|* *|
+|* Part of the LLVM Project, under the Apache License v2.0 with LLVM *|
+|* Exceptions. *|
+|* See https://llvm.org/LICENSE.txt for license information. *|
+|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception *|
+|* *|
+|* *|
+|* The types for the LLVM CAS plugin API. *|
+|* The API is experimental and subject to change. *|
+|* *|
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+typedef struct llcas_cas_options_s *llcas_cas_options_t;
+typedef struct llcas_cas_s *llcas_cas_t;
+ * Digest hash bytes.
+ */
+typedef struct {
+ const uint8_t *data;
+ size_t size;
+} llcas_digest_t;
+ * Data buffer for stored CAS objects.
+ */
+typedef struct {
+ const void *data;
+ size_t size;
+} llcas_data_t;
+ * Identifier for a CAS object.
+ */
+typedef struct {
+ uint64_t opaque;
+} llcas_objectid_t;
+ * A loaded CAS object.
+ */
+typedef struct {
+ uint64_t opaque;
+} llcas_loaded_object_t;
+ * Object references for a CAS object.
+ */
+typedef struct {
+ uint64_t opaque_b;
+ uint64_t opaque_e;
+} llcas_object_refs_t;
+ * Return values for a load operation.
+ */
+typedef enum {
+ /**
+ * The object was found.
+ */
+ /**
+ * The object was not found.
+ */
+ /**
+ * An error occurred.
+ */
+} llcas_lookup_result_t;
+ * Callback for \c llcas_cas_load_object_async.
+ *
+ * \param ctx pointer passed through from the \c llcas_cas_load_object_async
+ * call.
+ * \param error message if an error occurred. If set, the memory it points to
+ * needs to be released via \c llcas_string_dispose.
+ */
+typedef void (*llcas_cas_load_object_cb)(void *ctx, llcas_lookup_result_t,
+ llcas_loaded_object_t, char *error);
+ * Callback for \c llcas_actioncache_get_for_digest_async.
+ *
+ * \param ctx pointer passed through from the
+ * \c llcas_actioncache_get_for_digest_async call.
+ * \param error message if an error occurred. If set, the memory it points to
+ * needs to be released via \c llcas_string_dispose.
+ */
+typedef void (*llcas_actioncache_get_cb)(void *ctx, llcas_lookup_result_t,
+ llcas_objectid_t, char *error);
+ * Callback for \c llcas_actioncache_put_for_digest_async.
+ *
+ * \param ctx pointer passed through from the
+ * \c llcas_actioncache_put_for_digest_async call.
+ * \param error message if an error occurred. If set, the memory it points to
+ * needs to be released via \c llcas_string_dispose.
+ */
+typedef void (*llcas_actioncache_put_cb)(void *ctx, bool failed, char *error);
diff --git a/llvm/include/llvm/CAS/CASRegistry.h b/llvm/include/llvm/CAS/CASRegistry.h
new file mode 100644
index 00000000000000..4654e91d39b49a
--- /dev/null
+++ b/llvm/include/llvm/CAS/CASRegistry.h
@@ -0,0 +1,47 @@
+//===- llvm/CAS/CASRegistry.h -----------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/CASID.h"
+#include "llvm/CAS/CASReference.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Support/Error.h"
+namespace llvm::cas {
+/// Create ObjectStore from a string identifier.
+/// Currently the string identifier is using URL scheme with following supported
+/// schemes:
+/// * InMemory CAS: mem://
+/// * OnDisk CAS: file://${PATH_TO_ONDISK_CAS}
+/// * PlugIn CAS: plugin://${PATH_TO_PLUGIN}?${OPT1}=${VAL1}&${OPT2}=${VAL2}..
+/// If no URL scheme is used, it defaults to following (but might change in
+/// future)
+/// For the plugin scheme, use argument "ondisk-path=${PATH}" to choose the
+/// on-disk directory that the plugin should use, otherwise the default
+/// OnDiskCAS location will be used.
+/// FIXME: Need to implement proper URL encoding scheme that allows "%".
+Expected<std::pair<std::shared_ptr<ObjectStore>, std::shared_ptr<ActionCache>>>
+createCASFromIdentifier(StringRef Id);
+/// Check if a string is a CAS identifier.
+bool isRegisteredCASIdentifier(StringRef Config);
+/// Register a URL scheme to CAS Identifier.
+using ObjectStoreCreateFuncTy = Expected<
+ std::pair<std::shared_ptr<ObjectStore>, std::shared_ptr<ActionCache>>>(
+ const Twine &);
+void registerCASURLScheme(StringRef Prefix, ObjectStoreCreateFuncTy *Func);
+} // namespace llvm::cas
diff --git a/llvm/include/llvm/CAS/ObjectStore.h b/llvm/include/llvm/CAS/ObjectStore.h
index d8977f2d13c709..931f5046a5bc1c 100644
--- a/llvm/include/llvm/CAS/ObjectStore.h
+++ b/llvm/include/llvm/CAS/ObjectStore.h
@@ -333,25 +333,6 @@ void getDefaultOnDiskCASPath(SmallVectorImpl<char> &Path);
/// user.
std::string getDefaultOnDiskCASPath();
-/// Create ObjectStore from a string identifier.
-/// Currently the string identifier is using URL scheme with following supported
-/// schemes:
-/// * InMemory CAS: mem://
-/// * OnDisk CAS: file://${PATH_TO_ONDISK_CAS}
-/// For the plugin scheme, use argument "ondisk-path=${PATH}" to choose the
-/// on-disk directory that the plugin should use, otherwise the default
-/// OnDiskCAS location will be used.
-/// FIXME: Need to implement proper URL encoding scheme that allows "%".
-Expected<std::shared_ptr<ObjectStore>> createCASFromIdentifier(StringRef Id);
-/// Check if a string is a CAS identifier.
-bool isRegisteredCASIdentifier(StringRef Config);
-/// Register a URL scheme to CAS Identifier.
-using ObjectStoreCreateFuncTy =
- Expected<std::shared_ptr<ObjectStore>>(const Twine &);
-void registerCASURLScheme(StringRef Prefix, ObjectStoreCreateFuncTy *Func);
} // namespace cas
} // namespace llvm
diff --git a/llvm/include/llvm/CAS/PluginCAS.h b/llvm/include/llvm/CAS/PluginCAS.h
new file mode 100644
index 00000000000000..12c765b7d0b273
--- /dev/null
+++ b/llvm/include/llvm/CAS/PluginCAS.h
@@ -0,0 +1,28 @@
+//===- llvm/CAS/PluginCAS.h -------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Support/Error.h"
+namespace llvm::cas {
+/// Create \c ObjectStore and \c ActionCache instances using the plugin
+/// interface.
+///
+/// \param PluginPath path to the plugin to load.
+/// \param OnDiskPath on-disk directory that the plugin should use.
+/// \param PluginArgs name/value option pairs passed through to the plugin.
+/// \returns the created \c ObjectStore / \c ActionCache pair, or an error.
+Expected<std::pair<std::shared_ptr<ObjectStore>, std::shared_ptr<ActionCache>>>
+createPluginCASDatabases(
+    StringRef PluginPath, StringRef OnDiskPath,
+    ArrayRef<std::pair<std::string, std::string>> PluginArgs);
+} // namespace llvm::cas
diff --git a/llvm/lib/CAS/CASRegistry.cpp b/llvm/lib/CAS/CASRegistry.cpp
new file mode 100644
index 00000000000000..04137b3b242842
--- /dev/null
+++ b/llvm/lib/CAS/CASRegistry.cpp
@@ -0,0 +1,103 @@
+//===- CASRegistry.cpp ------------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/CASRegistry.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/CAS/PluginCAS.h"
+#include "llvm/Support/ManagedStatic.h"
+using namespace llvm;
+using namespace llvm::cas;
+/// Create the ObjectStore/ActionCache pair backing the "file://" scheme.
+///
+/// \param Path on-disk CAS directory. When empty, the location returned by
+/// getDefaultOnDiskCASPath() is used instead.
+/// \returns the created pair, or the error produced by
+/// createOnDiskUnifiedCASDatabases().
+static Expected<
+    std::pair<std::shared_ptr<ObjectStore>, std::shared_ptr<ActionCache>>>
+createOnDiskCASImpl(const Twine &Path) {
+  std::string CASPath = Path.str();
+  // If path is empty, use default ondisk CAS path.
+  if (CASPath.empty())
+    CASPath = getDefaultOnDiskCASPath();
+  // Open the resolved path. Passing Path.str() here would discard the default
+  // chosen above and hand an empty path to the database layer.
+  auto UniDB = createOnDiskUnifiedCASDatabases(CASPath);
+  if (!UniDB)
+    return UniDB.takeError();
+  return std::pair{std::move(UniDB->first), std::move(UniDB->second)};
+/// Create the ObjectStore/ActionCache pair backing the "plugin://" scheme.
+///
+/// \param URL plugin path optionally followed by '?' and '&'-separated
+/// NAME=VALUE options. createCASFromIdentifier() strips the registered scheme
+/// prefix before calling this function.
+/// \returns whatever createPluginCASDatabases() returns for the parsed
+/// plugin path, on-disk path and option list.
+static Expected<
+ std::pair<std::shared_ptr<ObjectStore>, std::shared_ptr<ActionCache>>>
+createPluginCASImpl(const Twine &URL) {
+ // Format used is
+ // plugin://${PATH_TO_PLUGIN}?${OPT1}=${VAL1}&${OPT2}=${VAL2}..
+ // "ondisk-path" as option is treated specially, the rest of options are
+ // passed to the plugin verbatim.
+ SmallString<256> PathBuf;
+ // Everything before the first '?' is the plugin path; the rest is options.
+ auto [PluginPath, Options] = URL.toStringRef(PathBuf).split('?');
+ std::string OnDiskPath;
+ SmallVector<std::pair<std::string, std::string>> PluginArgs;
+ // Peel one '&'-separated option off the front per iteration.
+ while (!Options.empty()) {
+ StringRef Opt;
+ std::tie(Opt, Options) = Options.split('&');
+ // Split at the first '='; any later '=' stays inside Value.
+ auto [Name, Value] = Opt.split('=');
+ if (Name == "ondisk-path") {
+ // Plain assignment: if the option is repeated, the last value wins.
+ OnDiskPath = Value;
+ } else {
+ PluginArgs.push_back({std::string(Name), std::string(Value)});
+ }
+ }
+ // No (or an empty) "ondisk-path" option: fall back to the default location.
+ if (OnDiskPath.empty())
+ OnDiskPath = getDefaultOnDiskCASPath();
+ return createPluginCASDatabases(PluginPath, OnDiskPath, PluginArgs);
+/// Factory for the "mem://" scheme: builds a fresh in-memory object store
+/// together with a fresh in-memory action cache. The argument (the identifier
+/// text) is not used.
+static Expected<
+    std::pair<std::shared_ptr<ObjectStore>, std::shared_ptr<ActionCache>>>
+createInMemoryCASImpl(const Twine &) {
+  // Create the store first, then the cache, matching the order in which the
+  // pair elements are initialized.
+  auto Store = createInMemoryCAS();
+  auto Cache = createInMemoryActionCache();
+  return std::pair{std::move(Store), std::move(Cache)};
+/// Table mapping a URL scheme prefix (e.g. "mem://") to the factory function
+/// that creates the ObjectStore/ActionCache pair for that scheme.
+static ManagedStatic<StringMap<ObjectStoreCreateFuncTy *>> RegisteredScheme;
+/// Return the scheme table, inserting the built-in "mem://", "file://" and
+/// "plugin://" entries when the table has not been constructed yet.
+static StringMap<ObjectStoreCreateFuncTy *> &getRegisteredScheme() {
+ // The isConstructed() check must come before the first operator-> use:
+ // the built-in entries are only inserted while it still reports false.
+ // NOTE(review): this check-then-insert sequence does not look safe if
+ // called concurrently from several threads — confirm.
+ if (!RegisteredScheme.isConstructed()) {
+ RegisteredScheme->insert({"mem://", &createInMemoryCASImpl});
+ RegisteredScheme->insert({"file://", &createOnDiskCASImpl});
+ RegisteredScheme->insert({"plugin://", &createPluginCASImpl});
+ }
+ return *RegisteredScheme;
+/// Look up the registered scheme whose prefix starts \p Id and invoke its
+/// factory on the remainder of the identifier. Fails with an
+/// invalid_argument error when no registered prefix matches.
+Expected<std::pair<std::shared_ptr<ObjectStore>, std::shared_ptr<ActionCache>>>
+cas::createCASFromIdentifier(StringRef Id) {
+  for (auto &Entry : getRegisteredScheme()) {
+    // Work on a copy so a non-matching prefix leaves the identifier intact.
+    StringRef Remainder = Id;
+    if (!Remainder.consume_front(Entry.getKey()))
+      continue;
+    ObjectStoreCreateFuncTy *Create = Entry.getValue();
+    return Create(Remainder);
+  }
+  return createStringError(std::make_error_code(std::errc::invalid_argument),
+                           "Unknown CAS identifier is provided");
+/// Report whether \p Id begins with any registered URL scheme prefix.
+bool cas::isRegisteredCASIdentifier(StringRef Id) {
+  auto &Schemes = getRegisteredScheme();
+  for (auto It = Schemes.begin(), End = Schemes.end(); It != End; ++It) {
+    // Only the prefix match matters here; the remainder is never used.
+    if (Id.starts_with(It->getKey()))
+      return true;
+  }
+  return false;
+/// Add \p Func as the factory for identifiers that begin with \p Prefix.
+/// Uses StringMap::insert, so an already-registered prefix is left unchanged.
+void cas::registerCASURLScheme(StringRef Prefix,
+                               ObjectStoreCreateFuncTy *Func) {
+  auto &Schemes = getRegisteredScheme();
+  Schemes.insert(std::make_pair(Prefix, Func));
diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
index b3f9ddc8e315c2..e2d192b28498b1 100644
--- a/llvm/lib/CAS/CMakeLists.txt
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -8,6 +8,7 @@ add_llvm_component_library(LLVMCAS
+ CASRegistry.cpp
@@ -17,6 +18,7 @@ add_llvm_component_library(LLVMCAS
+ PluginCAS.cpp
diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp
index a9c5c53c1fcfea..4d58dfb1157a12 100644
--- a/llvm/lib/CAS/ObjectStore.cpp
+++ b/llvm/lib/CAS/ObjectStore.cpp
@@ -10,12 +10,9 @@
#include "BuiltinCAS.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/FunctionExtras.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/CAS/UnifiedOnDiskCache.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/SmallVectorMemoryBuffer.h"
using namespace llvm;
@@ -207,54 +204,3 @@ ObjectProxy::getMemoryBuffer(StringRef Name,
bool RequiresNullTerminator) const {
return CAS->getMemoryBuffer(H, Name, RequiresNullTerminator);
-static Expected<std::shared_ptr<ObjectStore>>
-createOnDiskCASImpl(const Twine &Path) {
- std::string CASPath = Path.str();
- // If path is empty, use default ondisk CAS path.
- if (CASPath.empty())
- CASPath = getDefaultOnDiskCASPath();
- auto UniDB = builtin::createBuiltinUnifiedOnDiskCache(CASPath);
- if (!UniDB)
- return UniDB.takeError();
- return builtin::createObjectStoreFromUnifiedOnDiskCache(std::move(*UniDB));
-static Expected<std::shared_ptr<ObjectStore>>
-createInMemoryCASImpl(const Twine &) {
- return createInMemoryCAS();
-static ManagedStatic<StringMap<ObjectStoreCreateFuncTy *>> RegisteredScheme;
-static StringMap<ObjectStoreCreateFuncTy *> &getRegisteredScheme() {
- if (!RegisteredScheme.isConstructed()) {
- RegisteredScheme->insert({"mem://", &createInMemoryCASImpl});
- RegisteredScheme->insert({"file://", &createOnDiskCASImpl});
- }
- return *RegisteredScheme;
-cas::createCASFromIdentifier(StringRef Id) {
- for (auto &Scheme : getRegisteredScheme()) {
- if (Id.consume_front(Scheme.getKey()))
- return Scheme.getValue()(Id);
- }
- return createStringError(std::make_error_code(std::errc::invalid_argument),
- "Unknown CAS identifier is provided");
-bool cas::isRegisteredCASIdentifier(StringRef Id) {
- for (auto &Scheme : getRegisteredScheme()) {
- if (Id.consume_front(Scheme.getKey()))
- return true;
- }
- return false;
-void cas::registerCASURLScheme(StringRef Prefix,
- ObjectStoreCreateFuncTy *Func) {
- getRegisteredScheme().insert({Prefix, Func});
diff --git a/llvm/lib/CAS/PluginAPI.h b/llvm/lib/CAS/PluginAPI.h
new file mode 100644
index 00000000000000..b9505677e6c11c
--- /dev/null
+++ b/llvm/lib/CAS/PluginAPI.h
@@ -0,0 +1,97 @@
+//===- PluginAPI.h ----------------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm-c/CAS/PluginAPI_types.h"
+/// See documentation in \c "llvm-c/CAS/PluginAPI_functions.h" for how these
+/// functions are used.
+struct llcas_functions_t {
+ void (*get_plugin_version)(unsigned *major, unsigned *minor);
+ void (*string_dispose)(char *);
+ llcas_cas_options_t (*cas_options_create)(void);
+ void (*cas_options_dispose)(llcas_cas_options_t);
+ void (*cas_options_set_client_version)(llcas_cas_options_t, unsigned major,
+ unsigned minor);
+ void (*cas_options_set_ondisk_path)(llcas_cas_options_t, const char *path);
+ bool (*cas_options_set_option)(llcas_cas_options_t, const char *name,
+ const char *value, char **error);
+ llcas_cas_t (*cas_create)(llcas_cas_options_t, char **error);
+ void (*cas_dispose)(llcas_cas_t);
+ unsigned (*digest_parse)(llcas_cas_t, const char *printed_digest,
+ uint8_t *bytes, size_t bytes_size, char **error);
+ bool (*digest_print)(llcas_cas_t, llcas_digest_t, char **printed_id,
+ char **error);
+ char *(*cas_get_hash_schema_name)(llcas_cas_t);
+ bool (*cas_get_objectid)(llcas_cas_t, llcas_digest_t, llcas_objectid_t *,
+ char **error);
+ llcas_digest_t (*objectid_get_digest)(llcas_cas_t, llcas_objectid_t);
+ llcas_lookup_result_t (*cas_contains_object)(llcas_cas_t, llcas_objectid_t,
+ bool globally, char **error);
+ llcas_lookup_result_t (*cas_load_object)(llcas_cas_t, llcas_objectid_t,
+ llcas_loaded_object_t *,
+ char **error);
+ void (*cas_load_object_async)(llcas_cas_t, llcas_objectid_t, void *ctx_cb,
+ llcas_cas_load_object_cb);
+ bool (*cas_store_object)(llcas_cas_t, llcas_data_t,
+ const llcas_objectid_t *refs, size_t refs_count,
+ llcas_objectid_t *, char **error);
+ llcas_data_t (*loaded_object_get_data)(llcas_cas_t, llcas_loaded_object_t);
+ llcas_object_refs_t (*loaded_object_get_refs)(llcas_cas_t,
+ llcas_loaded_object_t);
+ size_t (*object_refs_get_count)(llcas_cas_t, llcas_object_refs_t);
+ llcas_objectid_t (*object_refs_get_id)(llcas_cas_t, llcas_object_refs_t,
+ size_t index);
+ /*===--------------------------------------------------------------------===*\
+ |* Action cache API
+ \*===--------------------------------------------------------------------===*/
+ llcas_lookup_result_t (*actioncache_get_for_digest)(llcas_cas_t,
+ llcas_digest_t key,
+ llcas_objectid_t *p_value,
+ bool globally,
+ char **error);
+ void (*actioncache_get_for_digest_async)(llcas_cas_t, llcas_digest_t key,
+ bool globally, void *ctx_cb,
+ llcas_actioncache_get_cb);
+ bool (*actioncache_put_for_digest)(llcas_cas_t, llcas_digest_t key,
+ llcas_objectid_t value, bool globally,
+ char **error);
+ void (*actioncache_put_for_digest_async)(llcas_cas_t, llcas_digest_t key,
+ llcas_objectid_t value,
+ bool globally, void *ctx_cb,
+ llcas_actioncache_put_cb);
diff --git a/llvm/lib/CAS/PluginAPI_functions.def b/llvm/lib/CAS/PluginAPI_functions.def
new file mode 100644
index 00000000000000..17e60e510b2fcf
--- /dev/null
+++ b/llvm/lib/CAS/PluginAPI_functions.def
@@ -0,0 +1,31 @@
+// Format is (name, required). If 'required' is true then loading fails if the
+// symbol is missing; otherwise loading continues and the function pointer
+// will be null. Entries are ordered lexicographically by name.
+CASPLUGINAPI_FUNCTION(actioncache_get_for_digest, true)
+CASPLUGINAPI_FUNCTION(actioncache_get_for_digest_async, true)
+CASPLUGINAPI_FUNCTION(actioncache_put_for_digest, true)
+CASPLUGINAPI_FUNCTION(actioncache_put_for_digest_async, true)
+CASPLUGINAPI_FUNCTION(cas_contains_object, true)
+CASPLUGINAPI_FUNCTION(cas_create, true)
+CASPLUGINAPI_FUNCTION(cas_dispose, true)
+CASPLUGINAPI_FUNCTION(cas_get_hash_schema_name, true)
+CASPLUGINAPI_FUNCTION(cas_get_objectid, true)
+CASPLUGINAPI_FUNCTION(cas_load_object, true)
+CASPLUGINAPI_FUNCTION(cas_load_object_async, true)
+CASPLUGINAPI_FUNCTION(cas_options_create, true)
+CASPLUGINAPI_FUNCTION(cas_options_dispose, true)
+CASPLUGINAPI_FUNCTION(cas_options_set_client_version, true)
+CASPLUGINAPI_FUNCTION(cas_options_set_ondisk_path, true)
+CASPLUGINAPI_FUNCTION(cas_options_set_option, true)
+CASPLUGINAPI_FUNCTION(cas_store_object, true)
+CASPLUGINAPI_FUNCTION(digest_parse, true)
+CASPLUGINAPI_FUNCTION(digest_print, true)
+CASPLUGINAPI_FUNCTION(get_plugin_version, true)
+CASPLUGINAPI_FUNCTION(loaded_object_get_data, true)
+CASPLUGINAPI_FUNCTION(loaded_object_get_refs, true)
+CASPLUGINAPI_FUNCTION(object_refs_get_count, true)
+CASPLUGINAPI_FUNCTION(object_refs_get_id, true)
+CASPLUGINAPI_FUNCTION(objectid_get_digest, true)
+CASPLUGINAPI_FUNCTION(string_dispose, true)
diff --git a/llvm/lib/CAS/PluginCAS.cpp b/llvm/lib/CAS/PluginCAS.cpp
new file mode 100644
index 00000000000000..333180c61996ee
--- /dev/null
+++ b/llvm/lib/CAS/PluginCAS.cpp
@@ -0,0 +1,523 @@
+//===- PluginCAS.cpp --------------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/CAS/PluginCAS.h"
+#include "PluginAPI.h"
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Support/DynamicLibrary.h"
+using namespace llvm;
+using namespace llvm::cas;
+namespace {
+class PluginCASContext : public CASContext {
+ void printIDImpl(raw_ostream &OS, const CASID &ID) const final;
+ StringRef getHashSchemaIdentifier() const final { return SchemaName; }
+ static Expected<std::shared_ptr<PluginCASContext>>
+ create(StringRef PluginPath, StringRef OnDiskPath,
+ ArrayRef<std::pair<std::string, std::string>> PluginArgs);
+ ~PluginCASContext() { Functions.cas_dispose(c_cas); }
+ llcas_functions_t Functions{};
+ llcas_cas_t c_cas = nullptr;
+ std::string SchemaName;
+ static Error errorAndDispose(char *c_err, const llcas_functions_t &Funcs) {
+ Error E = createStringError(inconvertibleErrorCode(), c_err);
+ Funcs.string_dispose(c_err);
+ return E;
+ }
+ Error errorAndDispose(char *c_err) const {
+ return errorAndDispose(c_err, Functions);
+ }
+} // anonymous namespace
+void PluginCASContext::printIDImpl(raw_ostream &OS, const CASID &ID) const {
+ ArrayRef<uint8_t> Hash = ID.getHash();
+ char *c_printed_id = nullptr;
+ char *c_err = nullptr;
+ if (Functions.digest_print(c_cas, llcas_digest_t{Hash.data(), Hash.size()},
+ &c_printed_id, &c_err))
+ report_fatal_error(errorAndDispose(c_err));
+ OS << c_printed_id;
+ Functions.string_dispose(c_printed_id);
+Expected<std::shared_ptr<PluginCASContext>> PluginCASContext::create(
+ StringRef PluginPath, StringRef OnDiskPath,
+ ArrayRef<std::pair<std::string, std::string>> PluginArgs) {
+ auto reportError = [PluginPath](const Twine &Description) -> Error {
+ std::error_code EC = inconvertibleErrorCode();
+ return createStringError(EC, "error loading '" + PluginPath +
+ "': " + Description);
+ };
+ SmallString<256> PathBuf = PluginPath;
+ std::string ErrMsg;
+ sys::DynamicLibrary Lib =
+ sys::DynamicLibrary::getPermanentLibrary(PathBuf.c_str(), &ErrMsg);
+ if (!Lib.isValid())
+ return reportError(ErrMsg);
+ llcas_functions_t Functions{};
+#define CASPLUGINAPI_FUNCTION(name, required) \
+ if (!(Functions.name = (decltype(llcas_functions_t::name)) \
+ Lib.getAddressOfSymbol("llcas_" #name))) { \
+ if (required) \
+ return reportError("failed symbol 'llcas_" #name "' lookup"); \
+ }
+#include "PluginAPI_functions.def"
+ llcas_cas_options_t c_opts = Functions.cas_options_create();
+ auto _ = make_scope_exit([&]() { Functions.cas_options_dispose(c_opts); });
+ Functions.cas_options_set_client_version(c_opts, LLCAS_VERSION_MAJOR,
+ SmallString<256> OnDiskPathBuf = OnDiskPath;
+ Functions.cas_options_set_ondisk_path(c_opts, OnDiskPathBuf.c_str());
+ for (const auto &Pair : PluginArgs) {
+ char *c_err = nullptr;
+ if (Functions.cas_options_set_option(c_opts, Pair.first.c_str(),
+ Pair.second.c_str(), &c_err))
+ return errorAndDispose(c_err, Functions);
+ }
+ char *c_err = nullptr;
+ llcas_cas_t c_cas = Functions.cas_create(c_opts, &c_err);
+ if (!c_cas)
+ return errorAndDispose(c_err, Functions);
+ char *c_schema = Functions.cas_get_hash_schema_name(c_cas);
+ std::string SchemaName = c_schema;
+ Functions.string_dispose(c_schema);
+ auto Ctx = std::make_shared<PluginCASContext>();
+ Ctx->Functions = Functions;
+ Ctx->c_cas = c_cas;
+ Ctx->SchemaName = std::move(SchemaName);
+ return Ctx;
+// ObjectStore API
+namespace {
+class PluginObjectStore
+ : public ObjectStore,
+ public std::enable_shared_from_this<PluginObjectStore> {
+ Expected<CASID> parseID(StringRef ID) final;
+ Expected<ObjectRef> store(ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) final;
+ CASID getID(ObjectRef Ref) const final;
+ std::optional<ObjectRef> getReference(const CASID &ID) const final;
+ Expected<bool> isMaterialized(ObjectRef Ref) const final;
+ Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) final;
+ void
+ loadIfExistsAsync(ObjectRef Ref,
+ unique_function<void(Expected<std::optional<ObjectHandle>>)>
+ Callback) final;
+ uint64_t getDataSize(ObjectHandle Node) const final;
+ Error forEachRef(ObjectHandle Node,
+ function_ref<Error(ObjectRef)> Callback) const final;
+ ObjectRef readRef(ObjectHandle Node, size_t I) const final;
+ size_t getNumRefs(ObjectHandle Node) const final;
+ ArrayRef<char> getData(ObjectHandle Node,
+ bool RequiresNullTerminator = false) const final;
+ Error validate(const CASID &ID) final {
+ // Not supported yet. Always return success.
+ return Error::success();
+ }
+ PluginObjectStore(std::shared_ptr<PluginCASContext>);
+ std::shared_ptr<PluginCASContext> Ctx;
+} // anonymous namespace
+Expected<CASID> PluginObjectStore::parseID(StringRef ID) {
+ // Use stack buffers big enough that we don't have to allocate on the heap.
+ SmallString<148> IDBuf(ID);
+ SmallVector<uint8_t, 68> BytesBuf(68);
+ auto parseDigest = [&]() -> Expected<unsigned> {
+ char *c_err = nullptr;
+ unsigned NumBytes = Ctx->Functions.digest_parse(
+ Ctx->c_cas, IDBuf.c_str(), BytesBuf.data(), BytesBuf.size(), &c_err);
+ if (NumBytes == 0)
+ return Ctx->errorAndDispose(c_err);
+ return NumBytes;
+ };
+ Expected<unsigned> NumBytes = parseDigest();
+ if (!NumBytes)
+ return NumBytes.takeError();
+ if (*NumBytes > BytesBuf.size()) {
+ BytesBuf.resize(*NumBytes);
+ NumBytes = parseDigest();
+ if (!NumBytes)
+ return NumBytes.takeError();
+ assert(*NumBytes == BytesBuf.size());
+ } else {
+ BytesBuf.truncate(*NumBytes);
+ }
+ return CASID::create(Ctx.get(), toStringRef(BytesBuf));
+Expected<ObjectRef> PluginObjectStore::store(ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) {
+ SmallVector<llcas_objectid_t, 64> c_ids;
+ c_ids.reserve(Refs.size());
+ for (ObjectRef Ref : Refs) {
+ c_ids.push_back(llcas_objectid_t{Ref.getInternalRef(*this)});
+ }
+ llcas_objectid_t c_stored_id;
+ char *c_err = nullptr;
+ if (Ctx->Functions.cas_store_object(
+ Ctx->c_cas, llcas_data_t{Data.data(), Data.size()}, c_ids.data(),
+ c_ids.size(), &c_stored_id, &c_err))
+ return Ctx->errorAndDispose(c_err);
+ return ObjectRef::getFromInternalRef(*this, c_stored_id.opaque);
+static StringRef toStringRef(llcas_digest_t c_digest) {
+ return StringRef((const char *)c_digest.data, c_digest.size);
+CASID PluginObjectStore::getID(ObjectRef Ref) const {
+ llcas_objectid_t c_id{Ref.getInternalRef(*this)};
+ llcas_digest_t c_digest =
+ Ctx->Functions.objectid_get_digest(Ctx->c_cas, c_id);
+ return CASID::create(Ctx.get(), toStringRef(c_digest));
+PluginObjectStore::getReference(const CASID &ID) const {
+ ArrayRef<uint8_t> Hash = ID.getHash();
+ llcas_objectid_t c_id;
+ char *c_err = nullptr;
+ if (Ctx->Functions.cas_get_objectid(
+ Ctx->c_cas, llcas_digest_t{Hash.data(), Hash.size()}, &c_id, &c_err))
+ report_fatal_error(Ctx->errorAndDispose(c_err));
+ return ObjectRef::getFromInternalRef(*this, c_id.opaque);
+Expected<bool> PluginObjectStore::isMaterialized(ObjectRef Ref) const {
+ llcas_objectid_t c_id{Ref.getInternalRef(*this)};
+ char *c_err = nullptr;
+ llcas_lookup_result_t c_result = Ctx->Functions.cas_contains_object(
+ Ctx->c_cas, c_id, /*globally=*/false, &c_err);
+ switch (c_result) {
+ return true;
+ return false;
+ return Ctx->errorAndDispose(c_err);
+ }
+PluginObjectStore::loadIfExists(ObjectRef Ref) {
+ llcas_objectid_t c_id{Ref.getInternalRef(*this)};
+ llcas_loaded_object_t c_obj;
+ char *c_err = nullptr;
+ llcas_lookup_result_t c_result =
+ Ctx->Functions.cas_load_object(Ctx->c_cas, c_id, &c_obj, &c_err);
+ switch (c_result) {
+ return makeObjectHandle(c_obj.opaque);
+ return std::nullopt;
+ return Ctx->errorAndDispose(c_err);
+ }
+void PluginObjectStore::loadIfExistsAsync(
+ ObjectRef Ref,
+ unique_function<void(Expected<std::optional<ObjectHandle>>)> Callback) {
+ llcas_objectid_t c_id{Ref.getInternalRef(*this)};
+ struct LoadObjCtx {
+ std::shared_ptr<PluginObjectStore> CAS;
+ unique_function<void(Expected<std::optional<ObjectHandle>>)> Callback;
+ LoadObjCtx(
+ std::shared_ptr<PluginObjectStore> CAS,
+ unique_function<void(Expected<std::optional<ObjectHandle>>)> Callback)
+ : CAS(std::move(CAS)), Callback(std::move(Callback)) {}
+ };
+ auto LoadObjCB = [](void *c_ctx, llcas_lookup_result_t c_result,
+ llcas_loaded_object_t c_obj, char *c_err) {
+ auto getObjAndDispose =
+ [&](LoadObjCtx *Ctx) -> Expected<std::optional<ObjectHandle>> {
+ auto _ = make_scope_exit([Ctx]() { delete Ctx; });
+ switch (c_result) {
+ return Ctx->CAS->makeObjectHandle(c_obj.opaque);
+ return std::nullopt;
+ return Ctx->CAS->Ctx->errorAndDispose(c_err);
+ }
+ };
+ LoadObjCtx *Ctx = static_cast<LoadObjCtx *>(c_ctx);
+ auto Callback = std::move(Ctx->Callback);
+ Callback(getObjAndDispose(Ctx));
+ };
+ LoadObjCtx *CallCtx = new LoadObjCtx(shared_from_this(), std::move(Callback));
+ Ctx->Functions.cas_load_object_async(Ctx->c_cas, c_id, CallCtx, LoadObjCB);
+namespace {
+class ObjectRefsWrapper {
+ ObjectRefsWrapper(const ObjectHandle &Node, const PluginObjectStore &Store)
+ : Store(Store), Ctx(*Store.Ctx) {
+ llcas_loaded_object_t c_obj{Node.getInternalRef(Store)};
+ this->c_refs = Ctx.Functions.loaded_object_get_refs(Ctx.c_cas, c_obj);
+ }
+ size_t size() const {
+ return Ctx.Functions.object_refs_get_count(Ctx.c_cas, c_refs);
+ }
+ ObjectRef operator[](size_t I) const {
+ llcas_objectid_t c_id =
+ Ctx.Functions.object_refs_get_id(Ctx.c_cas, c_refs, I);
+ return ObjectRef::getFromInternalRef(Store, c_id.opaque);
+ }
+ const PluginObjectStore &Store;
+ PluginCASContext &Ctx;
+ llcas_object_refs_t c_refs;
+} // namespace
+// FIXME: Replace forEachRef/readRef/getNumRefs APIs with an iterator interface.
+Error PluginObjectStore::forEachRef(
+ ObjectHandle Node, function_ref<Error(ObjectRef)> Callback) const {
+ ObjectRefsWrapper Refs(Node, *this);
+ for (unsigned I = 0, E = Refs.size(); I != E; ++I) {
+ if (Error E = Callback(Refs[I]))
+ return E;
+ }
+ return Error::success();
+ObjectRef PluginObjectStore::readRef(ObjectHandle Node, size_t I) const {
+ ObjectRefsWrapper Refs(Node, *this);
+ return Refs[I];
+size_t PluginObjectStore::getNumRefs(ObjectHandle Node) const {
+ ObjectRefsWrapper Refs(Node, *this);
+ return Refs.size();
+// FIXME: Remove getDataSize(ObjectHandle) from the API requirements;
+// \c getData(ObjectHandle) should be enough.
+uint64_t PluginObjectStore::getDataSize(ObjectHandle Node) const {
+ ArrayRef<char> Data = getData(Node);
+ return Data.size();
+ArrayRef<char> PluginObjectStore::getData(ObjectHandle Node,
+ bool RequiresNullTerminator) const {
+ // FIXME: Remove RequiresNullTerminator from ObjectStore API requirement?
+ // It is a requirement for the plugin API.
+ llcas_data_t c_data = Ctx->Functions.loaded_object_get_data(
+ Ctx->c_cas, llcas_loaded_object_t{Node.getInternalRef(*this)});
+ return ArrayRef((const char *)c_data.data, c_data.size);
+PluginObjectStore::PluginObjectStore(std::shared_ptr<PluginCASContext> CASCtx)
+ : ObjectStore(*CASCtx), Ctx(std::move(CASCtx)) {}
+// ActionCache API
+namespace {
+class PluginActionCache : public ActionCache {
+ Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ResolvedKey,
+ bool Globally) const final;
+ void getImplAsync(ArrayRef<uint8_t> ResolvedKey, bool Globally,
+ unique_function<void(Expected<std::optional<CASID>>)>
+ Callback) const final;
+ Error putImpl(ArrayRef<uint8_t> ResolvedKey, const CASID &Result,
+ bool Globally) final;
+ void putImplAsync(ArrayRef<uint8_t> ResolvedKey, const CASID &Result,
+ bool Globally, unique_function<void(Error)> Callback) final;
+ PluginActionCache(std::shared_ptr<PluginCASContext>);
+ std::shared_ptr<PluginCASContext> Ctx;
+} // anonymous namespace
+PluginActionCache::getImpl(ArrayRef<uint8_t> ResolvedKey, bool Globally) const {
+ llcas_objectid_t c_value;
+ char *c_err = nullptr;
+ llcas_lookup_result_t c_result = Ctx->Functions.actioncache_get_for_digest(
+ Ctx->c_cas, llcas_digest_t{ResolvedKey.data(), ResolvedKey.size()},
+ &c_value, Globally, &c_err);
+ switch (c_result) {
+ llcas_digest_t c_digest =
+ Ctx->Functions.objectid_get_digest(Ctx->c_cas, c_value);
+ return CASID::create(Ctx.get(), toStringRef(c_digest));
+ }
+ return std::nullopt;
+ return Ctx->errorAndDispose(c_err);
+ }
+void PluginActionCache::getImplAsync(
+ ArrayRef<uint8_t> ResolvedKey, bool Globally,
+ unique_function<void(Expected<std::optional<CASID>>)> Callback) const {
+ struct CacheGetCtx {
+ std::shared_ptr<PluginCASContext> CASCtx;
+ unique_function<void(Expected<std::optional<CASID>>)> Callback;
+ };
+ auto CacheGetCB = [](void *c_ctx, llcas_lookup_result_t c_result,
+ llcas_objectid_t c_value, char *c_err) {
+ auto getValueAndDispose =
+ [&](CacheGetCtx *Ctx) -> Expected<std::optional<CASID>> {
+ auto _ = make_scope_exit([Ctx]() { delete Ctx; });
+ switch (c_result) {
+ llcas_digest_t c_digest = Ctx->CASCtx->Functions.objectid_get_digest(
+ Ctx->CASCtx->c_cas, c_value);
+ return CASID::create(Ctx->CASCtx.get(), toStringRef(c_digest));
+ }
+ return std::nullopt;
+ return Ctx->CASCtx->errorAndDispose(c_err);
+ }
+ };
+ CacheGetCtx *Ctx = static_cast<CacheGetCtx *>(c_ctx);
+ auto Callback = std::move(Ctx->Callback);
+ Callback(getValueAndDispose(Ctx));
+ };
+ CacheGetCtx *CallCtx = new CacheGetCtx{this->Ctx, std::move(Callback)};
+ Ctx->Functions.actioncache_get_for_digest_async(
+ Ctx->c_cas, llcas_digest_t{ResolvedKey.data(), ResolvedKey.size()},
+ Globally, CallCtx, CacheGetCB);
+Error PluginActionCache::putImpl(ArrayRef<uint8_t> ResolvedKey,
+ const CASID &Result, bool Globally) {
+ ArrayRef<uint8_t> Hash = Result.getHash();
+ llcas_objectid_t c_value;
+ char *c_err = nullptr;
+ if (Ctx->Functions.cas_get_objectid(Ctx->c_cas,
+ llcas_digest_t{Hash.data(), Hash.size()},
+ &c_value, &c_err))
+ return Ctx->errorAndDispose(c_err);
+ if (Ctx->Functions.actioncache_put_for_digest(
+ Ctx->c_cas, llcas_digest_t{ResolvedKey.data(), ResolvedKey.size()},
+ c_value, Globally, &c_err))
+ return Ctx->errorAndDispose(c_err);
+ return Error::success();
+void PluginActionCache::putImplAsync(ArrayRef<uint8_t> ResolvedKey,
+ const CASID &Result, bool Globally,
+ unique_function<void(Error)> Callback) {
+ ArrayRef<uint8_t> Hash = Result.getHash();
+ llcas_objectid_t c_value;
+ char *c_err = nullptr;
+ if (Ctx->Functions.cas_get_objectid(Ctx->c_cas,
+ llcas_digest_t{Hash.data(), Hash.size()},
+ &c_value, &c_err))
+ return Callback(Ctx->errorAndDispose(c_err));
+ struct CachePutCtx {
+ std::shared_ptr<PluginCASContext> CASCtx;
+ unique_function<void(Error)> Callback;
+ };
+ auto CachePutCB = [](void *c_ctx, bool failed, char *c_err) {
+ auto checkForErrorAndDispose = [&](CachePutCtx *Ctx) -> Error {
+ auto _ = make_scope_exit([Ctx]() { delete Ctx; });
+ if (failed)
+ return Ctx->CASCtx->errorAndDispose(c_err);
+ return Error::success();
+ };
+ CachePutCtx *Ctx = static_cast<CachePutCtx *>(c_ctx);
+ auto Callback = std::move(Ctx->Callback);
+ Callback(checkForErrorAndDispose(Ctx));
+ };
+ CachePutCtx *CallCtx = new CachePutCtx{this->Ctx, std::move(Callback)};
+ Ctx->Functions.actioncache_put_for_digest_async(
+ Ctx->c_cas, llcas_digest_t{ResolvedKey.data(), ResolvedKey.size()},
+ c_value, Globally, CallCtx, CachePutCB);
+PluginActionCache::PluginActionCache(std::shared_ptr<PluginCASContext> CASCtx)
+ : ActionCache(*CASCtx), Ctx(std::move(CASCtx)) {}
+// createPluginCASDatabases API
+Expected<std::pair<std::shared_ptr<ObjectStore>, std::shared_ptr<ActionCache>>>
+ StringRef PluginPath, StringRef OnDiskPath,
+ ArrayRef<std::pair<std::string, std::string>> PluginArgs) {
+ std::shared_ptr<PluginCASContext> Ctx;
+ if (Error E = PluginCASContext::create(PluginPath, OnDiskPath, PluginArgs)
+ .moveInto(Ctx))
+ return std::move(E);
+ auto CAS = std::make_shared<PluginObjectStore>(Ctx);
+ auto AC = std::make_shared<PluginActionCache>(std::move(Ctx));
+ return std::make_pair(std::move(CAS), std::move(AC));
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index fdf059cbd1f645..44032bec48ec12 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -60,6 +60,7 @@ configure_lit_site_cfg(
# NOTE: Sync the substitutions in test/lit.cfg when adding to this list.
+ CASPluginTest
diff --git a/llvm/tools/libCASPluginTest/CMakeLists.txt b/llvm/tools/libCASPluginTest/CMakeLists.txt
new file mode 100644
index 00000000000000..aaeba14fb40fb4
--- /dev/null
+++ b/llvm/tools/libCASPluginTest/CMakeLists.txt
@@ -0,0 +1,12 @@
+ Support
+ )
+ libCASPluginTest.cpp
+ )
+add_llvm_library(CASPluginTest SHARED ${SOURCES})
diff --git a/llvm/tools/libCASPluginTest/libCASPluginTest.cpp b/llvm/tools/libCASPluginTest/libCASPluginTest.cpp
new file mode 100644
index 00000000000000..b7bfde7f21a0b1
--- /dev/null
+++ b/llvm/tools/libCASPluginTest/libCASPluginTest.cpp
@@ -0,0 +1,572 @@
+//===- llvm/tools/libCASPluginTest/libCASPluginTest.cpp ---------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Implementation of the LLVM CAS plugin API, for testing purposes.
+#include "llvm-c/CAS/PluginAPI_functions.h"
+#include "llvm/CAS/BuiltinCASContext.h"
+#include "llvm/CAS/BuiltinObjectHasher.h"
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "llvm/Support/CBindingWrapping.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/ThreadPool.h"
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::builtin;
+using namespace llvm::cas::ondisk;
+static char *copyNewMallocString(StringRef Str) {
+ char *c_str = (char *)malloc(Str.size() + 1);
+ std::uninitialized_copy(Str.begin(), Str.end(), c_str);
+ c_str[Str.size()] = '\0';
+ return c_str;
+template <typename ResT>
+static ResT reportError(Error &&E, char **error, ResT Result = ResT()) {
+ if (error)
+ *error = copyNewMallocString(toString(std::move(E)));
+ return Result;
+void llcas_get_plugin_version(unsigned *major, unsigned *minor) {
+void llcas_string_dispose(char *str) { free(str); }
+namespace {
+struct CASPluginOptions {
+ std::string OnDiskPath;
+ std::string UpstreamPath;
+ std::string FirstPrefix;
+ std::string SecondPrefix;
+ bool SimulateMissingObjects = false;
+ bool Logging = true;
+ Error setOption(StringRef Name, StringRef Value);
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(CASPluginOptions, llcas_cas_options_t)
+} // namespace
+Error CASPluginOptions::setOption(StringRef Name, StringRef Value) {
+ if (Name == "first-prefix")
+ FirstPrefix = Value;
+ else if (Name == "second-prefix")
+ SecondPrefix = Value;
+ else if (Name == "upstream-path")
+ UpstreamPath = Value;
+ else if (Name == "simulate-missing-objects")
+ SimulateMissingObjects = true;
+ else if (Name == "no-logging")
+ Logging = false;
+ else
+ return createStringError(errc::invalid_argument,
+ Twine("unknown option: ") + Name);
+ return Error::success();
+llcas_cas_options_t llcas_cas_options_create(void) {
+ return wrap(new CASPluginOptions());
+void llcas_cas_options_dispose(llcas_cas_options_t c_opts) {
+ delete unwrap(c_opts);
+void llcas_cas_options_set_ondisk_path(llcas_cas_options_t c_opts,
+ const char *path) {
+ auto &Opts = *unwrap(c_opts);
+ Opts.OnDiskPath = path;
+bool llcas_cas_options_set_option(llcas_cas_options_t c_opts, const char *name,
+ const char *value, char **error) {
+ auto &Opts = *unwrap(c_opts);
+ if (Error E = Opts.setOption(name, value))
+ return reportError(std::move(E), error, true);
+ return false;
+namespace {
+struct CASWrapper {
+ std::string FirstPrefix;
+ std::string SecondPrefix;
+ /// If true, asynchronous "download" of an object will treat it as missing.
+ bool SimulateMissingObjects = false;
+ bool Logging = true;
+ std::unique_ptr<UnifiedOnDiskCache> DB;
+ /// Used for testing the \c globally parameter of action cache APIs. Simulates
+ /// "uploading"/"downloading" objects from/to the primary on-disk path.
+ std::unique_ptr<UnifiedOnDiskCache> UpstreamDB;
+ ThreadPool Pool{llvm::hardware_concurrency()};
+ std::mutex Lock{};
+ /// Check if the object is contained, in the "local" CAS only or "globally".
+ bool containsObject(ObjectID ID, bool Globally);
+ /// Load the object, potentially "downloading" it from upstream.
+ Expected<std::optional<ondisk::ObjectHandle>> loadObject(ObjectID ID);
+ /// "Uploads" a key and the associated full node graph.
+ Error upstreamKey(ArrayRef<uint8_t> Key, ObjectID Value);
+ /// "Downloads" the ID associated with the key but not the node data. The node
+ /// itself and the rest of the nodes in the graph will be "downloaded" lazily
+ /// as they are visited.
+ Expected<std::optional<ObjectID>> downstreamKey(ArrayRef<uint8_t> Key);
+ /// Synchronized access to \c llvm::errs().
+ void syncErrs(llvm::function_ref<void(raw_ostream &OS)> Fn) {
+ if (!Logging) {
+ // Ignore log output.
+ SmallString<32> Buf;
+ raw_svector_ostream OS(Buf);
+ Fn(OS);
+ return;
+ }
+ std::unique_lock<std::mutex> LockGuard(Lock);
+ Fn(errs());
+ errs().flush();
+ }
+ /// "Uploads" the full object node graph.
+ Expected<ObjectID> upstreamNode(ObjectID Node);
+ /// "Downloads" only a single object node. The rest of the nodes in the graph
+ /// will be "downloaded" lazily as they are visited.
+ Expected<ObjectID> downstreamNode(ObjectID Node);
+} // namespace
+bool CASWrapper::containsObject(ObjectID ID, bool Globally) {
+ if (DB->getGraphDB().containsObject(ID))
+ return true;
+ if (!Globally || !UpstreamDB)
+ return false;
+ ObjectID UpstreamID =
+ UpstreamDB->getGraphDB().getReference(DB->getGraphDB().getDigest(ID));
+ return UpstreamDB->getGraphDB().containsObject(UpstreamID);
+CASWrapper::loadObject(ObjectID ID) {
+ std::optional<ondisk::ObjectHandle> Obj;
+ if (Error E = DB->getGraphDB().load(ID).moveInto(Obj))
+ return std::move(E);
+ if (Obj)
+ return Obj;
+ if (!UpstreamDB)
+ return std::nullopt;
+ // Try "downloading" the node from upstream.
+ ObjectID UpstreamID =
+ UpstreamDB->getGraphDB().getReference(DB->getGraphDB().getDigest(ID));
+ std::optional<ObjectID> Ret;
+ if (Error E = downstreamNode(UpstreamID).moveInto(Ret))
+ return std::move(E);
+ return DB->getGraphDB().load(ID);
+/// Imports a single object node.
+static Expected<ObjectID> importNode(ObjectID FromID, OnDiskGraphDB &FromDB,
+ OnDiskGraphDB &ToDB) {
+ ObjectID ToID = ToDB.getReference(FromDB.getDigest(FromID));
+ if (ToDB.containsObject(ToID))
+ return ToID;
+ std::optional<ondisk::ObjectHandle> FromH;
+ if (Error E = FromDB.load(FromID).moveInto(FromH))
+ return std::move(E);
+ if (!FromH)
+ return ToID;
+ auto Data = FromDB.getObjectData(*FromH);
+ auto FromRefs = FromDB.getObjectRefs(*FromH);
+ SmallVector<ObjectID> Refs;
+ for (ObjectID FromRef : FromRefs)
+ Refs.push_back(ToDB.getReference(FromDB.getDigest(FromRef)));
+ if (Error E = ToDB.store(ToID, Refs, Data))
+ return std::move(E);
+ return ToID;
+Expected<ObjectID> CASWrapper::upstreamNode(ObjectID Node) {
+ OnDiskGraphDB &FromDB = DB->getGraphDB();
+ OnDiskGraphDB &ToDB = UpstreamDB->getGraphDB();
+ std::optional<ondisk::ObjectHandle> FromH;
+ if (Error E = FromDB.load(Node).moveInto(FromH))
+ return std::move(E);
+ if (!FromH)
+ return createStringError(errc::invalid_argument, "node doesn't exist");
+ for (ObjectID Ref : FromDB.getObjectRefs(*FromH)) {
+ std::optional<ObjectID> ID;
+ if (Error E = upstreamNode(Ref).moveInto(ID))
+ return std::move(E);
+ }
+ return importNode(Node, FromDB, ToDB);
+Expected<ObjectID> CASWrapper::downstreamNode(ObjectID Node) {
+ OnDiskGraphDB &FromDB = UpstreamDB->getGraphDB();
+ OnDiskGraphDB &ToDB = DB->getGraphDB();
+ return importNode(Node, FromDB, ToDB);
+Error CASWrapper::upstreamKey(ArrayRef<uint8_t> Key, ObjectID Value) {
+ if (!UpstreamDB)
+ return Error::success();
+ Expected<ObjectID> UpstreamVal = upstreamNode(Value);
+ if (!UpstreamVal)
+ return UpstreamVal.takeError();
+ Expected<ObjectID> PutValue = UpstreamDB->KVPut(Key, *UpstreamVal);
+ if (!PutValue)
+ return PutValue.takeError();
+ assert(*PutValue == *UpstreamVal);
+ return Error::success();
+CASWrapper::downstreamKey(ArrayRef<uint8_t> Key) {
+ if (!UpstreamDB)
+ return std::nullopt;
+ std::optional<ObjectID> UpstreamValue;
+ if (Error E = UpstreamDB->KVGet(Key).moveInto(UpstreamValue))
+ return std::move(E);
+ if (!UpstreamValue)
+ return std::nullopt;
+ ObjectID Value = DB->getGraphDB().getReference(
+ UpstreamDB->getGraphDB().getDigest(*UpstreamValue));
+ Expected<ObjectID> PutValue = DB->KVPut(Key, Value);
+ if (!PutValue)
+ return PutValue.takeError();
+ assert(*PutValue == Value);
+ return PutValue;
+llcas_cas_t llcas_cas_create(llcas_cas_options_t c_opts, char **error) { // Opens the on-disk cache(s) named in c_opts and returns a newly allocated, wrapped CASWrapper; open failures are returned through reportError.
+ auto &Opts = *unwrap(c_opts);
+ Expected<std::unique_ptr<UnifiedOnDiskCache>> DB = UnifiedOnDiskCache::open(
+ Opts.OnDiskPath, /*SizeLimit=*/std::nullopt,
+ BuiltinCASContext::getHashName(), sizeof(HashType));
+ if (!DB)
+ return reportError<llcas_cas_t>(DB.takeError(), error);
+ std::unique_ptr<UnifiedOnDiskCache> UpstreamDB; // stays null when Opts.UpstreamPath is empty
+ if (!Opts.UpstreamPath.empty()) {
+ if (Error E = UnifiedOnDiskCache::open( // same open parameters as the primary DB, different path
+ Opts.UpstreamPath, /*SizeLimit=*/std::nullopt,
+ BuiltinCASContext::getHashName(), sizeof(HashType))
+ .moveInto(UpstreamDB))
+ return reportError<llcas_cas_t>(std::move(E), error);
+ }
+ return wrap(new CASWrapper{Opts.FirstPrefix, Opts.SecondPrefix, // raw new; llcas_cas_dispose deletes the unwrapped pointer
+ Opts.SimulateMissingObjects, Opts.Logging,
+ std::move(*DB), std::move(UpstreamDB)});
+void llcas_cas_dispose(llcas_cas_t c_cas) { delete unwrap(c_cas); } // Deletes the object that unwrap(c_cas) yields.
+void llcas_cas_options_set_client_version(llcas_cas_options_t, unsigned major,
+ unsigned minor) { // Accepts the client version but does nothing with it; the options parameter is unnamed and major/minor are unused.
+ // Ignore for now.
+char *llcas_cas_get_hash_schema_name(llcas_cas_t) { // Returns a fixed schema name, independent of the (unnamed) CAS argument.
+ // Using same name as builtin CAS so that it's interchangeable for testing
+ // purposes.
+ return copyNewMallocString("llvm.cas.builtin.v2[BLAKE3]"); // string is produced by copyNewMallocString
+unsigned llcas_digest_parse(llcas_cas_t c_cas, const char *printed_digest,
+ uint8_t *bytes, size_t bytes_size, char **error) { // Parses printed_digest (after stripping both wrapper prefixes) into bytes and returns the digest size.
+ auto &Wrapper = *unwrap(c_cas);
+ if (bytes_size < sizeof(HashType))
+ return sizeof(HashType); // buffer too small: report the required size without parsing
+ StringRef PrintedDigest = printed_digest;
+ bool Consumed = PrintedDigest.consume_front(Wrapper.FirstPrefix);
+ assert(Consumed); // a missing prefix is only detected in assert-enabled builds
+ (void)Consumed; // keeps the variable used when asserts are compiled out
+ Consumed = PrintedDigest.consume_front(Wrapper.SecondPrefix);
+ assert(Consumed);
+ (void)Consumed;
+ Expected<HashType> Digest = BuiltinCASContext::parseID(PrintedDigest);
+ if (!Digest)
+ return reportError(Digest.takeError(), error, 0); // presumably yields the 0 passed as last argument; confirm against reportError
+ std::uninitialized_copy(Digest->begin(), Digest->end(), bytes);
+ return Digest->size();
+bool llcas_digest_print(llcas_cas_t c_cas, llcas_digest_t c_digest,
+ char **printed_id, char **error) { // Writes FirstPrefix + SecondPrefix + printed digest to *printed_id; error is never written in this function.
+ auto &Wrapper = *unwrap(c_cas);
+ SmallString<74> PrintDigest;
+ raw_svector_ostream OS(PrintDigest);
+ // Include these for testing purposes.
+ OS << Wrapper.FirstPrefix << Wrapper.SecondPrefix;
+ BuiltinCASContext::printID(ArrayRef(c_digest.data, c_digest.size), OS);
+ *printed_id = copyNewMallocString(PrintDigest); // string is produced by copyNewMallocString
+ return false; // the only return in this function
+bool llcas_cas_get_objectid(llcas_cas_t c_cas, llcas_digest_t c_digest,
+ llcas_objectid_t *c_id_p, char **error) { // Maps a digest to the local graph DB's ObjectID and writes its opaque form to *c_id_p; error is never written here.
+ auto &CAS = unwrap(c_cas)->DB->getGraphDB();
+ ObjectID ID = CAS.getReference(ArrayRef(c_digest.data, c_digest.size));
+ *c_id_p = llcas_objectid_t{ID.getOpaqueData()};
+ return false; // the only return in this function
+llcas_digest_t llcas_objectid_get_digest(llcas_cas_t c_cas,
+ llcas_objectid_t c_id) { // Returns the digest the local graph DB reports for c_id.
+ auto &CAS = unwrap(c_cas)->DB->getGraphDB();
+ ObjectID ID = ObjectID::fromOpaqueData(c_id.opaque);
+ ArrayRef<uint8_t> Digest = CAS.getDigest(ID);
+ return llcas_digest_t{Digest.data(), Digest.size()}; // points at the bytes getDigest returned; nothing is copied here
+llcas_lookup_result_t llcas_cas_contains_object(llcas_cas_t c_cas,
+ llcas_objectid_t c_id,
+ bool globally, char **error) { // Result is derived from CASWrapper::containsObject(ID, globally); error is not written in the visible lines.
+ ObjectID ID = ObjectID::fromOpaqueData(c_id.opaque);
+ return unwrap(c_cas)->containsObject(ID, globally) // NOTE(review): the statement is unterminated; the rest of the expression appears to be missing from this text
+llcas_lookup_result_t llcas_cas_load_object(llcas_cas_t c_cas,
+ llcas_objectid_t c_id,
+ llcas_loaded_object_t *c_obj_p,
+ char **error) { // Loads c_id through CASWrapper::loadObject and writes the handle's opaque form to *c_obj_p.
+ ObjectID ID = ObjectID::fromOpaqueData(c_id.opaque);
+ Expected<std::optional<ondisk::ObjectHandle>> ObjOpt =
+ unwrap(c_cas)->loadObject(ID);
+ if (!ObjOpt)
+ return reportError(ObjOpt.takeError(), error, LLCAS_LOOKUP_RESULT_ERROR);
+ if (!*ObjOpt) // NOTE(review): the statement guarded by this check appears to be missing from this text; as written it would guard the next line
+ ondisk::ObjectHandle Obj = **ObjOpt;
+ *c_obj_p = llcas_loaded_object_t{Obj.getOpaqueData()}; // NOTE(review): no final return statement is present in this text
+void llcas_cas_load_object_async(llcas_cas_t c_cas, llcas_objectid_t c_id,
+ void *ctx_cb, llcas_cas_load_object_cb cb) { // Loads c_id and reports through cb: inline when DB already contains the object or there is no UpstreamDB, otherwise from a task on the wrapper's Pool.
+ std::string PrintedDigest; // printed digest, used only in the log lines below
+ {
+ llcas_digest_t c_digest = llcas_objectid_get_digest(c_cas, c_id);
+ char *printed_id;
+ char *c_err; // only read when llcas_digest_print reports failure
+ bool failed = llcas_digest_print(c_cas, c_digest, &printed_id, &c_err);
+ if (failed)
+ report_fatal_error(Twine("digest printing failed: ") + c_err);
+ PrintedDigest = printed_id;
+ llcas_string_dispose(printed_id); // the std::string above holds its own copy
+ }
+ auto passObject = [ctx_cb, // adapts an Expected<optional<ObjectHandle>> to the callback's (ctx, result, object, error) arguments
+ cb](Expected<std::optional<ondisk::ObjectHandle>> Obj) {
+ if (!Obj) {
+ cb(ctx_cb, LLCAS_LOOKUP_RESULT_ERROR, llcas_loaded_object_t(),
+ copyNewMallocString(toString(Obj.takeError())));
+ } else if (!*Obj) {
+ cb(ctx_cb, LLCAS_LOOKUP_RESULT_NOTFOUND, llcas_loaded_object_t(),
+ nullptr);
+ } else { // NOTE(review): the opening of the cb(...) call for this branch appears to be missing from this text
+ llcas_loaded_object_t{(*Obj)->getOpaqueData()}, nullptr);
+ }
+ };
+ auto &CAS = unwrap(c_cas)->DB->getGraphDB();
+ ObjectID ID = ObjectID::fromOpaqueData(c_id.opaque);
+ if (CAS.containsObject(ID)) { // already in the local graph DB: answer on the calling thread
+ unwrap(c_cas)->syncErrs([&](raw_ostream &OS) {
+ OS << "load_object_async existing: " << PrintedDigest << '\n';
+ });
+ return passObject(unwrap(c_cas)->loadObject(ID));
+ }
+ if (!unwrap(c_cas)->UpstreamDB)
+ return passObject(std::nullopt); // no upstream: reported as not found, on the calling thread
+ // Try "downloading" the node from upstream.
+ unwrap(c_cas)->syncErrs([&](raw_ostream &OS) {
+ OS << "load_object_async downstream begin: " << PrintedDigest << '\n';
+ });
+ unwrap(c_cas)->Pool.async([=] { // [=] copies c_cas, ID, PrintedDigest and passObject into the task
+ // Wait a bit for the caller to proceed.
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ auto &Wrap = *unwrap(c_cas);
+ Wrap.syncErrs([&](raw_ostream &OS) {
+ OS << "load_object_async downstream end: " << PrintedDigest << '\n';
+ });
+ if (Wrap.SimulateMissingObjects)
+ return passObject(std::nullopt); // test option: report not found without calling loadObject
+ passObject(Wrap.loadObject(ID));
+ });
+bool llcas_cas_store_object(llcas_cas_t c_cas, llcas_data_t c_data,
+ const llcas_objectid_t *c_refs, size_t c_refs_count,
+ llcas_objectid_t *c_id_p, char **error) {
+ // Stores an object made of c_data and the c_refs_count references in c_refs
+ // into the local graph DB and writes its ID to *c_id_p. Returns false on
+ // success; a store failure is returned through reportError(..., error, true).
+ auto &CAS = unwrap(c_cas)->DB->getGraphDB();
+ SmallVector<ObjectID, 64> Refs;
+ Refs.reserve(c_refs_count);
+ // Index with size_t so the counter has the same width as c_refs_count.
+ for (size_t I = 0; I != c_refs_count; ++I) {
+ Refs.push_back(ObjectID::fromOpaqueData(c_refs[I].opaque));
+ }
+ ArrayRef Data((const char *)c_data.data, c_data.size);
+ // The object's digest is computed from the digests of its references plus
+ // its data; the ID to store under is the DB's reference for that digest.
+ SmallVector<ArrayRef<uint8_t>, 8> RefHashes;
+ RefHashes.reserve(c_refs_count);
+ for (ObjectID Ref : Refs)
+ RefHashes.push_back(CAS.getDigest(Ref));
+ HashType Digest = BuiltinObjectHasher<HasherT>::hashObject(RefHashes, Data);
+ ObjectID StoredID = CAS.getReference(Digest);
+ if (Error E = CAS.store(StoredID, Refs, Data))
+ return reportError(std::move(E), error, true);
+ *c_id_p = llcas_objectid_t{StoredID.getOpaqueData()};
+ return false;
+llcas_data_t llcas_loaded_object_get_data(llcas_cas_t c_cas,
+ llcas_loaded_object_t c_obj) { // Returns the data the local graph DB reports for the loaded object c_obj.
+ auto &CAS = unwrap(c_cas)->DB->getGraphDB();
+ ondisk::ObjectHandle Obj = ondisk::ObjectHandle::fromOpaqueData(c_obj.opaque);
+ auto Data = CAS.getObjectData(Obj);
+ return llcas_data_t{Data.data(), Data.size()}; // points at what getObjectData returned; nothing is copied here
+llcas_object_refs_t llcas_loaded_object_get_refs(llcas_cas_t c_cas,
+ llcas_loaded_object_t c_obj) { // Returns the loaded object's reference range as a pair of opaque begin/end iterator values.
+ auto &CAS = unwrap(c_cas)->DB->getGraphDB();
+ ondisk::ObjectHandle Obj = ondisk::ObjectHandle::fromOpaqueData(c_obj.opaque);
+ auto Refs = CAS.getObjectRefs(Obj);
+ return llcas_object_refs_t{Refs.begin().getOpaqueData(),
+ Refs.end().getOpaqueData()};
+size_t llcas_object_refs_get_count(llcas_cas_t c_cas,
+ llcas_object_refs_t c_refs) { // Number of references in c_refs, computed as end minus begin; c_cas is not used.
+ auto B = object_refs_iterator::fromOpaqueData(c_refs.opaque_b);
+ auto E = object_refs_iterator::fromOpaqueData(c_refs.opaque_e);
+ return E - B;
+llcas_objectid_t llcas_object_refs_get_id(llcas_cas_t c_cas,
+ llcas_object_refs_t c_refs,
+ size_t index) { // Returns the ID at position index of c_refs; c_cas is not used.
+ auto RefsI = object_refs_iterator::fromOpaqueData(c_refs.opaque_b);
+ ObjectID Ref = *(RefsI + index); // index is not checked against the end iterator here
+ return llcas_objectid_t{Ref.getOpaqueData()};
+llcas_actioncache_get_for_digest(llcas_cas_t c_cas, llcas_digest_t c_key,
+ llcas_objectid_t *p_value, bool globally,
+ char **error) { // Looks c_key up in the local DB, with a fallback to CASWrapper::downstreamKey, and writes the found ID to *p_value. NOTE(review): the return-type line is not present in this text.
+ auto &Wrap = *unwrap(c_cas);
+ auto &DB = *Wrap.DB;
+ ArrayRef Key(c_key.data, c_key.size);
+ std::optional<ObjectID> Value;
+ if (Error E = DB.KVGet(Key).moveInto(Value))
+ return reportError(std::move(E), error, LLCAS_LOOKUP_RESULT_ERROR);
+ if (!Value) { // not in the local DB
+ if (!globally) // NOTE(review): the statement guarded by this check appears to be missing from this text; as written it would guard the downstreamKey call
+ if (Error E = Wrap.downstreamKey(Key).moveInto(Value))
+ return reportError(std::move(E), error, LLCAS_LOOKUP_RESULT_ERROR);
+ if (!Value) // NOTE(review): the statement guarded by this check appears to be missing from this text
+ }
+ *p_value = llcas_objectid_t{Value->getOpaqueData()}; // NOTE(review): no final return statement is present in this text
+void llcas_actioncache_get_for_digest_async(llcas_cas_t c_cas,
+ llcas_digest_t c_key, bool globally,
+ void *ctx_cb,
+ llcas_actioncache_get_cb cb) {
+ // Runs llcas_actioncache_get_for_digest on the wrapper's Pool and reports
+ // the outcome through cb(ctx_cb, result, value, error).
+ ArrayRef Key(c_key.data, c_key.size);
+ // The task captures this copy of the key bytes, not c_key.data.
+ SmallVector<uint8_t, 32> KeyBuf(Key);
+ unwrap(c_cas)->Pool.async([=] {
+ // Both out-values are always handed to cb, but the synchronous call only
+ // assigns *p_value at its end and only passes `error` to reportError, so
+ // give them defined values up front.
+ llcas_objectid_t c_value{};
+ char *c_err = nullptr;
+ llcas_lookup_result_t result = llcas_actioncache_get_for_digest(
+ c_cas, llcas_digest_t{KeyBuf.data(), KeyBuf.size()}, &c_value, globally,
+ &c_err);
+ cb(ctx_cb, result, c_value, c_err);
+ });
+bool llcas_actioncache_put_for_digest(llcas_cas_t c_cas, llcas_digest_t c_key,
+ llcas_objectid_t c_value, bool globally,
+ char **error) { // Associates c_key with c_value in the local DB and, when globally is set, also via CASWrapper::upstreamKey. Returns false on success.
+ auto &Wrap = *unwrap(c_cas);
+ auto &DB = *Wrap.DB;
+ ObjectID Value = ObjectID::fromOpaqueData(c_value.opaque);
+ ArrayRef Key(c_key.data, c_key.size);
+ Expected<ObjectID> Ret = DB.KVPut(Key, Value);
+ if (!Ret)
+ return reportError(Ret.takeError(), error, true);
+ if (*Ret != Value) // KVPut returned a different ID than the one being stored
+ return reportError(
+ createStringError(errc::invalid_argument, "cache poisoned"), error,
+ true);
+ if (globally) {
+ if (Error E = Wrap.upstreamKey(Key, Value)) // upstreamKey itself succeeds without doing anything when there is no UpstreamDB
+ return reportError(std::move(E), error, true);
+ }
+ return false;
+void llcas_actioncache_put_for_digest_async(llcas_cas_t c_cas,
+ llcas_digest_t c_key,
+ llcas_objectid_t c_value,
+ bool globally, void *ctx_cb,
+ llcas_actioncache_put_cb cb) {
+ // Runs llcas_actioncache_put_for_digest on the wrapper's Pool and reports
+ // the outcome through cb(ctx_cb, failed, error).
+ ArrayRef Key(c_key.data, c_key.size);
+ // The task captures this copy of the key bytes, not c_key.data.
+ SmallVector<uint8_t, 32> KeyBuf(Key);
+ unwrap(c_cas)->Pool.async([=] {
+ // c_err is always handed to cb, but the synchronous call only passes it to
+ // reportError on its failure paths, so start it out as null.
+ char *c_err = nullptr;
+ bool failed = llcas_actioncache_put_for_digest(
+ c_cas, llcas_digest_t{KeyBuf.data(), KeyBuf.size()}, c_value, globally,
+ &c_err);
+ cb(ctx_cb, failed, c_err);
+ });
diff --git a/llvm/tools/libCASPluginTest/libCASPluginTest.exports b/llvm/tools/libCASPluginTest/libCASPluginTest.exports
new file mode 100644
index 00000000000000..8fda2c5559c92f
--- /dev/null
+++ b/llvm/tools/libCASPluginTest/libCASPluginTest.exports
@@ -0,0 +1,26 @@
diff --git a/llvm/tools/llvm-cas/llvm-cas.cpp b/llvm/tools/llvm-cas/llvm-cas.cpp
index fb1bd3df961cfe..f5341a89e1a76e 100644
--- a/llvm/tools/llvm-cas/llvm-cas.cpp
+++ b/llvm/tools/llvm-cas/llvm-cas.cpp
@@ -8,6 +8,7 @@
#include "llvm/CAS/ActionCache.h"
#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
+#include "llvm/CAS/CASRegistry.h"
#include "llvm/CAS/ObjectStore.h"
#include "llvm/CAS/TreeSchema.h"
#include "llvm/Support/CommandLine.h"
@@ -116,12 +117,15 @@ int main(int Argc, char **Argv) {
std::shared_ptr<ObjectStore> CAS;
std::shared_ptr<ActionCache> AC;
- std::tie(CAS, AC) = ExitOnErr(createOnDiskUnifiedCASDatabases(CASPath));
- assert(CAS);
+ if (isRegisteredCASIdentifier(CASPath))
+ std::tie(CAS, AC) = ExitOnErr(createCASFromIdentifier(CASPath));
+ else
+ std::tie(CAS, AC) = ExitOnErr(createOnDiskUnifiedCASDatabases(CASPath));
std::shared_ptr<ObjectStore> UpstreamCAS;
if (!UpstreamCASPath.empty())
- UpstreamCAS = ExitOnErr(createCASFromIdentifier(UpstreamCASPath));
+ UpstreamCAS =
+ std::move(ExitOnErr(createCASFromIdentifier(UpstreamCASPath)).first);
if (Command == Dump)
return dump(*CAS);
diff --git a/llvm/unittests/CAS/ActionCacheTest.cpp b/llvm/unittests/CAS/ActionCacheTest.cpp
index 1f5fdaa9003e23..cf3fec47ec5f65 100644
--- a/llvm/unittests/CAS/ActionCacheTest.cpp
+++ b/llvm/unittests/CAS/ActionCacheTest.cpp
@@ -19,7 +19,7 @@ using namespace llvm::cas;
TEST_P(CASTest, ActionCacheHit) {
std::shared_ptr<ObjectStore> CAS = createObjectStore();
- std::unique_ptr<ActionCache> Cache = createActionCache();
+ std::shared_ptr<ActionCache> Cache = createActionCache();
std::optional<ObjectProxy> ID;
ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID),
@@ -35,7 +35,7 @@ TEST_P(CASTest, ActionCacheHit) {
TEST_P(CASTest, ActionCacheMiss) {
std::shared_ptr<ObjectStore> CAS = createObjectStore();
- std::unique_ptr<ActionCache> Cache = createActionCache();
+ std::shared_ptr<ActionCache> Cache = createActionCache();
std::optional<ObjectProxy> ID1, ID2;
ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1),
@@ -60,7 +60,7 @@ TEST_P(CASTest, ActionCacheMiss) {
TEST_P(CASTest, ActionCacheRewrite) {
std::shared_ptr<ObjectStore> CAS = createObjectStore();
- std::unique_ptr<ActionCache> Cache = createActionCache();
+ std::shared_ptr<ActionCache> Cache = createActionCache();
std::optional<ObjectProxy> ID1, ID2;
ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1),
@@ -111,7 +111,7 @@ TEST(OnDiskActionCache, ActionCacheResultInvalid) {
TEST_P(CASTest, ActionCacheAsync) {
std::shared_ptr<ObjectStore> CAS = createObjectStore();
- std::unique_ptr<ActionCache> Cache = createActionCache();
+ std::shared_ptr<ActionCache> Cache = createActionCache();
std::optional<ObjectProxy> ID;
diff --git a/llvm/unittests/CAS/CASTestConfig.cpp b/llvm/unittests/CAS/CASTestConfig.cpp
index 4747ea9d8aa028..8423dbf7ab9b25 100644
--- a/llvm/unittests/CAS/CASTestConfig.cpp
+++ b/llvm/unittests/CAS/CASTestConfig.cpp
@@ -9,12 +9,32 @@
#include "CASTestConfig.h"
#include "llvm/CAS/ActionCache.h"
#include "llvm/CAS/ObjectStore.h"
+#include "llvm/CAS/PluginCAS.h"
+#include "llvm/Config/config.h"
#include "gtest/gtest.h"
+#include <memory>
#include <mutex>
using namespace llvm;
using namespace llvm::cas;
+// See llvm/utils/unittest/UnitTestMain/TestMain.cpp
+extern const char *TestMainArgv0;
+// Just a reachable symbol to ease resolving of the executable's path.
+static std::string TestStringArg1("plugincas-test-string-arg1");
+std::string llvm::unittest::cas::getCASPluginPath() { // Builds the path where the libCASPluginTest plugin is expected, relative to the running test executable.
+ std::string Executable =
+ sys::fs::getMainExecutable(TestMainArgv0, &TestStringArg1);
+ llvm::SmallString<256> PathBuf(sys::path::parent_path( // three nested parent_path calls strip the last three path components
+ sys::path::parent_path(sys::path::parent_path(Executable))));
+ std::string LibName = "libCASPluginTest";
+ sys::path::append(PathBuf, "lib", LibName + LLVM_PLUGIN_EXT); // <PathBuf>/lib/libCASPluginTest followed by LLVM_PLUGIN_EXT
+ return std::string(PathBuf);
TestingAndDir createInMemory(int I) {
std::unique_ptr<ObjectStore> CAS = createInMemoryCAS();
std::unique_ptr<ActionCache> Cache = createInMemoryActionCache();
@@ -48,4 +68,21 @@ TestingAndDir createOnDisk(int I) {
return TestingAndDir{std::move(CAS), std::move(Cache), std::move(Temp)};
INSTANTIATE_TEST_SUITE_P(OnDiskCAS, CASTest, ::testing::Values(createOnDisk));
+TestingAndDir createPluginCASImpl(int I) { // Creates plugin-backed CAS and action-cache databases in a fresh temp dir and packages them with that dir; I is not used in the visible lines.
+ using namespace llvm::unittest::cas;
+ unittest::TempDir Temp("plugin-cas", /*Unique=*/true);
+ std::optional<
+ std::pair<std::shared_ptr<ObjectStore>, std::shared_ptr<ActionCache>>>
+ DBs;
+ createPluginCASDatabases(getCASPluginPath(), Temp.path(), {}) // NOTE(review): the line opening the macro call that the "Succeeded());" below closes appears to be missing from this text
+ .moveInto(DBs),
+ Succeeded());
+ return TestingAndDir{std::move(DBs->first), std::move(DBs->second), // Temp is moved into the result along with both databases
+ std::move(Temp)};
+ ::testing::Values(createPluginCASImpl));
diff --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h
index 3917fd6378d34c..830c6015488bd1 100644
--- a/llvm/unittests/CAS/CASTestConfig.h
+++ b/llvm/unittests/CAS/CASTestConfig.h
@@ -17,9 +17,13 @@
#include "gtest/gtest.h"
#include <memory>
+namespace llvm::unittest::cas {
+std::string getCASPluginPath();
+} // namespace llvm::unittest::cas
struct TestingAndDir {
std::shared_ptr<llvm::cas::ObjectStore> CAS;
- std::unique_ptr<llvm::cas::ActionCache> Cache;
+ std::shared_ptr<llvm::cas::ActionCache> Cache;
std::optional<llvm::unittest::TempDir> Temp;
@@ -36,7 +40,7 @@ class CASTest
return std::move(TD.CAS);
- std::unique_ptr<llvm::cas::ActionCache> createActionCache() {
+ std::shared_ptr<llvm::cas::ActionCache> createActionCache() {
auto TD = GetParam()(++(*NextCASIndex));
if (TD.Temp)
diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt
index 8a82d8b8df3bfb..4c81e0f4728406 100644
--- a/llvm/unittests/CAS/CMakeLists.txt
+++ b/llvm/unittests/CAS/CMakeLists.txt
@@ -1,5 +1,6 @@
@@ -17,8 +18,12 @@ add_llvm_unittest(CASTests
+ PluginCASTest.cpp
target_link_libraries(CASTests PRIVATE LLVMTestingSupport)
diff --git a/llvm/unittests/CAS/PluginCASTest.cpp b/llvm/unittests/CAS/PluginCASTest.cpp
new file mode 100644
index 00000000000000..d11bde4a0dfda4
--- /dev/null
+++ b/llvm/unittests/CAS/PluginCASTest.cpp
@@ -0,0 +1,93 @@
+//===- llvm/unittest/CAS/PluginCASTest.cpp --------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "CASTestConfig.h"
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/CAS/PluginCAS.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::unittest::cas;
+TEST(PluginCASTest, isMaterialized) { // Checks ObjectStore::isMaterialized before and after the "down" directory is removed while the "up" directory is kept.
+ unittest::TempDir Temp("plugin-cas", /*Unique=*/true);
+ std::string UpDir(Temp.path("up"));
+ std::string DownDir(Temp.path("down"));
+ std::pair<std::string, std::string> PluginOpts[] = {
+ {"upstream-path", std::string(UpDir)}}; // UpDir is handed to the plugin as its upstream-path option
+ {
+ std::optional<
+ std::pair<std::shared_ptr<ObjectStore>, std::shared_ptr<ActionCache>>>
+ DBs;
+ createPluginCASDatabases(getCASPluginPath(), DownDir, PluginOpts) // NOTE(review): the line opening the macro call that the "Succeeded());" below closes appears to be missing from this text
+ .moveInto(DBs),
+ Succeeded());
+ std::shared_ptr<ObjectStore> CAS;
+ std::shared_ptr<ActionCache> AC;
+ std::tie(CAS, AC) = std::move(*DBs);
+ std::optional<CASID> ID1, ID2;
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1),
+ Succeeded());
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2),
+ Succeeded());
+ std::optional<ObjectRef> ID2Ref = CAS->getReference(*ID2);
+ bool IsMaterialized = false;
+ ASSERT_THAT_ERROR(CAS->isMaterialized(*ID2Ref).moveInto(IsMaterialized),
+ Succeeded());
+ EXPECT_TRUE(IsMaterialized); // object "2" was just created through this CAS
+ ASSERT_THAT_ERROR(AC->put(*ID1, *ID2, /*Globally=*/true), Succeeded()); // key ID1 -> value ID2, stored with Globally=true
+ }
+ // Clear "local" cache.
+ sys::fs::remove_directories(DownDir);
+ {
+ std::optional<
+ std::pair<std::shared_ptr<ObjectStore>, std::shared_ptr<ActionCache>>>
+ DBs;
+ createPluginCASDatabases(getCASPluginPath(), DownDir, PluginOpts) // NOTE(review): same missing macro-opening line as in the first scope
+ .moveInto(DBs),
+ Succeeded());
+ std::shared_ptr<ObjectStore> CAS;
+ std::shared_ptr<ActionCache> AC;
+ std::tie(CAS, AC) = std::move(*DBs);
+ std::optional<CASID> ID1, ID2;
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1),
+ Succeeded());
+ ASSERT_THAT_ERROR(AC->get(*ID1, /*Globally=*/true).moveInto(ID2), // ID2 now comes from the action cache rather than from createProxy
+ Succeeded());
+ std::optional<ObjectRef> ID2Ref = CAS->getReference(*ID2);
+ bool IsMaterialized = false;
+ ASSERT_THAT_ERROR(CAS->isMaterialized(*ID2Ref).moveInto(IsMaterialized),
+ Succeeded());
+ EXPECT_FALSE(IsMaterialized); // expected not materialized before getProxy is called
+ std::optional<ObjectProxy> Obj;
+ ASSERT_THAT_ERROR(CAS->getProxy(*ID2Ref).moveInto(Obj), Succeeded());
+ ASSERT_THAT_ERROR(CAS->isMaterialized(*ID2Ref).moveInto(IsMaterialized),
+ Succeeded());
+ EXPECT_TRUE(IsMaterialized); // expected materialized once getProxy has succeeded
+ }
>From aecf3631104fcf88fa0810bcb99969b21c27508c Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu at apple.com>
Date: Tue, 10 Oct 2023 14:29:40 -0700
Subject: [PATCH 08/11] Add modulemap for LLVM CAS
llvm/include/module.modulemap | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/llvm/include/module.modulemap b/llvm/include/module.modulemap
index b00da6d7cd28c7..d44d395fa8ef46 100644
--- a/llvm/include/module.modulemap
+++ b/llvm/include/module.modulemap
@@ -105,6 +105,12 @@ module LLVM_BinaryFormat {
textual header "llvm/BinaryFormat/MsgPack.def"
+module LLVM_CAS {
+ requires cplusplus
+ umbrella "llvm/CAS"
+ module * { export * }
module LLVM_Config {
requires cplusplus
umbrella "llvm/Config"
>From 3f26afcfa33f1caab2cd9a8623e5e39022ed1239 Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu at apple.com>
Date: Fri, 13 Oct 2023 14:32:40 -0700
Subject: [PATCH 09/11] Try fix msvc test failures
llvm/include/llvm/CAS/ActionCache.h | 30 +++++++++++++++-----------
llvm/lib/CAS/ActionCache.cpp | 4 ++++
llvm/unittests/CAS/ActionCacheTest.cpp | 4 ++++
llvm/unittests/CAS/CASTestConfig.cpp | 2 ++
llvm/unittests/CAS/ObjectStoreTest.cpp | 2 ++
5 files changed, 30 insertions(+), 12 deletions(-)
diff --git a/llvm/include/llvm/CAS/ActionCache.h b/llvm/include/llvm/CAS/ActionCache.h
index 134c586fa0a9a7..e4ac687ae28543 100644
--- a/llvm/include/llvm/CAS/ActionCache.h
+++ b/llvm/include/llvm/CAS/ActionCache.h
@@ -72,18 +72,6 @@ class ActionCache {
return getImpl(arrayRefFromStringRef(ActionKey.getKey()), Globally);
- /// Asynchronous version of \c get.
- std::future<AsyncCASIDValue> getFuture(const CacheKey &ActionKey,
- bool Globally = false) const;
- /// Asynchronous version of \c get.
- void getAsync(
- const CacheKey &ActionKey, bool Globally,
- unique_function<void(Expected<std::optional<CASID>>)> Callback) const {
- return getImplAsync(arrayRefFromStringRef(ActionKey.getKey()), Globally,
- std::move(Callback));
- }
/// Cache \p Result for the \p ActionKey computation.
/// \param Globally if true it is a hint to the underlying implementation that
@@ -97,6 +85,23 @@ class ActionCache {
return putImpl(arrayRefFromStringRef(ActionKey.getKey()), Result, Globally);
+#ifndef _MSC_VER
+ /// FIXME: MSVC doesn't compile Error within Promise/Future correctly and will
+ /// result in unchecked error. Disable AsyncAPIs when using MSVC for now.
+ /// Asynchronous version of \c get.
+ std::future<AsyncCASIDValue> getFuture(const CacheKey &ActionKey,
+ bool Globally = false) const;
+ /// Asynchronous version of \c get.
+ void getAsync(
+ const CacheKey &ActionKey, bool Globally,
+ unique_function<void(Expected<std::optional<CASID>>)> Callback) const {
+ return getImplAsync(arrayRefFromStringRef(ActionKey.getKey()), Globally,
+ std::move(Callback));
+ }
/// Asynchronous version of \c put.
std::future<AsyncErrorValue> putFuture(const CacheKey &ActionKey,
const CASID &Result,
@@ -111,6 +116,7 @@ class ActionCache {
return putImplAsync(arrayRefFromStringRef(ActionKey.getKey()), Result,
Globally, std::move(Callback));
virtual ~ActionCache() = default;
diff --git a/llvm/lib/CAS/ActionCache.cpp b/llvm/lib/CAS/ActionCache.cpp
index ded1fc4879fc0b..b7d83ebf1722c0 100644
--- a/llvm/lib/CAS/ActionCache.cpp
+++ b/llvm/lib/CAS/ActionCache.cpp
@@ -21,6 +21,9 @@ CacheKey::CacheKey(const ObjectProxy &Proxy)
CacheKey::CacheKey(const ObjectStore &CAS, const ObjectRef &Ref)
: Key(toStringRef(CAS.getID(Ref).getHash())) {}
+#ifndef _MSC_VER
+/// FIXME: MSVC doesn't compile Error within Promise/Future correctly and will
+/// result in unchecked error. Disable AsyncAPIs when using MSVC for now.
std::future<AsyncCASIDValue> ActionCache::getFuture(const CacheKey &ActionKey,
bool Globally) const {
std::promise<AsyncCASIDValue> Promise;
@@ -44,6 +47,7 @@ std::future<AsyncErrorValue> ActionCache::putFuture(const CacheKey &ActionKey,
return Future;
void ActionCache::getImplAsync(
ArrayRef<uint8_t> ResolvedKey, bool Globally,
diff --git a/llvm/unittests/CAS/ActionCacheTest.cpp b/llvm/unittests/CAS/ActionCacheTest.cpp
index cf3fec47ec5f65..566630e8a3c363 100644
--- a/llvm/unittests/CAS/ActionCacheTest.cpp
+++ b/llvm/unittests/CAS/ActionCacheTest.cpp
@@ -109,6 +109,9 @@ TEST(OnDiskActionCache, ActionCacheResultInvalid) {
+#ifndef _MSC_VER
+/// FIXME: MSVC doesn't compile Error within Promise/Future correctly and will
+/// result in unchecked error. Disable AsyncAPIs when using MSVC for now.
TEST_P(CASTest, ActionCacheAsync) {
std::shared_ptr<ObjectStore> CAS = createObjectStore();
std::shared_ptr<ActionCache> Cache = createActionCache();
@@ -150,3 +153,4 @@ TEST_P(CASTest, ActionCacheAsync) {
diff --git a/llvm/unittests/CAS/CASTestConfig.cpp b/llvm/unittests/CAS/CASTestConfig.cpp
index 8423dbf7ab9b25..c54ced747b7f63 100644
--- a/llvm/unittests/CAS/CASTestConfig.cpp
+++ b/llvm/unittests/CAS/CASTestConfig.cpp
@@ -45,6 +45,7 @@ INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest,
+#ifndef _WIN32
__attribute__((constructor)) static void configureCASTestEnv() {
// Restrict the size of the on-disk CAS for tests. This allows testing in
// constrained environments (e.g. small TMPDIR). It also prevents leaving
@@ -57,6 +58,7 @@ __attribute__((constructor)) static void configureCASTestEnv() {
setenv("LLVM_CAS_MAX_MAPPING_SIZE", LimitStr.c_str(), /*overwrite=*/false);
TestingAndDir createOnDisk(int I) {
unittest::TempDir Temp("on-disk-cas", /*Unique=*/true);
diff --git a/llvm/unittests/CAS/ObjectStoreTest.cpp b/llvm/unittests/CAS/ObjectStoreTest.cpp
index de443032c34243..a1ca10b722884d 100644
--- a/llvm/unittests/CAS/ObjectStoreTest.cpp
+++ b/llvm/unittests/CAS/ObjectStoreTest.cpp
@@ -352,12 +352,14 @@ TEST_P(CASTest, BlobsParallel) {
ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size));
TEST_P(CASTest, BlobsBigParallel) {
std::shared_ptr<ObjectStore> CAS = createObjectStore();
// 100k is large enough to be standalone files in our on-disk cas.
uint64_t Size = 100ULL * 1024;
ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size));
TEST(OnDiskCASTest, BlobsParallelMultiCAS) {
>From e5263c679448e99efa5f252d1e329223fd982e1b Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu at apple.com>
Date: Wed, 9 Oct 2024 14:16:50 -0700
Subject: [PATCH 10/11] Rebase change to latest
llvm/include/llvm/CAS/BuiltinObjectHasher.h | 2 +-
llvm/lib/CAS/TreeSchema.cpp | 2 +-
llvm/lib/CAS/UnifiedOnDiskCache.cpp | 4 ++--
llvm/tools/libCASPluginTest/libCASPluginTest.cpp | 2 +-
llvm/unittests/ADT/TrieRawHashMapTest.cpp | 4 +++-
llvm/unittests/CAS/ObjectStoreTest.cpp | 6 +++---
6 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/llvm/include/llvm/CAS/BuiltinObjectHasher.h b/llvm/include/llvm/CAS/BuiltinObjectHasher.h
index ac95edf6de7f1b..24616af5e10530 100644
--- a/llvm/include/llvm/CAS/BuiltinObjectHasher.h
+++ b/llvm/include/llvm/CAS/BuiltinObjectHasher.h
@@ -67,7 +67,7 @@ template <class HasherT> class BuiltinObjectHasher {
void updateSize(uint64_t Size) {
- Size = support::endian::byte_swap(Size, support::endianness::little);
+ Size = support::endian::byte_swap(Size, endianness::little);
ArrayRef(reinterpret_cast<const uint8_t *>(&Size), sizeof(Size)));
diff --git a/llvm/lib/CAS/TreeSchema.cpp b/llvm/lib/CAS/TreeSchema.cpp
index 91f3d8e3d25403..bb2ff7d6ec3bf3 100644
--- a/llvm/lib/CAS/TreeSchema.cpp
+++ b/llvm/lib/CAS/TreeSchema.cpp
@@ -195,7 +195,7 @@ TreeProxy::Builder::build(ArrayRef<NamedTreeEntry> Entries) {
Sorted.erase(std::unique(Sorted.begin(), Sorted.end()), Sorted.end());
raw_svector_ostream OS(Data);
- support::endian::Writer Writer(OS, support::endianness::little);
+ support::endian::Writer Writer(OS, endianness::little);
// Encode the entries in the Data. The layout of the tree schema object is:
// * Name offset table: The offset in the data blob at which to find the
// string. It has N + 1 entries and you can find the name of n-th entry at
diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp
index b5b136d2a1c5e8..a36dd49fb01fa7 100644
--- a/llvm/lib/CAS/UnifiedOnDiskCache.cpp
+++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp
@@ -134,7 +134,7 @@ static Error getAllDBDirs(StringRef Path,
if (DirI->type() != sys::fs::file_type::directory_file)
StringRef SubDir = sys::path::filename(DirI->path());
- if (!SubDir.startswith(DBDirPrefix))
+ if (!SubDir.starts_with(DBDirPrefix))
uint64_t Order;
if (SubDir.substr(DBDirPrefix.size()).getAsInteger(10, Order))
@@ -156,7 +156,7 @@ static Error getAllDBDirs(StringRef Path,
/// \returns Given a sub-directory named 'v<version>.<x>', it outputs the
/// 'v<version>.<x+1>' name.
static void getNextDBDirName(StringRef DBDir, llvm::raw_ostream &OS) {
- assert(DBDir.startswith(DBDirPrefix));
+ assert(DBDir.starts_with(DBDirPrefix));
uint64_t Count;
bool Failed = DBDir.substr(DBDirPrefix.size()).getAsInteger(10, Count);
diff --git a/llvm/tools/libCASPluginTest/libCASPluginTest.cpp b/llvm/tools/libCASPluginTest/libCASPluginTest.cpp
index b7bfde7f21a0b1..0d9443da9d39f1 100644
--- a/llvm/tools/libCASPluginTest/libCASPluginTest.cpp
+++ b/llvm/tools/libCASPluginTest/libCASPluginTest.cpp
@@ -112,7 +112,7 @@ struct CASWrapper {
/// Used for testing the \c globally parameter of action cache APIs. Simulates
/// "uploading"/"downloading" objects from/to the primary on-disk path.
std::unique_ptr<UnifiedOnDiskCache> UpstreamDB;
- ThreadPool Pool{llvm::hardware_concurrency()};
+ StdThreadPool Pool{llvm::hardware_concurrency()};
std::mutex Lock{};
diff --git a/llvm/unittests/ADT/TrieRawHashMapTest.cpp b/llvm/unittests/ADT/TrieRawHashMapTest.cpp
index bd3610666ec941..24be6c4748ea6f 100644
--- a/llvm/unittests/ADT/TrieRawHashMapTest.cpp
+++ b/llvm/unittests/ADT/TrieRawHashMapTest.cpp
@@ -8,6 +8,7 @@
#include "llvm/ADT/TrieRawHashMap.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/bit.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/SHA1.h"
#include "gtest/gtest.h"
@@ -72,7 +73,8 @@ class SimpleTrieHashMapTest : public TrieRawHashMapTestHelper,
// Use the number itself as hash to test the pathological case.
static HashType hash(uint64_t Num) {
- uint64_t HashN = llvm::support::endian::byte_swap(Num, llvm::support::big);
+ uint64_t HashN =
+ llvm::support::endian::byte_swap(Num, llvm::endianness::big);
HashType Hash;
memcpy(&Hash[0], &HashN, sizeof(HashType));
return Hash;
diff --git a/llvm/unittests/CAS/ObjectStoreTest.cpp b/llvm/unittests/CAS/ObjectStoreTest.cpp
index a1ca10b722884d..d6bcf20cc577a4 100644
--- a/llvm/unittests/CAS/ObjectStoreTest.cpp
+++ b/llvm/unittests/CAS/ObjectStoreTest.cpp
@@ -330,7 +330,7 @@ static void testBlobsParallel(ObjectStore &Read1, ObjectStore &Read2,
EXPECT_EQ(Node->getData(), Blobs[I]);
- ThreadPool Threads;
+ StdThreadPool Threads;
for (unsigned I = 0; I < BlobCount; ++I) {
Threads.async(Consumer, I, &Read1);
Threads.async(Consumer, I, &Read2);
@@ -432,7 +432,7 @@ TEST(OnDiskCASTest, DiskSize) {
std::error_code EC;
for (sys::fs::directory_iterator I(Temp.path(), EC), E; I != E && !EC;
I.increment(EC)) {
- if (StringRef(I->path()).endswith(".index")) {
+ if (StringRef(I->path()).ends_with(".index")) {
FoundIndex = true;
if (Mapped)
@@ -440,7 +440,7 @@ TEST(OnDiskCASTest, DiskSize) {
EXPECT_LT(I->status()->getSize(), MaxSize);
- if (StringRef(I->path()).endswith(".data")) {
+ if (StringRef(I->path()).ends_with(".data")) {
FoundData = true;
if (Mapped)
>From 54b6fe61752cd4baf5aec2699b8bb2dc04ca1a64 Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu at apple.com>
Date: Wed, 9 Oct 2024 14:29:05 -0700
Subject: [PATCH 11/11] format fixup
llvm/include/llvm/CAS/ActionCache.h | 1 -
llvm/lib/Support/TrieRawHashMap.cpp | 2 +-
llvm/unittests/CAS/CASTestConfig.cpp | 2 --
llvm/unittests/CAS/PluginCASTest.cpp | 2 +-
4 files changed, 2 insertions(+), 5 deletions(-)
diff --git a/llvm/include/llvm/CAS/ActionCache.h b/llvm/include/llvm/CAS/ActionCache.h
index e4ac687ae28543..9abefb1876265a 100644
--- a/llvm/include/llvm/CAS/ActionCache.h
+++ b/llvm/include/llvm/CAS/ActionCache.h
@@ -85,7 +85,6 @@ class ActionCache {
return putImpl(arrayRefFromStringRef(ActionKey.getKey()), Result, Globally);
#ifndef _MSC_VER
/// FIXME: MSVC doesn't compile Error within Promise/Future correctly and will
/// result in unchecked error. Disable AsyncAPIs when using MSVC for now.
diff --git a/llvm/lib/Support/TrieRawHashMap.cpp b/llvm/lib/Support/TrieRawHashMap.cpp
index af4cd8b57aed21..a6818d434e0d13 100644
--- a/llvm/lib/Support/TrieRawHashMap.cpp
+++ b/llvm/lib/Support/TrieRawHashMap.cpp
@@ -155,7 +155,7 @@ struct ThreadSafeTrieRawHashMapBase::ImplType {
static std::unique_ptr<ImplType> create(size_t StartBit, size_t NumBits) {
size_t Size = sizeof(ImplType) + getTrieTailSize(StartBit, NumBits);
void *Memory = ::malloc(Size);
- ImplType* Impl = ::new (Memory) ImplType(StartBit, NumBits);
+ ImplType *Impl = ::new (Memory) ImplType(StartBit, NumBits);
return std::unique_ptr<ImplType>(Impl);
diff --git a/llvm/unittests/CAS/CASTestConfig.cpp b/llvm/unittests/CAS/CASTestConfig.cpp
index c54ced747b7f63..fd5dfa0143e496 100644
--- a/llvm/unittests/CAS/CASTestConfig.cpp
+++ b/llvm/unittests/CAS/CASTestConfig.cpp
@@ -34,7 +34,6 @@ std::string llvm::unittest::cas::getCASPluginPath() {
return std::string(PathBuf);
TestingAndDir createInMemory(int I) {
std::unique_ptr<ObjectStore> CAS = createInMemoryCAS();
std::unique_ptr<ActionCache> Cache = createInMemoryActionCache();
@@ -71,7 +70,6 @@ TestingAndDir createOnDisk(int I) {
INSTANTIATE_TEST_SUITE_P(OnDiskCAS, CASTest, ::testing::Values(createOnDisk));
TestingAndDir createPluginCASImpl(int I) {
using namespace llvm::unittest::cas;
unittest::TempDir Temp("plugin-cas", /*Unique=*/true);
diff --git a/llvm/unittests/CAS/PluginCASTest.cpp b/llvm/unittests/CAS/PluginCASTest.cpp
index d11bde4a0dfda4..e7bf025bf1794b 100644
--- a/llvm/unittests/CAS/PluginCASTest.cpp
+++ b/llvm/unittests/CAS/PluginCASTest.cpp
@@ -6,10 +6,10 @@
+#include "llvm/CAS/PluginCAS.h"
#include "CASTestConfig.h"
#include "llvm/CAS/ActionCache.h"
#include "llvm/CAS/ObjectStore.h"
-#include "llvm/CAS/PluginCAS.h"
#include "llvm/Config/config.h"
#include "llvm/Support/Path.h"
#include "llvm/Testing/Support/Error.h"
More information about the llvm-commits
mailing list