[llvm] [CAS] Add UnifiedOnDiskCache and OnDiskCAS (PR #114103)
Steven Wu via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 27 10:10:01 PDT 2025
https://github.com/cachemeifyoucan updated https://github.com/llvm/llvm-project/pull/114103
>From 4b951ef459415adbab70b1b685590d60a8dfd29e Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu at apple.com>
Date: Mon, 20 Oct 2025 14:41:11 -0700
Subject: [PATCH] [CAS] Add OnDiskCAS
Add OnDiskCAS abstraction, that implements ObjectStore and ActionCache
interface using OnDiskGraphDB and OnDiskKeyValueDB.
Reviewers:
Pull Request: https://github.com/llvm/llvm-project/pull/114103
---
llvm/include/llvm/CAS/ActionCache.h | 10 +
.../llvm/CAS/BuiltinUnifiedCASDatabases.h | 59 ++
llvm/include/llvm/CAS/ObjectStore.h | 49 +-
llvm/include/llvm/CAS/UnifiedOnDiskCache.h | 191 +++++
llvm/lib/CAS/ActionCaches.cpp | 156 +++++
llvm/lib/CAS/BuiltinCAS.cpp | 14 +-
llvm/lib/CAS/BuiltinCAS.h | 25 +-
llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp | 38 +
llvm/lib/CAS/CMakeLists.txt | 3 +
llvm/lib/CAS/InMemoryCAS.cpp | 8 +
llvm/lib/CAS/ObjectStore.cpp | 93 ++-
llvm/lib/CAS/OnDiskCAS.cpp | 228 ++++++
llvm/lib/CAS/UnifiedOnDiskCache.cpp | 655 ++++++++++++++++++
llvm/unittests/CAS/ActionCacheTest.cpp | 6 +-
.../CAS/BuiltinUnifiedCASDatabasesTest.cpp | 67 ++
llvm/unittests/CAS/CASTestConfig.cpp | 23 +-
llvm/unittests/CAS/CASTestConfig.h | 44 +-
llvm/unittests/CAS/CMakeLists.txt | 2 +
llvm/unittests/CAS/ObjectStoreTest.cpp | 134 +++-
llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp | 191 +++++
20 files changed, 1967 insertions(+), 29 deletions(-)
create mode 100644 llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h
create mode 100644 llvm/include/llvm/CAS/UnifiedOnDiskCache.h
create mode 100644 llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp
create mode 100644 llvm/lib/CAS/OnDiskCAS.cpp
create mode 100644 llvm/lib/CAS/UnifiedOnDiskCache.cpp
create mode 100644 llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp
create mode 100644 llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp
diff --git a/llvm/include/llvm/CAS/ActionCache.h b/llvm/include/llvm/CAS/ActionCache.h
index 69ee4dde1974a..781ad81001368 100644
--- a/llvm/include/llvm/CAS/ActionCache.h
+++ b/llvm/include/llvm/CAS/ActionCache.h
@@ -75,6 +75,9 @@ class ActionCache {
CanBeDistributed);
}
+ /// Validate the ActionCache contents.
+ virtual Error validate() const = 0;
+
virtual ~ActionCache() = default;
protected:
@@ -97,6 +100,13 @@ class ActionCache {
/// Create an action cache in memory.
std::unique_ptr<ActionCache> createInMemoryActionCache();
+/// Get a reasonable default on-disk path for a persistent ActionCache for the
+/// current user.
+std::string getDefaultOnDiskActionCachePath();
+
+/// Create an action cache on disk.
+Expected<std::unique_ptr<ActionCache>> createOnDiskActionCache(StringRef Path);
+
} // end namespace llvm::cas
#endif // LLVM_CAS_ACTIONCACHE_H
diff --git a/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h
new file mode 100644
index 0000000000000..6c31a82ff9db0
--- /dev/null
+++ b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H
+#define LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H
+
+#include "llvm/Support/Error.h"
+
+namespace llvm::cas {
+
+class ActionCache;
+class ObjectStore;
+
+/// Create on-disk \c ObjectStore and \c ActionCache instances based on
+/// \c ondisk::UnifiedOnDiskCache, with built-in hashing.
+Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>>
+createOnDiskUnifiedCASDatabases(StringRef Path);
+
+/// Represents the result of validating the contents using
+/// \c validateOnDiskUnifiedCASDatabasesIfNeeded.
+///
+/// Note: invalid results are handled as an \c Error.
+enum class ValidationResult {
+ /// The data is already valid.
+ Valid,
+ /// The data was invalid, but was recovered.
+ Recovered,
+ /// Validation was skipped, as it was not needed.
+ Skipped,
+};
+
+/// Validate the data in \p Path, if needed to ensure correctness.
+///
+/// \param Path directory for the on-disk database.
+/// \param CheckHash Whether to validate hashes match the data.
+/// \param AllowRecovery Whether to automatically recover from invalid data by
+/// marking the files for garbage collection.
+/// \param ForceValidation Whether to force validation to occur even if it
+/// should not be necessary.
+/// \param LLVMCasBinary If provided, validation is performed out-of-process
+/// using the given \c llvm-cas executable which protects against crashes
+/// during validation. Otherwise validation is performed in-process.
+///
+/// \returns \c Valid if the data is already valid, \c Recovered if data
+/// was invalid but has been cleared, \c Skipped if validation is not needed,
+/// or an \c Error if validation cannot be performed or if the data is left
+/// in an invalid state because \p AllowRecovery is false.
+Expected<ValidationResult> validateOnDiskUnifiedCASDatabasesIfNeeded(
+ StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation,
+ std::optional<StringRef> LLVMCasBinary);
+
+} // namespace llvm::cas
+
+#endif // LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H
diff --git a/llvm/include/llvm/CAS/ObjectStore.h b/llvm/include/llvm/CAS/ObjectStore.h
index 6db5dd3904095..febfff815e86e 100644
--- a/llvm/include/llvm/CAS/ObjectStore.h
+++ b/llvm/include/llvm/CAS/ObjectStore.h
@@ -111,7 +111,10 @@ class ObjectStore {
virtual Expected<bool> isMaterialized(ObjectRef Ref) const = 0;
/// Validate the underlying object referred by CASID.
- virtual Error validate(const CASID &ID) = 0;
+ virtual Error validateObject(const CASID &ID) = 0;
+
+ /// Validate the entire ObjectStore.
+ virtual Error validate(bool CheckHash) const = 0;
protected:
/// Load the object referenced by \p Ref.
@@ -215,9 +218,39 @@ class ObjectStore {
return Data.size();
}
+ /// Set the size for limiting growth of on-disk storage. This has an effect
+ /// for when the instance is closed.
+ ///
+ /// Implementations may be not have this implemented.
+ virtual Error setSizeLimit(std::optional<uint64_t> SizeLimit) {
+ return Error::success();
+ }
+
+ /// \returns the storage size of the on-disk CAS data.
+ ///
+ /// Implementations that don't have an implementation for this should return
+ /// \p std::nullopt.
+ virtual Expected<std::optional<uint64_t>> getStorageSize() const {
+ return std::nullopt;
+ }
+
+ /// Prune local storage to reduce its size according to the desired size
+ /// limit. Pruning can happen concurrently with other operations.
+ ///
+ /// Implementations may be not have this implemented.
+ virtual Error pruneStorageData() { return Error::success(); }
+
/// Validate the whole node tree.
Error validateTree(ObjectRef Ref);
+ /// Import object from another CAS. This will import the full tree from the
+ /// other CAS.
+ Expected<ObjectRef> importObject(ObjectStore &Upstream, ObjectRef Other);
+
+ /// Print the ObjectStore internals for debugging purpose.
+ virtual void print(raw_ostream &) const {}
+ void dump() const;
+
/// Get CASContext
const CASContext &getContext() const { return Context; }
@@ -292,6 +325,20 @@ class ObjectProxy {
std::unique_ptr<ObjectStore> createInMemoryCAS();
+/// \returns true if \c LLVM_ENABLE_ONDISK_CAS configuration was enabled.
+bool isOnDiskCASEnabled();
+
+/// Gets or creates a persistent on-disk path at \p Path.
+Expected<std::unique_ptr<ObjectStore>> createOnDiskCAS(const Twine &Path);
+
+/// Set \p Path to a reasonable default on-disk path for a persistent CAS for
+/// the current user.
+Error getDefaultOnDiskCASPath(SmallVectorImpl<char> &Path);
+
+/// Get a reasonable default on-disk path for a persistent CAS for the current
+/// user.
+llvm::Expected<std::string> getDefaultOnDiskCASPath();
+
} // namespace cas
} // namespace llvm
diff --git a/llvm/include/llvm/CAS/UnifiedOnDiskCache.h b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h
new file mode 100644
index 0000000000000..fa9d2fcfdb4e4
--- /dev/null
+++ b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h
@@ -0,0 +1,191 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_UNIFIEDONDISKCACHE_H
+#define LLVM_CAS_UNIFIEDONDISKCACHE_H
+
+#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include <atomic>
+
+namespace llvm::cas::ondisk {
+
+class OnDiskKeyValueDB;
+
+/// A unified CAS nodes and key-value database, using on-disk storage for both.
+/// It manages storage growth and provides APIs for garbage collection.
+///
+/// High-level properties:
+/// * While \p UnifiedOnDiskCache is open on a directory, by any process, the
+/// storage size in that directory will keep growing unrestricted. For data to
+/// become eligible for garbage-collection there should be no open instances
+/// of \p UnifiedOnDiskCache for that directory, by any process.
+/// * Garbage-collection needs to be triggered explicitly by the client. It can
+/// be triggered on a directory concurrently, at any time and by any process,
+/// without affecting any active readers/writers, in the same process or other
+/// processes.
+///
+/// Usage patterns should be that an instance of \p UnifiedOnDiskCache is open
+/// for a limited period of time, e.g. for the duration of a build operation.
+/// For long-living processes that need periodic access to a
+/// \p UnifiedOnDiskCache, the client should device a scheme where access is
+/// performed within some defined period. For example, if a service is designed
+/// to continuously wait for requests that access a \p UnifiedOnDiskCache, it
+/// could keep the instance alive while new requests are coming in but close it
+/// after a time period in which there are no new requests.
+class UnifiedOnDiskCache {
+public:
+ /// The \p OnDiskGraphDB instance for the open directory.
+ OnDiskGraphDB &getGraphDB() { return *PrimaryGraphDB; }
+
+ /// Associate an \p ObjectID, of the \p OnDiskGraphDB instance, with a key.
+ ///
+ /// \param Key the hash bytes for the key.
+ /// \param Value the \p ObjectID value.
+ ///
+ /// \returns the \p ObjectID associated with the \p Key. It may be different
+ /// than \p Value if another value was already associated with this key.
+ Expected<ObjectID> KVPut(ArrayRef<uint8_t> Key, ObjectID Value);
+
+ /// Associate an \p ObjectID, of the \p OnDiskGraphDB instance, with a key.
+ /// An \p ObjectID as a key is equivalent to its digest bytes.
+ ///
+ /// \param Key the \p ObjectID for the key.
+ /// \param Value the \p ObjectID value.
+ ///
+ /// \returns the \p ObjectID associated with the \p Key. It may be different
+ /// than \p Value if another value was already associated with this key.
+ Expected<ObjectID> KVPut(ObjectID Key, ObjectID Value);
+
+ /// \returns the \p ObjectID, of the \p OnDiskGraphDB instance, associated
+ /// with the \p Key, or \p std::nullopt if the key does not exist.
+ Expected<std::optional<ObjectID>> KVGet(ArrayRef<uint8_t> Key);
+
+ /// Open a \p UnifiedOnDiskCache instance for a directory.
+ ///
+ /// \param Path directory for the on-disk database. The directory will be
+ /// created if it doesn't exist.
+ /// \param SizeLimit Optional size for limiting growth. This has an effect for
+ /// when the instance is closed.
+ /// \param HashName Identifier name for the hashing algorithm that is going to
+ /// be used.
+ /// \param HashByteSize Size for the object digest hash bytes.
+ /// \param FaultInPolicy Controls how nodes are copied to primary store. This
+ /// is recorded at creation time and subsequent opens need to pass the same
+ /// policy otherwise the \p open will fail.
+ static Expected<std::unique_ptr<UnifiedOnDiskCache>>
+ open(StringRef Path, std::optional<uint64_t> SizeLimit, StringRef HashName,
+ unsigned HashByteSize,
+ OnDiskGraphDB::FaultInPolicy FaultInPolicy =
+ OnDiskGraphDB::FaultInPolicy::FullTree);
+
+ /// Validate the data in \p Path, if needed to ensure correctness.
+ ///
+ /// Note: if invalid data is detected and \p AllowRecovery is true, then
+ /// recovery requires exclusive access to the CAS and it is an error to
+ /// attempt recovery if there is concurrent use of the CAS.
+ ///
+ /// \param Path directory for the on-disk database.
+ /// \param HashName Identifier name for the hashing algorithm that is going to
+ /// be used.
+ /// \param HashByteSize Size for the object digest hash bytes.
+ /// \param CheckHash Whether to validate hashes match the data.
+ /// \param AllowRecovery Whether to automatically recover from invalid data by
+ /// marking the files for garbage collection.
+ /// \param ForceValidation Whether to force validation to occur even if it
+ /// should not be necessary.
+ /// \param LLVMCasBinary If provided, validation is performed out-of-process
+ /// using the given \c llvm-cas executable which protects against crashes
+ /// during validation. Otherwise validation is performed in-process.
+ ///
+ /// \returns \c Valid if the data is already valid, \c Recovered if data
+ /// was invalid but has been cleared, \c Skipped if validation is not needed,
+ /// or an \c Error if validation cannot be performed or if the data is left
+ /// in an invalid state because \p AllowRecovery is false.
+ static Expected<ValidationResult>
+ validateIfNeeded(StringRef Path, StringRef HashName, unsigned HashByteSize,
+ bool CheckHash, bool AllowRecovery, bool ForceValidation,
+ std::optional<StringRef> LLVMCasBinary);
+
+ /// This is called implicitly at destruction time, so it is not required for a
+ /// client to call this. After calling \p close the only method that is valid
+ /// to call is \p needsGarbageCollection.
+ ///
+ /// \param CheckSizeLimit if true it will check whether the primary store has
+ /// exceeded its intended size limit. If false the check is skipped even if a
+ /// \p SizeLimit was passed to the \p open call.
+ Error close(bool CheckSizeLimit = true);
+
+ /// Set the size for limiting growth. This has an effect for when the instance
+ /// is closed.
+ void setSizeLimit(std::optional<uint64_t> SizeLimit);
+
+ /// \returns the storage size of the cache data.
+ uint64_t getStorageSize() const;
+
+ /// \returns whether the primary store has exceeded the intended size limit.
+ /// This can return false even if the overall size of the opened directory is
+ /// over the \p SizeLimit passed to \p open. To know whether garbage
+ /// collection needs to be triggered or not, call \p needsGarbaseCollection.
+ bool hasExceededSizeLimit() const;
+
+ /// \returns whether there are unused data that can be deleted using a
+ /// \p collectGarbage call.
+ bool needsGarbageCollection() const { return NeedsGarbageCollection; }
+
+ /// Remove any unused data from the directory at \p Path. If there are no such
+ /// data the operation is a no-op.
+ ///
+ /// This can be called concurrently, regardless of whether there is an open
+ /// \p UnifiedOnDiskCache instance or not; it has no effect on readers/writers
+ /// in the same process or other processes.
+ ///
+ /// It is recommended that garbage-collection is triggered concurrently in the
+ /// background, so that it has minimal effect on the workload of the process.
+ static Error collectGarbage(StringRef Path);
+
+ /// Remove unused data from the current UnifiedOnDiskCache.
+ Error collectGarbage();
+
+ /// Validate the key value databases.
+ Error validateActionCache();
+
+ /// Get the upstream OnDiskGraphDB if exists.
+ ///
+ /// \returns upstream database or nullptr if upstream database doesn't exist.
+ OnDiskGraphDB *getUpstreamGraphDB() const { return UpstreamGraphDB; }
+
+ ~UnifiedOnDiskCache();
+
+private:
+ UnifiedOnDiskCache();
+
+ Expected<std::optional<ObjectID>>
+ faultInFromUpstreamKV(ArrayRef<uint8_t> Key);
+
+ /// \returns the storage size of the primary directory.
+ uint64_t getPrimaryStorageSize() const;
+
+ std::string RootPath;
+ std::atomic<uint64_t> SizeLimit;
+
+ int LockFD = -1;
+
+ std::atomic<bool> NeedsGarbageCollection;
+ std::string PrimaryDBDir;
+
+ OnDiskGraphDB *UpstreamGraphDB = nullptr;
+ std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB;
+
+ std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB;
+ std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB;
+};
+
+} // namespace llvm::cas::ondisk
+
+#endif // LLVM_CAS_UNIFIEDONDISKCACHE_H
diff --git a/llvm/lib/CAS/ActionCaches.cpp b/llvm/lib/CAS/ActionCaches.cpp
index 571c5b3ca5b4b..dac50e0740c5c 100644
--- a/llvm/lib/CAS/ActionCaches.cpp
+++ b/llvm/lib/CAS/ActionCaches.cpp
@@ -13,7 +13,11 @@
#include "BuiltinCAS.h"
#include "llvm/ADT/TrieRawHashMap.h"
#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/BLAKE3.h"
+#include "llvm/Support/Path.h"
#define DEBUG_TYPE "cas-action-caches"
@@ -47,12 +51,51 @@ class InMemoryActionCache final : public ActionCache {
Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey,
bool CanBeDistributed) const final;
+ Error validate() const final {
+ return createStringError("InMemoryActionCache doesn't support validate()");
+ }
+
private:
using DataT = CacheEntry<sizeof(HashType)>;
using InMemoryCacheT = ThreadSafeTrieRawHashMap<DataT, sizeof(HashType)>;
InMemoryCacheT Cache;
};
+
+class OnDiskActionCache final : public ActionCache {
+public:
+ Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result,
+ bool CanBeDistributed) final;
+ Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey,
+ bool CanBeDistributed) const final;
+
+ static Expected<std::unique_ptr<OnDiskActionCache>> create(StringRef Path);
+
+ Error validate() const final;
+
+private:
+ static StringRef getHashName() { return "BLAKE3"; }
+
+ OnDiskActionCache(std::unique_ptr<ondisk::OnDiskKeyValueDB> DB);
+
+ std::unique_ptr<ondisk::OnDiskKeyValueDB> DB;
+ using DataT = CacheEntry<sizeof(HashType)>;
+};
+
+class UnifiedOnDiskActionCache final : public ActionCache {
+public:
+ Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result,
+ bool CanBeDistributed) final;
+ Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey,
+ bool CanBeDistributed) const final;
+
+ UnifiedOnDiskActionCache(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB);
+
+ Error validate() const final;
+
+private:
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB;
+};
} // end namespace
static Error createResultCachePoisonedError(ArrayRef<uint8_t> KeyHash,
@@ -92,10 +135,123 @@ Error InMemoryActionCache::putImpl(ArrayRef<uint8_t> Key, const CASID &Result,
Observed.getValue());
}
+static constexpr StringLiteral DefaultName = "actioncache";
+
namespace llvm::cas {
+std::string getDefaultOnDiskActionCachePath() {
+ SmallString<128> Path;
+ if (!llvm::sys::path::cache_directory(Path))
+ report_fatal_error("cannot get default cache directory");
+ llvm::sys::path::append(Path, builtin::DefaultDir, DefaultName);
+ return Path.str().str();
+}
+
std::unique_ptr<ActionCache> createInMemoryActionCache() {
return std::make_unique<InMemoryActionCache>();
}
} // namespace llvm::cas
+
+OnDiskActionCache::OnDiskActionCache(
+ std::unique_ptr<ondisk::OnDiskKeyValueDB> DB)
+ : ActionCache(builtin::BuiltinCASContext::getDefaultContext()),
+ DB(std::move(DB)) {}
+
+Expected<std::unique_ptr<OnDiskActionCache>>
+OnDiskActionCache::create(StringRef AbsPath) {
+ std::unique_ptr<ondisk::OnDiskKeyValueDB> DB;
+ if (Error E = ondisk::OnDiskKeyValueDB::open(AbsPath, getHashName(),
+ sizeof(HashType), getHashName(),
+ sizeof(DataT))
+ .moveInto(DB))
+ return std::move(E);
+ return std::unique_ptr<OnDiskActionCache>(
+ new OnDiskActionCache(std::move(DB)));
+}
+
+Expected<std::optional<CASID>>
+OnDiskActionCache::getImpl(ArrayRef<uint8_t> Key,
+ bool /*CanBeDistributed*/) const {
+ std::optional<ArrayRef<char>> Val;
+ if (Error E = DB->get(Key).moveInto(Val))
+ return std::move(E);
+ if (!Val)
+ return std::nullopt;
+ return CASID::create(&getContext(), toStringRef(*Val));
+}
+
+Error OnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, const CASID &Result,
+ bool /*CanBeDistributed*/) {
+ auto ResultHash = Result.getHash();
+ ArrayRef Expected((const char *)ResultHash.data(), ResultHash.size());
+ ArrayRef<char> Observed;
+ if (Error E = DB->put(Key, Expected).moveInto(Observed))
+ return E;
+
+ if (Expected == Observed)
+ return Error::success();
+
+ return createResultCachePoisonedError(
+ Key, getContext(), Result,
+ ArrayRef((const uint8_t *)Observed.data(), Observed.size()));
+}
+
+Error OnDiskActionCache::validate() const {
+ // FIXME: without the matching CAS there is nothing we can check about the
+ // cached values. The hash size is already validated by the DB validator.
+ return DB->validate(nullptr);
+}
+
+UnifiedOnDiskActionCache::UnifiedOnDiskActionCache(
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB)
+ : ActionCache(builtin::BuiltinCASContext::getDefaultContext()),
+ UniDB(std::move(UniDB)) {}
+
+Expected<std::optional<CASID>>
+UnifiedOnDiskActionCache::getImpl(ArrayRef<uint8_t> Key,
+ bool /*CanBeDistributed*/) const {
+ std::optional<ondisk::ObjectID> Val;
+ if (Error E = UniDB->KVGet(Key).moveInto(Val))
+ return std::move(E);
+ if (!Val)
+ return std::nullopt;
+ return CASID::create(&getContext(),
+ toStringRef(UniDB->getGraphDB().getDigest(*Val)));
+}
+
+Error UnifiedOnDiskActionCache::putImpl(ArrayRef<uint8_t> Key,
+ const CASID &Result,
+ bool /*CanBeDistributed*/) {
+ auto Expected = UniDB->getGraphDB().getReference(Result.getHash());
+ if (LLVM_UNLIKELY(!Expected))
+ return Expected.takeError();
+ std::optional<ondisk::ObjectID> Observed;
+ if (Error E = UniDB->KVPut(Key, *Expected).moveInto(Observed))
+ return E;
+
+ if (*Expected == Observed)
+ return Error::success();
+
+ return createResultCachePoisonedError(
+ Key, getContext(), Result, UniDB->getGraphDB().getDigest(*Observed));
+}
+
+Error UnifiedOnDiskActionCache::validate() const {
+ return UniDB->validateActionCache();
+}
+
+Expected<std::unique_ptr<ActionCache>>
+cas::createOnDiskActionCache(StringRef Path) {
+#if LLVM_ENABLE_ONDISK_CAS
+ return OnDiskActionCache::create(Path);
+#else
+ return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled");
+#endif
+}
+
+std::unique_ptr<ActionCache>
+cas::builtin::createActionCacheFromUnifiedOnDiskCache(
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) {
+ return std::make_unique<UnifiedOnDiskActionCache>(std::move(UniDB));
+}
diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp
index 73646ad2c3528..e9bc6d8beed4e 100644
--- a/llvm/lib/CAS/BuiltinCAS.cpp
+++ b/llvm/lib/CAS/BuiltinCAS.cpp
@@ -9,6 +9,7 @@
#include "BuiltinCAS.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CAS/BuiltinObjectHasher.h"
+#include "llvm/CAS/UnifiedOnDiskCache.h"
#include "llvm/Support/Process.h"
using namespace llvm;
@@ -68,7 +69,7 @@ Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs,
Refs, Data);
}
-Error BuiltinCAS::validate(const CASID &ID) {
+Error BuiltinCAS::validateObject(const CASID &ID) {
auto Ref = getReference(ID);
if (!Ref)
return createUnknownObjectError(ID);
@@ -92,3 +93,14 @@ Error BuiltinCAS::validate(const CASID &ID) {
return Error::success();
}
+
+Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>>
+cas::builtin::createBuiltinUnifiedOnDiskCache(StringRef Path) {
+#if LLVM_ENABLE_ONDISK_CAS
+ return ondisk::UnifiedOnDiskCache::open(Path, /*SizeLimit=*/std::nullopt,
+ BuiltinCASContext::getHashName(),
+ sizeof(HashType));
+#else
+ return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled");
+#endif
+}
diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h
index 3b5374d5e1850..4d2de66cf636f 100644
--- a/llvm/lib/CAS/BuiltinCAS.h
+++ b/llvm/lib/CAS/BuiltinCAS.h
@@ -1,4 +1,4 @@
-//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===//
+//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -15,6 +15,9 @@
namespace llvm::cas {
class ActionCache;
+namespace ondisk {
+class UnifiedOnDiskCache;
+} // namespace ondisk
namespace builtin {
/// Common base class for builtin CAS implementations using the same CASContext.
@@ -65,9 +68,27 @@ class BuiltinCAS : public ObjectStore {
"corrupt storage");
}
- Error validate(const CASID &ID) final;
+ Error validateObject(const CASID &ID) final;
};
+/// Create a \p UnifiedOnDiskCache instance that uses \p BLAKE3 hashing.
+Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>>
+createBuiltinUnifiedOnDiskCache(StringRef Path);
+
+/// \param UniDB A \p UnifiedOnDiskCache instance from \p
+/// createBuiltinUnifiedOnDiskCache.
+std::unique_ptr<ObjectStore> createObjectStoreFromUnifiedOnDiskCache(
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB);
+
+/// \param UniDB A \p UnifiedOnDiskCache instance from \p
+/// createBuiltinUnifiedOnDiskCache.
+std::unique_ptr<ActionCache> createActionCacheFromUnifiedOnDiskCache(
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB);
+
+// FIXME: Proxy not portable. Maybe also error-prone?
+constexpr StringLiteral DefaultDirProxy = "/^llvm::cas::builtin::default";
+constexpr StringLiteral DefaultDir = "llvm.cas.builtin.default";
+
} // end namespace builtin
} // end namespace llvm::cas
diff --git a/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp
new file mode 100644
index 0000000000000..f3f6fa043bc52
--- /dev/null
+++ b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
+#include "BuiltinCAS.h"
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>>
+cas::createOnDiskUnifiedCASDatabases(StringRef Path) {
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB;
+ if (Error E = builtin::createBuiltinUnifiedOnDiskCache(Path).moveInto(UniDB))
+ return std::move(E);
+ auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB);
+ auto AC = builtin::createActionCacheFromUnifiedOnDiskCache(std::move(UniDB));
+ return std::make_pair(std::move(CAS), std::move(AC));
+}
+
+Expected<ValidationResult> cas::validateOnDiskUnifiedCASDatabasesIfNeeded(
+ StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation,
+ std::optional<StringRef> LLVMCasBinary) {
+#if LLVM_ENABLE_ONDISK_CAS
+ return ondisk::UnifiedOnDiskCache::validateIfNeeded(
+ Path, builtin::BuiltinCASContext::getHashName(),
+ sizeof(builtin::HashType), CheckHash, AllowRecovery, ForceValidation,
+ LLVMCasBinary);
+#else
+ return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled");
+#endif
+}
diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
index a2f8c49e50145..aad77dce370d8 100644
--- a/llvm/lib/CAS/CMakeLists.txt
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -2,15 +2,18 @@ add_llvm_component_library(LLVMCAS
ActionCache.cpp
ActionCaches.cpp
BuiltinCAS.cpp
+ BuiltinUnifiedCASDatabases.cpp
DatabaseFile.cpp
InMemoryCAS.cpp
MappedFileRegionArena.cpp
ObjectStore.cpp
+ OnDiskCAS.cpp
OnDiskCommon.cpp
OnDiskDataAllocator.cpp
OnDiskGraphDB.cpp
OnDiskKeyValueDB.cpp
OnDiskTrieRawHashMap.cpp
+ UnifiedOnDiskCache.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS
diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp
index c63ee70de0849..2d4eedd5bdc8f 100644
--- a/llvm/lib/CAS/InMemoryCAS.cpp
+++ b/llvm/lib/CAS/InMemoryCAS.cpp
@@ -233,6 +233,12 @@ class InMemoryCAS : public BuiltinCAS {
return cast<InMemoryObject>(asInMemoryObject(Node)).getData();
}
+ void print(raw_ostream &OS) const final;
+
+ Error validate(bool CheckHash) const final {
+ return createStringError("InMemoryCAS doesn't support validate()");
+ }
+
InMemoryCAS() = default;
private:
@@ -271,6 +277,8 @@ ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const {
return cast<InMemoryInlineObject>(this)->getRefsImpl();
}
+void InMemoryCAS::print(raw_ostream &OS) const {}
+
Expected<ObjectRef>
InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash,
sys::fs::mapped_file_region Map) {
diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp
index e0be50bbe013a..3110577e03774 100644
--- a/llvm/lib/CAS/ObjectStore.cpp
+++ b/llvm/lib/CAS/ObjectStore.cpp
@@ -1,4 +1,4 @@
-//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===//
+//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -12,7 +12,7 @@
#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
-#include <optional>
+#include <deque>
using namespace llvm;
using namespace llvm::cas;
@@ -21,6 +21,7 @@ void CASContext::anchor() {}
void ObjectStore::anchor() {}
LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void ObjectStore::dump() const { print(dbgs()); }
LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); }
LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); }
@@ -141,7 +142,7 @@ Error ObjectStore::validateTree(ObjectRef Root) {
auto [I, Inserted] = ValidatedRefs.insert(Ref);
if (!Inserted)
continue; // already validated.
- if (Error E = validate(getID(Ref)))
+ if (Error E = validateObject(getID(Ref)))
return E;
Expected<ObjectHandle> Obj = load(Ref);
if (!Obj)
@@ -155,6 +156,92 @@ Error ObjectStore::validateTree(ObjectRef Root) {
return Error::success();
}
+Expected<ObjectRef> ObjectStore::importObject(ObjectStore &Upstream,
+ ObjectRef Other) {
+ // Copy the full CAS tree from upstream with depth-first ordering to ensure
+ // all the child nodes are available in downstream CAS before inserting
+ // current object. This uses a similar algorithm as
+ // `OnDiskGraphDB::importFullTree` but doesn't assume the upstream CAS schema
+ // so it can be used to import from any other ObjectStore reguardless of the
+ // CAS schema.
+
+ // There is no work to do if importing from self.
+ if (this == &Upstream)
+ return Other;
+
+ /// Keeps track of the state of visitation for current node and all of its
+ /// parents. Upstream Cursor holds information only from upstream CAS.
+ struct UpstreamCursor {
+ ObjectRef Ref;
+ ObjectHandle Node;
+ size_t RefsCount;
+ std::deque<ObjectRef> Refs;
+ };
+ SmallVector<UpstreamCursor, 16> CursorStack;
+ /// PrimaryNodeStack holds the ObjectRef of the current CAS, with nodes either
+ /// just stored in the CAS or nodes already exists in the current CAS.
+ SmallVector<ObjectRef, 128> PrimaryRefStack;
+ /// A map from upstream ObjectRef to current ObjectRef.
+ llvm::DenseMap<ObjectRef, ObjectRef> CreatedObjects;
+
+ auto enqueueNode = [&](ObjectRef Ref, ObjectHandle Node) {
+ unsigned NumRefs = Upstream.getNumRefs(Node);
+ std::deque<ObjectRef> Refs;
+ for (unsigned I = 0; I < NumRefs; ++I)
+ Refs.push_back(Upstream.readRef(Node, I));
+
+ CursorStack.push_back({Ref, Node, NumRefs, std::move(Refs)});
+ };
+
+ auto UpstreamHandle = Upstream.load(Other);
+ if (!UpstreamHandle)
+ return UpstreamHandle.takeError();
+ enqueueNode(Other, *UpstreamHandle);
+
+ while (!CursorStack.empty()) {
+ UpstreamCursor &Cur = CursorStack.back();
+ if (Cur.Refs.empty()) {
+ // Copy the node data into the primary store.
+ // The bottom of \p PrimaryRefStack contains the ObjectRef for the
+ // current node.
+ assert(PrimaryRefStack.size() >= Cur.RefsCount);
+ auto Refs = ArrayRef(PrimaryRefStack)
+ .slice(PrimaryRefStack.size() - Cur.RefsCount);
+ auto NewNode = store(Refs, Upstream.getData(Cur.Node));
+ if (!NewNode)
+ return NewNode.takeError();
+
+ // Remove the current node and its IDs from the stack.
+ PrimaryRefStack.truncate(PrimaryRefStack.size() - Cur.RefsCount);
+ CursorStack.pop_back();
+
+ PrimaryRefStack.push_back(*NewNode);
+ CreatedObjects.try_emplace(Cur.Ref, *NewNode);
+ continue;
+ }
+
+ // Check if the node exists already.
+ auto CurrentID = Cur.Refs.front();
+ Cur.Refs.pop_front();
+ auto Ref = CreatedObjects.find(CurrentID);
+ if (Ref != CreatedObjects.end()) {
+ // If exists already, just need to enqueue the primary node.
+ PrimaryRefStack.push_back(Ref->second);
+ continue;
+ }
+
+ // Load child.
+ auto PrimaryID = Upstream.load(CurrentID);
+ if (LLVM_UNLIKELY(!PrimaryID))
+ return PrimaryID.takeError();
+
+ enqueueNode(CurrentID, *PrimaryID);
+ }
+
+ assert(PrimaryRefStack.size() == 1);
+ return PrimaryRefStack.front();
+}
+
std::unique_ptr<MemoryBuffer>
ObjectProxy::getMemoryBuffer(StringRef Name,
bool RequiresNullTerminator) const {
diff --git a/llvm/lib/CAS/OnDiskCAS.cpp b/llvm/lib/CAS/OnDiskCAS.cpp
new file mode 100644
index 0000000000000..035722459236a
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskCAS.cpp
@@ -0,0 +1,228 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BuiltinCAS.h"
+#include "llvm/CAS/BuiltinCASContext.h"
+#include "llvm/CAS/BuiltinObjectHasher.h"
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/Path.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::builtin;
+
+namespace {
+
+class OnDiskCAS : public BuiltinCAS {
+public:
+ Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash,
+ ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) final;
+
+ Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) final;
+
+ CASID getID(ObjectRef Ref) const final;
+
+ std::optional<ObjectRef> getReference(const CASID &ID) const final;
+
+ Expected<bool> isMaterialized(ObjectRef Ref) const final;
+
+ ArrayRef<char> getDataConst(ObjectHandle Node) const final;
+
+ void print(raw_ostream &OS) const final;
+ Error validate(bool CheckHash) const final;
+
+ static Expected<std::unique_ptr<OnDiskCAS>> open(StringRef Path);
+
+ OnDiskCAS(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB)
+ : UnifiedDB(std::move(UniDB)), DB(&UnifiedDB->getGraphDB()) {}
+
+private:
+ ObjectHandle convertHandle(ondisk::ObjectHandle Node) const {
+ return makeObjectHandle(Node.getOpaqueData());
+ }
+
+ ondisk::ObjectHandle convertHandle(ObjectHandle Node) const {
+ return ondisk::ObjectHandle(Node.getInternalRef(*this));
+ }
+
+ ObjectRef convertRef(ondisk::ObjectID Ref) const {
+ return makeObjectRef(Ref.getOpaqueData());
+ }
+
+ ondisk::ObjectID convertRef(ObjectRef Ref) const {
+ return ondisk::ObjectID::fromOpaqueData(Ref.getInternalRef(*this));
+ }
+
+ size_t getNumRefs(ObjectHandle Node) const final {
+ auto RefsRange = DB->getObjectRefs(convertHandle(Node));
+ return std::distance(RefsRange.begin(), RefsRange.end());
+ }
+ ObjectRef readRef(ObjectHandle Node, size_t I) const final {
+ auto RefsRange = DB->getObjectRefs(convertHandle(Node));
+ return convertRef(RefsRange.begin()[I]);
+ }
+ Error forEachRef(ObjectHandle Node,
+ function_ref<Error(ObjectRef)> Callback) const final;
+
+ Error setSizeLimit(std::optional<uint64_t> SizeLimit) final;
+ Expected<std::optional<uint64_t>> getStorageSize() const final;
+ Error pruneStorageData() final;
+
+ OnDiskCAS(std::unique_ptr<ondisk::OnDiskGraphDB> GraphDB)
+ : OwnedDB(std::move(GraphDB)), DB(OwnedDB.get()) {}
+
+ std::unique_ptr<ondisk::OnDiskGraphDB> OwnedDB;
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UnifiedDB;
+ ondisk::OnDiskGraphDB *DB;
+};
+
+} // end anonymous namespace
+
+void OnDiskCAS::print(raw_ostream &OS) const { DB->print(OS); }
+Error OnDiskCAS::validate(bool CheckHash) const {
+ auto Hasher = [](ArrayRef<ArrayRef<uint8_t>> Refs, ArrayRef<char> Data,
+ SmallVectorImpl<uint8_t> &Result) {
+ auto Hash = BuiltinObjectHasher<llvm::cas::builtin::HasherT>::hashObject(
+ Refs, Data);
+ Result.assign(Hash.begin(), Hash.end());
+ };
+
+ if (auto E = DB->validate(CheckHash, Hasher))
+ return E;
+ if (UnifiedDB && UnifiedDB->getUpstreamGraphDB())
+ return UnifiedDB->getUpstreamGraphDB()->validate(CheckHash, Hasher);
+
+ return Error::success();
+}
+
+CASID OnDiskCAS::getID(ObjectRef Ref) const {
+ ArrayRef<uint8_t> Hash = DB->getDigest(convertRef(Ref));
+ return CASID::create(&getContext(), toStringRef(Hash));
+}
+
+std::optional<ObjectRef> OnDiskCAS::getReference(const CASID &ID) const {
+ std::optional<ondisk::ObjectID> ObjID =
+ DB->getExistingReference(ID.getHash());
+ if (!ObjID)
+ return std::nullopt;
+ return convertRef(*ObjID);
+}
+
+Expected<bool> OnDiskCAS::isMaterialized(ObjectRef ExternalRef) const {
+ return DB->isMaterialized(convertRef(ExternalRef));
+}
+
+ArrayRef<char> OnDiskCAS::getDataConst(ObjectHandle Node) const {
+ return DB->getObjectData(convertHandle(Node));
+}
+
+Expected<std::optional<ObjectHandle>>
+OnDiskCAS::loadIfExists(ObjectRef ExternalRef) {
+ Expected<std::optional<ondisk::ObjectHandle>> ObjHnd =
+ DB->load(convertRef(ExternalRef));
+ if (!ObjHnd)
+ return ObjHnd.takeError();
+ if (!*ObjHnd)
+ return std::nullopt;
+ return convertHandle(**ObjHnd);
+}
+
+Expected<ObjectRef> OnDiskCAS::storeImpl(ArrayRef<uint8_t> ComputedHash,
+ ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) {
+ SmallVector<ondisk::ObjectID, 64> IDs;
+ IDs.reserve(Refs.size());
+ for (ObjectRef Ref : Refs) {
+ IDs.push_back(convertRef(Ref));
+ }
+
+ auto StoredID = DB->getReference(ComputedHash);
+ if (LLVM_UNLIKELY(!StoredID))
+ return StoredID.takeError();
+ if (Error E = DB->store(*StoredID, IDs, Data))
+ return std::move(E);
+ return convertRef(*StoredID);
+}
+
+Error OnDiskCAS::forEachRef(ObjectHandle Node,
+ function_ref<Error(ObjectRef)> Callback) const {
+ auto RefsRange = DB->getObjectRefs(convertHandle(Node));
+ for (ondisk::ObjectID Ref : RefsRange) {
+ if (Error E = Callback(convertRef(Ref)))
+ return E;
+ }
+ return Error::success();
+}
+
+Error OnDiskCAS::setSizeLimit(std::optional<uint64_t> SizeLimit) {
+ UnifiedDB->setSizeLimit(SizeLimit);
+ return Error::success();
+}
+
+Expected<std::optional<uint64_t>> OnDiskCAS::getStorageSize() const {
+ return UnifiedDB->getStorageSize();
+}
+
+Error OnDiskCAS::pruneStorageData() { return UnifiedDB->collectGarbage(); }
+
+Expected<std::unique_ptr<OnDiskCAS>> OnDiskCAS::open(StringRef AbsPath) {
+ Expected<std::unique_ptr<ondisk::OnDiskGraphDB>> DB =
+ ondisk::OnDiskGraphDB::open(AbsPath, BuiltinCASContext::getHashName(),
+ sizeof(HashType));
+ if (!DB)
+ return DB.takeError();
+ return std::unique_ptr<OnDiskCAS>(new OnDiskCAS(std::move(*DB)));
+}
+
+bool cas::isOnDiskCASEnabled() {
+#if LLVM_ENABLE_ONDISK_CAS
+ return true;
+#else
+ return false;
+#endif
+}
+
+Expected<std::unique_ptr<ObjectStore>> cas::createOnDiskCAS(const Twine &Path) {
+#if LLVM_ENABLE_ONDISK_CAS
+ // FIXME: An absolute path isn't really good enough. Should open a directory
+ // and use openat() for files underneath.
+ SmallString<256> AbsPath;
+ Path.toVector(AbsPath);
+ sys::fs::make_absolute(AbsPath);
+
+ return OnDiskCAS::open(AbsPath);
+#else
+ return createStringError(inconvertibleErrorCode(), "OnDiskCAS is disabled");
+#endif /* LLVM_ENABLE_ONDISK_CAS */
+}
+
+std::unique_ptr<ObjectStore>
+cas::builtin::createObjectStoreFromUnifiedOnDiskCache(
+ std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) {
+ return std::make_unique<OnDiskCAS>(std::move(UniDB));
+}
+
+static constexpr StringLiteral DefaultName = "cas";
+
+Error cas::getDefaultOnDiskCASPath(SmallVectorImpl<char> &Path) {
+ if (!llvm::sys::path::cache_directory(Path))
+ return createStringError("cache directory is not available");
+ llvm::sys::path::append(Path, DefaultDir, DefaultName);
+ return Error::success();
+}
+
+Expected<std::string> cas::getDefaultOnDiskCASPath() {
+ SmallString<128> Path;
+ if (auto E = getDefaultOnDiskCASPath(Path))
+ return std::move(E);
+ return Path.str().str();
+}
diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp
new file mode 100644
index 0000000000000..acf327d57cfe0
--- /dev/null
+++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp
@@ -0,0 +1,655 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Encapsulates \p OnDiskGraphDB and \p OnDiskKeyValueDB instances within one
+/// directory while also restricting storage growth with a scheme of chaining
+/// the two most recent directories (primary & upstream), where the primary
+/// "faults-in" data from the upstream one. When the primary (most recent)
+/// directory exceeds its intended limit a new empty directory becomes the
+/// primary one.
+///
+/// Within the top-level directory (the path that \p UnifiedOnDiskCache::open
+/// receives) there are directories named like this:
+///
+/// 'v<version>.<x>'
+/// 'v<version>.<x+1'
+/// 'v<version>.<x+2>'
+/// ...
+///
+/// 'version' is the version integer for this \p UnifiedOnDiskCache's scheme and
+/// the part after the dot is an increasing integer. The primary directory is
+/// the one with the highest integer and the upstream one is the directory
+/// before it. For example, if the sub-directories contained are:
+///
+/// 'v1.5', 'v1.6', 'v1.7', 'v1.8'
+///
+/// Then the primary one is 'v1.8', the upstream one is 'v1.7', and the rest are
+/// unused directories that can be safely deleted at any time and by any
+/// process.
+///
+/// Contained within the top-level directory is a file named "lock" which is
+/// used for processes to take shared or exclusive locks for the contents of the
+/// top directory. While a \p UnifiedOnDiskCache is open it keeps a shared lock
+/// for the top-level directory; when it closes, if the primary sub-directory
+/// exceeded its limit, it attempts to get an exclusive lock in order to create
+/// a new empty primary directory; if it can't get the exclusive lock it gives
+/// up and lets the next \p UnifiedOnDiskCache instance that closes to attempt
+/// again.
+///
+/// The downside of this scheme is that while \p UnifiedOnDiskCache is open on a
+/// directory, by any process, the storage size in that directory will keep
+/// growing unrestricted. But the major benefit is that garbage-collection can
+/// be triggered on a directory concurrently, at any time and by any process,
+/// without affecting any active readers/writers in the same process or other
+/// processes.
+///
+/// The \c UnifiedOnDiskCache also provides validation and recovery on top of
+/// the underlying on-disk storage. The low-level storage is designed to remain
+/// coherent across regular process crashes, but may be invalid after power loss
+/// or similar system failures. \c UnifiedOnDiskCache::validateIfNeeded allows
+/// validating the contents once per boot and can recover by marking invalid
+/// data for garbage collection.
+///
+/// The data recovery described above requires exclusive access to the CAS, and
+/// it is an error to attempt recovery if the CAS is open in any process/thread.
+/// In order to maximize backwards compatibility with tools that do not perform
+/// validation before opening the CAS, we do not attempt to get exclusive access
+/// until recovery is actually performed, meaning as long as the data is valid
+/// it will not conflict with concurrent use.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "BuiltinCAS.h"
+#include "OnDiskCommon.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/raw_ostream.h"
+#include <optional>
+
+#if __has_include(<sys/sysctl.h>)
+#include <sys/sysctl.h>
+#endif
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+/// FIXME: When the version of \p DBDirPrefix is bumped up we need to figure out
+/// how to handle the leftover sub-directories of the previous version, within
+/// the \p UnifiedOnDiskCache::collectGarbage function.
+static constexpr StringLiteral DBDirPrefix = "v1.";
+
+static constexpr StringLiteral ValidationFilename = "v1.validation";
+static constexpr StringLiteral CorruptPrefix = "corrupt.";
+
+Expected<ObjectID> UnifiedOnDiskCache::KVPut(ObjectID Key, ObjectID Value) {
+ return KVPut(PrimaryGraphDB->getDigest(Key), Value);
+}
+
+Expected<ObjectID> UnifiedOnDiskCache::KVPut(ArrayRef<uint8_t> Key,
+ ObjectID Value) {
+ static_assert(sizeof(Value.getOpaqueData()) == sizeof(uint64_t),
+ "unexpected return opaque type");
+ std::array<char, sizeof(uint64_t)> ValBytes;
+ support::endian::write64le(ValBytes.data(), Value.getOpaqueData());
+ Expected<ArrayRef<char>> Existing = PrimaryKVDB->put(Key, ValBytes);
+ if (!Existing)
+ return Existing.takeError();
+ assert(Existing->size() == sizeof(uint64_t));
+ return ObjectID::fromOpaqueData(support::endian::read64le(Existing->data()));
+}
+
+Expected<std::optional<ObjectID>>
+UnifiedOnDiskCache::KVGet(ArrayRef<uint8_t> Key) {
+ std::optional<ArrayRef<char>> Value;
+ if (Error E = PrimaryKVDB->get(Key).moveInto(Value))
+ return std::move(E);
+ if (!Value) {
+ if (UpstreamKVDB)
+ return faultInFromUpstreamKV(Key);
+ return std::nullopt;
+ }
+ assert(Value->size() == sizeof(uint64_t));
+ return ObjectID::fromOpaqueData(support::endian::read64le(Value->data()));
+}
+
+Expected<std::optional<ObjectID>>
+UnifiedOnDiskCache::faultInFromUpstreamKV(ArrayRef<uint8_t> Key) {
+ assert(UpstreamGraphDB);
+ assert(UpstreamKVDB);
+
+ std::optional<ArrayRef<char>> UpstreamValue;
+ if (Error E = UpstreamKVDB->get(Key).moveInto(UpstreamValue))
+ return std::move(E);
+ if (!UpstreamValue)
+ return std::nullopt;
+
+ // The value is the \p ObjectID in the context of the upstream
+ // \p OnDiskGraphDB instance. Translate it to the context of the primary
+ // \p OnDiskGraphDB instance.
+ assert(UpstreamValue->size() == sizeof(uint64_t));
+ ObjectID UpstreamID = ObjectID::fromOpaqueData(
+ support::endian::read64le(UpstreamValue->data()));
+ auto PrimaryID =
+ PrimaryGraphDB->getReference(UpstreamGraphDB->getDigest(UpstreamID));
+ if (LLVM_UNLIKELY(!PrimaryID))
+ return PrimaryID.takeError();
+ return KVPut(Key, *PrimaryID);
+}
+
+Error UnifiedOnDiskCache::validateActionCache() {
+ auto ValidateRef = [&](FileOffset Offset, ArrayRef<char> Value) -> Error {
+ assert(Value.size() == sizeof(uint64_t) && "should be validated already");
+ auto ID = ObjectID::fromOpaqueData(support::endian::read64le(Value.data()));
+ auto formatError = [&](Twine Msg) {
+ return createStringError(
+ llvm::errc::illegal_byte_sequence,
+ "bad record at 0x" +
+ utohexstr((unsigned)Offset.get(), /*LowerCase=*/true) + ": " +
+ Msg.str());
+ };
+ if (ID.getOpaqueData() == 0)
+ return formatError("zero is not a valid ref");
+ return Error::success();
+ };
+ if (Error E = PrimaryKVDB->validate(ValidateRef))
+ return E;
+ if (UpstreamKVDB)
+ return UpstreamKVDB->validate(ValidateRef);
+ return Error::success();
+}
+
+/// \returns all the 'v<version>.<x>' names of sub-directories, sorted with
+/// ascending order of the integer after the dot. Corrupt directories, if
+/// included, will come first.
+static Error getAllDBDirs(StringRef Path, SmallVectorImpl<std::string> &DBDirs,
+ bool IncludeCorrupt = false) {
+ struct DBDir {
+ uint64_t Order;
+ std::string Name;
+ };
+ SmallVector<DBDir, 6> FoundDBDirs;
+
+ std::error_code EC;
+ for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE;
+ DirI.increment(EC)) {
+ if (DirI->type() != sys::fs::file_type::directory_file)
+ continue;
+ StringRef SubDir = sys::path::filename(DirI->path());
+ if (IncludeCorrupt && SubDir.starts_with(CorruptPrefix)) {
+ FoundDBDirs.push_back({0, std::string(SubDir)});
+ continue;
+ }
+ if (!SubDir.starts_with(DBDirPrefix))
+ continue;
+ uint64_t Order;
+ if (SubDir.substr(DBDirPrefix.size()).getAsInteger(10, Order))
+ return createStringError(inconvertibleErrorCode(),
+ "unexpected directory " + DirI->path());
+ FoundDBDirs.push_back({Order, std::string(SubDir)});
+ }
+ if (EC)
+ return createFileError(Path, EC);
+
+ llvm::sort(FoundDBDirs, [](const DBDir &LHS, const DBDir &RHS) -> bool {
+ return LHS.Order <= RHS.Order;
+ });
+ for (DBDir &Dir : FoundDBDirs)
+ DBDirs.push_back(std::move(Dir.Name));
+ return Error::success();
+}
+
+static Error getAllGarbageDirs(StringRef Path,
+ SmallVectorImpl<std::string> &DBDirs) {
+ if (Error E = getAllDBDirs(Path, DBDirs, /*IncludeCorrupt=*/true))
+ return E;
+
+ // FIXME: When the version of \p DBDirPrefix is bumped up we need to figure
+ // out how to handle the leftover sub-directories of the previous version.
+
+ for (unsigned Keep = 2; Keep > 0 && !DBDirs.empty(); --Keep) {
+ StringRef Back(DBDirs.back());
+ if (Back.starts_with(CorruptPrefix))
+ break;
+ DBDirs.pop_back();
+ }
+ return Error::success();
+}
+
+/// \returns Given a sub-directory named 'v<version>.<x>', it outputs the
+/// 'v<version>.<x+1>' name.
+static void getNextDBDirName(StringRef DBDir, llvm::raw_ostream &OS) {
+ assert(DBDir.starts_with(DBDirPrefix));
+ uint64_t Count;
+ bool Failed = DBDir.substr(DBDirPrefix.size()).getAsInteger(10, Count);
+ assert(!Failed);
+ (void)Failed;
+ OS << DBDirPrefix << Count + 1;
+}
+
+static Error validateOutOfProcess(StringRef LLVMCasBinary, StringRef RootPath,
+ bool CheckHash) {
+ SmallVector<StringRef> Args{LLVMCasBinary, "-cas", RootPath, "-validate"};
+ if (CheckHash)
+ Args.push_back("-check-hash");
+
+ llvm::SmallString<128> StdErrPath;
+ int StdErrFD = -1;
+ if (std::error_code EC = sys::fs::createTemporaryFile(
+ "llvm-cas-validate-stderr", "txt", StdErrFD, StdErrPath,
+ llvm::sys::fs::OF_Text))
+ return createStringError(EC, "failed to create temporary file");
+ FileRemover OutputRemover(StdErrPath.c_str());
+
+ std::optional<llvm::StringRef> Redirects[] = {
+ {""}, // stdin = /dev/null
+ {""}, // stdout = /dev/null
+ StdErrPath.str(),
+ };
+
+ std::string ErrMsg;
+ int Result =
+ sys::ExecuteAndWait(LLVMCasBinary, Args, /*Env=*/std::nullopt, Redirects,
+ /*SecondsToWait=*/120, /*MemoryLimit=*/0, &ErrMsg);
+
+ if (Result == -1)
+ return createStringError("failed to exec " + join(Args, " ") + ": " +
+ ErrMsg);
+ if (Result != 0) {
+ llvm::SmallString<64> Err("cas contents invalid");
+ if (!ErrMsg.empty()) {
+ Err += ": ";
+ Err += ErrMsg;
+ }
+ auto StdErrBuf = MemoryBuffer::getFile(StdErrPath.c_str());
+ if (StdErrBuf && !(*StdErrBuf)->getBuffer().empty()) {
+ Err += ": ";
+ Err += (*StdErrBuf)->getBuffer();
+ }
+ return createStringError(Err);
+ }
+ return Error::success();
+}
+
+static Error validateInProcess(StringRef RootPath, StringRef HashName,
+ unsigned HashByteSize, bool CheckHash) {
+ std::shared_ptr<UnifiedOnDiskCache> UniDB;
+ if (Error E = UnifiedOnDiskCache::open(RootPath, std::nullopt, HashName,
+ HashByteSize)
+ .moveInto(UniDB))
+ return E;
+ auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB);
+ if (Error E = CAS->validate(CheckHash))
+ return E;
+ if (Error E = UniDB->validateActionCache())
+ return E;
+ return Error::success();
+}
+
+static Expected<uint64_t> getBootTime() {
+#if __has_include(<sys/sysctl.h>) && defined(KERN_BOOTTIME)
+ struct timeval TV;
+ size_t TVLen = sizeof(TV);
+ int KernBoot[2] = {CTL_KERN, KERN_BOOTTIME};
+ if (sysctl(KernBoot, 2, &TV, &TVLen, nullptr, 0) < 0)
+ return createStringError(llvm::errnoAsErrorCode(),
+ "failed to get boottime");
+ if (TVLen != sizeof(TV))
+ return createStringError("sysctl kern.boottime unexpected format");
+ return TV.tv_sec;
+#elif defined(__linux__)
+ // Use the mtime for /proc, which is recreated during system boot.
+ // We could also read /proc/stat and search for 'btime'.
+ sys::fs::file_status Status;
+ if (std::error_code EC = sys::fs::status("/proc", Status))
+ return createFileError("/proc", EC);
+ return Status.getLastModificationTime().time_since_epoch().count();
+#else
+ llvm::report_fatal_error("unimplemented");
+#endif
+}
+
+Expected<ValidationResult>
+UnifiedOnDiskCache::validateIfNeeded(StringRef RootPath, StringRef HashName,
+ unsigned HashByteSize, bool CheckHash,
+ bool AllowRecovery, bool ForceValidation,
+ std::optional<StringRef> LLVMCasBinary) {
+ if (std::error_code EC = sys::fs::create_directories(RootPath))
+ return createFileError(RootPath, EC);
+
+ SmallString<256> PathBuf(RootPath);
+ sys::path::append(PathBuf, ValidationFilename);
+ int FD = -1;
+ if (std::error_code EC = sys::fs::openFileForReadWrite(
+ PathBuf, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+ return createFileError(PathBuf, EC);
+ assert(FD != -1);
+
+ sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+ auto CloseFile = make_scope_exit([&]() { sys::fs::closeFile(File); });
+
+ if (std::error_code EC = lockFileThreadSafe(FD, sys::fs::LockKind::Exclusive))
+ return createFileError(PathBuf, EC);
+ auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(FD); });
+
+ SmallString<8> Bytes;
+ if (Error E = sys::fs::readNativeFileToEOF(File, Bytes))
+ return createFileError(PathBuf, std::move(E));
+
+ uint64_t ValidationBootTime = 0;
+ if (!Bytes.empty() &&
+ StringRef(Bytes).trim().getAsInteger(10, ValidationBootTime))
+ return createFileError(PathBuf, errc::illegal_byte_sequence,
+ "expected integer");
+
+ static uint64_t BootTime = 0;
+ if (BootTime == 0)
+ if (Error E = getBootTime().moveInto(BootTime))
+ return std::move(E);
+
+ bool Recovered = false;
+ bool Skipped = false;
+ std::string LogValidationError;
+
+ if (ValidationBootTime == BootTime && !ForceValidation) {
+ Skipped = true;
+ return ValidationResult::Skipped;
+ }
+
+ // Validate!
+ bool NeedsRecovery = false;
+ Error E =
+ LLVMCasBinary
+ ? validateOutOfProcess(*LLVMCasBinary, RootPath, CheckHash)
+ : validateInProcess(RootPath, HashName, HashByteSize, CheckHash);
+ if (E) {
+ if (AllowRecovery) {
+ consumeError(std::move(E));
+ NeedsRecovery = true;
+ } else {
+ return std::move(E);
+ }
+ }
+
+ if (NeedsRecovery) {
+ sys::path::remove_filename(PathBuf);
+ sys::path::append(PathBuf, "lock");
+
+ int LockFD = -1;
+ if (std::error_code EC = sys::fs::openFileForReadWrite(
+ PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+ return createFileError(PathBuf, EC);
+ sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+ auto CloseLock = make_scope_exit([&]() { sys::fs::closeFile(LockFile); });
+ if (std::error_code EC = tryLockFileThreadSafe(LockFD)) {
+ if (EC == std::errc::no_lock_available)
+ return createFileError(
+ PathBuf, EC,
+ "CAS validation requires exclusive access but CAS was in use");
+ return createFileError(PathBuf, EC);
+ }
+ auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); });
+
+ SmallVector<std::string, 4> DBDirs;
+ if (Error E = getAllDBDirs(RootPath, DBDirs))
+ return std::move(E);
+
+ for (StringRef DBDir : DBDirs) {
+ sys::path::remove_filename(PathBuf);
+ sys::path::append(PathBuf, DBDir);
+ std::error_code EC;
+ int Attempt = 0, MaxAttempts = 100;
+ SmallString<128> GCPath;
+ for (; Attempt < MaxAttempts; ++Attempt) {
+ GCPath.assign(RootPath);
+ sys::path::append(GCPath, CorruptPrefix + std::to_string(Attempt) +
+ "." + DBDir);
+ EC = sys::fs::rename(PathBuf, GCPath);
+ // Darwin uses ENOTEMPTY. Linux may return either ENOTEMPTY or EEXIST.
+ if (EC != errc::directory_not_empty && EC != errc::file_exists)
+ break;
+ }
+ if (Attempt == MaxAttempts)
+ return createStringError(
+ EC, "rename " + PathBuf +
+ " failed: too many CAS directories awaiting pruning");
+ if (EC)
+ return createStringError(EC, "rename " + PathBuf + " to " + GCPath +
+ " failed: " + EC.message());
+ }
+ Recovered = true;
+ }
+
+ if (ValidationBootTime != BootTime) {
+ // Fix filename in case we have error to report.
+ sys::path::remove_filename(PathBuf);
+ sys::path::append(PathBuf, ValidationFilename);
+ if (std::error_code EC = sys::fs::resize_file(FD, 0))
+ return createFileError(PathBuf, EC);
+ raw_fd_ostream OS(FD, /*shouldClose=*/false);
+ OS.seek(0); // resize does not reset position
+ OS << BootTime << '\n';
+ if (OS.has_error())
+ return createFileError(PathBuf, OS.error());
+ }
+
+ return NeedsRecovery ? ValidationResult::Recovered : ValidationResult::Valid;
+}
+
+Expected<std::unique_ptr<UnifiedOnDiskCache>>
+UnifiedOnDiskCache::open(StringRef RootPath, std::optional<uint64_t> SizeLimit,
+ StringRef HashName, unsigned HashByteSize,
+ OnDiskGraphDB::FaultInPolicy FaultInPolicy) {
+ if (std::error_code EC = sys::fs::create_directories(RootPath))
+ return createFileError(RootPath, EC);
+
+ SmallString<256> PathBuf(RootPath);
+ sys::path::append(PathBuf, "lock");
+ int LockFD = -1;
+ if (std::error_code EC = sys::fs::openFileForReadWrite(
+ PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+ return createFileError(PathBuf, EC);
+ assert(LockFD != -1);
+ // Locking the directory using shared lock, which will prevent other processes
+ // from creating a new chain (essentially while a \p UnifiedOnDiskCache
+ // instance holds a shared lock the storage for the primary directory will
+ // grow unrestricted).
+ if (std::error_code EC =
+ lockFileThreadSafe(LockFD, sys::fs::LockKind::Shared))
+ return createFileError(PathBuf, EC);
+
+ SmallVector<std::string, 4> DBDirs;
+ if (Error E = getAllDBDirs(RootPath, DBDirs))
+ return std::move(E);
+ if (DBDirs.empty())
+ DBDirs.push_back((Twine(DBDirPrefix) + "1").str());
+
+ assert(!DBDirs.empty());
+
+ /// If there is only one directory open databases on it. If there are 2 or
+ /// more directories, get the most recent directories and chain them, with the
+ /// most recent being the primary one. The remaining directories are unused
+ /// data than can be garbage-collected.
+ std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB;
+ std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB;
+ if (DBDirs.size() > 1) {
+ StringRef UpstreamDir = *(DBDirs.end() - 2);
+ PathBuf = RootPath;
+ sys::path::append(PathBuf, UpstreamDir);
+ if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize,
+ /*UpstreamDB=*/nullptr, FaultInPolicy)
+ .moveInto(UpstreamGraphDB))
+ return std::move(E);
+ if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize,
+ /*ValueName=*/"objectid",
+ /*ValueSize=*/sizeof(uint64_t))
+ .moveInto(UpstreamKVDB))
+ return std::move(E);
+ }
+ OnDiskGraphDB *UpstreamGraphDBPtr = UpstreamGraphDB.get();
+
+ StringRef PrimaryDir = *(DBDirs.end() - 1);
+ PathBuf = RootPath;
+ sys::path::append(PathBuf, PrimaryDir);
+ std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB;
+ if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize,
+ std::move(UpstreamGraphDB), FaultInPolicy)
+ .moveInto(PrimaryGraphDB))
+ return std::move(E);
+ std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB;
+ // \p UnifiedOnDiskCache does manual chaining for key-value requests,
+ // including an extra translation step of the value during fault-in.
+ if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize,
+ /*ValueName=*/"objectid",
+ /*ValueSize=*/sizeof(uint64_t))
+ .moveInto(PrimaryKVDB))
+ return std::move(E);
+
+ auto UniDB = std::unique_ptr<UnifiedOnDiskCache>(new UnifiedOnDiskCache());
+ UniDB->RootPath = RootPath;
+ UniDB->SizeLimit = SizeLimit.value_or(0);
+ UniDB->LockFD = LockFD;
+ UniDB->NeedsGarbageCollection = DBDirs.size() > 2;
+ UniDB->PrimaryDBDir = PrimaryDir;
+ UniDB->UpstreamGraphDB = UpstreamGraphDBPtr;
+ UniDB->PrimaryGraphDB = std::move(PrimaryGraphDB);
+ UniDB->UpstreamKVDB = std::move(UpstreamKVDB);
+ UniDB->PrimaryKVDB = std::move(PrimaryKVDB);
+
+ return std::move(UniDB);
+}
+
+void UnifiedOnDiskCache::setSizeLimit(std::optional<uint64_t> SizeLimit) {
+ this->SizeLimit = SizeLimit.value_or(0);
+}
+
+uint64_t UnifiedOnDiskCache::getStorageSize() const {
+ uint64_t TotalSize = getPrimaryStorageSize();
+ if (UpstreamGraphDB)
+ TotalSize += UpstreamGraphDB->getStorageSize();
+ if (UpstreamKVDB)
+ TotalSize += UpstreamKVDB->getStorageSize();
+ return TotalSize;
+}
+
+uint64_t UnifiedOnDiskCache::getPrimaryStorageSize() const {
+ return PrimaryGraphDB->getStorageSize() + PrimaryKVDB->getStorageSize();
+}
+
+bool UnifiedOnDiskCache::hasExceededSizeLimit() const {
+ uint64_t CurSizeLimit = SizeLimit;
+ if (!CurSizeLimit)
+ return false;
+
+ // If the hard limit is beyond 85%, declare above limit and request clean up.
+ unsigned CurrentPrecent =
+ std::max(PrimaryGraphDB->getHardStorageLimitUtilization(),
+ PrimaryKVDB->getHardStorageLimitUtilization());
+ if (CurrentPrecent > 85)
+ return true;
+
+ // We allow each of the directories in the chain to reach up to half the
+ // intended size limit. Check whether the primary directory has exceeded half
+ // the limit or not, in order to decide whether we need to start a new chain.
+ //
+ // We could check the size limit against the sum of sizes of both the primary
+ // and upstream directories but then if the upstream is significantly larger
+ // than the intended limit, it would trigger a new chain to be created before
+ // the primary has reached its own limit. Essentially in such situation we
+ // prefer reclaiming the storage later in order to have more consistent cache
+ // hits behavior.
+ return (CurSizeLimit / 2) < getPrimaryStorageSize();
+}
+
+Error UnifiedOnDiskCache::close(bool CheckSizeLimit) {
+ if (LockFD == -1)
+ return Error::success(); // already closed.
+ auto _1 = make_scope_exit([&]() {
+ assert(LockFD >= 0);
+ sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+ sys::fs::closeFile(LockFile);
+ LockFD = -1;
+ });
+
+ bool ExceededSizeLimit = CheckSizeLimit ? hasExceededSizeLimit() : false;
+ PrimaryKVDB.reset();
+ UpstreamKVDB.reset();
+ PrimaryGraphDB.reset();
+ UpstreamGraphDB = nullptr;
+ if (std::error_code EC = unlockFileThreadSafe(LockFD))
+ return createFileError(RootPath, EC);
+
+ if (!ExceededSizeLimit)
+ return Error::success();
+
+ // The primary directory exceeded its intended size limit. Try to get an
+ // exclusive lock in order to create a new primary directory for next time
+ // this \p UnifiedOnDiskCache path is opened.
+
+ if (std::error_code EC = tryLockFileThreadSafe(
+ LockFD, std::chrono::milliseconds(0), sys::fs::LockKind::Exclusive)) {
+ if (EC == errc::no_lock_available)
+ return Error::success(); // couldn't get exclusive lock, give up.
+ return createFileError(RootPath, EC);
+ }
+ auto _2 = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); });
+
+ // Managed to get an exclusive lock which means there are no other open
+ // \p UnifiedOnDiskCache instances for the same path, so we can safely start a
+ // new primary directory. To start a new primary directory we just have to
+ // create a new empty directory with the next consecutive index; since this is
+ // an atomic operation we will leave the top-level directory in a consistent
+ // state even if the process dies during this code-path.
+
+ SmallString<256> PathBuf(RootPath);
+ raw_svector_ostream OS(PathBuf);
+ OS << sys::path::get_separator();
+ getNextDBDirName(PrimaryDBDir, OS);
+ if (std::error_code EC = sys::fs::create_directory(PathBuf))
+ return createFileError(PathBuf, EC);
+
+ NeedsGarbageCollection = true;
+ return Error::success();
+}
+
+UnifiedOnDiskCache::UnifiedOnDiskCache() = default;
+
+UnifiedOnDiskCache::~UnifiedOnDiskCache() { consumeError(close()); }
+
+Error UnifiedOnDiskCache::collectGarbage(StringRef Path) {
+ SmallVector<std::string, 4> DBDirs;
+ if (Error E = getAllGarbageDirs(Path, DBDirs))
+ return E;
+
+ SmallString<256> PathBuf(Path);
+ for (StringRef UnusedSubDir : DBDirs) {
+ sys::path::append(PathBuf, UnusedSubDir);
+ if (std::error_code EC = sys::fs::remove_directories(PathBuf))
+ return createFileError(PathBuf, EC);
+ sys::path::remove_filename(PathBuf);
+ }
+ return Error::success();
+}
+
+Error UnifiedOnDiskCache::collectGarbage() { return collectGarbage(RootPath); }
diff --git a/llvm/unittests/CAS/ActionCacheTest.cpp b/llvm/unittests/CAS/ActionCacheTest.cpp
index db67e30ca203b..692da230b6e09 100644
--- a/llvm/unittests/CAS/ActionCacheTest.cpp
+++ b/llvm/unittests/CAS/ActionCacheTest.cpp
@@ -21,7 +21,7 @@ using namespace llvm;
using namespace llvm::cas;
TEST_P(CASTest, ActionCacheHit) {
- std::shared_ptr<ObjectStore> CAS = createObjectStore();
+ std::unique_ptr<ObjectStore> CAS = createObjectStore();
std::unique_ptr<ActionCache> Cache = createActionCache();
std::optional<ObjectProxy> ID;
@@ -36,7 +36,7 @@ TEST_P(CASTest, ActionCacheHit) {
}
TEST_P(CASTest, ActionCacheMiss) {
- std::shared_ptr<ObjectStore> CAS = createObjectStore();
+ std::unique_ptr<ObjectStore> CAS = createObjectStore();
std::unique_ptr<ActionCache> Cache = createActionCache();
std::optional<ObjectProxy> ID1, ID2;
@@ -59,7 +59,7 @@ TEST_P(CASTest, ActionCacheMiss) {
}
TEST_P(CASTest, ActionCacheRewrite) {
- std::shared_ptr<ObjectStore> CAS = createObjectStore();
+ std::unique_ptr<ObjectStore> CAS = createObjectStore();
std::unique_ptr<ActionCache> Cache = createActionCache();
std::optional<ObjectProxy> ID1, ID2;
diff --git a/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp b/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp
new file mode 100644
index 0000000000000..19522e9372d85
--- /dev/null
+++ b/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp
@@ -0,0 +1,67 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
+#include "CASTestConfig.h"
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+TEST_F(OnDiskCASTest, UnifiedCASMaterializationCheckPreventsGarbageCollection) {
+ unittest::TempDir Temp("on-disk-unified-cas", /*Unique=*/true);
+
+ auto WithCAS = [&](llvm::function_ref<void(ObjectStore &)> Action) {
+ std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>> DBs;
+ ASSERT_THAT_ERROR(
+ createOnDiskUnifiedCASDatabases(Temp.path()).moveInto(DBs),
+ Succeeded());
+ ObjectStore &CAS = *DBs.first;
+ ASSERT_THAT_ERROR(CAS.setSizeLimit(1), Succeeded());
+ Action(CAS);
+ };
+
+ std::optional<CASID> ID;
+
+ // Create an object in the CAS.
+ WithCAS([&ID](ObjectStore &CAS) {
+ std::optional<ObjectRef> Ref;
+ ASSERT_THAT_ERROR(CAS.store({}, "blah").moveInto(Ref), Succeeded());
+ ASSERT_TRUE(Ref.has_value());
+
+ ID = CAS.getID(*Ref);
+ });
+
+ // Check materialization and prune the storage.
+ WithCAS([&ID](ObjectStore &CAS) {
+ std::optional<ObjectRef> Ref = CAS.getReference(*ID);
+ ASSERT_TRUE(Ref.has_value());
+
+ std::optional<bool> IsMaterialized;
+ ASSERT_THAT_ERROR(CAS.isMaterialized(*Ref).moveInto(IsMaterialized),
+ Succeeded());
+ ASSERT_TRUE(IsMaterialized);
+
+ ASSERT_THAT_ERROR(CAS.pruneStorageData(), Succeeded());
+ });
+
+ // Verify that the previous materialization check kept the object in the CAS.
+ WithCAS([&ID](ObjectStore &CAS) {
+ std::optional<ObjectRef> Ref = CAS.getReference(*ID);
+ ASSERT_TRUE(Ref.has_value());
+
+ std::optional<bool> IsMaterialized;
+ ASSERT_THAT_ERROR(CAS.isMaterialized(*Ref).moveInto(IsMaterialized),
+ Succeeded());
+ ASSERT_TRUE(IsMaterialized);
+ });
+}
diff --git a/llvm/unittests/CAS/CASTestConfig.cpp b/llvm/unittests/CAS/CASTestConfig.cpp
index 91d0970367ac3..dc1578a50541c 100644
--- a/llvm/unittests/CAS/CASTestConfig.cpp
+++ b/llvm/unittests/CAS/CASTestConfig.cpp
@@ -8,13 +8,20 @@
#include "CASTestConfig.h"
#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Testing/Support/Error.h"
#include "gtest/gtest.h"
using namespace llvm;
using namespace llvm::cas;
+namespace llvm::unittest::cas {
+void MockEnv::anchor() {}
+MockEnv::~MockEnv() {}
+} // namespace llvm::unittest::cas
+
static CASTestingEnv createInMemory(int I) {
- return CASTestingEnv{createInMemoryCAS(), createInMemoryActionCache()};
+ return CASTestingEnv{createInMemoryCAS(), createInMemoryActionCache(),
+ nullptr, std::nullopt};
}
INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest,
@@ -22,7 +29,7 @@ INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest,
#if LLVM_ENABLE_ONDISK_CAS
namespace llvm::cas::ondisk {
-extern void setMaxMappingSize(uint64_t Size);
+void setMaxMappingSize(uint64_t Size);
} // namespace llvm::cas::ondisk
void setMaxOnDiskCASMappingSize() {
@@ -30,6 +37,18 @@ void setMaxOnDiskCASMappingSize() {
std::call_once(
Flag, [] { llvm::cas::ondisk::setMaxMappingSize(100 * 1024 * 1024); });
}
+
+CASTestingEnv createOnDisk(int I) {
+ unittest::TempDir Temp("on-disk-cas", /*Unique=*/true);
+ std::unique_ptr<ObjectStore> CAS;
+ EXPECT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded());
+ std::unique_ptr<ActionCache> Cache;
+ EXPECT_THAT_ERROR(createOnDiskActionCache(Temp.path()).moveInto(Cache),
+ Succeeded());
+ return CASTestingEnv{std::move(CAS), std::move(Cache), nullptr,
+ std::move(Temp)};
+}
+INSTANTIATE_TEST_SUITE_P(OnDiskCAS, CASTest, ::testing::Values(createOnDisk));
#else
void setMaxOnDiskCASMappingSize() {}
#endif /* LLVM_ENABLE_ONDISK_CAS */
diff --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h
index c08968b95b9cc..27033c93d57bb 100644
--- a/llvm/unittests/CAS/CASTestConfig.h
+++ b/llvm/unittests/CAS/CASTestConfig.h
@@ -6,16 +6,29 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_UNITTESTS_CASTESTCONFIG_H
+#define LLVM_UNITTESTS_CASTESTCONFIG_H
+
#include "llvm/CAS/ActionCache.h"
#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
#include "gtest/gtest.h"
+#include <memory>
-#ifndef LLVM_UNITTESTS_CASTESTCONFIG_H
-#define LLVM_UNITTESTS_CASTESTCONFIG_H
+namespace llvm::unittest::cas {
+class MockEnv {
+ void anchor();
+
+public:
+ virtual ~MockEnv();
+};
+} // namespace llvm::unittest::cas
struct CASTestingEnv {
std::unique_ptr<llvm::cas::ObjectStore> CAS;
std::unique_ptr<llvm::cas::ActionCache> Cache;
+ std::unique_ptr<llvm::unittest::cas::MockEnv> Env;
+ std::optional<llvm::unittest::TempDir> Temp;
};
void setMaxOnDiskCASMappingSize();
@@ -24,26 +37,49 @@ void setMaxOnDiskCASMappingSize();
class OnDiskCASTest : public ::testing::Test {
protected:
void SetUp() override {
+#if !LLVM_ENABLE_ONDISK_CAS
+ GTEST_SKIP() << "OnDiskCAS is not enabled";
+#endif
// Use a smaller database size for testing to conserve disk space.
setMaxOnDiskCASMappingSize();
}
};
+// Parametered test fixture for ObjectStore and ActionCache tests.
class CASTest
: public testing::TestWithParam<std::function<CASTestingEnv(int)>> {
protected:
std::optional<int> NextCASIndex;
+ llvm::SmallVector<llvm::unittest::TempDir> Dirs;
+
+ llvm::SmallVector<std::unique_ptr<llvm::unittest::cas::MockEnv>> Envs;
+
std::unique_ptr<llvm::cas::ObjectStore> createObjectStore() {
auto TD = GetParam()(++(*NextCASIndex));
+ if (TD.Temp)
+ Dirs.push_back(std::move(*TD.Temp));
+ if (TD.Env)
+ Envs.emplace_back(std::move(TD.Env));
return std::move(TD.CAS);
}
std::unique_ptr<llvm::cas::ActionCache> createActionCache() {
auto TD = GetParam()(++(*NextCASIndex));
+ if (TD.Temp)
+ Dirs.push_back(std::move(*TD.Temp));
+ if (TD.Env)
+ Envs.emplace_back(std::move(TD.Env));
return std::move(TD.Cache);
}
- void SetUp() { NextCASIndex = 0; }
- void TearDown() { NextCASIndex = std::nullopt; }
+ void SetUp() {
+ NextCASIndex = 0;
+ setMaxOnDiskCASMappingSize();
+ }
+ void TearDown() {
+ NextCASIndex = std::nullopt;
+ Dirs.clear();
+ Envs.clear();
+ }
};
#endif
diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt
index da469f7fccb5a..91e49be770745 100644
--- a/llvm/unittests/CAS/CMakeLists.txt
+++ b/llvm/unittests/CAS/CMakeLists.txt
@@ -1,9 +1,11 @@
set(ONDISK_CAS_TEST_SOURCES
+ BuiltinUnifiedCASDatabasesTest.cpp
OnDiskGraphDBTest.cpp
OnDiskDataAllocatorTest.cpp
OnDiskKeyValueDBTest.cpp
OnDiskTrieRawHashMapTest.cpp
ProgramTest.cpp
+ UnifiedOnDiskCacheTest.cpp
)
set(LLVM_OPTIONAL_SOURCES
diff --git a/llvm/unittests/CAS/ObjectStoreTest.cpp b/llvm/unittests/CAS/ObjectStoreTest.cpp
index 54083fdb408f6..b43ae33d74127 100644
--- a/llvm/unittests/CAS/ObjectStoreTest.cpp
+++ b/llvm/unittests/CAS/ObjectStoreTest.cpp
@@ -1,4 +1,4 @@
-//===- ObjectStoreTest.cpp ------------------------------------------------===//
+//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -76,7 +76,7 @@ multiline text multiline text multiline text multiline text multiline text)",
// Run validation on all CASIDs.
for (int I = 0, E = IDs.size(); I != E; ++I)
- ASSERT_THAT_ERROR(CAS1->validate(IDs[I]), Succeeded());
+ ASSERT_THAT_ERROR(CAS1->validateObject(IDs[I]), Succeeded());
// Check that the blobs can be retrieved multiple times.
for (int I = 0, E = IDs.size(); I != E; ++I) {
@@ -120,15 +120,15 @@ TEST_P(CASTest, BlobsBig) {
std::optional<CASID> ID2;
ASSERT_THAT_ERROR(CAS->createProxy({}, String1).moveInto(ID1), Succeeded());
ASSERT_THAT_ERROR(CAS->createProxy({}, String1).moveInto(ID2), Succeeded());
- ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded());
- ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded());
+ ASSERT_THAT_ERROR(CAS->validateObject(*ID1), Succeeded());
+ ASSERT_THAT_ERROR(CAS->validateObject(*ID2), Succeeded());
ASSERT_EQ(ID1, ID2);
String1.append(String2);
ASSERT_THAT_ERROR(CAS->createProxy({}, String2).moveInto(ID1), Succeeded());
ASSERT_THAT_ERROR(CAS->createProxy({}, String2).moveInto(ID2), Succeeded());
- ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded());
- ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded());
+ ASSERT_THAT_ERROR(CAS->validateObject(*ID1), Succeeded());
+ ASSERT_THAT_ERROR(CAS->validateObject(*ID2), Succeeded());
ASSERT_EQ(ID1, ID2);
String2.append(String1);
}
@@ -176,10 +176,11 @@ multiline text multiline text multiline text multiline text multiline text)",
// Check basic printing of IDs.
IDs.push_back(CAS1->getID(*Node));
- auto ID = CAS1->getID(Nodes.back());
- EXPECT_EQ(ID.toString(), IDs.back().toString());
- EXPECT_EQ(*Node, Nodes.back());
- EXPECT_EQ(ID, IDs.back());
+ EXPECT_EQ(IDs.back().toString(), IDs.back().toString());
+ EXPECT_EQ(Nodes.front(), Nodes.front());
+ EXPECT_EQ(Nodes.back(), Nodes.back());
+ EXPECT_EQ(IDs.front(), IDs.front());
+ EXPECT_EQ(IDs.back(), IDs.back());
if (Nodes.size() <= 1)
continue;
EXPECT_NE(Nodes.front(), Nodes.back());
@@ -266,7 +267,7 @@ TEST_P(CASTest, NodesBig) {
}
for (auto ID : CreatedNodes)
- ASSERT_THAT_ERROR(CAS->validate(CAS->getID(ID)), Succeeded());
+ ASSERT_THAT_ERROR(CAS->validateObject(CAS->getID(ID)), Succeeded());
}
#if LLVM_ENABLE_THREADS
@@ -332,17 +333,124 @@ static void testBlobsParallel1(ObjectStore &CAS, uint64_t BlobSize) {
}
TEST_P(CASTest, BlobsParallel) {
- std::shared_ptr<ObjectStore> CAS = createObjectStore();
+ std::unique_ptr<ObjectStore> CAS = createObjectStore();
uint64_t Size = 1ULL * 1024;
ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size));
}
#ifdef EXPENSIVE_CHECKS
TEST_P(CASTest, BlobsBigParallel) {
- std::shared_ptr<ObjectStore> CAS = createObjectStore();
+ std::unique_ptr<ObjectStore> CAS = createObjectStore();
// 100k is large enough to be standalone files in our on-disk cas.
uint64_t Size = 100ULL * 1024;
ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size));
}
#endif // EXPENSIVE_CHECKS
+
+#ifndef _WIN32 // create_link won't work for directories on Windows
+TEST_F(OnDiskCASTest, OnDiskCASBlobsParallelMultiCAS) {
+ // This test intentionally uses symlinked paths to the same CAS to subvert the
+ // shared memory mappings that would normally be created within a single
+ // process. This breaks the lock file guarantees, so we must be careful not
+ // to create or destroy the CAS objects concurrently, which is when the locks
+ // are normally important.
+ unittest::TempDir Temp("on-disk-cas", /*Unique=*/true);
+ ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")),
+ std::error_code());
+ ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")),
+ std::error_code());
+ ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")),
+ std::error_code());
+ ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")),
+ std::error_code());
+
+ std::unique_ptr<ObjectStore> CAS1, CAS2, CAS3, CAS4;
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1),
+ Succeeded());
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2),
+ Succeeded());
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3),
+ Succeeded());
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4),
+ Succeeded());
+
+ uint64_t Size = 1ULL * 1024;
+ ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size));
+}
+
+TEST_F(OnDiskCASTest, OnDiskCASBlobsBigParallelMultiCAS) {
+ // See comment in BlobsParallelMultiCAS.
+ unittest::TempDir Temp("on-disk-cas", /*Unique=*/true);
+ ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")),
+ std::error_code());
+ ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")),
+ std::error_code());
+ ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")),
+ std::error_code());
+ ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")),
+ std::error_code());
+
+ std::unique_ptr<ObjectStore> CAS1, CAS2, CAS3, CAS4;
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1),
+ Succeeded());
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2),
+ Succeeded());
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3),
+ Succeeded());
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4),
+ Succeeded());
+
+ // 100k is large enough to be standalone files in our on-disk cas.
+ uint64_t Size = 100ULL * 1024;
+ ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size));
+}
+#endif // _WIN32
#endif // LLVM_ENABLE_THREADS
+
+TEST_F(OnDiskCASTest, OnDiskCASDiskSize) {
+ unittest::TempDir Temp("on-disk-cas", /*Unique=*/true);
+ std::unique_ptr<ObjectStore> CAS;
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded());
+
+ uint64_t MaxSize = 100 * 1024 * 1024;
+
+ // Check that we map the files to the correct size.
+ auto CheckFileSizes = [&](bool Mapped) {
+ bool FoundIndex = false, FoundData = false;
+ std::error_code EC;
+ for (sys::fs::directory_iterator I(Temp.path(), EC), E; I != E && !EC;
+ I.increment(EC)) {
+ StringRef Filename = sys::path::filename(I->path());
+ if (Filename.starts_with("index.") && !Filename.ends_with(".shared")) {
+ FoundIndex = true;
+ ASSERT_TRUE(I->status());
+ if (Mapped)
+ EXPECT_EQ(I->status()->getSize(), MaxSize);
+ else
+ EXPECT_LT(I->status()->getSize(), MaxSize);
+ }
+ if (Filename.starts_with("data.") && !Filename.ends_with(".shared")) {
+ FoundData = true;
+ ASSERT_TRUE(I->status());
+ if (Mapped)
+ EXPECT_EQ(I->status()->getSize(), MaxSize);
+ else
+ EXPECT_LT(I->status()->getSize(), MaxSize);
+ }
+ }
+ ASSERT_TRUE(FoundIndex);
+ ASSERT_TRUE(FoundData);
+ };
+
+ // Check that we have the full mapping size when the CAS is open.
+ CheckFileSizes(/*Mapped=*/true);
+ CAS.reset();
+ // Check that the CAS is shrunk to a smaller size.
+ CheckFileSizes(/*Mapped=*/false);
+
+ // Repeat the checks when starting from an existing CAS.
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded());
+ CheckFileSizes(/*Mapped=*/true);
+ CAS.reset();
+ CheckFileSizes(/*Mapped=*/false);
+}
diff --git a/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp
new file mode 100644
index 0000000000000..e25288a26eb92
--- /dev/null
+++ b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp
@@ -0,0 +1,191 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "CASTestConfig.h"
+#include "OnDiskCommonUtils.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+using namespace llvm::unittest::cas;
+
+/// Visits all the files of a directory recursively and returns the sum of their
+/// sizes.
+static Expected<size_t> countFileSizes(StringRef Path) {
+ size_t TotalSize = 0;
+ std::error_code EC;
+ for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE;
+ DirI.increment(EC)) {
+ if (DirI->type() == sys::fs::file_type::directory_file) {
+ Expected<size_t> Subsize = countFileSizes(DirI->path());
+ if (!Subsize)
+ return Subsize.takeError();
+ TotalSize += *Subsize;
+ continue;
+ }
+ ErrorOr<sys::fs::basic_file_status> Stat = DirI->status();
+ if (!Stat)
+ return createFileError(DirI->path(), Stat.getError());
+ TotalSize += Stat->getSize();
+ }
+ if (EC)
+ return createFileError(Path, EC);
+ return TotalSize;
+}
+
+TEST_F(OnDiskCASTest, UnifiedOnDiskCacheTest) {
+ unittest::TempDir Temp("ondisk-unified", /*Unique=*/true);
+ std::unique_ptr<UnifiedOnDiskCache> UniDB;
+
+ const uint64_t SizeLimit = 1024ull * 64;
+ auto reopenDB = [&]() {
+ UniDB.reset();
+ ASSERT_THAT_ERROR(UnifiedOnDiskCache::open(Temp.path(), SizeLimit, "blake3",
+ sizeof(HashType))
+ .moveInto(UniDB),
+ Succeeded());
+ };
+
+ reopenDB();
+
+ HashType RootHash;
+ HashType OtherHash;
+ HashType Key1Hash;
+ HashType Key2Hash;
+ {
+ OnDiskGraphDB &DB = UniDB->getGraphDB();
+ std::optional<ObjectID> ID1;
+ ASSERT_THAT_ERROR(store(DB, "1", {}).moveInto(ID1), Succeeded());
+ std::optional<ObjectID> ID2;
+ ASSERT_THAT_ERROR(store(DB, "2", {}).moveInto(ID2), Succeeded());
+ std::optional<ObjectID> IDRoot;
+ ASSERT_THAT_ERROR(store(DB, "root", {*ID1, *ID2}).moveInto(IDRoot),
+ Succeeded());
+ ArrayRef<uint8_t> Digest = DB.getDigest(*IDRoot);
+ ASSERT_EQ(Digest.size(), RootHash.size());
+ llvm::copy(Digest, RootHash.data());
+
+ std::optional<ObjectID> IDOther;
+ ASSERT_THAT_ERROR(store(DB, "other", {}).moveInto(IDOther), Succeeded());
+ Digest = DB.getDigest(*IDOther);
+ ASSERT_EQ(Digest.size(), OtherHash.size());
+ llvm::copy(Digest, OtherHash.data());
+
+ Key1Hash = digest("key1");
+ std::optional<ObjectID> Val;
+ ASSERT_THAT_ERROR(UniDB->KVPut(Key1Hash, *IDRoot).moveInto(Val),
+ Succeeded());
+ EXPECT_EQ(IDRoot, Val);
+
+ Key2Hash = digest("key2");
+ std::optional<ObjectID> KeyID;
+ ASSERT_THAT_ERROR(DB.getReference(Key2Hash).moveInto(KeyID), Succeeded());
+ ASSERT_THAT_ERROR(UniDB->KVPut(*KeyID, *ID1).moveInto(Val), Succeeded());
+ }
+
+ auto checkTree = [&](const HashType &Digest, StringRef ExpectedTree) {
+ OnDiskGraphDB &DB = UniDB->getGraphDB();
+ std::optional<ObjectID> ID;
+ ASSERT_THAT_ERROR(DB.getReference(Digest).moveInto(ID), Succeeded());
+ std::string PrintedTree;
+ raw_string_ostream OS(PrintedTree);
+ ASSERT_THAT_ERROR(printTree(DB, *ID, OS), Succeeded());
+ EXPECT_EQ(PrintedTree, ExpectedTree);
+ };
+ auto checkRootTree = [&]() {
+ return checkTree(RootHash, "root\n 1\n 2\n");
+ };
+
+ auto checkKey = [&](const HashType &Key, StringRef ExpectedData) {
+ OnDiskGraphDB &DB = UniDB->getGraphDB();
+ std::optional<ObjectID> Val;
+ ASSERT_THAT_ERROR(UniDB->KVGet(Key).moveInto(Val), Succeeded());
+ ASSERT_TRUE(Val.has_value());
+ std::optional<ondisk::ObjectHandle> Obj;
+ ASSERT_THAT_ERROR(DB.load(*Val).moveInto(Obj), Succeeded());
+ EXPECT_EQ(toStringRef(DB.getObjectData(*Obj)), ExpectedData);
+ };
+
+ checkRootTree();
+ checkTree(OtherHash, "other\n");
+ checkKey(Key1Hash, "root");
+ checkKey(Key2Hash, "1");
+
+ auto storeBigObject = [&](unsigned Index) {
+ SmallString<1000> Buf;
+ Buf.append(970, 'a');
+ raw_svector_ostream(Buf) << Index;
+ std::optional<ObjectID> ID;
+ ASSERT_THAT_ERROR(store(UniDB->getGraphDB(), Buf, {}).moveInto(ID),
+ Succeeded());
+ };
+
+ uint64_t PrevStoreSize = UniDB->getStorageSize();
+ unsigned Index = 0;
+ while (!UniDB->hasExceededSizeLimit()) {
+ storeBigObject(Index++);
+ }
+ EXPECT_GT(UniDB->getStorageSize(), PrevStoreSize);
+ UniDB->setSizeLimit(SizeLimit * 2);
+ EXPECT_FALSE(UniDB->hasExceededSizeLimit());
+ UniDB->setSizeLimit(SizeLimit);
+ EXPECT_TRUE(UniDB->hasExceededSizeLimit());
+
+ reopenDB();
+
+ EXPECT_FALSE(UniDB->hasExceededSizeLimit());
+ EXPECT_FALSE(UniDB->needsGarbageCollection());
+
+ checkRootTree();
+ checkKey(Key1Hash, "root");
+
+ while (!UniDB->hasExceededSizeLimit()) {
+ storeBigObject(Index++);
+ }
+ PrevStoreSize = UniDB->getStorageSize();
+ ASSERT_THAT_ERROR(UniDB->close(), Succeeded());
+ EXPECT_TRUE(UniDB->needsGarbageCollection());
+
+ reopenDB();
+ EXPECT_TRUE(UniDB->needsGarbageCollection());
+
+ std::optional<size_t> DirSizeBefore;
+ ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeBefore),
+ Succeeded());
+
+ ASSERT_THAT_ERROR(UnifiedOnDiskCache::collectGarbage(Temp.path()),
+ Succeeded());
+
+ std::optional<size_t> DirSizeAfter;
+ ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeAfter),
+ Succeeded());
+ EXPECT_LT(*DirSizeAfter, *DirSizeBefore);
+
+ reopenDB();
+ EXPECT_FALSE(UniDB->needsGarbageCollection());
+
+ checkRootTree();
+ checkKey(Key1Hash, "root");
+
+ EXPECT_LT(UniDB->getStorageSize(), PrevStoreSize);
+
+ // 'Other' tree and 'Key2' got garbage-collected.
+ {
+ OnDiskGraphDB &DB = UniDB->getGraphDB();
+ std::optional<ObjectID> ID;
+ ASSERT_THAT_ERROR(DB.getReference(OtherHash).moveInto(ID), Succeeded());
+ EXPECT_FALSE(DB.containsObject(*ID));
+ std::optional<ObjectID> Val;
+ ASSERT_THAT_ERROR(UniDB->KVGet(Key2Hash).moveInto(Val), Succeeded());
+ EXPECT_FALSE(Val.has_value());
+ }
+}
More information about the llvm-commits
mailing list