[llvm] [CAS] Add OnDiskGraphDB and OnDiskKeyValueDB (PR #114102)
Adrian Prantl via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 14 13:50:40 PDT 2025
================
@@ -0,0 +1,1768 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements OnDiskGraphDB, an on-disk CAS nodes database,
+/// independent of a particular hashing algorithm. It only needs to be
+/// configured for the hash size and controls the schema of the storage.
+///
+/// OnDiskGraphDB defines:
+///
+/// - How the data are stored inside the database, either as a standalone file
+/// or allocated inside a data pool.
+/// - How references to other objects inside the same database are stored. They
+/// are stored as internal references, instead of full hash values, to save
+/// space.
+/// - How to chain databases together and import objects from upstream
+/// databases.
+///
+/// Here's a top-level description of the current layout:
+///
+/// - db/index.<version>: a file for the "index" table, named by \a
+/// IndexTableName and managed by \a TrieRawHashMap. The contents are 8B
+/// that are accessed atomically, describing the object kind and where/how
+/// it's stored (including an optional file offset). See \a TrieRecord for
+/// more details.
+/// - db/data.<version>: a file for the "data" table, named by \a
+/// DataPoolTableName and managed by \a DataStore. New objects within
+/// TrieRecord::MaxEmbeddedSize are inserted here as \a
+/// TrieRecord::StorageKind::DataPool.
+/// - db/obj.<offset>.<version>: a file storing an object outside the main
+/// "data" table, named by its offset into the "index" table, with the
+/// format of \a TrieRecord::StorageKind::Standalone.
+/// - db/leaf.<offset>.<version>: a file storing a leaf node outside the
+/// main "data" table, named by its offset into the "index" table, with
+/// the format of \a TrieRecord::StorageKind::StandaloneLeaf.
+/// - db/leaf+0.<offset>.<version>: a file storing a null-terminated leaf object
+/// outside the main "data" table, named by its offset into the "index" table,
+/// with the format of \a TrieRecord::StorageKind::StandaloneLeaf0.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "OnDiskCommon.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CAS/OnDiskDataAllocator.h"
+#include "llvm/CAS/OnDiskTrieRawHashMap.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include <atomic>
+#include <mutex>
+#include <optional>
+
+#define DEBUG_TYPE "on-disk-cas"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+static constexpr StringLiteral IndexTableName = "llvm.cas.index";
+static constexpr StringLiteral DataPoolTableName = "llvm.cas.data";
+
+static constexpr StringLiteral IndexFilePrefix = "index.";
+static constexpr StringLiteral DataPoolFilePrefix = "data.";
+
+static constexpr StringLiteral FilePrefixObject = "obj.";
+static constexpr StringLiteral FilePrefixLeaf = "leaf.";
+static constexpr StringLiteral FilePrefixLeaf0 = "leaf+0.";
+
+static Error createCorruptObjectError(Expected<ArrayRef<uint8_t>> ID) {
+ if (!ID)
+ return ID.takeError();
+
+ return createStringError(llvm::errc::invalid_argument,
+ "corrupt object '" + toHex(*ID) + "'");
+}
+
+namespace {
+
+/// Trie record data: 8B, atomic<uint64_t>
+/// - 1-byte: StorageKind
+/// - 7-bytes: DataStoreOffset (offset into referenced file)
+class TrieRecord {
+public:
+ enum class StorageKind : uint8_t {
+ /// Unknown object.
+ Unknown = 0,
+
+ /// data.vX: main pool, full DataStore record.
+ DataPool = 1,
+
+ /// obj.<TrieRecordOffset>.vX: standalone, with a full DataStore record.
+ Standalone = 10,
+
+ /// leaf.<TrieRecordOffset>.vX: standalone, just the data. The file contents
+ /// are exactly the data content and the file size matches the data size.
+ /// No refs.
+ StandaloneLeaf = 11,
+
+ /// leaf+0.<TrieRecordOffset>.vX: standalone, just the data plus an
+ /// extra null character ('\0'). File size is 1 bigger than the data size.
+ /// No refs.
+ StandaloneLeaf0 = 12,
+ };
+
+ /// Map a standalone storage kind to the file-name prefix used for its
+ /// on-disk file.
+ ///
+ /// \p SK must be one of Standalone, StandaloneLeaf, or StandaloneLeaf0; any
+ /// other kind is treated as unreachable.
+ static StringRef getStandaloneFilePrefix(StorageKind SK) {
+   if (SK == StorageKind::Standalone)
+     return FilePrefixObject;
+   if (SK == StorageKind::StandaloneLeaf)
+     return FilePrefixLeaf;
+   if (SK == StorageKind::StandaloneLeaf0)
+     return FilePrefixLeaf0;
+
+   // Non-standalone kinds have no standalone file, hence no prefix.
+   llvm_unreachable("Expected standalone storage kind");
+ }
+
+ enum Limits : int64_t {
+ // Saves files bigger than 64KB standalone instead of embedding them.
+ MaxEmbeddedSize = 64LL * 1024LL - 1,
+ };
+
+ struct Data {
+ StorageKind SK = StorageKind::Unknown;
+ FileOffset Offset;
+ };
+
+ static uint64_t pack(Data D) {
----------------
adrian-prantl wrote:
what is this function used for?
https://github.com/llvm/llvm-project/pull/114102
More information about the llvm-commits
mailing list