[llvm] [CAS] Add OnDiskGraphDB and OnDiskKeyValueDB (PR #114102)

Adrian Prantl via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 14 13:50:40 PDT 2025


================
@@ -0,0 +1,1768 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements OnDiskGraphDB, an on-disk CAS nodes database,
+/// independent of a particular hashing algorithm. It only needs to be
+/// configured for the hash size and controls the schema of the storage.
+///
+/// OnDiskGraphDB defines:
+///
+/// - How the data are stored inside the database, either as a standalone file
+///   or allocated inside a datapool.
+/// - How references to other objects inside the same database are stored. They
+///   are stored as internal references instead of full hash values, to save
+///   space.
+/// - How to chain databases together and import objects from upstream
+///   databases.
+///
+/// Here's a top-level description of the current layout:
+///
+/// - db/index.<version>: a file for the "index" table, named by \a
+///   IndexTableName and managed by \a TrieRawHashMap. The contents are 8B
+///   that are accessed atomically, describing the object kind and where/how
+///   it's stored (including an optional file offset). See \a TrieRecord for
+///   more details.
+/// - db/data.<version>: a file for the "data" table, named by \a
+///   DataPoolTableName and managed by \a DataStore. New objects within
+///   TrieRecord::MaxEmbeddedSize are inserted here as \a
+///   TrieRecord::StorageKind::DataPool.
+/// - db/obj.<offset>.<version>: a file storing an object outside the main
+///   "data" table, named by its offset into the "index" table, with the
+///   format of \a TrieRecord::StorageKind::Standalone.
+/// - db/leaf.<offset>.<version>: a file storing a leaf node outside the
+///   main "data" table, named by its offset into the "index" table, with
+///   the format of \a TrieRecord::StorageKind::StandaloneLeaf.
+/// - db/leaf+0.<offset>.<version>: a file storing a null-terminated leaf object
+///   outside the main "data" table, named by its offset into the "index" table,
+///   with the format of \a TrieRecord::StorageKind::StandaloneLeaf0.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "OnDiskCommon.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CAS/OnDiskDataAllocator.h"
+#include "llvm/CAS/OnDiskTrieRawHashMap.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include <atomic>
+#include <mutex>
+#include <optional>
+
+#define DEBUG_TYPE "on-disk-cas"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+static constexpr StringLiteral IndexTableName = "llvm.cas.index";
+static constexpr StringLiteral DataPoolTableName = "llvm.cas.data";
+
+static constexpr StringLiteral IndexFilePrefix = "index.";
+static constexpr StringLiteral DataPoolFilePrefix = "data.";
+
+static constexpr StringLiteral FilePrefixObject = "obj.";
+static constexpr StringLiteral FilePrefixLeaf = "leaf.";
+static constexpr StringLiteral FilePrefixLeaf0 = "leaf+0.";
+
+static Error createCorruptObjectError(Expected<ArrayRef<uint8_t>> ID) {
+  if (!ID)
+    return ID.takeError();
+
+  return createStringError(llvm::errc::invalid_argument,
+                           "corrupt object '" + toHex(*ID) + "'");
+}
+
+namespace {
+
+/// Trie record data: 8B, atomic<uint64_t>
+/// - 1-byte: StorageKind
+/// - 7-bytes: DataStoreOffset (offset into referenced file)
+class TrieRecord {
+public:
+  enum class StorageKind : uint8_t { // Occupies one byte of the trie record.
+    /// Unknown object.
+    Unknown = 0,
+
+    /// data.vX: main pool, full DataStore record.
+    DataPool = 1,
+
+    /// obj.<TrieRecordOffset>.vX: standalone, with a full DataStore record.
+    Standalone = 10,
+
+    /// leaf.<TrieRecordOffset>.vX: standalone, just the data. The file contents
+    /// are exactly the data and the file size matches the data size. No refs.
+    StandaloneLeaf = 11,
+
+    /// leaf+0.<TrieRecordOffset>.vX: standalone, just the data plus an
+    /// extra null character ('\0'). File size is 1 bigger than the data size.
+    /// No refs.
+    StandaloneLeaf0 = 12,
+  };
+
+  /// Return the file-name prefix used for objects stored with the standalone
+  /// storage kind \p SK. Any other storage kind is a programming error.
+  static StringRef getStandaloneFilePrefix(StorageKind SK) {
+    if (SK == StorageKind::Standalone)
+      return FilePrefixObject;
+    if (SK == StorageKind::StandaloneLeaf)
+      return FilePrefixLeaf;
+    if (SK == StorageKind::StandaloneLeaf0)
+      return FilePrefixLeaf0;
+
+    // Every remaining kind is not a standalone kind.
+    llvm_unreachable("Expected standalone storage kind");
+  }
+
+  enum Limits : int64_t {
+    // Embedding limit is 64KB - 1 bytes; bigger objects are saved standalone.
+    MaxEmbeddedSize = 64LL * 1024LL - 1,
+  };
+
+  struct Data { // Fields of one trie record, in the form taken by pack().
+    StorageKind SK = StorageKind::Unknown; // Storage kind; Unknown by default.
+    FileOffset Offset; // Presumably the offset into the referenced file.
+  };
+
+  static uint64_t pack(Data D) {
----------------
adrian-prantl wrote:

what is this function used for?

https://github.com/llvm/llvm-project/pull/114102


More information about the llvm-commits mailing list