[llvm] Re-land #161264: [CAS] Add OnDiskDataAllocator (PR #162112)

Steven Wu via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 6 09:30:57 PDT 2025


https://github.com/cachemeifyoucan created https://github.com/llvm/llvm-project/pull/162112

Fix the build configuration that has OnDiskCAS disabled.


>From fe479f35c2445d432fe843e2347fe3048bcf80a8 Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu at apple.com>
Date: Mon, 6 Oct 2025 09:30:47 -0700
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
 =?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.7
---
 llvm/include/llvm/CAS/OnDiskDataAllocator.h   |  95 +++++++
 llvm/lib/CAS/CMakeLists.txt                   |   1 +
 llvm/lib/CAS/OnDiskDataAllocator.cpp          | 234 ++++++++++++++++++
 llvm/unittests/CAS/CMakeLists.txt             |   1 +
 .../unittests/CAS/OnDiskDataAllocatorTest.cpp |  66 +++++
 5 files changed, 397 insertions(+)
 create mode 100644 llvm/include/llvm/CAS/OnDiskDataAllocator.h
 create mode 100644 llvm/lib/CAS/OnDiskDataAllocator.cpp
 create mode 100644 llvm/unittests/CAS/OnDiskDataAllocatorTest.cpp

diff --git a/llvm/include/llvm/CAS/OnDiskDataAllocator.h b/llvm/include/llvm/CAS/OnDiskDataAllocator.h
new file mode 100644
index 0000000000000..2809df800621b
--- /dev/null
+++ b/llvm/include/llvm/CAS/OnDiskDataAllocator.h
@@ -0,0 +1,95 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file declares the interface for OnDiskDataAllocator, a file-backed data
+/// pool that can be used to allocate space to store data packed in a single
+/// file. It is based on MappedFileRegionArena and includes a header at the
+/// beginning to provide metadata.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_ONDISKDATAALLOCATOR_H
+#define LLVM_CAS_ONDISKDATAALLOCATOR_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/CAS/FileOffset.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm::cas {
+
+/// Sink for data. Stores variable-length data with 8-byte alignment. Does not
+/// track size of data, which is assumed to be known from context or embedded.
+/// Uses 0-padding but does not guarantee 0-termination.
+class OnDiskDataAllocator {
+public:
+  using ValueProxy = MutableArrayRef<char>;
+
+  /// A pointer to data stored on disk: a file offset plus a view of the bytes.
+  class OnDiskPtr {
+  public:
+    FileOffset getOffset() const { return Offset; }
+    explicit operator bool() const { return bool(getOffset()); }
+    const ValueProxy &operator*() const {
+      assert(Offset && "Null dereference");
+      return Value;
+    }
+    const ValueProxy *operator->() const {
+      assert(Offset && "Null dereference");
+      return &Value;
+    }
+
+    OnDiskPtr() = default;
+
+  private:
+    friend class OnDiskDataAllocator;
+    OnDiskPtr(FileOffset Offset, ValueProxy Value)
+        : Offset(Offset), Value(Value) {}
+    FileOffset Offset;
+    ValueProxy Value;
+  };
+
+  /// Get \p Size bytes of data stored at the given \p Offset. The allocator
+  /// doesn't keep track of the allocation size, so \p Size doesn't need to
+  /// match the size of the allocation, but it must not be larger.
+  Expected<ArrayRef<char>> get(FileOffset Offset, size_t Size) const;
+
+  /// Allocate at least \p Size bytes with 8-byte alignment.
+  Expected<OnDiskPtr> allocate(size_t Size);
+
+  /// \returns the user header buffer that was reserved by \a create(), with
+  /// size \p UserHeaderSize.
+  MutableArrayRef<uint8_t> getUserHeader();
+
+  size_t size() const;
+  size_t capacity() const;
+
+  static Expected<OnDiskDataAllocator>
+  create(const Twine &Path, const Twine &TableName, uint64_t MaxFileSize,
+         std::optional<uint64_t> NewFileInitialSize,
+         uint32_t UserHeaderSize = 0,
+         function_ref<void(void *)> UserHeaderInit = nullptr);
+
+  OnDiskDataAllocator(OnDiskDataAllocator &&RHS);
+  OnDiskDataAllocator &operator=(OnDiskDataAllocator &&RHS);
+
+  // Not copyable. Call \a create() again to get another instance.
+  OnDiskDataAllocator(const OnDiskDataAllocator &) = delete;
+  OnDiskDataAllocator &operator=(const OnDiskDataAllocator &) = delete;
+
+  ~OnDiskDataAllocator();
+
+private:
+  struct ImplType;
+  explicit OnDiskDataAllocator(std::unique_ptr<ImplType> Impl);
+  std::unique_ptr<ImplType> Impl;
+};
+
+} // namespace llvm::cas
+
+#endif // LLVM_CAS_ONDISKDATAALLOCATOR_H
diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
index 7ae5f7e46418e..bca39b645af45 100644
--- a/llvm/lib/CAS/CMakeLists.txt
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -7,6 +7,7 @@ add_llvm_component_library(LLVMCAS
   MappedFileRegionArena.cpp
   ObjectStore.cpp
   OnDiskCommon.cpp
+  OnDiskDataAllocator.cpp
   OnDiskTrieRawHashMap.cpp
 
   ADDITIONAL_HEADER_DIRS
diff --git a/llvm/lib/CAS/OnDiskDataAllocator.cpp b/llvm/lib/CAS/OnDiskDataAllocator.cpp
new file mode 100644
index 0000000000000..13bbd66139178
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskDataAllocator.cpp
@@ -0,0 +1,234 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file Implements OnDiskDataAllocator.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskDataAllocator.h"
+#include "DatabaseFile.h"
+#include "llvm/Config/llvm-config.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+#if LLVM_ENABLE_ONDISK_CAS
+
+//===----------------------------------------------------------------------===//
+// DataAllocator data structures.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Handle to a DataAllocator table inside a mapped database file.
+///
+/// Table layout:
+/// - [8-bytes: Generic table header]
+/// - 8-bytes: AllocatorOffset (reserved for implementing free lists)
+/// - 8-bytes: Size for user data header
+/// - <user data buffer>
+///
+/// Record layout:
+/// - <data>
+class DataAllocatorHandle {
+public:
+  static constexpr TableHandle::TableKind Kind =
+      TableHandle::TableKind::DataAllocator;
+
+  struct Header {
+    TableHandle::Header GenericHeader;
+    std::atomic<int64_t> AllocatorOffset;
+    const uint64_t UserHeaderSize;
+  };
+
+  /// A default-constructed handle refers to no table and tests as false.
+  DataAllocatorHandle() = default;
+  DataAllocatorHandle(MappedFileRegion &Region, Header &H)
+      : Region(&Region), H(&H) {}
+  /// Binds to the header found \p HeaderOffset bytes into \p Region.
+  DataAllocatorHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+      : DataAllocatorHandle(
+            Region, *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) {
+  }
+
+  /// Defined out of line, after the class.
+  static Expected<DataAllocatorHandle>
+  create(MappedFileRegionArena &Alloc, StringRef Name, uint32_t UserHeaderSize);
+
+  explicit operator bool() const { return H != nullptr; }
+
+  /// Converts to the generic handle; an unbound handle converts to an empty
+  /// TableHandle.
+  operator TableHandle() const {
+    return H ? TableHandle(*Region, H->GenericHeader) : TableHandle();
+  }
+
+  const Header &getHeader() const { return *H; }
+  MappedFileRegion &getRegion() const { return *Region; }
+
+  /// The user header is the \c UserHeaderSize bytes placed right behind the
+  /// fixed-size header.
+  MutableArrayRef<uint8_t> getUserHeader() {
+    auto *UserBytes = reinterpret_cast<uint8_t *>(H + 1);
+    return MutableArrayRef(UserBytes, H->UserHeaderSize);
+  }
+
+  /// Requests \p DataSize bytes from \p Alloc, which must wrap the same region
+  /// this handle is bound to, and returns them as a mutable buffer.
+  Expected<MutableArrayRef<char>> allocate(MappedFileRegionArena &Alloc,
+                                           size_t DataSize) {
+    assert(&Alloc.getRegion() == Region);
+    auto Storage = Alloc.allocate(DataSize);
+    if (LLVM_UNLIKELY(!Storage))
+      return Storage.takeError();
+    return MutableArrayRef(*Storage, DataSize);
+  }
+
+private:
+  MappedFileRegion *Region = nullptr;
+  Header *H = nullptr;
+};
+
+} // end anonymous namespace
+
+struct OnDiskDataAllocator::ImplType {
+  DatabaseFile File;
+  DataAllocatorHandle Store;
+};
+
+Expected<DataAllocatorHandle>
+DataAllocatorHandle::create(MappedFileRegionArena &Alloc, StringRef Name,
+                            uint32_t UserHeaderSize) {
+  // Reserve one chunk that holds the fixed header, the user header and the
+  // null-terminated table name.
+  const size_t NameLength = Name.size();
+  auto HeaderOffset =
+      Alloc.allocateOffset(sizeof(Header) + UserHeaderSize + NameLength + 1);
+  if (LLVM_UNLIKELY(!HeaderOffset))
+    return HeaderOffset.takeError();
+
+  // The name length is narrowed to 16 bits when stored in the header below.
+  assert(NameLength <= UINT16_MAX && "Expected smaller table name");
+  auto *NewHeader = new (Alloc.getRegion().data() + *HeaderOffset)
+      Header{{TableHandle::TableKind::DataAllocator,
+              static_cast<uint16_t>(NameLength),
+              static_cast<int32_t>(sizeof(Header) + UserHeaderSize)},
+             /*AllocatorOffset=*/{0},
+             /*UserHeaderSize=*/UserHeaderSize};
+
+  // Zero the user header, which sits directly behind the fixed header.
+  char *UserHeaderBegin = reinterpret_cast<char *>(NewHeader + 1);
+  memset(UserHeaderBegin, 0, UserHeaderSize);
+
+  // Append the table name followed by its null terminator.
+  char *NameBegin = UserHeaderBegin + UserHeaderSize;
+  llvm::copy(Name, NameBegin);
+  NameBegin[NameLength] = 0;
+
+  return DataAllocatorHandle(Alloc.getRegion(), *NewHeader);
+}
+
+Expected<OnDiskDataAllocator> OnDiskDataAllocator::create(
+    const Twine &PathTwine, const Twine &TableNameTwine, uint64_t MaxFileSize,
+    std::optional<uint64_t> NewFileInitialSize, uint32_t UserHeaderSize,
+    function_ref<void(void *)> UserHeaderInit) {
+  // A non-empty user header requires a callback to initialize it.
+  assert(!UserHeaderSize || UserHeaderInit);
+  // NOTE(review): NewFileInitialSize is not referenced anywhere in this
+  // function - confirm whether it should be forwarded to the database file.
+  SmallString<128> PathBuffer;
+  SmallString<128> TableNameBuffer;
+  StringRef Path = PathTwine.toStringRef(PathBuffer);
+  StringRef TableName = TableNameTwine.toStringRef(TableNameBuffer);
+
+  // Callback used when the file doesn't exist yet: adds the allocator table
+  // and lets the caller fill in the user header.
+  auto InitNewFile = [&](DatabaseFile &DB) -> Error {
+    auto NewTable =
+        DataAllocatorHandle::create(DB.getAlloc(), TableName, UserHeaderSize);
+    if (LLVM_UNLIKELY(!NewTable))
+      return NewTable.takeError();
+
+    if (Error Err = DB.addTable(*NewTable))
+      return Err;
+
+    if (UserHeaderSize)
+      UserHeaderInit(NewTable->getUserHeader().data());
+    return Error::success();
+  };
+
+  // Get or create the file.
+  Expected<DatabaseFile> DB =
+      DatabaseFile::create(Path, MaxFileSize, InitNewFile);
+  if (!DB)
+    return DB.takeError();
+
+  // Look the table up by name and make sure it has the expected kind.
+  std::optional<TableHandle> Found = DB->findTable(TableName);
+  if (!Found)
+    return createTableConfigError(std::errc::argument_out_of_domain, Path,
+                                  TableName, "table not found");
+  if (Error KindMismatch =
+          checkTable("table kind", (size_t)DataAllocatorHandle::Kind,
+                     (size_t)Found->getHeader().Kind, Path, TableName))
+    return std::move(KindMismatch);
+  auto Store = Found->cast<DataAllocatorHandle>();
+  assert(Store && "Already checked the kind");
+
+  // Success: bundle the file and the table handle into the implementation.
+  auto State = std::make_unique<ImplType>(
+      ImplType{DatabaseFile(std::move(*DB)), Store});
+  return OnDiskDataAllocator(std::move(State));
+}
+
+Expected<OnDiskDataAllocator::OnDiskPtr>
+OnDiskDataAllocator::allocate(size_t Size) {
+  // Ask the table handle for \p Size bytes from the file's arena.
+  auto Buffer = Impl->Store.allocate(Impl->File.getAlloc(), Size);
+  if (LLVM_UNLIKELY(!Buffer))
+    return Buffer.takeError();
+
+  // Express the buffer's position as an offset from the start of the region.
+  const auto OffsetInRegion = Buffer->data() - Impl->Store.getRegion().data();
+  return OnDiskPtr(FileOffset(OffsetInRegion), *Buffer);
+}
+
+/// Returns a read-only view of \p Size bytes starting at \p Offset in the
+/// mapped file. Fails with std::errc::protocol_error when the requested range
+/// extends past the arena's current size (\c getAlloc().size()); a range that
+/// ends exactly at that boundary is accepted.
+Expected<ArrayRef<char>> OnDiskDataAllocator::get(FileOffset Offset,
+                                                  size_t Size) const {
+  assert(Offset);
+  assert(Impl);
+  // Compare against the remaining space instead of computing Offset + Size,
+  // so an oversized \p Size cannot wrap around and slip past the check.
+  const uint64_t Allocated = Impl->File.getAlloc().size();
+  const uint64_t Begin = static_cast<uint64_t>(Offset.get());
+  if (Begin > Allocated || Size > Allocated - Begin)
+    return createStringError(make_error_code(std::errc::protocol_error),
+                             "requested size too large in allocator");
+  return ArrayRef<char>{Impl->File.getRegion().data() + Offset.get(), Size};
+}
+
+/// Forwards to the table handle, which locates the user header in the file.
+MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() {
+  DataAllocatorHandle &Handle = Impl->Store;
+  return Handle.getUserHeader();
+}
+
+/// Reports the size of the underlying database file, as given by its size().
+size_t OnDiskDataAllocator::size() const {
+  DatabaseFile &DB = Impl->File;
+  return DB.size();
+}
+/// Reports the size of the mapped region backing the database file.
+size_t OnDiskDataAllocator::capacity() const {
+  auto &Region = Impl->File.getRegion();
+  return Region.size();
+}
+
+OnDiskDataAllocator::OnDiskDataAllocator(std::unique_ptr<ImplType> Impl)
+    : Impl(std::move(Impl)) {}
+
+#else // !LLVM_ENABLE_ONDISK_CAS
+
+struct OnDiskDataAllocator::ImplType {};
+
+/// Builds the error reported by every fallible entry point when the on-disk
+/// CAS is compiled out.
+static Error createUnsupportedError() {
+  return createStringError(make_error_code(std::errc::not_supported),
+                           "OnDiskDataAllocator is not supported");
+}
+
+Expected<OnDiskDataAllocator> OnDiskDataAllocator::create(
+    const Twine &Path, const Twine &TableName, uint64_t MaxFileSize,
+    std::optional<uint64_t> NewFileInitialSize, uint32_t UserHeaderSize,
+    function_ref<void(void *)> UserHeaderInit) {
+  return createUnsupportedError();
+}
+
+Expected<OnDiskDataAllocator::OnDiskPtr>
+OnDiskDataAllocator::allocate(size_t Size) {
+  return createUnsupportedError();
+}
+
+Expected<ArrayRef<char>> OnDiskDataAllocator::get(FileOffset Offset,
+                                                  size_t Size) const {
+  return createUnsupportedError();
+}
+
+// The remaining accessors cannot fail, so they report empty/zero values.
+MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() {
+  return MutableArrayRef<uint8_t>();
+}
+
+size_t OnDiskDataAllocator::size() const { return 0; }
+size_t OnDiskDataAllocator::capacity() const { return 0; }
+
+#endif // LLVM_ENABLE_ONDISK_CAS
+
+OnDiskDataAllocator::OnDiskDataAllocator(OnDiskDataAllocator &&RHS) = default;
+OnDiskDataAllocator &
+OnDiskDataAllocator::operator=(OnDiskDataAllocator &&RHS) = default;
+OnDiskDataAllocator::~OnDiskDataAllocator() = default;
diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt
index 0f8fcb9e98954..ee40e6c9879a1 100644
--- a/llvm/unittests/CAS/CMakeLists.txt
+++ b/llvm/unittests/CAS/CMakeLists.txt
@@ -8,6 +8,7 @@ add_llvm_unittest(CASTests
   ActionCacheTest.cpp
   CASTestConfig.cpp
   ObjectStoreTest.cpp
+  OnDiskDataAllocatorTest.cpp
   OnDiskTrieRawHashMapTest.cpp
   ProgramTest.cpp
   )
diff --git a/llvm/unittests/CAS/OnDiskDataAllocatorTest.cpp b/llvm/unittests/CAS/OnDiskDataAllocatorTest.cpp
new file mode 100644
index 0000000000000..966fa03076841
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskDataAllocatorTest.cpp
@@ -0,0 +1,66 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskDataAllocator.h"
+#include "llvm/CAS/MappedFileRegionArena.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+
+#if LLVM_ENABLE_ONDISK_CAS
+
+using namespace llvm;
+using namespace llvm::cas;
+
+// Exercises creation, small allocations, exhaustion, size/capacity queries and
+// bounds-checked reads on a 1MB allocator file.
+TEST(OnDiskDataAllocatorTest, Allocate) {
+  unittest::TempDir Temp("data-allocator", /*Unique=*/true);
+  constexpr size_t MB = 1024u * 1024u;
+
+  std::optional<OnDiskDataAllocator> Allocator;
+  ASSERT_THAT_ERROR(OnDiskDataAllocator::create(
+                        Temp.path("allocator"), "data", /*MaxFileSize=*/MB,
+                        /*NewFileInitialSize=*/std::nullopt)
+                        .moveInto(Allocator),
+                    Succeeded());
+
+  // Allocate: every small allocation must land on an aligned offset.
+  for (size_t Size = 1; Size != 16; ++Size) {
+    OnDiskDataAllocator::OnDiskPtr Small;
+    ASSERT_THAT_ERROR(Allocator->allocate(Size).moveInto(Small), Succeeded());
+    const auto SmallOffset = Small.getOffset().get();
+    EXPECT_TRUE(isAligned(MappedFileRegionArena::getAlign(), SmallOffset));
+  }
+
+  // Out of space: a request as large as the whole file must fail.
+  OnDiskDataAllocator::OnDiskPtr TooBig;
+  ASSERT_THAT_ERROR(Allocator->allocate(MB).moveInto(TooBig), Failed());
+
+  // Check size and capacity.
+  ASSERT_EQ(Allocator->capacity(), MB);
+  ASSERT_LE(Allocator->size(), MB);
+
+  // Get: reading inside the allocation works, reading far past it fails.
+  OnDiskDataAllocator::OnDiskPtr Block;
+  ASSERT_THAT_ERROR(Allocator->allocate(32).moveInto(Block), Succeeded());
+  ArrayRef<char> Data;
+  ASSERT_THAT_ERROR(Allocator->get(Block.getOffset(), 16).moveInto(Data),
+                    Succeeded());
+  ASSERT_THAT_ERROR(Allocator->get(Block.getOffset(), 1025).moveInto(Data),
+                    Failed());
+}
+
+#endif // LLVM_ENABLE_ONDISK_CAS



More information about the llvm-commits mailing list