[llvm] [CAS] Add OnDiskDataAllocator (PR #161264)
Steven Wu via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 29 12:48:41 PDT 2025
https://github.com/cachemeifyoucan created https://github.com/llvm/llvm-project/pull/161264
Add OnDiskDataAllocator, which is the data pool implementation inside a
OnDiskCAS that stores data in a single file. It is a based on
MappedFileRegionArena and wrapped inside a CAS database file.
>From aafbb3816742c10ded59661934685d5588e0d2c8 Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu at apple.com>
Date: Mon, 29 Sep 2025 12:48:31 -0700
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
=?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.7
---
llvm/include/llvm/CAS/OnDiskDataAllocator.h | 94 ++++++++
llvm/lib/CAS/CMakeLists.txt | 1 +
llvm/lib/CAS/OnDiskDataAllocator.cpp | 225 ++++++++++++++++++
llvm/unittests/CAS/CMakeLists.txt | 1 +
.../unittests/CAS/OnDiskDataAllocatorTest.cpp | 55 +++++
5 files changed, 376 insertions(+)
create mode 100644 llvm/include/llvm/CAS/OnDiskDataAllocator.h
create mode 100644 llvm/lib/CAS/OnDiskDataAllocator.cpp
create mode 100644 llvm/unittests/CAS/OnDiskDataAllocatorTest.cpp
diff --git a/llvm/include/llvm/CAS/OnDiskDataAllocator.h b/llvm/include/llvm/CAS/OnDiskDataAllocator.h
new file mode 100644
index 0000000000000..40f5b0fb45d74
--- /dev/null
+++ b/llvm/include/llvm/CAS/OnDiskDataAllocator.h
@@ -0,0 +1,94 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file declares interface for OnDiskDataAllocator, a file backed data
+/// pool can be used to allocate space to store data packed in a single file. It
+/// is based on MappedFileRegionArena and includes a header in the beginning to
+/// provide metadata.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_ONDISKDATAALLOCATOR_H
+#define LLVM_CAS_ONDISKDATAALLOCATOR_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/CAS/FileOffset.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm::cas {
+
+/// Sink for data. Stores variable length data with 8-byte alignment. Does not
+/// track size of data, which is assumed to known from context, or embedded.
+/// Uses 0-padding but does not guarantee 0-termination.
+class OnDiskDataAllocator {
+public:
+ using ValueProxy = MutableArrayRef<char>;
+
+ /// An iterator-like return value for data insertion. Maybe it should be
+ /// called \c iterator, but it has no increment.
+ class pointer {
+ public:
+ FileOffset getOffset() const { return Offset; }
+ explicit operator bool() const { return bool(getOffset()); }
+ const ValueProxy &operator*() const {
+ assert(Offset && "Null dereference");
+ return Value;
+ }
+ const ValueProxy *operator->() const {
+ assert(Offset && "Null dereference");
+ return &Value;
+ }
+
+ pointer() = default;
+
+ private:
+ friend class OnDiskDataAllocator;
+ pointer(FileOffset Offset, ValueProxy Value)
+ : Offset(Offset), Value(Value) {}
+ FileOffset Offset;
+ ValueProxy Value;
+ };
+
+ /// Look up the data stored at the given offset.
+ const char *beginData(FileOffset Offset) const;
+
+ /// Allocate at least \p Size with 8-byte alignment.
+ Expected<pointer> allocate(size_t Size);
+
+ /// \returns the buffer that was allocated at \p create time, with size
+ /// \p UserHeaderSize.
+ MutableArrayRef<uint8_t> getUserHeader();
+
+ size_t size() const;
+ size_t capacity() const;
+
+ static Expected<OnDiskDataAllocator>
+ create(const Twine &Path, const Twine &TableName, uint64_t MaxFileSize,
+ std::optional<uint64_t> NewFileInitialSize,
+ uint32_t UserHeaderSize = 0,
+ function_ref<void(void *)> UserHeaderInit = nullptr);
+
+ OnDiskDataAllocator(OnDiskDataAllocator &&RHS);
+ OnDiskDataAllocator &operator=(OnDiskDataAllocator &&RHS);
+
+ // No copy. Just call \a create() again.
+ OnDiskDataAllocator(const OnDiskDataAllocator &) = delete;
+ OnDiskDataAllocator &operator=(const OnDiskDataAllocator &) = delete;
+
+ ~OnDiskDataAllocator();
+
+private:
+ struct ImplType;
+ explicit OnDiskDataAllocator(std::unique_ptr<ImplType> Impl);
+ std::unique_ptr<ImplType> Impl;
+};
+
+} // namespace llvm::cas
+
+#endif // LLVM_CAS_ONDISKDATAALLOCATOR_H
diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
index cc866f25f3240..884bc44ec8b69 100644
--- a/llvm/lib/CAS/CMakeLists.txt
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -7,6 +7,7 @@ add_llvm_component_library(LLVMCAS
MappedFileRegionArena.cpp
ObjectStore.cpp
OnDiskCommon.cpp
+ OnDiskDataAllocator.cpp
OnDiskTrieRawHashMap.cpp
ADDITIONAL_HEADER_DIRS
diff --git a/llvm/lib/CAS/OnDiskDataAllocator.cpp b/llvm/lib/CAS/OnDiskDataAllocator.cpp
new file mode 100644
index 0000000000000..739f351f0b78f
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskDataAllocator.cpp
@@ -0,0 +1,225 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file Implements OnDiskDataAllocator.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskDataAllocator.h"
+#include "DatabaseFile.h"
+#include "llvm/Config/llvm-config.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+OnDiskDataAllocator::OnDiskDataAllocator(OnDiskDataAllocator &&RHS) = default;
+OnDiskDataAllocator &
+OnDiskDataAllocator::operator=(OnDiskDataAllocator &&RHS) = default;
+OnDiskDataAllocator::~OnDiskDataAllocator() = default;
+
+#if LLVM_ENABLE_ONDISK_CAS
+
+//===----------------------------------------------------------------------===//
+// DataAllocator data structures.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// DataAllocator table layout:
+/// - [8-bytes: Generic table header]
+/// - 8-bytes: AllocatorOffset (reserved for implementing free lists)
+/// - 8-bytes: Size for user data header
+/// - <user data buffer>
+///
+/// Record layout:
+/// - <data>
+class DataAllocatorHandle {
+public:
+ static constexpr TableHandle::TableKind Kind =
+ TableHandle::TableKind::DataAllocator;
+
+ struct Header {
+ TableHandle::Header GenericHeader;
+ std::atomic<int64_t> AllocatorOffset;
+ const uint64_t UserHeaderSize;
+ };
+
+ operator TableHandle() const {
+ if (!H)
+ return TableHandle();
+ return TableHandle(*Region, H->GenericHeader);
+ }
+
+ Expected<MutableArrayRef<char>> allocate(MappedFileRegionArena &Alloc,
+ size_t DataSize) {
+ assert(&Alloc.getRegion() == Region);
+ auto Ptr = Alloc.allocate(DataSize);
+ if (LLVM_UNLIKELY(!Ptr))
+ return Ptr.takeError();
+ return MutableArrayRef(*Ptr, DataSize);
+ }
+
+ explicit operator bool() const { return H; }
+ const Header &getHeader() const { return *H; }
+ MappedFileRegion &getRegion() const { return *Region; }
+
+ MutableArrayRef<uint8_t> getUserHeader() {
+ return MutableArrayRef(reinterpret_cast<uint8_t *>(H + 1),
+ H->UserHeaderSize);
+ }
+
+ static Expected<DataAllocatorHandle>
+ create(MappedFileRegionArena &Alloc, StringRef Name, uint32_t UserHeaderSize);
+
+ DataAllocatorHandle() = default;
+ DataAllocatorHandle(MappedFileRegion &Region, Header &H)
+ : Region(&Region), H(&H) {}
+ DataAllocatorHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+ : DataAllocatorHandle(
+ Region, *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) {
+ }
+
+private:
+ MappedFileRegion *Region = nullptr;
+ Header *H = nullptr;
+};
+
+} // end anonymous namespace
+
+struct OnDiskDataAllocator::ImplType {
+ DatabaseFile File;
+ DataAllocatorHandle Store;
+};
+
+Expected<DataAllocatorHandle>
+DataAllocatorHandle::create(MappedFileRegionArena &Alloc, StringRef Name,
+ uint32_t UserHeaderSize) {
+ // Allocate.
+ auto Offset =
+ Alloc.allocateOffset(sizeof(Header) + UserHeaderSize + Name.size() + 1);
+ if (LLVM_UNLIKELY(!Offset))
+ return Offset.takeError();
+
+ // Construct the header and the name.
+ assert(Name.size() <= UINT16_MAX && "Expected smaller table name");
+ auto *H = new (Alloc.getRegion().data() + *Offset)
+ Header{{TableHandle::TableKind::DataAllocator, (uint16_t)Name.size(),
+ (int32_t)(sizeof(Header) + UserHeaderSize)},
+ /*AllocatorOffset=*/{0},
+ /*UserHeaderSize=*/UserHeaderSize};
+ memset(H + 1, 0, UserHeaderSize);
+ char *NameStorage = reinterpret_cast<char *>(H + 1) + UserHeaderSize;
+ llvm::copy(Name, NameStorage);
+ NameStorage[Name.size()] = 0;
+ return DataAllocatorHandle(Alloc.getRegion(), *H);
+}
+
+Expected<OnDiskDataAllocator> OnDiskDataAllocator::create(
+ const Twine &PathTwine, const Twine &TableNameTwine, uint64_t MaxFileSize,
+ std::optional<uint64_t> NewFileInitialSize, uint32_t UserHeaderSize,
+ function_ref<void(void *)> UserHeaderInit) {
+ assert(!UserHeaderSize || UserHeaderInit);
+ SmallString<128> PathStorage;
+ StringRef Path = PathTwine.toStringRef(PathStorage);
+ SmallString<128> TableNameStorage;
+ StringRef TableName = TableNameTwine.toStringRef(TableNameStorage);
+
+ // Constructor for if the file doesn't exist.
+ auto NewDBConstructor = [&](DatabaseFile &DB) -> Error {
+ auto Store =
+ DataAllocatorHandle::create(DB.getAlloc(), TableName, UserHeaderSize);
+ if (LLVM_UNLIKELY(!Store))
+ return Store.takeError();
+
+ if (auto E = DB.addTable(*Store))
+ return E;
+
+ if (UserHeaderSize)
+ UserHeaderInit(Store->getUserHeader().data());
+ return Error::success();
+ };
+
+ // Get or create the file.
+ Expected<DatabaseFile> File =
+ DatabaseFile::create(Path, MaxFileSize, NewDBConstructor);
+ if (!File)
+ return File.takeError();
+
+ // Find the table and validate it.
+ std::optional<TableHandle> Table = File->findTable(TableName);
+ if (!Table)
+ return createTableConfigError(std::errc::argument_out_of_domain, Path,
+ TableName, "table not found");
+ if (Error E = checkTable("table kind", (size_t)DataAllocatorHandle::Kind,
+ (size_t)Table->getHeader().Kind, Path, TableName))
+ return std::move(E);
+ auto Store = Table->cast<DataAllocatorHandle>();
+ assert(Store && "Already checked the kind");
+
+ // Success.
+ OnDiskDataAllocator::ImplType Impl{DatabaseFile(std::move(*File)), Store};
+ return OnDiskDataAllocator(std::make_unique<ImplType>(std::move(Impl)));
+}
+
+Expected<OnDiskDataAllocator::pointer>
+OnDiskDataAllocator::allocate(size_t Size) {
+ auto Data = Impl->Store.allocate(Impl->File.getAlloc(), Size);
+ if (LLVM_UNLIKELY(!Data))
+ return Data.takeError();
+
+ return pointer(FileOffset(Data->data() - Impl->Store.getRegion().data()),
+ *Data);
+}
+
+const char *OnDiskDataAllocator::beginData(FileOffset Offset) const {
+ assert(Offset);
+ assert(Impl);
+ assert(Offset.get() < Impl->File.getAlloc().size());
+ return Impl->File.getRegion().data() + Offset.get();
+}
+
+MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() {
+ return Impl->Store.getUserHeader();
+}
+
+size_t OnDiskDataAllocator::size() const { return Impl->File.size(); }
+size_t OnDiskDataAllocator::capacity() const {
+ return Impl->File.getRegion().size();
+}
+
+OnDiskDataAllocator::OnDiskDataAllocator(std::unique_ptr<ImplType> Impl)
+ : Impl(std::move(Impl)) {}
+
+#else // !LLVM_ENABLE_ONDISK_CAS
+
+struct OnDiskDataAllocator::ImplType {};
+
+Expected<OnDiskDataAllocator> OnDiskDataAllocator::create(
+ const Twine &Path, const Twine &TableName, uint64_t MaxFileSize,
+ std::optional<uint64_t> NewFileInitialSize, uint32_t UserHeaderSize,
+ function_ref<void(void *)> UserHeaderInit) {
+ return createStringError(make_error_code(std::errc::not_supported),
+ "OnDiskDataAllocator is not supported");
+}
+
+Expected<OnDiskDataAllocator::pointer>
+OnDiskDataAllocator::allocate(size_t Size) {
+ return createStringError(make_error_code(std::errc::not_supported),
+ "OnDiskDataAllocator is not supported");
+}
+
+const char *OnDiskDataAllocator::beginData(FileOffset Offset) const {
+ return nullptr;
+}
+
+MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() { return {}; }
+
+size_t OnDiskDataAllocator::size() const { return 0; }
+size_t OnDiskDataAllocator::capacity() const { return 0; }
+
+#endif // LLVM_ENABLE_ONDISK_CAS
diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt
index 0f8fcb9e98954..ee40e6c9879a1 100644
--- a/llvm/unittests/CAS/CMakeLists.txt
+++ b/llvm/unittests/CAS/CMakeLists.txt
@@ -8,6 +8,7 @@ add_llvm_unittest(CASTests
ActionCacheTest.cpp
CASTestConfig.cpp
ObjectStoreTest.cpp
+ OnDiskDataAllocatorTest.cpp
OnDiskTrieRawHashMapTest.cpp
ProgramTest.cpp
)
diff --git a/llvm/unittests/CAS/OnDiskDataAllocatorTest.cpp b/llvm/unittests/CAS/OnDiskDataAllocatorTest.cpp
new file mode 100644
index 0000000000000..38ef36afffbc5
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskDataAllocatorTest.cpp
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskDataAllocator.h"
+#include "llvm/CAS/MappedFileRegionArena.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+
+#if LLVM_ENABLE_ONDISK_CAS
+
+using namespace llvm;
+using namespace llvm::cas;
+
+TEST(OnDiskDataAllocatorTest, Allocate) {
+ unittest::TempDir Temp("data-allocator", /*Unique=*/true);
+ constexpr size_t MB = 1024u * 1024u;
+
+ std::optional<OnDiskDataAllocator> Allocator;
+ ASSERT_THAT_ERROR(OnDiskDataAllocator::create(
+ Temp.path("allocator"), "data", /*MaxFileSize=*/MB,
+ /*NewFileInitialSize=*/std::nullopt)
+ .moveInto(Allocator),
+ Succeeded());
+
+ // Allocate.
+ {
+ for (size_t Size = 1; Size < 16; ++Size) {
+ OnDiskDataAllocator::pointer P;
+ ASSERT_THAT_ERROR(Allocator->allocate(Size).moveInto(P), Succeeded());
+ ASSERT_TRUE(
+ isAligned(MappedFileRegionArena::getAlign(), P.getOffset().get()));
+ }
+ }
+
+ // Out of space.
+ {
+ OnDiskDataAllocator::pointer P;
+ ASSERT_THAT_ERROR(Allocator->allocate(MB).moveInto(P), Failed());
+ }
+
+ // Check size and capacity.
+ {
+ ASSERT_EQ(Allocator->capacity(), MB);
+ ASSERT_LE(Allocator->size(), MB);
+ }
+}
+
+#endif // LLVM_ENABLE_ONDISK_CAS
More information about the llvm-commits
mailing list