[llvm] [ADT] Add TrieRawHashMap (PR #69528)
David Blaikie via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 19 14:45:03 PDT 2023
================
@@ -0,0 +1,398 @@
+//===- TrieRawHashMap.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_TRIERAWHASHMAP_H
+#define LLVM_ADT_TRIERAWHASHMAP_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
+#include <atomic>
+#include <optional>
+
+namespace llvm {
+
+class raw_ostream;
+
+/// TrieRawHashMap - is a lock-free thread-safe trie that is can be used to
+/// store/index data based on a hash value. It can be customized to work with
+/// any hash algorithm or store any data.
+///
+/// Data structure:
+/// Data node stored in the Trie contains both hash and data:
+/// struct {
+/// HashT Hash;
+/// DataT Data;
+/// };
+///
+/// Data is stored/indexed via a prefix tree, where each node in the tree can be
+/// either the root, a sub-trie or a data node. Assuming a 4-bit hash and two
+/// data objects {0001, A} and {0100, B}, it can be stored in a trie
+/// (assuming Root has 2 bits, SubTrie has 1 bit):
+/// +--------+
+/// |Root[00]| -> {0001, A}
+/// | [01]| -> {0100, B}
+/// | [10]| (empty)
+/// | [11]| (empty)
+/// +--------+
+///
+/// Inserting a new object {0010, C} will result in:
+/// +--------+ +----------+
+/// |Root[00]| -> |SubTrie[0]| -> {0001, A}
+/// | | | [1]| -> {0010, C}
+/// | | +----------+
+/// | [01]| -> {0100, B}
+/// | [10]| (empty)
+/// | [11]| (empty)
+/// +--------+
+/// Note object A is sunk down to a sub-trie during the insertion. All the
+/// nodes are inserted through compare-exchange to ensure thread-safe and
+/// lock-free.
+///
+/// To find an object in the trie, walk the tree with prefix of the hash until
+/// the data node is found. Then the hash is compared with the hash stored in
+/// the data node to see if the is the same object.
+///
+/// Hash collision is not allowed so it is recommended to use trie with a
+/// "strong" hashing algorithm. A well-distributed hash can also result in
+/// better performance and memory usage.
+///
+/// It currently does not support iteration and deletion.
+
+/// Base class for a lock-free thread-safe hash-mapped trie.
+class ThreadSafeTrieRawHashMapBase {
+public:
+ static constexpr size_t TrieContentBaseSize = 4;
+ static constexpr size_t DefaultNumRootBits = 6;
+ static constexpr size_t DefaultNumSubtrieBits = 4;
+
+private:
+ template <class T> struct AllocValueType {
+ char Base[TrieContentBaseSize];
+ std::aligned_union_t<sizeof(T), T> Content;
+ };
+
+protected:
+ template <class T>
+ static constexpr size_t DefaultContentAllocSize = sizeof(AllocValueType<T>);
+
+ template <class T>
+ static constexpr size_t DefaultContentAllocAlign = alignof(AllocValueType<T>);
+
+ template <class T>
+ static constexpr size_t DefaultContentOffset =
+ offsetof(AllocValueType<T>, Content);
+
+public:
+ void operator delete(void *Ptr) { ::free(Ptr); }
+
+ LLVM_DUMP_METHOD void dump() const;
+ void print(raw_ostream &OS) const;
+
+protected:
+ /// Result of a lookup. Suitable for an insertion hint. Maybe could be
+ /// expanded into an iterator of sorts, but likely not useful (visiting
+ /// everything in the trie should probably be done some way other than
+ /// through an iterator pattern).
+ class PointerBase {
+ protected:
+ void *get() const { return I == -2u ? P : nullptr; }
+
+ public:
+ PointerBase() noexcept = default;
+ PointerBase(PointerBase &&) = default;
+ PointerBase(const PointerBase &) = default;
+ PointerBase &operator=(PointerBase &&) = default;
+ PointerBase &operator=(const PointerBase &) = default;
+
+ private:
+ friend class ThreadSafeTrieRawHashMapBase;
+ explicit PointerBase(void *Content) : P(Content), I(-2u) {}
+ PointerBase(void *P, unsigned I, unsigned B) : P(P), I(I), B(B) {}
+
+ bool isHint() const { return I != -1u && I != -2u; }
+
+ void *P = nullptr;
+ unsigned I = -1u;
+ unsigned B = 0;
+ };
+
+ /// Find the stored content with hash.
+ PointerBase find(ArrayRef<uint8_t> Hash) const;
+
+ /// Insert and return the stored content.
+ PointerBase
+ insert(PointerBase Hint, ArrayRef<uint8_t> Hash,
+ function_ref<const uint8_t *(void *Mem, ArrayRef<uint8_t> Hash)>
+ Constructor);
+
+ ThreadSafeTrieRawHashMapBase() = delete;
+
+ ThreadSafeTrieRawHashMapBase(
+ size_t ContentAllocSize, size_t ContentAllocAlign, size_t ContentOffset,
+ std::optional<size_t> NumRootBits = std::nullopt,
+ std::optional<size_t> NumSubtrieBits = std::nullopt);
+
+ /// Destructor, which asserts if there's anything to do. Subclasses should
+ /// call \a destroyImpl().
+ ///
+ /// \pre \a destroyImpl() was already called.
+ ~ThreadSafeTrieRawHashMapBase();
+ void destroyImpl(function_ref<void(void *ValueMem)> Destructor);
+
+ ThreadSafeTrieRawHashMapBase(ThreadSafeTrieRawHashMapBase &&RHS);
+
+ // Move assignment can be implemented in a thread-safe way if NumRootBits and
+ // NumSubtrieBits are stored inside the Root.
+ ThreadSafeTrieRawHashMapBase &
+ operator=(ThreadSafeTrieRawHashMapBase &&RHS) = delete;
+
+ // No copy.
+ ThreadSafeTrieRawHashMapBase(const ThreadSafeTrieRawHashMapBase &) = delete;
+ ThreadSafeTrieRawHashMapBase &
+ operator=(const ThreadSafeTrieRawHashMapBase &) = delete;
+
+ // Debug functions. Implementation details and not guaranteed to be
+ // thread-safe.
+ PointerBase getRoot() const;
+ unsigned getStartBit(PointerBase P) const;
+ unsigned getNumBits(PointerBase P) const;
+ unsigned getNumSlotUsed(PointerBase P) const;
+ std::string getTriePrefixAsString(PointerBase P) const;
+ unsigned getNumTries() const;
+ // Visit next trie in the allocation chain.
+ PointerBase getNextTrie(PointerBase P) const;
+
+private:
+ friend class TrieRawHashMapTestHelper;
+ const unsigned short ContentAllocSize;
+ const unsigned short ContentAllocAlign;
+ const unsigned short ContentOffset;
+ unsigned short NumRootBits;
+ unsigned short NumSubtrieBits;
+ struct ImplType;
+ // ImplPtr is owned by ThreadSafeTrieRawHashMapBase and needs to be freed in
+ // destoryImpl.
+ std::atomic<ImplType *> ImplPtr;
+ ImplType &getOrCreateImpl();
+ ImplType *getImpl() const;
+};
+
+/// Lock-free thread-safe hash-mapped trie.
+template <class T, size_t NumHashBytes>
+class ThreadSafeTrieRawHashMap : public ThreadSafeTrieRawHashMapBase {
+public:
+ using HashT = std::array<uint8_t, NumHashBytes>;
+
+ class LazyValueConstructor;
+ struct value_type {
+ const HashT Hash;
+ T Data;
+
+ value_type(value_type &&) = default;
+ value_type(const value_type &) = default;
+
+ value_type(ArrayRef<uint8_t> Hash, const T &Data)
+ : Hash(makeHash(Hash)), Data(Data) {}
+ value_type(ArrayRef<uint8_t> Hash, T &&Data)
+ : Hash(makeHash(Hash)), Data(std::move(Data)) {}
+
+ private:
+ friend class LazyValueConstructor;
+
+ struct EmplaceTag {};
+ template <class... ArgsT>
+ value_type(ArrayRef<uint8_t> Hash, EmplaceTag, ArgsT &&...Args)
+ : Hash(makeHash(Hash)), Data(std::forward<ArgsT>(Args)...) {}
+
+ static HashT makeHash(ArrayRef<uint8_t> HashRef) {
+ HashT Hash;
+ std::copy(HashRef.begin(), HashRef.end(), Hash.data());
+ return Hash;
+ }
+ };
+
+ using ThreadSafeTrieRawHashMapBase::operator delete;
+ using HashType = HashT;
+
+ using ThreadSafeTrieRawHashMapBase::dump;
+ using ThreadSafeTrieRawHashMapBase::print;
+
+private:
+ template <class ValueT> class PointerImpl : PointerBase {
+ friend class ThreadSafeTrieRawHashMap;
+
+ ValueT *get() const {
+ if (void *B = PointerBase::get())
+ return reinterpret_cast<ValueT *>(B);
----------------
dwblaikie wrote:
I'd assume ` reinterpret_cast` on a null pointer is valid/returns null, so this function could be simplified to:
```
return reinterpret_cast<ValueT*>(PointerBase::get());
```
maybe?
https://github.com/llvm/llvm-project/pull/69528
More information about the llvm-commits
mailing list