[llvm] [ADT] Add TrieRawHashMap (PR #69528)
Justin Bogner via llvm-commits
llvm-commits at lists.llvm.org
Sat Oct 26 17:40:55 PDT 2024
================
@@ -0,0 +1,379 @@
+//===- TrieRawHashMap.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_TRIERAWHASHMAP_H
+#define LLVM_ADT_TRIERAWHASHMAP_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
+#include <atomic>
+#include <optional>
+
+namespace llvm {
+
+class raw_ostream;
+
+/// TrieRawHashMap - is a lock-free thread-safe trie that can be used to
+/// store/index data based on a hash value. It can be customized to work with
+/// any hash algorithm or store any data.
+///
+/// Data structure:
+/// Data node stored in the Trie contains both hash and data:
+/// struct {
+/// HashT Hash;
+/// DataT Data;
+/// };
+///
+/// Data is stored/indexed via a prefix tree, where each node in the tree can be
+/// either the root, a sub-trie or a data node. Assuming a 4-bit hash and two
+/// data objects {0001, A} and {0100, B}, it can be stored in a trie
+/// (assuming Root has 2 bits, SubTrie has 1 bit):
+/// +--------+
+/// |Root[00]| -> {0001, A}
+/// | [01]| -> {0100, B}
+/// | [10]| (empty)
+/// | [11]| (empty)
+/// +--------+
+///
+/// Inserting a new object {0010, C} will result in:
+/// +--------+ +----------+
+/// |Root[00]| -> |SubTrie[0]| -> {0001, A}
+/// | | | [1]| -> {0010, C}
+/// | | +----------+
+/// | [01]| -> {0100, B}
+/// | [10]| (empty)
+/// | [11]| (empty)
+/// +--------+
+/// Note object A is sunk down to a sub-trie during the insertion. All the
+/// nodes are inserted through compare-exchange to keep insertion thread-safe
+/// and lock-free.
+///
+/// To find an object in the trie, walk the tree with prefix of the hash until
+/// the data node is found. Then the hash is compared with the hash stored in
+/// the data node to see if it is the same object.
+///
+/// Hash collision is not allowed so it is recommended to use trie with a
+/// "strong" hashing algorithm. A well-distributed hash can also result in
+/// better performance and memory usage.
+///
+/// It currently does not support iteration and deletion.
+
+/// Base class for a lock-free thread-safe hash-mapped trie.
+class ThreadSafeTrieRawHashMapBase {
+public:
+ static constexpr size_t TrieContentBaseSize = 4;
+ static constexpr size_t DefaultNumRootBits = 6;
+ static constexpr size_t DefaultNumSubtrieBits = 4;
+
+private:
+ /// Layout model for one allocation holding a value of type \p T:
+ /// \c TrieContentBaseSize bytes of base storage followed by storage that is
+ /// sized and aligned for \p T. The DefaultContentAllocSize,
+ /// DefaultContentAllocAlign and DefaultContentOffset constants are computed
+ /// from this struct via sizeof / alignof / offsetof.
+ template <class T> struct AllocValueType {
+ // Leading bytes reserved ahead of the content.
+ // NOTE(review): presumably the trie's per-content header -- confirm in the
+ // implementation file.
+ char Base[TrieContentBaseSize];
+ // Uninitialized storage suitably sized and aligned to hold a T.
+ std::aligned_union_t<sizeof(T), T> Content;
+ };
+
+protected:
+ template <class T>
+ static constexpr size_t DefaultContentAllocSize = sizeof(AllocValueType<T>);
+
+ template <class T>
+ static constexpr size_t DefaultContentAllocAlign = alignof(AllocValueType<T>);
+
+ template <class T>
+ static constexpr size_t DefaultContentOffset =
+ offsetof(AllocValueType<T>, Content);
+
+public:
+ void operator delete(void *Ptr) { ::free(Ptr); }
+
+ LLVM_DUMP_METHOD void dump() const;
+ void print(raw_ostream &OS) const;
+
+protected:
+ /// Result of a lookup. Suitable for an insertion hint. Maybe could be
+ /// expanded into an iterator of sorts, but likely not useful (visiting
+ /// everything in the trie should probably be done some way other than
+ /// through an iterator pattern).
+ class PointerBase {
+ protected:
+ // Returns P when this object carries the content tag (I == -2u, as set by
+ // the single-argument constructor); otherwise returns nullptr.
+ void *get() const { return I == -2u ? P : nullptr; }
+
+ public:
+ // Default state comes from the member initializers below: P is null, I is
+ // -1u and B is 0, so get() returns nullptr and isHint() is false.
+ PointerBase() noexcept = default;
+
+ private:
+ friend class ThreadSafeTrieRawHashMapBase;
+ // Wraps a pointer to stored content and tags it with I == -2u so that get()
+ // hands it back.
+ explicit PointerBase(void *Content) : P(Content), I(-2u) {}
+ // Stores the three fields verbatim.
+ // NOTE(review): presumably P/I/B describe an insertion hint (a trie node, a
+ // slot index and a bit offset) -- confirm against the implementation file.
+ PointerBase(void *P, unsigned I, unsigned B) : P(P), I(I), B(B) {}
+
+ // True when I is neither of the two reserved values: -1u (the default) or
+ // -2u (the content tag).
+ bool isHint() const { return I != -1u && I != -2u; }
+
+ // Either the stored content (when I == -2u) or data supplied through the
+ // three-argument constructor.
+ void *P = nullptr;
+ // Tag / index field; -1u and -2u are reserved as described above.
+ unsigned I = -1u;
+ // Set only by the three-argument constructor; 0 otherwise.
+ unsigned B = 0;
+ };
+
+ /// Find the stored content with hash.
+ PointerBase find(ArrayRef<uint8_t> Hash) const;
+
+ /// Insert and return the stored content.
+ PointerBase
+ insert(PointerBase Hint, ArrayRef<uint8_t> Hash,
+ function_ref<const uint8_t *(void *Mem, ArrayRef<uint8_t> Hash)>
+ Constructor);
+
+ ThreadSafeTrieRawHashMapBase() = delete;
+
+ ThreadSafeTrieRawHashMapBase(
+ size_t ContentAllocSize, size_t ContentAllocAlign, size_t ContentOffset,
+ std::optional<size_t> NumRootBits = std::nullopt,
+ std::optional<size_t> NumSubtrieBits = std::nullopt);
+
+ /// Destructor, which asserts if there's anything to do. Subclasses should
+ /// call \a destroyImpl().
+ ///
+ /// \pre \a destroyImpl() was already called.
+ ~ThreadSafeTrieRawHashMapBase();
+ void destroyImpl(function_ref<void(void *ValueMem)> Destructor);
+
+ ThreadSafeTrieRawHashMapBase(ThreadSafeTrieRawHashMapBase &&RHS);
+
+ // Move assignment is not supported. It could be implemented in a
+ // thread-safe way if NumRootBits and NumSubtrieBits were stored inside the
+ // Root.
+ ThreadSafeTrieRawHashMapBase &
+ operator=(ThreadSafeTrieRawHashMapBase &&RHS) = delete;
----------------
bogner wrote:
Is the comment here meant to be a TODO/FIXME, or just general commentary? It's a bit confusing to have a comment that says something could be done but then see that the code does not do it. Maybe rewording it like "We don't do X because of Y, but if we needed to then we could do Z" would make it clearer.
https://github.com/llvm/llvm-project/pull/69528
More information about the llvm-commits
mailing list