[clang-tools-extra] r342965 - [clangd] Implement VByte PostingList compression
Kirill Bobyrev via cfe-commits
cfe-commits at lists.llvm.org
Tue Sep 25 04:54:51 PDT 2018
Author: omtcyfz
Date: Tue Sep 25 04:54:51 2018
New Revision: 342965
URL: http://llvm.org/viewvc/llvm-project?rev=342965&view=rev
Log:
[clangd] Implement VByte PostingList compression
This patch implements Variable-length Byte compression of `PostingList`s
to sacrifice some performance for lower memory consumption.
`PostingList` compression and decompression were extensively tested using a
fuzzer for multiple hours and by running a significant number of realistic
`FuzzyFindRequests`. AddressSanitizer and UndefinedBehaviorSanitizer
were used to ensure the correct behaviour.
Performance evaluation was conducted with recent LLVM symbol index (292k
symbols) and the collection of user-recorded queries (7751
`FuzzyFindRequest` JSON dumps):
| Metrics | Before| After | Change (%)
| ----- | ----- | ----- | -----
| Memory consumption (posting lists only), MB | 54.4 | 23.5 | -60%
| Time to process queries, sec | 7.70 | 9.4 | +25%
Reviewers: sammccall, ioeric
Reviewed By: sammccall
Differential Revision: https://reviews.llvm.org/D52300
Modified:
clang-tools-extra/trunk/clangd/index/dex/Dex.cpp
clang-tools-extra/trunk/clangd/index/dex/PostingList.cpp
clang-tools-extra/trunk/clangd/index/dex/PostingList.h
clang-tools-extra/trunk/unittests/clangd/DexTests.cpp
Modified: clang-tools-extra/trunk/clangd/index/dex/Dex.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/index/dex/Dex.cpp?rev=342965&r1=342964&r2=342965&view=diff
==============================================================================
--- clang-tools-extra/trunk/clangd/index/dex/Dex.cpp (original)
+++ clang-tools-extra/trunk/clangd/index/dex/Dex.cpp Tue Sep 25 04:54:51 2018
@@ -128,8 +128,8 @@ void Dex::buildIndex() {
// Convert lists of items to posting lists.
for (const auto &TokenToPostingList : TempInvertedIndex)
- InvertedIndex.insert({TokenToPostingList.first,
- PostingList(move(TokenToPostingList.second))});
+ InvertedIndex.insert(
+ {TokenToPostingList.first, PostingList(TokenToPostingList.second)});
vlog("Built Dex with estimated memory usage {0} bytes.",
estimateMemoryUsage());
Modified: clang-tools-extra/trunk/clangd/index/dex/PostingList.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/index/dex/PostingList.cpp?rev=342965&r1=342964&r2=342965&view=diff
==============================================================================
--- clang-tools-extra/trunk/clangd/index/dex/PostingList.cpp (original)
+++ clang-tools-extra/trunk/clangd/index/dex/PostingList.cpp Tue Sep 25 04:54:51 2018
@@ -9,6 +9,8 @@
#include "PostingList.h"
#include "Iterator.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MathExtras.h"
namespace clang {
namespace clangd {
@@ -16,21 +18,27 @@ namespace dex {
namespace {
-/// Implements Iterator over std::vector<DocID>. This is the most basic
-/// iterator and is simply a wrapper around
-/// std::vector<DocID>::const_iterator.
-class PlainIterator : public Iterator {
+/// Implements iterator of PostingList chunks. This requires iterating over two
+/// levels: the first level iterator iterates over the chunks and decompresses
+/// them on-the-fly when the contents of chunk are to be seen.
+class ChunkIterator : public Iterator {
public:
- explicit PlainIterator(llvm::ArrayRef<DocID> Documents)
- : Documents(Documents), Index(std::begin(Documents)) {}
+ explicit ChunkIterator(llvm::ArrayRef<Chunk> Chunks)
+ : Chunks(Chunks), CurrentChunk(Chunks.begin()) {
+ if (!Chunks.empty()) {
+ DecompressedChunk = CurrentChunk->decompress();
+ CurrentID = DecompressedChunk.begin();
+ }
+ }
- bool reachedEnd() const override { return Index == std::end(Documents); }
+ bool reachedEnd() const override { return CurrentChunk == Chunks.end(); }
/// Advances cursor to the next item.
void advance() override {
assert(!reachedEnd() &&
"Posting List iterator can't advance() at the end.");
- ++Index;
+ ++CurrentID;
+ normalizeCursor();
}
/// Applies binary search to advance cursor to the next item with DocID
@@ -38,16 +46,17 @@ public:
void advanceTo(DocID ID) override {
assert(!reachedEnd() &&
"Posting List iterator can't advance() at the end.");
- // If current ID is beyond requested one, iterator is already in the right
- // state.
- if (peek() < ID)
- Index = std::lower_bound(Index, std::end(Documents), ID);
+ if (ID <= peek())
+ return;
+ advanceToChunk(ID);
+ // Try to find ID within current chunk.
+ CurrentID = std::lower_bound(CurrentID, std::end(DecompressedChunk), ID);
+ normalizeCursor();
}
DocID peek() const override {
- assert(!reachedEnd() &&
- "Posting List iterator can't peek() at the end.");
- return *Index;
+ assert(!reachedEnd() && "Posting List iterator can't peek() at the end.");
+ return *CurrentID;
}
float consume() override {
@@ -56,27 +65,160 @@ public:
return DEFAULT_BOOST_SCORE;
}
- size_t estimateSize() const override { return Documents.size(); }
+ size_t estimateSize() const override {
+ return Chunks.size() * ApproxEntriesPerChunk;
+ }
private:
llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
OS << '[';
- if (Index != std::end(Documents))
- OS << *Index;
- else
- OS << "END";
+ if (CurrentChunk != Chunks.begin() ||
+ (CurrentID != DecompressedChunk.begin() && !DecompressedChunk.empty()))
+ OS << "... ";
+ OS << (reachedEnd() ? "END" : std::to_string(*CurrentID));
+ if (!reachedEnd() && CurrentID < DecompressedChunk.end() - 1)
+ OS << " ...";
OS << ']';
return OS;
}
- llvm::ArrayRef<DocID> Documents;
- llvm::ArrayRef<DocID>::const_iterator Index;
+ /// If the cursor is at the end of a chunk, place it at the start of the next
+ /// chunk.
+ void normalizeCursor() {
+ // Invariant is already established if examined chunk is not exhausted.
+ if (CurrentID != std::end(DecompressedChunk))
+ return;
+ // Advance to next chunk if current one is exhausted.
+ ++CurrentChunk;
+ if (CurrentChunk == Chunks.end()) // Reached the end of PostingList.
+ return;
+ DecompressedChunk = CurrentChunk->decompress();
+ CurrentID = DecompressedChunk.begin();
+ }
+
+ /// Advances CurrentChunk to the chunk which might contain ID.
+ void advanceToChunk(DocID ID) {
+ if ((CurrentChunk != Chunks.end() - 1) &&
+ ((CurrentChunk + 1)->Head <= ID)) {
+ // Find the first chunk with Head > ID; the chunk before it may contain ID.
+ CurrentChunk = std::lower_bound(
+ CurrentChunk + 1, Chunks.end(), ID,
+ [](const Chunk &C, const DocID ID) { return C.Head <= ID; });
+ --CurrentChunk;
+ DecompressedChunk = CurrentChunk->decompress();
+ CurrentID = DecompressedChunk.begin();
+ }
+ }
+
+ llvm::ArrayRef<Chunk> Chunks;
+ /// Iterator over chunks.
+ /// If CurrentChunk is valid, then DecompressedChunk is
+ /// CurrentChunk->decompress() and CurrentID is a valid (non-end) iterator
+ /// into it.
+ decltype(Chunks)::const_iterator CurrentChunk;
+ llvm::SmallVector<DocID, Chunk::PayloadSize + 1> DecompressedChunk;
+ /// Iterator over DecompressedChunk.
+ decltype(DecompressedChunk)::iterator CurrentID;
+
+ static constexpr size_t ApproxEntriesPerChunk = 15;
};
+static constexpr size_t BitsPerEncodingByte = 7;
+
+/// Writes a variable length DocID into the buffer and updates the buffer size.
+/// If it doesn't fit, returns false and doesn't write to the buffer.
+bool encodeVByte(DocID Delta, llvm::MutableArrayRef<uint8_t> &Payload) {
+ assert(Delta != 0 && "0 is not a valid PostingList delta.");
+ // Calculate number of bytes Delta encoding would take by examining the
+ // meaningful bits.
+ unsigned Width = 1 + llvm::findLastSet(Delta) / BitsPerEncodingByte;
+ if (Width > Payload.size())
+ return false;
+
+ do {
+ uint8_t Encoding = Delta & 0x7f;
+ Delta >>= 7;
+ Payload.front() = Delta ? Encoding | 0x80 : Encoding;
+ Payload = Payload.drop_front();
+ } while (Delta != 0);
+ return true;
+}
+
+/// Use Variable-length Byte (VByte) delta encoding to compress sorted list of
+/// DocIDs. The compression stores deltas (differences) between subsequent
+/// DocIDs and encodes these deltas utilizing the least possible number of
+/// bytes.
+///
+/// Each encoding byte consists of two parts: the first bit (continuation bit)
+/// indicates whether this is the last byte (0 if this byte is the last) of
+/// current encoding and seven bits are a piece of DocID (payload). DocID has
+/// 32 bits and therefore it takes up to 5 bytes to encode it (4 full 7-bit
+/// payloads and one 4-bit payload), but in practice it is expected that gaps
+/// (deltas) between subsequent DocIDs are not large enough to require 5 bytes.
+/// In very dense posting lists (with average gaps less than 128) this
+/// representation would be 4 times more efficient than raw DocID array.
+///
+/// PostingList encoding example:
+///
+/// DocIDs 42 47 7000
+/// gaps 5 6958
+/// Encoding (raw number) 00000101 10110110 00101110
+std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) {
+ assert(!Documents.empty() && "Can't encode empty sequence.");
+ std::vector<Chunk> Result;
+ Result.emplace_back();
+ DocID Last = Result.back().Head = Documents.front();
+ llvm::MutableArrayRef<uint8_t> RemainingPayload = Result.back().Payload;
+ for (DocID Doc : Documents.drop_front()) {
+ if (!encodeVByte(Doc - Last, RemainingPayload)) { // didn't fit, flush chunk
+ Result.emplace_back();
+ Result.back().Head = Doc;
+ RemainingPayload = Result.back().Payload;
+ }
+ Last = Doc;
+ }
+ return std::vector<Chunk>(Result); // no move, shrink-to-fit
+}
+
+/// Reads variable length DocID from the buffer and updates the buffer size. If
+/// the stream is terminated, return None.
+llvm::Optional<DocID> readVByte(llvm::ArrayRef<uint8_t> &Bytes) {
+ if (Bytes.front() == 0 || Bytes.empty())
+ return llvm::None;
+ DocID Result = 0;
+ bool HasNextByte = true;
+ for (size_t Length = 0; HasNextByte && !Bytes.empty(); ++Length) {
+ assert(Length <= 5 && "Malformed VByte encoding sequence.");
+ // Write meaningful bits to the correct place in the document decoding.
+ Result |= (Bytes.front() & 0x7f) << (BitsPerEncodingByte * Length);
+ if ((Bytes.front() & 0x80) == 0)
+ HasNextByte = false;
+ Bytes = Bytes.drop_front();
+ }
+ return Result;
+}
+
} // namespace
+llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Chunk::decompress() const {
+ llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Result{Head};
+ llvm::ArrayRef<uint8_t> Bytes(Payload);
+ DocID Delta;
+ for (DocID Current = Head; !Bytes.empty(); Current += Delta) {
+ auto MaybeDelta = readVByte(Bytes);
+ if (!MaybeDelta)
+ break;
+ Delta = *MaybeDelta;
+ Result.push_back(Current + Delta);
+ }
+ return llvm::SmallVector<DocID, Chunk::PayloadSize + 1>{Result};
+}
+
+PostingList::PostingList(llvm::ArrayRef<DocID> Documents)
+ : Chunks(encodeStream(Documents)) {}
+
std::unique_ptr<Iterator> PostingList::iterator() const {
- return llvm::make_unique<PlainIterator>(Documents);
+ return llvm::make_unique<ChunkIterator>(Chunks);
}
} // namespace dex
Modified: clang-tools-extra/trunk/clangd/index/dex/PostingList.h
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/index/dex/PostingList.h?rev=342965&r1=342964&r2=342965&view=diff
==============================================================================
--- clang-tools-extra/trunk/clangd/index/dex/PostingList.h (original)
+++ clang-tools-extra/trunk/clangd/index/dex/PostingList.h Tue Sep 25 04:54:51 2018
@@ -6,13 +6,19 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This defines posting list interface: a storage for identifiers of symbols
-// which can be characterized by a specific feature (such as fuzzy-find trigram,
-// scope, type or any other Search Token). Posting lists can be traversed in
-// order using an iterator and are values for inverted index, which maps search
-// tokens to corresponding posting lists.
-//
+///
+/// \file
+/// This defines posting list interface: a storage for identifiers of symbols
+/// which can be characterized by a specific feature (such as fuzzy-find
+/// trigram, scope, type or any other Search Token). Posting lists can be
+/// traversed in order using an iterator and are values for inverted index,
+/// which maps search tokens to corresponding posting lists.
+///
+/// In order to decrease size of Index in-memory representation, Variable Byte
+/// Encoding (VByte) is used for PostingLists compression. An overview of VByte
+/// algorithm can be found in "Introduction to Information Retrieval" book:
+/// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html
+///
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
@@ -20,6 +26,7 @@
#include "Iterator.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include <cstdint>
#include <vector>
@@ -29,20 +36,43 @@ namespace dex {
class Iterator;
+/// NOTE: This is an implementation detail.
+///
+/// Chunk is a fixed-width piece of PostingList which contains the first DocID
+/// in uncompressed format (Head) and delta-encoded Payload. It can be
+/// decompressed upon request.
+struct Chunk {
+ /// Keep sizeof(Chunk) == 32.
+ static constexpr size_t PayloadSize = 32 - sizeof(DocID);
+
+ llvm::SmallVector<DocID, PayloadSize + 1> decompress() const;
+
+ /// The first element of decompressed Chunk.
+ DocID Head;
+ /// VByte-encoded deltas.
+ std::array<uint8_t, PayloadSize> Payload = std::array<uint8_t, PayloadSize>();
+};
+static_assert(sizeof(Chunk) == 32, "Chunk should take 32 bytes of memory.");
+
/// PostingList is the storage of DocIDs which can be inserted to the Query
-/// Tree as a leaf by constructing Iterator over the PostingList object.
-// FIXME(kbobyrev): Use VByte algorithm to compress underlying data.
+/// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs
+/// are stored in underlying chunks. Compression saves memory at a small cost
+/// in access time, which is still fast enough in practice.
class PostingList {
public:
- explicit PostingList(const std::vector<DocID> &&Documents)
- : Documents(std::move(Documents)) {}
+ explicit PostingList(llvm::ArrayRef<DocID> Documents);
+ /// Constructs DocumentIterator over given posting list. DocumentIterator will
+ /// go through the chunks and decompress them on-the-fly when necessary.
std::unique_ptr<Iterator> iterator() const;
- size_t bytes() const { return Documents.size() * sizeof(DocID); }
+ /// Returns in-memory size.
+ size_t bytes() const {
+ return sizeof(Chunk) + Chunks.capacity() * sizeof(Chunk);
+ }
private:
- const std::vector<DocID> Documents;
+ const std::vector<Chunk> Chunks;
};
} // namespace dex
Modified: clang-tools-extra/trunk/unittests/clangd/DexTests.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/unittests/clangd/DexTests.cpp?rev=342965&r1=342964&r2=342965&view=diff
==============================================================================
--- clang-tools-extra/trunk/unittests/clangd/DexTests.cpp (original)
+++ clang-tools-extra/trunk/unittests/clangd/DexTests.cpp Tue Sep 25 04:54:51 2018
@@ -69,19 +69,6 @@ TEST(DexIterators, DocumentIterator) {
EXPECT_TRUE(DocIterator->reachedEnd());
}
-TEST(DexIterators, AndWithEmpty) {
- const PostingList L0({});
- const PostingList L1({0, 5, 7, 10, 42, 320, 9000});
-
- auto AndEmpty = createAnd(L0.iterator());
- EXPECT_TRUE(AndEmpty->reachedEnd());
-
- auto AndWithEmpty = createAnd(L0.iterator(), L1.iterator());
- EXPECT_TRUE(AndWithEmpty->reachedEnd());
-
- EXPECT_THAT(consumeIDs(*AndWithEmpty), ElementsAre());
-}
-
TEST(DexIterators, AndTwoLists) {
const PostingList L0({0, 5, 7, 10, 42, 320, 9000});
const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000});
@@ -120,20 +107,6 @@ TEST(DexIterators, AndThreeLists) {
EXPECT_TRUE(And->reachedEnd());
}
-TEST(DexIterators, OrWithEmpty) {
- const PostingList L0({});
- const PostingList L1({0, 5, 7, 10, 42, 320, 9000});
-
- auto OrEmpty = createOr(L0.iterator());
- EXPECT_TRUE(OrEmpty->reachedEnd());
-
- auto OrWithEmpty = createOr(L0.iterator(), L1.iterator());
- EXPECT_FALSE(OrWithEmpty->reachedEnd());
-
- EXPECT_THAT(consumeIDs(*OrWithEmpty),
- ElementsAre(0U, 5U, 7U, 10U, 42U, 320U, 9000U));
-}
-
TEST(DexIterators, OrTwoLists) {
const PostingList L0({0, 5, 7, 10, 42, 320, 9000});
const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000});
@@ -211,29 +184,27 @@ TEST(DexIterators, QueryTree) {
// |And Iterator: 1, 5, 9| |Or Iterator: 0, 1, 3, 5|
// +----------+----------+ +----------+------------+
// | |
- // +------+-----+ +---------------------+
- // | | | | |
- // +-------v-----+ +----+---+ +--v--+ +---v----+ +----v---+
- // |1, 3, 5, 8, 9| |Boost: 2| |Empty| |Boost: 3| |Boost: 4|
- // +-------------+ +----+---+ +-----+ +---+----+ +----+---+
- // | | |
- // +----v-----+ +-v--+ +---v---+
- // |1, 5, 7, 9| |1, 5| |0, 3, 5|
- // +----------+ +----+ +-------+
+ // +------+-----+ ------------+
+ // | | | |
+ // +-------v-----+ +----+---+ +---v----+ +----v---+
+ // |1, 3, 5, 8, 9| |Boost: 2| |Boost: 3| |Boost: 4|
+ // +-------------+ +----+---+ +---+----+ +----+---+
+ // | | |
+ // +----v-----+ +-v--+ +---v---+
+ // |1, 5, 7, 9| |1, 5| |0, 3, 5|
+ // +----------+ +----+ +-------+
//
const PostingList L0({1, 3, 5, 8, 9});
const PostingList L1({1, 5, 7, 9});
- const PostingList L3({});
- const PostingList L4({1, 5});
- const PostingList L5({0, 3, 5});
+ const PostingList L2({1, 5});
+ const PostingList L3({0, 3, 5});
// Root of the query tree: [1, 5]
auto Root = createAnd(
// Lower And Iterator: [1, 5, 9]
createAnd(L0.iterator(), createBoost(L1.iterator(), 2U)),
// Lower Or Iterator: [0, 1, 5]
- createOr(L3.iterator(), createBoost(L4.iterator(), 3U),
- createBoost(L5.iterator(), 4U)));
+ createOr(createBoost(L2.iterator(), 3U), createBoost(L3.iterator(), 4U)));
EXPECT_FALSE(Root->reachedEnd());
EXPECT_EQ(Root->peek(), 1U);
@@ -260,15 +231,13 @@ TEST(DexIterators, StringRepresentation)
const PostingList L2({1, 5, 7, 9});
const PostingList L3({0, 5});
const PostingList L4({0, 1, 5});
- const PostingList L5({});
-
- EXPECT_EQ(llvm::to_string(*(L0.iterator())), "[4]");
-
- auto Nested =
- createAnd(createAnd(L1.iterator(), L2.iterator()),
- createOr(L3.iterator(), L4.iterator(), L5.iterator()));
- EXPECT_EQ(llvm::to_string(*Nested), "(& (| [5] [1] [END]) (& [1] [1]))");
+ EXPECT_EQ(llvm::to_string(*(L0.iterator())), "[4 ...]");
+ auto It = L0.iterator();
+ It->advanceTo(19);
+ EXPECT_EQ(llvm::to_string(*It), "[... 20 ...]");
+ It->advanceTo(9000);
+ EXPECT_EQ(llvm::to_string(*It), "[... END]");
}
TEST(DexIterators, Limit) {
More information about the cfe-commits
mailing list