[clang-tools-extra] r342965 - [clangd] Implement VByte PostingList compression

Tue Sep 25 04:54:51 PDT 2018

Author: omtcyfz
Date: Tue Sep 25 04:54:51 2018
New Revision: 342965

URL: http://llvm.org/viewvc/llvm-project?rev=342965&view=rev
Log:
[clangd] Implement VByte PostingList compression

This patch implements Variable-length Byte compression of `PostingList`s
to sacrifice some performance for lower memory consumption.

`PostingList` compression and decompression was extensively tested using
fuzzer for multiple hours and runnning significant number of realistic
`FuzzyFindRequests`. AddressSanitizer and UndefinedBehaviorSanitizer
were used to ensure the correct behaviour.

Performance evaluation was conducted with recent LLVM symbol index (292k
symbols) and the collection of user-recorded queries (7751
`FuzzyFindRequest` JSON dumps):

| Metrics | Before| After | Change (%)
| -----  | -----  | -----   | -----
| Memory consumption (posting lists only), MB  |  54.4 | 23.5 | -60%
| Time to process queries, sec | 7.70 | 9.4 | +25%

Reviewers: sammccall, ioeric

Reviewed By: sammccall

Differential Revision: https://reviews.llvm.org/D52300

Modified:
    clang-tools-extra/trunk/clangd/index/dex/Dex.cpp
    clang-tools-extra/trunk/clangd/index/dex/PostingList.cpp
    clang-tools-extra/trunk/clangd/index/dex/PostingList.h
    clang-tools-extra/trunk/unittests/clangd/DexTests.cpp

Modified: clang-tools-extra/trunk/clangd/index/dex/Dex.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/index/dex/Dex.cpp?rev=342965&r1=342964&r2=342965&view=diff
==============================================================================

--- clang-tools-extra/trunk/clangd/index/dex/Dex.cpp (original)
+++ clang-tools-extra/trunk/clangd/index/dex/Dex.cpp Tue Sep 25 04:54:51 2018
@@ -128,8 +128,8 @@ void Dex::buildIndex() {
 
   // Convert lists of items to posting lists.
   for (const auto &TokenToPostingList : TempInvertedIndex)
-    InvertedIndex.insert({TokenToPostingList.first,
-                          PostingList(move(TokenToPostingList.second))});
+    InvertedIndex.insert(
+        {TokenToPostingList.first, PostingList(TokenToPostingList.second)});
 
   vlog("Built Dex with estimated memory usage {0} bytes.",
        estimateMemoryUsage());

Modified: clang-tools-extra/trunk/clangd/index/dex/PostingList.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/index/dex/PostingList.cpp?rev=342965&r1=342964&r2=342965&view=diff
==============================================================================
--- clang-tools-extra/trunk/clangd/index/dex/PostingList.cpp (original)
+++ clang-tools-extra/trunk/clangd/index/dex/PostingList.cpp Tue Sep 25 04:54:51 2018
@@ -9,6 +9,8 @@
 
 #include "PostingList.h"
 #include "Iterator.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MathExtras.h"
 
 namespace clang {
 namespace clangd {
@@ -16,21 +18,27 @@ namespace dex {
 
 namespace {
 
-/// Implements Iterator over std::vector<DocID>. This is the most basic
-/// iterator and is simply a wrapper around
-/// std::vector<DocID>::const_iterator.
-class PlainIterator : public Iterator {
+/// Implements iterator of PostingList chunks. This requires iterating over two
+/// levels: the first level iterator iterates over the chunks and decompresses
+/// them on-the-fly when the contents of chunk are to be seen.
+class ChunkIterator : public Iterator {
 public:
-  explicit PlainIterator(llvm::ArrayRef<DocID> Documents)
-      : Documents(Documents), Index(std::begin(Documents)) {}
+  explicit ChunkIterator(llvm::ArrayRef<Chunk> Chunks)
+      : Chunks(Chunks), CurrentChunk(Chunks.begin()) {
+    if (!Chunks.empty()) {
+      DecompressedChunk = CurrentChunk->decompress();
+      CurrentID = DecompressedChunk.begin();
+    }
+  }
 
-  bool reachedEnd() const override { return Index == std::end(Documents); }
+  bool reachedEnd() const override { return CurrentChunk == Chunks.end(); }
 
   /// Advances cursor to the next item.
   void advance() override {
     assert(!reachedEnd() &&
            "Posting List iterator can't advance() at the end.");
-    ++Index;
+    ++CurrentID;
+    normalizeCursor();
   }
 
   /// Applies binary search to advance cursor to the next item with DocID
@@ -38,16 +46,17 @@ public:
   void advanceTo(DocID ID) override {
     assert(!reachedEnd() &&
            "Posting List iterator can't advance() at the end.");
-    // If current ID is beyond requested one, iterator is already in the right
-    // state.
-    if (peek() < ID)
-      Index = std::lower_bound(Index, std::end(Documents), ID);
+    if (ID <= peek())
+      return;
+    advanceToChunk(ID);
+    // Try to find ID within current chunk.
+    CurrentID = std::lower_bound(CurrentID, std::end(DecompressedChunk), ID);
+    normalizeCursor();
   }
 
   DocID peek() const override {
-    assert(!reachedEnd() &&
-           "Posting List iterator can't peek() at the end.");
-    return *Index;
+    assert(!reachedEnd() && "Posting List iterator can't peek() at the end.");
+    return *CurrentID;
   }
 
   float consume() override {
@@ -56,27 +65,160 @@ public:
     return DEFAULT_BOOST_SCORE;
   }
 
-  size_t estimateSize() const override { return Documents.size(); }
+  size_t estimateSize() const override {
+    return Chunks.size() * ApproxEntriesPerChunk;
+  }
 
 private:
   llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
     OS << '[';
-    if (Index != std::end(Documents))
-      OS << *Index;
-    else
-      OS << "END";
+    if (CurrentChunk != Chunks.begin() ||
+        (CurrentID != DecompressedChunk.begin() && !DecompressedChunk.empty()))
+      OS << "... ";
+    OS << (reachedEnd() ? "END" : std::to_string(*CurrentID));
+    if (!reachedEnd() && CurrentID < DecompressedChunk.end() - 1)
+      OS << " ...";
     OS << ']';
     return OS;
   }
 
-  llvm::ArrayRef<DocID> Documents;
-  llvm::ArrayRef<DocID>::const_iterator Index;
+  /// If the cursor is at the end of a chunk, place it at the start of the next
+  /// chunk.
+  void normalizeCursor() {
+    // Invariant is already established if examined chunk is not exhausted.
+    if (CurrentID != std::end(DecompressedChunk))
+      return;
+    // Advance to next chunk if current one is exhausted.
+    ++CurrentChunk;
+    if (CurrentChunk == Chunks.end()) // Reached the end of PostingList.
+      return;
+    DecompressedChunk = CurrentChunk->decompress();
+    CurrentID = DecompressedChunk.begin();
+  }
+
+  /// Advances CurrentChunk to the chunk which might contain ID.
+  void advanceToChunk(DocID ID) {
+    if ((CurrentChunk != Chunks.end() - 1) &&
+        ((CurrentChunk + 1)->Head <= ID)) {
+      // Find the next chunk with Head >= ID.
+      CurrentChunk = std::lower_bound(
+          CurrentChunk + 1, Chunks.end(), ID,
+          [](const Chunk &C, const DocID ID) { return C.Head <= ID; });
+      --CurrentChunk;
+      DecompressedChunk = CurrentChunk->decompress();
+      CurrentID = DecompressedChunk.begin();
+    }
+  }
+
+  llvm::ArrayRef<Chunk> Chunks;
+  /// Iterator over chunks.
+  /// If CurrentChunk is valid, then DecompressedChunk is
+  /// CurrentChunk->decompress() and CurrentID is a valid (non-end) iterator
+  /// into it.
+  decltype(Chunks)::const_iterator CurrentChunk;
+  llvm::SmallVector<DocID, Chunk::PayloadSize + 1> DecompressedChunk;
+  /// Iterator over DecompressedChunk.
+  decltype(DecompressedChunk)::iterator CurrentID;
+
+  static constexpr size_t ApproxEntriesPerChunk = 15;
 };
 
+static constexpr size_t BitsPerEncodingByte = 7;
+
+/// Writes a variable length DocID into the buffer and updates the buffer size.
+/// If it doesn't fit, returns false and doesn't write to the buffer.
+bool encodeVByte(DocID Delta, llvm::MutableArrayRef<uint8_t> &Payload) {
+  assert(Delta != 0 && "0 is not a valid PostingList delta.");
+  // Calculate number of bytes Delta encoding would take by examining the
+  // meaningful bits.
+  unsigned Width = 1 + llvm::findLastSet(Delta) / BitsPerEncodingByte;
+  if (Width > Payload.size())
+    return false;
+
+  do {
+    uint8_t Encoding = Delta & 0x7f;
+    Delta >>= 7;
+    Payload.front() = Delta ? Encoding | 0x80 : Encoding;
+    Payload = Payload.drop_front();
+  } while (Delta != 0);
+  return true;
+}
+
+/// Use Variable-length Byte (VByte) delta encoding to compress sorted list of
+/// DocIDs. The compression stores deltas (differences) between subsequent
+/// DocIDs and encodes these deltas utilizing the least possible number of
+/// bytes.
+///
+/// Each encoding byte consists of two parts: the first bit (continuation bit)
+/// indicates whether this is the last byte (0 if this byte is the last) of
+/// current encoding and seven bytes a piece of DocID (payload). DocID contains
+/// 32 bits and therefore it takes up to 5 bytes to encode it (4 full 7-bit
+/// payloads and one 4-bit payload), but in practice it is expected that gaps
+/// (deltas) between subsequent DocIDs are not large enough to require 5 bytes.
+/// In very dense posting lists (with average gaps less than 128) this
+/// representation would be 4 times more efficient than raw DocID array.
+///
+/// PostingList encoding example:
+///
+/// DocIDs    42            47        7000
+/// gaps                    5         6958
+/// Encoding  (raw number)  00000101  10110110 00101110
+std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) {
+  assert(!Documents.empty() && "Can't encode empty sequence.");
+  std::vector<Chunk> Result;
+  Result.emplace_back();
+  DocID Last = Result.back().Head = Documents.front();
+  llvm::MutableArrayRef<uint8_t> RemainingPayload = Result.back().Payload;
+  for (DocID Doc : Documents.drop_front()) {
+    if (!encodeVByte(Doc - Last, RemainingPayload)) { // didn't fit, flush chunk
+      Result.emplace_back();
+      Result.back().Head = Doc;
+      RemainingPayload = Result.back().Payload;
+    }
+    Last = Doc;
+  }
+  return std::vector<Chunk>(Result); // no move, shrink-to-fit
+}
+
+/// Reads variable length DocID from the buffer and updates the buffer size. If
+/// the stream is terminated, return None.
+llvm::Optional<DocID> readVByte(llvm::ArrayRef<uint8_t> &Bytes) {
+  if (Bytes.front() == 0 || Bytes.empty())
+    return llvm::None;
+  DocID Result = 0;
+  bool HasNextByte = true;
+  for (size_t Length = 0; HasNextByte && !Bytes.empty(); ++Length) {
+    assert(Length <= 5 && "Malformed VByte encoding sequence.");
+    // Write meaningful bits to the correct place in the document decoding.
+    Result |= (Bytes.front() & 0x7f) << (BitsPerEncodingByte * Length);
+    if ((Bytes.front() & 0x80) == 0)
+      HasNextByte = false;
+    Bytes = Bytes.drop_front();
+  }
+  return Result;
+}
+
 } // namespace
 
+llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Chunk::decompress() const {
+  llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Result{Head};
+  llvm::ArrayRef<uint8_t> Bytes(Payload);
+  DocID Delta;
+  for (DocID Current = Head; !Bytes.empty(); Current += Delta) {
+    auto MaybeDelta = readVByte(Bytes);
+    if (!MaybeDelta)
+      break;
+    Delta = *MaybeDelta;
+    Result.push_back(Current + Delta);
+  }
+  return llvm::SmallVector<DocID, Chunk::PayloadSize + 1>{Result};
+}
+
+PostingList::PostingList(llvm::ArrayRef<DocID> Documents)
+    : Chunks(encodeStream(Documents)) {}
+
 std::unique_ptr<Iterator> PostingList::iterator() const {
-  return llvm::make_unique<PlainIterator>(Documents);
+  return llvm::make_unique<ChunkIterator>(Chunks);
 }
 
 } // namespace dex

Modified: clang-tools-extra/trunk/clangd/index/dex/PostingList.h
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/index/dex/PostingList.h?rev=342965&r1=342964&r2=342965&view=diff
==============================================================================
--- clang-tools-extra/trunk/clangd/index/dex/PostingList.h (original)
+++ clang-tools-extra/trunk/clangd/index/dex/PostingList.h Tue Sep 25 04:54:51 2018
@@ -6,13 +6,19 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This defines posting list interface: a storage for identifiers of symbols
-// which can be characterized by a specific feature (such as fuzzy-find trigram,
-// scope, type or any other Search Token). Posting lists can be traversed in
-// order using an iterator and are values for inverted index, which maps search
-// tokens to corresponding posting lists.
-//
+///
+/// \file
+/// This defines posting list interface: a storage for identifiers of symbols
+/// which can be characterized by a specific feature (such as fuzzy-find
+/// trigram, scope, type or any other Search Token). Posting lists can be
+/// traversed in order using an iterator and are values for inverted index,
+/// which maps search tokens to corresponding posting lists.
+///
+/// In order to decrease size of Index in-memory representation, Variable Byte
+/// Encoding (VByte) is used for PostingLists compression. An overview of VByte
+/// algorithm can be found in "Introduction to Information Retrieval" book:
+/// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html
+///
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
@@ -20,6 +26,7 @@
 
 #include "Iterator.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
 #include <cstdint>
 #include <vector>
 
@@ -29,20 +36,43 @@ namespace dex {
 
 class Iterator;
 
+/// NOTE: This is an implementation detail.
+///
+/// Chunk is a fixed-width piece of PostingList which contains the first DocID
+/// in uncompressed format (Head) and delta-encoded Payload. It can be
+/// decompressed upon request.
+struct Chunk {
+  /// Keep sizeof(Chunk) == 32.
+  static constexpr size_t PayloadSize = 32 - sizeof(DocID);
+
+  llvm::SmallVector<DocID, PayloadSize + 1> decompress() const;
+
+  /// The first element of decompressed Chunk.
+  DocID Head;
+  /// VByte-encoded deltas.
+  std::array<uint8_t, PayloadSize> Payload = std::array<uint8_t, PayloadSize>();
+};
+static_assert(sizeof(Chunk) == 32, "Chunk should take 32 bytes of memory.");
+
 /// PostingList is the storage of DocIDs which can be inserted to the Query
-/// Tree as a leaf by constructing Iterator over the PostingList object.
-// FIXME(kbobyrev): Use VByte algorithm to compress underlying data.
+/// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs
+/// are stored in underlying chunks. Compression saves memory at a small cost
+/// in access time, which is still fast enough in practice.
 class PostingList {
 public:
-  explicit PostingList(const std::vector<DocID> &&Documents)
-      : Documents(std::move(Documents)) {}
+  explicit PostingList(llvm::ArrayRef<DocID> Documents);
 
+  /// Constructs DocumentIterator over given posting list. DocumentIterator will
+  /// go through the chunks and decompress them on-the-fly when necessary.
   std::unique_ptr<Iterator> iterator() const;
 
-  size_t bytes() const { return Documents.size() * sizeof(DocID); }
+  /// Returns in-memory size.
+  size_t bytes() const {
+    return sizeof(Chunk) + Chunks.capacity() * sizeof(Chunk);
+  }
 
 private:
-  const std::vector<DocID> Documents;
+  const std::vector<Chunk> Chunks;
 };
 
 } // namespace dex

Modified: clang-tools-extra/trunk/unittests/clangd/DexTests.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/unittests/clangd/DexTests.cpp?rev=342965&r1=342964&r2=342965&view=diff
==============================================================================
--- clang-tools-extra/trunk/unittests/clangd/DexTests.cpp (original)
+++ clang-tools-extra/trunk/unittests/clangd/DexTests.cpp Tue Sep 25 04:54:51 2018
@@ -69,19 +69,6 @@ TEST(DexIterators, DocumentIterator) {
   EXPECT_TRUE(DocIterator->reachedEnd());
 }
 
-TEST(DexIterators, AndWithEmpty) {
-  const PostingList L0({});
-  const PostingList L1({0, 5, 7, 10, 42, 320, 9000});
-
-  auto AndEmpty = createAnd(L0.iterator());
-  EXPECT_TRUE(AndEmpty->reachedEnd());
-
-  auto AndWithEmpty = createAnd(L0.iterator(), L1.iterator());
-  EXPECT_TRUE(AndWithEmpty->reachedEnd());
-
-  EXPECT_THAT(consumeIDs(*AndWithEmpty), ElementsAre());
-}
-
 TEST(DexIterators, AndTwoLists) {
   const PostingList L0({0, 5, 7, 10, 42, 320, 9000});
   const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000});
@@ -120,20 +107,6 @@ TEST(DexIterators, AndThreeLists) {
   EXPECT_TRUE(And->reachedEnd());
 }
 
-TEST(DexIterators, OrWithEmpty) {
-  const PostingList L0({});
-  const PostingList L1({0, 5, 7, 10, 42, 320, 9000});
-
-  auto OrEmpty = createOr(L0.iterator());
-  EXPECT_TRUE(OrEmpty->reachedEnd());
-
-  auto OrWithEmpty = createOr(L0.iterator(), L1.iterator());
-  EXPECT_FALSE(OrWithEmpty->reachedEnd());
-
-  EXPECT_THAT(consumeIDs(*OrWithEmpty),
-              ElementsAre(0U, 5U, 7U, 10U, 42U, 320U, 9000U));
-}
-
 TEST(DexIterators, OrTwoLists) {
   const PostingList L0({0, 5, 7, 10, 42, 320, 9000});
   const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000});
@@ -211,29 +184,27 @@ TEST(DexIterators, QueryTree) {
   //      |And Iterator: 1, 5, 9|              |Or Iterator: 0, 1, 3, 5|
   //      +----------+----------+              +----------+------------+
   //                 |                                    |
-  //          +------+-----+                    +---------------------+
-  //          |            |                    |         |           |
-  //  +-------v-----+ +----+---+             +--v--+  +---v----+ +----v---+
-  //  |1, 3, 5, 8, 9| |Boost: 2|             |Empty|  |Boost: 3| |Boost: 4|
-  //  +-------------+ +----+---+             +-----+  +---+----+ +----+---+
-  //                       |                              |           |
-  //                  +----v-----+                      +-v--+    +---v---+
-  //                  |1, 5, 7, 9|                      |1, 5|    |0, 3, 5|
-  //                  +----------+                      +----+    +-------+
+  //          +------+-----+                        ------------+
+  //          |            |                        |           |
+  //  +-------v-----+ +----+---+                +---v----+ +----v---+
+  //  |1, 3, 5, 8, 9| |Boost: 2|                |Boost: 3| |Boost: 4|
+  //  +-------------+ +----+---+                +---+----+ +----+---+
+  //                       |                        |           |
+  //                  +----v-----+                +-v--+    +---v---+
+  //                  |1, 5, 7, 9|                |1, 5|    |0, 3, 5|
+  //                  +----------+                +----+    +-------+
   //
   const PostingList L0({1, 3, 5, 8, 9});
   const PostingList L1({1, 5, 7, 9});
-  const PostingList L3({});
-  const PostingList L4({1, 5});
-  const PostingList L5({0, 3, 5});
+  const PostingList L2({1, 5});
+  const PostingList L3({0, 3, 5});
 
   // Root of the query tree: [1, 5]
   auto Root = createAnd(
       // Lower And Iterator: [1, 5, 9]
       createAnd(L0.iterator(), createBoost(L1.iterator(), 2U)),
       // Lower Or Iterator: [0, 1, 5]
-      createOr(L3.iterator(), createBoost(L4.iterator(), 3U),
-               createBoost(L5.iterator(), 4U)));
+      createOr(createBoost(L2.iterator(), 3U), createBoost(L3.iterator(), 4U)));
 
   EXPECT_FALSE(Root->reachedEnd());
   EXPECT_EQ(Root->peek(), 1U);
@@ -260,15 +231,13 @@ TEST(DexIterators, StringRepresentation)
   const PostingList L2({1, 5, 7, 9});
   const PostingList L3({0, 5});
   const PostingList L4({0, 1, 5});
-  const PostingList L5({});
-
-  EXPECT_EQ(llvm::to_string(*(L0.iterator())), "[4]");
-
-  auto Nested =
-      createAnd(createAnd(L1.iterator(), L2.iterator()),
-                createOr(L3.iterator(), L4.iterator(), L5.iterator()));
 
-  EXPECT_EQ(llvm::to_string(*Nested), "(& (| [5] [1] [END]) (& [1] [1]))");
+  EXPECT_EQ(llvm::to_string(*(L0.iterator())), "[4 ...]");
+  auto It = L0.iterator();
+  It->advanceTo(19);
+  EXPECT_EQ(llvm::to_string(*It), "[... 20 ...]");
+  It->advanceTo(9000);
+  EXPECT_EQ(llvm::to_string(*It), "[... END]");
 }
 
 TEST(DexIterators, Limit) {