[clang-tools-extra] r340409 - [clangd] Implement BOOST iterator

Kirill Bobyrev via cfe-commits cfe-commits at lists.llvm.org
Wed Aug 22 06:44:15 PDT 2018


Author: omtcyfz
Date: Wed Aug 22 06:44:15 2018
New Revision: 340409

URL: http://llvm.org/viewvc/llvm-project?rev=340409&view=rev
Log:
[clangd] Implement BOOST iterator

This patch introduces BOOST iterator - a substantial block for efficient
and high-quality symbol retrieval. The concept of boosting allows
performing computationally inexpensive scoring on the query side so that
the final (expensive) scoring can only be applied on the items with the
highest preliminary score while eliminating the need to score too many
items.

Reviewed by: ilya-biryukov

Differential Revision: https://reviews.llvm.org/D50970

Modified:
    clang-tools-extra/trunk/clangd/index/dex/DexIndex.cpp
    clang-tools-extra/trunk/clangd/index/dex/Iterator.cpp
    clang-tools-extra/trunk/clangd/index/dex/Iterator.h
    clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp

Modified: clang-tools-extra/trunk/clangd/index/dex/DexIndex.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/index/dex/DexIndex.cpp?rev=340409&r1=340408&r2=340409&view=diff
==============================================================================
--- clang-tools-extra/trunk/clangd/index/dex/DexIndex.cpp (original)
+++ clang-tools-extra/trunk/clangd/index/dex/DexIndex.cpp Wed Aug 22 06:44:15 2018
@@ -125,11 +125,15 @@ bool DexIndex::fuzzyFind(
     // using 100x of the requested number might not be good in practice, e.g.
     // when the requested number of items is small.
     const unsigned ItemsToRetrieve = 100 * Req.MaxCandidateCount;
-    std::vector<DocID> SymbolDocIDs = consume(*QueryIterator, ItemsToRetrieve);
+    // FIXME(kbobyrev): Add boosting to the query and utilize retrieved
+    // boosting scores.
+    std::vector<std::pair<DocID, float>> SymbolDocIDs =
+        consume(*QueryIterator, ItemsToRetrieve);
 
     // Retrieve top Req.MaxCandidateCount items.
     std::priority_queue<std::pair<float, const Symbol *>> Top;
-    for (const auto &SymbolDocID : SymbolDocIDs) {
+    for (const auto &P : SymbolDocIDs) {
+      const DocID SymbolDocID = P.first;
       const auto *Sym = (*Symbols)[SymbolDocID];
       const llvm::Optional<float> Score = Filter.match(Sym->Name);
       if (!Score)
@@ -161,7 +165,6 @@ void DexIndex::lookup(const LookupReques
   }
 }
 
-
 void DexIndex::findOccurrences(
     const OccurrencesRequest &Req,
     llvm::function_ref<void(const SymbolOccurrence &)> Callback) const {

Modified: clang-tools-extra/trunk/clangd/index/dex/Iterator.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/index/dex/Iterator.cpp?rev=340409&r1=340408&r2=340409&view=diff
==============================================================================
--- clang-tools-extra/trunk/clangd/index/dex/Iterator.cpp (original)
+++ clang-tools-extra/trunk/clangd/index/dex/Iterator.cpp Wed Aug 22 06:44:15 2018
@@ -46,6 +46,8 @@ public:
     return *Index;
   }
 
+  float consume(DocID ID) override { return DEFAULT_BOOST_SCORE; }
+
 private:
   llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
     OS << '[';
@@ -103,6 +105,19 @@ public:
 
   DocID peek() const override { return Children.front()->peek(); }
 
+  // If not exhausted and points to the given item, consume() returns the
+  // product of Children->consume(ID). Otherwise, DEFAULT_BOOST_SCORE is
+  // returned.
+  float consume(DocID ID) override {
+    if (reachedEnd() || peek() != ID)
+      return DEFAULT_BOOST_SCORE;
+    return std::accumulate(
+        begin(Children), end(Children), DEFAULT_BOOST_SCORE,
+        [&](float Current, const std::unique_ptr<Iterator> &Child) {
+          return Current * Child->consume(ID);
+        });
+  }
+
 private:
   llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
     OS << "(& ";
@@ -209,6 +224,20 @@ public:
     return Result;
   }
 
+  // Returns the maximum boosting score among all Children when iterator is not
+  // exhausted and points to the given ID, DEFAULT_BOOST_SCORE otherwise.
+  float consume(DocID ID) override {
+    if (reachedEnd() || peek() != ID)
+      return DEFAULT_BOOST_SCORE;
+    return std::accumulate(
+        begin(Children), end(Children), DEFAULT_BOOST_SCORE,
+        [&](float Current, const std::unique_ptr<Iterator> &Child) {
+          return (!Child->reachedEnd() && Child->peek() == ID)
+                     ? std::max(Current, Child->consume(ID))
+                     : Current;
+        });
+  }
+
 private:
   llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
     OS << "(| ";
@@ -249,6 +278,8 @@ public:
     return Index;
   }
 
+  float consume(DocID) override { return DEFAULT_BOOST_SCORE; }
+
 private:
   llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
     OS << "(TRUE {" << Index << "} out of " << Size << ")";
@@ -260,13 +291,42 @@ private:
   DocID Size;
 };
 
+/// Boost iterator is a wrapper around its child which multiplies scores of
+/// each retrieved item by a given factor.
+class BoostIterator : public Iterator {
+public:
+  BoostIterator(std::unique_ptr<Iterator> Child, float Factor)
+      : Child(move(Child)), Factor(Factor) {}
+
+  bool reachedEnd() const override { return Child->reachedEnd(); }
+
+  void advance() override { Child->advance(); }
+
+  void advanceTo(DocID ID) override { Child->advanceTo(ID); }
+
+  DocID peek() const override { return Child->peek(); }
+
+  float consume(DocID ID) override { return Child->consume(ID) * Factor; }
+
+private:
+  llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
+    OS << "(BOOST " << Factor << ' ' << *Child << ')';
+    return OS;
+  }
+
+  std::unique_ptr<Iterator> Child;
+  float Factor;
+};
+
 } // end namespace
 
-std::vector<DocID> consume(Iterator &It, size_t Limit) {
-  std::vector<DocID> Result;
+std::vector<std::pair<DocID, float>> consume(Iterator &It, size_t Limit) {
+  std::vector<std::pair<DocID, float>> Result;
   for (size_t Retrieved = 0; !It.reachedEnd() && Retrieved < Limit;
-       It.advance(), ++Retrieved)
-    Result.push_back(It.peek());
+       It.advance(), ++Retrieved) {
+    DocID Document = It.peek();
+    Result.push_back(std::make_pair(Document, It.consume(Document)));
+  }
   return Result;
 }
 
@@ -288,6 +348,11 @@ std::unique_ptr<Iterator> createTrue(Doc
   return llvm::make_unique<TrueIterator>(Size);
 }
 
+std::unique_ptr<Iterator> createBoost(std::unique_ptr<Iterator> Child,
+                                      float Factor) {
+  return llvm::make_unique<BoostIterator>(move(Child), Factor);
+}
+
 } // namespace dex
 } // namespace clangd
 } // namespace clang

Modified: clang-tools-extra/trunk/clangd/index/dex/Iterator.h
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/index/dex/Iterator.h?rev=340409&r1=340408&r2=340409&view=diff
==============================================================================
--- clang-tools-extra/trunk/clangd/index/dex/Iterator.h (original)
+++ clang-tools-extra/trunk/clangd/index/dex/Iterator.h Wed Aug 22 06:44:15 2018
@@ -66,11 +66,9 @@ using PostingListRef = llvm::ArrayRef<Do
 /// (their children) to form a multi-level Query Tree. The interface is designed
 /// to be extensible in order to support multiple types of iterators.
 class Iterator {
-  // FIXME(kbobyrev): Provide callback for matched documents.
-  // FIXME(kbobyrev): Implement new types of iterators: Label, Boost (with
-  // scoring), Limit.
   // FIXME(kbobyrev): Implement iterator cost, an estimate of advance() calls
   // before iterator exhaustion.
+  // FIXME(kbobyrev): Implement Limit iterator.
 public:
   /// Returns true if all valid DocIDs were processed and hence the iterator is
   /// exhausted.
@@ -89,6 +87,13 @@ public:
   ///
   /// Note: reachedEnd() must be false.
   virtual DocID peek() const = 0;
+  /// Retrieves boosting score. Query tree root should pass Root->peek() to this
+  /// function, the parameter is needed to propagate through the tree. Given ID
+  /// should be compared against BOOST iterator peek()s: some of the iterators
+  /// would not point to the item which was propagated to the top of the query
+  /// tree (e.g. if these iterators are branches of OR iterator) and hence
+  /// shouldn't apply any boosting to the consumed item.
+  virtual float consume(DocID ID) = 0;
 
   virtual ~Iterator() {}
 
@@ -107,32 +112,59 @@ public:
     return Iterator.dump(OS);
   }
 
+  constexpr static float DEFAULT_BOOST_SCORE = 1;
+
 private:
   virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0;
 };
 
 /// Advances the iterator until it is either exhausted or the number of
-/// requested items is reached. The result contains sorted DocumentIDs.
-std::vector<DocID> consume(Iterator &It,
-                           size_t Limit = std::numeric_limits<size_t>::max());
+/// requested items is reached. Returns pairs of document IDs with the
+/// corresponding boosting score.
+///
+/// Boosting can be seen as a compromise between retrieving too many items and
+/// calculating finals score for each of them (which might be very expensive)
+/// and not retrieving enough items so that items with very high final score
+/// would not be processed. Boosting score is a computationally efficient way
+/// to acquire preliminary scores of requested items.
+std::vector<std::pair<DocID, float>>
+consume(Iterator &It, size_t Limit = std::numeric_limits<size_t>::max());
 
 /// Returns a document iterator over given PostingList.
+///
+/// DocumentIterator returns DEFAULT_BOOST_SCORE for each processed item.
 std::unique_ptr<Iterator> create(PostingListRef Documents);
 
 /// Returns AND Iterator which performs the intersection of the PostingLists of
 /// its children.
+///
+/// consume(): AND Iterator returns the product of Childrens' boosting scores
+/// when not exhausted and DEFAULT_BOOST_SCORE otherwise.
 std::unique_ptr<Iterator>
 createAnd(std::vector<std::unique_ptr<Iterator>> Children);
 
 /// Returns OR Iterator which performs the union of the PostingLists of its
 /// children.
+///
+/// consume(): OR Iterator returns the highest boost value among children
+/// pointing to requested item when not exhausted and DEFAULT_BOOST_SCORE
+/// otherwise.
 std::unique_ptr<Iterator>
 createOr(std::vector<std::unique_ptr<Iterator>> Children);
 
 /// Returns TRUE Iterator which iterates over "virtual" PostingList containing
 /// all items in range [0, Size) in an efficient manner.
+///
+/// TRUE returns DEFAULT_BOOST_SCORE for each processed item.
 std::unique_ptr<Iterator> createTrue(DocID Size);
 
+/// Returns BOOST iterator which multiplies the score of each item by given
+/// factor. Boosting can be used as a computationally inexpensive filtering.
+/// Users can return significantly more items using consumeAndBoost() and then
+/// trim Top K using retrieval score.
+std::unique_ptr<Iterator> createBoost(std::unique_ptr<Iterator> Child,
+                                      float Factor);
+
 /// This allows createAnd(create(...), create(...)) syntax.
 template <typename... Args> std::unique_ptr<Iterator> createAnd(Args... args) {
   std::vector<std::unique_ptr<Iterator>> Children;

Modified: clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp?rev=340409&r1=340408&r2=340409&view=diff
==============================================================================
--- clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp (original)
+++ clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp Wed Aug 22 06:44:15 2018
@@ -29,6 +29,15 @@ namespace clangd {
 namespace dex {
 namespace {
 
+std::vector<DocID>
+consumeIDs(Iterator &It, size_t Limit = std::numeric_limits<size_t>::max()) {
+  auto IDAndScore = consume(It, Limit);
+  std::vector<DocID> IDs(IDAndScore.size());
+  for (size_t I = 0; I < IDAndScore.size(); ++I)
+    IDs[I] = IDAndScore[I].first;
+  return IDs;
+}
+
 TEST(DexIndexIterators, DocumentIterator) {
   const PostingList L = {4, 7, 8, 20, 42, 100};
   auto DocIterator = create(L);
@@ -62,7 +71,7 @@ TEST(DexIndexIterators, AndWithEmpty) {
   auto AndWithEmpty = createAnd(create(L0), create(L1));
   EXPECT_TRUE(AndWithEmpty->reachedEnd());
 
-  EXPECT_THAT(consume(*AndWithEmpty), ElementsAre());
+  EXPECT_THAT(consumeIDs(*AndWithEmpty), ElementsAre());
 }
 
 TEST(DexIndexIterators, AndTwoLists) {
@@ -72,7 +81,7 @@ TEST(DexIndexIterators, AndTwoLists) {
   auto And = createAnd(create(L1), create(L0));
 
   EXPECT_FALSE(And->reachedEnd());
-  EXPECT_THAT(consume(*And), ElementsAre(0U, 7U, 10U, 320U, 9000U));
+  EXPECT_THAT(consumeIDs(*And), ElementsAre(0U, 7U, 10U, 320U, 9000U));
 
   And = createAnd(create(L0), create(L1));
 
@@ -113,7 +122,7 @@ TEST(DexIndexIterators, OrWithEmpty) {
   auto OrWithEmpty = createOr(create(L0), create(L1));
   EXPECT_FALSE(OrWithEmpty->reachedEnd());
 
-  EXPECT_THAT(consume(*OrWithEmpty),
+  EXPECT_THAT(consumeIDs(*OrWithEmpty),
               ElementsAre(0U, 5U, 7U, 10U, 42U, 320U, 9000U));
 }
 
@@ -146,7 +155,7 @@ TEST(DexIndexIterators, OrTwoLists) {
 
   Or = createOr(create(L0), create(L1));
 
-  EXPECT_THAT(consume(*Or),
+  EXPECT_THAT(consumeIDs(*Or),
               ElementsAre(0U, 4U, 5U, 7U, 10U, 30U, 42U, 60U, 320U, 9000U));
 }
 
@@ -178,41 +187,45 @@ TEST(DexIndexIterators, OrThreeLists) {
 // FIXME(kbobyrev): The testcase below is similar to what is expected in real
 // queries. It should be updated once new iterators (such as boosting, limiting,
 // etc iterators) appear. However, it is not exhaustive and it would be
-// beneficial to implement automatic generation of query trees for more
-// comprehensive testing.
+// beneficial to implement automatic generation (e.g. fuzzing) of query trees
+// for more comprehensive testing.
 TEST(DexIndexIterators, QueryTree) {
-  // An example of more complicated query
   //
   //                      +-----------------+
   //                      |And Iterator:1, 5|
   //                      +--------+--------+
   //                               |
   //                               |
-  //                 +------------------------------------+
+  //                 +-------------+----------------------+
   //                 |                                    |
   //                 |                                    |
-  //      +----------v----------+              +----------v---------+
-  //      |And Iterator: 1, 5, 9|              |Or Iterator: 0, 1, 5|
-  //      +----------+----------+              +----------+---------+
+  //      +----------v----------+              +----------v------------+
+  //      |And Iterator: 1, 5, 9|              |Or Iterator: 0, 1, 3, 5|
+  //      +----------+----------+              +----------+------------+
   //                 |                                    |
-  //          +------+-----+                    +---------+-----------+
+  //          +------+-----+                    +---------------------+
   //          |            |                    |         |           |
-  //  +-------v-----+ +----v-----+           +--v--+    +-V--+    +---v---+
-  //  |1, 3, 5, 8, 9| |1, 5, 7, 9|           |Empty|    |0, 5|    |0, 1, 5|
-  //  +-------------+ +----------+           +-----+    +----+    +-------+
-
+  //  +-------v-----+ +----+---+             +--v--+  +---v----+ +----v---+
+  //  |1, 3, 5, 8, 9| |Boost: 2|             |Empty|  |Boost: 3| |Boost: 4|
+  //  +-------------+ +----+---+             +-----+  +---+----+ +----+---+
+  //                       |                              |           |
+  //                  +----v-----+                      +-v--+    +---v---+
+  //                  |1, 5, 7, 9|                      |1, 5|    |0, 3, 5|
+  //                  +----------+                      +----+    +-------+
+  //
   const PostingList L0 = {1, 3, 5, 8, 9};
   const PostingList L1 = {1, 5, 7, 9};
-  const PostingList L2 = {0, 5};
-  const PostingList L3 = {0, 1, 5};
-  const PostingList L4;
+  const PostingList L3;
+  const PostingList L4 = {1, 5};
+  const PostingList L5 = {0, 3, 5};
 
   // Root of the query tree: [1, 5]
   auto Root = createAnd(
       // Lower And Iterator: [1, 5, 9]
-      createAnd(create(L0), create(L1)),
+      createAnd(create(L0), createBoost(create(L1), 2U)),
       // Lower Or Iterator: [0, 1, 5]
-      createOr(create(L2), create(L3), create(L4)));
+      createOr(create(L3), createBoost(create(L4), 3U),
+               createBoost(create(L5), 4U)));
 
   EXPECT_FALSE(Root->reachedEnd());
   EXPECT_EQ(Root->peek(), 1U);
@@ -221,10 +234,14 @@ TEST(DexIndexIterators, QueryTree) {
   Root->advanceTo(1);
   Root->advanceTo(0);
   EXPECT_EQ(Root->peek(), 1U);
+  auto ElementBoost = Root->consume(Root->peek());
+  EXPECT_THAT(ElementBoost, 6);
   Root->advance();
   EXPECT_EQ(Root->peek(), 5U);
   Root->advanceTo(5);
   EXPECT_EQ(Root->peek(), 5U);
+  ElementBoost = Root->consume(Root->peek());
+  EXPECT_THAT(ElementBoost, 8);
   Root->advanceTo(9000);
   EXPECT_TRUE(Root->reachedEnd());
 }
@@ -256,29 +273,57 @@ TEST(DexIndexIterators, Limit) {
   const PostingList L5;
 
   auto DocIterator = create(L0);
-  EXPECT_THAT(consume(*DocIterator, 42), ElementsAre(4, 7, 8, 20, 42, 100));
+  EXPECT_THAT(consumeIDs(*DocIterator, 42), ElementsAre(4, 7, 8, 20, 42, 100));
 
   DocIterator = create(L0);
-  EXPECT_THAT(consume(*DocIterator), ElementsAre(4, 7, 8, 20, 42, 100));
+  EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(4, 7, 8, 20, 42, 100));
 
   DocIterator = create(L0);
-  EXPECT_THAT(consume(*DocIterator, 3), ElementsAre(4, 7, 8));
+  EXPECT_THAT(consumeIDs(*DocIterator, 3), ElementsAre(4, 7, 8));
 
   DocIterator = create(L0);
-  EXPECT_THAT(consume(*DocIterator, 0), ElementsAre());
+  EXPECT_THAT(consumeIDs(*DocIterator, 0), ElementsAre());
 }
 
 TEST(DexIndexIterators, True) {
   auto TrueIterator = createTrue(0U);
   EXPECT_TRUE(TrueIterator->reachedEnd());
-  EXPECT_THAT(consume(*TrueIterator), ElementsAre());
+  EXPECT_THAT(consumeIDs(*TrueIterator), ElementsAre());
 
   PostingList L0 = {1, 2, 5, 7};
   TrueIterator = createTrue(7U);
   EXPECT_THAT(TrueIterator->peek(), 0);
   auto AndIterator = createAnd(create(L0), move(TrueIterator));
   EXPECT_FALSE(AndIterator->reachedEnd());
-  EXPECT_THAT(consume(*AndIterator), ElementsAre(1, 2, 5));
+  EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(1, 2, 5));
+}
+
+TEST(DexIndexIterators, Boost) {
+  auto BoostIterator = createBoost(createTrue(5U), 42U);
+  EXPECT_FALSE(BoostIterator->reachedEnd());
+  auto ElementBoost = BoostIterator->consume(BoostIterator->peek());
+  EXPECT_THAT(ElementBoost, 42U);
+
+  const PostingList L0 = {2, 4};
+  const PostingList L1 = {1, 4};
+  auto Root = createOr(createTrue(5U), createBoost(create(L0), 2U),
+                       createBoost(create(L1), 3U));
+
+  ElementBoost = Root->consume(Root->peek());
+  EXPECT_THAT(ElementBoost, Iterator::DEFAULT_BOOST_SCORE);
+  Root->advance();
+  EXPECT_THAT(Root->peek(), 1U);
+  ElementBoost = Root->consume(Root->peek());
+  EXPECT_THAT(ElementBoost, 3);
+
+  Root->advance();
+  EXPECT_THAT(Root->peek(), 2U);
+  ElementBoost = Root->consume(Root->peek());
+  EXPECT_THAT(ElementBoost, 2);
+
+  Root->advanceTo(4);
+  ElementBoost = Root->consume(Root->peek());
+  EXPECT_THAT(ElementBoost, 3);
 }
 
 testing::Matcher<std::vector<Token>>




More information about the cfe-commits mailing list