[clang-tools-extra] 94b2ca1 - [pseudo] GC GSS nodes, reuse them with a freelist
Sam McCall via cfe-commits
cfe-commits at lists.llvm.org
Wed Jun 8 14:44:47 PDT 2022
Author: Sam McCall
Date: 2022-06-08T23:39:59+02:00
New Revision: 94b2ca18c10bfb4107a850d63148fc51e65d9512
URL: https://github.com/llvm/llvm-project/commit/94b2ca18c10bfb4107a850d63148fc51e65d9512
DIFF: https://github.com/llvm/llvm-project/commit/94b2ca18c10bfb4107a850d63148fc51e65d9512.diff
LOG: [pseudo] GC GSS nodes, reuse them with a freelist
Most GSS nodes have short effective lifetimes, keeping them around until the
end of the parse is wasteful. Mark and sweep them every 20 tokens.
When parsing clangd/AST.cpp, this reduces the GSS memory from 1MB to 20kB.
We pay ~5% performance for this according to the glrParse benchmark.
(Parsing more tokens between GCs doesn't seem to improve this further).
Compared to the refcounting approach in https://reviews.llvm.org/D126337, this
is simpler (at least the complexity is better isolated) and has >2x less
overhead. It doesn't provide death handlers (for error-handling) but we have
an alternative solution in mind.
Differential Revision: https://reviews.llvm.org/D126723
Added:
Modified:
clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
clang-tools-extra/pseudo/lib/GLR.cpp
clang-tools-extra/pseudo/tool/ClangPseudo.cpp
clang-tools-extra/pseudo/unittests/GLRTest.cpp
Removed:
################################################################################
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
index 70d6e46e5aed3..fc5ebeda1f279 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
@@ -68,6 +68,8 @@ struct GSS {
struct alignas(struct Node *) Node {
// LR state describing how parsing should continue from this head.
LRTable::StateID State;
+ // Used internally to track reachability during garbage collection.
+ bool GCParity;
// Number of the parents of this node.
// The parents hold previous parsed symbols, and may resume control after
// this node is reduced.
@@ -77,10 +79,6 @@ struct GSS {
// (In the literature, the node is attached to the *edge* to the parent).
const ForestNode *Payload = nullptr;
- // FIXME: Most nodes live a fairly short time, and are simply discarded.
- // Is it worth refcounting them (we have empty padding) and returning to a
- // freelist, to keep the working set small?
-
llvm::ArrayRef<const Node *> parents() const {
return llvm::makeArrayRef(reinterpret_cast<const Node *const *>(this + 1),
ParentCount);
@@ -90,23 +88,26 @@ struct GSS {
// Allocates a new node in the graph.
const Node *addNode(LRTable::StateID State, const ForestNode *Symbol,
- llvm::ArrayRef<const Node *> Parents) {
- ++NodeCount;
- Node *Result = new (Arena.Allocate(
- sizeof(Node) + Parents.size() * sizeof(Node *), alignof(Node)))
- Node({State, static_cast<unsigned>(Parents.size())});
- Result->Payload = Symbol;
- if (!Parents.empty())
- llvm::copy(Parents, reinterpret_cast<const Node **>(Result + 1));
- return Result;
- }
+ llvm::ArrayRef<const Node *> Parents);
+ // Frees all nodes not reachable as ancestors of Roots, and returns the count.
+ // Calling this periodically prevents steady memory growth of the GSS.
+ unsigned gc(std::vector<const Node *> &&Roots);
size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); }
- size_t nodeCount() const { return NodeCount; }
+ size_t nodesCreated() const { return NodesCreated; }
private:
+ // Nodes are recycled using freelists.
+ // They are variable size, so use one free-list per distinct #parents.
+ std::vector<std::vector<Node *>> FreeList;
+ Node *allocate(unsigned Parents);
+ void destroy(Node *N);
+ // The list of nodes created and not destroyed - our candidates for gc().
+ std::vector<Node *> Alive;
+ bool GCParity = false; // All nodes should match this, except during GC.
+
llvm::BumpPtrAllocator Arena;
- unsigned NodeCount = 0;
+ unsigned NodesCreated = 0;
};
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const GSS::Node &);
diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp
index 10fa0d0568c02..1031e3dd2007c 100644
--- a/clang-tools-extra/pseudo/lib/GLR.cpp
+++ b/clang-tools-extra/pseudo/lib/GLR.cpp
@@ -12,6 +12,7 @@
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -65,6 +66,18 @@ const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params,
std::vector<const GSS::Node *> NewHeads = {
GSS.addNode(/*State=*/Params.Table.getStartState(StartSymbol),
/*ForestNode=*/nullptr, {})};
+ auto MaybeGC = [&, Roots(std::vector<const GSS::Node *>{}), I(0u)]() mutable {
+ assert(PendingShift.empty() && PendingReduce.empty() &&
+ PendingAccept.empty() && "Running GC at the wrong time!");
+
+ if (++I != 20) // Run periodically to balance CPU and memory usage.
+ return;
+ I = 0;
+
+ // We need to copy the list: Roots is consumed by the GC.
+ Roots = NewHeads;
+ GSS.gc(std::move(Roots));
+ };
for (const ForestNode &Terminal : Terminals) {
LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next token {0} (id={1})\n",
G.symbolName(Terminal.symbol()),
@@ -80,6 +93,7 @@ const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params,
glrShift(PendingShift, Terminal, Params,
[&](const GSS::Node *NewHead) { NewHeads.push_back(NewHead); });
+ MaybeGC();
}
LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next is eof\n"));
for (const auto *Heads : NewHeads)
@@ -373,5 +387,72 @@ void glrReduce(std::vector<ParseStep> &PendingReduce, const ParseParams &Params,
assert(Sequences.empty());
}
+const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,
+ llvm::ArrayRef<const Node *> Parents) {
+ Node *Result = new (allocate(Parents.size()))
+ Node({State, GCParity, static_cast<unsigned>(Parents.size())});
+ Alive.push_back(Result);
+ ++NodesCreated;
+ Result->Payload = Symbol;
+ if (!Parents.empty())
+ llvm::copy(Parents, reinterpret_cast<const Node **>(Result + 1));
+ return Result;
+}
+
+GSS::Node *GSS::allocate(unsigned Parents) {
+ if (FreeList.size() <= Parents)
+ FreeList.resize(Parents + 1);
+ auto &SizedList = FreeList[Parents];
+ if (!SizedList.empty()) {
+ auto *Result = SizedList.back();
+ SizedList.pop_back();
+ return Result;
+ }
+ return static_cast<Node *>(
+ Arena.Allocate(sizeof(Node) + Parents * sizeof(Node *), alignof(Node)));
+}
+
+void GSS::destroy(Node *N) {
+ unsigned ParentCount = N->ParentCount;
+ N->~Node();
+ assert(FreeList.size() > ParentCount && "established on construction!");
+ FreeList[ParentCount].push_back(N);
+}
+
+unsigned GSS::gc(std::vector<const Node *> &&Queue) {
+#ifndef NDEBUG
+ auto ParityMatches = [&](const Node *N) { return N->GCParity == GCParity; };
+ assert("Before GC" && llvm::all_of(Alive, ParityMatches));
+ auto Deferred = llvm::make_scope_exit(
+ [&] { assert("After GC" && llvm::all_of(Alive, ParityMatches)); });
+ assert(llvm::all_of(
+ Queue, [&](const Node *R) { return llvm::is_contained(Alive, R); }));
+#endif
+ unsigned InitialCount = Alive.size();
+
+ // Mark
+ GCParity = !GCParity;
+ while (!Queue.empty()) {
+ Node *N = const_cast<Node *>(Queue.back()); // Safe: we created these nodes.
+ Queue.pop_back();
+ if (N->GCParity != GCParity) { // Not seen yet
+ N->GCParity = GCParity; // Mark as seen
+ for (const Node *P : N->parents()) // And walk parents
+ Queue.push_back(P);
+ }
+ }
+ // Sweep
+ llvm::erase_if(Alive, [&](Node *N) {
+ if (N->GCParity == GCParity) // Walk reached this node.
+ return false;
+ destroy(N);
+ return true;
+ });
+
+ LLVM_DEBUG(llvm::dbgs() << "GC pruned " << (InitialCount - Alive.size())
+ << "/" << InitialCount << " GSS nodes\n");
+ return InitialCount - Alive.size();
+}
+
} // namespace pseudo
} // namespace clang
diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
index cbb45504f40c8..73fa18df47394 100644
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -134,7 +134,7 @@ int main(int argc, char *argv[]) {
llvm::outs() << "Forest bytes: " << Arena.bytes()
<< " nodes: " << Arena.nodeCount() << "\n";
llvm::outs() << "GSS bytes: " << GSS.bytes()
- << " nodes: " << GSS.nodeCount() << "\n";
+ << " nodes: " << GSS.nodesCreated() << "\n";
}
}
}
diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp
index f885094111a6c..ca12c53214ae1 100644
--- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -393,6 +393,28 @@ TEST_F(GLRTest, GLRReduceOrder) {
"[ 0, end) └─IDENTIFIER := tok[0]\n");
}
+TEST(GSSTest, GC) {
+ // ┌-A-┬-AB
+ // ├-B-┘
+ // Root-+-C
+ // ├-D
+ // └-E
+ GSS GSStack;
+ auto *Root = GSStack.addNode(0, nullptr, {});
+ auto *A = GSStack.addNode(0, nullptr, {Root});
+ auto *B = GSStack.addNode(0, nullptr, {Root});
+ auto *C = GSStack.addNode(0, nullptr, {Root});
+ auto *D = GSStack.addNode(0, nullptr, {Root});
+ auto *AB = GSStack.addNode(0, nullptr, {A, B});
+
+ EXPECT_EQ(1u, GSStack.gc({AB, C})) << "D is destroyed";
+ EXPECT_EQ(0u, GSStack.gc({AB, C})) << "D is already gone";
+ auto *E = GSStack.addNode(0, nullptr, {Root});
+ EXPECT_EQ(D, E) << "Storage of GCed node D is reused for E";
+ EXPECT_EQ(3u, GSStack.gc({A, E})) << "Destroys B, AB, C";
+ EXPECT_EQ(1u, GSStack.gc({E})) << "Destroys A";
+}
+
} // namespace
} // namespace pseudo
} // namespace clang
More information about the cfe-commits
mailing list