[clang] [clang-format] Limit how much work guessLanguage() can do (PR #78925)

Sun Jan 21 18:49:51 PST 2024

https://github.com/HighCommander4 created https://github.com/llvm/llvm-project/pull/78925

guessLanguage() uses UnwrappedLineParser to process different preprocessor variants of a file. For large files with many preprocessor branches, the number of variants can be very large and the operation can hang for a long time and eventually OOM. (This has been observed particularly for single-header libraries such as miniaudio.h).

This patch implements a limit on how many variants guessLanguage() analyzes, to avoid such a performance cliff.

The limit is expressed as a maximum number of lines (summed over preprocessor variants) to process. This allows shorter files to have more variants processed than longer files.

Fixes https://github.com/clangd/clangd/issues/719
Fixes https://github.com/clangd/clangd/issues/1384
Fixes https://github.com/llvm/llvm-project/issues/70945

This patch does not fix the broader problem of actually trying to format such large headers, which involves using UnwrappedLineParser from call sites other than guessLanguage(), though the approach in the patch could be extended to other call sites as well.

>From 05c751db607874eddffc836d6d70da7418ee0589 Mon Sep 17 00:00:00 2001
From: Nathan Ridge <zeratul976 at hotmail.com>
Date: Sun, 21 Jan 2024 02:45:16 -0500
Subject: [PATCH] [clang-format] Limit how much work guessLanguage() can do

guessLanguage() uses UnwrappedLineParser to process different
preprocessor variants of a file. For large files with many
preprocessor branches, the number of variants can be very large
and the operation can hang for a long time and eventually OOM.
(This has been observed particularly for single-header libraries
such as miniaudio.h).

This patch implements a limit on how many variants guessLanguage()
analyzes, to avoid such a performance cliff.

The limit is expressed as a maximum number of lines (summed over
preprocessor variants) to process. This allows shorter files to
have more variants processed than longer files.

Fixes https://github.com/clangd/clangd/issues/719
Fixes https://github.com/clangd/clangd/issues/1384
Fixes https://github.com/llvm/llvm-project/issues/70945

This patch does not fix the broader problem of actually trying to
format such large headers, which involves using UnwrappedLineParser
from call sites other than guessLanguage(), though the approach
in the patch could be extended to other call sites as well.
---
 clang/lib/Format/Format.cpp              |  8 +++++++-
 clang/lib/Format/TokenAnalyzer.cpp       |  8 +++++---
 clang/lib/Format/TokenAnalyzer.h         |  4 +++-
 clang/lib/Format/UnwrappedLineParser.cpp | 16 ++++++++++++++--
 clang/lib/Format/UnwrappedLineParser.h   |  8 +++++++-
 5 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index f798d555bf9929c..fbafce38d52b775 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -2830,7 +2830,7 @@ class Cleaner : public TokenAnalyzer {
 class ObjCHeaderStyleGuesser : public TokenAnalyzer {
 public:
   ObjCHeaderStyleGuesser(const Environment &Env, const FormatStyle &Style)
-      : TokenAnalyzer(Env, Style), IsObjC(false) {}
+      : TokenAnalyzer(Env, Style, MaxLinesToProcess), IsObjC(false) {}
 
   std::pair<tooling::Replacements, unsigned>
   analyze(TokenAnnotator &Annotator,
@@ -2846,6 +2846,12 @@ class ObjCHeaderStyleGuesser : public TokenAnalyzer {
   bool isObjC() { return IsObjC; }
 
 private:
+  // Limit the number of variants of the file TokenAnalyzer processes for
+  // the purpose of guessing the language. An inaccurate guess is better than
+  // hanging for a long time or OOMing, which has been observed with real
+  // libraries which are single-header with many preprocessor branches.
+  static const unsigned MaxLinesToProcess = (1 << 20);
+
   static bool
   guessIsObjC(const SourceManager &SourceManager,
               const SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
diff --git a/clang/lib/Format/TokenAnalyzer.cpp b/clang/lib/Format/TokenAnalyzer.cpp
index bd648c430f9b0a4..4f1195d5f82b554 100644
--- a/clang/lib/Format/TokenAnalyzer.cpp
+++ b/clang/lib/Format/TokenAnalyzer.cpp
@@ -83,12 +83,14 @@ Environment::Environment(StringRef Code, StringRef FileName,
       ID(VirtualSM->get().getMainFileID()), FirstStartColumn(FirstStartColumn),
       NextStartColumn(NextStartColumn), LastStartColumn(LastStartColumn) {}
 
-TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style)
+TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style,
+                             unsigned MaxLinesToProcess)
     : Style(Style), Env(Env),
       AffectedRangeMgr(Env.getSourceManager(), Env.getCharRanges()),
       UnwrappedLines(1),
       Encoding(encoding::detectEncoding(
-          Env.getSourceManager().getBufferData(Env.getFileID()))) {
+          Env.getSourceManager().getBufferData(Env.getFileID()))),
+      MaxLinesToProcess(MaxLinesToProcess) {
   LLVM_DEBUG(
       llvm::dbgs() << "File encoding: "
                    << (Encoding == encoding::Encoding_UTF8 ? "UTF8" : "unknown")
@@ -109,7 +111,7 @@ TokenAnalyzer::process(bool SkipAnnotation) {
   SmallVector<FormatToken *, 10> Tokens(Toks.begin(), Toks.end());
   UnwrappedLineParser Parser(Env.getSourceManager(), Style, Lex.getKeywords(),
                              Env.getFirstStartColumn(), Tokens, *this,
-                             Allocator, IdentTable);
+                             Allocator, IdentTable, MaxLinesToProcess);
   Parser.parse();
   assert(UnwrappedLines.back().empty());
   unsigned Penalty = 0;
diff --git a/clang/lib/Format/TokenAnalyzer.h b/clang/lib/Format/TokenAnalyzer.h
index 4086dab1c94c3ad..65770d4be5ca280 100644
--- a/clang/lib/Format/TokenAnalyzer.h
+++ b/clang/lib/Format/TokenAnalyzer.h
@@ -87,7 +87,8 @@ class Environment {
 
 class TokenAnalyzer : public UnwrappedLineConsumer {
 public:
-  TokenAnalyzer(const Environment &Env, const FormatStyle &Style);
+  TokenAnalyzer(const Environment &Env, const FormatStyle &Style,
+                unsigned MaxLinesToProcess = 0);
 
   std::pair<tooling::Replacements, unsigned>
   process(bool SkipAnnotation = false);
@@ -109,6 +110,7 @@ class TokenAnalyzer : public UnwrappedLineConsumer {
   AffectedRangeManager AffectedRangeMgr;
   SmallVector<SmallVector<UnwrappedLine, 16>, 2> UnwrappedLines;
   encoding::Encoding Encoding;
+  unsigned MaxLinesToProcess;
 };
 
 } // end namespace format
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 684609747a5513f..d925e19f8bf3bcc 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -151,7 +151,7 @@ UnwrappedLineParser::UnwrappedLineParser(
     const AdditionalKeywords &Keywords, unsigned FirstStartColumn,
     ArrayRef<FormatToken *> Tokens, UnwrappedLineConsumer &Callback,
     llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
-    IdentifierTable &IdentTable)
+    IdentifierTable &IdentTable, unsigned MaxLinesToProcess)
     : Line(new UnwrappedLine), MustBreakBeforeNextToken(false),
       CurrentLines(&Lines), Style(Style), Keywords(Keywords),
       CommentPragmasRegex(Style.CommentPragmas), Tokens(nullptr),
@@ -160,7 +160,8 @@ UnwrappedLineParser::UnwrappedLineParser(
                        ? IG_Rejected
                        : IG_Inited),
       IncludeGuardToken(nullptr), FirstStartColumn(FirstStartColumn),
-      Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable) {}
+      Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable),
+      MaxLinesToProcess(MaxLinesToProcess) {}
 
 void UnwrappedLineParser::reset() {
   PPBranchLevel = -1;
@@ -194,6 +195,8 @@ void UnwrappedLineParser::reset() {
 void UnwrappedLineParser::parse() {
   IndexedTokenSource TokenSource(AllTokens);
   Line->FirstStartColumn = FirstStartColumn;
+  size_t TotalLinesProcessed = 0;
+  size_t LastReport = 0;
   do {
     LLVM_DEBUG(llvm::dbgs() << "----\n");
     reset();
@@ -235,6 +238,7 @@ void UnwrappedLineParser::parse() {
         Callback.consumeUnwrappedLine(Line);
       }
       Callback.finishRun();
+      TotalLinesProcessed += ExpandedLines.size();
     }
 
     LLVM_DEBUG(llvm::dbgs() << "Unwrapped lines:\n");
@@ -243,6 +247,14 @@ void UnwrappedLineParser::parse() {
       Callback.consumeUnwrappedLine(Line);
     }
     Callback.finishRun();
+    TotalLinesProcessed += Lines.size();
+    if ((TotalLinesProcessed / 10000) > LastReport) {
+      llvm::errs() << "Processed " << TotalLinesProcessed << " lines\n";
+      LastReport = (TotalLinesProcessed / 10000);
+    }
+    if (MaxLinesToProcess > 0 && TotalLinesProcessed >= MaxLinesToProcess) {
+      break;
+    }
     Lines.clear();
     while (!PPLevelBranchIndex.empty() &&
            PPLevelBranchIndex.back() + 1 >= PPLevelBranchCount.back()) {
diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h
index 739298690bbd765..f54ad6bbd79a209 100644
--- a/clang/lib/Format/UnwrappedLineParser.h
+++ b/clang/lib/Format/UnwrappedLineParser.h
@@ -111,7 +111,8 @@ class UnwrappedLineParser {
                       unsigned FirstStartColumn, ArrayRef<FormatToken *> Tokens,
                       UnwrappedLineConsumer &Callback,
                       llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
-                      IdentifierTable &IdentTable);
+                      IdentifierTable &IdentTable,
+                      unsigned MaxLinesToProcess = 0);
 
   void parse();
 
@@ -406,6 +407,11 @@ class UnwrappedLineParser {
 
   MacroExpander Macros;
 
+  // If set to a nonzero value, stop generating new runs after this
+  // many lines (summed over all the runs generated so far) have been
+  // processed.
+  unsigned MaxLinesToProcess;
+
   friend class ScopedLineState;
   friend class CompoundStatementIndenter;
 };