[clang] [clang-format] Limit how much work guessLanguage() can do (PR #78925)

Sun Jan 21 18:50:21 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-clang-format

Author: Nathan Ridge (HighCommander4)

<details>
<summary>Changes</summary>

guessLanguage() uses UnwrappedLineParser to process different preprocessor variants of a file. For large files with many preprocessor branches, the number of variants can be very large and the operation can hang for a long time and eventually OOM. (This has been observed particularly for single-header libraries such as miniaudio.h).

This patch implements a limit on how many variants guessLanguage() analyzes, to avoid such a performance cliff.

The limit is expressed as a maximum number of lines (summed over preprocessor variants) to process. This allows shorter files to have more variants processed than longer files.

Fixes https://github.com/clangd/clangd/issues/719
Fixes https://github.com/clangd/clangd/issues/1384
Fixes https://github.com/llvm/llvm-project/issues/70945

This patch does not fix the broader problem of actually trying to format such large headers, which involves using UnwrappedLineParser from call sites other than guessLanguage(), though the approach in the patch could be extended to other call sites as well.

---
Full diff: https://github.com/llvm/llvm-project/pull/78925.diff


5 Files Affected:

- (modified) clang/lib/Format/Format.cpp (+7-1) 
- (modified) clang/lib/Format/TokenAnalyzer.cpp (+5-3) 
- (modified) clang/lib/Format/TokenAnalyzer.h (+3-1) 
- (modified) clang/lib/Format/UnwrappedLineParser.cpp (+14-2) 
- (modified) clang/lib/Format/UnwrappedLineParser.h (+7-1) 


``````````diff

diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index f798d555bf9929..fbafce38d52b77 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -2830,7 +2830,7 @@ class Cleaner : public TokenAnalyzer {
 class ObjCHeaderStyleGuesser : public TokenAnalyzer {
 public:
   ObjCHeaderStyleGuesser(const Environment &Env, const FormatStyle &Style)
-      : TokenAnalyzer(Env, Style), IsObjC(false) {}
+      : TokenAnalyzer(Env, Style, MaxLinesToProcess), IsObjC(false) {}
 
   std::pair<tooling::Replacements, unsigned>
   analyze(TokenAnnotator &Annotator,
@@ -2846,6 +2846,12 @@ class ObjCHeaderStyleGuesser : public TokenAnalyzer {
   bool isObjC() { return IsObjC; }
 
 private:
+  // Limit the number of variants of the file TokenAnalyzer processes for
+  // the purpose of guessing the language. An inaccurate guess is better than
+  // hanging for a long time or OOMing, which has been observed with real
+  // libraries which are single-header with many preprocessor branches.
+  static const unsigned MaxLinesToProcess = (1 << 20);
+
   static bool
   guessIsObjC(const SourceManager &SourceManager,
               const SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
diff --git a/clang/lib/Format/TokenAnalyzer.cpp b/clang/lib/Format/TokenAnalyzer.cpp
index bd648c430f9b0a..4f1195d5f82b55 100644
--- a/clang/lib/Format/TokenAnalyzer.cpp
+++ b/clang/lib/Format/TokenAnalyzer.cpp
@@ -83,12 +83,14 @@ Environment::Environment(StringRef Code, StringRef FileName,
       ID(VirtualSM->get().getMainFileID()), FirstStartColumn(FirstStartColumn),
       NextStartColumn(NextStartColumn), LastStartColumn(LastStartColumn) {}
 
-TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style)
+TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style,
+                             unsigned MaxLinesToProcess)
     : Style(Style), Env(Env),
       AffectedRangeMgr(Env.getSourceManager(), Env.getCharRanges()),
       UnwrappedLines(1),
       Encoding(encoding::detectEncoding(
-          Env.getSourceManager().getBufferData(Env.getFileID()))) {
+          Env.getSourceManager().getBufferData(Env.getFileID()))),
+      MaxLinesToProcess(MaxLinesToProcess) {
   LLVM_DEBUG(
       llvm::dbgs() << "File encoding: "
                    << (Encoding == encoding::Encoding_UTF8 ? "UTF8" : "unknown")
@@ -109,7 +111,7 @@ TokenAnalyzer::process(bool SkipAnnotation) {
   SmallVector<FormatToken *, 10> Tokens(Toks.begin(), Toks.end());
   UnwrappedLineParser Parser(Env.getSourceManager(), Style, Lex.getKeywords(),
                              Env.getFirstStartColumn(), Tokens, *this,
-                             Allocator, IdentTable);
+                             Allocator, IdentTable, MaxLinesToProcess);
   Parser.parse();
   assert(UnwrappedLines.back().empty());
   unsigned Penalty = 0;
diff --git a/clang/lib/Format/TokenAnalyzer.h b/clang/lib/Format/TokenAnalyzer.h
index 4086dab1c94c3a..65770d4be5ca28 100644
--- a/clang/lib/Format/TokenAnalyzer.h
+++ b/clang/lib/Format/TokenAnalyzer.h
@@ -87,7 +87,8 @@ class Environment {
 
 class TokenAnalyzer : public UnwrappedLineConsumer {
 public:
-  TokenAnalyzer(const Environment &Env, const FormatStyle &Style);
+  TokenAnalyzer(const Environment &Env, const FormatStyle &Style,
+                unsigned MaxLinesToProcess = 0);
 
   std::pair<tooling::Replacements, unsigned>
   process(bool SkipAnnotation = false);
@@ -109,6 +110,7 @@ class TokenAnalyzer : public UnwrappedLineConsumer {
   AffectedRangeManager AffectedRangeMgr;
   SmallVector<SmallVector<UnwrappedLine, 16>, 2> UnwrappedLines;
   encoding::Encoding Encoding;
+  unsigned MaxLinesToProcess;
 };
 
 } // end namespace format
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 684609747a5513..d925e19f8bf3bc 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -151,7 +151,7 @@ UnwrappedLineParser::UnwrappedLineParser(
     const AdditionalKeywords &Keywords, unsigned FirstStartColumn,
     ArrayRef<FormatToken *> Tokens, UnwrappedLineConsumer &Callback,
     llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
-    IdentifierTable &IdentTable)
+    IdentifierTable &IdentTable, unsigned MaxLinesToProcess)
     : Line(new UnwrappedLine), MustBreakBeforeNextToken(false),
       CurrentLines(&Lines), Style(Style), Keywords(Keywords),
       CommentPragmasRegex(Style.CommentPragmas), Tokens(nullptr),
@@ -160,7 +160,8 @@ UnwrappedLineParser::UnwrappedLineParser(
                        ? IG_Rejected
                        : IG_Inited),
       IncludeGuardToken(nullptr), FirstStartColumn(FirstStartColumn),
-      Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable) {}
+      Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable),
+      MaxLinesToProcess(MaxLinesToProcess) {}
 
 void UnwrappedLineParser::reset() {
   PPBranchLevel = -1;
@@ -194,6 +195,8 @@ void UnwrappedLineParser::reset() {
 void UnwrappedLineParser::parse() {
   IndexedTokenSource TokenSource(AllTokens);
   Line->FirstStartColumn = FirstStartColumn;
+  size_t TotalLinesProcessed = 0;
+  size_t LastReport = 0;
   do {
     LLVM_DEBUG(llvm::dbgs() << "----\n");
     reset();
@@ -235,6 +238,7 @@ void UnwrappedLineParser::parse() {
         Callback.consumeUnwrappedLine(Line);
       }
       Callback.finishRun();
+      TotalLinesProcessed += ExpandedLines.size();
     }
 
     LLVM_DEBUG(llvm::dbgs() << "Unwrapped lines:\n");
@@ -243,6 +247,14 @@ void UnwrappedLineParser::parse() {
       Callback.consumeUnwrappedLine(Line);
     }
     Callback.finishRun();
+    TotalLinesProcessed += Lines.size();
+    if ((TotalLinesProcessed / 10000) > LastReport) {
+      llvm::errs() << "Processed " << TotalLinesProcessed << " lines\n";
+      LastReport = (TotalLinesProcessed / 10000);
+    }
+    if (MaxLinesToProcess > 0 && TotalLinesProcessed >= MaxLinesToProcess) {
+      break;
+    }
     Lines.clear();
     while (!PPLevelBranchIndex.empty() &&
            PPLevelBranchIndex.back() + 1 >= PPLevelBranchCount.back()) {
diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h
index 739298690bbd76..f54ad6bbd79a20 100644
--- a/clang/lib/Format/UnwrappedLineParser.h
+++ b/clang/lib/Format/UnwrappedLineParser.h
@@ -111,7 +111,8 @@ class UnwrappedLineParser {
                       unsigned FirstStartColumn, ArrayRef<FormatToken *> Tokens,
                       UnwrappedLineConsumer &Callback,
                       llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
-                      IdentifierTable &IdentTable);
+                      IdentifierTable &IdentTable,
+                      unsigned MaxLinesToProcess = 0);
 
   void parse();
 
@@ -406,6 +407,11 @@ class UnwrappedLineParser {
 
   MacroExpander Macros;
 
+  // If set to a nonzero value, stop generating new runs after this
+  // many lines (summed over all the runs generated so far) have been
+  // processed.
+  unsigned MaxLinesToProcess;
+
   friend class ScopedLineState;
   friend class CompoundStatementIndenter;
 };

``````````

</details>


https://github.com/llvm/llvm-project/pull/78925