[clang] [clang-format] Limit how much work guessLanguage() can do (PR #78925)

Nathan Ridge via cfe-commits cfe-commits at lists.llvm.org
Mon Jan 22 12:21:03 PST 2024


https://github.com/HighCommander4 updated https://github.com/llvm/llvm-project/pull/78925

>From 47ebe97c199ade54a9538eaed02171d78e6a5a46 Mon Sep 17 00:00:00 2001
From: Nathan Ridge <zeratul976 at hotmail.com>
Date: Sun, 21 Jan 2024 02:45:16 -0500
Subject: [PATCH] [clang-format] Limit how much work guessLanguage() can do

guessLanguage() uses UnwrappedLineParser to process different
preprocessor variants of a file. For large files with many
preprocessor branches, the number of variants can be very large
and the operation can hang for a long time and eventually OOM.
(This has been observed particularly for single-header libraries
such as miniaudio.h).

This patch implements a limit on how many variants guessLanguage()
analyzes, to avoid such a performance cliff.

The limit is expressed as a maximum number of lines (summed over
preprocessor variants) to process. This allows shorter files to
have more variants processed than longer files.

Fixes https://github.com/clangd/clangd/issues/719
Fixes https://github.com/clangd/clangd/issues/1384
Fixes https://github.com/llvm/llvm-project/issues/70945

This patch does not fix the broader problem of actually trying to
format such large headers, which involves using UnwrappedLineParser
from call sites other than guessLanguage(), though the approach
in the patch could be extended to other call sites as well.
---
 clang/lib/Format/Format.cpp              |  8 +++++++-
 clang/lib/Format/TokenAnalyzer.cpp       |  8 +++++---
 clang/lib/Format/TokenAnalyzer.h         |  4 +++-
 clang/lib/Format/UnwrappedLineParser.cpp | 11 +++++++++--
 clang/lib/Format/UnwrappedLineParser.h   |  8 +++++++-
 5 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index f798d555bf9929..fbafce38d52b77 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -2830,7 +2830,7 @@ class Cleaner : public TokenAnalyzer {
 class ObjCHeaderStyleGuesser : public TokenAnalyzer {
 public:
   ObjCHeaderStyleGuesser(const Environment &Env, const FormatStyle &Style)
-      : TokenAnalyzer(Env, Style), IsObjC(false) {}
+      : TokenAnalyzer(Env, Style, MaxLinesToProcess), IsObjC(false) {}
 
   std::pair<tooling::Replacements, unsigned>
   analyze(TokenAnnotator &Annotator,
@@ -2846,6 +2846,12 @@ class ObjCHeaderStyleGuesser : public TokenAnalyzer {
   bool isObjC() { return IsObjC; }
 
 private:
+  // Limit the number of variants of the file TokenAnalyzer processes for
+  // the purpose of guessing the language. An inaccurate guess is better than
+  // hanging for a long time or OOMing, which has been observed with real
+  // libraries which are single-header with many preprocessor branches.
+  static const unsigned MaxLinesToProcess = (1 << 20);
+
   static bool
   guessIsObjC(const SourceManager &SourceManager,
               const SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
diff --git a/clang/lib/Format/TokenAnalyzer.cpp b/clang/lib/Format/TokenAnalyzer.cpp
index bd648c430f9b0a..4f1195d5f82b55 100644
--- a/clang/lib/Format/TokenAnalyzer.cpp
+++ b/clang/lib/Format/TokenAnalyzer.cpp
@@ -83,12 +83,14 @@ Environment::Environment(StringRef Code, StringRef FileName,
       ID(VirtualSM->get().getMainFileID()), FirstStartColumn(FirstStartColumn),
       NextStartColumn(NextStartColumn), LastStartColumn(LastStartColumn) {}
 
-TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style)
+TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style,
+                             unsigned MaxLinesToProcess)
     : Style(Style), Env(Env),
       AffectedRangeMgr(Env.getSourceManager(), Env.getCharRanges()),
       UnwrappedLines(1),
       Encoding(encoding::detectEncoding(
-          Env.getSourceManager().getBufferData(Env.getFileID()))) {
+          Env.getSourceManager().getBufferData(Env.getFileID()))),
+      MaxLinesToProcess(MaxLinesToProcess) {
   LLVM_DEBUG(
       llvm::dbgs() << "File encoding: "
                    << (Encoding == encoding::Encoding_UTF8 ? "UTF8" : "unknown")
@@ -109,7 +111,7 @@ TokenAnalyzer::process(bool SkipAnnotation) {
   SmallVector<FormatToken *, 10> Tokens(Toks.begin(), Toks.end());
   UnwrappedLineParser Parser(Env.getSourceManager(), Style, Lex.getKeywords(),
                              Env.getFirstStartColumn(), Tokens, *this,
-                             Allocator, IdentTable);
+                             Allocator, IdentTable, MaxLinesToProcess);
   Parser.parse();
   assert(UnwrappedLines.back().empty());
   unsigned Penalty = 0;
diff --git a/clang/lib/Format/TokenAnalyzer.h b/clang/lib/Format/TokenAnalyzer.h
index 4086dab1c94c3a..65770d4be5ca28 100644
--- a/clang/lib/Format/TokenAnalyzer.h
+++ b/clang/lib/Format/TokenAnalyzer.h
@@ -87,7 +87,8 @@ class Environment {
 
 class TokenAnalyzer : public UnwrappedLineConsumer {
 public:
-  TokenAnalyzer(const Environment &Env, const FormatStyle &Style);
+  TokenAnalyzer(const Environment &Env, const FormatStyle &Style,
+                unsigned MaxLinesToProcess = 0);
 
   std::pair<tooling::Replacements, unsigned>
   process(bool SkipAnnotation = false);
@@ -109,6 +110,7 @@ class TokenAnalyzer : public UnwrappedLineConsumer {
   AffectedRangeManager AffectedRangeMgr;
   SmallVector<SmallVector<UnwrappedLine, 16>, 2> UnwrappedLines;
   encoding::Encoding Encoding;
+  unsigned MaxLinesToProcess;
 };
 
 } // end namespace format
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 684609747a5513..3aa92cdae8de44 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -151,7 +151,7 @@ UnwrappedLineParser::UnwrappedLineParser(
     const AdditionalKeywords &Keywords, unsigned FirstStartColumn,
     ArrayRef<FormatToken *> Tokens, UnwrappedLineConsumer &Callback,
     llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
-    IdentifierTable &IdentTable)
+    IdentifierTable &IdentTable, unsigned MaxLinesToProcess)
     : Line(new UnwrappedLine), MustBreakBeforeNextToken(false),
       CurrentLines(&Lines), Style(Style), Keywords(Keywords),
       CommentPragmasRegex(Style.CommentPragmas), Tokens(nullptr),
@@ -160,7 +160,8 @@ UnwrappedLineParser::UnwrappedLineParser(
                        ? IG_Rejected
                        : IG_Inited),
       IncludeGuardToken(nullptr), FirstStartColumn(FirstStartColumn),
-      Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable) {}
+      Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable),
+      MaxLinesToProcess(MaxLinesToProcess) {}
 
 void UnwrappedLineParser::reset() {
   PPBranchLevel = -1;
@@ -194,6 +195,7 @@ void UnwrappedLineParser::reset() {
 void UnwrappedLineParser::parse() {
   IndexedTokenSource TokenSource(AllTokens);
   Line->FirstStartColumn = FirstStartColumn;
+  size_t TotalLinesProcessed = 0;
   do {
     LLVM_DEBUG(llvm::dbgs() << "----\n");
     reset();
@@ -235,6 +237,7 @@ void UnwrappedLineParser::parse() {
         Callback.consumeUnwrappedLine(Line);
       }
       Callback.finishRun();
+      TotalLinesProcessed += ExpandedLines.size();
     }
 
     LLVM_DEBUG(llvm::dbgs() << "Unwrapped lines:\n");
@@ -243,6 +246,10 @@ void UnwrappedLineParser::parse() {
       Callback.consumeUnwrappedLine(Line);
     }
     Callback.finishRun();
+    TotalLinesProcessed += Lines.size();
+    if (MaxLinesToProcess > 0 && TotalLinesProcessed >= MaxLinesToProcess) {
+      break;
+    }
     Lines.clear();
     while (!PPLevelBranchIndex.empty() &&
            PPLevelBranchIndex.back() + 1 >= PPLevelBranchCount.back()) {
diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h
index 739298690bbd76..f54ad6bbd79a20 100644
--- a/clang/lib/Format/UnwrappedLineParser.h
+++ b/clang/lib/Format/UnwrappedLineParser.h
@@ -111,7 +111,8 @@ class UnwrappedLineParser {
                       unsigned FirstStartColumn, ArrayRef<FormatToken *> Tokens,
                       UnwrappedLineConsumer &Callback,
                       llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
-                      IdentifierTable &IdentTable);
+                      IdentifierTable &IdentTable,
+                      unsigned MaxLinesToProcess = 0);
 
   void parse();
 
@@ -406,6 +407,11 @@ class UnwrappedLineParser {
 
   MacroExpander Macros;
 
+  // If set to a nonzero value, stop generating new runs after this
+  // many lines (summed over all the runs generated so far) have been
+  // processed.
+  unsigned MaxLinesToProcess;
+
   friend class ScopedLineState;
   friend class CompoundStatementIndenter;
 };



More information about the cfe-commits mailing list