[clang-tools-extra] af89e47 - [pseudo] Add crude heuristics to choose taken preprocessor branches.

Sam McCall via cfe-commits cfe-commits at lists.llvm.org
Wed Apr 6 08:22:52 PDT 2022


Author: Sam McCall
Date: 2022-04-06T17:22:35+02:00
New Revision: af89e4792d23779969c70284dcf6cfafa411637c

URL: https://github.com/llvm/llvm-project/commit/af89e4792d23779969c70284dcf6cfafa411637c
DIFF: https://github.com/llvm/llvm-project/commit/af89e4792d23779969c70284dcf6cfafa411637c.diff

LOG: [pseudo] Add crude heuristics to choose taken preprocessor branches.

In files where different preprocessing paths are possible, our goal is to
choose a preprocessed token sequence which we can parse that pins down as much
of the grammatical structure as possible.
This forms the "primary parse", and the not-taken branches get parsed later,
and are constrained to be compatible with the primary parse.

Concretely:
  int x =
    #ifdef // TAKEN
      2 + 2 + 2 // determined during primary parse to be an expression
    #else
      2 // constrained to be an expression during a secondary parse
    #endif
    ;

Differential Revision: https://reviews.llvm.org/D121165

Added: 
    

Modified: 
    clang-tools-extra/pseudo/include/clang-pseudo/DirectiveMap.h
    clang-tools-extra/pseudo/include/clang-pseudo/Token.h
    clang-tools-extra/pseudo/lib/DirectiveMap.cpp
    clang-tools-extra/pseudo/test/lex.c
    clang-tools-extra/pseudo/tool/ClangPseudo.cpp
    clang-tools-extra/pseudo/unittests/DirectiveMapTest.cpp

Removed: 
    


################################################################################
diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/DirectiveMap.h b/clang-tools-extra/pseudo/include/clang-pseudo/DirectiveMap.h
index 14c752d29040a..67e815e67efdb 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/DirectiveMap.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/DirectiveMap.h
@@ -78,6 +78,11 @@ struct DirectiveMap {
     std::vector<std::pair<Directive, DirectiveMap>> Branches;
     /// The directive terminating the conditional, should be #endif.
     Directive End;
+    /// The index of the conditional branch we chose as active.
+    /// None indicates no branch was taken (e.g. #if 0 ... #endif).
+    /// The initial map from of `parse()` has no branches marked as taken.
+    /// See `chooseConditionalBranches()`.
+    llvm::Optional<unsigned> Taken;
   };
 
   /// Some piece of the file. {One of Code, Directive, Conditional}.
@@ -87,7 +92,6 @@ struct DirectiveMap {
   /// Extract preprocessor structure by examining the raw tokens.
   static DirectiveMap parse(const TokenStream &);
 
-  // FIXME: add heuristically selection of conditional branches.
   // FIXME: allow deriving a preprocessed stream
 };
 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const DirectiveMap &);
@@ -98,6 +102,24 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &,
 llvm::raw_ostream &operator<<(llvm::raw_ostream &,
                               const DirectiveMap::Conditional &);
 
+/// Selects a "taken" branch for each conditional directive in the file.
+///
+/// The choice is somewhat arbitrary, but aims to produce a useful parse:
+///  - idioms like `#if 0` are respected
+///  - we avoid paths that reach `#error`
+///  - we try to maximize the amount of code seen
+/// The choice may also be "no branch taken".
+///
+/// Choices are also made for conditionals themselves inside not-taken branches:
+///   #if 1 // taken!
+///   #else // not taken
+///      #if 1 // taken!
+///      #endif
+///   #endif
+///
+/// The choices are stored in Conditional::Taken nodes.
+void chooseConditionalBranches(DirectiveMap &, const TokenStream &Code);
+
 // FIXME: This approximates std::variant<Code, Directive, Conditional>.
 //         Switch once we can use C++17.
 class DirectiveMap::Chunk {

diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
index 4563477b2c4fe..2bbd598736e6d 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
@@ -74,6 +74,20 @@ struct Token {
     Flags |= uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
   }
 
+  /// Returns the next token in the stream. this may not be a sentinel.
+  const Token &next() const {
+    assert(Kind != tok::eof);
+    return *(this + 1);
+  }
+  /// Returns the next token in the stream, skipping over comments.
+  const Token &nextNC() const {
+    const Token *T = this;
+    do
+      T = &T->next();
+    while (T->Kind == tok::comment);
+    return *T;
+  }
+
   /// The type of token as determined by clang's lexer.
   clang::tok::TokenKind Kind = clang::tok::unknown;
 };

diff  --git a/clang-tools-extra/pseudo/lib/DirectiveMap.cpp b/clang-tools-extra/pseudo/lib/DirectiveMap.cpp
index 6e66708818954..f25231c9b7f28 100644
--- a/clang-tools-extra/pseudo/lib/DirectiveMap.cpp
+++ b/clang-tools-extra/pseudo/lib/DirectiveMap.cpp
@@ -151,10 +151,11 @@ DirectiveMap DirectiveMap::parse(const TokenStream &Code) {
 
 static void dump(llvm::raw_ostream &OS, const DirectiveMap &, unsigned Indent);
 static void dump(llvm::raw_ostream &OS,
-                 const DirectiveMap::Directive &Directive, unsigned Indent) {
-  OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n",
-                                     tok::getPPKeywordSpelling(Directive.Kind),
-                                     Directive.Tokens.size());
+                 const DirectiveMap::Directive &Directive, unsigned Indent,
+                 bool Taken = false) {
+  OS.indent(Indent) << llvm::formatv(
+      "#{0} ({1} tokens){2}\n", tok::getPPKeywordSpelling(Directive.Kind),
+      Directive.Tokens.size(), Taken ? " TAKEN" : "");
 }
 static void dump(llvm::raw_ostream &OS, const DirectiveMap::Code &Code,
                  unsigned Indent) {
@@ -163,8 +164,9 @@ static void dump(llvm::raw_ostream &OS, const DirectiveMap::Code &Code,
 static void dump(llvm::raw_ostream &OS,
                  const DirectiveMap::Conditional &Conditional,
                  unsigned Indent) {
-  for (const auto &Branch : Conditional.Branches) {
-    dump(OS, Branch.first, Indent);
+  for (unsigned I = 0; I < Conditional.Branches.size(); ++I) {
+    const auto &Branch = Conditional.Branches[I];
+    dump(OS, Branch.first, Indent, Conditional.Taken == I);
     dump(OS, Branch.second, Indent + 2);
   }
   dump(OS, Conditional.End, Indent);
@@ -203,5 +205,147 @@ OSTREAM_DUMP(DirectiveMap::Conditional)
 OSTREAM_DUMP(DirectiveMap::Code)
 #undef OSTREAM_DUMP
 
+namespace {
+// Makes choices about conditional branches.
+//
+// Generally it tries to maximize the amount of useful code we see.
+//
+// Caveat: each conditional is evaluated independently. Consider this code:
+//   #ifdef WINDOWS
+//     bool isWindows = true;
+//   #endif
+//   #ifndef WINDOWS
+//     bool isWindows = false;
+//   #endif
+// We take both branches and define isWindows twice. We could track more state
+// in order to produce a consistent view, but this is complex.
+class BranchChooser {
+public:
+  BranchChooser(const TokenStream &Code) : Code(Code) {}
+
+  void choose(DirectiveMap &M) { walk(M); }
+
+private:
+  // Describes code seen by making particular branch choices. Higher is better.
+  struct Score {
+    int Tokens = 0; // excluding comments and directives
+    int Directives = 0;
+    int Errors = 0; // #error directives
+
+    bool operator>(const Score &Other) const {
+      // Seeing errors is bad, other things are good.
+      return std::make_tuple(-Errors, Tokens, Directives) >
+             std::make_tuple(-Other.Errors, Other.Tokens, Other.Directives);
+    }
+
+    Score &operator+=(const Score &Other) {
+      Tokens += Other.Tokens;
+      Directives += Other.Directives;
+      Errors += Other.Errors;
+      return *this;
+    }
+  };
+
+  Score walk(DirectiveMap::Code &C) {
+    Score S;
+    for (const Token &T : Code.tokens(C.Tokens))
+      if (T.Kind != tok::comment)
+        ++S.Tokens;
+    return S;
+  }
+
+  Score walk(DirectiveMap::Directive &D) {
+    Score S;
+    S.Directives = 1;
+    S.Errors = D.Kind == tok::pp_error;
+    return S;
+  }
+
+  Score walk(DirectiveMap::Chunk &C) {
+    switch (C.kind()) {
+    case DirectiveMap::Chunk::K_Code:
+      return walk((DirectiveMap::Code &)C);
+    case DirectiveMap::Chunk::K_Directive:
+      return walk((DirectiveMap::Directive &)C);
+    case DirectiveMap::Chunk::K_Conditional:
+      return walk((DirectiveMap::Conditional &)C);
+    case DirectiveMap::Chunk::K_Empty:
+      break;
+    }
+    llvm_unreachable("bad chunk kind");
+  }
+
+  Score walk(DirectiveMap &M) {
+    Score S;
+    for (DirectiveMap::Chunk &C : M.Chunks)
+      S += walk(C);
+    return S;
+  }
+
+  Score walk(DirectiveMap::Conditional &C) {
+    Score Best;
+    bool MayTakeTrivial = true;
+    bool TookTrivial = false;
+
+    for (unsigned I = 0; I < C.Branches.size(); ++I) {
+      // Walk the branch to make its nested choices in any case.
+      Score BranchScore = walk(C.Branches[I].second);
+      // If we already took an #if 1, don't consider any other branches.
+      if (TookTrivial)
+        continue;
+      // Is this a trivial #if 0 or #if 1?
+      if (auto TriviallyTaken = isTakenWhenReached(C.Branches[I].first)) {
+        if (!*TriviallyTaken)
+          continue; // Don't consider #if 0 even if it scores well.
+        if (MayTakeTrivial)
+          TookTrivial = true;
+      } else {
+        // After a nontrivial condition, #elif 1 isn't guaranteed taken.
+        MayTakeTrivial = false;
+      }
+      // Is this the best branch so far? (Including if it's #if 1).
+      if (TookTrivial || !C.Taken.hasValue() || BranchScore > Best) {
+        Best = BranchScore;
+        C.Taken = I;
+      }
+    }
+    return Best;
+  }
+
+  // Return true if the directive starts an always-taken conditional branch,
+  // false if the branch is never taken, and None otherwise.
+  llvm::Optional<bool> isTakenWhenReached(const DirectiveMap::Directive &Dir) {
+    switch (Dir.Kind) {
+    case clang::tok::pp_if:
+    case clang::tok::pp_elif:
+      break; // handled below
+    case clang::tok::pp_else:
+      return true;
+    default: // #ifdef etc
+      return llvm::None;
+    }
+
+    const auto &Tokens = Code.tokens(Dir.Tokens);
+    assert(!Tokens.empty() && Tokens.front().Kind == tok::hash);
+    const Token &Name = Tokens.front().nextNC();
+    const Token &Value = Name.nextNC();
+    // Does the condition consist of exactly one token?
+    if (&Value >= Tokens.end() || &Value.nextNC() < Tokens.end())
+      return llvm::None;
+    return llvm::StringSwitch<llvm::Optional<bool>>(Value.text())
+        .Cases("true", "1", true)
+        .Cases("false", "0", false)
+        .Default(llvm::None);
+  }
+
+  const TokenStream &Code;
+};
+
+} // namespace
+
+void chooseConditionalBranches(DirectiveMap &Map, const TokenStream &Code) {
+  BranchChooser{Code}.choose(Map);
+}
+
 } // namespace pseudo
 } // namespace clang

diff  --git a/clang-tools-extra/pseudo/test/lex.c b/clang-tools-extra/pseudo/test/lex.c
index 39b114e8d9617..0bba84c91f405 100644
--- a/clang-tools-extra/pseudo/test/lex.c
+++ b/clang-tools-extra/pseudo/test/lex.c
@@ -41,7 +41,7 @@ TOKEN-NEXT: r_brace          6:0 "}" flags=1
 
 RUN: clang-pseudo -source %s -print-directive-map | FileCheck %s -check-prefix=PPS --strict-whitespace
      PPS: code (5 tokens)
-PPS-NEXT: #ifndef (3 tokens)
+PPS-NEXT: #ifndef (3 tokens) TAKEN
 PPS-NEXT:   code (4 tokens)
 PPS-NEXT: #else (2 tokens)
 PPS-NEXT:   code (3 tokens)

diff  --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
index e893ed2fb03e6..fd5eb60d5eb3d 100644
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -75,6 +75,7 @@ int main(int argc, char *argv[]) {
     clang::LangOptions LangOpts; // FIXME: use real options.
     auto Stream = clang::pseudo::lex(Text, LangOpts);
     auto Structure = clang::pseudo::DirectiveMap::parse(Stream);
+    clang::pseudo::chooseConditionalBranches(Structure, Stream);
 
     if (PrintDirectiveMap)
       llvm::outs() << Structure;

diff  --git a/clang-tools-extra/pseudo/unittests/DirectiveMapTest.cpp b/clang-tools-extra/pseudo/unittests/DirectiveMapTest.cpp
index 1feae342655be..010451fd7dc3f 100644
--- a/clang-tools-extra/pseudo/unittests/DirectiveMapTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/DirectiveMapTest.cpp
@@ -145,6 +145,162 @@ TEST(DirectiveMap, ParseBroken) {
   EXPECT_EQ(0u, X.End.Tokens.size());
 }
 
+TEST(DirectiveMap, ChooseBranches) {
+  LangOptions Opts;
+  const std::string Cases[] = {
+      R"cpp(
+        // Branches with no alternatives are taken
+        #if COND // TAKEN
+        int x;
+        #endif
+      )cpp",
+
+      R"cpp(
+        // Empty branches are better than nothing
+        #if COND // TAKEN
+        #endif
+      )cpp",
+
+      R"cpp(
+        // Trivially false branches are not taken, even with no alternatives.
+        #if 0
+        int x;
+        #endif
+      )cpp",
+
+      R"cpp(
+        // Longer branches are preferred over shorter branches
+        #if COND // TAKEN
+        int x = 1;
+        #else
+        int x;
+        #endif
+
+        #if COND
+        int x;
+        #else // TAKEN
+        int x = 1;
+        #endif
+      )cpp",
+
+      R"cpp(
+        // Trivially true branches are taken if previous branches are trivial.
+        #if 1 // TAKEN
+        #else
+          int x = 1;
+        #endif
+
+        #if 0
+          int x = 1;
+        #elif 0
+          int x = 2;
+        #elif 1 // TAKEN
+          int x;
+        #endif
+
+        #if 0
+          int x = 1;
+        #elif FOO // TAKEN
+          int x = 2;
+        #elif 1
+          int x;
+        #endif
+      )cpp",
+
+      R"cpp(
+        // #else is a trivially true branch
+        #if 0
+          int x = 1;
+        #elif 0
+          int x = 2;
+        #else // TAKEN
+          int x;
+        #endif
+      )cpp",
+
+      R"cpp(
+        // Directives break ties, but nondirective text is more important.
+        #if FOO
+          #define A 1 2 3
+        #else // TAKEN
+          #define B 4 5 6
+          #define C 7 8 9
+        #endif
+
+        #if FOO // TAKEN
+          ;
+          #define A 1 2 3
+        #else
+          #define B 4 5 6
+          #define C 7 8 9
+        #endif
+      )cpp",
+
+      R"cpp(
+        // Avoid #error directives.
+        #if FOO
+          int x = 42;
+          #error This branch is no good
+        #else // TAKEN
+        #endif
+
+        #if FOO
+          // All paths here lead to errors.
+          int x = 42;
+          #if 1 // TAKEN
+            #if COND // TAKEN
+              #error This branch is no good
+            #else
+              #error This one is no good either
+            #endif
+          #endif
+        #else // TAKEN
+        #endif
+      )cpp",
+
+      R"cpp(
+        // Populate taken branches recursively.
+        #if FOO // TAKEN
+          int x = 42;
+          #if BAR
+            ;
+          #else // TAKEN
+            int y = 43;
+          #endif
+        #else
+          int x;
+          #if BAR // TAKEN
+            int y;
+          #else
+            ;
+          #endif
+        #endif
+      )cpp",
+  };
+  for (const auto &Code : Cases) {
+    TokenStream S = cook(lex(Code, Opts), Opts);
+
+    std::function<void(const DirectiveMap &)> Verify =
+        [&](const DirectiveMap &M) {
+          for (const auto &C : M.Chunks) {
+            if (C.kind() != DirectiveMap::Chunk::K_Conditional)
+              continue;
+            const DirectiveMap::Conditional &Cond(C);
+            for (unsigned I = 0; I < Cond.Branches.size(); ++I) {
+              auto Directive = S.tokens(Cond.Branches[I].first.Tokens);
+              EXPECT_EQ(I == Cond.Taken, Directive.back().text() == "// TAKEN")
+                  << "At line " << Directive.front().Line << " of: " << Code;
+              Verify(Cond.Branches[I].second);
+            }
+          }
+        };
+
+    DirectiveMap Map = DirectiveMap::parse(S);
+    chooseConditionalBranches(Map, S);
+    Verify(Map);
+  }
+}
+
 } // namespace
 } // namespace pseudo
 } // namespace clang


        


More information about the cfe-commits mailing list