[clang-tools-extra] af89e47 - [pseudo] Add crude heuristics to choose taken preprocessor branches.
Sam McCall via cfe-commits
cfe-commits at lists.llvm.org
Wed Apr 6 08:22:52 PDT 2022
Author: Sam McCall
Date: 2022-04-06T17:22:35+02:00
New Revision: af89e4792d23779969c70284dcf6cfafa411637c
URL: https://github.com/llvm/llvm-project/commit/af89e4792d23779969c70284dcf6cfafa411637c
DIFF: https://github.com/llvm/llvm-project/commit/af89e4792d23779969c70284dcf6cfafa411637c.diff
LOG: [pseudo] Add crude heuristics to choose taken preprocessor branches.
In files where different preprocessing paths are possible, our goal is to
choose a preprocessed token sequence which we can parse that pins down as much
of the grammatical structure as possible.
This forms the "primary parse", and the not-taken branches get parsed later,
and are constrained to be compatible with the primary parse.
Concretely:
int x =
#ifdef // TAKEN
2 + 2 + 2 // determined during primary parse to be an expression
#else
2 // constrained to be an expression during a secondary parse
#endif
;
Differential Revision: https://reviews.llvm.org/D121165
Added:
Modified:
clang-tools-extra/pseudo/include/clang-pseudo/DirectiveMap.h
clang-tools-extra/pseudo/include/clang-pseudo/Token.h
clang-tools-extra/pseudo/lib/DirectiveMap.cpp
clang-tools-extra/pseudo/test/lex.c
clang-tools-extra/pseudo/tool/ClangPseudo.cpp
clang-tools-extra/pseudo/unittests/DirectiveMapTest.cpp
Removed:
################################################################################
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/DirectiveMap.h b/clang-tools-extra/pseudo/include/clang-pseudo/DirectiveMap.h
index 14c752d29040a..67e815e67efdb 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/DirectiveMap.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/DirectiveMap.h
@@ -78,6 +78,11 @@ struct DirectiveMap {
std::vector<std::pair<Directive, DirectiveMap>> Branches;
/// The directive terminating the conditional, should be #endif.
Directive End;
+ /// The index of the conditional branch we chose as active.
+ /// None indicates no branch was taken (e.g. #if 0 ... #endif).
+ /// The initial map from of `parse()` has no branches marked as taken.
+ /// See `chooseConditionalBranches()`.
+ llvm::Optional<unsigned> Taken;
};
/// Some piece of the file. {One of Code, Directive, Conditional}.
@@ -87,7 +92,6 @@ struct DirectiveMap {
/// Extract preprocessor structure by examining the raw tokens.
static DirectiveMap parse(const TokenStream &);
- // FIXME: add heuristically selection of conditional branches.
// FIXME: allow deriving a preprocessed stream
};
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const DirectiveMap &);
@@ -98,6 +102,24 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &,
llvm::raw_ostream &operator<<(llvm::raw_ostream &,
const DirectiveMap::Conditional &);
+/// Selects a "taken" branch for each conditional directive in the file.
+///
+/// The choice is somewhat arbitrary, but aims to produce a useful parse:
+/// - idioms like `#if 0` are respected
+/// - we avoid paths that reach `#error`
+/// - we try to maximize the amount of code seen
+/// The choice may also be "no branch taken".
+///
+/// Choices are also made for conditionals themselves inside not-taken branches:
+/// #if 1 // taken!
+/// #else // not taken
+/// #if 1 // taken!
+/// #endif
+/// #endif
+///
+/// The choices are stored in Conditional::Taken nodes.
+void chooseConditionalBranches(DirectiveMap &, const TokenStream &Code);
+
// FIXME: This approximates std::variant<Code, Directive, Conditional>.
// Switch once we can use C++17.
class DirectiveMap::Chunk {
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
index 4563477b2c4fe..2bbd598736e6d 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
@@ -74,6 +74,20 @@ struct Token {
Flags |= uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
}
+ /// Returns the next token in the stream. this may not be a sentinel.
+ const Token &next() const {
+ assert(Kind != tok::eof);
+ return *(this + 1);
+ }
+ /// Returns the next token in the stream, skipping over comments.
+ const Token &nextNC() const {
+ const Token *T = this;
+ do
+ T = &T->next();
+ while (T->Kind == tok::comment);
+ return *T;
+ }
+
/// The type of token as determined by clang's lexer.
clang::tok::TokenKind Kind = clang::tok::unknown;
};
diff --git a/clang-tools-extra/pseudo/lib/DirectiveMap.cpp b/clang-tools-extra/pseudo/lib/DirectiveMap.cpp
index 6e66708818954..f25231c9b7f28 100644
--- a/clang-tools-extra/pseudo/lib/DirectiveMap.cpp
+++ b/clang-tools-extra/pseudo/lib/DirectiveMap.cpp
@@ -151,10 +151,11 @@ DirectiveMap DirectiveMap::parse(const TokenStream &Code) {
static void dump(llvm::raw_ostream &OS, const DirectiveMap &, unsigned Indent);
static void dump(llvm::raw_ostream &OS,
- const DirectiveMap::Directive &Directive, unsigned Indent) {
- OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n",
- tok::getPPKeywordSpelling(Directive.Kind),
- Directive.Tokens.size());
+ const DirectiveMap::Directive &Directive, unsigned Indent,
+ bool Taken = false) {
+ OS.indent(Indent) << llvm::formatv(
+ "#{0} ({1} tokens){2}\n", tok::getPPKeywordSpelling(Directive.Kind),
+ Directive.Tokens.size(), Taken ? " TAKEN" : "");
}
static void dump(llvm::raw_ostream &OS, const DirectiveMap::Code &Code,
unsigned Indent) {
@@ -163,8 +164,9 @@ static void dump(llvm::raw_ostream &OS, const DirectiveMap::Code &Code,
static void dump(llvm::raw_ostream &OS,
const DirectiveMap::Conditional &Conditional,
unsigned Indent) {
- for (const auto &Branch : Conditional.Branches) {
- dump(OS, Branch.first, Indent);
+ for (unsigned I = 0; I < Conditional.Branches.size(); ++I) {
+ const auto &Branch = Conditional.Branches[I];
+ dump(OS, Branch.first, Indent, Conditional.Taken == I);
dump(OS, Branch.second, Indent + 2);
}
dump(OS, Conditional.End, Indent);
@@ -203,5 +205,147 @@ OSTREAM_DUMP(DirectiveMap::Conditional)
OSTREAM_DUMP(DirectiveMap::Code)
#undef OSTREAM_DUMP
+namespace {
+// Makes choices about conditional branches.
+//
+// Generally it tries to maximize the amount of useful code we see.
+//
+// Caveat: each conditional is evaluated independently. Consider this code:
+// #ifdef WINDOWS
+// bool isWindows = true;
+// #endif
+// #ifndef WINDOWS
+// bool isWindows = false;
+// #endif
+// We take both branches and define isWindows twice. We could track more state
+// in order to produce a consistent view, but this is complex.
+class BranchChooser {
+public:
+ BranchChooser(const TokenStream &Code) : Code(Code) {}
+
+ void choose(DirectiveMap &M) { walk(M); }
+
+private:
+ // Describes code seen by making particular branch choices. Higher is better.
+ struct Score {
+ int Tokens = 0; // excluding comments and directives
+ int Directives = 0;
+ int Errors = 0; // #error directives
+
+ bool operator>(const Score &Other) const {
+ // Seeing errors is bad, other things are good.
+ return std::make_tuple(-Errors, Tokens, Directives) >
+ std::make_tuple(-Other.Errors, Other.Tokens, Other.Directives);
+ }
+
+ Score &operator+=(const Score &Other) {
+ Tokens += Other.Tokens;
+ Directives += Other.Directives;
+ Errors += Other.Errors;
+ return *this;
+ }
+ };
+
+ Score walk(DirectiveMap::Code &C) {
+ Score S;
+ for (const Token &T : Code.tokens(C.Tokens))
+ if (T.Kind != tok::comment)
+ ++S.Tokens;
+ return S;
+ }
+
+ Score walk(DirectiveMap::Directive &D) {
+ Score S;
+ S.Directives = 1;
+ S.Errors = D.Kind == tok::pp_error;
+ return S;
+ }
+
+ Score walk(DirectiveMap::Chunk &C) {
+ switch (C.kind()) {
+ case DirectiveMap::Chunk::K_Code:
+ return walk((DirectiveMap::Code &)C);
+ case DirectiveMap::Chunk::K_Directive:
+ return walk((DirectiveMap::Directive &)C);
+ case DirectiveMap::Chunk::K_Conditional:
+ return walk((DirectiveMap::Conditional &)C);
+ case DirectiveMap::Chunk::K_Empty:
+ break;
+ }
+ llvm_unreachable("bad chunk kind");
+ }
+
+ Score walk(DirectiveMap &M) {
+ Score S;
+ for (DirectiveMap::Chunk &C : M.Chunks)
+ S += walk(C);
+ return S;
+ }
+
+ Score walk(DirectiveMap::Conditional &C) {
+ Score Best;
+ bool MayTakeTrivial = true;
+ bool TookTrivial = false;
+
+ for (unsigned I = 0; I < C.Branches.size(); ++I) {
+ // Walk the branch to make its nested choices in any case.
+ Score BranchScore = walk(C.Branches[I].second);
+ // If we already took an #if 1, don't consider any other branches.
+ if (TookTrivial)
+ continue;
+ // Is this a trivial #if 0 or #if 1?
+ if (auto TriviallyTaken = isTakenWhenReached(C.Branches[I].first)) {
+ if (!*TriviallyTaken)
+ continue; // Don't consider #if 0 even if it scores well.
+ if (MayTakeTrivial)
+ TookTrivial = true;
+ } else {
+ // After a nontrivial condition, #elif 1 isn't guaranteed taken.
+ MayTakeTrivial = false;
+ }
+ // Is this the best branch so far? (Including if it's #if 1).
+ if (TookTrivial || !C.Taken.hasValue() || BranchScore > Best) {
+ Best = BranchScore;
+ C.Taken = I;
+ }
+ }
+ return Best;
+ }
+
+ // Return true if the directive starts an always-taken conditional branch,
+ // false if the branch is never taken, and None otherwise.
+ llvm::Optional<bool> isTakenWhenReached(const DirectiveMap::Directive &Dir) {
+ switch (Dir.Kind) {
+ case clang::tok::pp_if:
+ case clang::tok::pp_elif:
+ break; // handled below
+ case clang::tok::pp_else:
+ return true;
+ default: // #ifdef etc
+ return llvm::None;
+ }
+
+ const auto &Tokens = Code.tokens(Dir.Tokens);
+ assert(!Tokens.empty() && Tokens.front().Kind == tok::hash);
+ const Token &Name = Tokens.front().nextNC();
+ const Token &Value = Name.nextNC();
+ // Does the condition consist of exactly one token?
+ if (&Value >= Tokens.end() || &Value.nextNC() < Tokens.end())
+ return llvm::None;
+ return llvm::StringSwitch<llvm::Optional<bool>>(Value.text())
+ .Cases("true", "1", true)
+ .Cases("false", "0", false)
+ .Default(llvm::None);
+ }
+
+ const TokenStream &Code;
+};
+
+} // namespace
+
+void chooseConditionalBranches(DirectiveMap &Map, const TokenStream &Code) {
+ BranchChooser{Code}.choose(Map);
+}
+
} // namespace pseudo
} // namespace clang
diff --git a/clang-tools-extra/pseudo/test/lex.c b/clang-tools-extra/pseudo/test/lex.c
index 39b114e8d9617..0bba84c91f405 100644
--- a/clang-tools-extra/pseudo/test/lex.c
+++ b/clang-tools-extra/pseudo/test/lex.c
@@ -41,7 +41,7 @@ TOKEN-NEXT: r_brace 6:0 "}" flags=1
RUN: clang-pseudo -source %s -print-directive-map | FileCheck %s -check-prefix=PPS --strict-whitespace
PPS: code (5 tokens)
-PPS-NEXT: #ifndef (3 tokens)
+PPS-NEXT: #ifndef (3 tokens) TAKEN
PPS-NEXT: code (4 tokens)
PPS-NEXT: #else (2 tokens)
PPS-NEXT: code (3 tokens)
diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
index e893ed2fb03e6..fd5eb60d5eb3d 100644
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -75,6 +75,7 @@ int main(int argc, char *argv[]) {
clang::LangOptions LangOpts; // FIXME: use real options.
auto Stream = clang::pseudo::lex(Text, LangOpts);
auto Structure = clang::pseudo::DirectiveMap::parse(Stream);
+ clang::pseudo::chooseConditionalBranches(Structure, Stream);
if (PrintDirectiveMap)
llvm::outs() << Structure;
diff --git a/clang-tools-extra/pseudo/unittests/DirectiveMapTest.cpp b/clang-tools-extra/pseudo/unittests/DirectiveMapTest.cpp
index 1feae342655be..010451fd7dc3f 100644
--- a/clang-tools-extra/pseudo/unittests/DirectiveMapTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/DirectiveMapTest.cpp
@@ -145,6 +145,162 @@ TEST(DirectiveMap, ParseBroken) {
EXPECT_EQ(0u, X.End.Tokens.size());
}
+TEST(DirectiveMap, ChooseBranches) {
+ LangOptions Opts;
+ const std::string Cases[] = {
+ R"cpp(
+ // Branches with no alternatives are taken
+ #if COND // TAKEN
+ int x;
+ #endif
+ )cpp",
+
+ R"cpp(
+ // Empty branches are better than nothing
+ #if COND // TAKEN
+ #endif
+ )cpp",
+
+ R"cpp(
+ // Trivially false branches are not taken, even with no alternatives.
+ #if 0
+ int x;
+ #endif
+ )cpp",
+
+ R"cpp(
+ // Longer branches are preferred over shorter branches
+ #if COND // TAKEN
+ int x = 1;
+ #else
+ int x;
+ #endif
+
+ #if COND
+ int x;
+ #else // TAKEN
+ int x = 1;
+ #endif
+ )cpp",
+
+ R"cpp(
+ // Trivially true branches are taken if previous branches are trivial.
+ #if 1 // TAKEN
+ #else
+ int x = 1;
+ #endif
+
+ #if 0
+ int x = 1;
+ #elif 0
+ int x = 2;
+ #elif 1 // TAKEN
+ int x;
+ #endif
+
+ #if 0
+ int x = 1;
+ #elif FOO // TAKEN
+ int x = 2;
+ #elif 1
+ int x;
+ #endif
+ )cpp",
+
+ R"cpp(
+ // #else is a trivially true branch
+ #if 0
+ int x = 1;
+ #elif 0
+ int x = 2;
+ #else // TAKEN
+ int x;
+ #endif
+ )cpp",
+
+ R"cpp(
+ // Directives break ties, but nondirective text is more important.
+ #if FOO
+ #define A 1 2 3
+ #else // TAKEN
+ #define B 4 5 6
+ #define C 7 8 9
+ #endif
+
+ #if FOO // TAKEN
+ ;
+ #define A 1 2 3
+ #else
+ #define B 4 5 6
+ #define C 7 8 9
+ #endif
+ )cpp",
+
+ R"cpp(
+ // Avoid #error directives.
+ #if FOO
+ int x = 42;
+ #error This branch is no good
+ #else // TAKEN
+ #endif
+
+ #if FOO
+ // All paths here lead to errors.
+ int x = 42;
+ #if 1 // TAKEN
+ #if COND // TAKEN
+ #error This branch is no good
+ #else
+ #error This one is no good either
+ #endif
+ #endif
+ #else // TAKEN
+ #endif
+ )cpp",
+
+ R"cpp(
+ // Populate taken branches recursively.
+ #if FOO // TAKEN
+ int x = 42;
+ #if BAR
+ ;
+ #else // TAKEN
+ int y = 43;
+ #endif
+ #else
+ int x;
+ #if BAR // TAKEN
+ int y;
+ #else
+ ;
+ #endif
+ #endif
+ )cpp",
+ };
+ for (const auto &Code : Cases) {
+ TokenStream S = cook(lex(Code, Opts), Opts);
+
+ std::function<void(const DirectiveMap &)> Verify =
+ [&](const DirectiveMap &M) {
+ for (const auto &C : M.Chunks) {
+ if (C.kind() != DirectiveMap::Chunk::K_Conditional)
+ continue;
+ const DirectiveMap::Conditional &Cond(C);
+ for (unsigned I = 0; I < Cond.Branches.size(); ++I) {
+ auto Directive = S.tokens(Cond.Branches[I].first.Tokens);
+ EXPECT_EQ(I == Cond.Taken, Directive.back().text() == "// TAKEN")
+ << "At line " << Directive.front().Line << " of: " << Code;
+ Verify(Cond.Branches[I].second);
+ }
+ }
+ };
+
+ DirectiveMap Map = DirectiveMap::parse(S);
+ chooseConditionalBranches(Map, S);
+ Verify(Map);
+ }
+}
+
} // namespace
} // namespace pseudo
} // namespace clang
More information about the cfe-commits
mailing list