[clang] ee8ed0b - [clang][deps] Teach dep directive scanner about _Pragma
Ben Langmuir via cfe-commits
cfe-commits at lists.llvm.org
Tue May 9 10:31:09 PDT 2023
Author: Ben Langmuir
Date: 2023-05-09T10:05:12-07:00
New Revision: ee8ed0b3099e63ba0a18cca42b9cfdf098bc6201
URL: https://github.com/llvm/llvm-project/commit/ee8ed0b3099e63ba0a18cca42b9cfdf098bc6201
DIFF: https://github.com/llvm/llvm-project/commit/ee8ed0b3099e63ba0a18cca42b9cfdf098bc6201.diff
LOG: [clang][deps] Teach dep directive scanner about _Pragma
While we cannot handle `_Pragma` used inside macros, we can handle
this at the top level, and some projects use the `_Pragma("once")`
spelling like that, which was causing spurious failures in the scanner.
Limitations:
* Cannot handle #define ONCE _Pragma("once"), same issue as using
@import in a macro -- ideally we should diagnose this in obvious cases
* Our LangOpts are currently fixed, so we are not handling u"" strings
or R"()" strings that require C11/C++11.
rdar://108629982
Differential Revision: https://reviews.llvm.org/D149884
Added:
clang/test/ClangScanDeps/_Pragma-once.c
Modified:
clang/include/clang/Lex/Pragma.h
clang/lib/Lex/DependencyDirectivesScanner.cpp
clang/lib/Lex/Pragma.cpp
clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
Removed:
################################################################################
diff --git a/clang/include/clang/Lex/Pragma.h b/clang/include/clang/Lex/Pragma.h
index cf8cca5414eac..67eca618f6c4f 100644
--- a/clang/include/clang/Lex/Pragma.h
+++ b/clang/include/clang/Lex/Pragma.h
@@ -123,6 +123,13 @@ class PragmaNamespace : public PragmaHandler {
PragmaNamespace *getIfNamespace() override { return this; }
};
+/// Destringize a \c _Pragma("") string according to C11 6.10.9.1:
+/// "The string literal is destringized by deleting any encoding prefix,
+/// deleting the leading and trailing double-quotes, replacing each escape
+/// sequence \" by a double-quote, and replacing each escape sequence \\ by a
+/// single backslash."
+void prepare_PragmaString(SmallVectorImpl<char> &StrVal);
+
} // namespace clang
#endif // LLVM_CLANG_LEX_PRAGMA_H
diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index a506b49176302..2bd2c5f8388c0 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -19,6 +19,7 @@
#include "clang/Basic/Diagnostic.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/Lexer.h"
+#include "clang/Lex/Pragma.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringMap.h"
@@ -72,6 +73,8 @@ struct Scanner {
// Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.
LangOpts.ObjC = true;
LangOpts.LineComment = true;
+ // FIXME: we do not enable C11 or C++11, so we are missing u/u8/U"" and
+ // R"()" literals.
return LangOpts;
}
@@ -91,6 +94,10 @@ struct Scanner {
void skipLine(const char *&First, const char *const End);
void skipDirective(StringRef Name, const char *&First, const char *const End);
+ /// Returns the spelling of a string literal or identifier after performing
+ /// any processing needed to handle \c clang::Token::NeedsCleaning.
+ StringRef cleanStringIfNeeded(const dependency_directives_scan::Token &Tok);
+
/// Lexes next token and if it is identifier returns its string, otherwise
/// it skips the current line and returns \p std::nullopt.
///
@@ -112,6 +119,22 @@ struct Scanner {
const char *&First,
const char *const End);
+ /// Lexes next token and returns true iff it matches the kind \p K.
+ /// Otherwise it skips the current line and returns false.
+ ///
+ /// In any case (whatever the token kind) \p First and the \p Lexer will
+ /// advance beyond the token.
+ [[nodiscard]] bool isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
+ const char *const End);
+
+ /// Lexes next token and if it is string literal, returns its string.
+ /// Otherwise, it skips the current line and returns \p std::nullopt.
+ ///
+ /// In any case (whatever the token kind) \p First and the \p Lexer will
+ /// advance beyond the token.
+ [[nodiscard]] std::optional<StringRef>
+ tryLexStringLiteralOrSkipLine(const char *&First, const char *const End);
+
[[nodiscard]] bool scanImpl(const char *First, const char *const End);
[[nodiscard]] bool lexPPLine(const char *&First, const char *const End);
[[nodiscard]] bool lexAt(const char *&First, const char *const End);
@@ -119,6 +142,7 @@ struct Scanner {
[[nodiscard]] bool lexDefine(const char *HashLoc, const char *&First,
const char *const End);
[[nodiscard]] bool lexPragma(const char *&First, const char *const End);
+ [[nodiscard]] bool lex_Pragma(const char *&First, const char *const End);
[[nodiscard]] bool lexEndif(const char *&First, const char *const End);
[[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First,
const char *const End);
@@ -525,15 +549,8 @@ void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {
}
}
-[[nodiscard]] std::optional<StringRef>
-Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
- const dependency_directives_scan::Token &Tok = lexToken(First, End);
- if (Tok.isNot(tok::raw_identifier)) {
- if (!Tok.is(tok::eod))
- skipLine(First, End);
- return std::nullopt;
- }
-
+StringRef
+Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning;
if (LLVM_LIKELY(!NeedsCleaning))
return Input.slice(Tok.Offset, Tok.getEnd());
@@ -541,6 +558,9 @@ Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
SmallString<64> Spelling;
Spelling.resize(Tok.Length);
+ // FIXME: C++11 raw string literals need special handling (see getSpellingSlow
+ // in the Lexer). Currently we cannot see them due to our LangOpts.
+
unsigned SpellingLength = 0;
const char *BufPtr = Input.begin() + Tok.Offset;
const char *AfterIdent = Input.begin() + Tok.getEnd();
@@ -555,6 +575,18 @@ Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
.first->first();
}
+std::optional<StringRef>
+Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
+ const dependency_directives_scan::Token &Tok = lexToken(First, End);
+ if (Tok.isNot(tok::raw_identifier)) {
+ if (!Tok.is(tok::eod))
+ skipLine(First, End);
+ return std::nullopt;
+ }
+
+ return cleanStringIfNeeded(Tok);
+}
+
StringRef Scanner::lexIdentifier(const char *&First, const char *const End) {
std::optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End);
assert(Id && "expected identifier token");
@@ -572,6 +604,28 @@ bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First,
return false;
}
+bool Scanner::isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
+ const char *const End) {
+ const dependency_directives_scan::Token &Tok = lexToken(First, End);
+ if (Tok.is(K))
+ return true;
+ skipLine(First, End);
+ return false;
+}
+
+std::optional<StringRef>
+Scanner::tryLexStringLiteralOrSkipLine(const char *&First,
+ const char *const End) {
+ const dependency_directives_scan::Token &Tok = lexToken(First, End);
+ if (!tok::isStringLiteral(Tok.Kind)) {
+ if (!Tok.is(tok::eod))
+ skipLine(First, End);
+ return std::nullopt;
+ }
+
+ return cleanStringIfNeeded(Tok);
+}
+
bool Scanner::lexAt(const char *&First, const char *const End) {
// Handle "@import".
@@ -629,6 +683,41 @@ bool Scanner::lexModule(const char *&First, const char *const End) {
return lexModuleDirectiveBody(Kind, First, End);
}
+bool Scanner::lex_Pragma(const char *&First, const char *const End) {
+ if (!isNextTokenOrSkipLine(tok::l_paren, First, End))
+ return false;
+
+ std::optional<StringRef> Str = tryLexStringLiteralOrSkipLine(First, End);
+
+ if (!Str || !isNextTokenOrSkipLine(tok::r_paren, First, End))
+ return false;
+
+ SmallString<64> Buffer(*Str);
+ prepare_PragmaString(Buffer);
+
+ // Use a new scanner instance since the tokens will be inside the allocated
+ // string. We should already have captured all the relevant tokens in the
+ // current scanner.
+ SmallVector<dependency_directives_scan::Token> DiscardTokens;
+ const char *Begin = Buffer.c_str();
+ Scanner PragmaScanner{StringRef(Begin, Buffer.size()), DiscardTokens, Diags,
+ InputSourceLoc};
+
+ PragmaScanner.TheLexer.setParsingPreprocessorDirective(true);
+ if (PragmaScanner.lexPragma(Begin, Buffer.end()))
+ return true;
+
+ DirectiveKind K = PragmaScanner.topDirective();
+ if (K == pp_none) {
+ skipLine(First, End);
+ return false;
+ }
+
+ assert(Begin == Buffer.end());
+ pushDirective(K);
+ return false;
+}
+
bool Scanner::lexPragma(const char *&First, const char *const End) {
std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
if (!FoundId)
@@ -713,6 +802,7 @@ static bool isStartOfRelevantLine(char First) {
case 'i':
case 'e':
case 'm':
+ case '_':
return true;
}
return false;
@@ -749,6 +839,12 @@ bool Scanner::lexPPLine(const char *&First, const char *const End) {
if (*First == 'i' || *First == 'e' || *First == 'm')
return lexModule(First, End);
+ if (*First == '_') {
+ if (isNextIdentifierOrSkipLine("_Pragma", First, End))
+ return lex_Pragma(First, End);
+ return false;
+ }
+
// Handle preprocessing directives.
TheLexer.setParsingPreprocessorDirective(true);
diff --git a/clang/lib/Lex/Pragma.cpp b/clang/lib/Lex/Pragma.cpp
index db8a5891679f4..0b892a3755a50 100644
--- a/clang/lib/Lex/Pragma.cpp
+++ b/clang/lib/Lex/Pragma.cpp
@@ -262,17 +262,48 @@ void Preprocessor::Handle_Pragma(Token &Tok) {
SourceLocation RParenLoc = Tok.getLocation();
bool Invalid = false;
- std::string StrVal = getSpelling(StrTok, &Invalid);
+ SmallString<64> StrVal;
+ StrVal.resize(StrTok.getLength());
+ StringRef StrValRef = getSpelling(StrTok, StrVal, &Invalid);
if (Invalid) {
Diag(PragmaLoc, diag::err__Pragma_malformed);
return;
}
- // The _Pragma is lexically sound. Destringize according to C11 6.10.9.1:
- // "The string literal is destringized by deleting any encoding prefix,
- // deleting the leading and trailing double-quotes, replacing each escape
- // sequence \" by a double-quote, and replacing each escape sequence \\ by a
- // single backslash."
+ assert(StrValRef.size() <= StrVal.size());
+
+ // If the token was spelled somewhere else, copy it.
+ if (StrValRef.begin() != StrVal.begin())
+ StrVal.assign(StrValRef);
+ // Truncate if necessary.
+ else if (StrValRef.size() != StrVal.size())
+ StrVal.resize(StrValRef.size());
+
+ // The _Pragma is lexically sound. Destringize according to C11 6.10.9.1.
+ prepare_PragmaString(StrVal);
+
+ // Plop the string (including the newline and trailing null) into a buffer
+ // where we can lex it.
+ Token TmpTok;
+ TmpTok.startToken();
+ CreateString(StrVal, TmpTok);
+ SourceLocation TokLoc = TmpTok.getLocation();
+
+ // Make and enter a lexer object so that we lex and expand the tokens just
+ // like any others.
+ Lexer *TL = Lexer::Create_PragmaLexer(TokLoc, PragmaLoc, RParenLoc,
+ StrVal.size(), *this);
+
+ EnterSourceFileWithLexer(TL, nullptr);
+
+ // With everything set up, lex this as a #pragma directive.
+ HandlePragmaDirective({PIK__Pragma, PragmaLoc});
+
+ // Finally, return whatever came after the pragma directive.
+ return Lex(Tok);
+}
+
+void clang::prepare_PragmaString(SmallVectorImpl<char> &StrVal) {
if (StrVal[0] == 'L' || StrVal[0] == 'U' ||
(StrVal[0] == 'u' && StrVal[1] != '8'))
StrVal.erase(StrVal.begin());
@@ -296,8 +327,8 @@ void Preprocessor::Handle_Pragma(Token &Tok) {
// Remove 'R " d-char-sequence' and 'd-char-sequence "'. We'll replace the
// parens below.
- StrVal.erase(0, 2 + NumDChars);
- StrVal.erase(StrVal.size() - 1 - NumDChars);
+ StrVal.erase(StrVal.begin(), StrVal.begin() + 2 + NumDChars);
+ StrVal.erase(StrVal.end() - 1 - NumDChars, StrVal.end());
} else {
assert(StrVal[0] == '"' && StrVal[StrVal.size()-1] == '"' &&
"Invalid string token!");
@@ -319,27 +350,7 @@ void Preprocessor::Handle_Pragma(Token &Tok) {
StrVal[0] = ' ';
// Replace the terminating quote with a \n.
- StrVal[StrVal.size()-1] = '\n';
-
- // Plop the string (including the newline and trailing null) into a buffer
- // where we can lex it.
- Token TmpTok;
- TmpTok.startToken();
- CreateString(StrVal, TmpTok);
- SourceLocation TokLoc = TmpTok.getLocation();
-
- // Make and enter a lexer object so that we lex and expand the tokens just
- // like any others.
- Lexer *TL = Lexer::Create_PragmaLexer(TokLoc, PragmaLoc, RParenLoc,
- StrVal.size(), *this);
-
- EnterSourceFileWithLexer(TL, nullptr);
-
- // With everything set up, lex this as a #pragma directive.
- HandlePragmaDirective({PIK__Pragma, PragmaLoc});
-
- // Finally, return whatever came after the pragma directive.
- return Lex(Tok);
+ StrVal[StrVal.size() - 1] = '\n';
}
/// HandleMicrosoft__pragma - Like Handle_Pragma except the pragma text
diff --git a/clang/test/ClangScanDeps/_Pragma-once.c b/clang/test/ClangScanDeps/_Pragma-once.c
new file mode 100644
index 0000000000000..573f82c85698f
--- /dev/null
+++ b/clang/test/ClangScanDeps/_Pragma-once.c
@@ -0,0 +1,24 @@
+// Test scanning deps works with _Pragma syntax when not inside a macro.
+
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json
+
+// RUN: clang-scan-deps -compilation-database %t/cdb.json -j 1
+
+//--- cdb.json.template
+[{
+ "directory": "DIR",
+ "command": "clang -fsyntax-only DIR/tu.c",
+ "file": "DIR/tu.c"
+}]
+
+//--- a.h
+_Pragma("once")
+#include "b.h"
+
+//--- b.h
+#include "a.h"
+
+//--- tu.c
+#include "a.h"
diff --git a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
index 2f8804784a2e4..bc4eee73c1c29 100644
--- a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
+++ b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
@@ -503,6 +503,92 @@ TEST(MinimizeSourceToDependencyDirectivesTest, Pragma) {
EXPECT_STREQ("#pragma clang module import\n", Out.data());
}
+TEST(MinimizeSourceToDependencyDirectivesTest, UnderscorePragma) {
+ SmallVector<char, 128> Out;
+
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(R"(_)", Out));
+ EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(R"(_Pragma)", Out));
+ EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(R"(_Pragma()", Out));
+ EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(R"(_Pragma())", Out));
+ EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(R"(_Pragma(")", Out));
+ EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(R"(_Pragma("A"))", Out));
+ EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+ R"x(_Pragma("push_macro(\"MACRO\")"))x", Out));
+ EXPECT_STREQ(R"x(_Pragma("push_macro(\"MACRO\")"))x"
+ "\n",
+ Out.data());
+
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+ R"x(_Pragma("pop_macro(\"MACRO\")"))x", Out));
+ EXPECT_STREQ(R"x(_Pragma("pop_macro(\"MACRO\")"))x"
+ "\n",
+ Out.data());
+
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+ R"x(_Pragma("include_alias(\"A\", \"B\")"))x", Out));
+ EXPECT_STREQ(R"x(_Pragma("include_alias(\"A\", \"B\")"))x"
+ "\n",
+ Out.data());
+
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+ R"x(_Pragma("include_alias(<A>, <B>)"))x", Out));
+ EXPECT_STREQ(R"x(_Pragma("include_alias(<A>, <B>)"))x"
+ "\n",
+ Out.data());
+
+ ASSERT_FALSE(
+ minimizeSourceToDependencyDirectives(R"(_Pragma("clang"))", Out));
+ EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+ ASSERT_FALSE(
+ minimizeSourceToDependencyDirectives(R"(_Pragma("clang module"))", Out));
+ EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+ R"(_Pragma("clang module impor"))", Out));
+ EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+ R"(_Pragma("clang module import"))", Out));
+ EXPECT_STREQ(R"(_Pragma("clang module import"))"
+ "\n",
+ Out.data());
+
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+ R"(_Pragma("clang \
+ module \
+ import"))",
+ Out));
+ EXPECT_STREQ(R"(_Pragma("clang \
+ module \
+ import"))"
+ "\n",
+ Out.data());
+
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+ R"(_Pragma(L"clang module import"))", Out));
+ EXPECT_STREQ(R"(_Pragma(L"clang module import"))"
+ "\n",
+ Out.data());
+
+ // FIXME: u"" strings depend on using C11 language mode
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+ R"(_Pragma(u"clang module import"))", Out));
+ EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+ // FIXME: R"()" strings depend on using C++ 11 language mode
+ ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+ R"(_Pragma(R"abc(clang module import)abc"))", Out));
+ EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+}
+
TEST(MinimizeSourceToDependencyDirectivesTest, Include) {
SmallVector<char, 128> Out;
@@ -757,20 +843,26 @@ TEST(MinimizeSourceToDependencyDirectivesTest, PragmaOnce) {
#pragma once
// another comment
#include <test.h>
+_Pragma("once")
)";
ASSERT_FALSE(
minimizeSourceToDependencyDirectives(Source, Out, Tokens, Directives));
- EXPECT_STREQ("#pragma once\n#include <test.h>\n", Out.data());
- ASSERT_EQ(Directives.size(), 3u);
+ EXPECT_STREQ("#pragma once\n#include <test.h>\n_Pragma(\"once\")\n",
+ Out.data());
+ ASSERT_EQ(Directives.size(), 4u);
EXPECT_EQ(Directives[0].Kind, dependency_directives_scan::pp_pragma_once);
+ EXPECT_EQ(Directives[2].Kind, dependency_directives_scan::pp_pragma_once);
Source = R"(// comment
#pragma once extra tokens
// another comment
#include <test.h>
+ _Pragma("once") extra tokens
)";
ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out));
- EXPECT_STREQ("#pragma once extra tokens\n#include <test.h>\n", Out.data());
+ EXPECT_STREQ("#pragma once extra tokens\n#include "
+ "<test.h>\n_Pragma(\"once\")<TokBeforeEOF>\n",
+ Out.data());
}
TEST(MinimizeSourceToDependencyDirectivesTest,
More information about the cfe-commits
mailing list