[clang] ee8ed0b - [clang][deps] Teach dep directive scanner about _Pragma

Ben Langmuir via cfe-commits cfe-commits at lists.llvm.org
Tue May 9 10:31:09 PDT 2023


Author: Ben Langmuir
Date: 2023-05-09T10:05:12-07:00
New Revision: ee8ed0b3099e63ba0a18cca42b9cfdf098bc6201

URL: https://github.com/llvm/llvm-project/commit/ee8ed0b3099e63ba0a18cca42b9cfdf098bc6201
DIFF: https://github.com/llvm/llvm-project/commit/ee8ed0b3099e63ba0a18cca42b9cfdf098bc6201.diff

LOG: [clang][deps] Teach dep directive scanner about _Pragma

While we cannot handle `_Pragma` used inside macros, we can handle
this at the top level, and some projects use the `_Pragma("once")`
spelling like that, which was causing spurious failures in the scanner.

Limitations
* Cannot handle #define ONCE _Pragma("once"), same issue as using
  @import in a macro -- ideally we should diagnose this in obvious cases
* Our LangOpts are currently fixed, so we are not handling u"" strings
  or R"()" strings that require C11/C++11.

rdar://108629982

Differential Revision: https://reviews.llvm.org/D149884

Added: 
    clang/test/ClangScanDeps/_Pragma-once.c

Modified: 
    clang/include/clang/Lex/Pragma.h
    clang/lib/Lex/DependencyDirectivesScanner.cpp
    clang/lib/Lex/Pragma.cpp
    clang/unittests/Lex/DependencyDirectivesScannerTest.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Lex/Pragma.h b/clang/include/clang/Lex/Pragma.h
index cf8cca5414eac..67eca618f6c4f 100644
--- a/clang/include/clang/Lex/Pragma.h
+++ b/clang/include/clang/Lex/Pragma.h
@@ -123,6 +123,13 @@ class PragmaNamespace : public PragmaHandler {
   PragmaNamespace *getIfNamespace() override { return this; }
 };
 
+/// Destringize a \c _Pragma("") string according to C11 6.10.9.1:
+/// "The string literal is destringized by deleting any encoding prefix,
+/// deleting the leading and trailing double-quotes, replacing each escape
+/// sequence \" by a double-quote, and replacing each escape sequence \\ by a
+/// single backslash."
+void prepare_PragmaString(SmallVectorImpl<char> &StrVal);
+
 } // namespace clang
 
 #endif // LLVM_CLANG_LEX_PRAGMA_H

diff  --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index a506b49176302..2bd2c5f8388c0 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -19,6 +19,7 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Lex/LexDiagnostic.h"
 #include "clang/Lex/Lexer.h"
+#include "clang/Lex/Pragma.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringMap.h"
@@ -72,6 +73,8 @@ struct Scanner {
     // Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.
     LangOpts.ObjC = true;
     LangOpts.LineComment = true;
+    // FIXME: we do not enable C11 or C++11, so we are missing u/u8/U"" and
+    // R"()" literals.
     return LangOpts;
   }
 
@@ -91,6 +94,10 @@ struct Scanner {
   void skipLine(const char *&First, const char *const End);
   void skipDirective(StringRef Name, const char *&First, const char *const End);
 
+  /// Returns the spelling of a string literal or identifier after performing
+  /// any processing needed to handle \c clang::Token::NeedsCleaning.
+  StringRef cleanStringIfNeeded(const dependency_directives_scan::Token &Tok);
+
   /// Lexes next token and if it is identifier returns its string, otherwise
   /// it skips the current line and returns \p std::nullopt.
   ///
@@ -112,6 +119,22 @@ struct Scanner {
                                                 const char *&First,
                                                 const char *const End);
 
+  /// Lexes next token and returns true iff it matches the kind \p K.
+  /// Otherwise it skips the current line and returns false.
+  ///
+  /// In any case (whatever the token kind) \p First and the \p Lexer will
+  /// advance beyond the token.
+  [[nodiscard]] bool isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
+                                           const char *const End);
+
+  /// Lexes next token and if it is string literal, returns its string.
+  /// Otherwise, it skips the current line and returns \p std::nullopt.
+  ///
+  /// In any case (whatever the token kind) \p First and the \p Lexer will
+  /// advance beyond the token.
+  [[nodiscard]] std::optional<StringRef>
+  tryLexStringLiteralOrSkipLine(const char *&First, const char *const End);
+
   [[nodiscard]] bool scanImpl(const char *First, const char *const End);
   [[nodiscard]] bool lexPPLine(const char *&First, const char *const End);
   [[nodiscard]] bool lexAt(const char *&First, const char *const End);
@@ -119,6 +142,7 @@ struct Scanner {
   [[nodiscard]] bool lexDefine(const char *HashLoc, const char *&First,
                                const char *const End);
   [[nodiscard]] bool lexPragma(const char *&First, const char *const End);
+  [[nodiscard]] bool lex_Pragma(const char *&First, const char *const End);
   [[nodiscard]] bool lexEndif(const char *&First, const char *const End);
   [[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First,
                                 const char *const End);
@@ -525,15 +549,8 @@ void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {
   }
 }
 
-[[nodiscard]] std::optional<StringRef>
-Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
-  const dependency_directives_scan::Token &Tok = lexToken(First, End);
-  if (Tok.isNot(tok::raw_identifier)) {
-    if (!Tok.is(tok::eod))
-      skipLine(First, End);
-    return std::nullopt;
-  }
-
+StringRef
+Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
   bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning;
   if (LLVM_LIKELY(!NeedsCleaning))
     return Input.slice(Tok.Offset, Tok.getEnd());
@@ -541,6 +558,9 @@ Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
   SmallString<64> Spelling;
   Spelling.resize(Tok.Length);
 
+  // FIXME: C++11 raw string literals need special handling (see getSpellingSlow
+  // in the Lexer). Currently we cannot see them due to our LangOpts.
+
   unsigned SpellingLength = 0;
   const char *BufPtr = Input.begin() + Tok.Offset;
   const char *AfterIdent = Input.begin() + Tok.getEnd();
@@ -555,6 +575,18 @@ Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
       .first->first();
 }
 
+std::optional<StringRef>
+Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
+  const dependency_directives_scan::Token &Tok = lexToken(First, End);
+  if (Tok.isNot(tok::raw_identifier)) {
+    if (!Tok.is(tok::eod))
+      skipLine(First, End);
+    return std::nullopt;
+  }
+
+  return cleanStringIfNeeded(Tok);
+}
+
 StringRef Scanner::lexIdentifier(const char *&First, const char *const End) {
   std::optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End);
   assert(Id && "expected identifier token");
@@ -572,6 +604,28 @@ bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First,
   return false;
 }
 
+bool Scanner::isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
+                                    const char *const End) {
+  const dependency_directives_scan::Token &Tok = lexToken(First, End);
+  if (Tok.is(K))
+    return true;
+  skipLine(First, End);
+  return false;
+}
+
+std::optional<StringRef>
+Scanner::tryLexStringLiteralOrSkipLine(const char *&First,
+                                       const char *const End) {
+  const dependency_directives_scan::Token &Tok = lexToken(First, End);
+  if (!tok::isStringLiteral(Tok.Kind)) {
+    if (!Tok.is(tok::eod))
+      skipLine(First, End);
+    return std::nullopt;
+  }
+
+  return cleanStringIfNeeded(Tok);
+}
+
 bool Scanner::lexAt(const char *&First, const char *const End) {
   // Handle "@import".
 
@@ -629,6 +683,41 @@ bool Scanner::lexModule(const char *&First, const char *const End) {
   return lexModuleDirectiveBody(Kind, First, End);
 }
 
+bool Scanner::lex_Pragma(const char *&First, const char *const End) {
+  if (!isNextTokenOrSkipLine(tok::l_paren, First, End))
+    return false;
+
+  std::optional<StringRef> Str = tryLexStringLiteralOrSkipLine(First, End);
+
+  if (!Str || !isNextTokenOrSkipLine(tok::r_paren, First, End))
+    return false;
+
+  SmallString<64> Buffer(*Str);
+  prepare_PragmaString(Buffer);
+
+  // Use a new scanner instance since the tokens will be inside the allocated
+  // string. We should already have captured all the relevant tokens in the
+  // current scanner.
+  SmallVector<dependency_directives_scan::Token> DiscardTokens;
+  const char *Begin = Buffer.c_str();
+  Scanner PragmaScanner{StringRef(Begin, Buffer.size()), DiscardTokens, Diags,
+                        InputSourceLoc};
+
+  PragmaScanner.TheLexer.setParsingPreprocessorDirective(true);
+  if (PragmaScanner.lexPragma(Begin, Buffer.end()))
+    return true;
+
+  DirectiveKind K = PragmaScanner.topDirective();
+  if (K == pp_none) {
+    skipLine(First, End);
+    return false;
+  }
+
+  assert(Begin == Buffer.end());
+  pushDirective(K);
+  return false;
+}
+
 bool Scanner::lexPragma(const char *&First, const char *const End) {
   std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
   if (!FoundId)
@@ -713,6 +802,7 @@ static bool isStartOfRelevantLine(char First) {
   case 'i':
   case 'e':
   case 'm':
+  case '_':
     return true;
   }
   return false;
@@ -749,6 +839,12 @@ bool Scanner::lexPPLine(const char *&First, const char *const End) {
   if (*First == 'i' || *First == 'e' || *First == 'm')
     return lexModule(First, End);
 
+  if (*First == '_') {
+    if (isNextIdentifierOrSkipLine("_Pragma", First, End))
+      return lex_Pragma(First, End);
+    return false;
+  }
+
   // Handle preprocessing directives.
 
   TheLexer.setParsingPreprocessorDirective(true);

diff  --git a/clang/lib/Lex/Pragma.cpp b/clang/lib/Lex/Pragma.cpp
index db8a5891679f4..0b892a3755a50 100644
--- a/clang/lib/Lex/Pragma.cpp
+++ b/clang/lib/Lex/Pragma.cpp
@@ -262,17 +262,48 @@ void Preprocessor::Handle_Pragma(Token &Tok) {
 
   SourceLocation RParenLoc = Tok.getLocation();
   bool Invalid = false;
-  std::string StrVal = getSpelling(StrTok, &Invalid);
+  SmallString<64> StrVal;
+  StrVal.resize(StrTok.getLength());
+  StringRef StrValRef = getSpelling(StrTok, StrVal, &Invalid);
   if (Invalid) {
     Diag(PragmaLoc, diag::err__Pragma_malformed);
     return;
   }
 
-  // The _Pragma is lexically sound.  Destringize according to C11 6.10.9.1:
-  // "The string literal is destringized by deleting any encoding prefix,
-  // deleting the leading and trailing double-quotes, replacing each escape
-  // sequence \" by a double-quote, and replacing each escape sequence \\ by a
-  // single backslash."
+  assert(StrValRef.size() <= StrVal.size());
+
+  // If the token was spelled somewhere else, copy it.
+  if (StrValRef.begin() != StrVal.begin())
+    StrVal.assign(StrValRef);
+  // Truncate if necessary.
+  else if (StrValRef.size() != StrVal.size())
+    StrVal.resize(StrValRef.size());
+
+  // The _Pragma is lexically sound.  Destringize according to C11 6.10.9.1.
+  prepare_PragmaString(StrVal);
+
+  // Plop the string (including the newline and trailing null) into a buffer
+  // where we can lex it.
+  Token TmpTok;
+  TmpTok.startToken();
+  CreateString(StrVal, TmpTok);
+  SourceLocation TokLoc = TmpTok.getLocation();
+
+  // Make and enter a lexer object so that we lex and expand the tokens just
+  // like any others.
+  Lexer *TL = Lexer::Create_PragmaLexer(TokLoc, PragmaLoc, RParenLoc,
+                                        StrVal.size(), *this);
+
+  EnterSourceFileWithLexer(TL, nullptr);
+
+  // With everything set up, lex this as a #pragma directive.
+  HandlePragmaDirective({PIK__Pragma, PragmaLoc});
+
+  // Finally, return whatever came after the pragma directive.
+  return Lex(Tok);
+}
+
+void clang::prepare_PragmaString(SmallVectorImpl<char> &StrVal) {
   if (StrVal[0] == 'L' || StrVal[0] == 'U' ||
       (StrVal[0] == 'u' && StrVal[1] != '8'))
     StrVal.erase(StrVal.begin());
@@ -296,8 +327,8 @@ void Preprocessor::Handle_Pragma(Token &Tok) {
 
     // Remove 'R " d-char-sequence' and 'd-char-sequence "'. We'll replace the
     // parens below.
-    StrVal.erase(0, 2 + NumDChars);
-    StrVal.erase(StrVal.size() - 1 - NumDChars);
+    StrVal.erase(StrVal.begin(), StrVal.begin() + 2 + NumDChars);
+    StrVal.erase(StrVal.end() - 1 - NumDChars, StrVal.end());
   } else {
     assert(StrVal[0] == '"' && StrVal[StrVal.size()-1] == '"' &&
            "Invalid string token!");
@@ -319,27 +350,7 @@ void Preprocessor::Handle_Pragma(Token &Tok) {
   StrVal[0] = ' ';
 
   // Replace the terminating quote with a \n.
-  StrVal[StrVal.size()-1] = '\n';
-
-  // Plop the string (including the newline and trailing null) into a buffer
-  // where we can lex it.
-  Token TmpTok;
-  TmpTok.startToken();
-  CreateString(StrVal, TmpTok);
-  SourceLocation TokLoc = TmpTok.getLocation();
-
-  // Make and enter a lexer object so that we lex and expand the tokens just
-  // like any others.
-  Lexer *TL = Lexer::Create_PragmaLexer(TokLoc, PragmaLoc, RParenLoc,
-                                        StrVal.size(), *this);
-
-  EnterSourceFileWithLexer(TL, nullptr);
-
-  // With everything set up, lex this as a #pragma directive.
-  HandlePragmaDirective({PIK__Pragma, PragmaLoc});
-
-  // Finally, return whatever came after the pragma directive.
-  return Lex(Tok);
+  StrVal[StrVal.size() - 1] = '\n';
 }
 
 /// HandleMicrosoft__pragma - Like Handle_Pragma except the pragma text

diff  --git a/clang/test/ClangScanDeps/_Pragma-once.c b/clang/test/ClangScanDeps/_Pragma-once.c
new file mode 100644
index 0000000000000..573f82c85698f
--- /dev/null
+++ b/clang/test/ClangScanDeps/_Pragma-once.c
@@ -0,0 +1,24 @@
+// Test scanning deps works with _Pragma syntax when not inside a macro.
+
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json
+
+// RUN: clang-scan-deps -compilation-database %t/cdb.json -j 1
+
+//--- cdb.json.template
+[{
+  "directory": "DIR",
+  "command": "clang -fsyntax-only DIR/tu.c",
+  "file": "DIR/tu.c"
+}]
+
+//--- a.h
+_Pragma("once")
+#include "b.h"
+
+//--- b.h
+#include "a.h"
+
+//--- tu.c
+#include "a.h"

diff  --git a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
index 2f8804784a2e4..bc4eee73c1c29 100644
--- a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
+++ b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
@@ -503,6 +503,92 @@ TEST(MinimizeSourceToDependencyDirectivesTest, Pragma) {
   EXPECT_STREQ("#pragma clang module import\n", Out.data());
 }
 
+TEST(MinimizeSourceToDependencyDirectivesTest, UnderscorePragma) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(R"(_)", Out));
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(R"(_Pragma)", Out));
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(R"(_Pragma()", Out));
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(R"(_Pragma())", Out));
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(R"(_Pragma(")", Out));
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(R"(_Pragma("A"))", Out));
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+      R"x(_Pragma("push_macro(\"MACRO\")"))x", Out));
+  EXPECT_STREQ(R"x(_Pragma("push_macro(\"MACRO\")"))x"
+               "\n",
+               Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+      R"x(_Pragma("pop_macro(\"MACRO\")"))x", Out));
+  EXPECT_STREQ(R"x(_Pragma("pop_macro(\"MACRO\")"))x"
+               "\n",
+               Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+      R"x(_Pragma("include_alias(\"A\", \"B\")"))x", Out));
+  EXPECT_STREQ(R"x(_Pragma("include_alias(\"A\", \"B\")"))x"
+               "\n",
+               Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+      R"x(_Pragma("include_alias(<A>, <B>)"))x", Out));
+  EXPECT_STREQ(R"x(_Pragma("include_alias(<A>, <B>)"))x"
+               "\n",
+               Out.data());
+
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives(R"(_Pragma("clang"))", Out));
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives(R"(_Pragma("clang module"))", Out));
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+      R"(_Pragma("clang module impor"))", Out));
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+      R"(_Pragma("clang module import"))", Out));
+  EXPECT_STREQ(R"(_Pragma("clang module import"))"
+               "\n",
+               Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+      R"(_Pragma("clang \
+  module \
+  import"))",
+      Out));
+  EXPECT_STREQ(R"(_Pragma("clang \
+  module \
+  import"))"
+               "\n",
+               Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+      R"(_Pragma(L"clang module import"))", Out));
+  EXPECT_STREQ(R"(_Pragma(L"clang module import"))"
+               "\n",
+               Out.data());
+
+  // FIXME: u"" strings depend on using C11 language mode
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+      R"(_Pragma(u"clang module import"))", Out));
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+
+  // FIXME: R"()" strings depend on using C++ 11 language mode
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+      R"(_Pragma(R"abc(clang module import)abc"))", Out));
+  EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
+}
+
 TEST(MinimizeSourceToDependencyDirectivesTest, Include) {
   SmallVector<char, 128> Out;
 
@@ -757,20 +843,26 @@ TEST(MinimizeSourceToDependencyDirectivesTest, PragmaOnce) {
 #pragma once
 // another comment
 #include <test.h>
+_Pragma("once")
 )";
   ASSERT_FALSE(
       minimizeSourceToDependencyDirectives(Source, Out, Tokens, Directives));
-  EXPECT_STREQ("#pragma once\n#include <test.h>\n", Out.data());
-  ASSERT_EQ(Directives.size(), 3u);
+  EXPECT_STREQ("#pragma once\n#include <test.h>\n_Pragma(\"once\")\n",
+               Out.data());
+  ASSERT_EQ(Directives.size(), 4u);
   EXPECT_EQ(Directives[0].Kind, dependency_directives_scan::pp_pragma_once);
+  EXPECT_EQ(Directives[2].Kind, dependency_directives_scan::pp_pragma_once);
 
   Source = R"(// comment
     #pragma once extra tokens
     // another comment
     #include <test.h>
+    _Pragma("once") extra tokens
     )";
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out));
-  EXPECT_STREQ("#pragma once extra tokens\n#include <test.h>\n", Out.data());
+  EXPECT_STREQ("#pragma once extra tokens\n#include "
+               "<test.h>\n_Pragma(\"once\")<TokBeforeEOF>\n",
+               Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest,


        


More information about the cfe-commits mailing list