[PATCH] D125049: [pseudo] Only expand UCNs for raw_identifiers

Thu May 5 23:54:19 PDT 2022

This revision was automatically updated to reflect the committed changes.
Closed by commit rG232cc446ff7b: [pseudo] Only expand UCNs for raw_identifiers (authored by sammccall).

Changed prior to commit:
  https://reviews.llvm.org/D125049?vs=427490&id=427535#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D125049/new/

https://reviews.llvm.org/D125049

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/Token.h
  clang-tools-extra/pseudo/lib/Lex.cpp
  clang-tools-extra/pseudo/test/crash/backslashes.c
  clang-tools-extra/pseudo/tool/ClangPseudo.cpp


Index: clang-tools-extra/pseudo/tool/ClangPseudo.cpp
===================================================================

--- clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Signals.h"
 
 using clang::pseudo::Grammar;
 using llvm::cl::desc;
@@ -52,6 +53,7 @@
 
 int main(int argc, char *argv[]) {
   llvm::cl::ParseCommandLineOptions(argc, argv, "");
+  llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);
 
   clang::LangOptions LangOpts = clang::pseudo::genericLangOpts();
   std::string SourceText;
Index: clang-tools-extra/pseudo/test/crash/backslashes.c
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/test/crash/backslashes.c
@@ -0,0 +1,4 @@
+// We used to try to interpret these backslashes as UCNs.
+// RUN: clang-pseudo -source=%s -print-tokens
+\
+\ x
Index: clang-tools-extra/pseudo/lib/Lex.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/Lex.cpp
+++ clang-tools-extra/pseudo/lib/Lex.cpp
@@ -90,12 +90,23 @@
         assert(CharSize != 0 && "no progress!");
         Pos += CharSize;
       }
-      // Remove universal character names (UCN).
+      llvm::StringRef Text = CleanBuffer;
       llvm::SmallString<64> UCNBuffer;
-      clang::expandUCNs(UCNBuffer, CleanBuffer);
+      // A surface reading of the standard suggests UCNs might appear anywhere.
+      // But we need only decode them in raw_identifiers.
+      //  - they cannot appear in punctuation/keyword tokens, because UCNs
+      //    cannot encode basic characters outside of literals [lex.charset]
+      //  - they can appear in literals, but we need not unescape them now.
+      //    We treat them as escape sequences when evaluating the literal.
+      //  - comments are handled similarly to literals
+      // This is good fortune, because expandUCNs requires its input to be a
+      // reasonably valid identifier (e.g. without stray backslashes).
+      if (Tok.Kind == tok::raw_identifier) {
+        clang::expandUCNs(UCNBuffer, CleanBuffer);
+        Text = UCNBuffer;
+      }
 
-      llvm::StringRef Text = llvm::StringRef(UCNBuffer).copy(*CleanedStorage);
-      Tok.Data = Text.data();
+      Tok.Data = Text.copy(*CleanedStorage).data();
       Tok.Length = Text.size();
       Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
     }
Index: clang-tools-extra/pseudo/include/clang-pseudo/Token.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/Token.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/Token.h
@@ -199,12 +199,15 @@
     clang::Language = clang::Language::CXX,
     clang::LangStandard::Kind = clang::LangStandard::lang_unspecified);
 
-/// Derives a token stream by decoding escapes, interpreting raw_identifiers and
-/// splitting the greatergreater token.
+/// Decodes raw tokens written in the source code, returning a derived stream.
 ///
-/// Tokens containing UCNs, escaped newlines, trigraphs etc are decoded and
-/// their backing data is owned by the returned stream.
-/// raw_identifier tokens are assigned specific types (identifier, keyword etc).
+/// - escaped newlines within tokens are removed
+/// - trigraphs are replaced with the characters they encode
+/// - UCNs within raw_identifiers are replaced by the characters they encode
+///   (UCNs within strings, comments etc are not translated)
+/// - raw_identifier tokens get their specific type (identifier, keyword etc)
+/// - the >> token is split into separate > > tokens
+///   (we use a modified grammar where >> is a nonterminal, not a token)
 ///
 /// The StartsPPLine flag is preserved.
 ///


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D125049.427535.patch
Type: text/x-patch
Size: 3900 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20220506/d9dec1b7/attachment-0001.bin>