[PATCH] D125049: [pseudo] Only expand UCNs for raw_identifiers
Sam McCall via Phabricator via cfe-commits
cfe-commits at lists.llvm.org
Thu May 5 23:54:19 PDT 2022
This revision was automatically updated to reflect the committed changes.
Closed by commit rG232cc446ff7b: [pseudo] Only expand UCNs for raw_identifiers (authored by sammccall).
Changed prior to commit:
https://reviews.llvm.org/D125049?vs=427490&id=427535#toc
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D125049/new/
https://reviews.llvm.org/D125049
Files:
clang-tools-extra/pseudo/include/clang-pseudo/Token.h
clang-tools-extra/pseudo/lib/Lex.cpp
clang-tools-extra/pseudo/test/crash/backslashes.c
clang-tools-extra/pseudo/tool/ClangPseudo.cpp
Index: clang-tools-extra/pseudo/tool/ClangPseudo.cpp
===================================================================
--- clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -17,6 +17,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Signals.h"
using clang::pseudo::Grammar;
using llvm::cl::desc;
@@ -52,6 +53,7 @@
int main(int argc, char *argv[]) {
llvm::cl::ParseCommandLineOptions(argc, argv, "");
+ llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);
clang::LangOptions LangOpts = clang::pseudo::genericLangOpts();
std::string SourceText;
Index: clang-tools-extra/pseudo/test/crash/backslashes.c
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/test/crash/backslashes.c
@@ -0,0 +1,4 @@
+// We used to try to interpret these backslashes as UCNs.
+// RUN: clang-pseudo -source=%s -print-tokens
+\
+\ x
Index: clang-tools-extra/pseudo/lib/Lex.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/Lex.cpp
+++ clang-tools-extra/pseudo/lib/Lex.cpp
@@ -90,12 +90,23 @@
assert(CharSize != 0 && "no progress!");
Pos += CharSize;
}
- // Remove universal character names (UCN).
+ llvm::StringRef Text = CleanBuffer;
llvm::SmallString<64> UCNBuffer;
- clang::expandUCNs(UCNBuffer, CleanBuffer);
+ // A surface reading of the standard suggests UCNs might appear anywhere.
+ // But we need only decode them in raw_identifiers.
+ // - they cannot appear in punctuation/keyword tokens, because UCNs
+ // cannot encode basic characters outside of literals [lex.charset]
+ // - they can appear in literals, but we need not unescape them now.
+ // We treat them as escape sequences when evaluating the literal.
+ // - comments are handled similarly to literals
+ // This is good fortune, because expandUCNs requires its input to be a
+ // reasonably valid identifier (e.g. without stray backslashes).
+ if (Tok.Kind == tok::raw_identifier) {
+ clang::expandUCNs(UCNBuffer, CleanBuffer);
+ Text = UCNBuffer;
+ }
- llvm::StringRef Text = llvm::StringRef(UCNBuffer).copy(*CleanedStorage);
- Tok.Data = Text.data();
+ Tok.Data = Text.copy(*CleanedStorage).data();
Tok.Length = Text.size();
Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
}
Index: clang-tools-extra/pseudo/include/clang-pseudo/Token.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/Token.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/Token.h
@@ -199,12 +199,15 @@
clang::Language = clang::Language::CXX,
clang::LangStandard::Kind = clang::LangStandard::lang_unspecified);
-/// Derives a token stream by decoding escapes, interpreting raw_identifiers and
-/// splitting the greatergreater token.
+/// Decodes raw tokens written in the source code, returning a derived stream.
///
-/// Tokens containing UCNs, escaped newlines, trigraphs etc are decoded and
-/// their backing data is owned by the returned stream.
-/// raw_identifier tokens are assigned specific types (identifier, keyword etc).
+/// - escaped newlines within tokens are removed
+/// - trigraphs are replaced with the characters they encode
+/// - UCNs within raw_identifiers are replaced by the characters they encode
+/// (UCNs within strings, comments etc are not translated)
+/// - raw_identifier tokens are assigned their correct keyword type
+/// - the >> token is split into separate > > tokens
+/// (we use a modified grammar where >> is a nonterminal, not a token)
///
/// The StartsPPLine flag is preserved.
///
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D125049.427535.patch
Type: text/x-patch
Size: 3900 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20220506/d9dec1b7/attachment-0001.bin>
More information about the cfe-commits mailing list