r319904 - Stringizing raw string literals containing newline
Taewook Oh via cfe-commits
cfe-commits at lists.llvm.org
Wed Dec 6 09:00:53 PST 2017
Author: twoh
Date: Wed Dec 6 09:00:53 2017
New Revision: 319904
URL: http://llvm.org/viewvc/llvm-project?rev=319904&view=rev
Log:
Stringizing raw string literals containing newline
Summary: This patch implements section 4.3 of http://open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4220.pdf: if a raw string literal contains newline characters, each newline is replaced with the \n escape sequence when the literal is stringized. Without this patch, the included test case (macro_raw_string.cpp) fails to compile.
Reviewers: rsmith, doug.gregor, jkorous-apple
Reviewed By: jkorous-apple
Subscribers: jkorous-apple, vsapsai, cfe-commits
Differential Revision: https://reviews.llvm.org/D39279
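To make the effect concrete, here is a minimal sketch of the behavior described in the summary; the identifiers STR, raw, and quoted are illustrative and do not come from the patch, and the commented spelling is approximate rather than copied from clang -E output.

#define STR(x) #x   // the '#' operator stringizes its macro argument

// A raw string literal is a single token and may span physical lines:
const char *raw = R"(hello
world)";

// Stringizing it previously emitted the embedded newline verbatim, which left
// an invalid ordinary string literal in the preprocessed output. With this
// change each newline is replaced by the two-character escape \n:
const char *quoted = STR(R"(hello
world)");
// quoted now holds the spelling "R\"(hello\nworld)\"" (illustrative).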
Added:
cfe/trunk/test/Preprocessor/macro_raw_string.cpp
Modified:
cfe/trunk/include/clang/Lex/Lexer.h
cfe/trunk/lib/Lex/Lexer.cpp
cfe/trunk/unittests/Lex/LexerTest.cpp
Modified: cfe/trunk/include/clang/Lex/Lexer.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/Lexer.h?rev=319904&r1=319903&r2=319904&view=diff
==============================================================================
--- cfe/trunk/include/clang/Lex/Lexer.h (original)
+++ cfe/trunk/include/clang/Lex/Lexer.h Wed Dec 6 09:00:53 2017
@@ -70,7 +70,7 @@ class Lexer : public PreprocessorLexer {
SourceLocation FileLoc; // Location for start of file.
LangOptions LangOpts; // LangOpts enabled by this language (cache).
bool Is_PragmaLexer; // True if lexer for _Pragma handling.
-
+
//===--------------------------------------------------------------------===//
// Context-specific lexing flags set by the preprocessor.
//
@@ -241,17 +241,16 @@ public:
/// \brief Return the current location in the buffer.
const char *getBufferLocation() const { return BufferPtr; }
-
- /// Stringify - Convert the specified string into a C string by escaping '\'
- /// and " characters. This does not add surrounding ""'s to the string.
+
+ /// Stringify - Convert the specified string into a C string by i) escaping
+ /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
/// If Charify is true, this escapes the ' character instead of ".
static std::string Stringify(StringRef Str, bool Charify = false);
- /// Stringify - Convert the specified string into a C string by escaping '\'
- /// and " characters. This does not add surrounding ""'s to the string.
+ /// Stringify - Convert the specified string into a C string by i) escaping
+ /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
static void Stringify(SmallVectorImpl<char> &Str);
-
/// getSpelling - This method is used to get the spelling of a token into a
/// preallocated buffer, instead of as an std::string. The caller is required
/// to allocate enough space for the token, which is guaranteed to be at least
@@ -262,11 +261,11 @@ public:
/// to point to a constant buffer with the data already in it (avoiding a
/// copy). The caller is not allowed to modify the returned buffer pointer
/// if an internal buffer is returned.
- static unsigned getSpelling(const Token &Tok, const char *&Buffer,
+ static unsigned getSpelling(const Token &Tok, const char *&Buffer,
const SourceManager &SourceMgr,
const LangOptions &LangOpts,
bool *Invalid = nullptr);
-
+
/// getSpelling() - Return the 'spelling' of the Tok token. The spelling of a
/// token is the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding. In particular, this
@@ -274,7 +273,7 @@ public:
/// UCNs, etc.
static std::string getSpelling(const Token &Tok,
const SourceManager &SourceMgr,
- const LangOptions &LangOpts,
+ const LangOptions &LangOpts,
bool *Invalid = nullptr);
/// getSpelling - This method is used to get the spelling of the
@@ -290,7 +289,7 @@ public:
const SourceManager &SourceMgr,
const LangOptions &LangOpts,
bool *invalid = nullptr);
-
+
/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file. If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
@@ -312,7 +311,7 @@ public:
static SourceLocation GetBeginningOfToken(SourceLocation Loc,
const SourceManager &SM,
const LangOptions &LangOpts);
-
+
/// AdvanceToTokenCharacter - If the current SourceLocation specifies a
/// location at the start of a token, return a new location that specifies a
/// character within the token. This handles trigraphs and escaped newlines.
@@ -320,7 +319,7 @@ public:
unsigned Character,
const SourceManager &SM,
const LangOptions &LangOpts);
-
+
/// \brief Computes the source location just past the end of the
/// token at this source location.
///
@@ -667,7 +666,7 @@ private:
bool SkipBlockComment (Token &Result, const char *CurPtr,
bool &TokAtPhysicalStartOfLine);
bool SaveLineComment (Token &Result, const char *CurPtr);
-
+
bool IsStartOfConflictMarker(const char *CurPtr);
bool HandleEndOfConflictMarker(const char *CurPtr);
Modified: cfe/trunk/lib/Lex/Lexer.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Lexer.cpp?rev=319904&r1=319903&r2=319904&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/Lexer.cpp (original)
+++ cfe/trunk/lib/Lex/Lexer.cpp Wed Dec 6 09:00:53 2017
@@ -209,30 +209,39 @@ Lexer *Lexer::Create_PragmaLexer(SourceL
return L;
}
-/// Stringify - Convert the specified string into a C string, with surrounding
-/// ""'s, and with escaped \ and " characters.
+template <typename T> void StringifyImpl(T &Str, char Quote) {
+ typename T::size_type i = 0, e = Str.size();
+ while (i < e) {
+ if (Str[i] == '\\' || Str[i] == Quote) {
+ Str.insert(Str.begin() + i, '\\');
+ i += 2;
+ ++e;
+ } else if (Str[i] == '\n' || Str[i] == '\r') {
+ // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
+ if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
+ Str[i] != Str[i + 1]) {
+ Str[i] = '\\';
+ Str[i + 1] = 'n';
+ } else {
+ // Replace '\n' and '\r' to '\\' followed by 'n'.
+ Str[i] = '\\';
+ Str.insert(Str.begin() + i + 1, 'n');
+ ++e;
+ }
+ i += 2;
+ } else
+ ++i;
+ }
+}
+
std::string Lexer::Stringify(StringRef Str, bool Charify) {
std::string Result = Str;
char Quote = Charify ? '\'' : '"';
- for (unsigned i = 0, e = Result.size(); i != e; ++i) {
- if (Result[i] == '\\' || Result[i] == Quote) {
- Result.insert(Result.begin()+i, '\\');
- ++i; ++e;
- }
- }
+ StringifyImpl(Result, Quote);
return Result;
}
-/// Stringify - Convert the specified string into a C string by escaping '\'
-/// and " characters. This does not add surrounding ""'s to the string.
-void Lexer::Stringify(SmallVectorImpl<char> &Str) {
- for (unsigned i = 0, e = Str.size(); i != e; ++i) {
- if (Str[i] == '\\' || Str[i] == '"') {
- Str.insert(Str.begin()+i, '\\');
- ++i; ++e;
- }
- }
-}
+void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
//===----------------------------------------------------------------------===//
// Token Spelling
@@ -367,7 +376,7 @@ std::string Lexer::getSpelling(const Tok
/// to point to a constant buffer with the data already in it (avoiding a
/// copy). The caller is not allowed to modify the returned buffer pointer
/// if an internal buffer is returned.
-unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
+unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
const SourceManager &SourceMgr,
const LangOptions &LangOpts, bool *Invalid) {
assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
@@ -592,17 +601,17 @@ PreambleBounds Lexer::ComputePreamble(St
if (TheTok.getKind() == tok::eof) {
break;
}
-
+
// If we haven't hit the end of the preprocessor directive, skip this
// token.
if (!TheTok.isAtStartOfLine())
continue;
-
+
// We've passed the end of the preprocessor directive, and will look
// at this token again below.
InPreprocessorDirective = false;
}
-
+
// Keep track of the # of lines in the preamble.
if (TheTok.isAtStartOfLine()) {
unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
@@ -619,13 +628,13 @@ PreambleBounds Lexer::ComputePreamble(St
ActiveCommentLoc = TheTok.getLocation();
continue;
}
-
+
if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
- // This is the start of a preprocessor directive.
+ // This is the start of a preprocessor directive.
Token HashTok = TheTok;
InPreprocessorDirective = true;
ActiveCommentLoc = SourceLocation();
-
+
// Figure out which directive this is. Since we're lexing raw tokens,
// we don't have an identifier table available. Instead, just look at
// the raw identifier to recognize and categorize preprocessor directives.
@@ -665,7 +674,7 @@ PreambleBounds Lexer::ComputePreamble(St
break;
}
}
-
+
// We only end up here if we didn't recognize the preprocessor
// directive or it was one that can't occur in the preamble at this
// point. Roll back the current token to the location of the '#'.
@@ -678,7 +687,7 @@ PreambleBounds Lexer::ComputePreamble(St
// the preamble.
break;
} while (true);
-
+
SourceLocation End;
if (ActiveCommentLoc.isValid())
End = ActiveCommentLoc; // don't truncate a decl comment.
@@ -700,13 +709,13 @@ SourceLocation Lexer::AdvanceToTokenChar
// trigraphs.
bool Invalid = false;
const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
-
+
// If they request the first char of the token, we're trivially done.
if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
return TokStart;
-
+
unsigned PhysOffset = 0;
-
+
// The usual case is that tokens don't contain anything interesting. Skip
// over the uninteresting characters. If a token only consists of simple
// chars, this method is extremely fast.
@@ -717,7 +726,7 @@ SourceLocation Lexer::AdvanceToTokenChar
--CharNo;
++PhysOffset;
}
-
+
// If we have a character that may be a trigraph or escaped newline, use a
// lexer to parse it correctly.
for (; CharNo; --CharNo) {
@@ -726,14 +735,14 @@ SourceLocation Lexer::AdvanceToTokenChar
TokPtr += Size;
PhysOffset += Size;
}
-
+
// Final detail: if we end up on an escaped newline, we want to return the
// location of the actual byte of the token. For example foo\<newline>bar
// advanced by 3 should return the location of b, not of \\. One compounding
// detail of this is that the escape may be made by a trigraph.
if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
-
+
return TokStart.getLocWithOffset(PhysOffset);
}
@@ -768,7 +777,7 @@ SourceLocation Lexer::getLocForEndOfToke
Len = Len - Offset;
else
return Loc;
-
+
return Loc.getLocWithOffset(Len);
}
@@ -965,7 +974,7 @@ StringRef Lexer::getImmediateMacroName(S
// For macro arguments we need to check that the argument did not come
// from an inner macro, e.g: "MAC1( MAC2(foo) )"
-
+
// Loc points to the argument id of the macro definition, move to the
// macro expansion.
Loc = SM.getImmediateExpansionRange(Loc).first;
@@ -1795,7 +1804,7 @@ bool Lexer::LexStringLiteral(Token &Resu
// getAndAdvanceChar.
if (C == '\\')
C = getAndAdvanceChar(CurPtr, Result);
-
+
if (C == '\n' || C == '\r' || // Newline.
(C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
@@ -1803,7 +1812,7 @@ bool Lexer::LexStringLiteral(Token &Resu
FormTokenWithChars(Result, CurPtr-1, tok::unknown);
return true;
}
-
+
if (C == 0) {
if (isCodeCompletionPoint(CurPtr-1)) {
PP->CodeCompleteNaturalLanguage();
@@ -2232,7 +2241,7 @@ bool Lexer::SaveLineComment(Token &Resul
std::string Spelling = PP->getSpelling(Result, &Invalid);
if (Invalid)
return true;
-
+
assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
Spelling[1] = '*'; // Change prefix to "/*".
Spelling += "*/"; // add suffix.
@@ -2558,7 +2567,7 @@ bool Lexer::LexEndOfFile(Token &Result,
resetExtendedTokenMode();
return true; // Have a token.
}
-
+
// If we are in raw mode, return this event as an EOF token. Let the caller
// that put us in raw mode handle the event.
if (isLexingRawMode()) {
@@ -2567,7 +2576,7 @@ bool Lexer::LexEndOfFile(Token &Result,
FormTokenWithChars(Result, BufferEnd, tok::eof);
return true;
}
-
+
if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
PP->setRecordedPreambleConditionalStack(ConditionalStack);
ConditionalStack.clear();
@@ -2679,7 +2688,7 @@ bool Lexer::IsStartOfConflictMarker(cons
if (CurPtr != BufferStart &&
CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
return false;
-
+
// Check to see if we have <<<<<<< or >>>>.
if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
!StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
@@ -2689,7 +2698,7 @@ bool Lexer::IsStartOfConflictMarker(cons
// it.
if (CurrentConflictMarkerState || isLexingRawMode())
return false;
-
+
ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
// Check to see if there is an ending marker somewhere in the buffer at the
@@ -2699,7 +2708,7 @@ bool Lexer::IsStartOfConflictMarker(cons
// Diagnose this, and ignore to the end of line.
Diag(CurPtr, diag::err_conflict_marker);
CurrentConflictMarkerState = Kind;
-
+
// Skip ahead to the end of line. We know this exists because the
// end-of-conflict marker starts with \r or \n.
while (*CurPtr != '\r' && *CurPtr != '\n') {
@@ -2709,7 +2718,7 @@ bool Lexer::IsStartOfConflictMarker(cons
BufferPtr = CurPtr;
return true;
}
-
+
// No end of conflict marker found.
return false;
}
@@ -2723,35 +2732,35 @@ bool Lexer::HandleEndOfConflictMarker(co
if (CurPtr != BufferStart &&
CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
return false;
-
+
// If we have a situation where we don't care about conflict markers, ignore
// it.
if (!CurrentConflictMarkerState || isLexingRawMode())
return false;
-
+
// Check to see if we have the marker (4 characters in a row).
for (unsigned i = 1; i != 4; ++i)
if (CurPtr[i] != CurPtr[0])
return false;
-
+
// If we do have it, search for the end of the conflict marker. This could
// fail if it got skipped with a '#if 0' or something. Note that CurPtr might
// be the end of conflict marker.
if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
CurrentConflictMarkerState)) {
CurPtr = End;
-
+
// Skip ahead to the end of line.
while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
++CurPtr;
-
+
BufferPtr = CurPtr;
-
+
// No longer in the conflict marker.
CurrentConflictMarkerState = CMK_None;
return true;
}
-
+
return false;
}
@@ -3060,7 +3069,7 @@ LexNextToken:
// We know the lexer hasn't changed, so just try again with this lexer.
// (We manually eliminate the tail call to avoid recursion.)
goto LexNextToken;
-
+
case 26: // DOS & CP/M EOF: "^Z".
// If we're in Microsoft extensions mode, treat this as end of file.
if (LangOpts.MicrosoftExt) {
@@ -3072,7 +3081,7 @@ LexNextToken:
// If Microsoft extensions are disabled, this is just random garbage.
Kind = tok::unknown;
break;
-
+
case '\r':
if (CurPtr[0] == '\n')
Char = getAndAdvanceChar(CurPtr, Result);
@@ -3135,7 +3144,7 @@ LexNextToken:
// We only saw whitespace, so just try again with this lexer.
// (We manually eliminate the tail call to avoid recursion.)
goto LexNextToken;
-
+
// C99 6.4.4.1: Integer Constants.
// C99 6.4.4.2: Floating Constants.
case '0': case '1': case '2': case '3': case '4':
@@ -3652,7 +3661,7 @@ LexNextToken:
// If this is '====' and we're in a conflict marker, ignore it.
if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
goto LexNextToken;
-
+
Kind = tok::equalequal;
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
} else {
@@ -3739,7 +3748,7 @@ LexNextToken:
}
return LexUnicode(Result, CodePoint, CurPtr);
}
-
+
if (isLexingRawMode() || ParsingPreprocessorDirective ||
PP->isPreprocessedOutput()) {
++CurPtr;
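The heart of the change is the StringifyImpl template added above. The following self-contained sketch applies the same escaping rules; escapeForStringize is an assumed standalone helper name, not the in-tree API, and it exists only to show how newline pairs are folded.

#include <cassert>
#include <string>

// Escape backslashes and the quote character, and map newlines to the
// two-character sequence \n, mirroring the rules of StringifyImpl above.
static std::string escapeForStringize(std::string S, char Quote = '"') {
  std::string::size_type i = 0, e = S.size();
  while (i < e) {
    if (S[i] == '\\' || S[i] == Quote) {
      S.insert(S.begin() + i, '\\'); // prefix with a backslash
      i += 2;
      ++e;
    } else if (S[i] == '\n' || S[i] == '\r') {
      // A CR+LF or LF+CR pair collapses into a single \n escape in place...
      if (i + 1 < e && (S[i + 1] == '\n' || S[i + 1] == '\r') &&
          S[i] != S[i + 1]) {
        S[i] = '\\';
        S[i + 1] = 'n';
      } else {
        // ...while a lone '\n' or '\r' (and each LF of "\n\n") becomes one \n.
        S[i] = '\\';
        S.insert(S.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else {
      ++i;
    }
  }
  return S;
}

int main() {
  assert(escapeForStringize("a\r\nb") == "a\\nb");    // one pair -> one \n
  assert(escapeForStringize("a\n\nb") == "a\\n\\nb"); // two LFs  -> two \n
  assert(escapeForStringize("say \"hi\"") == "say \\\"hi\\\"");
  return 0;
}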
Added: cfe/trunk/test/Preprocessor/macro_raw_string.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Preprocessor/macro_raw_string.cpp?rev=319904&view=auto
==============================================================================
--- cfe/trunk/test/Preprocessor/macro_raw_string.cpp (added)
+++ cfe/trunk/test/Preprocessor/macro_raw_string.cpp Wed Dec 6 09:00:53 2017
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -E -std=c++11 %s -o %t
+// RUN: %clang_cc1 %t
+
+#define FOO(str) foo(#str)
+
+extern void foo(const char *str);
+
+void bar() {
+ FOO(R"(foo
+ bar)");
+}
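For orientation, the preprocessed output %t of this test looks roughly like the sketch below; the exact spelling and indentation are illustrative, not copied from clang -E.

extern void foo(const char *str);

void bar() {
  // Before this change the '#' operator kept the raw string's newline, so the
  // stringized argument contained a bare newline and the second RUN line
  // (which compiles %t) failed. With the newline mapped to \n the expansion is
  // a single well-formed string literal:
  foo("R\"(foo\n    bar)\"");
}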
Modified: cfe/trunk/unittests/Lex/LexerTest.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/unittests/Lex/LexerTest.cpp?rev=319904&r1=319903&r2=319904&view=diff
==============================================================================
--- cfe/trunk/unittests/Lex/LexerTest.cpp (original)
+++ cfe/trunk/unittests/Lex/LexerTest.cpp Wed Dec 6 09:00:53 2017
@@ -37,7 +37,7 @@ protected:
DiagID(new DiagnosticIDs()),
Diags(DiagID, new DiagnosticOptions, new IgnoringDiagConsumer()),
SourceMgr(Diags, FileMgr),
- TargetOpts(new TargetOptions)
+ TargetOpts(new TargetOptions)
{
TargetOpts->Triple = "x86_64-apple-darwin11.1.0";
Target = TargetInfo::CreateTargetInfo(Diags, TargetOpts);
@@ -478,4 +478,42 @@ TEST_F(LexerTest, AvoidPastEndOfStringDe
EXPECT_TRUE(LexedTokens.empty());
}
+TEST_F(LexerTest, StringizingRasString) {
+ // For "std::string Lexer::Stringify(StringRef Str, bool Charify)".
+ std::string String1 = R"(foo
+ {"bar":[]}
+ baz)";
+ // For "void Lexer::Stringify(SmallVectorImpl<char> &Str)".
+ SmallString<128> String2;
+ String2 += String1.c_str();
+
+ // Corner cases.
+ std::string String3 = R"(\
+ \n
+ \\n
+ \\)";
+ SmallString<128> String4;
+ String4 += String3.c_str();
+ std::string String5 = R"(a\
+
+
+ \\b)";
+ SmallString<128> String6;
+ String6 += String5.c_str();
+
+ String1 = Lexer::Stringify(StringRef(String1));
+ Lexer::Stringify(String2);
+ String3 = Lexer::Stringify(StringRef(String3));
+ Lexer::Stringify(String4);
+ String5 = Lexer::Stringify(StringRef(String5));
+ Lexer::Stringify(String6);
+
+ EXPECT_EQ(String1, R"(foo\n {\"bar\":[]}\n baz)");
+ EXPECT_EQ(String2, R"(foo\n {\"bar\":[]}\n baz)");
+ EXPECT_EQ(String3, R"(\\\n \\n\n \\\\n\n \\\\)");
+ EXPECT_EQ(String4, R"(\\\n \\n\n \\\\n\n \\\\)");
+ EXPECT_EQ(String5, R"(a\\\n\n\n \\\\b)");
+ EXPECT_EQ(String6, R"(a\\\n\n\n \\\\b)");
+}
+
} // anonymous namespace