[cfe-dev] Review request: Patch for adding raw string support to clang
Chris Lattner
clattner at apple.com
Thu Jun 26 10:41:14 PDT 2008
On Jun 26, 2008, at 7:41 AM, Gordon Henriksen wrote:
> Hi Cédric,
>
> Please try this procedure when submitting your final version of this
> patch:
>
> http://lists.cs.uiuc.edu/pipermail/llvmdev/2008-January/011992.html
Hey Gordon, can you add that info to the developer policy guide?
-Chris
>
>
> On Jun 26, 2008, at 09:05, Cédric Venet wrote:
>
>> Hi,
>>
>> I have mostly implemented the C++0x proposal for raw string literals.
>> You can declare a string like this:
>>
>> const char* s2 = R"[test]"; // test
>> const char* s3 = R"azerty[test]azerty"; // test
>> const char* s4 = R"azerty[\t\\e]azertst]azerty"; // \t\\e]azertst
>> const char* s5 = R"[\
>> a
>> multiline
>> string
>> literal]";
>>
>> This isn't a very important piece of functionality, but it can be useful
>> (think regex). There are some points on which I would like comments;
>> they are marked with TO REVIEW.
>> The patch is not yet ready for commit, but it handles the cases mentioned
>> above without problems.
>>
>> Regards,
>>
>> Cédric
>> Index: include/clang/Basic/DiagnosticKinds.def
>> ===================================================================
>> --- include/clang/Basic/DiagnosticKinds.def (revision 52781)
>> +++ include/clang/Basic/DiagnosticKinds.def (working copy)
>> @@ -63,6 +63,8 @@
>>
>> DIAG(err_unterminated_string, ERROR,
>> "missing terminating \" character")
>> +DIAG(err_too_much_dchars_rawstring, ERROR,
>> + "The d-chars part of the string is too long (16 max)")
>> DIAG(err_unterminated_char, ERROR,
>> "missing terminating ' character")
>> DIAG(err_empty_character, ERROR,
>> Index: include/clang/Basic/TokenKinds.def
>> ===================================================================
>> --- include/clang/Basic/TokenKinds.def (revision 52781)
>> +++ include/clang/Basic/TokenKinds.def (working copy)
>> @@ -106,6 +106,8 @@
>> TOK(string_literal) // "foo"
>> TOK(wide_string_literal) // L"foo"
>> TOK(angle_string_literal)// <foo>
>> +TOK(raw_string_literal) // R"**[foo]**" (N2442), support for u, U,
>> u8 and L?
>> +TOK(wide_raw_string_literal) // LR"[foo]"
>>
>> // C99 6.4.6: Punctuators.
>> TOK(l_square) // [
>> Index: include/clang/Lex/Lexer.h
>> ===================================================================
>> --- include/clang/Lex/Lexer.h (revision 52781)
>> +++ include/clang/Lex/Lexer.h (working copy)
>> @@ -361,7 +361,8 @@
>> // Helper functions to lex the remainder of a token of the
>> specific type.
>> void LexIdentifier (Token &Result, const char *CurPtr);
>> void LexNumericConstant (Token &Result, const char *CurPtr);
>> - void LexStringLiteral (Token &Result, const char
>> *CurPtr,bool Wide);
>> + void LexStringLiteral (Token &Result, const char *CurPtr,
>> bool Wide);
>> + void LexRawStringLiteral (Token &Result, const char *CurPtr,
>> bool Wide);
>> void LexAngledStringLiteral(Token &Result, const char *CurPtr);
>> void LexCharConstant (Token &Result, const char *CurPtr);
>> bool LexEndOfFile (Token &Result, const char *CurPtr);
>> Index: include/clang/Parse/Parser.h
>> ===================================================================
>> --- include/clang/Parse/Parser.h (revision 52781)
>> +++ include/clang/Parse/Parser.h (working copy)
>> @@ -106,7 +106,9 @@
>> ///
>> bool isTokenStringLiteral() const {
>> return Tok.getKind() == tok::string_literal ||
>> - Tok.getKind() == tok::wide_string_literal;
>> + Tok.getKind() == tok::wide_string_literal ||
>> + Tok.getKind() == tok::raw_string_literal ||
>> + Tok.getKind() == tok::wide_raw_string_literal;
>> }
>>
>> /// ConsumeToken - Consume the current 'peek token' and lex the
>> next one.
>> Index: lib/Lex/Lexer.cpp
>> ===================================================================
>> --- lib/Lex/Lexer.cpp (revision 52781)
>> +++ lib/Lex/Lexer.cpp (working copy)
>> @@ -592,9 +592,82 @@
>> FormTokenWithChars(Result, CurPtr);
>> }
>>
>> +void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
>> bool Wide) {
>> +/* TO REVIEW: should I let this comment here?
>> +raw-string:
>> + "d-char-sequence opt [r-char-sequenceopt ]d-char-sequenceopt "
>> +
>> +r-char-sequence:
>> + r-char
>> + r-char-sequence r-char
>> +
>> +r-char:
>> + any member of the source character set, except, (1), a
>> backslash \ followed by a u or U, or, (2), a right square bracket ]
>> + followed by the initial d-char-sequence
>> (which may be empty) followed by a double quote ".
>> + universal-character-name
>> +
>> +d-char-sequence:
>> + d-char
>> + d-char-sequence d-char
>> +
>> +d-char:
>> + any member of the basic source character set, except space, the
>> left square bracket [, the right square bracket ],
>> + or the control characters representing
>> horizontal tab, vertical tab, form feed, or new-line.
>> +*/
>> +
>> + char dchar_seq[16];
>> + int dchar_seq_len = 0;
>> +
>> + // first read the optional d-char-sequence (0 to 16 characters)
>> + char C = getAndAdvanceChar(CurPtr, Result);
>> +
>> + while (C != '[') {
>> + // FIXME: check the characters are in the allowed set
>> + if(dchar_seq_len>=16) {
>> + Diag(BufferPtr, diag::err_too_much_dchars_rawstring);
>> + // TO REVIEW: should we attempt to recuperate on error here?
>> + Result.setKind(tok::unknown);
>> + FormTokenWithChars(Result, CurPtr-1);
>> + return;
>> + }
>> + dchar_seq[dchar_seq_len++] = C;
>> + C = getAndAdvanceChar(CurPtr, Result);
>> + }
>> + // skip the '['
>> + C = getAndAdvanceChar(CurPtr, Result);
>> + while(1) {
>> + while (C != ']') {
>> + //if( a backslash \ followed by a u or U
>> + if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
>> + if (!LexingRawMode) Diag(BufferPtr,
>> diag::err_unterminated_string);
>> + Result.setKind(tok::unknown);
>> + FormTokenWithChars(Result, CurPtr-1);
>> + return;
>> + }
>> + C = getAndAdvanceChar(CurPtr, Result);
>> + }
>> + int i;
>> + for(i=0;i<dchar_seq_len;++i) {
>> + C = getAndAdvanceChar(CurPtr, Result);
>> + // End of file handled in next loop iteration by previous code
>> + if(C!=dchar_seq[i]) break;
>> + }
>> + if(i==dchar_seq_len) {
>> + // End of file handled in next loop iteration by previous code
>> + C = getAndAdvanceChar(CurPtr, Result);
>> + if(C=='"') break;
>> + }
>> + }
>> +
>> + Result.setKind(Wide ? tok::wide_raw_string_literal :
>> tok::raw_string_literal);
>> +
>> + // Update the location of the token as well as the BufferPtr
>> instance var.
>> + FormTokenWithChars(Result, CurPtr);
>> +}
>> +
>> /// LexStringLiteral - Lex the remainder of a string literal, after
>> having lexed
>> /// either " or L".
>> -void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
>> bool Wide){
>> +void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
>> bool Wide) {
>> const char *NulCharacter = 0; // Does this string contain the \0
>> character?
>>
>> char C = getAndAdvanceChar(CurPtr, Result);
>> @@ -1284,6 +1357,21 @@
>> MIOpt.ReadToken();
>> return LexNumericConstant(Result, CurPtr);
>>
>> + case 'R': // Identifier (Racoon) or raw string literal (R"xyz").
>> + // Notify MIOpt that we read a non-whitespace/non-comment token.
>> + MIOpt.ReadToken();
>> + Char = getCharAndSize(CurPtr, SizeTmp);
>> +
>> + // TO REVIEW: enable this only for C++0x, or any language with
>> extension
>> + // activated? or add a features like the pascal string?
>> +
>> + // Raw string string literal.
>> + if (Char == '"' && getFeatures().CPlusPlus0x)
>> + return LexRawStringLiteral(Result, ConsumeChar(CurPtr,
>> SizeTmp, Result),
>> + false);
>> +
>> + return LexIdentifier(Result, CurPtr);
>> +
>> case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
>> // Notify MIOpt that we read a non-whitespace/non-comment token.
>> MIOpt.ReadToken();
>> @@ -1293,7 +1381,14 @@
>> if (Char == '"')
>> return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp,
>> Result),
>> true);
>> -
>> +// TO REVIEW: how to do this properly? if not followed by a ", need
>> to
>> +// unconsume the char, perhaps saving the curptr and sizetmp var is
>> enough?
>> +/* else if(Char == 'R') {
>> + ConsumeChar(
>> + if(getCharAndSize(CurPtr,SizeTmp)=='"')
>> + return LexRawStringLiteral(Result, ConsumeChar(CurPtr,
>> SizeTmp, Result),
>> + true);
>> +*/
>> // Wide character constant.
>> if (Char == '\'')
>> return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp,
>> Result));
>> @@ -1302,7 +1397,7 @@
>> // C99 6.4.2: Identifiers.
>> case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case
>> 'G':
>> case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case
>> 'N':
>> - case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case
>> 'U':
>> + case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': case
>> 'U':
>> case 'V': case 'W': case 'X': case 'Y': case 'Z':
>> case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case
>> 'g':
>> case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case
>> 'n':
>> Index: lib/Lex/LiteralSupport.cpp
>> ===================================================================
>> --- lib/Lex/LiteralSupport.cpp (revision 52781)
>> +++ lib/Lex/LiteralSupport.cpp (working copy)
>> @@ -608,7 +608,8 @@
>> MaxTokenLength = StringToks[i].getLength();
>>
>> // Remember if we see any wide strings.
>> - AnyWide |= StringToks[i].is(tok::wide_string_literal);
>> + AnyWide |= StringToks[i].is(tok::wide_string_literal)
>> + || StringToks[i].is(tok::wide_raw_string_literal);
>> }
>>
>>
>> @@ -649,6 +650,7 @@
>> // Get the spelling of the token, which eliminates trigraphs,
>> etc. We know
>> // that ThisTokBuf points to a buffer that is big enough for the
>> whole token
>> // and 'spelled' tokens can only shrink.
>> + // FIXME: what about raw string and trigraph, escaped end of
>> line ...??
>> unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf);
>> const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end
>> quote.
>>
>> @@ -660,13 +662,34 @@
>> ++ThisTokBuf;
>> ThisIsWide = true;
>> }
>> -
>> +
>> + // Skip R marker for raw strings.
>> + bool ThisIsRaw = false;
>> + if (ThisTokBuf[0] == 'R') {
>> + ++ThisTokBuf;
>> + ThisIsRaw = true;
>> + }
>> +
>> assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
>> ++ThisTokBuf;
>> -
>> +
>> + // if we have a raw string, skip the dchar sequence
>> + if(ThisIsRaw) {
>> + while(ThisTokBuf[0]!='[') {
>> + ++ThisTokBuf;
>> + --ThisTokEnd;
>> + // TO REVIEW: Assert needed?
>> + assert(ThisTokBuf<=ThisTokEnd
>> + && "Expected \"d-char seq[, lexer broken?");
>> + }
>> + ++ThisTokBuf;
>> + --ThisTokEnd;
>> + }
>> +
>> // Check if this is a pascal string
>> if (pp.getLangOptions().PascalStrings && ThisTokBuf + 1 !=
>> ThisTokEnd &&
>> ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
>> + // TO REVIEW: has a pascal raw string a meaning?
>>
>> // If the \p sequence is found in the first token, we have a
>> pascal string
>> // Otherwise, if we already have a pascal string, ignore the
>> first \p
>> @@ -702,6 +725,18 @@
>> continue;
>> }
>>
>> + // TO REVIEW: Would it be better to use a completly different
>> loop?
>> + if(ThisIsRaw) {
>> + ++ThisTokBuf;
>> + if(ThisTokBuf[0]=='u' || ThisTokBuf[0]=='U') {
>> + // FIXME: handle UCN
>> + assert(false && "UCN not yet implemented");
>> + } else {
>> + *ResultPtr++ = '\\';
>> + continue;
>> + }
>> + }
>> +
>> // Otherwise, this is an escape character. Process it.
>> unsigned ResultChar = ProcessCharEscape(ThisTokBuf,
>> ThisTokEnd, hadError,
>>
>> StringToks[i].getLocation(),
>> Index: lib/Parse/ParseExpr.cpp
>> ===================================================================
>> --- lib/Parse/ParseExpr.cpp (revision 52781)
>> +++ lib/Parse/ParseExpr.cpp (working copy)
>> @@ -534,6 +534,8 @@
>> return ParsePostfixExpressionSuffix(Res);
>> case tok::string_literal: // primary-expression: string-literal
>> case tok::wide_string_literal:
>> + case tok::raw_string_literal:
>> + case tok::wide_raw_string_literal:
>> Res = ParseStringLiteralExpression();
>> if (Res.isInvalid) return Res;
>> // This can be followed by postfix-expr pieces (e.g. "foo"[1]).
>> Index: lib/Parse/ParseObjc.cpp
>> ===================================================================
>> --- lib/Parse/ParseObjc.cpp (revision 52781)
>> +++ lib/Parse/ParseObjc.cpp (working copy)
>> @@ -1327,6 +1327,8 @@
>> switch (Tok.getKind()) {
>> case tok::string_literal: // primary-expression: string-literal
>> case tok::wide_string_literal:
>> + case tok::raw_string_literal: // TO REVIEW: objC and C++0x
>> possible?
>> + case tok::wide_raw_string_literal:
>> return
>> ParsePostfixExpressionSuffix(ParseObjCStringLiteral(AtLoc));
>> default:
>> break;
>> Index: lib/Parse/Parser.cpp
>> ===================================================================
>> --- lib/Parse/Parser.cpp (revision 52781)
>> +++ lib/Parse/Parser.cpp (working copy)
>> @@ -156,6 +156,8 @@
>>
>> case tok::string_literal:
>> case tok::wide_string_literal:
>> + case tok::raw_string_literal:
>> + case tok::wide_raw_string_literal:
>> ConsumeStringToken();
>> break;
>> case tok::semi:
>> // RUN: clang -emit-llvm -std=c++0x %s
>> // FIXME: how to test that the strings are correct? this should emit:
>> //@.str = internal constant [5 x i8] c"test\00" ; <[5 x i8]*>
>> [#uses=1]
>> //@s1 = global i8* getelementptr ([5 x i8]* @.str, i32 0, i32 0),
>> align 4 ; <i8**> [#uses=0]
>> //@s2 = global i8* getelementptr ([5 x i8]* @.str, i32 0, i32 0),
>> align 4 ; <i8**> [#uses=0]
>> //@s3 = global i8* getelementptr ([5 x i8]* @.str, i32 0, i32 0),
>> align 4 ; <i8**> [#uses=0]
>> //@.str1 = internal constant [14 x i8] c"\5Ct\5C\5Ce]azertst\00" ;
>> <[14 x i8]*> [#uses=1]
>> //@s4 = global i8* getelementptr ([14 x i8]* @.str1, i32 0, i32 0),
>> align 4 ; <i8**> [#uses=0]
>> //@.str2 = internal constant [30 x i8] c"a\0D\0Amultiline\0D\0Astring
>> \0D\0Aliteral\00" ; <[30 x i8]*> [#uses=1]
>> //@s5 = global i8* getelementptr ([30 x i8]* @.str2, i32 0, i32 0),
>> align 4 ; <i8**> [#uses=0]
>>
>> const char* s1 = "test";
>> const char* s2 = R"[test]";
>> const char* s3 = R"azerty[test]azerty";
>> const char* s4 = R"azerty[\t\\e]azertst]azerty";
>> const char* s5 = R"[\
>> a
>> multiline
>> string
>> literal]";
>> const char* s6 = LR"[test]";
>>
>> // expected error:
>> // const char* err1 = R"azerty[test]azetry";
>>
>> // RUN: clang -fsyntaxonly -verify -std=c++0x %s
>> // should be -Eonly ? but verify don't work then?
>>
>> const char* err1 = R"12345678901234567[test]12345678901234567";
>> const char* err2 = R"azerty[test]azetry"; // expected error:
>>
>> _______________________________________________
>> cfe-dev mailing list
>> cfe-dev at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/cfe-dev
>
>
>
> — Gordon
>
>
> _______________________________________________
> cfe-dev mailing list
> cfe-dev at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/cfe-dev
More information about the cfe-dev
mailing list