[cfe-dev] Review request: Patch for adding raw string support to clang
Gordon Henriksen
gordonhenriksen at mac.com
Thu Jun 26 07:41:00 PDT 2008
Hi Cédric,
Please try this procedure when submitting your final version of this
patch:
http://lists.cs.uiuc.edu/pipermail/llvmdev/2008-January/011992.html
On Jun 26, 2008, at 09:05, Cédric Venet wrote:
> Hi,
>
> I have mostly implemented the C++0x proposal for raw strings. You can
> declare a string like this:
>
> const char* s2 = R"[test]"; // test
> const char* s3 = R"azerty[test]azerty"; // test
> const char* s4 = R"azerty[\t\\e]azertst]azerty"; // \t\\e]azertst
> const char* s5 = R"[\
> a
> multiline
> string
> literal]";
>
> This isn't a very important functionality, but it can be useful
> (think regex). There are some points on which I would like comments;
> they are marked with TO REVIEW.
> The patch is not yet ready for commit, but it handles the cases above
> without problems.
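
Agreed that regular expressions are the obvious use case. Just for context
(this is not from the patch), the feature saves the usual double escaping:

    // With a raw string the pattern reads exactly as the regex engine sees it:
    const char* pat1 = R"[(\d+)\.(\d+)]";   // (\d+)\.(\d+)
    // The equivalent ordinary literal needs every backslash doubled:
    const char* pat2 = "(\\d+)\\.(\\d+)";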
>
> Regards,
>
> Cédric
> Index: include/clang/Basic/DiagnosticKinds.def
> ===================================================================
> --- include/clang/Basic/DiagnosticKinds.def (revision 52781)
> +++ include/clang/Basic/DiagnosticKinds.def (working copy)
> @@ -63,6 +63,8 @@
>
> DIAG(err_unterminated_string, ERROR,
> "missing terminating \" character")
> +DIAG(err_too_much_dchars_rawstring, ERROR,
> +     "raw string delimiter (d-char-sequence) is longer than 16 characters")
> DIAG(err_unterminated_char, ERROR,
> "missing terminating ' character")
> DIAG(err_empty_character, ERROR,
> Index: include/clang/Basic/TokenKinds.def
> ===================================================================
> --- include/clang/Basic/TokenKinds.def (revision 52781)
> +++ include/clang/Basic/TokenKinds.def (working copy)
> @@ -106,6 +106,8 @@
> TOK(string_literal) // "foo"
> TOK(wide_string_literal) // L"foo"
> TOK(angle_string_literal)// <foo>
> +TOK(raw_string_literal)      // R"**[foo]**" (N2442); support for u, U, u8 and L?
> +TOK(wide_raw_string_literal) // LR"[foo]"
>
> // C99 6.4.6: Punctuators.
> TOK(l_square) // [
> Index: include/clang/Lex/Lexer.h
> ===================================================================
> --- include/clang/Lex/Lexer.h (revision 52781)
> +++ include/clang/Lex/Lexer.h (working copy)
> @@ -361,7 +361,8 @@
> // Helper functions to lex the remainder of a token of the specific type.
> void LexIdentifier (Token &Result, const char *CurPtr);
> void LexNumericConstant (Token &Result, const char *CurPtr);
> - void LexStringLiteral (Token &Result, const char *CurPtr,bool Wide);
> + void LexStringLiteral (Token &Result, const char *CurPtr, bool Wide);
> + void LexRawStringLiteral (Token &Result, const char *CurPtr, bool Wide);
> void LexAngledStringLiteral(Token &Result, const char *CurPtr);
> void LexCharConstant (Token &Result, const char *CurPtr);
> bool LexEndOfFile (Token &Result, const char *CurPtr);
> Index: include/clang/Parse/Parser.h
> ===================================================================
> --- include/clang/Parse/Parser.h (revision 52781)
> +++ include/clang/Parse/Parser.h (working copy)
> @@ -106,7 +106,9 @@
> ///
> bool isTokenStringLiteral() const {
> return Tok.getKind() == tok::string_literal ||
> - Tok.getKind() == tok::wide_string_literal;
> + Tok.getKind() == tok::wide_string_literal ||
> + Tok.getKind() == tok::raw_string_literal ||
> + Tok.getKind() == tok::wide_raw_string_literal;
> }
>
> /// ConsumeToken - Consume the current 'peek token' and lex the next one.
> Index: lib/Lex/Lexer.cpp
> ===================================================================
> --- lib/Lex/Lexer.cpp (revision 52781)
> +++ lib/Lex/Lexer.cpp (working copy)
> @@ -592,9 +592,82 @@
> FormTokenWithChars(Result, CurPtr);
> }
>
> +void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
> +/* TO REVIEW: should I leave this comment here?
> +raw-string:
> +    " d-char-sequence_opt [ r-char-sequence_opt ] d-char-sequence_opt "
> +
> +r-char-sequence:
> +    r-char
> +    r-char-sequence r-char
> +
> +r-char:
> +    any member of the source character set, except (1) a backslash \ followed
> +    by a u or U, or (2) a right square bracket ] followed by the initial
> +    d-char-sequence (which may be empty) followed by a double quote ".
> +    universal-character-name
> +
> +d-char-sequence:
> +    d-char
> +    d-char-sequence d-char
> +
> +d-char:
> +    any member of the basic source character set, except space, the left
> +    square bracket [, the right square bracket ], or the control characters
> +    representing horizontal tab, vertical tab, form feed, or new-line.
> +*/
> +
> + char dchar_seq[16];
> + int dchar_seq_len = 0;
> +
> + // first read the optional d-char-sequence (0 to 16 characters)
> + char C = getAndAdvanceChar(CurPtr, Result);
> +
> + while (C != '[') {
> + // FIXME: check the characters are in the allowed set
> + if(dchar_seq_len>=16) {
> + Diag(BufferPtr, diag::err_too_much_dchars_rawstring);
> + // TO REVIEW: should we attempt to recover from the error here?
> + Result.setKind(tok::unknown);
> + FormTokenWithChars(Result, CurPtr-1);
> + return;
> + }
> + dchar_seq[dchar_seq_len++] = C;
> + C = getAndAdvanceChar(CurPtr, Result);
> + }
> + // skip the '['
> + C = getAndAdvanceChar(CurPtr, Result);
> + while(1) {
> + while (C != ']') {
> + // FIXME: handle a backslash \ followed by a u or U (universal-character-name).
> + if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
> + if (!LexingRawMode) Diag(BufferPtr, diag::err_unterminated_string);
> + Result.setKind(tok::unknown);
> + FormTokenWithChars(Result, CurPtr-1);
> + return;
> + }
> + C = getAndAdvanceChar(CurPtr, Result);
> + }
> + int i;
> + for(i=0;i<dchar_seq_len;++i) {
> + C = getAndAdvanceChar(CurPtr, Result);
> + // End of file handled in next loop iteration by previous code
> + if(C!=dchar_seq[i]) break;
> + }
> + if(i==dchar_seq_len) {
> + // End of file handled in next loop iteration by previous code
> + C = getAndAdvanceChar(CurPtr, Result);
> + if(C=='"') break;
> + }
> + }
> +
> + Result.setKind(Wide ? tok::wide_raw_string_literal : tok::raw_string_literal);
> +
> + // Update the location of the token as well as the BufferPtr instance var.
> + FormTokenWithChars(Result, CurPtr);
> +}
> +
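
On the FIXME about validating the delimiter characters: note that the first
loop also has no end-of-file check, so an unterminated R" at the very end of a
buffer can run past BufferEnd. Something along these lines might cover both;
this is only a sketch against the grammar quoted above, and isValidDChar is a
hypothetical helper, not part of the patch:

    // d-chars are any basic source character except space, '[', ']' and the
    // listed control characters (per the grammar comment above).
    static bool isValidDChar(char C) {
      return C != ' ' && C != '[' && C != ']' &&
             C != '\t' && C != '\v' && C != '\f' && C != '\n' && C != '\r';
    }
    ...
    while (C != '[') {
      if (C == 0 && CurPtr-1 == BufferEnd) {   // End of file.
        if (!LexingRawMode) Diag(BufferPtr, diag::err_unterminated_string);
        Result.setKind(tok::unknown);
        FormTokenWithChars(Result, CurPtr-1);
        return;
      }
      if (!isValidDChar(C) || dchar_seq_len >= 16) {
        // Reuses the existing diagnostic for now; a separate diagnostic for an
        // invalid d-char would be clearer.
        Diag(BufferPtr, diag::err_too_much_dchars_rawstring);
        Result.setKind(tok::unknown);
        FormTokenWithChars(Result, CurPtr-1);
        return;
      }
      dchar_seq[dchar_seq_len++] = C;
      C = getAndAdvanceChar(CurPtr, Result);
    }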
> /// LexStringLiteral - Lex the remainder of a string literal, after having lexed
> /// either " or L".
> -void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide){
> +void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
> const char *NulCharacter = 0; // Does this string contain the \0 character?
>
> char C = getAndAdvanceChar(CurPtr, Result);
> @@ -1284,6 +1357,21 @@
> MIOpt.ReadToken();
> return LexNumericConstant(Result, CurPtr);
>
> + case 'R': // Identifier (Raccoon) or raw string literal (R"xyz").
> + // Notify MIOpt that we read a non-whitespace/non-comment token.
> + MIOpt.ReadToken();
> + Char = getCharAndSize(CurPtr, SizeTmp);
> +
> + // TO REVIEW: enable this only for C++0x, or for any language with the
> + // extension activated? Or add a feature flag like the Pascal string one?
> +
> + // Raw string literal.
> + if (Char == '"' && getFeatures().CPlusPlus0x)
> + return LexRawStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
> + false);
> +
> + return LexIdentifier(Result, CurPtr);
> +
> case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
> // Notify MIOpt that we read a non-whitespace/non-comment token.
> MIOpt.ReadToken();
> @@ -1293,7 +1381,14 @@
> if (Char == '"')
> return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
> true);
> -
> +// TO REVIEW: how to do this properly? If the 'R' is not followed by a '"',
> +// we need to un-consume the char; perhaps saving the CurPtr and SizeTmp
> +// variables is enough?
> +/* else if(Char == 'R') {
> + ConsumeChar(
> + if(getCharAndSize(CurPtr,SizeTmp)=='"')
> + return LexRawStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
> + true);
> +*/
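
On this TO REVIEW: getCharAndSize() only peeks and does not move CurPtr, so
nothing has to be un-consumed until you know the 'R' is followed by a '"'. One
shape that might work (just a sketch; Size2 and After are hypothetical locals,
not part of the patch):

    else if (Char == 'R' && getFeatures().CPlusPlus0x) {
      unsigned Size2;
      // Peek at the character after the 'R' without consuming either one.
      char After = getCharAndSize(CurPtr + SizeTmp, Size2);
      if (After == '"')
        return LexRawStringLiteral(Result,
                 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), Size2, Result),
                 true);
    }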
> // Wide character constant.
> if (Char == '\'')
> return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
> @@ -1302,7 +1397,7 @@
> // C99 6.4.2: Identifiers.
> case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
> case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
> - case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
> + case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': case 'U':
> case 'V': case 'W': case 'X': case 'Y': case 'Z':
> case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
> case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
> Index: lib/Lex/LiteralSupport.cpp
> ===================================================================
> --- lib/Lex/LiteralSupport.cpp (revision 52781)
> +++ lib/Lex/LiteralSupport.cpp (working copy)
> @@ -608,7 +608,8 @@
> MaxTokenLength = StringToks[i].getLength();
>
> // Remember if we see any wide strings.
> - AnyWide |= StringToks[i].is(tok::wide_string_literal);
> + AnyWide |= StringToks[i].is(tok::wide_string_literal)
> + || StringToks[i].is(tok::wide_raw_string_literal);
> }
>
>
> @@ -649,6 +650,7 @@
> // Get the spelling of the token, which eliminates trigraphs, etc. We know
> // that ThisTokBuf points to a buffer that is big enough for the whole token
> // and 'spelled' tokens can only shrink.
> + // FIXME: what about raw strings and trigraphs, escaped line ends, ...?
> unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf);
> const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote.
>
> @@ -660,13 +662,34 @@
> ++ThisTokBuf;
> ThisIsWide = true;
> }
> -
> +
> + // Skip R marker for raw strings.
> + bool ThisIsRaw = false;
> + if (ThisTokBuf[0] == 'R') {
> + ++ThisTokBuf;
> + ThisIsRaw = true;
> + }
> +
> assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
> ++ThisTokBuf;
> -
> +
> + // if we have a raw string, skip the dchar sequence
> + if(ThisIsRaw) {
> + while(ThisTokBuf[0]!='[') {
> + ++ThisTokBuf;
> + --ThisTokEnd;
> + // TO REVIEW: Assert needed?
> + assert(ThisTokBuf<=ThisTokEnd
> + && "Expected \"d-char seq[, lexer broken?");
> + }
> + ++ThisTokBuf;
> + --ThisTokEnd;
> + }
> +
> // Check if this is a pascal string
> if (PP.getLangOptions().PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
> ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
> + // TO REVIEW: does a Pascal raw string have any meaning?
>
> // If the \p sequence is found in the first token, we have a pascal string
> // Otherwise, if we already have a pascal string, ignore the first \p
> @@ -702,6 +725,18 @@
> continue;
> }
>
> + // TO REVIEW: Would it be better to use a completely different loop?
> + if(ThisIsRaw) {
> + ++ThisTokBuf;
> + if(ThisTokBuf[0]=='u' || ThisTokBuf[0]=='U') {
> + // FIXME: handle UCN
> + assert(false && "UCN not yet implemented");
> + } else {
> + *ResultPtr++ = '\\';
> + continue;
> + }
> + }
> +
> // Otherwise, this is an escape character. Process it.
> unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
> StringToks[i].getLocation(),
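
On the "completely different loop" question: a dedicated verbatim-copy loop for
the raw case might well read more clearly than special-casing inside the escape
loop, since only \u and \U need any processing. Roughly, reusing the existing
ThisTokBuf/ThisTokEnd/ResultPtr locals (a sketch, not part of the patch):

    if (ThisIsRaw) {
      // Everything except a universal-character-name is copied verbatim.
      while (ThisTokBuf != ThisTokEnd) {
        if (ThisTokBuf[0] == '\\' && ThisTokBuf + 1 != ThisTokEnd &&
            (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U')) {
          // FIXME: decode the universal-character-name here.
          assert(false && "UCN not yet implemented");
        }
        *ResultPtr++ = *ThisTokBuf++;
      }
    } else {
      // ... existing copy/escape-processing loop, unchanged ...
    }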
> Index: lib/Parse/ParseExpr.cpp
> ===================================================================
> --- lib/Parse/ParseExpr.cpp (revision 52781)
> +++ lib/Parse/ParseExpr.cpp (working copy)
> @@ -534,6 +534,8 @@
> return ParsePostfixExpressionSuffix(Res);
> case tok::string_literal: // primary-expression: string-literal
> case tok::wide_string_literal:
> + case tok::raw_string_literal:
> + case tok::wide_raw_string_literal:
> Res = ParseStringLiteralExpression();
> if (Res.isInvalid) return Res;
> // This can be followed by postfix-expr pieces (e.g. "foo"[1]).
> Index: lib/Parse/ParseObjc.cpp
> ===================================================================
> --- lib/Parse/ParseObjc.cpp (revision 52781)
> +++ lib/Parse/ParseObjc.cpp (working copy)
> @@ -1327,6 +1327,8 @@
> switch (Tok.getKind()) {
> case tok::string_literal: // primary-expression: string-literal
> case tok::wide_string_literal:
> + case tok::raw_string_literal: // TO REVIEW: is ObjC plus C++0x possible?
> + case tok::wide_raw_string_literal:
> return ParsePostfixExpressionSuffix(ParseObjCStringLiteral(AtLoc));
> default:
> break;
> Index: lib/Parse/Parser.cpp
> ===================================================================
> --- lib/Parse/Parser.cpp (revision 52781)
> +++ lib/Parse/Parser.cpp (working copy)
> @@ -156,6 +156,8 @@
>
> case tok::string_literal:
> case tok::wide_string_literal:
> + case tok::raw_string_literal:
> + case tok::wide_raw_string_literal:
> ConsumeStringToken();
> break;
> case tok::semi:
> // RUN: clang -emit-llvm -std=c++0x %s
> // FIXME: how do we test that the strings are correct? This should emit:
> //@.str = internal constant [5 x i8] c"test\00" ; <[5 x i8]*> [#uses=1]
> //@s1 = global i8* getelementptr ([5 x i8]* @.str, i32 0, i32 0), align 4 ; <i8**> [#uses=0]
> //@s2 = global i8* getelementptr ([5 x i8]* @.str, i32 0, i32 0), align 4 ; <i8**> [#uses=0]
> //@s3 = global i8* getelementptr ([5 x i8]* @.str, i32 0, i32 0), align 4 ; <i8**> [#uses=0]
> //@.str1 = internal constant [14 x i8] c"\5Ct\5C\5Ce]azertst\00" ; <[14 x i8]*> [#uses=1]
> //@s4 = global i8* getelementptr ([14 x i8]* @.str1, i32 0, i32 0), align 4 ; <i8**> [#uses=0]
> //@.str2 = internal constant [30 x i8] c"a\0D\0Amultiline\0D\0Astring\0D\0Aliteral\00" ; <[30 x i8]*> [#uses=1]
> //@s5 = global i8* getelementptr ([30 x i8]* @.str2, i32 0, i32 0), align 4 ; <i8**> [#uses=0]
>
> const char* s1 = "test";
> const char* s2 = R"[test]";
> const char* s3 = R"azerty[test]azerty";
> const char* s4 = R"azerty[\t\\e]azertst]azerty";
> const char* s5 = R"[\
> a
> multiline
> string
> literal]";
> const char* s6 = LR"[test]";
>
> // expected error:
> // const char* err1 = R"azerty[test]azetry";
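
On the FIXME about checking the emitted strings: until something better exists,
the usual trick in the current tests is to pipe the -emit-llvm output through
grep in the RUN line, e.g. (a sketch; the exact @.str constants depend on the
IR emitter):

    // RUN: clang -emit-llvm -std=c++0x %s | grep 'c"test\\00"'
    // RUN: clang -emit-llvm -std=c++0x %s | grep 'azertst'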
>
> // RUN: clang -fsyntax-only -verify -std=c++0x %s
> // Should this be -Eonly? But -verify doesn't work then?
>
> const char* err1 = R"12345678901234567[test]12345678901234567";
> const char* err2 = R"azerty[test]azetry"; // expected error:
>
— Gordon