[cfe-dev] Review request: Patch for adding raw string support to clang

Thu Jun 26 07:41:00 PDT 2008

Hi Cédric,

Please try this procedure when submitting your final version of this  
patch:

http://lists.cs.uiuc.edu/pipermail/llvmdev/2008-January/011992.html

On Jun 26, 2008, at 09:05, Cédric Venet wrote:

> Hi,
>
> I mostly implemented the C++0x proposal for raw string. You can  
> declare a string like:
>
> const char* s2 = R"[test]"; // test
> const char* s3 = R"azerty[test]azerty"; // test
> const char* s4 = R"azerty[\t\\e]azertst]azerty"; // \t\\e]azertst
> const char* s5 = R"[\
> a
> multiline
> string
> literal]";
>
> This isn't a very important functionnality, but it can be useful  
> (think regex). I have some point on which I would like comment  
> marked with TO REVIEW.
> The patch is not yet ready for commit, but handle the mentioned case  
> without problem.
>
> Regards,
>
> Cédric
> Index: include/clang/Basic/DiagnosticKinds.def
> ===================================================================
> --- include/clang/Basic/DiagnosticKinds.def	(revision 52781)
> +++ include/clang/Basic/DiagnosticKinds.def	(working copy)
> @@ -63,6 +63,8 @@
>
> DIAG(err_unterminated_string, ERROR,
>      "missing terminating \" character")
> +DIAG(err_too_much_dchars_rawstring, ERROR,
> +	 "The d-chars part of the string is too long (16 max)")
> DIAG(err_unterminated_char, ERROR,
>      "missing terminating ' character")
> DIAG(err_empty_character, ERROR,
> Index: include/clang/Basic/TokenKinds.def
> ===================================================================
> --- include/clang/Basic/TokenKinds.def	(revision 52781)
> +++ include/clang/Basic/TokenKinds.def	(working copy)
> @@ -106,6 +106,8 @@
> TOK(string_literal)      // "foo"
> TOK(wide_string_literal) // L"foo"
> TOK(angle_string_literal)// <foo>
> +TOK(raw_string_literal)  // R"**[foo]**" (N2442), support for u, U,  
> u8 and L?
> +TOK(wide_raw_string_literal) // LR"[foo]"
>
> // C99 6.4.6: Punctuators.
> TOK(l_square)            // [
> Index: include/clang/Lex/Lexer.h
> ===================================================================
> --- include/clang/Lex/Lexer.h	(revision 52781)
> +++ include/clang/Lex/Lexer.h	(working copy)
> @@ -361,7 +361,8 @@
>   // Helper functions to lex the remainder of a token of the  
> specific type.
>   void LexIdentifier         (Token &Result, const char *CurPtr);
>   void LexNumericConstant    (Token &Result, const char *CurPtr);
> -  void LexStringLiteral      (Token &Result, const char  
> *CurPtr,bool Wide);
> +  void LexStringLiteral      (Token &Result, const char *CurPtr,  
> bool Wide);
> +  void LexRawStringLiteral   (Token &Result, const char *CurPtr,  
> bool Wide);
>   void LexAngledStringLiteral(Token &Result, const char *CurPtr);
>   void LexCharConstant       (Token &Result, const char *CurPtr);
>   bool LexEndOfFile          (Token &Result, const char *CurPtr);
> Index: include/clang/Parse/Parser.h
> ===================================================================
> --- include/clang/Parse/Parser.h	(revision 52781)
> +++ include/clang/Parse/Parser.h	(working copy)
> @@ -106,7 +106,9 @@
>   ///
>   bool isTokenStringLiteral() const {
>     return Tok.getKind() == tok::string_literal ||
> -           Tok.getKind() == tok::wide_string_literal;
> +           Tok.getKind() == tok::wide_string_literal ||
> +           Tok.getKind() == tok::raw_string_literal  ||
> +           Tok.getKind() == tok::wide_raw_string_literal;
>   }
>
>   /// ConsumeToken - Consume the current 'peek token' and lex the  
> next one.
> Index: lib/Lex/Lexer.cpp
> ===================================================================
> --- lib/Lex/Lexer.cpp	(revision 52781)
> +++ lib/Lex/Lexer.cpp	(working copy)
> @@ -592,9 +592,82 @@
>   FormTokenWithChars(Result, CurPtr);
> }
>
> +void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,  
> bool Wide) {
> +/* TO REVIEW: should I let this comment here?
> +raw-string:
> +  "d-char-sequence opt [r-char-sequenceopt ]d-char-sequenceopt "
> +
> +r-char-sequence:
> +    r-char
> +    r-char-sequence r-char
> +
> +r-char:
> +    any member of the source character set, except, (1), a  
> backslash \ followed by a u or U, or, (2), a right square bracket ]
> +                        followed by the initial d-char-sequence  
> (which may be empty) followed by a double quote ".
> +    universal-character-name
> +
> +d-char-sequence:
> +    d-char
> +    d-char-sequence d-char
> +
> +d-char:
> +    any member of the basic source character set, except space, the  
> left square bracket [, the right square bracket ],
> +                        or the control characters representing  
> horizontal tab, vertical tab, form feed, or new-line.
> +*/
> +
> +  char dchar_seq[16];
> +  int  dchar_seq_len = 0;
> +
> +  // first read the optional d-char-sequence (0 to 16 characters)
> +  char C = getAndAdvanceChar(CurPtr, Result);
> +
> +  while (C != '[') {
> +    // FIXME: check the characters are in the allowed set
> +    if(dchar_seq_len>=16) {
> +      Diag(BufferPtr, diag::err_too_much_dchars_rawstring);
> +      // TO REVIEW: should we attempt to recuperate on error here?
> +      Result.setKind(tok::unknown);
> +      FormTokenWithChars(Result, CurPtr-1);
> +      return;
> +    }
> +    dchar_seq[dchar_seq_len++] = C;
> +    C = getAndAdvanceChar(CurPtr, Result);
> +  }
> +  // skip the '['
> +  C = getAndAdvanceChar(CurPtr, Result);
> +  while(1) {
> +    while (C != ']') {
> +      //if( a backslash \ followed by a u or U
> +      if (C == 0 && CurPtr-1 == BufferEnd) {  // End of file.
> +        if (!LexingRawMode) Diag(BufferPtr,  
> diag::err_unterminated_string);
> +        Result.setKind(tok::unknown);
> +        FormTokenWithChars(Result, CurPtr-1);
> +        return;
> +      }
> +      C = getAndAdvanceChar(CurPtr, Result);
> +    }
> +    int i;
> +    for(i=0;i<dchar_seq_len;++i) {
> +      C = getAndAdvanceChar(CurPtr, Result);
> +      // End of file handled in next loop iteration by previous code
> +      if(C!=dchar_seq[i]) break;
> +    }
> +    if(i==dchar_seq_len) {
> +      // End of file handled in next loop iteration by previous code
> +      C = getAndAdvanceChar(CurPtr, Result);
> +      if(C=='"') break;
> +    }
> +  }
> +
> +  Result.setKind(Wide ? tok::wide_raw_string_literal :  
> tok::raw_string_literal);
> +
> +  // Update the location of the token as well as the BufferPtr  
> instance var.
> +  FormTokenWithChars(Result, CurPtr);
> +}
> +
> /// LexStringLiteral - Lex the remainder of a string literal, after  
> having lexed
> /// either " or L".
> -void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,  
> bool Wide){
> +void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,  
> bool Wide) {
>   const char *NulCharacter = 0; // Does this string contain the \0  
> character?
>
>   char C = getAndAdvanceChar(CurPtr, Result);
> @@ -1284,6 +1357,21 @@
>     MIOpt.ReadToken();
>     return LexNumericConstant(Result, CurPtr);
>
> +  case 'R':   // Identifier (Racoon) or raw string literal (R"xyz").
> +    // Notify MIOpt that we read a non-whitespace/non-comment token.
> +    MIOpt.ReadToken();
> +    Char = getCharAndSize(CurPtr, SizeTmp);
> +
> +    // TO REVIEW: enable this only for C++0x, or any language with  
> extension
> +    // activated? or add a features like the pascal string?
> +
> +    // Raw string string literal.
> +    if (Char == '"' && getFeatures().CPlusPlus0x)
> +      return LexRawStringLiteral(Result, ConsumeChar(CurPtr,  
> SizeTmp, Result),
> +                                 false);
> +
> +    return LexIdentifier(Result, CurPtr);
> +
>   case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
>     // Notify MIOpt that we read a non-whitespace/non-comment token.
>     MIOpt.ReadToken();
> @@ -1293,7 +1381,14 @@
>     if (Char == '"')
>       return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp,  
> Result),
>                               true);
> -
> +// TO REVIEW: how to do this properly? if not followed by a ", need  
> to
> +// unconsume the char, perhaps saving the curptr and sizetmp var is  
> enough?
> +/*    else if(Char == 'R') {
> +      ConsumeChar(
> +      if(getCharAndSize(CurPtr,SizeTmp)=='"')
> +      return LexRawStringLiteral(Result, ConsumeChar(CurPtr,  
> SizeTmp, Result),
> +                                 true);
> +*/
>     // Wide character constant.
>     if (Char == '\'')
>       return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp,  
> Result));
> @@ -1302,7 +1397,7 @@
>   // C99 6.4.2: Identifiers.
>   case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case  
> 'G':
>   case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case  
> 'N':
> -  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case  
> 'U':
> +  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T': case  
> 'U':
>   case 'V': case 'W': case 'X': case 'Y': case 'Z':
>   case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case  
> 'g':
>   case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case  
> 'n':
> Index: lib/Lex/LiteralSupport.cpp
> ===================================================================
> --- lib/Lex/LiteralSupport.cpp	(revision 52781)
> +++ lib/Lex/LiteralSupport.cpp	(working copy)
> @@ -608,7 +608,8 @@
>       MaxTokenLength = StringToks[i].getLength();
>
>     // Remember if we see any wide strings.
> -    AnyWide |= StringToks[i].is(tok::wide_string_literal);
> +    AnyWide |= StringToks[i].is(tok::wide_string_literal)
> +            || StringToks[i].is(tok::wide_raw_string_literal);
>   }
>
>
> @@ -649,6 +650,7 @@
>     // Get the spelling of the token, which eliminates trigraphs,  
> etc.  We know
>     // that ThisTokBuf points to a buffer that is big enough for the  
> whole token
>     // and 'spelled' tokens can only shrink.
> +    // FIXME: what about raw string and trigraph, escaped end of  
> line ...??
>     unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf);
>     const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1;  // Skip end  
> quote.
>
> @@ -660,13 +662,34 @@
>       ++ThisTokBuf;
>       ThisIsWide = true;
>     }
> -
> +
> +    // Skip R marker for raw strings.
> +    bool ThisIsRaw = false;
> +    if (ThisTokBuf[0] == 'R') {
> +      ++ThisTokBuf;
> +      ThisIsRaw = true;
> +    }
> +
>     assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
>     ++ThisTokBuf;
> -
> +
> +    // if we have a raw string, skip the dchar sequence
> +    if(ThisIsRaw) {
> +      while(ThisTokBuf[0]!='[') {
> +        ++ThisTokBuf;
> +        --ThisTokEnd;
> +        // TO REVIEW: Assert needed?
> +        assert(ThisTokBuf<=ThisTokEnd
> +          && "Expected \"d-char seq[, lexer broken?");
> +      }
> +      ++ThisTokBuf;
> +      --ThisTokEnd;
> +    }
> +
>     // Check if this is a pascal string
>     if (pp.getLangOptions().PascalStrings && ThisTokBuf + 1 !=  
> ThisTokEnd &&
>         ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
> +      // TO REVIEW: has a pascal raw string a meaning?
>
>       // If the \p sequence is found in the first token, we have a  
> pascal string
>       // Otherwise, if we already have a pascal string, ignore the  
> first \p
> @@ -702,6 +725,18 @@
>         continue;
>       }
>
> +      // TO REVIEW: Would it be better to use a completly different  
> loop?
> +      if(ThisIsRaw) {
> +        ++ThisTokBuf;
> +        if(ThisTokBuf[0]=='u' || ThisTokBuf[0]=='U') {
> +          // FIXME: handle UCN
> +          assert(false && "UCN not yet implemented");
> +        } else {
> +          *ResultPtr++ = '\\';
> +          continue;
> +        }
> +      }
> +
>       // Otherwise, this is an escape character.  Process it.
>       unsigned ResultChar = ProcessCharEscape(ThisTokBuf,  
> ThisTokEnd, hadError,
>                                                
> StringToks[i].getLocation(),
> Index: lib/Parse/ParseExpr.cpp
> ===================================================================
> --- lib/Parse/ParseExpr.cpp	(revision 52781)
> +++ lib/Parse/ParseExpr.cpp	(working copy)
> @@ -534,6 +534,8 @@
>     return ParsePostfixExpressionSuffix(Res);
>   case tok::string_literal:    // primary-expression: string-literal
>   case tok::wide_string_literal:
> +  case tok::raw_string_literal:
> +  case tok::wide_raw_string_literal:
>     Res = ParseStringLiteralExpression();
>     if (Res.isInvalid) return Res;
>     // This can be followed by postfix-expr pieces (e.g. "foo"[1]).
> Index: lib/Parse/ParseObjc.cpp
> ===================================================================
> --- lib/Parse/ParseObjc.cpp	(revision 52781)
> +++ lib/Parse/ParseObjc.cpp	(working copy)
> @@ -1327,6 +1327,8 @@
>   switch (Tok.getKind()) {
>   case tok::string_literal:    // primary-expression: string-literal
>   case tok::wide_string_literal:
> +  case tok::raw_string_literal: // TO REVIEW: objC and C++0x  
> possible?
> +  case tok::wide_raw_string_literal:
>     return  
> ParsePostfixExpressionSuffix(ParseObjCStringLiteral(AtLoc));
>   default:
>     break;
> Index: lib/Parse/Parser.cpp
> ===================================================================
> --- lib/Parse/Parser.cpp	(revision 52781)
> +++ lib/Parse/Parser.cpp	(working copy)
> @@ -156,6 +156,8 @@
>
>     case tok::string_literal:
>     case tok::wide_string_literal:
> +    case tok::raw_string_literal:
> +    case tok::wide_raw_string_literal:
>       ConsumeStringToken();
>       break;
>     case tok::semi:
> // RUN: clang -emit-llvm -std=c++0x %s
> // FIXME: how to test that the strings are correct? this should emit:
> //@.str = internal constant [5 x i8] c"test\00"		; <[5 x i8]*>  
> [#uses=1]
> //@s1 = global i8* getelementptr ([5 x i8]* @.str, i32 0, i32 0),  
> align 4		; <i8**> [#uses=0]
> //@s2 = global i8* getelementptr ([5 x i8]* @.str, i32 0, i32 0),  
> align 4		; <i8**> [#uses=0]
> //@s3 = global i8* getelementptr ([5 x i8]* @.str, i32 0, i32 0),  
> align 4		; <i8**> [#uses=0]
> //@.str1 = internal constant [14 x i8] c"\5Ct\5C\5Ce]azertst\00"		;  
> <[14 x i8]*> [#uses=1]
> //@s4 = global i8* getelementptr ([14 x i8]* @.str1, i32 0, i32 0),  
> align 4		; <i8**> [#uses=0]
> //@.str2 = internal constant [30 x i8] c"a\0D\0Amultiline\0D\0Astring 
> \0D\0Aliteral\00"		; <[30 x i8]*> [#uses=1]
> //@s5 = global i8* getelementptr ([30 x i8]* @.str2, i32 0, i32 0),  
> align 4		; <i8**> [#uses=0]
>
> const char* s1 = "test";
> const char* s2 = R"[test]";
> const char* s3 = R"azerty[test]azerty";
> const char* s4 = R"azerty[\t\\e]azertst]azerty";
> const char* s5 = R"[\
> a
> multiline
> string
> literal]";
> const char* s6 = LR"[test]";
>
> // expected error:
> // const char* err1 = R"azerty[test]azetry";
>
> // RUN: clang -fsyntaxonly -verify -std=c++0x %s
> // should be -Eonly ? but verify don't work then?
>
> const char* err1 = R"12345678901234567[test]12345678901234567";
> const char* err2 = R"azerty[test]azetry"; // expected error:
>
> _______________________________________________
> cfe-dev mailing list
> cfe-dev at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/cfe-dev

— Gordon