[cfe-dev] Review request: Patch for adding raw string support to clang

Thu Jun 26 10:41:14 PDT 2008

On Jun 26, 2008, at 7:41 AM, Gordon Henriksen wrote:

> Hi Cédric,
>
> Please try this procedure when submitting your final version of this
> patch:
>
> http://lists.cs.uiuc.edu/pipermail/llvmdev/2008-January/011992.html

Hey Gordon, can you add that info to the developer policy guide?

-Chris

>
>
> On Jun 26, 2008, at 09:05, Cédric Venet wrote:
>
>> Hi,
>>
>> I mostly implemented the C++0x proposal for raw strings. You can
>> declare a string like:
>>
>> const char* s2 = R"[test]"; // test
>> const char* s3 = R"azerty[test]azerty"; // test
>> const char* s4 = R"azerty[\t\\e]azertst]azerty"; // \t\\e]azertst
>> const char* s5 = R"[\
>> a
>> multiline
>> string
>> literal]";
>>
>> This isn't a very important functionality, but it can be useful
>> (think regex). I have some points on which I would like comments,
>> marked with TO REVIEW.
>> The patch is not yet ready for commit, but it handles the mentioned
>> cases without problem.
>>
>> Regards,
>>
>> Cédric
>> Index: include/clang/Basic/DiagnosticKinds.def
>> ===================================================================
>> --- include/clang/Basic/DiagnosticKinds.def	(revision 52781)
>> +++ include/clang/Basic/DiagnosticKinds.def	(working copy)
>> @@ -63,6 +63,8 @@
>>
>> DIAG(err_unterminated_string, ERROR,
>>     "missing terminating \" character")
>> +DIAG(err_too_much_dchars_rawstring, ERROR,
>> +	 "The d-chars part of the string is too long (16 max)")
>> DIAG(err_unterminated_char, ERROR,
>>     "missing terminating ' character")
>> DIAG(err_empty_character, ERROR,
>> Index: include/clang/Basic/TokenKinds.def
>> ===================================================================
>> --- include/clang/Basic/TokenKinds.def	(revision 52781)
>> +++ include/clang/Basic/TokenKinds.def	(working copy)
>> @@ -106,6 +106,8 @@
>> TOK(string_literal)      // "foo"
>> TOK(wide_string_literal) // L"foo"
>> TOK(angle_string_literal)// <foo>
>> +TOK(raw_string_literal)  // R"**[foo]**" (N2442), support for u, U,
>> u8 and L?
>> +TOK(wide_raw_string_literal) // LR"[foo]"
>>
>> // C99 6.4.6: Punctuators.
>> TOK(l_square)            // [
>> Index: include/clang/Lex/Lexer.h
>> ===================================================================
>> --- include/clang/Lex/Lexer.h	(revision 52781)
>> +++ include/clang/Lex/Lexer.h	(working copy)
>> @@ -361,7 +361,8 @@
>>  // Helper functions to lex the remainder of a token of the
>> specific type.
>>  void LexIdentifier         (Token &Result, const char *CurPtr);
>>  void LexNumericConstant    (Token &Result, const char *CurPtr);
>> -  void LexStringLiteral      (Token &Result, const char
>> *CurPtr,bool Wide);
>> +  void LexStringLiteral      (Token &Result, const char *CurPtr,
>> bool Wide);
>> +  void LexRawStringLiteral   (Token &Result, const char *CurPtr,
>> bool Wide);
>>  void LexAngledStringLiteral(Token &Result, const char *CurPtr);
>>  void LexCharConstant       (Token &Result, const char *CurPtr);
>>  bool LexEndOfFile          (Token &Result, const char *CurPtr);
>> Index: include/clang/Parse/Parser.h
>> ===================================================================
>> --- include/clang/Parse/Parser.h	(revision 52781)
>> +++ include/clang/Parse/Parser.h	(working copy)
>> @@ -106,7 +106,9 @@
>>  ///
>>  bool isTokenStringLiteral() const {
>>    return Tok.getKind() == tok::string_literal ||
>> -           Tok.getKind() == tok::wide_string_literal;
>> +           Tok.getKind() == tok::wide_string_literal ||
>> +           Tok.getKind() == tok::raw_string_literal  ||
>> +           Tok.getKind() == tok::wide_raw_string_literal;
>>  }
>>
>>  /// ConsumeToken - Consume the current 'peek token' and lex the
>> next one.
>> Index: lib/Lex/Lexer.cpp
>> ===================================================================
>> --- lib/Lex/Lexer.cpp	(revision 52781)
>> +++ lib/Lex/Lexer.cpp	(working copy)
>> @@ -592,9 +592,82 @@
>>  FormTokenWithChars(Result, CurPtr);
>> }
>>
>> +void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
>> bool Wide) {
>> +/* TO REVIEW: should I let this comment here?
>> +raw-string:
>> +  "d-char-sequence opt [r-char-sequenceopt ]d-char-sequenceopt "
>> +
>> +r-char-sequence:
>> +    r-char
>> +    r-char-sequence r-char
>> +
>> +r-char:
>> +    any member of the source character set, except, (1), a
>> backslash \ followed by a u or U, or, (2), a right square bracket ]
>> +                        followed by the initial d-char-sequence
>> (which may be empty) followed by a double quote ".
>> +    universal-character-name
>> +
>> +d-char-sequence:
>> +    d-char
>> +    d-char-sequence d-char
>> +
>> +d-char:
>> +    any member of the basic source character set, except space, the
>> left square bracket [, the right square bracket ],
>> +                        or the control characters representing
>> horizontal tab, vertical tab, form feed, or new-line.
>> +*/
>> +
>> +  char dchar_seq[16];
>> +  int  dchar_seq_len = 0;
>> +
>> +  // first read the optional d-char-sequence (0 to 16 characters)
>> +  char C = getAndAdvanceChar(CurPtr, Result);
>> +
>> +  while (C != '[') {
>> +    // FIXME: check the characters are in the allowed set
>> +    if(dchar_seq_len>=16) {
>> +      Diag(BufferPtr, diag::err_too_much_dchars_rawstring);
>> +      // TO REVIEW: should we attempt to recuperate on error here?
>> +      Result.setKind(tok::unknown);
>> +      FormTokenWithChars(Result, CurPtr-1);
>> +      return;
>> +    }
>> +    dchar_seq[dchar_seq_len++] = C;
>> +    C = getAndAdvanceChar(CurPtr, Result);
>> +  }
>> +  // skip the '['
>> +  C = getAndAdvanceChar(CurPtr, Result);
>> +  while(1) {
>> +    while (C != ']') {
>> +      //if( a backslash \ followed by a u or U
>> +      if (C == 0 && CurPtr-1 == BufferEnd) {  // End of file.
>> +        if (!LexingRawMode) Diag(BufferPtr,
>> diag::err_unterminated_string);
>> +        Result.setKind(tok::unknown);
>> +        FormTokenWithChars(Result, CurPtr-1);
>> +        return;
>> +      }
>> +      C = getAndAdvanceChar(CurPtr, Result);
>> +    }
>> +    int i;
>> +    for(i=0;i<dchar_seq_len;++i) {
>> +      C = getAndAdvanceChar(CurPtr, Result);
>> +      // End of file handled in next loop iteration by previous code
>> +      if(C!=dchar_seq[i]) break;
>> +    }
>> +    if(i==dchar_seq_len) {
>> +      // End of file handled in next loop iteration by previous code
>> +      C = getAndAdvanceChar(CurPtr, Result);
>> +      if(C=='"') break;
>> +    }
>> +  }
>> +
>> +  Result.setKind(Wide ? tok::wide_raw_string_literal :
>> tok::raw_string_literal);
>> +
>> +  // Update the location of the token as well as the BufferPtr
>> instance var.
>> +  FormTokenWithChars(Result, CurPtr);
>> +}
>> +
>> /// LexStringLiteral - Lex the remainder of a string literal, after
>> having lexed
>> /// either " or L".
>> -void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
>> bool Wide){
>> +void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
>> bool Wide) {
>>  const char *NulCharacter = 0; // Does this string contain the \0
>> character?
>>
>>  char C = getAndAdvanceChar(CurPtr, Result);
>> @@ -1284,6 +1357,21 @@
>>    MIOpt.ReadToken();
>>    return LexNumericConstant(Result, CurPtr);
>>
>> +  case 'R':   // Identifier (Racoon) or raw string literal (R"xyz").
>> +    // Notify MIOpt that we read a non-whitespace/non-comment token.
>> +    MIOpt.ReadToken();
>> +    Char = getCharAndSize(CurPtr, SizeTmp);
>> +
>> +    // TO REVIEW: enable this only for C++0x, or any language with
>> extension
>> +    // activated? or add a features like the pascal string?
>> +
>> +    // Raw string string literal.
>> +    if (Char == '"' && getFeatures().CPlusPlus0x)
>> +      return LexRawStringLiteral(Result, ConsumeChar(CurPtr,
>> SizeTmp, Result),
>> +                                 false);
>> +
>> +    return LexIdentifier(Result, CurPtr);
>> +
>>  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
>>    // Notify MIOpt that we read a non-whitespace/non-comment token.
>>    MIOpt.ReadToken();
>> @@ -1293,7 +1381,14 @@
>>    if (Char == '"')
>>      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp,
>> Result),
>>                              true);
>> -
>> +// TO REVIEW: how to do this properly? if not followed by a ", need
>> to
>> +// unconsume the char, perhaps saving the curptr and sizetmp var is
>> enough?
>> +/*    else if(Char == 'R') {
>> +      ConsumeChar(
>> +      if(getCharAndSize(CurPtr,SizeTmp)=='"')
>> +      return LexRawStringLiteral(Result, ConsumeChar(CurPtr,
>> SizeTmp, Result),
>> +                                 true);
>> +*/
>>    // Wide character constant.
>>    if (Char == '\'')
>>      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp,
>> Result));
>> @@ -1302,7 +1397,7 @@
>>  // C99 6.4.2: Identifiers.
>>  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case
>> 'G':
>>  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case
>> 'N':
>> -  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case
>> 'U':
>> +  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T': case
>> 'U':
>>  case 'V': case 'W': case 'X': case 'Y': case 'Z':
>>  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case
>> 'g':
>>  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case
>> 'n':
>> Index: lib/Lex/LiteralSupport.cpp
>> ===================================================================
>> --- lib/Lex/LiteralSupport.cpp	(revision 52781)
>> +++ lib/Lex/LiteralSupport.cpp	(working copy)
>> @@ -608,7 +608,8 @@
>>      MaxTokenLength = StringToks[i].getLength();
>>
>>    // Remember if we see any wide strings.
>> -    AnyWide |= StringToks[i].is(tok::wide_string_literal);
>> +    AnyWide |= StringToks[i].is(tok::wide_string_literal)
>> +            || StringToks[i].is(tok::wide_raw_string_literal);
>>  }
>>
>>
>> @@ -649,6 +650,7 @@
>>    // Get the spelling of the token, which eliminates trigraphs,
>> etc.  We know
>>    // that ThisTokBuf points to a buffer that is big enough for the
>> whole token
>>    // and 'spelled' tokens can only shrink.
>> +    // FIXME: what about raw string and trigraph, escaped end of
>> line ...??
>>    unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf);
>>    const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1;  // Skip end
>> quote.
>>
>> @@ -660,13 +662,34 @@
>>      ++ThisTokBuf;
>>      ThisIsWide = true;
>>    }
>> -
>> +
>> +    // Skip R marker for raw strings.
>> +    bool ThisIsRaw = false;
>> +    if (ThisTokBuf[0] == 'R') {
>> +      ++ThisTokBuf;
>> +      ThisIsRaw = true;
>> +    }
>> +
>>    assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
>>    ++ThisTokBuf;
>> -
>> +
>> +    // if we have a raw string, skip the dchar sequence
>> +    if(ThisIsRaw) {
>> +      while(ThisTokBuf[0]!='[') {
>> +        ++ThisTokBuf;
>> +        --ThisTokEnd;
>> +        // TO REVIEW: Assert needed?
>> +        assert(ThisTokBuf<=ThisTokEnd
>> +          && "Expected \"d-char seq[, lexer broken?");
>> +      }
>> +      ++ThisTokBuf;
>> +      --ThisTokEnd;
>> +    }
>> +
>>    // Check if this is a pascal string
>>    if (pp.getLangOptions().PascalStrings && ThisTokBuf + 1 !=
>> ThisTokEnd &&
>>        ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
>> +      // TO REVIEW: has a pascal raw string a meaning?
>>
>>      // If the \p sequence is found in the first token, we have a
>> pascal string
>>      // Otherwise, if we already have a pascal string, ignore the
>> first \p
>> @@ -702,6 +725,18 @@
>>        continue;
>>      }
>>
>> +      // TO REVIEW: Would it be better to use a completly different
>> loop?
>> +      if(ThisIsRaw) {
>> +        ++ThisTokBuf;
>> +        if(ThisTokBuf[0]=='u' || ThisTokBuf[0]=='U') {
>> +          // FIXME: handle UCN
>> +          assert(false && "UCN not yet implemented");
>> +        } else {
>> +          *ResultPtr++ = '\\';
>> +          continue;
>> +        }
>> +      }
>> +
>>      // Otherwise, this is an escape character.  Process it.
>>      unsigned ResultChar = ProcessCharEscape(ThisTokBuf,
>> ThisTokEnd, hadError,
>>
>> StringToks[i].getLocation(),
>> Index: lib/Parse/ParseExpr.cpp
>> ===================================================================
>> --- lib/Parse/ParseExpr.cpp	(revision 52781)
>> +++ lib/Parse/ParseExpr.cpp	(working copy)
>> @@ -534,6 +534,8 @@
>>    return ParsePostfixExpressionSuffix(Res);
>>  case tok::string_literal:    // primary-expression: string-literal
>>  case tok::wide_string_literal:
>> +  case tok::raw_string_literal:
>> +  case tok::wide_raw_string_literal:
>>    Res = ParseStringLiteralExpression();
>>    if (Res.isInvalid) return Res;
>>    // This can be followed by postfix-expr pieces (e.g. "foo"[1]).
>> Index: lib/Parse/ParseObjc.cpp
>> ===================================================================
>> --- lib/Parse/ParseObjc.cpp	(revision 52781)
>> +++ lib/Parse/ParseObjc.cpp	(working copy)
>> @@ -1327,6 +1327,8 @@
>>  switch (Tok.getKind()) {
>>  case tok::string_literal:    // primary-expression: string-literal
>>  case tok::wide_string_literal:
>> +  case tok::raw_string_literal: // TO REVIEW: objC and C++0x
>> possible?
>> +  case tok::wide_raw_string_literal:
>>    return
>> ParsePostfixExpressionSuffix(ParseObjCStringLiteral(AtLoc));
>>  default:
>>    break;
>> Index: lib/Parse/Parser.cpp
>> ===================================================================
>> --- lib/Parse/Parser.cpp	(revision 52781)
>> +++ lib/Parse/Parser.cpp	(working copy)
>> @@ -156,6 +156,8 @@
>>
>>    case tok::string_literal:
>>    case tok::wide_string_literal:
>> +    case tok::raw_string_literal:
>> +    case tok::wide_raw_string_literal:
>>      ConsumeStringToken();
>>      break;
>>    case tok::semi:
>> // RUN: clang -emit-llvm -std=c++0x %s
>> // FIXME: how to test that the strings are correct? this should emit:
>> //@.str = internal constant [5 x i8] c"test\00"		; <[5 x i8]*>
>> [#uses=1]
>> //@s1 = global i8* getelementptr ([5 x i8]* @.str, i32 0, i32 0),
>> align 4		; <i8**> [#uses=0]
>> //@s2 = global i8* getelementptr ([5 x i8]* @.str, i32 0, i32 0),
>> align 4		; <i8**> [#uses=0]
>> //@s3 = global i8* getelementptr ([5 x i8]* @.str, i32 0, i32 0),
>> align 4		; <i8**> [#uses=0]
>> //@.str1 = internal constant [14 x i8] c"\5Ct\5C\5Ce]azertst\00"		;
>> <[14 x i8]*> [#uses=1]
>> //@s4 = global i8* getelementptr ([14 x i8]* @.str1, i32 0, i32 0),
>> align 4		; <i8**> [#uses=0]
>> //@.str2 = internal constant [30 x i8] c"a\0D\0Amultiline\0D\0Astring
>> \0D\0Aliteral\00"		; <[30 x i8]*> [#uses=1]
>> //@s5 = global i8* getelementptr ([30 x i8]* @.str2, i32 0, i32 0),
>> align 4		; <i8**> [#uses=0]
>>
>> const char* s1 = "test";
>> const char* s2 = R"[test]";
>> const char* s3 = R"azerty[test]azerty";
>> const char* s4 = R"azerty[\t\\e]azertst]azerty";
>> const char* s5 = R"[\
>> a
>> multiline
>> string
>> literal]";
>> const char* s6 = LR"[test]";
>>
>> // expected error:
>> // const char* err1 = R"azerty[test]azetry";
>>
>> // RUN: clang -fsyntaxonly -verify -std=c++0x %s
>> // should be -Eonly ? but verify don't work then?
>>
>> const char* err1 = R"12345678901234567[test]12345678901234567";
>> const char* err2 = R"azerty[test]azetry"; // expected error:
>>
>> _______________________________________________
>> cfe-dev mailing list
>> cfe-dev at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/cfe-dev
>
>
>
> — Gordon
>
>
> _______________________________________________
> cfe-dev mailing list
> cfe-dev at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/cfe-dev