[cfe-commits] r158390 - in /cfe/trunk: lib/AST/Expr.cpp lib/Lex/LiteralSupport.cpp lib/Sema/SemaChecking.cpp test/SemaCXX/format-strings-0x.cpp

Tue Jun 12 23:41:52 PDT 2012

+  assert((Kind == StringLiteral::Ascii || Kind == StringLiteral::UTF8) &&
+         "Only narrow string literals are currently supported");

If a non-narrow string literal is encountered with asserts disabled, will
this just continue on and silently corrupt the rest of the compilation? Or
will parsing of the non-narrow string literal fail gracefully somewhere
earlier in the pipeline?

On Tue, Jun 12, 2012 at 10:37 PM, Richard Smith
<richard-llvm at metafoo.co.uk> wrote:

> Author: rsmith
> Date: Wed Jun 13 00:37:23 2012
> New Revision: 158390
>
> URL: http://llvm.org/viewvc/llvm-project?rev=158390&view=rev
> Log:
> PR13099: Teach -Wformat about raw string literals, UTF-8 strings and
> Unicode escape sequences.
>
> Modified:
>    cfe/trunk/lib/AST/Expr.cpp
>    cfe/trunk/lib/Lex/LiteralSupport.cpp
>    cfe/trunk/lib/Sema/SemaChecking.cpp
>    cfe/trunk/test/SemaCXX/format-strings-0x.cpp
>
> Modified: cfe/trunk/lib/AST/Expr.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/Expr.cpp?rev=158390&r1=158389&r2=158390&view=diff
>
> ==============================================================================
> --- cfe/trunk/lib/AST/Expr.cpp (original)
> +++ cfe/trunk/lib/AST/Expr.cpp Wed Jun 13 00:37:23 2012
> @@ -679,7 +679,8 @@
>  SourceLocation StringLiteral::
>  getLocationOfByte(unsigned ByteNo, const SourceManager &SM,
>                   const LangOptions &Features, const TargetInfo &Target)
> const {
> -  assert(Kind == StringLiteral::Ascii && "This only works for ASCII
> strings");
> +  assert((Kind == StringLiteral::Ascii || Kind == StringLiteral::UTF8) &&
> +         "Only narrow string literals are currently supported");
>
>   // Loop over all of the tokens in this string until we find the one that
>   // contains the byte we're looking for.
>
> Modified: cfe/trunk/lib/Lex/LiteralSupport.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/LiteralSupport.cpp?rev=158390&r1=158389&r2=158390&view=diff
>
> ==============================================================================
> --- cfe/trunk/lib/Lex/LiteralSupport.cpp (original)
> +++ cfe/trunk/lib/Lex/LiteralSupport.cpp Wed Jun 13 00:37:23 2012
> @@ -250,6 +250,39 @@
>   return true;
>  }
>
> +/// MeasureUCNEscape - Determine the number of bytes within the resulting
> string
> +/// which this UCN will occupy.
> +static int MeasureUCNEscape(const char *ThisTokBegin, const char
> *&ThisTokBuf,
> +                            const char *ThisTokEnd, unsigned
> CharByteWidth,
> +                            const LangOptions &Features, bool &HadError) {
> +  // UTF-32: 4 bytes per escape.
> +  if (CharByteWidth == 4)
> +    return 4;
> +
> +  uint32_t UcnVal = 0;
> +  unsigned short UcnLen = 0;
> +  FullSourceLoc Loc;
> +
> +  if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
> +                        UcnLen, Loc, 0, Features, true)) {
> +    HadError = true;
> +    return 0;
> +  }
> +
> +  // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
> +  if (CharByteWidth == 2)
> +    return UcnVal <= 0xFFFF ? 2 : 4;
> +
> +  // UTF-8.
> +  if (UcnVal < 0x80)
> +    return 1;
> +  if (UcnVal < 0x800)
> +    return 2;
> +  if (UcnVal < 0x10000)
> +    return 3;
> +  return 4;
> +}
> +
>  /// EncodeUCNEscape - Read the Universal Character Name, check
> constraints and
>  /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
>  /// StringLiteralParser. When we decide to implement UCN's for
> identifiers,
> @@ -265,7 +298,7 @@
>   unsigned short UcnLen = 0;
>   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
> UcnLen,
>                         Loc, Diags, Features, true)) {
> -    HadError = 1;
> +    HadError = true;
>     return;
>   }
>
> @@ -1369,14 +1402,31 @@
>   if (StringInvalid)
>     return 0;
>
> +  const char *SpellingStart = SpellingPtr;
> +  const char *SpellingEnd = SpellingPtr+TokLen;
> +
> +  // Handle UTF-8 strings just like narrow strings.
> +  if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
> +    SpellingPtr += 2;
> +
>   assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
>          SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings
> yet");
>
> +  // For raw string literals, this is easy.
> +  if (SpellingPtr[0] == 'R') {
> +    assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
> +    // Skip 'R"'.
> +    SpellingPtr += 2;
> +    while (*SpellingPtr != '(') {
> +      ++SpellingPtr;
> +      assert(SpellingPtr < SpellingEnd && "Missing ( for raw string
> literal");
> +    }
> +    // Skip '('.
> +    ++SpellingPtr;
> +    return SpellingPtr - SpellingStart + ByteNo;
> +  }
>
> -  const char *SpellingStart = SpellingPtr;
> -  const char *SpellingEnd = SpellingPtr+TokLen;
> -
> -  // Skip over the leading quote.
> +  // Skip over the leading quote
>   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
>   ++SpellingPtr;
>
> @@ -1393,11 +1443,23 @@
>
>     // Otherwise, this is an escape character.  Advance over it.
>     bool HadError = false;
> -    ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
> -                      FullSourceLoc(Tok.getLocation(), SM),
> -                      CharByteWidth*8, Diags);
> +    if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
> +      const char *EscapePtr = SpellingPtr;
> +      unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr,
> SpellingEnd,
> +                                      1, Features, HadError);
> +      if (Len > ByteNo) {
> +        // ByteNo is somewhere within the escape sequence.
> +        SpellingPtr = EscapePtr;
> +        break;
> +      }
> +      ByteNo -= Len;
> +    } else {
> +      ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
> +                        FullSourceLoc(Tok.getLocation(), SM),
> +                        CharByteWidth*8, Diags);
> +      --ByteNo;
> +    }
>     assert(!HadError && "This method isn't valid on erroneous strings");
> -    --ByteNo;
>   }
>
>   return SpellingPtr-SpellingStart;
>
> Modified: cfe/trunk/lib/Sema/SemaChecking.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaChecking.cpp?rev=158390&r1=158389&r2=158390&view=diff
>
> ==============================================================================
> --- cfe/trunk/lib/Sema/SemaChecking.cpp (original)
> +++ cfe/trunk/lib/Sema/SemaChecking.cpp Wed Jun 13 00:37:23 2012
> @@ -2633,7 +2633,7 @@
>                              bool inFunctionCall) {
>
>   // CHECK: is the format string a wide literal?
> -  if (!FExpr->isAscii()) {
> +  if (!FExpr->isAscii() && !FExpr->isUTF8()) {
>     CheckFormatHandler::EmitFormatDiagnostic(
>       *this, inFunctionCall, Args[format_idx],
>       PDiag(diag::warn_format_string_is_wide_literal),
> FExpr->getLocStart(),
>
> Modified: cfe/trunk/test/SemaCXX/format-strings-0x.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/cfe/trunk/test/SemaCXX/format-strings-0x.cpp?rev=158390&r1=158389&r2=158390&view=diff
>
> ==============================================================================
> --- cfe/trunk/test/SemaCXX/format-strings-0x.cpp (original)
> +++ cfe/trunk/test/SemaCXX/format-strings-0x.cpp Wed Jun 13 00:37:23 2012
> @@ -12,4 +12,16 @@
>   scanf("%afoobar", fp);
>   printf(nullptr);
>   printf(*sp); // expected-warning {{not a string literal}}
> +
> +  // PR13099
> +  printf(
> +    R"foobar(%)foobar"
> +    R"bazquux(d)bazquux" // expected-warning {{more '%' conversions than
> data arguments}}
> +    R"xyzzy()xyzzy");
> +
> +  printf(u8"this is %d test", 0); // ok
> +  printf(u8R"foo(
> +      \u1234\U0010fffe
> +      %d)foo" // expected-warning {{more '%' conversions than data
> arguments}}
> +  );
>  }
>
>
> _______________________________________________
> cfe-commits mailing list
> cfe-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20120612/cac25230/attachment.html>