<span style="font-family:arial,sans-serif;font-size:13px;background-color:rgb(255,255,255)">+  assert((Kind == StringLiteral::Ascii || Kind == StringLiteral::UTF8) &&</span><br style="font-family:arial,sans-serif;font-size:13px;background-color:rgb(255,255,255)">

<span style="font-family:arial,sans-serif;font-size:13px;background-color:rgb(255,255,255)">+         "Only narrow string literals are currently supported");</span><div><font face="arial, sans-serif"><br></font></div>

<div><font face="arial, sans-serif">If a non-narrow string literal is encountered with assertions disabled, will this just continue and silently corrupt the rest of the compilation? Or will parsing the non-narrow string literal fail gracefully somewhere earlier in the pipeline?<br>

</font><br><div class="gmail_quote">On Tue, Jun 12, 2012 at 10:37 PM, Richard Smith <span dir="ltr">&lt;<a href="mailto:richard-llvm@metafoo.co.uk" target="_blank">richard-llvm@metafoo.co.uk</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

Author: rsmith<br>

Date: Wed Jun 13 00:37:23 2012<br>

New Revision: 158390<br>

<br>

URL: <a href="http://llvm.org/viewvc/llvm-project?rev=158390&view=rev" target="_blank">http://llvm.org/viewvc/llvm-project?rev=158390&view=rev</a><br>

Log:<br>

PR13099: Teach -Wformat about raw string literals, UTF-8 strings and Unicode escape sequences.<br>

<br>

Modified:<br>

    cfe/trunk/lib/AST/Expr.cpp<br>

    cfe/trunk/lib/Lex/LiteralSupport.cpp<br>

    cfe/trunk/lib/Sema/SemaChecking.cpp<br>

    cfe/trunk/test/SemaCXX/format-strings-0x.cpp<br>

<br>

Modified: cfe/trunk/lib/AST/Expr.cpp<br>

URL: <a href="http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/Expr.cpp?rev=158390&r1=158389&r2=158390&view=diff" target="_blank">http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/Expr.cpp?rev=158390&r1=158389&r2=158390&view=diff</a><br>


==============================================================================<br>

--- cfe/trunk/lib/AST/Expr.cpp (original)<br>

+++ cfe/trunk/lib/AST/Expr.cpp Wed Jun 13 00:37:23 2012<br>

@@ -679,7 +679,8 @@<br>

 SourceLocation StringLiteral::<br>

 getLocationOfByte(unsigned ByteNo, const SourceManager &SM,<br>

                   const LangOptions &Features, const TargetInfo &Target) const {<br>

-  assert(Kind == StringLiteral::Ascii && "This only works for ASCII strings");<br>

+  assert((Kind == StringLiteral::Ascii || Kind == StringLiteral::UTF8) &&<br>

+         "Only narrow string literals are currently supported");<br>

<br>

   // Loop over all of the tokens in this string until we find the one that<br>

   // contains the byte we're looking for.<br>

<br>

Modified: cfe/trunk/lib/Lex/LiteralSupport.cpp<br>

URL: <a href="http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/LiteralSupport.cpp?rev=158390&r1=158389&r2=158390&view=diff" target="_blank">http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/LiteralSupport.cpp?rev=158390&r1=158389&r2=158390&view=diff</a><br>


==============================================================================<br>

--- cfe/trunk/lib/Lex/LiteralSupport.cpp (original)<br>

+++ cfe/trunk/lib/Lex/LiteralSupport.cpp Wed Jun 13 00:37:23 2012<br>

@@ -250,6 +250,39 @@<br>

   return true;<br>

 }<br>

<br>

+/// MeasureUCNEscape - Determine the number of bytes within the resulting string<br>

+/// which this UCN will occupy.<br>

+static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,<br>

+                            const char *ThisTokEnd, unsigned CharByteWidth,<br>

+                            const LangOptions &Features, bool &HadError) {<br>

+  // UTF-32: 4 bytes per escape.<br>

+  if (CharByteWidth == 4)<br>

+    return 4;<br>

+<br>

+  uint32_t UcnVal = 0;<br>

+  unsigned short UcnLen = 0;<br>

+  FullSourceLoc Loc;<br>

+<br>

+  if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,<br>

+                        UcnLen, Loc, 0, Features, true)) {<br>

+    HadError = true;<br>

+    return 0;<br>

+  }<br>

+<br>

+  // UTF-16: 2 bytes for BMP, 4 bytes otherwise.<br>

+  if (CharByteWidth == 2)<br>

+    return UcnVal <= 0xFFFF ? 2 : 4;<br>

+<br>

+  // UTF-8.<br>

+  if (UcnVal < 0x80)<br>

+    return 1;<br>

+  if (UcnVal < 0x800)<br>

+    return 2;<br>

+  if (UcnVal < 0x10000)<br>

+    return 3;<br>

+  return 4;<br>

+}<br>

+<br>

 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and<br>

 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of<br>

 /// StringLiteralParser. When we decide to implement UCN's for identifiers,<br>

@@ -265,7 +298,7 @@<br>

   unsigned short UcnLen = 0;<br>

   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,<br>

                         Loc, Diags, Features, true)) {<br>

-    HadError = 1;<br>

+    HadError = true;<br>

     return;<br>

   }<br>

<br>

@@ -1369,14 +1402,31 @@<br>

   if (StringInvalid)<br>

     return 0;<br>

<br>

+  const char *SpellingStart = SpellingPtr;<br>

+  const char *SpellingEnd = SpellingPtr+TokLen;<br>

+<br>

+  // Handle UTF-8 strings just like narrow strings.<br>

+  if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')<br>

+    SpellingPtr += 2;<br>

+<br>

   assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&<br>

          SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");<br>

<br>

+  // For raw string literals, this is easy.<br>

+  if (SpellingPtr[0] == 'R') {<br>

+    assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");<br>

+    // Skip 'R"'.<br>

+    SpellingPtr += 2;<br>

+    while (*SpellingPtr != '(') {<br>

+      ++SpellingPtr;<br>

+      assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");<br>

+    }<br>

+    // Skip '('.<br>

+    ++SpellingPtr;<br>

+    return SpellingPtr - SpellingStart + ByteNo;<br>

+  }<br>

<br>

-  const char *SpellingStart = SpellingPtr;<br>

-  const char *SpellingEnd = SpellingPtr+TokLen;<br>

-<br>

-  // Skip over the leading quote.<br>

+  // Skip over the leading quote<br>

   assert(SpellingPtr[0] == '"' && "Should be a string literal!");<br>

   ++SpellingPtr;<br>

<br>

@@ -1393,11 +1443,23 @@<br>

<br>

     // Otherwise, this is an escape character.  Advance over it.<br>

     bool HadError = false;<br>

-    ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,<br>

-                      FullSourceLoc(Tok.getLocation(), SM),<br>

-                      CharByteWidth*8, Diags);<br>

+    if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {<br>

+      const char *EscapePtr = SpellingPtr;<br>

+      unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,<br>

+                                      1, Features, HadError);<br>

+      if (Len > ByteNo) {<br>

+        // ByteNo is somewhere within the escape sequence.<br>

+        SpellingPtr = EscapePtr;<br>

+        break;<br>

+      }<br>

+      ByteNo -= Len;<br>

+    } else {<br>

+      ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,<br>

+                        FullSourceLoc(Tok.getLocation(), SM),<br>

+                        CharByteWidth*8, Diags);<br>

+      --ByteNo;<br>

+    }<br>

     assert(!HadError && "This method isn't valid on erroneous strings");<br>

-    --ByteNo;<br>

   }<br>

<br>

   return SpellingPtr-SpellingStart;<br>

<br>

Modified: cfe/trunk/lib/Sema/SemaChecking.cpp<br>

URL: <a href="http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaChecking.cpp?rev=158390&r1=158389&r2=158390&view=diff" target="_blank">http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaChecking.cpp?rev=158390&r1=158389&r2=158390&view=diff</a><br>


==============================================================================<br>

--- cfe/trunk/lib/Sema/SemaChecking.cpp (original)<br>

+++ cfe/trunk/lib/Sema/SemaChecking.cpp Wed Jun 13 00:37:23 2012<br>

@@ -2633,7 +2633,7 @@<br>

                              bool inFunctionCall) {<br>

<br>

   // CHECK: is the format string a wide literal?<br>

-  if (!FExpr->isAscii()) {<br>

+  if (!FExpr->isAscii() && !FExpr->isUTF8()) {<br>

     CheckFormatHandler::EmitFormatDiagnostic(<br>

       *this, inFunctionCall, Args[format_idx],<br>

       PDiag(diag::warn_format_string_is_wide_literal), FExpr->getLocStart(),<br>

<br>

Modified: cfe/trunk/test/SemaCXX/format-strings-0x.cpp<br>

URL: <a href="http://llvm.org/viewvc/llvm-project/cfe/trunk/test/SemaCXX/format-strings-0x.cpp?rev=158390&r1=158389&r2=158390&view=diff" target="_blank">http://llvm.org/viewvc/llvm-project/cfe/trunk/test/SemaCXX/format-strings-0x.cpp?rev=158390&r1=158389&r2=158390&view=diff</a><br>


==============================================================================<br>

--- cfe/trunk/test/SemaCXX/format-strings-0x.cpp (original)<br>

+++ cfe/trunk/test/SemaCXX/format-strings-0x.cpp Wed Jun 13 00:37:23 2012<br>

@@ -12,4 +12,16 @@<br>

   scanf("%afoobar", fp);<br>

   printf(nullptr);<br>

   printf(*sp); // expected-warning {{not a string literal}}<br>

+<br>

+  // PR13099<br>

+  printf(<br>

+    R"foobar(%)foobar"<br>

+    R"bazquux(d)bazquux" // expected-warning {{more '%' conversions than data arguments}}<br>

+    R"xyzzy()xyzzy");<br>

+<br>

+  printf(u8"this is %d test", 0); // ok<br>

+  printf(u8R"foo(<br>

+      \u1234\U0010fffe<br>

+      %d)foo" // expected-warning {{more '%' conversions than data arguments}}<br>

+  );<br>

 }<br>

<br>

<br>

_______________________________________________<br>

cfe-commits mailing list<br>

<a href="mailto:cfe-commits@cs.uiuc.edu">cfe-commits@cs.uiuc.edu</a><br>

<a href="http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits" target="_blank">http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits</a><br>

</blockquote></div><br></div>