<span style="font-family:arial,sans-serif;font-size:13px;background-color:rgb(255,255,255)">+ assert((Kind == StringLiteral::Ascii || Kind == StringLiteral::UTF8) &&</span><br style="font-family:arial,sans-serif;font-size:13px;background-color:rgb(255,255,255)">
<span style="font-family:arial,sans-serif;font-size:13px;background-color:rgb(255,255,255)">+ "Only narrow string literals are currently supported");</span><div><font face="arial, sans-serif"><br></font></div>
<div><font face="arial, sans-serif">If a non-narrow string literal is encountered in a build with assertions disabled, will this just continue on and silently corrupt the rest of the compilation? Or will parsing the non-narrow string literal fail gracefully somewhere earlier in the pipeline?<br>
</font><br><div class="gmail_quote">On Tue, Jun 12, 2012 at 10:37 PM, Richard Smith <span dir="ltr">&lt;<a href="mailto:richard-llvm@metafoo.co.uk" target="_blank">richard-llvm@metafoo.co.uk</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
Author: rsmith<br>
Date: Wed Jun 13 00:37:23 2012<br>
New Revision: 158390<br>
<br>
URL: <a href="http://llvm.org/viewvc/llvm-project?rev=158390&view=rev" target="_blank">http://llvm.org/viewvc/llvm-project?rev=158390&view=rev</a><br>
Log:<br>
PR13099: Teach -Wformat about raw string literals, UTF-8 strings and Unicode escape sequences.<br>
<br>
Modified:<br>
cfe/trunk/lib/AST/Expr.cpp<br>
cfe/trunk/lib/Lex/LiteralSupport.cpp<br>
cfe/trunk/lib/Sema/SemaChecking.cpp<br>
cfe/trunk/test/SemaCXX/format-strings-0x.cpp<br>
<br>
Modified: cfe/trunk/lib/AST/Expr.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/Expr.cpp?rev=158390&r1=158389&r2=158390&view=diff" target="_blank">http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/Expr.cpp?rev=158390&r1=158389&r2=158390&view=diff</a><br>
==============================================================================<br>
--- cfe/trunk/lib/AST/Expr.cpp (original)<br>
+++ cfe/trunk/lib/AST/Expr.cpp Wed Jun 13 00:37:23 2012<br>
@@ -679,7 +679,8 @@<br>
SourceLocation StringLiteral::<br>
getLocationOfByte(unsigned ByteNo, const SourceManager &SM,<br>
const LangOptions &Features, const TargetInfo &Target) const {<br>
- assert(Kind == StringLiteral::Ascii && "This only works for ASCII strings");<br>
+ assert((Kind == StringLiteral::Ascii || Kind == StringLiteral::UTF8) &&<br>
+ "Only narrow string literals are currently supported");<br>
<br>
// Loop over all of the tokens in this string until we find the one that<br>
// contains the byte we're looking for.<br>
<br>
Modified: cfe/trunk/lib/Lex/LiteralSupport.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/LiteralSupport.cpp?rev=158390&r1=158389&r2=158390&view=diff" target="_blank">http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/LiteralSupport.cpp?rev=158390&r1=158389&r2=158390&view=diff</a><br>
==============================================================================<br>
--- cfe/trunk/lib/Lex/LiteralSupport.cpp (original)<br>
+++ cfe/trunk/lib/Lex/LiteralSupport.cpp Wed Jun 13 00:37:23 2012<br>
@@ -250,6 +250,39 @@<br>
return true;<br>
}<br>
<br>
+/// MeasureUCNEscape - Determine the number of bytes within the resulting string<br>
+/// which this UCN will occupy.<br>
+static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,<br>
+ const char *ThisTokEnd, unsigned CharByteWidth,<br>
+ const LangOptions &Features, bool &HadError) {<br>
+ // UTF-32: 4 bytes per escape.<br>
+ if (CharByteWidth == 4)<br>
+ return 4;<br>
+<br>
+ uint32_t UcnVal = 0;<br>
+ unsigned short UcnLen = 0;<br>
+ FullSourceLoc Loc;<br>
+<br>
+ if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,<br>
+ UcnLen, Loc, 0, Features, true)) {<br>
+ HadError = true;<br>
+ return 0;<br>
+ }<br>
+<br>
+ // UTF-16: 2 bytes for BMP, 4 bytes otherwise.<br>
+ if (CharByteWidth == 2)<br>
+ return UcnVal <= 0xFFFF ? 2 : 4;<br>
+<br>
+ // UTF-8.<br>
+ if (UcnVal < 0x80)<br>
+ return 1;<br>
+ if (UcnVal < 0x800)<br>
+ return 2;<br>
+ if (UcnVal < 0x10000)<br>
+ return 3;<br>
+ return 4;<br>
+}<br>
+<br>
/// EncodeUCNEscape - Read the Universal Character Name, check constraints and<br>
/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of<br>
/// StringLiteralParser. When we decide to implement UCN's for identifiers,<br>
@@ -265,7 +298,7 @@<br>
unsigned short UcnLen = 0;<br>
if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,<br>
Loc, Diags, Features, true)) {<br>
- HadError = 1;<br>
+ HadError = true;<br>
return;<br>
}<br>
<br>
@@ -1369,14 +1402,31 @@<br>
if (StringInvalid)<br>
return 0;<br>
<br>
+ const char *SpellingStart = SpellingPtr;<br>
+ const char *SpellingEnd = SpellingPtr+TokLen;<br>
+<br>
+ // Handle UTF-8 strings just like narrow strings.<br>
+ if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')<br>
+ SpellingPtr += 2;<br>
+<br>
assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&<br>
SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");<br>
<br>
+ // For raw string literals, this is easy.<br>
+ if (SpellingPtr[0] == 'R') {<br>
+ assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");<br>
+ // Skip 'R"'.<br>
+ SpellingPtr += 2;<br>
+ while (*SpellingPtr != '(') {<br>
+ ++SpellingPtr;<br>
+ assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");<br>
+ }<br>
+ // Skip '('.<br>
+ ++SpellingPtr;<br>
+ return SpellingPtr - SpellingStart + ByteNo;<br>
+ }<br>
<br>
- const char *SpellingStart = SpellingPtr;<br>
- const char *SpellingEnd = SpellingPtr+TokLen;<br>
-<br>
- // Skip over the leading quote.<br>
+ // Skip over the leading quote<br>
assert(SpellingPtr[0] == '"' && "Should be a string literal!");<br>
++SpellingPtr;<br>
<br>
@@ -1393,11 +1443,23 @@<br>
<br>
// Otherwise, this is an escape character. Advance over it.<br>
bool HadError = false;<br>
- ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,<br>
- FullSourceLoc(Tok.getLocation(), SM),<br>
- CharByteWidth*8, Diags);<br>
+ if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {<br>
+ const char *EscapePtr = SpellingPtr;<br>
+ unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,<br>
+ 1, Features, HadError);<br>
+ if (Len > ByteNo) {<br>
+ // ByteNo is somewhere within the escape sequence.<br>
+ SpellingPtr = EscapePtr;<br>
+ break;<br>
+ }<br>
+ ByteNo -= Len;<br>
+ } else {<br>
+ ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,<br>
+ FullSourceLoc(Tok.getLocation(), SM),<br>
+ CharByteWidth*8, Diags);<br>
+ --ByteNo;<br>
+ }<br>
assert(!HadError && "This method isn't valid on erroneous strings");<br>
- --ByteNo;<br>
}<br>
<br>
return SpellingPtr-SpellingStart;<br>
<br>
Modified: cfe/trunk/lib/Sema/SemaChecking.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaChecking.cpp?rev=158390&r1=158389&r2=158390&view=diff" target="_blank">http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaChecking.cpp?rev=158390&r1=158389&r2=158390&view=diff</a><br>
==============================================================================<br>
--- cfe/trunk/lib/Sema/SemaChecking.cpp (original)<br>
+++ cfe/trunk/lib/Sema/SemaChecking.cpp Wed Jun 13 00:37:23 2012<br>
@@ -2633,7 +2633,7 @@<br>
bool inFunctionCall) {<br>
<br>
// CHECK: is the format string a wide literal?<br>
- if (!FExpr->isAscii()) {<br>
+ if (!FExpr->isAscii() && !FExpr->isUTF8()) {<br>
CheckFormatHandler::EmitFormatDiagnostic(<br>
*this, inFunctionCall, Args[format_idx],<br>
PDiag(diag::warn_format_string_is_wide_literal), FExpr->getLocStart(),<br>
<br>
Modified: cfe/trunk/test/SemaCXX/format-strings-0x.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/cfe/trunk/test/SemaCXX/format-strings-0x.cpp?rev=158390&r1=158389&r2=158390&view=diff" target="_blank">http://llvm.org/viewvc/llvm-project/cfe/trunk/test/SemaCXX/format-strings-0x.cpp?rev=158390&r1=158389&r2=158390&view=diff</a><br>
==============================================================================<br>
--- cfe/trunk/test/SemaCXX/format-strings-0x.cpp (original)<br>
+++ cfe/trunk/test/SemaCXX/format-strings-0x.cpp Wed Jun 13 00:37:23 2012<br>
@@ -12,4 +12,16 @@<br>
scanf("%afoobar", fp);<br>
printf(nullptr);<br>
printf(*sp); // expected-warning {{not a string literal}}<br>
+<br>
+ // PR13099<br>
+ printf(<br>
+ R"foobar(%)foobar"<br>
+ R"bazquux(d)bazquux" // expected-warning {{more '%' conversions than data arguments}}<br>
+ R"xyzzy()xyzzy");<br>
+<br>
+ printf(u8"this is %d test", 0); // ok<br>
+ printf(u8R"foo(<br>
+ \u1234\U0010fffe<br>
+ %d)foo" // expected-warning {{more '%' conversions than data arguments}}<br>
+ );<br>
}<br>
<br>
<br>
_______________________________________________<br>
cfe-commits mailing list<br>
<a href="mailto:cfe-commits@cs.uiuc.edu">cfe-commits@cs.uiuc.edu</a><br>
<a href="http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits" target="_blank">http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits</a><br>
</blockquote></div><br></div>