[cfe-commits] r68076 - in /cfe/trunk: include/clang/Basic/DiagnosticLexKinds.td lib/Lex/LiteralSupport.cpp test/Sema/ucn-cstring.c
Steve Naroff
snaroff at apple.com
Mon Mar 30 16:46:03 PDT 2009
Author: snaroff
Date: Mon Mar 30 18:46:03 2009
New Revision: 68076
URL: http://llvm.org/viewvc/llvm-project?rev=68076&view=rev
Log:
Implement UCN support for C string literals (C99 6.4.3) and add some very basic tests. Chris Goller has graciously offered to write some tests to help validate UCN support.
From a front-end perspective, I believe this code should work for ObjC @-strings. At the moment, I believe we need to tweak the code generation for @-strings (which doesn't appear to handle UCNs). Will be investigating.
Added:
cfe/trunk/test/Sema/ucn-cstring.c
Modified:
cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
cfe/trunk/lib/Lex/LiteralSupport.cpp
Modified: cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td?rev=68076&r1=68075&r2=68076&view=diff
==============================================================================
--- cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td (original)
+++ cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td Mon Mar 30 18:46:03 2009
@@ -52,6 +52,10 @@
"use of non-standard escape character '\\%0'">;
def ext_unknown_escape : Extension<"unknown escape sequence '\\%0'">;
def err_hex_escape_no_digits : Error<"\\x used with no following hex digits">;
+def err_ucn_escape_no_digits : Error<"\\u used with no following hex digits">;
+def err_ucn_escape_invalid : Error<"invalid universal character">;
+def err_ucn_escape_incomplete : Error<"incomplete universal character name">;
+def err_ucn_escape_too_big : Error<"universal character name is too long">;
def err_invalid_decimal_digit : Error<"invalid digit '%0' in decimal constant">;
def err_invalid_binary_digit : Error<"invalid digit '%0' in binary constant">;
def err_invalid_octal_digit : Error<"invalid digit '%0' in octal constant">;
Modified: cfe/trunk/lib/Lex/LiteralSupport.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/LiteralSupport.cpp?rev=68076&r1=68075&r2=68076&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/LiteralSupport.cpp (original)
+++ cfe/trunk/lib/Lex/LiteralSupport.cpp Mon Mar 30 18:46:03 2009
@@ -71,8 +71,6 @@
case 'v':
ResultChar = 11;
break;
-
- //case 'u': case 'U': // FIXME: UCNs.
case 'x': { // Hex escape.
ResultChar = 0;
if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
@@ -151,7 +149,90 @@
return ResultChar;
}
-
+/// ProcessUCNEscape - Read a Universal Character Name (\uXXXX or \UXXXXXXXX),
+/// check its constraints and append its UTF8 encoding to the result buffer.
+/// This is a subroutine of StringLiteralParser. When we decide to implement
+/// UCN's for character constants and identifiers, we will likely rework our
+/// support for UCN's.
+///
+/// \param ThisTokBuf   Points at the two-character '\u' / '\U' introducer.
+///                     Advanced past the introducer and past every hex digit
+///                     that was consumed, including on the error paths.
+/// \param ThisTokEnd   End of the token text; digits are never read at or
+///                     beyond this pointer.
+/// \param ResultBuf    Write cursor for the encoded bytes. Advanced past the
+///                     bytes written on success; left untouched on error.
+/// \param ResultBufEnd Limit of the output buffer used for the space check.
+/// \param HadError     Set to true when a diagnostic is emitted; never
+///                     cleared here.
+/// \param Loc          Source location attached to any diagnostic.
+/// \param PP           Preprocessor used to emit diagnostics.
+static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
+                             char *&ResultBuf, const char *ResultBufEnd,
+                             bool &HadError,
+                             SourceLocation Loc, Preprocessor &PP) {
+  // FIXME: Add a warning - UCN's are only valid in C++ & C99.
+
+  // Skip the '\u' (or '\U') introducer.
+  ThisTokBuf += 2;
+
+  if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
+    PP.Diag(Loc, diag::err_ucn_escape_no_digits);
+    HadError = true;
+    return;
+  }
+  typedef unsigned int UTF32;
+
+  UTF32 UcnVal = 0;
+  // '\u' takes exactly four hex digits, '\U' exactly eight. ThisTokBuf[-1] is
+  // the introducer letter we just skipped.
+  unsigned short UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
+  for (; ThisTokBuf != ThisTokEnd && UcnLen; ++ThisTokBuf, UcnLen--) {
+    int CharVal = HexDigitValue(ThisTokBuf[0]);
+    if (CharVal == -1) break;
+    UcnVal <<= 4;
+    UcnVal |= CharVal;
+  }
+  // If we didn't consume the proper number of digits, there is a problem.
+  if (UcnLen) {
+    PP.Diag(Loc, diag::err_ucn_escape_incomplete);
+    HadError = true;
+    return;
+  }
+  // Check UCN constraints (C99 6.4.3p2): values below 0xA0 other than
+  // $, @ and ` are disallowed, as is the surrogate range D800-DFFF.
+  // Also reject anything above U+10FFFF: '\U' accepts eight hex digits, but
+  // such values are not Unicode code points and cannot be encoded in the
+  // four-byte UTF8 form used below (the lead byte would be truncated).
+  if ((UcnVal < 0xa0 &&
+       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) // $, @, `
+      || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)
+      || UcnVal > 0x10FFFF) {
+    PP.Diag(Loc, diag::err_ucn_escape_invalid);
+    HadError = true;
+    return;
+  }
+  // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
+  // The conversion below was inspired by:
+  //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
+  // First, we determine how many bytes the result will require.
+  typedef unsigned char UTF8;
+
+  unsigned short bytesToWrite = 0;
+  if (UcnVal < (UTF32)0x80)
+    bytesToWrite = 1;
+  else if (UcnVal < (UTF32)0x800)
+    bytesToWrite = 2;
+  else if (UcnVal < (UTF32)0x10000)
+    bytesToWrite = 3;
+  else
+    bytesToWrite = 4;
+
+  // If the buffer isn't big enough, bail.
+  if ((ResultBuf + bytesToWrite) >= ResultBufEnd) {
+    PP.Diag(Loc, diag::err_ucn_escape_too_big);
+    HadError = true;
+    return;
+  }
+  // Continuation bytes have the form 10xxxxxx: OR in the 0x80 marker, then
+  // mask with 0xBF to clear bit 6.
+  const unsigned byteMask = 0xBF;
+  const unsigned byteMark = 0x80;
+
+  // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
+  // into the first byte, depending on how many bytes follow. It is indexed
+  // by bytesToWrite, which is always in the range 1-4 here.
+  static const UTF8 firstByteMark[5] = {
+    0x00, 0x00, 0xC0, 0xE0, 0xF0
+  };
+  // Finally, we write the bytes into ResultBuf. The sequence is filled in
+  // back to front: jump to one past its last byte and walk backwards,
+  // peeling six bits off UcnVal for each continuation byte.
+  ResultBuf += bytesToWrite;
+  switch (bytesToWrite) { // note: everything falls through.
+  case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
+    // fallthrough
+  case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
+    // fallthrough
+  case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
+    // fallthrough
+  case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
+  }
+  // ResultBuf is back at the first byte written; move it past the sequence.
+  ResultBuf += bytesToWrite;
+}
/// integer-constant: [C99 6.4.4.1]
@@ -757,23 +838,29 @@
*ResultPtr++ = InStart[0];
// Add zeros at the end.
for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
- *ResultPtr++ = 0;
+ *ResultPtr++ = 0;
}
}
continue;
}
- // Otherwise, this is an escape character. Process it.
- unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
- StringToks[i].getLocation(),
- ThisIsWide, PP);
-
- // Note: our internal rep of wide char tokens is always little-endian.
- *ResultPtr++ = ResultChar & 0xFF;
-
- if (AnyWide) {
- for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
- *ResultPtr++ = ResultChar >> i*8;
+ if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
+ ProcessUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
+ GetString() + ResultBuf.size(),
+ hadError, StringToks[i].getLocation(), PP);
+ } else {
+ // Otherwise, this is a non-UCN escape character. Process it.
+ unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
+ StringToks[i].getLocation(),
+ ThisIsWide, PP);
+
+ // Note: our internal rep of wide char tokens is always little-endian.
+ *ResultPtr++ = ResultChar & 0xFF;
+
+ if (AnyWide) {
+ for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+ *ResultPtr++ = ResultChar >> i*8;
+ }
}
}
}
Added: cfe/trunk/test/Sema/ucn-cstring.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Sema/ucn-cstring.c?rev=68076&view=auto
==============================================================================
--- cfe/trunk/test/Sema/ucn-cstring.c (added)
+++ cfe/trunk/test/Sema/ucn-cstring.c Mon Mar 30 18:46:03 2009
@@ -0,0 +1,15 @@
+// RUN: clang-cc %s -verify -fsyntax-only -pedantic
+
+#include <stdio.h>
+
+int main(void) {
+  // sizeof yields a size_t, so it is printed with %zu rather than %d.
+  printf("%s (%zu)\n", "hello \u2192 \u2603 \u2190 world", sizeof("hello \u2192 \u2603 \u2190 world"));
+  printf("%s (%zu)\n", "\U00010400\U0001D12B", sizeof("\U00010400\U0001D12B"));
+  // Some error conditions...
+  printf("%s\n", "\U"); // expected-error{{\u used with no following hex digits}}
+  printf("%s\n", "\U00"); // expected-error{{incomplete universal character name}}
+  printf("%s\n", "\U0001"); // expected-error{{incomplete universal character name}}
+  printf("%s\n", "\u0001"); // expected-error{{invalid universal character}}
+  return 0;
+}
+
More information about the cfe-commits
mailing list