[cfe-commits] r116129 - in /cfe/trunk: lib/Lex/LiteralSupport.cpp test/CodeGen/char-literal.c test/CodeGen/string-literal-short-wstring.c test/Lexer/c90.c test/Lexer/wchar.c
Nico Weber
nicolasweber at gmx.de
Fri Oct 8 17:27:47 PDT 2010
Author: nico
Date: Fri Oct 8 19:27:47 2010
New Revision: 116129
URL: http://llvm.org/viewvc/llvm-project?rev=116129&view=rev
Log:
Add support for UCNs for character literals
Added:
cfe/trunk/test/CodeGen/char-literal.c
Modified:
cfe/trunk/lib/Lex/LiteralSupport.cpp
cfe/trunk/test/CodeGen/string-literal-short-wstring.c
cfe/trunk/test/Lexer/c90.c
cfe/trunk/test/Lexer/wchar.c
Modified: cfe/trunk/lib/Lex/LiteralSupport.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/LiteralSupport.cpp?rev=116129&r1=116128&r2=116129&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/LiteralSupport.cpp (original)
+++ cfe/trunk/lib/Lex/LiteralSupport.cpp Fri Oct 8 19:27:47 2010
@@ -164,13 +164,10 @@
}
/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
-/// convert the UTF32 to UTF8. This is a subroutine of StringLiteralParser.
-/// When we decide to implement UCN's for character constants and identifiers,
-/// we will likely rework our support for UCN's.
-static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
- char *&ResultBuf, bool &HadError,
+/// return the UTF32.
+static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
+ uint32_t &UcnVal, unsigned short &UcnLen,
SourceLocation Loc, Preprocessor &PP,
- bool wide,
bool Complain) {
if (!PP.getLangOptions().CPlusPlus && !PP.getLangOptions().C99)
PP.Diag(Loc, diag::warn_ucn_not_valid_in_c89);
@@ -184,27 +181,22 @@
if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
if (Complain)
PP.Diag(Loc, diag::err_ucn_escape_no_digits);
- HadError = 1;
- return;
+ return false;
}
- typedef uint32_t UTF32;
-
- UTF32 UcnVal = 0;
- unsigned short UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
+ UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
unsigned short UcnLenSave = UcnLen;
- for (; ThisTokBuf != ThisTokEnd && UcnLen; ++ThisTokBuf, UcnLen--) {
+ for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
int CharVal = HexDigitValue(ThisTokBuf[0]);
if (CharVal == -1) break;
UcnVal <<= 4;
UcnVal |= CharVal;
}
// If we didn't consume the proper number of digits, there is a problem.
- if (UcnLen) {
+ if (UcnLenSave) {
if (Complain)
PP.Diag(PP.AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin),
diag::err_ucn_escape_incomplete);
- HadError = 1;
- return;
+ return false;
}
// Check UCN constraints (C99 6.4.3p2).
if ((UcnVal < 0xa0 &&
@@ -213,13 +205,33 @@
|| (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
if (Complain)
PP.Diag(Loc, diag::err_ucn_escape_invalid);
+ return false;
+ }
+ return true;
+}
+
+/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
+/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
+/// StringLiteralParser. When we decide to implement UCN's for identifiers,
+/// we will likely rework our support for UCN's.
+static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
+ char *&ResultBuf, bool &HadError,
+ SourceLocation Loc, Preprocessor &PP,
+ bool wide,
+ bool Complain) {
+ typedef uint32_t UTF32;
+ UTF32 UcnVal = 0;
+ unsigned short UcnLen = 0;
+ if (!ProcessUCNEscape(ThisTokBuf, ThisTokEnd,
+ UcnVal, UcnLen, Loc, PP, Complain)) {
HadError = 1;
return;
}
+
if (wide) {
- (void)UcnLenSave;
- assert((UcnLenSave == 4 || UcnLenSave == 8) &&
- "ProcessUCNEscape - only ucn length of 4 or 8 supported");
+ (void)UcnLen;
+ assert((UcnLen== 4 || UcnLen== 8) &&
+ "EncodeUCNEscape - only ucn length of 4 or 8 supported");
if (!PP.getLangOptions().ShortWChar) {
// Note: our internal rep of wide char tokens is always little-endian.
@@ -702,11 +714,26 @@
bool Warned = false;
while (begin[0] != '\'') {
uint64_t ResultChar;
+
+ // Is this a Universal Character Name escape?
if (begin[0] != '\\') // If this is a normal character, consume it.
ResultChar = *begin++;
- else // Otherwise, this is an escape character.
- ResultChar = ProcessCharEscape(begin, end, HadError, Loc, IsWide, PP,
- /*Complain=*/true);
+ else { // Otherwise, this is an escape character.
+ // Check for UCN.
+ if (begin[1] == 'u' || begin[1] == 'U') {
+ uint32_t utf32 = 0;
+ unsigned short UcnLen = 0;
+ if (!ProcessUCNEscape(begin, end, utf32, UcnLen,
+ Loc, PP, /*Complain=*/true)) {
+ HadError = 1;
+ }
+ ResultChar = utf32;
+ } else {
+ // Otherwise, this is a non-UCN escape character. Process it.
+ ResultChar = ProcessCharEscape(begin, end, HadError, Loc, IsWide, PP,
+ /*Complain=*/true);
+ }
+ }
// If this is a multi-character constant (e.g. 'abc'), handle it. These are
// implementation defined (C99 6.4.4.4p10).
@@ -746,6 +773,9 @@
// Transfer the value from APInt to uint64_t
Value = LitVal.getZExtValue();
+ if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF)
+ PP.Diag(Loc, diag::warn_ucn_escape_too_large);
+
// If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
// if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
// character constants are not sign extended in the this implementation:
@@ -915,9 +945,9 @@
}
// Is this a Universal Character Name escape?
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
- ProcessUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
- hadError, StringToks[i].getLocation(), PP, wide,
- Complain);
+ EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
+ hadError, StringToks[i].getLocation(), PP, wide,
+ Complain);
continue;
}
// Otherwise, this is a non-UCN escape character. Process it.
Added: cfe/trunk/test/CodeGen/char-literal.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/char-literal.c?rev=116129&view=auto
==============================================================================
--- cfe/trunk/test/CodeGen/char-literal.c (added)
+++ cfe/trunk/test/CodeGen/char-literal.c Fri Oct 8 19:27:47 2010
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -x c++ -triple i386-unknown-unkown -emit-llvm %s -o - | FileCheck %s
+// Runs in c++ mode so that wchar_t is available.
+
+int main() {
+ // CHECK: store i8 97
+ char a = 'a';
+
+ // Should pick second character.
+ // CHECK: store i8 98
+ char b = 'ab';
+
+ // CHECK: store i32 97
+ wchar_t wa = L'a';
+
+ // Should pick second character.
+ // CHECK: store i32 98
+ wchar_t wb = L'ab';
+
+ // Should pick last character and store its lowest byte.
+ // This does not match gcc, which takes the last character, converts it to
+ // utf8, and then picks the second-lowest byte of that (they probably store
+ // the utf8 in uint16_ts internally and take the lower byte of that).
+ // CHECK: store i8 48
+ char c = '\u1120\u0220\U00102030';
+
+ // CHECK: store i32 61451
+ wchar_t wc = L'\uF00B';
+
+ // CHECK: store i32 1110027
+ wchar_t wd = L'\U0010F00B';
+
+ // Should pick second character.
+ // CHECK: store i32 1110027
+ wchar_t we = L'\u1234\U0010F00B';
+}
Modified: cfe/trunk/test/CodeGen/string-literal-short-wstring.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/string-literal-short-wstring.c?rev=116129&r1=116128&r2=116129&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/string-literal-short-wstring.c (original)
+++ cfe/trunk/test/CodeGen/string-literal-short-wstring.c Fri Oct 8 19:27:47 2010
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -emit-llvm -fshort-wchar %s -o - | FileCheck %s
+// RUN: %clang_cc1 -x c++ -emit-llvm -fshort-wchar %s -o - | FileCheck %s
+// Runs in c++ mode so that wchar_t is available.
int main() {
// This should convert to utf8.
@@ -6,9 +7,37 @@
char b[10] = "\u1120\u0220\U00102030";
// CHECK: private constant [6 x i8] c"A\00B\00\00\00"
- void *foo = L"AB";
+ const wchar_t *foo = L"AB";
// This should convert to utf16.
// CHECK: private constant [10 x i8] c" \11 \02\C8\DB0\DC\00\00"
- void *bar = L"\u1120\u0220\U00102030";
+ const wchar_t *bar = L"\u1120\u0220\U00102030";
+
+
+
+ // Should pick second character.
+ // CHECK: store i8 98
+ char c = 'ab';
+
+ // CHECK: store i16 97
+ wchar_t wa = L'a';
+
+ // Should pick second character.
+ // CHECK: store i16 98
+ wchar_t wb = L'ab';
+
+ // -4085 == 0xf00b
+ // CHECK: store i16 -4085
+ wchar_t wc = L'\uF00B';
+
+ // Should take lower word of the 4-byte UCN sequence. This does not match
+ // gcc. I don't understand what gcc does (it looks like it converts to utf16,
+ // then takes the second (!) utf16 word, swaps the lower two nibbles, and
+ // stores that?).
+ // CHECK: store i16 -4085
+ wchar_t wd = L'\U0010F00B'; // has utf16 encoding dbc8 dcb0
+
+ // Should pick second character. (gcc: -9205)
+ // CHECK: store i16 -4085
+ wchar_t we = L'\u1234\U0010F00B';
}
Modified: cfe/trunk/test/Lexer/c90.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Lexer/c90.c?rev=116129&r1=116128&r2=116129&view=diff
==============================================================================
--- cfe/trunk/test/Lexer/c90.c (original)
+++ cfe/trunk/test/Lexer/c90.c Fri Oct 8 19:27:47 2010
@@ -30,4 +30,5 @@
void test3() {
(void)L"\u1234"; // expected-error {{unicode escape sequences are only valid in C99 or C++}}
+ (void)L'\u1234'; // expected-error {{unicode escape sequences are only valid in C99 or C++}}
}
Modified: cfe/trunk/test/Lexer/wchar.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Lexer/wchar.c?rev=116129&r1=116128&r2=116129&view=diff
==============================================================================
--- cfe/trunk/test/Lexer/wchar.c (original)
+++ cfe/trunk/test/Lexer/wchar.c Fri Oct 8 19:27:47 2010
@@ -2,5 +2,11 @@
void f() {
(void)L"\U00010000"; // expected-warning {{character unicode escape sequence too long for its type}}
+
+ (void)L'\U00010000'; // expected-warning {{character unicode escape sequence too long for its type}}
+
+ (void)L'ab'; // expected-warning {{extraneous characters in wide character constant ignored}}
+
+ (void)L'a\u1000'; // expected-warning {{extraneous characters in wide character constant ignored}}
}
More information about the cfe-commits mailing list