[cfe-commits] [PATCH] Support for universal character names in identifiers

Thu Nov 15 19:17:43 PST 2012

Patch attached.  Adds support for universal character names in identifiers, e.g.:

char * \u00FC = "u-umlaut";

Not that it's particularly useful, but it's a longstanding hole in our
C99 support.

The general outline of the approach is that the spelling of the
identifier token contains the UCN, but the IdentifierInfo for the
identifier token contains pure UTF-8.  I think this is reasonable
given the C phases of translation, and consistent with the way we
handle UCNs in other contexts.

I'm intentionally leaving out most of the support for universal
character names in user-defined literals, to try to reduce the size
of the patch.

I know this patch is a little lacking in terms of tests, but I'm not
really sure what tests we need; suggestions welcome.

-Eli
-------------- next part --------------
Index: test/Preprocessor/ucn-pp-identifier.c
===================================================================

--- test/Preprocessor/ucn-pp-identifier.c	(revision 0)
+++ test/Preprocessor/ucn-pp-identifier.c	(revision 0)
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 %s -Eonly -std=c99 -pedantic -verify
+
+#define \u00FC
+#define a\u00FD() 0
+#ifndef \u00FC
+#error "This should never happen"
+#endif
+
+#if a\u00FD()
+#error "This should never happen"
+#endif
+
+// Check that we allow UCNs in preprocessing numbers.
+// (Why exactly C allows them, I have no idea, but those are the rules)
+#define CONCAT(a,b) a ## b
+#define \U000100010\u00FD 1
+#if !CONCAT(\U00010001, 0\u00FD)
+#error "This should never happen"
+#endif
+
+// Check that we don't accept all uses of \u and \U as UCNs.
+// (Again, sort of weird, but part of the rules)
+#if \uarecool // expected-error {{invalid token at start of a preprocessor expression}}
+#endif
+#if \U0001000  // expected-error {{invalid token at start of a preprocessor expression}}
+#endif
+
+// Make sure we reject disallowed UCNs
+#define \ufffe // expected-error {{character '\ufffe' cannot be used in a universal character name in an identifer}}
+#define \U10000000  // expected-error {{character '\U10000000' cannot be used in a universal character name in an identifer}}
Index: test/CXX/over/over.oper/over.literal/p8.cpp
===================================================================
--- test/CXX/over/over.oper/over.literal/p8.cpp	(revision 168014)
+++ test/CXX/over/over.oper/over.literal/p8.cpp	(working copy)
@@ -7,8 +7,7 @@
 
 void operator "" _km(long double); // ok
 string operator "" _i18n(const char*, std::size_t); // ok
-// FIXME: This should be accepted once we support UCNs
-template<char...> int operator "" \u03C0(); // ok, UCN for lowercase pi // expected-error {{expected identifier}}
+template<char...> int operator "" \u03C0(); // ok, UCN for lowercase pi // expected-warning {{reserved}}
 float operator ""E(const char *); // expected-error {{invalid suffix on literal}} expected-warning {{reserved}}
 float operator " " B(const char *); // expected-error {{must be '""'}} expected-warning {{reserved}}
 string operator "" 5X(const char *, std::size_t); // expected-error {{expected identifier}}
Index: test/CodeGen/ucn-identifier.c
===================================================================
--- test/CodeGen/ucn-identifier.c	(revision 0)
+++ test/CodeGen/ucn-identifier.c	(revision 0)
@@ -0,0 +1,4 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s
+
+// CHECK: @"\C3\BC" = global i32 10
+int \u00FC = 10;
Index: include/clang/Basic/DiagnosticLexKinds.td
===================================================================
--- include/clang/Basic/DiagnosticLexKinds.td	(revision 168014)
+++ include/clang/Basic/DiagnosticLexKinds.td	(working copy)
@@ -93,8 +93,12 @@
   "multi-character character constant">, InGroup<MultiChar>;
 def ext_four_char_character_literal : Extension<
   "multi-character character constant">, InGroup<FourByteMultiChar>;
-  
 
+
+def err_ucn_invalid_in_id : Error<
+  "character '%0' cannot be used in a universal character name "
+  "in an identifer">;
+
 // Literal
 def ext_nonstandard_escape : Extension<
   "use of non-standard escape character '\\%0'">;
Index: include/clang/Lex/Lexer.h
===================================================================
--- include/clang/Lex/Lexer.h	(revision 168014)
+++ include/clang/Lex/Lexer.h	(working copy)
@@ -573,6 +573,10 @@
   void cutOffLexing() { BufferPtr = BufferEnd; }
 
   bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
+
+  bool isUCNAfterSlash(const char *CurPtr, unsigned Size, unsigned SizeTmp[5]);
+  void ConsumeUCNAfterSlash(const char *&CurPtr, unsigned SizeTmp[5],
+                            Token &Result);
 };
 
 
Index: include/clang/Lex/Token.h
===================================================================
--- include/clang/Lex/Token.h	(revision 168014)
+++ include/clang/Lex/Token.h	(working copy)
@@ -74,9 +74,10 @@
     StartOfLine   = 0x01,  // At start of line or only after whitespace.
     LeadingSpace  = 0x02,  // Whitespace exists before this token.
     DisableExpand = 0x04,  // This identifier may never be macro expanded.
-    NeedsCleaning = 0x08,   // Contained an escaped newline or trigraph.
+    NeedsCleaning = 0x08,  // Contained an escaped newline or trigraph.
     LeadingEmptyMacro = 0x10, // Empty macro exists before this token.
-    HasUDSuffix = 0x20     // This string or character literal has a ud-suffix.
+    HasUDSuffix = 0x20,    // This string or character literal has a ud-suffix.
+    HasUCN = 0x40          // This identifier contains a UCN
   };
 
   tok::TokenKind getKind() const { return (tok::TokenKind)Kind; }
Index: lib/Lex/Lexer.cpp
===================================================================
--- lib/Lex/Lexer.cpp	(revision 168014)
+++ lib/Lex/Lexer.cpp	(working copy)
@@ -336,10 +336,12 @@
   // NOTE: this has to be checked *before* testing for an IdentifierInfo.
   if (Tok.is(tok::raw_identifier))
     TokStart = Tok.getRawIdentifierData();
-  else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
-    // Just return the string from the identifier table, which is very quick.
-    Buffer = II->getNameStart();
-    return II->getLength();
+  else if (!(Tok.getFlags() & Token::HasUCN)) {
+    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
+      // Just return the string from the identifier table, which is very quick.
+      Buffer = II->getNameStart();
+      return II->getLength();
+    }
   }
 
   // NOTE: this can be checked even after testing for an IdentifierInfo.
@@ -1341,7 +1343,6 @@
 ///   2. If this is an escaped newline (potentially with whitespace between
 ///      the backslash and newline), implicitly skip the newline and return
 ///      the char after it.
-///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
 ///
 /// This handles the slow/uncommon case of the getCharAndSize method.  Here we
 /// know that we can accumulate into Size, and that we have already incremented
@@ -1466,6 +1467,34 @@
 // Helper methods for lexing.
 //===----------------------------------------------------------------------===//
 
+bool Lexer::isUCNAfterSlash(const char* CurPtr, unsigned Size,
+                            unsigned SizeTmp[9]) {
+  unsigned SizeTmpSum = Size;
+  char FirstChar = getCharAndSize(CurPtr + SizeTmpSum, SizeTmp[0]);
+  unsigned NumHexDigits;
+  if (FirstChar == 'u')
+    NumHexDigits = 4;
+  else if (FirstChar == 'U')
+    NumHexDigits = 8;
+  else
+    return false;
+  SizeTmpSum += SizeTmp[0];
+  for (unsigned i = 0; i < NumHexDigits; ++i) {
+    if (!isxdigit(getCharAndSize(CurPtr + SizeTmpSum, SizeTmp[i+1])))
+      return false;
+    SizeTmpSum += SizeTmp[i+1];
+  }
+  return true;
+}
+
+void Lexer::ConsumeUCNAfterSlash(const char *&CurPtr, unsigned SizeTmp[9],
+                                 Token &Result) {
+  char FirstChar = getCharAndSize(CurPtr, SizeTmp[0]);
+  unsigned NumChars = FirstChar == 'U' ? 9 : 5;
+  for (unsigned i = 0; i < NumChars; ++i)
+    CurPtr = ConsumeChar(CurPtr, SizeTmp[i], Result);
+  }
+
 /// \brief Routine that indiscriminately skips bytes in the source file.
 void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
   BufferPtr += Bytes;
@@ -1485,7 +1514,6 @@
 
   // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
   // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
-  // FIXME: UCNs.
   //
   // TODO: Could merge these checks into a CharInfo flag to make the comparison
   // cheaper
@@ -1513,12 +1541,13 @@
   }
 
   // Otherwise, $,\,? in identifier found.  Enter slower path.
-
+  bool hasUCN = false;
   C = getCharAndSize(CurPtr, Size);
   while (1) {
     if (C == '$') {
       // If we hit a $ and they are not supported in identifiers, we are done.
-      if (!LangOpts.DollarIdents) goto FinishIdentifier;
+      if (!LangOpts.DollarIdents)
+        break;
 
       // Otherwise, emit a diagnostic and continue.
       if (!isLexingRawMode())
@@ -1526,20 +1555,32 @@
       CurPtr = ConsumeChar(CurPtr, Size, Result);
       C = getCharAndSize(CurPtr, Size);
       continue;
-    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
-      // Found end of identifier.
-      goto FinishIdentifier;
+    } else if (C == '\\') {
+      unsigned SizeTmp[9];
+      if (!isUCNAfterSlash(CurPtr, Size, SizeTmp))
+        break;
+
+      CurPtr = ConsumeChar(CurPtr, Size, Result);
+      ConsumeUCNAfterSlash(CurPtr, SizeTmp, Result);
+      hasUCN = true;
+      C = getCharAndSize(CurPtr, Size);
+      continue;
+    } else if (!isIdentifierBody(C)) {
+      break;
     }
 
     // Otherwise, this character is good, consume it.
     CurPtr = ConsumeChar(CurPtr, Size, Result);
 
     C = getCharAndSize(CurPtr, Size);
-    while (isIdentifierBody(C)) { // FIXME: UCNs.
+    while (isIdentifierBody(C)) {
       CurPtr = ConsumeChar(CurPtr, Size, Result);
       C = getCharAndSize(CurPtr, Size);
     }
   }
+  if (hasUCN)
+    Result.setFlag(Token::HasUCN);
+  goto FinishIdentifier;
 }
 
 /// isHexaLiteral - Return true if Start points to a hex constant.
@@ -1560,12 +1601,22 @@
   unsigned Size;
   char C = getCharAndSize(CurPtr, Size);
   char PrevCh = 0;
-  while (isNumberBody(C)) { // FIXME: UCNs in ud-suffix.
+  while (isNumberBody(C)) {
     CurPtr = ConsumeChar(CurPtr, Size, Result);
     PrevCh = C;
     C = getCharAndSize(CurPtr, Size);
   }
 
+   // Check for a UCN.
+   if (C == '\\') {
+     unsigned SizeTmp[9];
+     if (isUCNAfterSlash(CurPtr, Size, SizeTmp)) {
+       ConsumeChar(CurPtr, Size, Result);
+       ConsumeUCNAfterSlash(CurPtr, SizeTmp, Result);
+       return LexNumericConstant(Result, CurPtr);
+     }
+   }
+
   // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
   if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
     // If we are in Microsoft mode, don't continue if the constant is hex.
@@ -1778,6 +1829,7 @@
     // Skip escaped characters.
     if (C == '\\') {
       // Skip the escaped character.
+      // Longer escape sequences and UCNs get implicitly handled by the loop.
       getAndAdvanceChar(CurPtr, Result);
     } else if (C == '\n' || C == '\r' ||             // Newline.
                (C == 0 && (CurPtr-1 == BufferEnd ||  // End of file.
@@ -1825,6 +1877,7 @@
     // Skip escaped characters.
     if (C == '\\') {
       // Skip the escaped character.
+      // Longer escape sequences and UCNs get implicitly handled by the loop.
       getAndAdvanceChar(CurPtr, Result);
     } else if (C == '\n' || C == '\r' ||             // Newline.
                (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
@@ -3206,9 +3259,21 @@
       Kind = tok::unknown;
     break;
 
-  case '\\':
-    // FIXME: UCN's.
-    // FALL THROUGH.
+  case '\\': {
+    unsigned SizeTmpArr[9];
+    if (isUCNAfterSlash(CurPtr, 0, SizeTmpArr)) {
+      ConsumeUCNAfterSlash(CurPtr, SizeTmpArr, Result);
+      Result.setFlag(Token::HasUCN);
+
+      // Notify MIOpt that we read a non-whitespace/non-comment token.
+      MIOpt.ReadToken();
+      
+      return LexIdentifier(Result, CurPtr);
+    }
+    Kind = tok::unknown;
+    break;
+  }
+
   default:
     Kind = tok::unknown;
     break;
Index: lib/Lex/Preprocessor.cpp
===================================================================
--- lib/Lex/Preprocessor.cpp	(revision 168014)
+++ lib/Lex/Preprocessor.cpp	(working copy)
@@ -37,11 +37,13 @@
 #include "clang/Lex/LexDiagnostic.h"
 #include "clang/Lex/CodeCompletionHandler.h"
 #include "clang/Lex/ModuleLoader.h"
+#include "clang/Basic/ConvertUTF.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/TargetInfo.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Capacity.h"
@@ -398,7 +400,7 @@
                                           SmallVectorImpl<char> &Buffer,
                                           bool *Invalid) const {
   // NOTE: this has to be checked *before* testing for an IdentifierInfo.
-  if (Tok.isNot(tok::raw_identifier)) {
+  if (Tok.isNot(tok::raw_identifier) && !(Tok.getFlags() & Token::HasUCN)) {
     // Try the fast path.
     if (const IdentifierInfo *II = Tok.getIdentifierInfo())
       return II->getName();
@@ -496,6 +498,79 @@
 // Lexer Event Handling.
 //===----------------------------------------------------------------------===//
 
+static int HexDigitValue(char C) {
+  if (C >= '0' && C <= '9') return C-'0';
+  if (C >= 'a' && C <= 'f') return C-'a'+10;
+  return C-'A'+10;
+}
+
+namespace {
+  struct UCNCharRange {
+    unsigned Lower;
+    unsigned Upper;
+  };
+  UCNCharRange UCNAllowedCharRanges[] =
+      // 1
+    { { 0x00A8, 0x00A8 }, { 0x00AA, 0x00AA }, { 0x00AD, 0x00AD },
+      { 0x00AF, 0x00AF }, { 0x00B2, 0x00B5 }, { 0x00B7, 0x00BA },
+      { 0x00BC, 0x00BE }, { 0x00C0, 0x00D6 }, { 0x00D8, 0x00F6 },
+      { 0x00F8, 0x00FF },
+      // 2
+      { 0x0100, 0x167F }, { 0x1681, 0x180D }, { 0x180F, 0x1FFF },
+      // 3
+      { 0x200B, 0x200D }, { 0x202A, 0x202E }, { 0x203F, 0x2040 },
+      { 0x2054, 0x2054 }, { 0x2060, 0x206F },
+      // 4
+      { 0x2070, 0x218F }, { 0x2460, 0x24FF }, { 0x2776, 0x2793 },
+      { 0x2C00, 0x2DFF }, { 0x2E80, 0x2FFF },
+      // 5
+      { 0x3004, 0x3007 }, { 0x3021, 0x302F }, { 0x3031, 0x303F },
+      // 6
+      { 0x3040, 0xD7FF },
+      // 7
+      { 0xF900, 0xFD3D }, { 0xFD40, 0xFDCF }, { 0xFDF0, 0xFE44 },
+      { 0xFE47, 0xFFFD },
+      // 8
+      { 0x10000, 0x1FFFD }, { 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD },
+      { 0x40000, 0x4FFFD }, { 0x50000, 0x5FFFD }, { 0x60000, 0x6FFFD },
+      { 0x70000, 0x7FFFD }, { 0x80000, 0x8FFFD }, { 0x90000, 0x9FFFD },
+      { 0xA0000, 0xAFFFD }, { 0xB0000, 0xBFFFD }, { 0xC0000, 0xCFFFD },
+      { 0xD0000, 0xDFFFD }, { 0xE0000, 0xEFFFD } };
+}
+
+static bool isAllowedIDChar(unsigned c) {
+  unsigned LowPoint = 0;
+  unsigned HighPoint = llvm::array_lengthof(UCNAllowedCharRanges);
+  while (HighPoint != LowPoint) {
+    unsigned MidPoint = (HighPoint + LowPoint) / 2;
+    if (c < UCNAllowedCharRanges[MidPoint].Lower)
+      HighPoint = MidPoint;
+    else if (c > UCNAllowedCharRanges[MidPoint].Upper)
+      LowPoint = MidPoint + 1;
+    else
+      return true;
+  }
+  return false;
+}
+
+static bool isAllowedInitiallyIDChar(unsigned c) {
+  return isAllowedIDChar(c) &&
+         !(0x0300 <= c && c <= 0x036F) &&
+         !(0x1DC0 <= c && c <= 0x1DFF) &&
+         !(0x20D0 <= c && c <= 0x20FF) &&
+         !(0xFE20 <= c && c <= 0xFE2F);
+}
+
+static void AppendCodePoint(unsigned Codepoint,
+                            llvm::SmallVectorImpl<char> &Str) {
+  char ResultBuf[4];
+  char *ResultPtr = ResultBuf;
+  bool Res = ConvertCodePointToUTF8(Codepoint, ResultPtr);
+  (void)Res;
+  assert(Res && "Unexpected conversion failure");
+  Str.append(ResultBuf, ResultPtr);
+}
+
 /// LookUpIdentifierInfo - Given a tok::raw_identifier token, look up the
 /// identifier information for the token and install it into the token,
 /// updating the token kind accordingly.
@@ -504,14 +579,52 @@
 
   // Look up this token, see if it is a macro, or if it is a language keyword.
   IdentifierInfo *II;
-  if (!Identifier.needsCleaning()) {
+  if (!Identifier.needsCleaning() && !(Identifier.getFlags() & Token::HasUCN)) {
     // No cleaning needed, just use the characters from the lexed buffer.
     II = getIdentifierInfo(StringRef(Identifier.getRawIdentifierData(),
                                            Identifier.getLength()));
   } else {
     // Cleaning needed, alloca a buffer, clean into it, then use the buffer.
     SmallString<64> IdentifierBuffer;
+    SmallString<64> UCNIdentifierBuffer;
     StringRef CleanedStr = getSpelling(Identifier, IdentifierBuffer);
+    if (Identifier.getFlags() & Token::HasUCN) {
+      for (unsigned i = 0, e = CleanedStr.size(); i != e; ++i) {
+        if (CleanedStr[i] == '\\') {
+          unsigned UcnVal;
+          unsigned NumChars;
+          if (CleanedStr[i+1] == 'u') {
+            UcnVal = (HexDigitValue(CleanedStr[i+2]) << 12) +
+                     (HexDigitValue(CleanedStr[i+3]) << 8) +
+                     (HexDigitValue(CleanedStr[i+4]) << 4) +
+                     (HexDigitValue(CleanedStr[i+5]));
+            NumChars = 6;
+          } else {
+            assert(CleanedStr[i+1] == 'U');
+            UcnVal = (HexDigitValue(CleanedStr[i+2]) << 28) +
+                     (HexDigitValue(CleanedStr[i+3]) << 24) +
+                     (HexDigitValue(CleanedStr[i+4]) << 20) +
+                     (HexDigitValue(CleanedStr[i+5]) << 16) +
+                     (HexDigitValue(CleanedStr[i+6]) << 12) +
+                     (HexDigitValue(CleanedStr[i+7]) << 8) +
+                     (HexDigitValue(CleanedStr[i+8]) << 4) +
+                     (HexDigitValue(CleanedStr[i+9]));
+            NumChars = 10;
+          }
+          if (UCNIdentifierBuffer.empty() ? !isAllowedInitiallyIDChar(UcnVal) :
+                                            !isAllowedIDChar(UcnVal)) {
+            StringRef CurCharacter = CleanedStr.substr(i, NumChars);
+            Diag(Identifier, diag::err_ucn_invalid_in_id) << CurCharacter;
+            UcnVal = 0xFFFD;
+          }
+          AppendCodePoint(UcnVal, UCNIdentifierBuffer);
+          i += NumChars - 1;
+        } else {
+          UCNIdentifierBuffer.push_back(CleanedStr[i]);
+        }
+      }
+      CleanedStr = UCNIdentifierBuffer;
+    }
     II = getIdentifierInfo(CleanedStr);
   }