r173931 - Comment parsing: resolve more named character references

Dmitri Gribenko gribozavr at gmail.com
Wed Jan 30 06:29:28 PST 2013


Author: gribozavr
Date: Wed Jan 30 08:29:28 2013
New Revision: 173931

URL: http://llvm.org/viewvc/llvm-project?rev=173931&view=rev
Log:
Comment parsing: resolve more named character references

This reimplements r173850 with a better approach:
(1) use a TableGen-generated matcher instead of doing a linear search;
(2) avoid allocations for new strings by converting code points to string
    literals with TableGen.

Added:
    cfe/trunk/include/clang/AST/CommentHTMLNamedCharacterReferences.td
    cfe/trunk/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
Modified:
    cfe/trunk/include/clang/AST/CMakeLists.txt
    cfe/trunk/include/clang/AST/CommentLexer.h
    cfe/trunk/include/clang/AST/Makefile
    cfe/trunk/lib/AST/CMakeLists.txt
    cfe/trunk/lib/AST/CommentLexer.cpp
    cfe/trunk/utils/TableGen/CMakeLists.txt
    cfe/trunk/utils/TableGen/TableGen.cpp
    cfe/trunk/utils/TableGen/TableGenBackends.h

Modified: cfe/trunk/include/clang/AST/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/AST/CMakeLists.txt?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/include/clang/AST/CMakeLists.txt (original)
+++ cfe/trunk/include/clang/AST/CMakeLists.txt Wed Jan 30 08:29:28 2013
@@ -33,6 +33,10 @@ clang_tablegen(CommentHTMLTagsProperties
   SOURCE CommentHTMLTags.td
   TARGET ClangCommentHTMLTagsProperties)
 
+clang_tablegen(CommentHTMLNamedCharacterReferences.inc -gen-clang-comment-html-named-character-references
+  SOURCE CommentHTMLNamedCharacterReferences.td
+  TARGET ClangCommentHTMLNamedCharacterReferences)
+
 clang_tablegen(CommentCommandInfo.inc -gen-clang-comment-command-info
   SOURCE CommentCommands.td
   TARGET ClangCommentCommandInfo)

Added: cfe/trunk/include/clang/AST/CommentHTMLNamedCharacterReferences.td
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/AST/CommentHTMLNamedCharacterReferences.td?rev=173931&view=auto
==============================================================================
--- cfe/trunk/include/clang/AST/CommentHTMLNamedCharacterReferences.td (added)
+++ cfe/trunk/include/clang/AST/CommentHTMLNamedCharacterReferences.td Wed Jan 30 08:29:28 2013
@@ -0,0 +1,177 @@
+// HTML Named Character Reference
+class NCR<string spelling, int codePoint> {
+  string Spelling = spelling;
+  int CodePoint = codePoint;
+}
+
+// The list below includes named character references supported by Doxygen:
+// http://www.stack.nl/~dimitri/doxygen/manual/htmlcmds.html
+//
+// It does not include all HTML 5 named character references.
+//
+// Corresponding code point values can be found here:
+// http://www.w3.org/TR/2011/WD-html5-20110113/named-character-references.html
+
+def : NCR<"copy",  0x000A9>;
+def : NCR<"COPY",  0x000A9>;
+def : NCR<"trade", 0x02122>;
+def : NCR<"TRADE", 0x02122>;
+def : NCR<"reg",   0x000AE>;
+def : NCR<"REG",   0x000AE>;
+def : NCR<"lt",    0x0003C>;
+def : NCR<"Lt",    0x0226A>; // MUCH LESS-THAN in HTML5, not '<'.
+def : NCR<"LT",    0x0003C>;
+def : NCR<"gt",    0x0003E>;
+def : NCR<"Gt",    0x0226B>; // MUCH GREATER-THAN in HTML5, not '>'.
+def : NCR<"GT",    0x0003E>;
+def : NCR<"amp",   0x00026>;
+def : NCR<"AMP",   0x00026>;
+def : NCR<"apos",  0x00027>;
+def : NCR<"quot",  0x00022>;
+def : NCR<"QUOT",  0x00022>;
+def : NCR<"lsquo", 0x02018>;
+def : NCR<"rsquo", 0x02019>;
+def : NCR<"ldquo", 0x0201C>;
+def : NCR<"rdquo", 0x0201D>;
+def : NCR<"ndash", 0x02013>;
+def : NCR<"mdash", 0x02014>;
+
+def : NCR<"Auml", 0x000C4>;
+def : NCR<"Euml", 0x000CB>;
+def : NCR<"Iuml", 0x000CF>;
+def : NCR<"Ouml", 0x000D6>;
+def : NCR<"Uuml", 0x000DC>;
+def : NCR<"Yuml", 0x00178>;
+def : NCR<"auml", 0x000E4>;
+def : NCR<"euml", 0x000EB>;
+def : NCR<"iuml", 0x000EF>;
+def : NCR<"ouml", 0x000F6>;
+def : NCR<"uuml", 0x000FC>;
+def : NCR<"yuml", 0x000FF>;
+
+def : NCR<"Aacute", 0x000C1>;
+def : NCR<"Eacute", 0x000C9>;
+def : NCR<"Iacute", 0x000CD>;
+def : NCR<"Oacute", 0x000D3>;
+def : NCR<"Uacute", 0x000DA>;
+def : NCR<"Yacute", 0x000DD>;
+def : NCR<"aacute", 0x000E1>;
+def : NCR<"eacute", 0x000E9>;
+def : NCR<"iacute", 0x000ED>;
+def : NCR<"oacute", 0x000F3>;
+def : NCR<"uacute", 0x000FA>;
+def : NCR<"yacute", 0x000FD>;
+
+def : NCR<"Agrave", 0x000C0>;
+def : NCR<"Egrave", 0x000C8>;
+def : NCR<"Igrave", 0x000CC>;
+def : NCR<"Ograve", 0x000D2>;
+def : NCR<"Ugrave", 0x000D9>;
+// def : NCR<"Ygrave", 0x01EF2>; // Defined neither in Doxygen, nor in HTML5.
+def : NCR<"agrave", 0x000E0>;
+def : NCR<"egrave", 0x000E8>;
+def : NCR<"igrave", 0x000EC>;
+def : NCR<"ograve", 0x000F2>;
+def : NCR<"ugrave", 0x000F9>;
+def : NCR<"ygrave", 0x01EF3>; // Defined in Doxygen, not defined in HTML5.
+
+def : NCR<"Acirc", 0x000C2>;
+def : NCR<"Ecirc", 0x000CA>;
+def : NCR<"Icirc", 0x000CE>;
+def : NCR<"Ocirc", 0x000D4>;
+def : NCR<"Ucirc", 0x000DB>;
+def : NCR<"Ycirc", 0x00176>; // Not defined in Doxygen, defined in HTML5.
+def : NCR<"acirc", 0x000E2>;
+def : NCR<"ecirc", 0x000EA>;
+def : NCR<"icirc", 0x000EE>;
+def : NCR<"ocirc", 0x000F4>;
+def : NCR<"ucirc", 0x000FB>;
+def : NCR<"ycirc", 0x00177>;
+
+def : NCR<"Atilde", 0x000C3>;
+def : NCR<"Ntilde", 0x000D1>;
+def : NCR<"Otilde", 0x000D5>;
+def : NCR<"atilde", 0x000E3>;
+def : NCR<"ntilde", 0x000F1>;
+def : NCR<"otilde", 0x000F5>;
+
+def : NCR<"szlig", 0x000DF>;
+
+def : NCR<"ccedil", 0x000E7>;
+def : NCR<"Ccedil", 0x000C7>;
+
+def : NCR<"aring", 0x000E5>;
+def : NCR<"Aring", 0x000C5>;
+
+def : NCR<"nbsp", 0x000A0>;
+
+def : NCR<"Gamma",   0x00393>;
+def : NCR<"Delta",   0x00394>;
+def : NCR<"Theta",   0x00398>;
+def : NCR<"Lambda",  0x0039B>;
+def : NCR<"Xi",      0x0039E>;
+def : NCR<"Pi",      0x003A0>;
+def : NCR<"Sigma",   0x003A3>;
+def : NCR<"Upsilon", 0x003A5>;
+def : NCR<"Phi",     0x003A6>;
+def : NCR<"Psi",     0x003A8>;
+def : NCR<"Omega",   0x003A9>;
+
+def : NCR<"alpha",   0x003B1>;
+def : NCR<"beta",    0x003B2>;
+def : NCR<"gamma",   0x003B3>;
+def : NCR<"delta",   0x003B4>;
+def : NCR<"epsilon", 0x003B5>;
+def : NCR<"zeta",    0x003B6>;
+def : NCR<"eta",     0x003B7>;
+def : NCR<"theta",   0x003B8>;
+def : NCR<"iota",    0x003B9>;
+def : NCR<"kappa",   0x003BA>;
+def : NCR<"lambda",  0x003BB>;
+def : NCR<"mu",      0x003BC>;
+def : NCR<"nu",      0x003BD>;
+def : NCR<"xi",      0x003BE>;
+def : NCR<"pi",      0x003C0>;
+def : NCR<"rho",     0x003C1>;
+def : NCR<"sigma",   0x003C3>;
+def : NCR<"tau",     0x003C4>;
+def : NCR<"upsilon", 0x003C5>;
+def : NCR<"phi",     0x003C6>;
+def : NCR<"chi",     0x003C7>;
+def : NCR<"psi",     0x003C8>;
+def : NCR<"omega",   0x003C9>;
+def : NCR<"sigmaf",  0x003C2>;
+
+def : NCR<"sect",   0x000A7>;
+def : NCR<"deg",    0x000B0>;
+def : NCR<"prime",  0x02032>;
+def : NCR<"Prime",  0x02033>;
+def : NCR<"infin",  0x0221E>;
+def : NCR<"empty",  0x02205>;
+def : NCR<"plusmn", 0x000B1>;
+def : NCR<"times",  0x000D7>;
+def : NCR<"minus",  0x02212>;
+def : NCR<"sdot",   0x022C5>;
+def : NCR<"part",   0x02202>;
+def : NCR<"nabla",  0x02207>;
+def : NCR<"radic",  0x0221A>;
+def : NCR<"perp",   0x022A5>;
+def : NCR<"sum",    0x02211>;
+def : NCR<"int",    0x0222B>;
+def : NCR<"prod",   0x0220F>;
+def : NCR<"sim",    0x0223C>;
+def : NCR<"asymp",  0x02248>;
+def : NCR<"ne",     0x02260>;
+def : NCR<"equiv",  0x02261>;
+def : NCR<"prop",   0x0221D>;
+def : NCR<"le",     0x02264>;
+def : NCR<"ge",     0x02265>;
+def : NCR<"larr",   0x02190>;
+def : NCR<"rarr",   0x02192>;
+def : NCR<"isin",   0x02208>;
+def : NCR<"notin",  0x02209>;
+def : NCR<"lceil",  0x02308>;
+def : NCR<"rceil",  0x02309>;
+def : NCR<"lfloor", 0x0230A>;
+def : NCR<"rfloor", 0x0230B>;
+

Modified: cfe/trunk/include/clang/AST/CommentLexer.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/AST/CommentLexer.h?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/include/clang/AST/CommentLexer.h (original)
+++ cfe/trunk/include/clang/AST/CommentLexer.h Wed Jan 30 08:29:28 2013
@@ -282,18 +282,11 @@ private:
   /// it stands for (e.g., "<").
   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
 
-  /// Given a Doxygen-supported named character reference (e.g., "™"),
-  /// it returns its UTF8 encoding.
-  StringRef HTMLDoxygenCharacterReference(StringRef Name) const;
-
   /// Given a Unicode codepoint as base-10 integer, return the character.
   StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
 
   /// Given a Unicode codepoint as base-16 integer, return the character.
   StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
-  
-  /// Helper routine to do part of the work for resolveHTMLHexCharacterReference.
-  StringRef helperResolveHTMLHexCharacterReference(unsigned CodePoint) const;
 
   void formTokenWithChars(Token &Result, const char *TokEnd,
                           tok::TokenKind Kind) {

Modified: cfe/trunk/include/clang/AST/Makefile
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/AST/Makefile?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/include/clang/AST/Makefile (original)
+++ cfe/trunk/include/clang/AST/Makefile Wed Jan 30 08:29:28 2013
@@ -3,7 +3,9 @@ TD_SRC_DIR = $(PROJ_SRC_DIR)/../Basic
 BUILT_SOURCES = Attrs.inc AttrImpl.inc AttrDump.inc \
                 StmtNodes.inc DeclNodes.inc \
                 CommentNodes.inc CommentHTMLTags.inc \
-                CommentHTMLTagsProperties.inc CommentCommandInfo.inc
+                CommentHTMLTagsProperties.inc \
+                CommentHTMLNamedCharacterReferences.inc \
+                CommentCommandInfo.inc
 
 TABLEGEN_INC_FILES_COMMON = 1
 
@@ -52,6 +54,12 @@ $(ObjDir)/CommentHTMLTagsProperties.inc.
 	$(Echo) "Building Clang comment HTML tag properties with tblgen"
 	$(Verb) $(ClangTableGen) -gen-clang-comment-html-tags-properties -o $(call SYSPATH, $@) $<
 
+$(ObjDir)/CommentHTMLNamedCharacterReferences.inc.tmp : \
+                    $(PROJ_SRC_DIR)/CommentHTMLNamedCharacterReferences.td \
+                    $(CLANG_TBLGEN) $(ObjDir)/.dir
+	$(Echo) "Building Clang named character reference translation function with tblgen"
+	$(Verb) $(ClangTableGen) -gen-clang-comment-html-named-character-references -o $(call SYSPATH, $@) $<
+
 $(ObjDir)/CommentCommandInfo.inc.tmp : $(PROJ_SRC_DIR)/CommentCommands.td \
                                               $(CLANG_TBLGEN) $(ObjDir)/.dir
 	$(Echo) "Building Clang comment command info with tblgen"

Modified: cfe/trunk/lib/AST/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/CMakeLists.txt?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/lib/AST/CMakeLists.txt (original)
+++ cfe/trunk/lib/AST/CMakeLists.txt Wed Jan 30 08:29:28 2013
@@ -68,6 +68,7 @@ add_dependencies(clangAST
   ClangCommentNodes
   ClangCommentHTMLTags
   ClangCommentHTMLTagsProperties
+  ClangCommentHTMLNamedCharacterReferences
   ClangDeclNodes
   ClangDiagnosticAST
   ClangDiagnosticComment

Modified: cfe/trunk/lib/AST/CommentLexer.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/CommentLexer.cpp?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/lib/AST/CommentLexer.cpp (original)
+++ cfe/trunk/lib/AST/CommentLexer.cpp Wed Jan 30 08:29:28 2013
@@ -30,22 +30,8 @@ bool isHTMLHexCharacterReferenceCharacte
          (C >= 'A' && C <= 'F');
 }
 
-#include "clang/AST/CommentHTMLTags.inc"
-
-} // unnamed namespace
-
-static unsigned getCodePoint(StringRef Name) {
-  unsigned CodePoint = 0;
-  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
-    CodePoint *= 16;
-    const char C = Name[i];
-    assert(isHTMLHexCharacterReferenceCharacter(C));
-    CodePoint += llvm::hexDigitValue(C);
-  }
-  return CodePoint;
-}
-
-StringRef Lexer::helperResolveHTMLHexCharacterReference(unsigned CodePoint) const {
+StringRef convertCodePointToUTF8(llvm::BumpPtrAllocator &Allocator,
+                                 unsigned CodePoint) {
   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
   char *ResolvedPtr = Resolved;
   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
@@ -53,164 +39,22 @@ StringRef Lexer::helperResolveHTMLHexCha
   else
     return StringRef();
 }
-  
-StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
-  unsigned CodePoint = getCodePoint(Name);
-  return helperResolveHTMLHexCharacterReference(CodePoint);
-}
+
+#include "clang/AST/CommentHTMLTags.inc"
+#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
+
+} // unnamed namespace
 
 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
+  // Fast path, first check a few most widely used named character references.
   return llvm::StringSwitch<StringRef>(Name)
       .Case("amp", "&")
       .Case("lt", "<")
       .Case("gt", ">")
       .Case("quot", "\"")
       .Case("apos", "\'")
-      .Default("");
-}
-  
-StringRef Lexer::HTMLDoxygenCharacterReference(StringRef Name) const {
-  return llvm::StringSwitch<StringRef>(Name)
-  .Case("copy", helperResolveHTMLHexCharacterReference(0x000A9))
-  .Case("trade",        helperResolveHTMLHexCharacterReference(0x02122))
-  .Case("reg",  helperResolveHTMLHexCharacterReference(0x000AE))
-  .Case("lt",   helperResolveHTMLHexCharacterReference(0x0003C))
-  .Case("gt",   helperResolveHTMLHexCharacterReference(0x0003C))
-  .Case("amp",  helperResolveHTMLHexCharacterReference(0x00026))
-  .Case("apos", helperResolveHTMLHexCharacterReference(0x00027))
-  .Case("quot", helperResolveHTMLHexCharacterReference(0x00022))
-  .Case("lsquo",        helperResolveHTMLHexCharacterReference(0x02018))
-  .Case("rsquo",        helperResolveHTMLHexCharacterReference(0x02019))
-  .Case("ldquo",        helperResolveHTMLHexCharacterReference(0x0201C))
-  .Case("rdquo",        helperResolveHTMLHexCharacterReference(0x0201D))
-  .Case("ndash",        helperResolveHTMLHexCharacterReference(0x02013))
-  .Case("mdash",        helperResolveHTMLHexCharacterReference(0x02014))
-  .Case("Auml", helperResolveHTMLHexCharacterReference(0x000C4))
-  .Case("Euml", helperResolveHTMLHexCharacterReference(0x000CB))
-  .Case("Iuml", helperResolveHTMLHexCharacterReference(0x000CF))
-  .Case("Ouml", helperResolveHTMLHexCharacterReference(0x000D6))
-  .Case("Uuml", helperResolveHTMLHexCharacterReference(0x000DC))
-  .Case("Yuml", helperResolveHTMLHexCharacterReference(0x00178))
-  .Case("auml", helperResolveHTMLHexCharacterReference(0x000E4))
-  .Case("euml", helperResolveHTMLHexCharacterReference(0x000EB))
-  .Case("iuml", helperResolveHTMLHexCharacterReference(0x000EF))
-  .Case("ouml", helperResolveHTMLHexCharacterReference(0x000F6))
-  .Case("uuml", helperResolveHTMLHexCharacterReference(0x000FC))
-  .Case("yuml", helperResolveHTMLHexCharacterReference(0x000FF))
-  .Case("Aacute",       helperResolveHTMLHexCharacterReference(0x000C1))
-  .Case("Eacute",       helperResolveHTMLHexCharacterReference(0x000C9))
-  .Case("Iacute",       helperResolveHTMLHexCharacterReference(0x000CD))
-  .Case("Oacute",       helperResolveHTMLHexCharacterReference(0x000D3))
-  .Case("Uacute",       helperResolveHTMLHexCharacterReference(0x000DA))
-  .Case("Yacute",       helperResolveHTMLHexCharacterReference(0x000DD))
-  .Case("aacute",       helperResolveHTMLHexCharacterReference(0x000E1))
-  .Case("eacute",       helperResolveHTMLHexCharacterReference(0x000E9))
-  .Case("iacute",       helperResolveHTMLHexCharacterReference(0x000ED))
-  .Case("oacute",       helperResolveHTMLHexCharacterReference(0x000F3))
-  .Case("uacute",       helperResolveHTMLHexCharacterReference(0x000FA))
-  .Case("yacute",       helperResolveHTMLHexCharacterReference(0x000FD))
-  .Case("Agrave",       helperResolveHTMLHexCharacterReference(0x000C0))
-  .Case("Egrave",       helperResolveHTMLHexCharacterReference(0x000C8))
-  .Case("Igrave",       helperResolveHTMLHexCharacterReference(0x000CC))
-  .Case("Ograve",       helperResolveHTMLHexCharacterReference(0x000D2))
-  .Case("Ugrave",       helperResolveHTMLHexCharacterReference(0x000D9))
-  .Case("agrave",       helperResolveHTMLHexCharacterReference(0x000E0))
-  .Case("egrave",       helperResolveHTMLHexCharacterReference(0x000E8))
-  .Case("igrave",       helperResolveHTMLHexCharacterReference(0x000EC))
-  .Case("ograve",       helperResolveHTMLHexCharacterReference(0x000F2))
-  .Case("ugrave",       helperResolveHTMLHexCharacterReference(0x000F9))
-  .Case("ygrave",       helperResolveHTMLHexCharacterReference(0x01EF3))
-  .Case("Acirc",        helperResolveHTMLHexCharacterReference(0x000C2))
-  .Case("Ecirc",        helperResolveHTMLHexCharacterReference(0x000CA))
-  .Case("Icirc",        helperResolveHTMLHexCharacterReference(0x000CE))
-  .Case("Ocirc",        helperResolveHTMLHexCharacterReference(0x000D4))
-  .Case("Ucirc",        helperResolveHTMLHexCharacterReference(0x000DB))
-  .Case("acirc",        helperResolveHTMLHexCharacterReference(0x000E2))
-  .Case("ecirc",        helperResolveHTMLHexCharacterReference(0x000EA))
-  .Case("icirc",        helperResolveHTMLHexCharacterReference(0x000EE))
-  .Case("ocirc",        helperResolveHTMLHexCharacterReference(0x000F4))
-  .Case("ucirc",        helperResolveHTMLHexCharacterReference(0x000FB))
-  .Case("ycirc",        helperResolveHTMLHexCharacterReference(0x00177))
-  .Case("Atilde",       helperResolveHTMLHexCharacterReference(0x000C3))
-  .Case("Ntilde",       helperResolveHTMLHexCharacterReference(0x000D1))
-  .Case("Otilde",       helperResolveHTMLHexCharacterReference(0x000D5))
-  .Case("atilde",       helperResolveHTMLHexCharacterReference(0x000E3))
-  .Case("ntilde",       helperResolveHTMLHexCharacterReference(0x000F1))
-  .Case("otilde",       helperResolveHTMLHexCharacterReference(0x000F5))
-  .Case("szlig",        helperResolveHTMLHexCharacterReference(0x000DF))
-  .Case("ccedil",       helperResolveHTMLHexCharacterReference(0x000E7))
-  .Case("Ccedil",       helperResolveHTMLHexCharacterReference(0x000C7))
-  .Case("aring",        helperResolveHTMLHexCharacterReference(0x000E5))
-  .Case("Aring",        helperResolveHTMLHexCharacterReference(0x000C5))
-  .Case("nbsp", helperResolveHTMLHexCharacterReference(0x000A0))
-  .Case("Gamma",        helperResolveHTMLHexCharacterReference(0x00393))
-  .Case("Delta",        helperResolveHTMLHexCharacterReference(0x00394))
-  .Case("Theta",        helperResolveHTMLHexCharacterReference(0x00398))
-  .Case("Lambda",       helperResolveHTMLHexCharacterReference(0x0039B))
-  .Case("Xi",   helperResolveHTMLHexCharacterReference(0x0039E))
-  .Case("Pi",   helperResolveHTMLHexCharacterReference(0x003A0))
-  .Case("Sigma",        helperResolveHTMLHexCharacterReference(0x003A3))
-  .Case("Upsilon",      helperResolveHTMLHexCharacterReference(0x003A5))
-  .Case("Phi",  helperResolveHTMLHexCharacterReference(0x003A6))
-  .Case("Psi",  helperResolveHTMLHexCharacterReference(0x003A8))
-  .Case("Omega",        helperResolveHTMLHexCharacterReference(0x003A9))
-  .Case("alpha",        helperResolveHTMLHexCharacterReference(0x003B1))
-  .Case("beta", helperResolveHTMLHexCharacterReference(0x003B2))
-  .Case("gamma",        helperResolveHTMLHexCharacterReference(0x003B3))
-  .Case("delta",        helperResolveHTMLHexCharacterReference(0x003B4))
-  .Case("epsilon",      helperResolveHTMLHexCharacterReference(0x003B5))
-  .Case("zeta", helperResolveHTMLHexCharacterReference(0x003B6))
-  .Case("eta",  helperResolveHTMLHexCharacterReference(0x003B7))
-  .Case("theta",        helperResolveHTMLHexCharacterReference(0x003B8))
-  .Case("iota", helperResolveHTMLHexCharacterReference(0x003B9))
-  .Case("kappa",        helperResolveHTMLHexCharacterReference(0x003BA))
-  .Case("lambda",       helperResolveHTMLHexCharacterReference(0x003BB))
-  .Case("mu",   helperResolveHTMLHexCharacterReference(0x003BC))
-  .Case("nu",   helperResolveHTMLHexCharacterReference(0x003BD))
-  .Case("xi",   helperResolveHTMLHexCharacterReference(0x003BE))
-  .Case("pi",   helperResolveHTMLHexCharacterReference(0x003C0))
-  .Case("rho",  helperResolveHTMLHexCharacterReference(0x003C1))
-  .Case("sigma",        helperResolveHTMLHexCharacterReference(0x003C3))
-  .Case("tau",  helperResolveHTMLHexCharacterReference(0x003C4))
-  .Case("upsilon",      helperResolveHTMLHexCharacterReference(0x003C5))
-  .Case("phi",  helperResolveHTMLHexCharacterReference(0x003C6))
-  .Case("chi",  helperResolveHTMLHexCharacterReference(0x003C7))
-  .Case("psi",  helperResolveHTMLHexCharacterReference(0x003C8))
-  .Case("omega",        helperResolveHTMLHexCharacterReference(0x003C9))
-  .Case("sigmaf",       helperResolveHTMLHexCharacterReference(0x003C2))
-  .Case("sect", helperResolveHTMLHexCharacterReference(0x000A7))
-  .Case("deg",  helperResolveHTMLHexCharacterReference(0x000B0))
-  .Case("prime",        helperResolveHTMLHexCharacterReference(0x02032))
-  .Case("Prime",        helperResolveHTMLHexCharacterReference(0x02033))
-  .Case("infin",        helperResolveHTMLHexCharacterReference(0x0221E))
-  .Case("empty",        helperResolveHTMLHexCharacterReference(0x02205))
-  .Case("plusmn",       helperResolveHTMLHexCharacterReference(0x000B1))
-  .Case("times",        helperResolveHTMLHexCharacterReference(0x000D7))
-  .Case("minus",        helperResolveHTMLHexCharacterReference(0x02212))
-  .Case("sdot", helperResolveHTMLHexCharacterReference(0x022C5))
-  .Case("part", helperResolveHTMLHexCharacterReference(0x02202))
-  .Case("nabla",        helperResolveHTMLHexCharacterReference(0x02207))
-  .Case("radic",        helperResolveHTMLHexCharacterReference(0x0221A))
-  .Case("perp", helperResolveHTMLHexCharacterReference(0x022A5))
-  .Case("sum",  helperResolveHTMLHexCharacterReference(0x02211))
-  .Case("int",  helperResolveHTMLHexCharacterReference(0x0222B))
-  .Case("prod", helperResolveHTMLHexCharacterReference(0x0220F))
-  .Case("sim",  helperResolveHTMLHexCharacterReference(0x0223C))
-  .Case("asymp",        helperResolveHTMLHexCharacterReference(0x02248))
-  .Case("ne",   helperResolveHTMLHexCharacterReference(0x02260))
-  .Case("equiv",        helperResolveHTMLHexCharacterReference(0x02261))
-  .Case("prop", helperResolveHTMLHexCharacterReference(0x0221D))
-  .Case("le",   helperResolveHTMLHexCharacterReference(0x02264))
-  .Case("ge",   helperResolveHTMLHexCharacterReference(0x02265))
-  .Case("larr", helperResolveHTMLHexCharacterReference(0x02190))
-  .Case("rarr", helperResolveHTMLHexCharacterReference(0x02192))
-  .Case("isin", helperResolveHTMLHexCharacterReference(0x02208))
-  .Case("notin",        helperResolveHTMLHexCharacterReference(0x02209))
-  .Case("lceil",        helperResolveHTMLHexCharacterReference(0x02308))
-  .Case("rceil",        helperResolveHTMLHexCharacterReference(0x02309))
-  .Case("lfloor",       helperResolveHTMLHexCharacterReference(0x0230A))
-  .Case("rfloor",       helperResolveHTMLHexCharacterReference(0x0230B))
-  .Default("");
+      // Slow path (note: evaluated eagerly, even when a case above matched).
+      .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
 }
 
 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
@@ -220,13 +64,18 @@ StringRef Lexer::resolveHTMLDecimalChara
     CodePoint *= 10;
     CodePoint += Name[i] - '0';
   }
+  return convertCodePointToUTF8(Allocator, CodePoint);
+}
 
-  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
-  char *ResolvedPtr = Resolved;
-  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
-    return StringRef(Resolved, ResolvedPtr - Resolved);
-  else
-    return StringRef();
+StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
+  unsigned CodePoint = 0;
+  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
+    CodePoint *= 16;
+    const char C = Name[i];
+    assert(isHTMLHexCharacterReferenceCharacter(C));
+    CodePoint += llvm::hexDigitValue(C);
+  }
+  return convertCodePointToUTF8(Allocator, CodePoint);
 }
 
 void Lexer::skipLineStartingDecorations() {
@@ -725,17 +574,8 @@ void Lexer::lexHTMLCharacterReference(To
   StringRef Name(NamePtr, TokenPtr - NamePtr);
   TokenPtr++; // Skip semicolon.
   StringRef Resolved;
-  if (isNamed) {
+  if (isNamed)
     Resolved = resolveHTMLNamedCharacterReference(Name);
-    if (Resolved.empty()) {
-      Resolved = HTMLDoxygenCharacterReference(Name);
-      if (!Resolved.empty()) {
-        formTokenWithChars(T, TokenPtr, tok::text);
-        T.setText(Resolved);
-        return;
-      }
-    }
-  }
   else if (isDecimal)
     Resolved = resolveHTMLDecimalCharacterReference(Name);
   else

Modified: cfe/trunk/utils/TableGen/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/utils/TableGen/CMakeLists.txt?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/utils/TableGen/CMakeLists.txt (original)
+++ cfe/trunk/utils/TableGen/CMakeLists.txt Wed Jan 30 08:29:28 2013
@@ -4,6 +4,7 @@ add_tablegen(clang-tblgen CLANG
   ClangASTNodesEmitter.cpp
   ClangAttrEmitter.cpp
   ClangCommentCommandInfoEmitter.cpp
+  ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
   ClangCommentHTMLTagsEmitter.cpp
   ClangDiagnosticsEmitter.cpp
   ClangSACheckersEmitter.cpp

Added: cfe/trunk/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp?rev=173931&view=auto
==============================================================================
--- cfe/trunk/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp (added)
+++ cfe/trunk/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp Wed Jan 30 08:29:28 2013
@@ -0,0 +1,83 @@
+//===--- ClangCommentHTMLNamedCharacterReferenceEmitter.cpp -----------------=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This tablegen backend emits an efficient function to translate HTML named
+// character references to UTF-8 sequences.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/StringMatcher.h"
+#include <vector>
+
+using namespace llvm;
+
+/// \brief Append to \p CLiteral a double-quoted C string literal holding the
+/// UTF-8 encoding of \p CodePoint, written as one hexadecimal escape per byte.
+///
+/// \returns true on success; false (appending nothing) if conversion fails.
+static bool translateCodePointToUTF8(unsigned CodePoint,
+                                     SmallVectorImpl<char> &CLiteral) {
+  char Translated[UNI_MAX_UTF8_BYTES_PER_CODE_POINT];
+  char *TranslatedPtr = Translated;
+  if (!ConvertCodePointToUTF8(CodePoint, TranslatedPtr))
+    return false;
+
+  StringRef UTF8(Translated, TranslatedPtr - Translated);
+
+  raw_svector_ostream OS(CLiteral);
+  OS << "\"";
+  for (size_t i = 0, e = UTF8.size(); i != e; ++i) {
+    OS << "\\x";
+    OS.write_hex(static_cast<unsigned char>(UTF8[i]));
+  }
+  OS << "\"";
+
+  return true;
+}
+
+namespace clang {
+void EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records,
+                                                  raw_ostream &OS) {
+  std::vector<Record *> Tags = Records.getAllDerivedDefinitions("NCR");
+  std::vector<StringMatcher::StringPair> NameToUTF8;
+  SmallString<32> CLiteral; // Scratch buffer; cleared for every record below.
+  for (std::vector<Record *>::iterator I = Tags.begin(), E = Tags.end();
+       I != E; ++I) {
+    Record &Tag = **I;
+    std::string Spelling = Tag.getValueAsString("Spelling");
+    uint64_t CodePoint = Tag.getValueAsInt("CodePoint");
+    CLiteral.clear();
+    CLiteral.append("return "); // Action text is "return <literal>;".
+    if (!translateCodePointToUTF8(CodePoint, CLiteral)) {
+      SrcMgr.PrintMessage(Tag.getLoc().front(),
+                          SourceMgr::DK_Error,
+                          Twine("invalid code point"));
+      continue; // Leave the offending record out of the matcher.
+    }
+    CLiteral.append(";");
+
+    StringMatcher::StringPair Match(Spelling, CLiteral.str());
+    NameToUTF8.push_back(Match);
+  }
+
+  OS << "// This file is generated by TableGen.  Do not edit.\n\n";
+
+  OS << "StringRef translateHTMLNamedCharacterReferenceToUTF8(\n"
+        "                                             StringRef Name) {\n";
+  StringMatcher("Name", NameToUTF8, OS).Emit();
+  OS << "  return StringRef();\n"
+     << "}\n\n";
+}
+
+} // end namespace clang
+

Modified: cfe/trunk/utils/TableGen/TableGen.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/utils/TableGen/TableGen.cpp?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/utils/TableGen/TableGen.cpp (original)
+++ cfe/trunk/utils/TableGen/TableGen.cpp Wed Jan 30 08:29:28 2013
@@ -44,6 +44,7 @@ enum ActionType {
   GenClangSACheckers,
   GenClangCommentHTMLTags,
   GenClangCommentHTMLTagsProperties,
+  GenClangCommentHTMLNamedCharacterReferences,
   GenClangCommentCommandInfo,
   GenOptParserDefs, GenOptParserImpl,
   GenArmNeon,
@@ -111,6 +112,10 @@ namespace {
                                "gen-clang-comment-html-tags-properties",
                                "Generate efficient matchers for HTML tag "
                                "properties"),
+                    clEnumValN(GenClangCommentHTMLNamedCharacterReferences,
+                               "gen-clang-comment-html-named-character-references",
+                               "Generate function to translate named character "
+                               "references to UTF-8 sequences"),
                     clEnumValN(GenClangCommentCommandInfo,
                                "gen-clang-comment-command-info",
                                "Generate list of commands that are used in "
@@ -194,6 +199,9 @@ bool ClangTableGenMain(raw_ostream &OS, 
   case GenClangCommentHTMLTagsProperties:
     EmitClangCommentHTMLTagsProperties(Records, OS);
     break;
+  case GenClangCommentHTMLNamedCharacterReferences:
+    EmitClangCommentHTMLNamedCharacterReferences(Records, OS);
+    break;
   case GenClangCommentCommandInfo:
     EmitClangCommentCommandInfo(Records, OS);
     break;

Modified: cfe/trunk/utils/TableGen/TableGenBackends.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/utils/TableGen/TableGenBackends.h?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/utils/TableGen/TableGenBackends.h (original)
+++ cfe/trunk/utils/TableGen/TableGenBackends.h Wed Jan 30 08:29:28 2013
@@ -51,6 +51,7 @@ void EmitClangSACheckers(RecordKeeper &R
 
 void EmitClangCommentHTMLTags(RecordKeeper &Records, raw_ostream &OS);
 void EmitClangCommentHTMLTagsProperties(RecordKeeper &Records, raw_ostream &OS);
+void EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records, raw_ostream &OS);
 
 void EmitClangCommentCommandInfo(RecordKeeper &Records, raw_ostream &OS);
 





More information about the cfe-commits mailing list