r173931 - Comment parsing: resolve more named character references
Dmitri Gribenko
gribozavr at gmail.com
Wed Jan 30 06:29:28 PST 2013
Author: gribozavr
Date: Wed Jan 30 08:29:28 2013
New Revision: 173931
URL: http://llvm.org/viewvc/llvm-project?rev=173931&view=rev
Log:
Comment parsing: resolve more named character references
This reimplements r173850 with a better approach:
(1) use a TableGen-generated matcher instead of doing a linear search;
(2) avoid allocations for new strings by converting code points to string
literals with TableGen.
Added:
cfe/trunk/include/clang/AST/CommentHTMLNamedCharacterReferences.td
cfe/trunk/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
Modified:
cfe/trunk/include/clang/AST/CMakeLists.txt
cfe/trunk/include/clang/AST/CommentLexer.h
cfe/trunk/include/clang/AST/Makefile
cfe/trunk/lib/AST/CMakeLists.txt
cfe/trunk/lib/AST/CommentLexer.cpp
cfe/trunk/utils/TableGen/CMakeLists.txt
cfe/trunk/utils/TableGen/TableGen.cpp
cfe/trunk/utils/TableGen/TableGenBackends.h
Modified: cfe/trunk/include/clang/AST/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/AST/CMakeLists.txt?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/include/clang/AST/CMakeLists.txt (original)
+++ cfe/trunk/include/clang/AST/CMakeLists.txt Wed Jan 30 08:29:28 2013
@@ -33,6 +33,10 @@ clang_tablegen(CommentHTMLTagsProperties
SOURCE CommentHTMLTags.td
TARGET ClangCommentHTMLTagsProperties)
+clang_tablegen(CommentHTMLNamedCharacterReferences.inc -gen-clang-comment-html-named-character-references
+ SOURCE CommentHTMLNamedCharacterReferences.td
+ TARGET ClangCommentHTMLNamedCharacterReferences)
+
clang_tablegen(CommentCommandInfo.inc -gen-clang-comment-command-info
SOURCE CommentCommands.td
TARGET ClangCommentCommandInfo)
Added: cfe/trunk/include/clang/AST/CommentHTMLNamedCharacterReferences.td
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/AST/CommentHTMLNamedCharacterReferences.td?rev=173931&view=auto
==============================================================================
--- cfe/trunk/include/clang/AST/CommentHTMLNamedCharacterReferences.td (added)
+++ cfe/trunk/include/clang/AST/CommentHTMLNamedCharacterReferences.td Wed Jan 30 08:29:28 2013
@@ -0,0 +1,177 @@
+// HTML Named Character Reference
+class NCR<string spelling, int codePoint> {
+ string Spelling = spelling;
+ int CodePoint = codePoint;
+}
+
+// The list below includes named character references supported by Doxygen:
+// http://www.stack.nl/~dimitri/doxygen/manual/htmlcmds.html
+//
+// It does not include all HTML 5 named character references.
+//
+// Corresponding code point values can be found here:
+// http://www.w3.org/TR/2011/WD-html5-20110113/named-character-references.html
+
+def : NCR<"copy", 0x000A9>;
+def : NCR<"COPY", 0x000A9>;
+def : NCR<"trade", 0x02122>;
+def : NCR<"TRADE", 0x02122>;
+def : NCR<"reg", 0x000AE>;
+def : NCR<"REG", 0x000AE>;
+def : NCR<"lt", 0x0003C>;
+def : NCR<"Lt", 0x0003C>;
+def : NCR<"LT", 0x0003C>;
+def : NCR<"gt", 0x0003E>;
+def : NCR<"Gt", 0x0003E>;
+def : NCR<"GT", 0x0003E>;
+def : NCR<"amp", 0x00026>;
+def : NCR<"AMP", 0x00026>;
+def : NCR<"apos", 0x00027>;
+def : NCR<"quot", 0x00022>;
+def : NCR<"QUOT", 0x00022>;
+def : NCR<"lsquo", 0x02018>;
+def : NCR<"rsquo", 0x02019>;
+def : NCR<"ldquo", 0x0201C>;
+def : NCR<"rdquo", 0x0201D>;
+def : NCR<"ndash", 0x02013>;
+def : NCR<"mdash", 0x02014>;
+
+def : NCR<"Auml", 0x000C4>;
+def : NCR<"Euml", 0x000CB>;
+def : NCR<"Iuml", 0x000CF>;
+def : NCR<"Ouml", 0x000D6>;
+def : NCR<"Uuml", 0x000DC>;
+def : NCR<"Yuml", 0x00178>;
+def : NCR<"auml", 0x000E4>;
+def : NCR<"euml", 0x000EB>;
+def : NCR<"iuml", 0x000EF>;
+def : NCR<"ouml", 0x000F6>;
+def : NCR<"uuml", 0x000FC>;
+def : NCR<"yuml", 0x000FF>;
+
+def : NCR<"Aacute", 0x000C1>;
+def : NCR<"Eacute", 0x000C9>;
+def : NCR<"Iacute", 0x000CD>;
+def : NCR<"Oacute", 0x000D3>;
+def : NCR<"Uacute", 0x000DA>;
+def : NCR<"Yacute", 0x000DD>;
+def : NCR<"aacute", 0x000E1>;
+def : NCR<"eacute", 0x000E9>;
+def : NCR<"iacute", 0x000ED>;
+def : NCR<"oacute", 0x000F3>;
+def : NCR<"uacute", 0x000FA>;
+def : NCR<"yacute", 0x000FD>;
+
+def : NCR<"Agrave", 0x000C0>;
+def : NCR<"Egrave", 0x000C8>;
+def : NCR<"Igrave", 0x000CC>;
+def : NCR<"Ograve", 0x000D2>;
+def : NCR<"Ugrave", 0x000D9>;
+// def : NCR<"Ygrave", 0x01EF2>; // Defined neither in Doxygen, nor in HTML5.
+def : NCR<"agrave", 0x000E0>;
+def : NCR<"egrave", 0x000E8>;
+def : NCR<"igrave", 0x000EC>;
+def : NCR<"ograve", 0x000F2>;
+def : NCR<"ugrave", 0x000F9>;
+def : NCR<"ygrave", 0x01EF3>; // Defined in Doxygen, not defined in HTML5.
+
+def : NCR<"Acirc", 0x000C2>;
+def : NCR<"Ecirc", 0x000CA>;
+def : NCR<"Icirc", 0x000CE>;
+def : NCR<"Ocirc", 0x000D4>;
+def : NCR<"Ucirc", 0x000DB>;
+def : NCR<"Ycirc", 0x00176>; // Not defined in Doxygen, defined in HTML5.
+def : NCR<"acirc", 0x000E2>;
+def : NCR<"ecirc", 0x000EA>;
+def : NCR<"icirc", 0x000EE>;
+def : NCR<"ocirc", 0x000F4>;
+def : NCR<"ucirc", 0x000FB>;
+def : NCR<"ycirc", 0x00177>;
+
+def : NCR<"Atilde", 0x000C3>;
+def : NCR<"Ntilde", 0x000D1>;
+def : NCR<"Otilde", 0x000D5>;
+def : NCR<"atilde", 0x000E3>;
+def : NCR<"ntilde", 0x000F1>;
+def : NCR<"otilde", 0x000F5>;
+
+def : NCR<"szlig", 0x000DF>;
+
+def : NCR<"ccedil", 0x000E7>;
+def : NCR<"Ccedil", 0x000C7>;
+
+def : NCR<"aring", 0x000E5>;
+def : NCR<"Aring", 0x000C5>;
+
+def : NCR<"nbsp", 0x000A0>;
+
+def : NCR<"Gamma", 0x00393>;
+def : NCR<"Delta", 0x00394>;
+def : NCR<"Theta", 0x00398>;
+def : NCR<"Lambda", 0x0039B>;
+def : NCR<"Xi", 0x0039E>;
+def : NCR<"Pi", 0x003A0>;
+def : NCR<"Sigma", 0x003A3>;
+def : NCR<"Upsilon", 0x003A5>;
+def : NCR<"Phi", 0x003A6>;
+def : NCR<"Psi", 0x003A8>;
+def : NCR<"Omega", 0x003A9>;
+
+def : NCR<"alpha", 0x003B1>;
+def : NCR<"beta", 0x003B2>;
+def : NCR<"gamma", 0x003B3>;
+def : NCR<"delta", 0x003B4>;
+def : NCR<"epsilon", 0x003B5>;
+def : NCR<"zeta", 0x003B6>;
+def : NCR<"eta", 0x003B7>;
+def : NCR<"theta", 0x003B8>;
+def : NCR<"iota", 0x003B9>;
+def : NCR<"kappa", 0x003BA>;
+def : NCR<"lambda", 0x003BB>;
+def : NCR<"mu", 0x003BC>;
+def : NCR<"nu", 0x003BD>;
+def : NCR<"xi", 0x003BE>;
+def : NCR<"pi", 0x003C0>;
+def : NCR<"rho", 0x003C1>;
+def : NCR<"sigma", 0x003C3>;
+def : NCR<"tau", 0x003C4>;
+def : NCR<"upsilon", 0x003C5>;
+def : NCR<"phi", 0x003C6>;
+def : NCR<"chi", 0x003C7>;
+def : NCR<"psi", 0x003C8>;
+def : NCR<"omega", 0x003C9>;
+def : NCR<"sigmaf", 0x003C2>;
+
+def : NCR<"sect", 0x000A7>;
+def : NCR<"deg", 0x000B0>;
+def : NCR<"prime", 0x02032>;
+def : NCR<"Prime", 0x02033>;
+def : NCR<"infin", 0x0221E>;
+def : NCR<"empty", 0x02205>;
+def : NCR<"plusmn", 0x000B1>;
+def : NCR<"times", 0x000D7>;
+def : NCR<"minus", 0x02212>;
+def : NCR<"sdot", 0x022C5>;
+def : NCR<"part", 0x02202>;
+def : NCR<"nabla", 0x02207>;
+def : NCR<"radic", 0x0221A>;
+def : NCR<"perp", 0x022A5>;
+def : NCR<"sum", 0x02211>;
+def : NCR<"int", 0x0222B>;
+def : NCR<"prod", 0x0220F>;
+def : NCR<"sim", 0x0223C>;
+def : NCR<"asymp", 0x02248>;
+def : NCR<"ne", 0x02260>;
+def : NCR<"equiv", 0x02261>;
+def : NCR<"prop", 0x0221D>;
+def : NCR<"le", 0x02264>;
+def : NCR<"ge", 0x02265>;
+def : NCR<"larr", 0x02190>;
+def : NCR<"rarr", 0x02192>;
+def : NCR<"isin", 0x02208>;
+def : NCR<"notin", 0x02209>;
+def : NCR<"lceil", 0x02308>;
+def : NCR<"rceil", 0x02309>;
+def : NCR<"lfloor", 0x0230A>;
+def : NCR<"rfloor", 0x0230B>;
+
Modified: cfe/trunk/include/clang/AST/CommentLexer.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/AST/CommentLexer.h?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/include/clang/AST/CommentLexer.h (original)
+++ cfe/trunk/include/clang/AST/CommentLexer.h Wed Jan 30 08:29:28 2013
@@ -282,18 +282,11 @@ private:
/// it stands for (e.g., "<").
StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
- /// Given a Doxygen-supported named character reference (e.g., "&trade;"),
- /// it returns its UTF8 encoding.
- StringRef HTMLDoxygenCharacterReference(StringRef Name) const;
-
/// Given a Unicode codepoint as base-10 integer, return the character.
StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
/// Given a Unicode codepoint as base-16 integer, return the character.
StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
-
- /// Helper routine to do part of the work for resolveHTMLHexCharacterReference.
- StringRef helperResolveHTMLHexCharacterReference(unsigned CodePoint) const;
void formTokenWithChars(Token &Result, const char *TokEnd,
tok::TokenKind Kind) {
Modified: cfe/trunk/include/clang/AST/Makefile
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/AST/Makefile?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/include/clang/AST/Makefile (original)
+++ cfe/trunk/include/clang/AST/Makefile Wed Jan 30 08:29:28 2013
@@ -3,7 +3,9 @@ TD_SRC_DIR = $(PROJ_SRC_DIR)/../Basic
BUILT_SOURCES = Attrs.inc AttrImpl.inc AttrDump.inc \
StmtNodes.inc DeclNodes.inc \
CommentNodes.inc CommentHTMLTags.inc \
- CommentHTMLTagsProperties.inc CommentCommandInfo.inc
+ CommentHTMLTagsProperties.inc \
+ CommentHTMLNamedCharacterReferences.inc \
+ CommentCommandInfo.inc
TABLEGEN_INC_FILES_COMMON = 1
@@ -52,6 +54,12 @@ $(ObjDir)/CommentHTMLTagsProperties.inc.
$(Echo) "Building Clang comment HTML tag properties with tblgen"
$(Verb) $(ClangTableGen) -gen-clang-comment-html-tags-properties -o $(call SYSPATH, $@) $<
+$(ObjDir)/CommentHTMLNamedCharacterReferences.inc.tmp : \
+ $(PROJ_SRC_DIR)/CommentHTMLNamedCharacterReferences.td \
+ $(CLANG_TBLGEN) $(ObjDir)/.dir
+ $(Echo) "Building Clang named character reference translation function with tblgen"
+ $(Verb) $(ClangTableGen) -gen-clang-comment-html-named-character-references -o $(call SYSPATH, $@) $<
+
$(ObjDir)/CommentCommandInfo.inc.tmp : $(PROJ_SRC_DIR)/CommentCommands.td \
$(CLANG_TBLGEN) $(ObjDir)/.dir
$(Echo) "Building Clang comment command info with tblgen"
Modified: cfe/trunk/lib/AST/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/CMakeLists.txt?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/lib/AST/CMakeLists.txt (original)
+++ cfe/trunk/lib/AST/CMakeLists.txt Wed Jan 30 08:29:28 2013
@@ -68,6 +68,7 @@ add_dependencies(clangAST
ClangCommentNodes
ClangCommentHTMLTags
ClangCommentHTMLTagsProperties
+ ClangCommentHTMLNamedCharacterReferences
ClangDeclNodes
ClangDiagnosticAST
ClangDiagnosticComment
Modified: cfe/trunk/lib/AST/CommentLexer.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/CommentLexer.cpp?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/lib/AST/CommentLexer.cpp (original)
+++ cfe/trunk/lib/AST/CommentLexer.cpp Wed Jan 30 08:29:28 2013
@@ -30,22 +30,8 @@ bool isHTMLHexCharacterReferenceCharacte
(C >= 'A' && C <= 'F');
}
-#include "clang/AST/CommentHTMLTags.inc"
-
-} // unnamed namespace
-
-static unsigned getCodePoint(StringRef Name) {
- unsigned CodePoint = 0;
- for (unsigned i = 0, e = Name.size(); i != e; ++i) {
- CodePoint *= 16;
- const char C = Name[i];
- assert(isHTMLHexCharacterReferenceCharacter(C));
- CodePoint += llvm::hexDigitValue(C);
- }
- return CodePoint;
-}
-
-StringRef Lexer::helperResolveHTMLHexCharacterReference(unsigned CodePoint) const {
+StringRef convertCodePointToUTF8(llvm::BumpPtrAllocator &Allocator,
+ unsigned CodePoint) {
char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
char *ResolvedPtr = Resolved;
if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
@@ -53,164 +39,22 @@ StringRef Lexer::helperResolveHTMLHexCha
else
return StringRef();
}
-
-StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
- unsigned CodePoint = getCodePoint(Name);
- return helperResolveHTMLHexCharacterReference(CodePoint);
-}
+
+#include "clang/AST/CommentHTMLTags.inc"
+#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
+
+} // unnamed namespace
StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
+ // Fast path, first check a few most widely used named character references.
return llvm::StringSwitch<StringRef>(Name)
.Case("amp", "&")
.Case("lt", "<")
.Case("gt", ">")
.Case("quot", "\"")
.Case("apos", "\'")
- .Default("");
-}
-
-StringRef Lexer::HTMLDoxygenCharacterReference(StringRef Name) const {
- return llvm::StringSwitch<StringRef>(Name)
- .Case("copy", helperResolveHTMLHexCharacterReference(0x000A9))
- .Case("trade", helperResolveHTMLHexCharacterReference(0x02122))
- .Case("reg", helperResolveHTMLHexCharacterReference(0x000AE))
- .Case("lt", helperResolveHTMLHexCharacterReference(0x0003C))
- .Case("gt", helperResolveHTMLHexCharacterReference(0x0003C))
- .Case("amp", helperResolveHTMLHexCharacterReference(0x00026))
- .Case("apos", helperResolveHTMLHexCharacterReference(0x00027))
- .Case("quot", helperResolveHTMLHexCharacterReference(0x00022))
- .Case("lsquo", helperResolveHTMLHexCharacterReference(0x02018))
- .Case("rsquo", helperResolveHTMLHexCharacterReference(0x02019))
- .Case("ldquo", helperResolveHTMLHexCharacterReference(0x0201C))
- .Case("rdquo", helperResolveHTMLHexCharacterReference(0x0201D))
- .Case("ndash", helperResolveHTMLHexCharacterReference(0x02013))
- .Case("mdash", helperResolveHTMLHexCharacterReference(0x02014))
- .Case("Auml", helperResolveHTMLHexCharacterReference(0x000C4))
- .Case("Euml", helperResolveHTMLHexCharacterReference(0x000CB))
- .Case("Iuml", helperResolveHTMLHexCharacterReference(0x000CF))
- .Case("Ouml", helperResolveHTMLHexCharacterReference(0x000D6))
- .Case("Uuml", helperResolveHTMLHexCharacterReference(0x000DC))
- .Case("Yuml", helperResolveHTMLHexCharacterReference(0x00178))
- .Case("auml", helperResolveHTMLHexCharacterReference(0x000E4))
- .Case("euml", helperResolveHTMLHexCharacterReference(0x000EB))
- .Case("iuml", helperResolveHTMLHexCharacterReference(0x000EF))
- .Case("ouml", helperResolveHTMLHexCharacterReference(0x000F6))
- .Case("uuml", helperResolveHTMLHexCharacterReference(0x000FC))
- .Case("yuml", helperResolveHTMLHexCharacterReference(0x000FF))
- .Case("Aacute", helperResolveHTMLHexCharacterReference(0x000C1))
- .Case("Eacute", helperResolveHTMLHexCharacterReference(0x000C9))
- .Case("Iacute", helperResolveHTMLHexCharacterReference(0x000CD))
- .Case("Oacute", helperResolveHTMLHexCharacterReference(0x000D3))
- .Case("Uacute", helperResolveHTMLHexCharacterReference(0x000DA))
- .Case("Yacute", helperResolveHTMLHexCharacterReference(0x000DD))
- .Case("aacute", helperResolveHTMLHexCharacterReference(0x000E1))
- .Case("eacute", helperResolveHTMLHexCharacterReference(0x000E9))
- .Case("iacute", helperResolveHTMLHexCharacterReference(0x000ED))
- .Case("oacute", helperResolveHTMLHexCharacterReference(0x000F3))
- .Case("uacute", helperResolveHTMLHexCharacterReference(0x000FA))
- .Case("yacute", helperResolveHTMLHexCharacterReference(0x000FD))
- .Case("Agrave", helperResolveHTMLHexCharacterReference(0x000C0))
- .Case("Egrave", helperResolveHTMLHexCharacterReference(0x000C8))
- .Case("Igrave", helperResolveHTMLHexCharacterReference(0x000CC))
- .Case("Ograve", helperResolveHTMLHexCharacterReference(0x000D2))
- .Case("Ugrave", helperResolveHTMLHexCharacterReference(0x000D9))
- .Case("agrave", helperResolveHTMLHexCharacterReference(0x000E0))
- .Case("egrave", helperResolveHTMLHexCharacterReference(0x000E8))
- .Case("igrave", helperResolveHTMLHexCharacterReference(0x000EC))
- .Case("ograve", helperResolveHTMLHexCharacterReference(0x000F2))
- .Case("ugrave", helperResolveHTMLHexCharacterReference(0x000F9))
- .Case("ygrave", helperResolveHTMLHexCharacterReference(0x01EF3))
- .Case("Acirc", helperResolveHTMLHexCharacterReference(0x000C2))
- .Case("Ecirc", helperResolveHTMLHexCharacterReference(0x000CA))
- .Case("Icirc", helperResolveHTMLHexCharacterReference(0x000CE))
- .Case("Ocirc", helperResolveHTMLHexCharacterReference(0x000D4))
- .Case("Ucirc", helperResolveHTMLHexCharacterReference(0x000DB))
- .Case("acirc", helperResolveHTMLHexCharacterReference(0x000E2))
- .Case("ecirc", helperResolveHTMLHexCharacterReference(0x000EA))
- .Case("icirc", helperResolveHTMLHexCharacterReference(0x000EE))
- .Case("ocirc", helperResolveHTMLHexCharacterReference(0x000F4))
- .Case("ucirc", helperResolveHTMLHexCharacterReference(0x000FB))
- .Case("ycirc", helperResolveHTMLHexCharacterReference(0x00177))
- .Case("Atilde", helperResolveHTMLHexCharacterReference(0x000C3))
- .Case("Ntilde", helperResolveHTMLHexCharacterReference(0x000D1))
- .Case("Otilde", helperResolveHTMLHexCharacterReference(0x000D5))
- .Case("atilde", helperResolveHTMLHexCharacterReference(0x000E3))
- .Case("ntilde", helperResolveHTMLHexCharacterReference(0x000F1))
- .Case("otilde", helperResolveHTMLHexCharacterReference(0x000F5))
- .Case("szlig", helperResolveHTMLHexCharacterReference(0x000DF))
- .Case("ccedil", helperResolveHTMLHexCharacterReference(0x000E7))
- .Case("Ccedil", helperResolveHTMLHexCharacterReference(0x000C7))
- .Case("aring", helperResolveHTMLHexCharacterReference(0x000E5))
- .Case("Aring", helperResolveHTMLHexCharacterReference(0x000C5))
- .Case("nbsp", helperResolveHTMLHexCharacterReference(0x000A0))
- .Case("Gamma", helperResolveHTMLHexCharacterReference(0x00393))
- .Case("Delta", helperResolveHTMLHexCharacterReference(0x00394))
- .Case("Theta", helperResolveHTMLHexCharacterReference(0x00398))
- .Case("Lambda", helperResolveHTMLHexCharacterReference(0x0039B))
- .Case("Xi", helperResolveHTMLHexCharacterReference(0x0039E))
- .Case("Pi", helperResolveHTMLHexCharacterReference(0x003A0))
- .Case("Sigma", helperResolveHTMLHexCharacterReference(0x003A3))
- .Case("Upsilon", helperResolveHTMLHexCharacterReference(0x003A5))
- .Case("Phi", helperResolveHTMLHexCharacterReference(0x003A6))
- .Case("Psi", helperResolveHTMLHexCharacterReference(0x003A8))
- .Case("Omega", helperResolveHTMLHexCharacterReference(0x003A9))
- .Case("alpha", helperResolveHTMLHexCharacterReference(0x003B1))
- .Case("beta", helperResolveHTMLHexCharacterReference(0x003B2))
- .Case("gamma", helperResolveHTMLHexCharacterReference(0x003B3))
- .Case("delta", helperResolveHTMLHexCharacterReference(0x003B4))
- .Case("epsilon", helperResolveHTMLHexCharacterReference(0x003B5))
- .Case("zeta", helperResolveHTMLHexCharacterReference(0x003B6))
- .Case("eta", helperResolveHTMLHexCharacterReference(0x003B7))
- .Case("theta", helperResolveHTMLHexCharacterReference(0x003B8))
- .Case("iota", helperResolveHTMLHexCharacterReference(0x003B9))
- .Case("kappa", helperResolveHTMLHexCharacterReference(0x003BA))
- .Case("lambda", helperResolveHTMLHexCharacterReference(0x003BB))
- .Case("mu", helperResolveHTMLHexCharacterReference(0x003BC))
- .Case("nu", helperResolveHTMLHexCharacterReference(0x003BD))
- .Case("xi", helperResolveHTMLHexCharacterReference(0x003BE))
- .Case("pi", helperResolveHTMLHexCharacterReference(0x003C0))
- .Case("rho", helperResolveHTMLHexCharacterReference(0x003C1))
- .Case("sigma", helperResolveHTMLHexCharacterReference(0x003C3))
- .Case("tau", helperResolveHTMLHexCharacterReference(0x003C4))
- .Case("upsilon", helperResolveHTMLHexCharacterReference(0x003C5))
- .Case("phi", helperResolveHTMLHexCharacterReference(0x003C6))
- .Case("chi", helperResolveHTMLHexCharacterReference(0x003C7))
- .Case("psi", helperResolveHTMLHexCharacterReference(0x003C8))
- .Case("omega", helperResolveHTMLHexCharacterReference(0x003C9))
- .Case("sigmaf", helperResolveHTMLHexCharacterReference(0x003C2))
- .Case("sect", helperResolveHTMLHexCharacterReference(0x000A7))
- .Case("deg", helperResolveHTMLHexCharacterReference(0x000B0))
- .Case("prime", helperResolveHTMLHexCharacterReference(0x02032))
- .Case("Prime", helperResolveHTMLHexCharacterReference(0x02033))
- .Case("infin", helperResolveHTMLHexCharacterReference(0x0221E))
- .Case("empty", helperResolveHTMLHexCharacterReference(0x02205))
- .Case("plusmn", helperResolveHTMLHexCharacterReference(0x000B1))
- .Case("times", helperResolveHTMLHexCharacterReference(0x000D7))
- .Case("minus", helperResolveHTMLHexCharacterReference(0x02212))
- .Case("sdot", helperResolveHTMLHexCharacterReference(0x022C5))
- .Case("part", helperResolveHTMLHexCharacterReference(0x02202))
- .Case("nabla", helperResolveHTMLHexCharacterReference(0x02207))
- .Case("radic", helperResolveHTMLHexCharacterReference(0x0221A))
- .Case("perp", helperResolveHTMLHexCharacterReference(0x022A5))
- .Case("sum", helperResolveHTMLHexCharacterReference(0x02211))
- .Case("int", helperResolveHTMLHexCharacterReference(0x0222B))
- .Case("prod", helperResolveHTMLHexCharacterReference(0x0220F))
- .Case("sim", helperResolveHTMLHexCharacterReference(0x0223C))
- .Case("asymp", helperResolveHTMLHexCharacterReference(0x02248))
- .Case("ne", helperResolveHTMLHexCharacterReference(0x02260))
- .Case("equiv", helperResolveHTMLHexCharacterReference(0x02261))
- .Case("prop", helperResolveHTMLHexCharacterReference(0x0221D))
- .Case("le", helperResolveHTMLHexCharacterReference(0x02264))
- .Case("ge", helperResolveHTMLHexCharacterReference(0x02265))
- .Case("larr", helperResolveHTMLHexCharacterReference(0x02190))
- .Case("rarr", helperResolveHTMLHexCharacterReference(0x02192))
- .Case("isin", helperResolveHTMLHexCharacterReference(0x02208))
- .Case("notin", helperResolveHTMLHexCharacterReference(0x02209))
- .Case("lceil", helperResolveHTMLHexCharacterReference(0x02308))
- .Case("rceil", helperResolveHTMLHexCharacterReference(0x02309))
- .Case("lfloor", helperResolveHTMLHexCharacterReference(0x0230A))
- .Case("rfloor", helperResolveHTMLHexCharacterReference(0x0230B))
- .Default("");
+ // Slow path.
+ .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
}
StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
@@ -220,13 +64,18 @@ StringRef Lexer::resolveHTMLDecimalChara
CodePoint *= 10;
CodePoint += Name[i] - '0';
}
+ return convertCodePointToUTF8(Allocator, CodePoint);
+}
- char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
- char *ResolvedPtr = Resolved;
- if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
- return StringRef(Resolved, ResolvedPtr - Resolved);
- else
- return StringRef();
+StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
+ unsigned CodePoint = 0;
+ for (unsigned i = 0, e = Name.size(); i != e; ++i) {
+ CodePoint *= 16;
+ const char C = Name[i];
+ assert(isHTMLHexCharacterReferenceCharacter(C));
+ CodePoint += llvm::hexDigitValue(C);
+ }
+ return convertCodePointToUTF8(Allocator, CodePoint);
}
void Lexer::skipLineStartingDecorations() {
@@ -725,17 +574,8 @@ void Lexer::lexHTMLCharacterReference(To
StringRef Name(NamePtr, TokenPtr - NamePtr);
TokenPtr++; // Skip semicolon.
StringRef Resolved;
- if (isNamed) {
+ if (isNamed)
Resolved = resolveHTMLNamedCharacterReference(Name);
- if (Resolved.empty()) {
- Resolved = HTMLDoxygenCharacterReference(Name);
- if (!Resolved.empty()) {
- formTokenWithChars(T, TokenPtr, tok::text);
- T.setText(Resolved);
- return;
- }
- }
- }
else if (isDecimal)
Resolved = resolveHTMLDecimalCharacterReference(Name);
else
Modified: cfe/trunk/utils/TableGen/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/utils/TableGen/CMakeLists.txt?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/utils/TableGen/CMakeLists.txt (original)
+++ cfe/trunk/utils/TableGen/CMakeLists.txt Wed Jan 30 08:29:28 2013
@@ -4,6 +4,7 @@ add_tablegen(clang-tblgen CLANG
ClangASTNodesEmitter.cpp
ClangAttrEmitter.cpp
ClangCommentCommandInfoEmitter.cpp
+ ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
ClangCommentHTMLTagsEmitter.cpp
ClangDiagnosticsEmitter.cpp
ClangSACheckersEmitter.cpp
Added: cfe/trunk/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp?rev=173931&view=auto
==============================================================================
--- cfe/trunk/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp (added)
+++ cfe/trunk/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp Wed Jan 30 08:29:28 2013
@@ -0,0 +1,83 @@
+//===--- ClangCommentHTMLNamedCharacterReferenceEmitter.cpp -----------------=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This tablegen backend emits an efficient function to translate HTML named
+// character references to UTF-8 sequences.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/StringMatcher.h"
+#include <vector>
+
+using namespace llvm;
+
+/// \brief Convert a code point to the corresponding UTF-8 sequence represented
+/// as a C string literal.
+///
+/// \returns true on success.
+static bool translateCodePointToUTF8(unsigned CodePoint,
+ SmallVectorImpl<char> &CLiteral) {
+ char Translated[UNI_MAX_UTF8_BYTES_PER_CODE_POINT];
+ char *TranslatedPtr = Translated;
+ if (!ConvertCodePointToUTF8(CodePoint, TranslatedPtr))
+ return false;
+
+ StringRef UTF8(Translated, TranslatedPtr - Translated);
+
+ raw_svector_ostream OS(CLiteral);
+ OS << "\"";
+ for (size_t i = 0, e = UTF8.size(); i != e; ++i) {
+ OS << "\\x";
+ OS.write_hex(static_cast<unsigned char>(UTF8[i]));
+ }
+ OS << "\"";
+
+ return true;
+}
+
+namespace clang {
+void EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records,
+ raw_ostream &OS) {
+ std::vector<Record *> Tags = Records.getAllDerivedDefinitions("NCR");
+ std::vector<StringMatcher::StringPair> NameToUTF8;
+ SmallString<32> CLiteral;
+ for (std::vector<Record *>::iterator I = Tags.begin(), E = Tags.end();
+ I != E; ++I) {
+ Record &Tag = **I;
+ std::string Spelling = Tag.getValueAsString("Spelling");
+ uint64_t CodePoint = Tag.getValueAsInt("CodePoint");
+ CLiteral.clear();
+ CLiteral.append("return ");
+ if (!translateCodePointToUTF8(CodePoint, CLiteral)) {
+ SrcMgr.PrintMessage(Tag.getLoc().front(),
+ SourceMgr::DK_Error,
+ Twine("invalid code point"));
+ continue;
+ }
+ CLiteral.append(";");
+
+ StringMatcher::StringPair Match(Spelling, CLiteral.str());
+ NameToUTF8.push_back(Match);
+ }
+
+ OS << "// This file is generated by TableGen. Do not edit.\n\n";
+
+ OS << "StringRef translateHTMLNamedCharacterReferenceToUTF8(\n"
+ " StringRef Name) {\n";
+ StringMatcher("Name", NameToUTF8, OS).Emit();
+ OS << " return StringRef();\n"
+ << "}\n\n";
+}
+
+} // end namespace clang
+
Modified: cfe/trunk/utils/TableGen/TableGen.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/utils/TableGen/TableGen.cpp?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/utils/TableGen/TableGen.cpp (original)
+++ cfe/trunk/utils/TableGen/TableGen.cpp Wed Jan 30 08:29:28 2013
@@ -44,6 +44,7 @@ enum ActionType {
GenClangSACheckers,
GenClangCommentHTMLTags,
GenClangCommentHTMLTagsProperties,
+ GenClangCommentHTMLNamedCharacterReferences,
GenClangCommentCommandInfo,
GenOptParserDefs, GenOptParserImpl,
GenArmNeon,
@@ -111,6 +112,10 @@ namespace {
"gen-clang-comment-html-tags-properties",
"Generate efficient matchers for HTML tag "
"properties"),
+ clEnumValN(GenClangCommentHTMLNamedCharacterReferences,
+ "gen-clang-comment-html-named-character-references",
+ "Generate function to translate named character "
+ "references to UTF-8 sequences"),
clEnumValN(GenClangCommentCommandInfo,
"gen-clang-comment-command-info",
"Generate list of commands that are used in "
@@ -194,6 +199,9 @@ bool ClangTableGenMain(raw_ostream &OS,
case GenClangCommentHTMLTagsProperties:
EmitClangCommentHTMLTagsProperties(Records, OS);
break;
+ case GenClangCommentHTMLNamedCharacterReferences:
+ EmitClangCommentHTMLNamedCharacterReferences(Records, OS);
+ break;
case GenClangCommentCommandInfo:
EmitClangCommentCommandInfo(Records, OS);
break;
Modified: cfe/trunk/utils/TableGen/TableGenBackends.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/utils/TableGen/TableGenBackends.h?rev=173931&r1=173930&r2=173931&view=diff
==============================================================================
--- cfe/trunk/utils/TableGen/TableGenBackends.h (original)
+++ cfe/trunk/utils/TableGen/TableGenBackends.h Wed Jan 30 08:29:28 2013
@@ -51,6 +51,7 @@ void EmitClangSACheckers(RecordKeeper &R
void EmitClangCommentHTMLTags(RecordKeeper &Records, raw_ostream &OS);
void EmitClangCommentHTMLTagsProperties(RecordKeeper &Records, raw_ostream &OS);
+void EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records, raw_ostream &OS);
void EmitClangCommentCommandInfo(RecordKeeper &Records, raw_ostream &OS);
More information about the cfe-commits
mailing list