r173850 - [Doc parsing] Patch to parse Doxygen-supported HTML character
Fariborz Jahanian
fjahanian at apple.com
Tue Jan 29 15:42:27 PST 2013
Author: fjahanian
Date: Tue Jan 29 17:42:26 2013
New Revision: 173850
URL: http://llvm.org/viewvc/llvm-project?rev=173850&view=rev
Log:
[Doc parsing] Patch to parse Doxygen-supported HTML character
references to their UTF-8 encoding. Reviewed offline by Doug.
// rdar://12392215
Added:
cfe/trunk/test/Index/special-html-characters.m
Modified:
cfe/trunk/include/clang/AST/CommentLexer.h
cfe/trunk/lib/AST/CommentLexer.cpp
Modified: cfe/trunk/include/clang/AST/CommentLexer.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/AST/CommentLexer.h?rev=173850&r1=173849&r2=173850&view=diff
==============================================================================
--- cfe/trunk/include/clang/AST/CommentLexer.h (original)
+++ cfe/trunk/include/clang/AST/CommentLexer.h Tue Jan 29 17:42:26 2013
@@ -282,11 +282,18 @@ private:
/// it stands for (e.g., "<").
StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
+ /// Given a Doxygen-supported named character reference (e.g., "&trade;"),
+ /// it returns its UTF-8 encoding.
+ StringRef HTMLDoxygenCharacterReference(StringRef Name) const;
+
/// Given a Unicode codepoint as base-10 integer, return the character.
StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
/// Given a Unicode codepoint as base-16 integer, return the character.
StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
+
+ /// Helper routine to do part of the work for resolveHTMLHexCharacterReference.
+ StringRef helperResolveHTMLHexCharacterReference(unsigned CodePoint) const;
void formTokenWithChars(Token &Result, const char *TokEnd,
tok::TokenKind Kind) {
Modified: cfe/trunk/lib/AST/CommentLexer.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/CommentLexer.cpp?rev=173850&r1=173849&r2=173850&view=diff
==============================================================================
--- cfe/trunk/lib/AST/CommentLexer.cpp (original)
+++ cfe/trunk/lib/AST/CommentLexer.cpp Tue Jan 29 17:42:26 2013
@@ -34,6 +34,31 @@ bool isHTMLHexCharacterReferenceCharacte
} // unnamed namespace
+/// Interprets \p Name as a run of hexadecimal digits, most significant digit
+/// first, and returns the value they spell.
+///
+/// Every character is expected to satisfy
+/// isHTMLHexCharacterReferenceCharacter; this is checked by assertion only.
+/// An empty \p Name yields 0.
+static unsigned getCodePoint(StringRef Name) {
+  unsigned Value = 0;
+  for (StringRef::iterator I = Name.begin(), E = Name.end(); I != E; ++I) {
+    assert(isHTMLHexCharacterReferenceCharacter(*I));
+    // Move the digits gathered so far up one hex place, then add the new one.
+    Value = Value * 16 + llvm::hexDigitValue(*I);
+  }
+  return Value;
+}
+
+/// Encodes \p CodePoint as UTF-8 into a buffer obtained from Allocator.
+///
+/// \returns the encoded bytes, or an empty StringRef when
+/// ConvertCodePointToUTF8 reports failure.
+StringRef Lexer::helperResolveHTMLHexCharacterReference(unsigned CodePoint) const {
+  char *const Begin = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
+  char *End = Begin;
+  if (!ConvertCodePointToUTF8(CodePoint, End))
+    return StringRef();
+  // The length of the result is how far End was moved away from Begin.
+  return StringRef(Begin, End - Begin);
+}
+
+/// Resolves a hexadecimal character reference: the hex digits in \p Name are
+/// converted to a code point by getCodePoint, which is then encoded by
+/// helperResolveHTMLHexCharacterReference.
+StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
+  return helperResolveHTMLHexCharacterReference(getCodePoint(Name));
+}
+
StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
return llvm::StringSwitch<StringRef>(Name)
.Case("amp", "&")
@@ -41,8 +66,154 @@ StringRef Lexer::resolveHTMLNamedCharact
.Case("gt", ">")
.Case("quot", "\"")
.Case("apos", "\'")
+ .Case("minus", "-")
+ .Case("sim", "~")
.Default("");
}
+
+/// Maps a Doxygen-supported named character reference, given as the bare name
+/// (e.g. "trade"), to the UTF-8 encoding of the character it denotes.
+///
+/// \returns the encoded character, or an empty string when \p Name is not in
+/// the table below.
+StringRef Lexer::HTMLDoxygenCharacterReference(StringRef Name) const {
+  // Look up the code point first and encode only the match.  Passing
+  // helperResolveHTMLHexCharacterReference(...) as each Case value would run
+  // (and allocate for) every entry on every lookup, because the arguments are
+  // evaluated whether or not the case matches.
+  const unsigned NoMatch = ~0U;
+  const unsigned CodePoint = llvm::StringSwitch<unsigned>(Name)
+    .Case("copy", 0x000A9)
+    .Case("trade", 0x02122)
+    .Case("reg", 0x000AE)
+    .Case("lt", 0x0003C)
+    .Case("gt", 0x0003E) // '>' is U+003E; U+003C is '<'.
+    .Case("amp", 0x00026)
+    .Case("apos", 0x00027)
+    .Case("quot", 0x00022)
+    .Case("lsquo", 0x02018)
+    .Case("rsquo", 0x02019)
+    .Case("ldquo", 0x0201C)
+    .Case("rdquo", 0x0201D)
+    .Case("ndash", 0x02013)
+    .Case("mdash", 0x02014)
+    .Case("Auml", 0x000C4)
+    .Case("Euml", 0x000CB)
+    .Case("Iuml", 0x000CF)
+    .Case("Ouml", 0x000D6)
+    .Case("Uuml", 0x000DC)
+    .Case("Yuml", 0x00178)
+    .Case("auml", 0x000E4)
+    .Case("euml", 0x000EB)
+    .Case("iuml", 0x000EF)
+    .Case("ouml", 0x000F6)
+    .Case("uuml", 0x000FC)
+    .Case("yuml", 0x000FF)
+    .Case("Aacute", 0x000C1)
+    .Case("Eacute", 0x000C9)
+    .Case("Iacute", 0x000CD)
+    .Case("Oacute", 0x000D3)
+    .Case("Uacute", 0x000DA)
+    .Case("Yacute", 0x000DD)
+    .Case("aacute", 0x000E1)
+    .Case("eacute", 0x000E9)
+    .Case("iacute", 0x000ED)
+    .Case("oacute", 0x000F3)
+    .Case("uacute", 0x000FA)
+    .Case("yacute", 0x000FD)
+    .Case("Agrave", 0x000C0)
+    .Case("Egrave", 0x000C8)
+    .Case("Igrave", 0x000CC)
+    .Case("Ograve", 0x000D2)
+    .Case("Ugrave", 0x000D9)
+    .Case("agrave", 0x000E0)
+    .Case("egrave", 0x000E8)
+    .Case("igrave", 0x000EC)
+    .Case("ograve", 0x000F2)
+    .Case("ugrave", 0x000F9)
+    .Case("ygrave", 0x01EF3)
+    .Case("Acirc", 0x000C2)
+    .Case("Ecirc", 0x000CA)
+    .Case("Icirc", 0x000CE)
+    .Case("Ocirc", 0x000D4)
+    .Case("Ucirc", 0x000DB)
+    .Case("acirc", 0x000E2)
+    .Case("ecirc", 0x000EA)
+    .Case("icirc", 0x000EE)
+    .Case("ocirc", 0x000F4)
+    .Case("ucirc", 0x000FB)
+    .Case("ycirc", 0x00177)
+    .Case("Atilde", 0x000C3)
+    .Case("Ntilde", 0x000D1)
+    .Case("Otilde", 0x000D5)
+    .Case("atilde", 0x000E3)
+    .Case("ntilde", 0x000F1)
+    .Case("otilde", 0x000F5)
+    .Case("szlig", 0x000DF)
+    .Case("ccedil", 0x000E7)
+    .Case("Ccedil", 0x000C7)
+    .Case("aring", 0x000E5)
+    .Case("Aring", 0x000C5)
+    .Case("nbsp", 0x000A0)
+    .Case("Gamma", 0x00393)
+    .Case("Delta", 0x00394)
+    .Case("Theta", 0x00398)
+    .Case("Lambda", 0x0039B)
+    .Case("Xi", 0x0039E)
+    .Case("Pi", 0x003A0)
+    .Case("Sigma", 0x003A3)
+    .Case("Upsilon", 0x003A5)
+    .Case("Phi", 0x003A6)
+    .Case("Psi", 0x003A8)
+    .Case("Omega", 0x003A9)
+    .Case("alpha", 0x003B1)
+    .Case("beta", 0x003B2)
+    .Case("gamma", 0x003B3)
+    .Case("delta", 0x003B4)
+    .Case("epsilon", 0x003B5)
+    .Case("zeta", 0x003B6)
+    .Case("eta", 0x003B7)
+    .Case("theta", 0x003B8)
+    .Case("iota", 0x003B9)
+    .Case("kappa", 0x003BA)
+    .Case("lambda", 0x003BB)
+    .Case("mu", 0x003BC)
+    .Case("nu", 0x003BD)
+    .Case("xi", 0x003BE)
+    .Case("pi", 0x003C0)
+    .Case("rho", 0x003C1)
+    .Case("sigma", 0x003C3)
+    .Case("tau", 0x003C4)
+    .Case("upsilon", 0x003C5)
+    .Case("phi", 0x003C6)
+    .Case("chi", 0x003C7)
+    .Case("psi", 0x003C8)
+    .Case("omega", 0x003C9)
+    .Case("sigmaf", 0x003C2)
+    .Case("sect", 0x000A7)
+    .Case("deg", 0x000B0)
+    .Case("prime", 0x02032)
+    .Case("Prime", 0x02033)
+    .Case("infin", 0x0221E)
+    .Case("empty", 0x02205)
+    .Case("plusmn", 0x000B1)
+    .Case("times", 0x000D7)
+    .Case("minus", 0x02212)
+    .Case("sdot", 0x022C5)
+    .Case("part", 0x02202)
+    .Case("nabla", 0x02207)
+    .Case("radic", 0x0221A)
+    .Case("perp", 0x022A5)
+    .Case("sum", 0x02211)
+    .Case("int", 0x0222B)
+    .Case("prod", 0x0220F)
+    .Case("sim", 0x0223C)
+    .Case("asymp", 0x02248)
+    .Case("ne", 0x02260)
+    .Case("equiv", 0x02261)
+    .Case("prop", 0x0221D)
+    .Case("le", 0x02264)
+    .Case("ge", 0x02265)
+    .Case("larr", 0x02190)
+    .Case("rarr", 0x02192)
+    .Case("isin", 0x02208)
+    .Case("notin", 0x02209)
+    .Case("lceil", 0x02308)
+    .Case("rceil", 0x02309)
+    .Case("lfloor", 0x0230A)
+    .Case("rfloor", 0x0230B)
+    .Default(NoMatch);
+
+  // Unknown name: report "no resolution" with an empty string, as before.
+  if (CodePoint == NoMatch)
+    return "";
+  return helperResolveHTMLHexCharacterReference(CodePoint);
+}
StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
unsigned CodePoint = 0;
@@ -60,23 +231,6 @@ StringRef Lexer::resolveHTMLDecimalChara
return StringRef();
}
-StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
- unsigned CodePoint = 0;
- for (unsigned i = 0, e = Name.size(); i != e; ++i) {
- CodePoint *= 16;
- const char C = Name[i];
- assert(isHTMLHexCharacterReferenceCharacter(C));
- CodePoint += llvm::hexDigitValue(C);
- }
-
- char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
- char *ResolvedPtr = Resolved;
- if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
- return StringRef(Resolved, ResolvedPtr - Resolved);
- else
- return StringRef();
-}
-
void Lexer::skipLineStartingDecorations() {
// This function should be called only for C comments
assert(CommentState == LCS_InsideCComment);
@@ -573,8 +727,17 @@ void Lexer::lexHTMLCharacterReference(To
StringRef Name(NamePtr, TokenPtr - NamePtr);
TokenPtr++; // Skip semicolon.
StringRef Resolved;
- if (isNamed)
+ if (isNamed) {
Resolved = resolveHTMLNamedCharacterReference(Name);
+ if (Resolved.empty()) {
+ Resolved = HTMLDoxygenCharacterReference(Name);
+ if (!Resolved.empty()) {
+ formTokenWithChars(T, TokenPtr, tok::text);
+ T.setText(Resolved);
+ return;
+ }
+ }
+ }
else if (isDecimal)
Resolved = resolveHTMLDecimalCharacterReference(Name);
else
Added: cfe/trunk/test/Index/special-html-characters.m
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Index/special-html-characters.m?rev=173850&view=auto
==============================================================================
--- cfe/trunk/test/Index/special-html-characters.m (added)
+++ cfe/trunk/test/Index/special-html-characters.m Tue Jan 29 17:42:26 2013
@@ -0,0 +1,28 @@
+// RUN: rm -rf %t
+// RUN: mkdir %t
+// RUN: c-index-test -test-load-source all -comments-xml-schema=%S/../../bindings/xml/comment-xml-schema.rng -triple x86_64-apple-darwin10 %s > %t/out
+// RUN: FileCheck %s < %t/out
+// rdar://13067629
+
+// Ensure that XML we generate is not invalid.
+// RUN: FileCheck %s -check-prefix=WRONG < %t/out
+// WRONG-NOT: CommentXMLInvalid
+
+// rdar://12392215
+@interface I
+@end
+
+@implementation I
+/*!
+ &copy; the copyright symbol
+ &trade; the trade mark symbol
+ &reg; the registered trade mark symbol
+ &nbsp; a non breakable space.
+ &Delta; Greek letter Delta Δ.
+ &Gamma; Greek letter Gamma Γ.
+ */
+- (void)phoneHome:(id)sender {
+
+}
+@end
+// CHECK: FullCommentAsHTML=[<p class="para-brief">\t© the copyright symbol\t™ the trade mark symbol ® the registered trade mark symbol\t a non breakable space. Δ Greek letter Delta Δ. Γ Greek letter Gamma Γ. </p>] FullCommentAsXML=[<Function isInstanceMethod="1" file="{{[^"]+}}special-html-characters.m" line="[[@LINE-4]]" column="1"><Name>phoneHome:</Name><USR>c:objc(cs)I(im)phoneHome:</USR><Declaration>- (void)phoneHome:(id)sender;</Declaration><Abstract><Para>\t© the copyright symbol\t™ the trade mark symbol ® the registered trade mark symbol\t a non breakable space. Δ Greek letter Delta Δ. Γ Greek letter Gamma Γ. </Para></Abstract></Function>]
More information about the cfe-commits
mailing list