[clang] Comment parsing: add argument parsing for @throw @throws @exception (PR #84726)

Mon Mar 11 01:20:54 PDT 2024

https://github.com/hdoc created https://github.com/llvm/llvm-project/pull/84726

Doxygen allows for the @throw, @throws, and @exception commands to have an attached argument indicating the type being thrown. Currently, Clang's AST parsing doesn't support parsing out this argument from doc comments. The result is missing compatibility with Doxygen.

We would find it helpful if the AST exposed these thrown types as BlockCommandComment arguments so that we could generate better documentation.

This PR implements parsing of arguments for the @throw, @throws, and @exception commands. Each command can only have one argument, matching the semantics of Doxygen. We have also added unit tests to validate the functionality.

>From ec3f444913d9162de4494cdb09b336b1b00380fa Mon Sep 17 00:00:00 2001
From: hdoc <github at hdoc.io>
Date: Mon, 11 Mar 2024 01:13:25 -0700
Subject: [PATCH] Comment parsing: add argument parsing for @throw @throws
 @exception

Doxygen allows for the @throw, @throws, and @exception commands to
have an attached argument indicating the type being thrown. Currently,
Clang's AST parsing doesn't support parsing out this argument from doc
comments. The result is missing compatibility with Doxygen.

We would find it helpful if the AST exposed these thrown types as
BlockCommandComment arguments so that we could generate better
documentation.

This PR implements parsing of arguments for the @throw, @throws, and
@exception commands. Each command can only have one argument, matching
the semantics of Doxygen. We have also added unit tests to validate
the functionality.
---
 clang/include/clang/AST/CommentCommands.td |   6 +-
 clang/include/clang/AST/CommentParser.h    |   3 +
 clang/lib/AST/CommentParser.cpp            | 133 ++++++++++++
 clang/unittests/AST/CommentParser.cpp      | 235 ++++++++++++++++++++-
 4 files changed, 373 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/AST/CommentCommands.td b/clang/include/clang/AST/CommentCommands.td
index e839031752cdd8..06b2fa9b5531c6 100644
--- a/clang/include/clang/AST/CommentCommands.td
+++ b/clang/include/clang/AST/CommentCommands.td
@@ -132,9 +132,9 @@ def Tparam : BlockCommand<"tparam"> { let IsTParamCommand = 1; }
 // HeaderDoc command for template parameter documentation.
 def Templatefield : BlockCommand<"templatefield"> { let IsTParamCommand = 1; }
 
-def Throws    : BlockCommand<"throws"> { let IsThrowsCommand = 1; }
-def Throw     : BlockCommand<"throw"> { let IsThrowsCommand = 1; }
-def Exception : BlockCommand<"exception"> { let IsThrowsCommand = 1; }
+def Throws    : BlockCommand<"throws"> { let IsThrowsCommand = 1; let NumArgs = 1; }
+def Throw     : BlockCommand<"throw"> { let IsThrowsCommand = 1; let NumArgs = 1; }
+def Exception : BlockCommand<"exception"> { let IsThrowsCommand = 1; let NumArgs = 1;}
 
 def Deprecated : BlockCommand<"deprecated"> {
   let IsEmptyParagraphAllowed = 1;
diff --git a/clang/include/clang/AST/CommentParser.h b/clang/include/clang/AST/CommentParser.h
index e11e818b1af0a1..5884a25d007851 100644
--- a/clang/include/clang/AST/CommentParser.h
+++ b/clang/include/clang/AST/CommentParser.h
@@ -100,6 +100,9 @@ class Parser {
   ArrayRef<Comment::Argument>
   parseCommandArgs(TextTokenRetokenizer &Retokenizer, unsigned NumArgs);
 
+  ArrayRef<Comment::Argument>
+  parseThrowCommandArgs(TextTokenRetokenizer &Retokenizer, unsigned NumArgs);
+
   BlockCommandComment *parseBlockCommand();
   InlineCommandComment *parseInlineCommand();
 
diff --git a/clang/lib/AST/CommentParser.cpp b/clang/lib/AST/CommentParser.cpp
index 8adfd85d0160c3..c70fa1b05cb241 100644
--- a/clang/lib/AST/CommentParser.cpp
+++ b/clang/lib/AST/CommentParser.cpp
@@ -75,6 +75,25 @@ class TextTokenRetokenizer {
     return *Pos.BufferPtr;
   }
 
+  char peekNext(unsigned offset) const {
+    assert(!isEnd());
+    assert(Pos.BufferPtr != Pos.BufferEnd);
+    if (Pos.BufferPtr + offset <= Pos.BufferEnd) {
+      return *(Pos.BufferPtr + offset);
+    } else {
+      return '\0';
+    }
+  }
+
+  void peekNextToken(SmallString<32> &WordText) const {
+    unsigned offset = 1;
+    char C = peekNext(offset++);
+    while (!isWhitespace(C) && C != '\0') {
+      WordText.push_back(C);
+      C = peekNext(offset++);
+    }
+  }
+
   void consumeChar() {
     assert(!isEnd());
     assert(Pos.BufferPtr != Pos.BufferEnd);
@@ -89,6 +108,29 @@ class TextTokenRetokenizer {
     }
   }
 
+  /// Extract a template type
+  bool lexTemplateType(SmallString<32> &WordText) {
+    unsigned IncrementCounter = 0;
+    while (!isEnd()) {
+      const char C = peek();
+      WordText.push_back(C);
+      consumeChar();
+      switch (C) {
+      default:
+        break;
+      case '<': {
+        IncrementCounter++;
+      } break;
+      case '>': {
+        IncrementCounter--;
+        if (!IncrementCounter)
+          return true;
+      } break;
+      }
+    }
+    return false;
+  }
+
   /// Add a token.
   /// Returns true on success, false if there are no interesting tokens to
   /// fetch from lexer.
@@ -149,6 +191,76 @@ class TextTokenRetokenizer {
     addToken();
   }
 
+  /// Extract a type argument
+  bool lexDataType(Token &Tok) {
+    if (isEnd())
+      return false;
+    Position SavedPos = Pos;
+    consumeWhitespace();
+    SmallString<32> NextToken;
+    SmallString<32> WordText;
+    const char *WordBegin = Pos.BufferPtr;
+    SourceLocation Loc = getSourceLocation();
+    StringRef ConstVal = StringRef("const");
+    bool ConstPointer = false;
+
+    while (!isEnd()) {
+      const char C = peek();
+      if (!isWhitespace(C)) {
+        if (C == '<') {
+          if (!lexTemplateType(WordText))
+            return false;
+        } else {
+          WordText.push_back(C);
+          consumeChar();
+        }
+      } else {
+        if (WordText.equals(ConstVal)) {
+          WordText.push_back(C);
+          consumeChar();
+        } else if (WordText.ends_with(StringRef("*")) ||
+                   WordText.ends_with(StringRef("&"))) {
+          NextToken.clear();
+          peekNextToken(NextToken);
+          if (NextToken.equals(ConstVal)) {
+            ConstPointer = true;
+            WordText.push_back(C);
+            consumeChar();
+          } else {
+            consumeChar();
+            break;
+          }
+        } else {
+          NextToken.clear();
+          peekNextToken(NextToken);
+          if ((NextToken.ends_with(StringRef("*")) ||
+               NextToken.ends_with(StringRef("&"))) &&
+              !ConstPointer) {
+            WordText.push_back(C);
+            consumeChar();
+          } else {
+            consumeChar();
+            break;
+          }
+        }
+      }
+    }
+
+    const unsigned Length = WordText.size();
+    if (Length == 0) {
+      Pos = SavedPos;
+      return false;
+    }
+
+    char *TextPtr = Allocator.Allocate<char>(Length + 1);
+
+    memcpy(TextPtr, WordText.c_str(), Length + 1);
+    StringRef Text = StringRef(TextPtr, Length);
+
+    formTokenWithChars(Tok, Loc, WordBegin, Length, Text);
+    return true;
+  }
+
   /// Extract a word -- sequence of non-whitespace characters.
   bool lexWord(Token &Tok) {
     if (isEnd())
@@ -295,6 +407,7 @@ Parser::parseCommandArgs(TextTokenRetokenizer &Retokenizer, unsigned NumArgs) {
       Comment::Argument[NumArgs];
   unsigned ParsedArgs = 0;
   Token Arg;
+
   while (ParsedArgs < NumArgs && Retokenizer.lexWord(Arg)) {
     Args[ParsedArgs] = Comment::Argument{
         SourceRange(Arg.getLocation(), Arg.getEndLocation()), Arg.getText()};
@@ -304,6 +417,23 @@ Parser::parseCommandArgs(TextTokenRetokenizer &Retokenizer, unsigned NumArgs) {
   return llvm::ArrayRef(Args, ParsedArgs);
 }
 
+ArrayRef<Comment::Argument>
+Parser::parseThrowCommandArgs(TextTokenRetokenizer &Retokenizer,
+                              unsigned NumArgs) {
+  auto *Args = new (Allocator.Allocate<Comment::Argument>(NumArgs))
+      Comment::Argument[NumArgs];
+  unsigned ParsedArgs = 0;
+  Token Arg;
+
+  while (ParsedArgs < NumArgs && Retokenizer.lexDataType(Arg)) {
+    Args[ParsedArgs] = Comment::Argument{
+        SourceRange(Arg.getLocation(), Arg.getEndLocation()), Arg.getText()};
+    ParsedArgs++;
+  }
+
+  return llvm::ArrayRef(Args, ParsedArgs);
+}
+
 BlockCommandComment *Parser::parseBlockCommand() {
   assert(Tok.is(tok::backslash_command) || Tok.is(tok::at_command));
 
@@ -356,6 +486,9 @@ BlockCommandComment *Parser::parseBlockCommand() {
       parseParamCommandArgs(PC, Retokenizer);
     else if (TPC)
       parseTParamCommandArgs(TPC, Retokenizer);
+    else if (Info->IsThrowsCommand)
+      S.actOnBlockCommandArgs(
+          BC, parseThrowCommandArgs(Retokenizer, Info->NumArgs));
     else
       S.actOnBlockCommandArgs(BC, parseCommandArgs(Retokenizer, Info->NumArgs));
 
diff --git a/clang/unittests/AST/CommentParser.cpp b/clang/unittests/AST/CommentParser.cpp
index c3479672ae2a3c..e01d654aa1cea2 100644
--- a/clang/unittests/AST/CommentParser.cpp
+++ b/clang/unittests/AST/CommentParser.cpp
@@ -1427,8 +1427,241 @@ TEST_F(CommentParserTest, Deprecated) {
   }
 }
 
+TEST_F(CommentParserTest, ThrowsCommandHasArg1) {
+  const char *Sources[] = {
+      "/// @throws int This function throws an integer",
+      ("/// @throws\n"
+       "/// int This function throws an integer"),
+      ("/// @throws \n"
+       "/// int This function throws an integer"),
+      ("/// @throws\n"
+       "/// int\n"
+       "/// This function throws an integer"),
+      ("/// @throws \n"
+       "/// int \n"
+       "/// This function throws an integer"),
+  };
+
+  for (size_t i = 0, e = std::size(Sources); i != e; i++) {
+    FullComment *FC = parseString(Sources[i]);
+    ASSERT_TRUE(HasChildCount(FC, 2));
+
+    ASSERT_TRUE(HasParagraphCommentAt(FC, 0, " "));
+    {
+      BlockCommandComment *BCC;
+      ParagraphComment *PC;
+      ASSERT_TRUE(HasBlockCommandAt(FC, Traits, 1, BCC, "throws", PC));
+      ASSERT_TRUE(HasChildCount(PC, 1));
+      ASSERT_TRUE(BCC->getNumArgs() == 1);
+      ASSERT_TRUE(BCC->getArgText(0) == "int");
+    }
+  }
+}
+
+TEST_F(CommentParserTest, ThrowsCommandHasArg2) {
+  const char *Sources[] = {
+      "/// @throws const int This function throws a const integer",
+      ("/// @throws\n"
+       "/// const int This function throws a const integer"),
+      ("/// @throws \n"
+       "/// const int This function throws a const integer"),
+      ("/// @throws\n"
+       "/// const int\n"
+       "/// This function throws a const integer"),
+      ("/// @throws \n"
+       "/// const int \n"
+       "/// This function throws a const integer"),
+  };
+
+  for (size_t i = 0, e = std::size(Sources); i != e; i++) {
+    FullComment *FC = parseString(Sources[i]);
+    ASSERT_TRUE(HasChildCount(FC, 2));
+
+    ASSERT_TRUE(HasParagraphCommentAt(FC, 0, " "));
+    {
+      BlockCommandComment *BCC;
+      ParagraphComment *PC;
+      ASSERT_TRUE(HasBlockCommandAt(FC, Traits, 1, BCC, "throws", PC));
+      ASSERT_TRUE(HasChildCount(PC, 1));
+      ASSERT_TRUE(BCC->getNumArgs() == 1);
+      ASSERT_TRUE(BCC->getArgText(0) == "const int");
+    }
+  }
+}
+
+TEST_F(CommentParserTest, ThrowsCommandHasArg3) {
+  const char *Sources[] = {
+      "/// @throws const int * This function throws a pointer to a const "
+      "integer\n",
+      ("/// @throws\n"
+       "/// const int * This function throws a pointer to a const integer"),
+      ("/// @throws \n"
+       "/// const int * This function throws a pointer to a const integer"),
+      ("/// @throws\n"
+       "/// const int *\n"
+       "/// This function throws a pointer to a const integer"),
+      ("/// @throws \n"
+       "/// const int *\n"
+       "/// This function throws a pointer to a const integer"),
+  };
+
+  for (size_t i = 0, e = std::size(Sources); i != e; i++) {
+    FullComment *FC = parseString(Sources[i]);
+    ASSERT_TRUE(HasChildCount(FC, 2));
+
+    ASSERT_TRUE(HasParagraphCommentAt(FC, 0, " "));
+    {
+      BlockCommandComment *BCC;
+      ParagraphComment *PC;
+      ASSERT_TRUE(HasBlockCommandAt(FC, Traits, 1, BCC, "throws", PC));
+      ASSERT_TRUE(HasChildCount(PC, 1));
+      ASSERT_TRUE(BCC->getNumArgs() == 1);
+      ASSERT_TRUE(BCC->getArgText(0) == "const int *");
+    }
+  }
+}
+
+TEST_F(CommentParserTest, ThrowsCommandHasArg4) {
+  const char *Sources[] = {
+      "/// @throws const int * const This function throws a const pointer to a "
+      "const integer",
+      ("/// @throws\n"
+       "/// const int * const This function throws a const pointer to a const "
+       "integer"),
+      ("/// @throws \n"
+       "/// const int * const This function throws a const pointer to a const "
+       "integer"),
+      ("/// @throws\n"
+       "/// const int * const\n"
+       "/// This function throws a const pointer to a const integer"),
+      ("/// @throws \n"
+       "/// const int * const\n"
+       "/// This function throws a const pointer to a const integer"),
+  };
+
+  for (size_t i = 0, e = std::size(Sources); i != e; i++) {
+    FullComment *FC = parseString(Sources[i]);
+    ASSERT_TRUE(HasChildCount(FC, 2));
+
+    ASSERT_TRUE(HasParagraphCommentAt(FC, 0, " "));
+    {
+      BlockCommandComment *BCC;
+      ParagraphComment *PC;
+      ASSERT_TRUE(HasBlockCommandAt(FC, Traits, 1, BCC, "throws", PC));
+      ASSERT_TRUE(HasChildCount(PC, 1));
+      ASSERT_TRUE(BCC->getNumArgs() == 1);
+      ASSERT_TRUE(BCC->getArgText(0) == "const int * const");
+    }
+  }
+}
+
+TEST_F(CommentParserTest, ThrowsCommandHasArg5) {
+  const char *Sources[] = {
+      "/// @throws int** This function throws a double pointer to an integer",
+  };
+
+  for (size_t i = 0, e = std::size(Sources); i != e; i++) {
+    FullComment *FC = parseString(Sources[i]);
+    ASSERT_TRUE(HasChildCount(FC, 2));
+
+    ASSERT_TRUE(HasParagraphCommentAt(FC, 0, " "));
+    {
+      BlockCommandComment *BCC;
+      ParagraphComment *PC;
+      ASSERT_TRUE(HasBlockCommandAt(FC, Traits, 1, BCC, "throws", PC));
+      ASSERT_TRUE(HasChildCount(PC, 1));
+      ASSERT_TRUE(BCC->getNumArgs() == 1);
+      ASSERT_TRUE(BCC->getArgText(0) == "int**");
+    }
+  }
+}
+
+TEST_F(CommentParserTest, ThrowsCommandHasArg6) {
+  const char *Sources[] = {
+      "/// @throws const char ** double pointer to a constant char pointer",
+  };
+
+  for (size_t i = 0, e = std::size(Sources); i != e; i++) {
+    FullComment *FC = parseString(Sources[i]);
+    ASSERT_TRUE(HasChildCount(FC, 2));
+
+    ASSERT_TRUE(HasParagraphCommentAt(FC, 0, " "));
+    {
+      BlockCommandComment *BCC;
+      ParagraphComment *PC;
+      ASSERT_TRUE(HasBlockCommandAt(FC, Traits, 1, BCC, "throws", PC));
+      ASSERT_TRUE(HasChildCount(PC, 1));
+      ASSERT_TRUE(BCC->getNumArgs() == 1);
+      ASSERT_TRUE(BCC->getArgText(0) == "const char **");
+    }
+  }
+}
+
+TEST_F(CommentParserTest, ThrowsCommandHasArg7) {
+  const char *Sources[] = {
+      "/// @throws Error<T> error of type Error<T>",
+  };
+
+  for (size_t i = 0, e = std::size(Sources); i != e; i++) {
+    FullComment *FC = parseString(Sources[i]);
+    ASSERT_TRUE(HasChildCount(FC, 2));
+
+    ASSERT_TRUE(HasParagraphCommentAt(FC, 0, " "));
+    {
+      BlockCommandComment *BCC;
+      ParagraphComment *PC;
+      ASSERT_TRUE(HasBlockCommandAt(FC, Traits, 1, BCC, "throws", PC));
+      ASSERT_TRUE(HasChildCount(PC, 3)); // Extra children because <T> is parsed
+                                         // as a series of TextComments
+      ASSERT_TRUE(BCC->getNumArgs() == 1);
+      ASSERT_TRUE(BCC->getArgText(0) == "Error<T>");
+    }
+  }
+}
+
+TEST_F(CommentParserTest, ThrowsCommandHasArg8) {
+  const char *Sources[] = {
+      "/// @throws Error<Container<T>> nested templates",
+  };
+
+  for (size_t i = 0, e = std::size(Sources); i != e; i++) {
+    FullComment *FC = parseString(Sources[i]);
+    ASSERT_TRUE(HasChildCount(FC, 2));
+
+    ASSERT_TRUE(HasParagraphCommentAt(FC, 0, " "));
+    {
+      BlockCommandComment *BCC;
+      ParagraphComment *PC;
+      ASSERT_TRUE(HasBlockCommandAt(FC, Traits, 1, BCC, "throws", PC));
+      ASSERT_TRUE(HasChildCount(PC, 1));
+      ASSERT_TRUE(BCC->getNumArgs() == 1);
+      ASSERT_TRUE(BCC->getArgText(0) == "Error<Container<T>>");
+    }
+  }
+}
+
+TEST_F(CommentParserTest, ThrowsCommandHasArg9) {
+  const char *Sources[] = {
+      "/// @throws Error<Ts...> variadic templates",
+  };
+
+  for (size_t i = 0, e = std::size(Sources); i != e; i++) {
+    FullComment *FC = parseString(Sources[i]);
+    ASSERT_TRUE(HasChildCount(FC, 2));
+
+    ASSERT_TRUE(HasParagraphCommentAt(FC, 0, " "));
+    {
+      BlockCommandComment *BCC;
+      ParagraphComment *PC;
+      ASSERT_TRUE(HasBlockCommandAt(FC, Traits, 1, BCC, "throws", PC));
+      ASSERT_TRUE(HasChildCount(PC, 1));
+      ASSERT_TRUE(BCC->getNumArgs() == 1);
+      ASSERT_TRUE(BCC->getArgText(0) == "Error<Ts...>");
+    }
+  }
+}
+
 } // unnamed namespace
 
 } // end namespace comments
 } // end namespace clang
-