[llvm] 7b921a6 - [AsmParser][SystemZ][z/OS] Add in support to accept "#" as part of an Identifier token

Thu Apr 1 08:24:51 PDT 2021

Author: Anirudh Prasad
Date: 2021-04-01T11:24:43-04:00
New Revision: 7b921a674756dacebbe0431211ce43edb3493230

URL: https://github.com/llvm/llvm-project/commit/7b921a674756dacebbe0431211ce43edb3493230
DIFF: https://github.com/llvm/llvm-project/commit/7b921a674756dacebbe0431211ce43edb3493230.diff

LOG: [AsmParser][SystemZ][z/OS] Add in support to accept "#" as part of an Identifier token

- This patch adds support for accepting the "#" character as part of an Identifier token.
- This support is needed in particular for the HLASM dialect, since "#" is treated as part of the valid "Alphabet" range.
- This follows the precedent set by the `AllowAtInIdentifier` field in `MCAsmLexer.h`: a new field called `AllowHashInIdentifier` is introduced.
- The static function `IsIdentifierChar` (renamed to `isIdentifierChar`) is also updated to accept the `#` character if the `AllowHashInIdentifier` field is set to true.
Note: The field introduced in `MCAsmLexer.h` could very well be moved to `MCAsmInfo.h`. I'm not opposed to it. I decided to put it in `MCAsmLexer` since there seems to be some sort of precedent already with `AllowAtInIdentifier`.

Reviewed By: abhina.sreeskantharajan, nickdesaulniers, MaskRay

Differential Revision: https://reviews.llvm.org/D99277

Added: 
    

Modified: 
    llvm/include/llvm/MC/MCParser/MCAsmLexer.h
    llvm/lib/MC/MCParser/AsmLexer.cpp
    llvm/unittests/MC/SystemZ/SystemZAsmLexerTest.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/MC/MCParser/MCAsmLexer.h b/llvm/include/llvm/MC/MCParser/MCAsmLexer.h
index 21966d1c742dd..d6ef92ca0b852 100644

--- a/llvm/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/llvm/include/llvm/MC/MCParser/MCAsmLexer.h
@@ -48,6 +48,7 @@ class MCAsmLexer {
   const char *TokStart = nullptr;
   bool SkipSpace = true;
   bool AllowAtInIdentifier;
+  bool AllowHashInIdentifier = false;
   bool IsAtStartOfStatement = true;
   bool LexMasmHexFloats = false;
   bool LexMasmIntegers = false;
@@ -147,6 +148,8 @@ class MCAsmLexer {
   bool getAllowAtInIdentifier() { return AllowAtInIdentifier; }
   void setAllowAtInIdentifier(bool v) { AllowAtInIdentifier = v; }
 
+  void setAllowHashInIdentifier(bool V) { AllowHashInIdentifier = V; }
+
   void setCommentConsumer(AsmCommentConsumer *CommentConsumer) {
     this->CommentConsumer = CommentConsumer;
   }

diff  --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp
index dd481d46f7883..c4006f92b80eb 100644
--- a/llvm/lib/MC/MCParser/AsmLexer.cpp
+++ b/llvm/lib/MC/MCParser/AsmLexer.cpp
@@ -143,10 +143,10 @@ AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
   return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
 }
 
-/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
-static bool IsIdentifierChar(char c, bool AllowAt) {
-  return isAlnum(c) || c == '_' || c == '$' || c == '.' ||
-         (c == '@' && AllowAt) || c == '?';
+/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@#?]*
+static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
+  return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
+         (AllowAt && C == '@') || (AllowHash && C == '#');
 }
 
 AsmToken AsmLexer::LexIdentifier() {
@@ -156,12 +156,13 @@ AsmToken AsmLexer::LexIdentifier() {
     while (isDigit(*CurPtr))
       ++CurPtr;
 
-    if (!IsIdentifierChar(*CurPtr, AllowAtInIdentifier) ||
+    if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier,
+                          AllowHashInIdentifier) ||
         *CurPtr == 'e' || *CurPtr == 'E')
       return LexFloatLiteral();
   }
 
-  while (IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
+  while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier))
     ++CurPtr;
 
   // Handle . as a special case.
@@ -726,9 +727,10 @@ AsmToken AsmLexer::LexToken() {
   switch (CurChar) {
   default:
     if (MAI.doesAllowSymbolAtNameStart()) {
-      // Handle Microsoft-style identifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@?]*
+      // Handle Microsoft-style identifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
       if (!isDigit(CurChar) &&
-          IsIdentifierChar(CurChar, MAI.doesAllowAtInName()))
+          isIdentifierChar(CurChar, MAI.doesAllowAtInName(),
+                           AllowHashInIdentifier))
         return LexIdentifier();
     } else {
       // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*

diff  --git a/llvm/unittests/MC/SystemZ/SystemZAsmLexerTest.cpp b/llvm/unittests/MC/SystemZ/SystemZAsmLexerTest.cpp
index 183b3ffc9352a..8eea737886d63 100644
--- a/llvm/unittests/MC/SystemZ/SystemZAsmLexerTest.cpp
+++ b/llvm/unittests/MC/SystemZ/SystemZAsmLexerTest.cpp
@@ -94,8 +94,6 @@ class SystemZAsmLexerTest : public ::testing::Test {
     Str.reset(TheTarget->createNullStreamer(*Ctx));
 
     Parser.reset(createMCAsmParser(SrcMgr, *Ctx, *Str, *MUPMAI));
-    // Lex initially to get the string.
-    Parser->getLexer().Lex();
   }
 
   void lexAndCheckTokens(StringRef AsmStr,
@@ -116,6 +114,9 @@ TEST_F(SystemZAsmLexerTest, CheckDontRestrictCommentStringToStartOfStatement) {
   // Setup.
   setupCallToAsmParser(AsmStr);
 
+  // Lex initially to get the string.
+  Parser->getLexer().Lex();
+
   SmallVector<AsmToken::TokenKind> ExpectedTokens(
       {AsmToken::Identifier, AsmToken::EndOfStatement});
   lexAndCheckTokens(AsmStr /* "jne #-4" */, ExpectedTokens);
@@ -129,6 +130,9 @@ TEST_F(SystemZAsmLexerTest, CheckRestrictCommentStringToStartOfStatement) {
   MUPMAI->setRestrictCommentStringToStartOfStatement(true);
   setupCallToAsmParser(AsmStr);
 
+  // Lex initially to get the string.
+  Parser->getLexer().Lex();
+
   // When we are restricting the comment string to only the start of the
   // statement, The sequence of tokens we are expecting are: Identifier - "jne"
   // Hash - '#'
@@ -148,8 +152,65 @@ TEST_F(SystemZAsmLexerTest, CheckHLASMComment) {
   MUPMAI->setCommentString("*");
   setupCallToAsmParser(AsmStr);
 
+  // Lex initially to get the string.
+  Parser->getLexer().Lex();
+
   SmallVector<AsmToken::TokenKind> ExpectedTokens(
       {AsmToken::EndOfStatement, AsmToken::Eof});
   lexAndCheckTokens(AsmStr /* "* lhi 1,10" */, ExpectedTokens);
 }
+
+TEST_F(SystemZAsmLexerTest, CheckHashDefault) {
+  StringRef AsmStr = "lh#123";
+
+  // Setup.
+  setupCallToAsmParser(AsmStr);
+
+  // Lex initially to get the string.
+  Parser->getLexer().Lex();
+
+  // "lh" -> Identifier
+  // "#123" -> EndOfStatement (Lexed as a comment since CommentString is "#")
+  SmallVector<AsmToken::TokenKind> ExpectedTokens(
+      {AsmToken::Identifier, AsmToken::EndOfStatement, AsmToken::Eof});
+  lexAndCheckTokens(AsmStr, ExpectedTokens);
+}
+
+// Test if "#" is accepted as part of an Identifier
+TEST_F(SystemZAsmLexerTest, CheckAllowHashInIdentifier) {
+  StringRef AsmStr = "lh#123";
+
+  // Setup.
+  setupCallToAsmParser(AsmStr);
+  Parser->getLexer().setAllowHashInIdentifier(true);
+
+  // Lex initially to get the string.
+  Parser->getLexer().Lex();
+
+  // "lh#123" -> Identifier
+  SmallVector<AsmToken::TokenKind> ExpectedTokens(
+      {AsmToken::Identifier, AsmToken::EndOfStatement, AsmToken::Eof});
+  lexAndCheckTokens(AsmStr, ExpectedTokens);
+}
+
+TEST_F(SystemZAsmLexerTest, CheckAllowHashInIdentifier2) {
+  StringRef AsmStr = "lh#12*3";
+
+  // Setup.
+  MUPMAI->setCommentString("*");
+  MUPMAI->setRestrictCommentStringToStartOfStatement(true);
+  setupCallToAsmParser(AsmStr);
+  Parser->getLexer().setAllowHashInIdentifier(true);
+
+  // Lex initially to get the string.
+  Parser->getLexer().Lex();
+
+  // "lh#12" -> Identifier
+  // "*" -> Star
+  // "3" -> Integer
+  SmallVector<AsmToken::TokenKind> ExpectedTokens(
+      {AsmToken::Identifier, AsmToken::Star, AsmToken::Integer,
+       AsmToken::EndOfStatement, AsmToken::Eof});
+  lexAndCheckTokens(AsmStr, ExpectedTokens);
+}
 } // end anonymous namespace