[llvm] 07c4f1d - [ms] [llvm-ml] Lex MASM strings, including escaping

Wed Nov 4 12:29:45 PST 2020

Author: Eric Astor
Date: 2020-11-04T15:28:43-05:00
New Revision: 07c4f1d10b305635cc74c7c853c18197faea5d19

URL: https://github.com/llvm/llvm-project/commit/07c4f1d10b305635cc74c7c853c18197faea5d19
DIFF: https://github.com/llvm/llvm-project/commit/07c4f1d10b305635cc74c7c853c18197faea5d19.diff

LOG: [ms] [llvm-ml] Lex MASM strings, including escaping

Allow single-quoted strings and double-quoted character values, as well as doubled-quote escaping.

Reviewed By: thakis

Differential Revision: https://reviews.llvm.org/D89731

Added: 
    llvm/test/tools/llvm-ml/strings.test

Modified: 
    llvm/include/llvm/MC/MCParser/AsmLexer.h
    llvm/include/llvm/MC/MCParser/MCAsmLexer.h
    llvm/lib/MC/MCParser/AsmLexer.cpp
    llvm/lib/MC/MCParser/MasmParser.cpp
    llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
    llvm/test/tools/llvm-ml/struct.test
    llvm/tools/llvm-ml/llvm-ml.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/MC/MCParser/AsmLexer.h b/llvm/include/llvm/MC/MCParser/AsmLexer.h
index 05b3695bc7a0..e187a28f267d 100644

--- a/llvm/include/llvm/MC/MCParser/AsmLexer.h
+++ b/llvm/include/llvm/MC/MCParser/AsmLexer.h
@@ -56,6 +56,7 @@ class AsmLexer : public MCAsmLexer {
   bool isAtStartOfComment(const char *Ptr);
   bool isAtStatementSeparator(const char *Ptr);
   int getNextChar();
+  int peekNextChar();
   AsmToken ReturnError(const char *Loc, const std::string &Msg);
 
   AsmToken LexIdentifier();

diff  --git a/llvm/include/llvm/MC/MCParser/MCAsmLexer.h b/llvm/include/llvm/MC/MCParser/MCAsmLexer.h
index e2f3301d2f2b..21966d1c742d 100644
--- a/llvm/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/llvm/include/llvm/MC/MCParser/MCAsmLexer.h
@@ -51,6 +51,7 @@ class MCAsmLexer {
   bool IsAtStartOfStatement = true;
   bool LexMasmHexFloats = false;
   bool LexMasmIntegers = false;
+  bool LexMasmStrings = false;
   bool UseMasmDefaultRadix = false;
   unsigned DefaultRadix = 10;
   AsmCommentConsumer *CommentConsumer = nullptr;
@@ -163,6 +164,10 @@ class MCAsmLexer {
 
   /// Set whether to lex masm-style hex float literals, such as 3f800000r.
   void setLexMasmHexFloats(bool V) { LexMasmHexFloats = V; }
+
+  /// Set whether to lex masm-style string literals, such as 'Can''t find file'
+  /// and "This ""value"" not found".
+  void setLexMasmStrings(bool V) { LexMasmStrings = V; }
 };
 
 } // end namespace llvm

diff  --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp
index d8a20341bfb9..5c9d1264aaa0 100644
--- a/llvm/lib/MC/MCParser/AsmLexer.cpp
+++ b/llvm/lib/MC/MCParser/AsmLexer.cpp
@@ -64,6 +64,12 @@ int AsmLexer::getNextChar() {
   return (unsigned char)*CurPtr++;
 }
 
+int AsmLexer::peekNextChar() {
+  if (CurPtr == CurBuf.end())
+    return EOF;
+  return (unsigned char)*CurPtr;
+}
+
 /// The leading integral digit sequence and dot should have already been
 /// consumed, some or all of the fractional digit sequence *can* have been
 /// consumed.
@@ -521,6 +527,24 @@ AsmToken AsmLexer::LexDigit() {
 AsmToken AsmLexer::LexSingleQuote() {
   int CurChar = getNextChar();
 
+  if (LexMasmStrings) {
+    while (CurChar != EOF) {
+      if (CurChar != '\'') {
+        CurChar = getNextChar();
+      } else if (peekNextChar() == '\'') {
+        // In MASM single-quote strings, doubled single-quotes mean an escaped
+        // single quote, so should be lexed in.
+        getNextChar();
+        CurChar = getNextChar();
+      } else {
+        break;
+      }
+    }
+    if (CurChar == EOF)
+      return ReturnError(TokStart, "unterminated string constant");
+    return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
+  }
+
   if (CurChar == '\\')
     CurChar = getNextChar();
 
@@ -555,6 +579,24 @@ AsmToken AsmLexer::LexSingleQuote() {
 /// LexQuote: String: "..."
 AsmToken AsmLexer::LexQuote() {
   int CurChar = getNextChar();
+  if (LexMasmStrings) {
+    while (CurChar != EOF) {
+      if (CurChar != '"') {
+        CurChar = getNextChar();
+      } else if (peekNextChar() == '"') {
+        // In MASM double-quoted strings, doubled double-quotes mean an escaped
+        // double quote, so should be lexed in.
+        getNextChar();
+        CurChar = getNextChar();
+      } else {
+        break;
+      }
+    }
+    if (CurChar == EOF)
+      return ReturnError(TokStart, "unterminated string constant");
+    return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
+  }
+
   // TODO: does gas allow multiline string constants?
   while (CurChar != '"') {
     if (CurChar == '\\') {

diff  --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp
index 6bddbe6ef50a..d07bbd105ae3 100644
--- a/llvm/lib/MC/MCParser/MasmParser.cpp
+++ b/llvm/lib/MC/MCParser/MasmParser.cpp
@@ -3086,70 +3086,19 @@ bool MasmParser::parseEscapedString(std::string &Data) {
     return true;
 
   Data = "";
+  char Quote = getTok().getString().front();
   StringRef Str = getTok().getStringContents();
-  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
-    if (Str[i] != '\\') {
-      Data += Str[i];
-      continue;
-    }
-
-    // Recognize escaped characters. Note that this escape semantics currently
-    // loosely follows Darwin 'as'.
-    ++i;
-    if (i == e)
-      return TokError("unexpected backslash at end of string");
-
-    // Recognize hex sequences similarly to GNU 'as'.
-    if (Str[i] == 'x' || Str[i] == 'X') {
-      size_t length = Str.size();
-      if (i + 1 >= length || !isHexDigit(Str[i + 1]))
-        return TokError("invalid hexadecimal escape sequence");
-
-      // Consume hex characters. GNU 'as' reads all hexadecimal characters and
-      // then truncates to the lower 16 bits. Seems reasonable.
-      unsigned Value = 0;
-      while (i + 1 < length && isHexDigit(Str[i + 1]))
-        Value = Value * 16 + hexDigitValue(Str[++i]);
-
-      Data += (unsigned char)(Value & 0xFF);
-      continue;
-    }
-
-    // Recognize octal sequences.
-    if ((unsigned)(Str[i] - '0') <= 7) {
-      // Consume up to three octal characters.
-      unsigned Value = Str[i] - '0';
-
-      if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
+  Data.reserve(Str.size());
+  for (int i = 0, e = Str.size(); i != e; ++i) {
+    Data.push_back(Str[i]);
+    if (Str[i] == Quote) {
+      // MASM treats doubled delimiting quotes as an escaped delimiting quote.
+      // If we're escaping the string's trailing delimiter, we're definitely
+      // missing a quotation mark.
+      if (i + 1 == Str.size())
+        return Error(getTok().getLoc(), "missing quotation mark in string");
+      if (Str[i + 1] == Quote)
         ++i;
-        Value = Value * 8 + (Str[i] - '0');
-
-        if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
-          ++i;
-          Value = Value * 8 + (Str[i] - '0');
-        }
-      }
-
-      if (Value > 255)
-        return TokError("invalid octal escape sequence (out of range)");
-
-      Data += (unsigned char)Value;
-      continue;
-    }
-
-    // Otherwise recognize individual escapes.
-    switch (Str[i]) {
-    default:
-      // Just reject invalid escape sequences for now.
-      return TokError("invalid escape sequence (unrecognized character)");
-
-    case 'b': Data += '\b'; break;
-    case 'f': Data += '\f'; break;
-    case 'n': Data += '\n'; break;
-    case 'r': Data += '\r'; break;
-    case 't': Data += '\t'; break;
-    case '"': Data += '"'; break;
-    case '\\': Data += '\\'; break;
     }
   }
 
@@ -3220,7 +3169,9 @@ bool MasmParser::parseScalarInitializer(unsigned Size,
                                         SmallVectorImpl<const MCExpr *> &Values,
                                         unsigned StringPadLength) {
   if (getTok().is(AsmToken::String)) {
-    StringRef Value = getTok().getStringContents();
+    std::string Value;
+    if (parseEscapedString(Value))
+      return true;
     if (Size == 1) {
       // Treat each character as an initializer.
       for (const char CharVal : Value)
@@ -3235,11 +3186,10 @@ bool MasmParser::parseScalarInitializer(unsigned Size,
         return Error(getTok().getLoc(), "out of range literal value");
 
       uint64_t IntValue = 0;
-      for (const unsigned char CharVal : Value.bytes())
+      for (const unsigned char CharVal : Value)
         IntValue = (IntValue << 8) | CharVal;
       Values.push_back(MCConstantExpr::create(IntValue, getContext()));
     }
-    Lex();
   } else {
     const MCExpr *Value;
     if (parseExpression(Value))

diff  --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 4952c78e1fc3..6d037ca14523 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1696,6 +1696,17 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
     case AsmToken::At:
     case AsmToken::String:
     case AsmToken::Identifier: {
+      if (Parser.isParsingMasm() && Tok.is(AsmToken::String)) {
+        // Single-character strings should be treated as integer constants. This
+        // includes MASM escapes for quotes.
+        char Quote = Tok.getString().front();
+        StringRef Contents = Tok.getStringContents();
+        if (Contents.size() == 1 || Contents == std::string(2, Quote)) {
+          if (SM.onInteger(Contents.front(), ErrMsg))
+            return Error(Tok.getLoc(), ErrMsg);
+          break;
+        }
+      }
       SMLoc IdentLoc = Tok.getLoc();
       StringRef Identifier = Tok.getString();
       UpdateLocLex = false;

diff  --git a/llvm/test/tools/llvm-ml/strings.test b/llvm/test/tools/llvm-ml/strings.test
new file mode 100644
index 000000000000..5064a458b7ec
--- /dev/null
+++ b/llvm/test/tools/llvm-ml/strings.test
@@ -0,0 +1,122 @@
+# RUN: llvm-ml -filetype=asm %s | FileCheck %s
+
+.data
+
+dq_single_character BYTE "a"
+; CHECK-LABEL: dq_single_character:
+; CHECK-NEXT: .byte 97
+; CHECK-NOT: .byte
+
+dq_join BYTE "ab", "cd"
+; CHECK-LABEL: dq_join:
+; CHECK-NEXT: .byte 97
+; CHECK-NEXT: .byte 98
+; CHECK-NEXT: .byte 99
+; CHECK-NEXT: .byte 100
+; CHECK-NOT: .byte
+
+dq_quote_escape BYTE "ab""""cd"
+; Intended result: ab""cd
+; CHECK-LABEL: dq_quote_escape:
+; CHECK-NEXT: .byte 97
+; CHECK-NEXT: .byte 98
+; CHECK-NEXT: .byte 34
+; CHECK-NEXT: .byte 34
+; CHECK-NEXT: .byte 99
+; CHECK-NEXT: .byte 100
+; CHECK-NOT: .byte
+
+dq_single_quote BYTE "ab''''cd"
+; Intended result: ab''''cd
+; CHECK-LABEL: dq_single_quote:
+; CHECK-NEXT: .byte 97
+; CHECK-NEXT: .byte 98
+; CHECK-NEXT: .byte 39
+; CHECK-NEXT: .byte 39
+; CHECK-NEXT: .byte 39
+; CHECK-NEXT: .byte 39
+; CHECK-NEXT: .byte 99
+; CHECK-NEXT: .byte 100
+; CHECK-NOT: .byte
+
+sq_single_character BYTE 'a'
+; CHECK-LABEL: sq_single_character:
+; CHECK-NEXT: .byte 97
+; CHECK-NOT: .byte
+
+sq_join BYTE 'ab', 'cd'
+; CHECK-LABEL: sq_join:
+; CHECK-NEXT: .byte 97
+; CHECK-NEXT: .byte 98
+; CHECK-NEXT: .byte 99
+; CHECK-NEXT: .byte 100
+; CHECK-NOT: .byte
+
+sq_quote_escape BYTE 'ab''''cd'
+; Intended result: ab''cd
+; CHECK-LABEL: sq_quote_escape:
+; CHECK-NEXT: .byte 97
+; CHECK-NEXT: .byte 98
+; CHECK-NEXT: .byte 39
+; CHECK-NEXT: .byte 39
+; CHECK-NEXT: .byte 99
+; CHECK-NEXT: .byte 100
+; CHECK-NOT: .byte
+
+sq_double_quote BYTE 'ab""""cd'
+; Intended result: ab""""cd
+; CHECK-LABEL: sq_double_quote:
+; CHECK-NEXT: .byte 97
+; CHECK-NEXT: .byte 98
+; CHECK-NEXT: .byte 34
+; CHECK-NEXT: .byte 34
+; CHECK-NEXT: .byte 34
+; CHECK-NEXT: .byte 34
+; CHECK-NEXT: .byte 99
+; CHECK-NEXT: .byte 100
+; CHECK-NOT: .byte
+
+mixed_quotes_join BYTE "a'b", 'c"d'
+; Intended result: a'bc"d
+; CHECK-LABEL: mixed_quotes_join:
+; CHECK-NEXT: .byte 97
+; CHECK-NEXT: .byte 39
+; CHECK-NEXT: .byte 98
+; CHECK-NEXT: .byte 99
+; CHECK-NEXT: .byte 34
+; CHECK-NEXT: .byte 100
+; CHECK-NOT: .byte
+
+.code
+
+sq_char_test PROC
+; CHECK-LABEL: sq_char_test:
+
+  mov eax, 'a'
+; CHECK: mov eax, 97
+
+  mov eax, ''''
+; CHECK: mov eax, 39
+
+  mov eax, '"'
+; CHECK: mov eax, 34
+
+  ret
+sq_char_test ENDP
+
+dq_char_test PROC
+; CHECK-LABEL: dq_char_test:
+
+  mov eax, "b"
+; CHECK: mov eax, 98
+
+  mov eax, """"
+; CHECK: mov eax, 34
+
+  mov eax, "'"
+; CHECK: mov eax, 39
+
+  ret
+dq_char_test ENDP
+
+end

diff  --git a/llvm/test/tools/llvm-ml/struct.test b/llvm/test/tools/llvm-ml/struct.test
index 479d31c8121f..b29a069dc84c 100644
--- a/llvm/test/tools/llvm-ml/struct.test
+++ b/llvm/test/tools/llvm-ml/struct.test
@@ -46,7 +46,7 @@ t1 foobar <>
 ; CHECK-NEXT: .byte 101
 ; CHECK-NEXT: .zero 1
 
-t2 FOOBAR <"gh",,<10,11>,<12>,"ijk">
+t2 FOOBAR <"gh",,<10,11>,<12>,'ijk'>
 
 ; CHECK: t2:
 ;

diff  --git a/llvm/tools/llvm-ml/llvm-ml.cpp b/llvm/tools/llvm-ml/llvm-ml.cpp
index 1586870e0855..d01c66e13267 100644
--- a/llvm/tools/llvm-ml/llvm-ml.cpp
+++ b/llvm/tools/llvm-ml/llvm-ml.cpp
@@ -184,6 +184,7 @@ static int AsLexInput(SourceMgr &SrcMgr, MCAsmInfo &MAI, raw_ostream &OS) {
   Lexer.setLexMasmIntegers(true);
   Lexer.useMasmDefaultRadix(true);
   Lexer.setLexMasmHexFloats(true);
+  Lexer.setLexMasmStrings(true);
 
   bool Error = false;
   while (Lexer.Lex().isNot(AsmToken::Eof)) {
@@ -216,6 +217,7 @@ static int AssembleInput(const char *ProgName, const Target *TheTarget,
   Parser->getLexer().setLexMasmIntegers(true);
   Parser->getLexer().useMasmDefaultRadix(true);
   Parser->getLexer().setLexMasmHexFloats(true);
+  Parser->getLexer().setLexMasmStrings(true);
 
   int Res = Parser->Run(/*NoInitialTextSection=*/true);