[PATCH] MC asm parser: allow @'s in symbol names in MS assembly

Hans Wennborg hans at chromium.org
Fri Oct 18 11:32:44 PDT 2013


Hi rnk, rafael, majnemer,

This is another (final?) stab at making us able to parse our own asm output.

On Windows, it's not uncommon to have @'s and ?'s show up in symbol names. Our parser didn't like this. The ?'s were rejected, and the @'s were interpreted as trying to reference something like PLT or GOT.

My previous solution of quoting these names didn't work, as for some targets we use the gas assembler, which doesn't like quotes in certain places (most notably in .def directives).

Instead, I think we should try to parse these symbol names without quotes, as gas does.

This is probably not perfect, but should allow us to start being able to parse our asm output on windows.

http://llvm-reviews.chandlerc.com/D1978

Files:
  include/llvm/MC/MCAsmInfo.h
  lib/MC/MCAsmInfo.cpp
  lib/MC/MCParser/AsmLexer.cpp
  lib/MC/MCParser/AsmParser.cpp
  lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
  test/MC/COFF/tricky-names.ll

Index: include/llvm/MC/MCAsmInfo.h
===================================================================
--- include/llvm/MC/MCAsmInfo.h
+++ include/llvm/MC/MCAsmInfo.h
@@ -156,6 +156,10 @@
     /// symbol names.  This defaults to true.
     bool AllowPeriodsInName;
 
+    /// \brief This is true if the assembler allows @ characters in symbol
+    /// names. Defaults to false.
+    bool AllowAtInName;
+
     /// AllowUTF8 - This is true if the assembler accepts UTF-8 input.
     // FIXME: Make this a more general encoding setting?
     bool AllowUTF8;
@@ -485,6 +489,9 @@
     bool doesAllowPeriodsInName() const {
       return AllowPeriodsInName;
     }
+    bool doesAllowAtInName() const {
+      return AllowAtInName;
+    }
     bool doesAllowUTF8() const {
       return AllowUTF8;
     }
Index: lib/MC/MCAsmInfo.cpp
===================================================================
--- lib/MC/MCAsmInfo.cpp
+++ lib/MC/MCAsmInfo.cpp
@@ -53,6 +53,7 @@
   AllowQuotesInName = false;
   AllowNameToStartWithDigit = false;
   AllowPeriodsInName = true;
+  AllowAtInName = false;
   AllowUTF8 = true;
   UseDataRegionDirectives = false;
   ZeroDirective = "\t.zero\t";
Index: lib/MC/MCParser/AsmLexer.cpp
===================================================================
--- lib/MC/MCParser/AsmLexer.cpp
+++ lib/MC/MCParser/AsmLexer.cpp
@@ -138,9 +138,9 @@
   return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
 }
 
-/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
+/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
 static bool IsIdentifierChar(char c) {
-  return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@';
+  return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@' || c == '?';
 }
 AsmToken AsmLexer::LexIdentifier() {
   // Check for floating point literals.
Index: lib/MC/MCParser/AsmParser.cpp
===================================================================
--- lib/MC/MCParser/AsmParser.cpp
+++ lib/MC/MCParser/AsmParser.cpp
@@ -769,6 +769,7 @@
     Res = MCUnaryExpr::CreateLNot(Res, getContext());
     return false;
   case AsmToken::Dollar:
+  case AsmToken::At:
   case AsmToken::String:
   case AsmToken::Identifier: {
     StringRef Identifier;
@@ -792,19 +793,25 @@
     EndLoc = SMLoc::getFromPointer(Identifier.end());
 
     // This is a symbol reference.
+    StringRef SymbolName = Identifier;
+    MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
     std::pair<StringRef, StringRef> Split = Identifier.split('@');
-    MCSymbol *Sym = getContext().GetOrCreateSymbol(Split.first);
 
     // Lookup the symbol variant if used.
-    MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
     if (Split.first.size() != Identifier.size()) {
       Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
-      if (Variant == MCSymbolRefExpr::VK_Invalid) {
+      if (Variant != MCSymbolRefExpr::VK_Invalid) {
+        SymbolName = Split.first;
+      } else if (MAI.doesAllowAtInName()) {
+        Variant = MCSymbolRefExpr::VK_None;
+      } else {
         Variant = MCSymbolRefExpr::VK_None;
         return TokError("invalid variant '" + Split.second + "'");
       }
     }
 
+    MCSymbol *Sym = getContext().GetOrCreateSymbol(SymbolName);
+
     // If this is an absolute variable reference, substitute it now to preserve
     // semantics in the face of reassignment.
     if (Sym->isVariable() && isa<MCConstantExpr>(Sym->getVariableValue())) {
@@ -2105,25 +2112,25 @@
 ///   ::= string
 bool AsmParser::parseIdentifier(StringRef &Res) {
   // The assembler has relaxed rules for accepting identifiers, in particular we
-  // allow things like '.globl $foo', which would normally be separate
-  // tokens. At this level, we have already lexed so we cannot (currently)
+  // allow things like '.globl $foo' and '.def @feat.00', which would normally be
+  // separate tokens. At this level, we have already lexed so we cannot (currently)
   // handle this as a context dependent token, instead we detect adjacent tokens
   // and return the combined identifier.
-  if (Lexer.is(AsmToken::Dollar)) {
-    SMLoc DollarLoc = getLexer().getLoc();
+  if (Lexer.is(AsmToken::Dollar) || Lexer.is(AsmToken::At)) {
+    SMLoc PrefixLoc = getLexer().getLoc();
 
-    // Consume the dollar sign, and check for a following identifier.
+    // Consume the prefix character, and check for a following identifier.
     Lex();
     if (Lexer.isNot(AsmToken::Identifier))
       return true;
 
-    // We have a '$' followed by an identifier, make sure they are adjacent.
-    if (DollarLoc.getPointer() + 1 != getTok().getLoc().getPointer())
+    // We have a '$' or '@' followed by an identifier, make sure they are adjacent.
+    if (PrefixLoc.getPointer() + 1 != getTok().getLoc().getPointer())
       return true;
 
     // Construct the joined identifier and consume the token.
     Res =
-        StringRef(DollarLoc.getPointer(), getTok().getIdentifier().size() + 1);
+        StringRef(PrefixLoc.getPointer(), getTok().getIdentifier().size() + 1);
     Lex();
     return false;
   }
Index: lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
===================================================================
--- lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -135,6 +135,8 @@
   AssemblerDialect = AsmWriterFlavor;
 
   TextAlignFillValue = 0x90;
+
+  AllowAtInName = true;
 }
 
 void X86MCAsmInfoGNUCOFF::anchor() { }
Index: test/MC/COFF/tricky-names.ll
===================================================================
--- /dev/null
+++ test/MC/COFF/tricky-names.ll
@@ -0,0 +1,38 @@
+; Check how tricky symbols are printed in the asm output.
+; RUN: llc -mtriple=i686-pc-win32 %s -o - | FileCheck %s --check-prefix=ASM
+
+; Check that we can roundtrip these names through our assembler.
+; RUN: llc -mtriple=i686-pc-win32 %s -o - | llvm-mc -triple i686-pc-win32 -filetype=obj | llvm-readobj -t | FileCheck %s --check-prefix=READOBJ
+
+
+@"\01??__E_Generic_object@?$_Error_objects at H@std@@YAXXZ" = global i32 0
+@"\01__ZL16ExceptionHandlerP19_EXCEPTION_POINTERS at 4" = global i32 0
+@"\01 at foo.bar" = global i32 0
+
+define weak i32 @"\01??_B?$num_put at _WV?$back_insert_iterator at V?$basic_string at _WU?$char_traits at _W@std@@V?$allocator at _W@2@@std@@@std@@@std@@51"() section ".text" {
+  %a = load i32* @"\01??__E_Generic_object@?$_Error_objects at H@std@@YAXXZ"
+  %b = load i32* @"\01__ZL16ExceptionHandlerP19_EXCEPTION_POINTERS at 4"
+  %c = load i32* @"\01 at foo.bar"
+  %x = add i32 %a, %b
+  %y = add i32 %x, %c
+  ret i32 %y
+}
+
+; Check that these symbols are not quoted. They occur in output that gets passed to GAS.
+; ASM: .globl __ZL16ExceptionHandlerP19_EXCEPTION_POINTERS at 4
+; ASM-NOT: .globl "__ZL16ExceptionHandlerP19_EXCEPTION_POINTERS at 4"
+; ASM: .globl @foo.bar
+; ASM-NOT: .globl "@foo.bar"
+
+; READOBJ: Symbol
+; READOBJ: Name: .text$??_B?$num_put at _WV?$back_insert_iterator at V?$basic_string at _WU?$char_traits at _W@std@@V?$allocator at _W@2@@std@@@std@@@std@@51
+; READOBJ: Section: .text$??_B?$num_put at _WV?$back_insert_iterator at V?$basic_string at _WU?$char_traits at _W@std@@V?$allocator at _W@2@@std@@@std@@@std@@51
+; READOBJ: Symbol
+; READOBJ: Name: ??_B?$num_put at _WV?$back_insert_iterator at V?$basic_string at _WU?$char_traits at _W@std@@V?$allocator at _W@2@@std@@@std@@@std@@51
+; READOBJ: Section: .text$??_B?$num_put at _WV?$back_insert_iterator at V?$basic_string at _WU?$char_traits at _W@std@@V?$allocator at _W@2@@std@@@std@@@std@@51
+; READOBJ: Symbol
+; READOBJ: Name: ??__E_Generic_object@?$_Error_objects at H@std@@YAXXZ
+; READOBJ: Symbol
+; READOBJ: Name: __ZL16ExceptionHandlerP19_EXCEPTION_POINTERS at 4
+; READOBJ: Symbol
+; READOBJ: Name: @foo.bar
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D1978.1.patch
Type: text/x-patch
Size: 7724 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20131018/4b7cd6f7/attachment.bin>


More information about the llvm-commits mailing list