[llvm] r314883 - [MC] - Don't assert when non-english characters are used.

Wed Oct 4 01:50:08 PDT 2017

Author: grimar
Date: Wed Oct  4 01:50:08 2017
New Revision: 314883

URL: http://llvm.org/viewvc/llvm-project?rev=314883&view=rev
Log:
[MC] - Don't assert when non-english characters are used.

I found that llvm-mc does not like non-english characters even in comments,
which it tries to tokenize.

Problem happens because of functions like isdigit(), isalnum() which takes
int argument and expects it is not negative.
But at the same time MCParser uses char* to store input buffer poiner, char has signed value,
so it is possible to pass negative value to one of functions from above and
that triggers an assert. 
Testcase for demonstration is provided.

To fix the issue helper functions were introduced in StringExtras.h

Differential revision: https://reviews.llvm.org/D38461

Added:
    llvm/trunk/test/MC/AsmParser/non-english-characters.s
Modified:
    llvm/trunk/include/llvm/ADT/StringExtras.h
    llvm/trunk/lib/MC/MCParser/AsmLexer.cpp

Modified: llvm/trunk/include/llvm/ADT/StringExtras.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/ADT/StringExtras.h?rev=314883&r1=314882&r2=314883&view=diff
==============================================================================

--- llvm/trunk/include/llvm/ADT/StringExtras.h (original)
+++ llvm/trunk/include/llvm/ADT/StringExtras.h Wed Oct  4 01:50:08 2017
@@ -59,6 +59,21 @@ static inline unsigned hexDigitValue(cha
   return -1U;
 }
 
+/// Checks if character \p C is one of the 10 decimal digits.
+static inline bool isDigit(char C) { return C >= '0' && C <= '9'; }
+
+/// Checks if character \p C is a hexadecimal numeric character.
+static inline bool isHexDigit(char C) { return hexDigitValue(C) != -1U; }
+
+/// Checks if character \p C is a valid letter as classified by "C" locale.
+static inline bool isAlpha(char C) {
+  return ('a' <= C && C <= 'z') || ('A' <= C && C <= 'Z');
+}
+
+/// Checks whether character \p C is either a decimal digit or an uppercase or
+/// lowercase letter as classified by "C" locale.
+static inline bool isAlnum(char C) { return isAlpha(C) || isDigit(C); }
+
 static inline std::string utohexstr(uint64_t X, bool LowerCase = false) {
   char Buffer[17];
   char *BufPtr = std::end(Buffer);

Modified: llvm/trunk/lib/MC/MCParser/AsmLexer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/MC/MCParser/AsmLexer.cpp?rev=314883&r1=314882&r2=314883&view=diff
==============================================================================
--- llvm/trunk/lib/MC/MCParser/AsmLexer.cpp (original)
+++ llvm/trunk/lib/MC/MCParser/AsmLexer.cpp Wed Oct  4 01:50:08 2017
@@ -14,6 +14,7 @@
 #include "llvm/MC/MCParser/AsmLexer.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -68,7 +69,7 @@ int AsmLexer::getNextChar() {
 /// consumed.
 AsmToken AsmLexer::LexFloatLiteral() {
   // Skip the fractional digit sequence.
-  while (isdigit(*CurPtr))
+  while (isDigit(*CurPtr))
     ++CurPtr;
 
   // Check for exponent; we intentionally accept a slighlty wider set of
@@ -78,7 +79,7 @@ AsmToken AsmLexer::LexFloatLiteral() {
     ++CurPtr;
     if (*CurPtr == '-' || *CurPtr == '+')
       ++CurPtr;
-    while (isdigit(*CurPtr))
+    while (isDigit(*CurPtr))
       ++CurPtr;
   }
 
@@ -102,7 +103,7 @@ AsmToken AsmLexer::LexHexFloatLiteral(bo
     ++CurPtr;
 
     const char *FracStart = CurPtr;
-    while (isxdigit(*CurPtr))
+    while (isHexDigit(*CurPtr))
       ++CurPtr;
 
     NoFracDigits = CurPtr == FracStart;
@@ -123,7 +124,7 @@ AsmToken AsmLexer::LexHexFloatLiteral(bo
 
   // N.b. exponent digits are *not* hex
   const char *ExpStart = CurPtr;
-  while (isdigit(*CurPtr))
+  while (isDigit(*CurPtr))
     ++CurPtr;
 
   if (CurPtr == ExpStart)
@@ -135,15 +136,15 @@ AsmToken AsmLexer::LexHexFloatLiteral(bo
 
 /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
 static bool IsIdentifierChar(char c, bool AllowAt) {
-  return isalnum(c) || c == '_' || c == '$' || c == '.' ||
+  return isAlnum(c) || c == '_' || c == '$' || c == '.' ||
          (c == '@' && AllowAt) || c == '?';
 }
 
 AsmToken AsmLexer::LexIdentifier() {
   // Check for floating point literals.
-  if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
+  if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
     // Disambiguate a .1243foo identifier from a floating literal.
-    while (isdigit(*CurPtr))
+    while (isDigit(*CurPtr))
       ++CurPtr;
     if (*CurPtr == 'e' || *CurPtr == 'E' ||
         !IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
@@ -244,9 +245,9 @@ static unsigned doLookAhead(const char *
   const char *FirstHex = nullptr;
   const char *LookAhead = CurPtr;
   while (true) {
-    if (isdigit(*LookAhead)) {
+    if (isDigit(*LookAhead)) {
       ++LookAhead;
-    } else if (isxdigit(*LookAhead)) {
+    } else if (isHexDigit(*LookAhead)) {
       if (!FirstHex)
         FirstHex = LookAhead;
       ++LookAhead;
@@ -282,7 +283,7 @@ AsmToken AsmLexer::LexDigit() {
     const char *FirstNonBinary = (CurPtr[-1] != '0' && CurPtr[-1] != '1') ?
                                    CurPtr - 1 : nullptr;
     const char *OldCurPtr = CurPtr;
-    while (isxdigit(*CurPtr)) {
+    while (isHexDigit(*CurPtr)) {
       if (*CurPtr != '0' && *CurPtr != '1' && !FirstNonBinary)
         FirstNonBinary = CurPtr;
       ++CurPtr;
@@ -346,7 +347,7 @@ AsmToken AsmLexer::LexDigit() {
   if (!IsParsingMSInlineAsm && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
     ++CurPtr;
     // See if we actually have "0b" as part of something like "jmp 0b\n"
-    if (!isdigit(CurPtr[0])) {
+    if (!isDigit(CurPtr[0])) {
       --CurPtr;
       StringRef Result(TokStart, CurPtr - TokStart);
       return AsmToken(AsmToken::Integer, Result, 0);
@@ -375,7 +376,7 @@ AsmToken AsmLexer::LexDigit() {
   if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
     ++CurPtr;
     const char *NumStart = CurPtr;
-    while (isxdigit(CurPtr[0]))
+    while (isHexDigit(CurPtr[0]))
       ++CurPtr;
 
     // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be

Added: llvm/trunk/test/MC/AsmParser/non-english-characters.s
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/MC/AsmParser/non-english-characters.s?rev=314883&view=auto
==============================================================================
--- llvm/trunk/test/MC/AsmParser/non-english-characters.s (added)
+++ llvm/trunk/test/MC/AsmParser/non-english-characters.s Wed Oct  4 01:50:08 2017
@@ -0,0 +1,14 @@
+# RUN: llvm-mc -triple i386-linux-gnu -filetype=obj -o %t %s
+# RUN: llvm-readobj %t | FileCheck %s
+# CHECK: Format: ELF32-i386
+
+# 0bÑ
+# 0xÑ
+# .Ñ4
+# .XÑ
+# .1Ñ
+# .1eÑ
+# 0x.Ñ
+# 0x0pÑ
+.intel_syntax
+# 1Ñ