[PATCH] D38461: [MC] - Don't assert when non-english characters are used.

Tue Oct 3 09:17:04 PDT 2017

LGTM.

Thanks,
Rafael

George Rimar via Phabricator <reviews at reviews.llvm.org> writes:

> grimar updated this revision to Diff 117503.
> grimar added a comment.
>
> - Added helpers to StringExtras as was suggested.
>
>
> https://reviews.llvm.org/D38461
>
> Files:
>   include/llvm/ADT/StringExtras.h
>   lib/MC/MCParser/AsmLexer.cpp
>   test/MC/AsmParser/non-english-characters.s
>
> Index: test/MC/AsmParser/non-english-characters.s
> ===================================================================
> --- test/MC/AsmParser/non-english-characters.s
> +++ test/MC/AsmParser/non-english-characters.s
> @@ -0,0 +1,14 @@
> +# RUN: llvm-mc -triple i386-linux-gnu -filetype=obj -o %t %s
> +# RUN: llvm-readobj %t | FileCheck %s
> +# CHECK: Format: ELF32-i386
> +
> +# 0bﾑ
> +# 0xﾑ
> +# .ﾑ4
> +# .Xﾑ
> +# .1ﾑ
> +# .1eﾑ
> +# 0x.ﾑ
> +# 0x0pﾑ
> +.intel_syntax
> +# 1ﾑ
> Index: lib/MC/MCParser/AsmLexer.cpp
> ===================================================================
> --- lib/MC/MCParser/AsmLexer.cpp
> +++ lib/MC/MCParser/AsmLexer.cpp
> @@ -14,6 +14,7 @@
>  #include "llvm/MC/MCParser/AsmLexer.h"
>  #include "llvm/ADT/APInt.h"
>  #include "llvm/ADT/ArrayRef.h"
> +#include "llvm/ADT/StringExtras.h"
>  #include "llvm/ADT/StringRef.h"
>  #include "llvm/ADT/StringSwitch.h"
>  #include "llvm/MC/MCAsmInfo.h"
> @@ -68,7 +69,7 @@
>  /// consumed.
>  AsmToken AsmLexer::LexFloatLiteral() {
>    // Skip the fractional digit sequence.
> -  while (isdigit(*CurPtr))
> +  while (isDigit(*CurPtr))
>      ++CurPtr;
>  
>    // Check for exponent; we intentionally accept a slighlty wider set of
> @@ -78,7 +79,7 @@
>      ++CurPtr;
>      if (*CurPtr == '-' || *CurPtr == '+')
>        ++CurPtr;
> -    while (isdigit(*CurPtr))
> +    while (isDigit(*CurPtr))
>        ++CurPtr;
>    }
>  
> @@ -102,7 +103,7 @@
>      ++CurPtr;
>  
>      const char *FracStart = CurPtr;
> -    while (isxdigit(*CurPtr))
> +    while (isHexDigit(*CurPtr))
>        ++CurPtr;
>  
>      NoFracDigits = CurPtr == FracStart;
> @@ -123,7 +124,7 @@
>  
>    // N.b. exponent digits are *not* hex
>    const char *ExpStart = CurPtr;
> -  while (isdigit(*CurPtr))
> +  while (isDigit(*CurPtr))
>      ++CurPtr;
>  
>    if (CurPtr == ExpStart)
> @@ -135,15 +136,15 @@
>  
>  /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
>  static bool IsIdentifierChar(char c, bool AllowAt) {
> -  return isalnum(c) || c == '_' || c == '$' || c == '.' ||
> +  return isAlnum(c) || c == '_' || c == '$' || c == '.' ||
>           (c == '@' && AllowAt) || c == '?';
>  }
>  
>  AsmToken AsmLexer::LexIdentifier() {
>    // Check for floating point literals.
> -  if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
> +  if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
>      // Disambiguate a .1243foo identifier from a floating literal.
> -    while (isdigit(*CurPtr))
> +    while (isDigit(*CurPtr))
>        ++CurPtr;
>      if (*CurPtr == 'e' || *CurPtr == 'E' ||
>          !IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
> @@ -244,9 +245,9 @@
>    const char *FirstHex = nullptr;
>    const char *LookAhead = CurPtr;
>    while (true) {
> -    if (isdigit(*LookAhead)) {
> +    if (isDigit(*LookAhead)) {
>        ++LookAhead;
> -    } else if (isxdigit(*LookAhead)) {
> +    } else if (isHexDigit(*LookAhead)) {
>        if (!FirstHex)
>          FirstHex = LookAhead;
>        ++LookAhead;
> @@ -282,7 +283,7 @@
>      const char *FirstNonBinary = (CurPtr[-1] != '0' && CurPtr[-1] != '1') ?
>                                     CurPtr - 1 : nullptr;
>      const char *OldCurPtr = CurPtr;
> -    while (isxdigit(*CurPtr)) {
> +    while (isHexDigit(*CurPtr)) {
>        if (*CurPtr != '0' && *CurPtr != '1' && !FirstNonBinary)
>          FirstNonBinary = CurPtr;
>        ++CurPtr;
> @@ -346,7 +347,7 @@
>    if (!IsParsingMSInlineAsm && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
>      ++CurPtr;
>      // See if we actually have "0b" as part of something like "jmp 0b\n"
> -    if (!isdigit(CurPtr[0])) {
> +    if (!isDigit(CurPtr[0])) {
>        --CurPtr;
>        StringRef Result(TokStart, CurPtr - TokStart);
>        return AsmToken(AsmToken::Integer, Result, 0);
> @@ -375,7 +376,7 @@
>    if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
>      ++CurPtr;
>      const char *NumStart = CurPtr;
> -    while (isxdigit(CurPtr[0]))
> +    while (isHexDigit(CurPtr[0]))
>        ++CurPtr;
>  
>      // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
> Index: include/llvm/ADT/StringExtras.h
> ===================================================================
> --- include/llvm/ADT/StringExtras.h
> +++ include/llvm/ADT/StringExtras.h
> @@ -59,6 +59,21 @@
>    return -1U;
>  }
>  
> +/// Checks if character \p C is one of the 10 decimal digits.
> +static inline bool isDigit(char C) { return C >= '0' && C <= '9'; }
> +
> +/// Checks if character \p C is a hexadecimal numeric character.
> +static inline bool isHexDigit(char C) { return hexDigitValue(C) != -1U; }
> +
> +/// Checks if character \p C is a valid letter as classified by "C" locale.
> +static inline bool isAlpha(char C) {
> +  return ('a' <= C && C <= 'z') || ('A' <= C && C <= 'Z');
> +}
> +
> +/// Checks whether character \p C is either a decimal digit or an uppercase or
> +/// lowercase letter as classified by "C" locale.
> +static inline bool isAlnum(char C) { return isAlpha(C) || isDigit(C); }
> +
>  static inline std::string utohexstr(uint64_t X, bool LowerCase = false) {
>    char Buffer[17];
>    char *BufPtr = std::end(Buffer);