[llvm] [LLVM][AsmParser] Make error reporting of lexer errors more precise (PR #111077)

Mon Oct 7 05:52:17 PDT 2024

https://github.com/jurahul updated https://github.com/llvm/llvm-project/pull/111077

>From 6afdbea6b7a1bb7de0c94e7f614a87793fd1ab58 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Thu, 3 Oct 2024 16:28:16 -0700
Subject: [PATCH] [LLVM][AsmParser] Make error reporting of lexer errors more
 precise

When the lexer hits an error during assembly parsing, it just logs
the error in the `ErrorMsg` object, and its possible that error gets
overwritten later in by the parser with a more generic error message.
This makes some errors reported by the LLVM's asm parser less precise.

Address this by not having the parser overwrite the message logged
by the lexer.

Update several Assembler unit test to now check the more precise
error messages reported by the LLVM's AsmParser.
---
 llvm/include/llvm/AsmParser/LLLexer.h     |  34 +++++++++++++++--
 llvm/include/llvm/AsmParser/LLParser.h    |   6 +--
 llvm/lib/AsmParser/LLLexer.cpp            |  44 ++++++++++++++--------
 llvm/lib/AsmParser/LLParser.cpp           |  16 ++++----
 llvm/test/Assembler/allockind-missing.ll  |   4 +-
 llvm/test/Assembler/invalid-inttype.ll    |   4 +-
 llvm/test/Assembler/invalid-landingpad.ll |   4 +-
 llvm/test/Assembler/invalid-name.ll       | Bin 142 -> 207 bytes
 llvm/test/Assembler/invalid-name2.ll      | Bin 120 -> 185 bytes
 9 files changed, 75 insertions(+), 37 deletions(-)

diff --git a/llvm/include/llvm/AsmParser/LLLexer.h b/llvm/include/llvm/AsmParser/LLLexer.h
index bd929db33c4a2b..a9f51fb925f5d5 100644
--- a/llvm/include/llvm/AsmParser/LLLexer.h
+++ b/llvm/include/llvm/AsmParser/LLLexer.h
@@ -28,7 +28,20 @@ namespace llvm {
   class LLLexer {
     const char *CurPtr;
     StringRef CurBuf;
-    SMDiagnostic &ErrorInfo;
+
+    enum class ErrorPriority {
+      None,   // No error message present.
+      Parser, // Errors issued by parser.
+      Lexer,  // Errors issued by lexer.
+    };
+
+    struct ErrorInfo {
+      ErrorPriority Priority = ErrorPriority::None;
+      SMDiagnostic &Error;
+
+      explicit ErrorInfo(SMDiagnostic &Error) : Error(Error) {}
+    } ErrorInfo;
+
     SourceMgr &SM;
     LLVMContext &Context;
 
@@ -66,8 +79,13 @@ namespace llvm {
       IgnoreColonInIdentifiers = val;
     }
 
-    bool Error(LocTy ErrorLoc, const Twine &Msg) const;
-    bool Error(const Twine &Msg) const { return Error(getLoc(), Msg); }
+    // This returns true as a convenience for the parser functions that return
+    // true on error.
+    bool ParseError(LocTy ErrorLoc, const Twine &Msg) {
+      Error(ErrorLoc, Msg, ErrorPriority::Parser);
+      return true;
+    }
+    bool ParseError(const Twine &Msg) { return ParseError(getLoc(), Msg); }
 
     void Warning(LocTy WarningLoc, const Twine &Msg) const;
     void Warning(const Twine &Msg) const { return Warning(getLoc(), Msg); }
@@ -97,7 +115,15 @@ namespace llvm {
     uint64_t atoull(const char *Buffer, const char *End);
     uint64_t HexIntToVal(const char *Buffer, const char *End);
     void HexToIntPair(const char *Buffer, const char *End, uint64_t Pair[2]);
-    void FP80HexToIntPair(const char *Buffer, const char *End, uint64_t Pair[2]);
+    void FP80HexToIntPair(const char *Buffer, const char *End,
+                          uint64_t Pair[2]);
+
+    void Error(LocTy ErrorLoc, const Twine &Msg, ErrorPriority Origin);
+
+    void LexError(LocTy ErrorLoc, const Twine &Msg) {
+      Error(ErrorLoc, Msg, ErrorPriority::Lexer);
+    }
+    void LexError(const Twine &Msg) { LexError(getLoc(), Msg); }
   };
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h
index 9576b935198dd4..1ef8b8ffc39660 100644
--- a/llvm/include/llvm/AsmParser/LLParser.h
+++ b/llvm/include/llvm/AsmParser/LLParser.h
@@ -207,11 +207,11 @@ namespace llvm {
     LLVMContext &getContext() { return Context; }
 
   private:
-    bool error(LocTy L, const Twine &Msg) const { return Lex.Error(L, Msg); }
-    bool tokError(const Twine &Msg) const { return error(Lex.getLoc(), Msg); }
+    bool error(LocTy L, const Twine &Msg) { return Lex.ParseError(L, Msg); }
+    bool tokError(const Twine &Msg) { return error(Lex.getLoc(), Msg); }
 
     bool checkValueID(LocTy L, StringRef Kind, StringRef Prefix,
-                      unsigned NextID, unsigned ID) const;
+                      unsigned NextID, unsigned ID);
 
     /// Restore the internal name and slot mappings using the mappings that
     /// were created at an earlier parsing stage.
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index a3e47da77fe776..f71cbe1b4b1e88 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -25,9 +25,21 @@
 
 using namespace llvm;
 
-bool LLLexer::Error(LocTy ErrorLoc, const Twine &Msg) const {
-  ErrorInfo = SM.GetMessage(ErrorLoc, SourceMgr::DK_Error, Msg);
-  return true;
+// Both the lexer and parser can issue error messages. If the lexer issues a
+// lexer error, since we do not terminate execution immediately, usually that
+// is followed by the parser issuing a parser error. However, the error issued
+// by the lexer is more relevant in that case as opposed to potentially more
+// generic parser error. So instead of always recording the last error message
+// use the `Priority` to establish a priority, with Lexer > Parser > None. We
+// record the issued message only if the message has same or higher priority
+// than the existing one. This prevents lexer errors from being overwritten by
+// parser errors.
+void LLLexer::Error(LocTy ErrorLoc, const Twine &Msg,
+                    LLLexer::ErrorPriority Priority) {
+  if (Priority < ErrorInfo.Priority)
+    return;
+  ErrorInfo.Error = SM.GetMessage(ErrorLoc, SourceMgr::DK_Error, Msg);
+  ErrorInfo.Priority = Priority;
 }
 
 void LLLexer::Warning(LocTy WarningLoc, const Twine &Msg) const {
@@ -49,7 +61,7 @@ uint64_t LLLexer::atoull(const char *Buffer, const char *End) {
     Result *= 10;
     Result += *Buffer-'0';
     if (Result < OldRes) {  // Uh, oh, overflow detected!!!
-      Error("constant bigger than 64 bits detected!");
+      LexError("constant bigger than 64 bits detected!");
       return 0;
     }
   }
@@ -64,7 +76,7 @@ uint64_t LLLexer::HexIntToVal(const char *Buffer, const char *End) {
     Result += hexDigitValue(*Buffer);
 
     if (Result < OldRes) {   // Uh, oh, overflow detected!!!
-      Error("constant bigger than 64 bits detected!");
+      LexError("constant bigger than 64 bits detected!");
       return 0;
     }
   }
@@ -87,7 +99,7 @@ void LLLexer::HexToIntPair(const char *Buffer, const char *End,
     Pair[1] += hexDigitValue(*Buffer);
   }
   if (Buffer != End)
-    Error("constant bigger than 128 bits detected!");
+    LexError("constant bigger than 128 bits detected!");
 }
 
 /// FP80HexToIntPair - translate an 80 bit FP80 number (20 hexits) into
@@ -106,7 +118,7 @@ void LLLexer::FP80HexToIntPair(const char *Buffer, const char *End,
     Pair[0] += hexDigitValue(*Buffer);
   }
   if (Buffer != End)
-    Error("constant bigger than 128 bits detected!");
+    LexError("constant bigger than 128 bits detected!");
 }
 
 // UnEscapeLexed - Run through the specified buffer and change \xx codes to the
@@ -273,14 +285,14 @@ lltok::Kind LLLexer::LexDollar() {
       int CurChar = getNextChar();
 
       if (CurChar == EOF) {
-        Error("end of file in COMDAT variable name");
+        LexError("end of file in COMDAT variable name");
         return lltok::Error;
       }
       if (CurChar == '"') {
         StrVal.assign(TokStart + 2, CurPtr - 1);
         UnEscapeLexed(StrVal);
         if (StringRef(StrVal).contains(0)) {
-          Error("Null bytes are not allowed in names");
+          LexError("Null bytes are not allowed in names");
           return lltok::Error;
         }
         return lltok::ComdatVar;
@@ -302,7 +314,7 @@ lltok::Kind LLLexer::ReadString(lltok::Kind kind) {
     int CurChar = getNextChar();
 
     if (CurChar == EOF) {
-      Error("end of file in string constant");
+      LexError("end of file in string constant");
       return lltok::Error;
     }
     if (CurChar == '"') {
@@ -342,7 +354,7 @@ lltok::Kind LLLexer::LexUIntID(lltok::Kind Token) {
 
   uint64_t Val = atoull(TokStart + 1, CurPtr);
   if ((unsigned)Val != Val)
-    Error("invalid value number (too large)!");
+    LexError("invalid value number (too large)!");
   UIntVal = unsigned(Val);
   return Token;
 }
@@ -356,14 +368,14 @@ lltok::Kind LLLexer::LexVar(lltok::Kind Var, lltok::Kind VarID) {
       int CurChar = getNextChar();
 
       if (CurChar == EOF) {
-        Error("end of file in global variable name");
+        LexError("end of file in global variable name");
         return lltok::Error;
       }
       if (CurChar == '"') {
         StrVal.assign(TokStart+2, CurPtr-1);
         UnEscapeLexed(StrVal);
         if (StringRef(StrVal).contains(0)) {
-          Error("Null bytes are not allowed in names");
+          LexError("Null bytes are not allowed in names");
           return lltok::Error;
         }
         return Var;
@@ -398,7 +410,7 @@ lltok::Kind LLLexer::LexQuote() {
   if (CurPtr[0] == ':') {
     ++CurPtr;
     if (StringRef(StrVal).contains(0)) {
-      Error("Null bytes are not allowed in names");
+      LexError("Null bytes are not allowed in names");
       kind = lltok::Error;
     } else {
       kind = lltok::LabelStr;
@@ -480,7 +492,7 @@ lltok::Kind LLLexer::LexIdentifier() {
     uint64_t NumBits = atoull(StartChar, CurPtr);
     if (NumBits < IntegerType::MIN_INT_BITS ||
         NumBits > IntegerType::MAX_INT_BITS) {
-      Error("bitwidth for integer type out of range!");
+      LexError("bitwidth for integer type out of range!");
       return lltok::Error;
     }
     TyVal = IntegerType::get(Context, NumBits);
@@ -1109,7 +1121,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
     uint64_t Val = atoull(TokStart, CurPtr);
     ++CurPtr; // Skip the colon.
     if ((unsigned)Val != Val)
-      Error("invalid value number (too large)!");
+      LexError("invalid value number (too large)!");
     UIntVal = unsigned(Val);
     return lltok::LabelID;
   }
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index d84521d2e6e10d..6450d8b109063a 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -3220,7 +3220,7 @@ bool LLParser::parseOptionalOperandBundles(
 }
 
 bool LLParser::checkValueID(LocTy Loc, StringRef Kind, StringRef Prefix,
-                            unsigned NextID, unsigned ID) const {
+                            unsigned NextID, unsigned ID) {
   if (ID < NextID)
     return error(Loc, Kind + " expected to be numbered '" + Prefix +
                           Twine(NextID) + "' or greater");
@@ -5192,7 +5192,7 @@ bool LLParser::parseDILocation(MDNode *&Result, bool IsDistinct) {
 ///   ::= distinct !DIAssignID()
 bool LLParser::parseDIAssignID(MDNode *&Result, bool IsDistinct) {
   if (!IsDistinct)
-    return Lex.Error("missing 'distinct', required for !DIAssignID()");
+    return tokError("missing 'distinct', required for !DIAssignID()");
 
   Lex.Lex();
 
@@ -5500,7 +5500,7 @@ bool LLParser::parseDIFile(MDNode *&Result, bool IsDistinct) {
   if (checksumkind.Seen && checksum.Seen)
     OptChecksum.emplace(checksumkind.Val, checksum.Val);
   else if (checksumkind.Seen || checksum.Seen)
-    return Lex.Error("'checksumkind' and 'checksum' must be provided together");
+    return tokError("'checksumkind' and 'checksum' must be provided together");
 
   MDString *Source = nullptr;
   if (source.Seen)
@@ -5519,7 +5519,7 @@ bool LLParser::parseDIFile(MDNode *&Result, bool IsDistinct) {
 ///                      sysroot: "/", sdk: "MacOSX.sdk")
 bool LLParser::parseDICompileUnit(MDNode *&Result, bool IsDistinct) {
   if (!IsDistinct)
-    return Lex.Error("missing 'distinct', required for !DICompileUnit");
+    return tokError("missing 'distinct', required for !DICompileUnit");
 
 #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
   REQUIRED(language, DwarfLangField, );                                        \
@@ -5599,7 +5599,7 @@ bool LLParser::parseDISubprogram(MDNode *&Result, bool IsDistinct) {
                    : DISubprogram::toSPFlags(isLocal.Val, isDefinition.Val,
                                              isOptimized.Val, virtuality.Val);
   if ((SPFlags & DISubprogram::SPFlagDefinition) && !IsDistinct)
-    return Lex.Error(
+    return error(
         Loc,
         "missing 'distinct', required for !DISubprogram that is a Definition");
   Result = GET_OR_DISTINCT(
@@ -7952,10 +7952,10 @@ bool LLParser::parseLandingPad(Instruction *&Inst, PerFunctionState &PFS) {
     // array constant.
     if (CT == LandingPadInst::Catch) {
       if (isa<ArrayType>(V->getType()))
-        error(VLoc, "'catch' clause has an invalid type");
+        return error(VLoc, "'catch' clause has an invalid type");
     } else {
       if (!isa<ArrayType>(V->getType()))
-        error(VLoc, "'filter' clause has an invalid type");
+        return error(VLoc, "'filter' clause has an invalid type");
     }
 
     Constant *CV = dyn_cast<Constant>(V);
@@ -8639,7 +8639,7 @@ bool LLParser::parseUseListOrderIndexes(SmallVectorImpl<unsigned> &Indexes) {
   if (parseToken(lltok::lbrace, "expected '{' here"))
     return true;
   if (Lex.getKind() == lltok::rbrace)
-    return Lex.Error("expected non-empty list of uselistorder indexes");
+    return tokError("expected non-empty list of uselistorder indexes");
 
   // Use Offset, Max, and IsOrdered to check consistency of indexes.  The
   // indexes should be distinct numbers in the range [0, size-1], and should
diff --git a/llvm/test/Assembler/allockind-missing.ll b/llvm/test/Assembler/allockind-missing.ll
index e8672fe9db032a..4fb3f658a91190 100644
--- a/llvm/test/Assembler/allockind-missing.ll
+++ b/llvm/test/Assembler/allockind-missing.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: not llvm-as --disable-output %s  2>&1 | FileCheck -DFILE=%s %s
 
+; CHECK: [[FILE]]:[[@LINE+1]]:30: error: expected allockind value
 declare void @f0() allockind()
-; CHECK: :[[#@LINE-1]]:30: error: expected allockind value
diff --git a/llvm/test/Assembler/invalid-inttype.ll b/llvm/test/Assembler/invalid-inttype.ll
index df3175540fe829..c8aa7c66b79e4d 100644
--- a/llvm/test/Assembler/invalid-inttype.ll
+++ b/llvm/test/Assembler/invalid-inttype.ll
@@ -1,5 +1,5 @@
-; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+; RUN: not llvm-as --disable-output %s 2>&1 | FileCheck -DFILE=%s %s
 
 ; i8388609 is the smallest integer type that can't be represented in LLVM IR
+; CHECK: [[FILE]]:[[@LINE+1]]:21: error: bitwidth for integer type out of range!
 @i2 = common global i8388609 0, align 4
-; CHECK: expected type
diff --git a/llvm/test/Assembler/invalid-landingpad.ll b/llvm/test/Assembler/invalid-landingpad.ll
index 306e94312dd552..805d3dbaa4d206 100644
--- a/llvm/test/Assembler/invalid-landingpad.ll
+++ b/llvm/test/Assembler/invalid-landingpad.ll
@@ -1,7 +1,7 @@
-; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+; RUN: not llvm-as --disable-output %s 2>&1 | FileCheck -DFILE=%s %s
 
-; CHECK: clause argument must be a constant
 
 define void @test(i32 %in) personality ptr null {
+; CHECK: [[FILE]]:[[@LINE+1]]:24: error: 'filter' clause has an invalid type
   landingpad {} filter i32 %in
 }
diff --git a/llvm/test/Assembler/invalid-name.ll b/llvm/test/Assembler/invalid-name.ll
index 0681ea528bf43017d803f26ccd9f8c0ad3b0523b..74133e60df54d595c68c50aea282cd90899f334f 100644
GIT binary patch
literal 207
zcmW-b%L>9U6hvL;D~5uiNWdz%NKp#4q9|Pmu1XiFy<#vnlB7OR|J^FgV&EK at iLl#e
z0=4nb`gY;cf%8f^S!&{@@z=(q>oA`8LcFn5nvxS8&ftsXdYjH#)P}K&j;bcyjH(bH
z(I4Q&A|$Ic*$CwM&n*g(=ka!(rNgl869KZ;Sb?n38s*cIgS7O&BDFSms-SAr@<NUQ
b-Jla-21gGM6;3gNH&Byti!Ay7GW)PEXMR1O

literal 142
zcmW;E!3u&v6h`6J_bGk^qZUCLw9txRD1x9x&;!KG6))M0I5!chcTd_L_)dxI?NY!8
zMbq at p#XVw!G$P+kGkkEAhE2`(F*YK{m3T%U#9?1yHpiQQG?^sWBJ!Y+Y>g!}Kvp at G
gFsFk#7vP~~tLz{?YWK1#!6URbI39h+`d`wQeoNdb&;S4c

diff --git a/llvm/test/Assembler/invalid-name2.ll b/llvm/test/Assembler/invalid-name2.ll
index 384dee6777d805466166edf9db309eb75c3de521..8a848798a54cafef9c3c151b08f70955e59fa62f 100644
GIT binary patch
literal 185
zcmW-bu?oU47=)ek6o;TFlA(?fMW`(nO9#OzIHdL$gE5gLt%&;WR^f(&@8G)y_J>Nq
zx(H*s8&?6GH#(@=5O;07w-KX&`D&ctja(Z_FLb(sZ}OtdmObjjSVL!R37yjo)@Mwn
zc(Dc2ZI<o?r2K7<M<M09EUIjl^gR(E at 0}N at T4PW@BLyf=|0~KE_n-#a!m1kuHXIBE
L13bmQgMHW+VOTf-

literal 120
zcmW;DK?=e!5Cu at zbBa$v(S-|LrXoUU5d;^42asfngE3Q at u?Xtj6}Jywjnnz)kkydV
zcwb%$HkcNyx3d-AXeF_~a=87BC~_y6F{(uDI?NY%Q8lR?)$seq9~}7Eb1;sOTx at s*
R6V1>*Xk-EgwtuD%eE~+FBr^a2