[llvm] [LLVM][IR] Fix a couple of StringRef OOB error in AsmParser (PR #151556)

Rahul Joshi via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 31 09:46:00 PDT 2025


https://github.com/jurahul created https://github.com/llvm/llvm-project/pull/151556

- Fix LLLexer::getNextChar() to not index OOB into the CurBuf.
- Add unit tests that exercise these cases using a heap allocated StringRef. They show asan failures.

>From b7f69b3f7d669f449c5fea65172ffd6aad3e8d85 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Thu, 31 Jul 2025 09:43:11 -0700
Subject: [PATCH] [LLVM][IR] Fix a couple of StringRef OOB error in AsmParser

- Fix LLLexer::getNextChar() to not index OOB into the CurBuf.
- Add unit tests that exercise these cases using a heap allocated
  StringRef. They show asan failures.
---
 llvm/include/llvm/AsmParser/LLLexer.h      |  1 +
 llvm/lib/AsmParser/LLLexer.cpp             | 44 ++++++++-------
 llvm/lib/AsmParser/LLParser.cpp            |  3 +-
 llvm/lib/AsmParser/Parser.cpp              |  3 +-
 llvm/unittests/AsmParser/AsmParserTest.cpp | 64 ++++++++++++++++++----
 5 files changed, 82 insertions(+), 33 deletions(-)

diff --git a/llvm/include/llvm/AsmParser/LLLexer.h b/llvm/include/llvm/AsmParser/LLLexer.h
index 501a7aefccd7f..fcefdb449f73e 100644
--- a/llvm/include/llvm/AsmParser/LLLexer.h
+++ b/llvm/include/llvm/AsmParser/LLLexer.h
@@ -94,6 +94,7 @@ namespace llvm {
     lltok::Kind LexToken();
 
     int getNextChar();
+    bool isLabelTail();
     void SkipLineComment();
     bool SkipCComment();
     lltok::Kind ReadString(lltok::Kind kind);
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 520c6a00a9c07..62d701a9f501a 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -150,9 +150,8 @@ static void UnEscapeLexed(std::string &Str) {
 }
 
 /// isLabelChar - Return true for [-a-zA-Z$._0-9].
-static bool isLabelChar(char C) {
-  return isalnum(static_cast<unsigned char>(C)) || C == '-' || C == '$' ||
-         C == '.' || C == '_';
+static bool isLabelChar(int C) {
+  return isalnum(C) || C == '-' || C == '$' || C == '.' || C == '_';
 }
 
 /// isLabelTail - Return true if this pointer points to a valid end of a label.
@@ -175,19 +174,24 @@ LLLexer::LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &Err,
 }
 
 int LLLexer::getNextChar() {
-  char CurChar = *CurPtr++;
-  switch (CurChar) {
-  default: return (unsigned char)CurChar;
-  case 0:
-    // A nul character in the stream is either the end of the current buffer or
-    // a random nul in the file.  Disambiguate that here.
-    if (CurPtr-1 != CurBuf.end())
-      return 0;  // Just whitespace.
-
-    // Otherwise, return end of file.
-    --CurPtr;  // Another call to lex will return EOF again.
+  if (CurPtr == CurBuf.end())
     return EOF;
+  return *CurPtr++;
+}
+
+/// isLabelTail - Return true if the buffer is valid end of a label. If not,
+/// it returns false and restores the current pointer to where it was.
+bool LLLexer::isLabelTail() {
+  const char *SavedPtr = CurPtr;
+  while (true) {
+    int CurChar = getNextChar();
+    if (CurChar == ':')
+      return true;
+    if (!isLabelChar(CurChar))
+      break;
   }
+  CurPtr = SavedPtr;
+  return false;
 }
 
 lltok::Kind LLLexer::LexToken() {
@@ -215,8 +219,7 @@ lltok::Kind LLLexer::LexToken() {
     case '%': return LexPercent();
     case '"': return LexQuote();
     case '.':
-      if (const char *Ptr = isLabelTail(CurPtr)) {
-        CurPtr = Ptr;
+      if (isLabelTail()) {
         StrVal.assign(TokStart, CurPtr-1);
         return lltok::LabelStr;
       }
@@ -262,7 +265,8 @@ lltok::Kind LLLexer::LexToken() {
 
 void LLLexer::SkipLineComment() {
   while (true) {
-    if (CurPtr[0] == '\n' || CurPtr[0] == '\r' || getNextChar() == EOF)
+    int CurChar = getNextChar();
+    if (CurChar == '\n' || CurChar == '\r' || CurChar == EOF)
       return;
   }
 }
@@ -298,7 +302,7 @@ lltok::Kind LLLexer::LexAt() {
 }
 
 lltok::Kind LLLexer::LexDollar() {
-  if (const char *Ptr = isLabelTail(TokStart)) {
+  if (const char *Ptr = ::isLabelTail(TokStart)) {
     CurPtr = Ptr;
     StrVal.assign(TokStart, CurPtr - 1);
     return lltok::LabelStr;
@@ -1145,7 +1149,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
   if (!isdigit(static_cast<unsigned char>(TokStart[0])) &&
       !isdigit(static_cast<unsigned char>(CurPtr[0]))) {
     // Okay, this is not a number after the -, it's probably a label.
-    if (const char *End = isLabelTail(CurPtr)) {
+    if (const char *End = ::isLabelTail(CurPtr)) {
       StrVal.assign(TokStart, End-1);
       CurPtr = End;
       return lltok::LabelStr;
@@ -1172,7 +1176,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
 
   // Check to see if this really is a string label, e.g. "-1:".
   if (isLabelChar(CurPtr[0]) || CurPtr[0] == ':') {
-    if (const char *End = isLabelTail(CurPtr)) {
+    if (const char *End = ::isLabelTail(CurPtr)) {
       StrVal.assign(TokStart, End-1);
       CurPtr = End;
       return lltok::LabelStr;
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 13bef1f62f1a9..1ad2cd61daaf6 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -6917,7 +6917,8 @@ bool LLParser::parseFunctionBody(Function &Fn, unsigned FunctionNumber,
                                  ArrayRef<unsigned> UnnamedArgNums) {
   if (Lex.getKind() != lltok::lbrace)
     return tokError("expected '{' in function body");
-  Lex.Lex();  // eat the {.
+  if (Lex.Lex() == lltok::Error) // eat the {.
+    return tokError("Invalid token");
 
   PerFunctionState PFS(*this, Fn, FunctionNumber, UnnamedArgNums);
 
diff --git a/llvm/lib/AsmParser/Parser.cpp b/llvm/lib/AsmParser/Parser.cpp
index 07fdce981b084..a7e1e61a35037 100644
--- a/llvm/lib/AsmParser/Parser.cpp
+++ b/llvm/lib/AsmParser/Parser.cpp
@@ -26,7 +26,8 @@ static bool parseAssemblyInto(MemoryBufferRef F, Module *M,
                               SlotMapping *Slots, bool UpgradeDebugInfo,
                               DataLayoutCallbackTy DataLayoutCallback) {
   SourceMgr SM;
-  std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(F);
+  std::unique_ptr<MemoryBuffer> Buf =
+      MemoryBuffer::getMemBuffer(F, /*RequiresNullTerminator*/ false);
   SM.AddNewSourceBuffer(std::move(Buf), SMLoc());
 
   std::optional<LLVMContext> OptContext;
diff --git a/llvm/unittests/AsmParser/AsmParserTest.cpp b/llvm/unittests/AsmParser/AsmParserTest.cpp
index ce226705068af..27a4f78e49be4 100644
--- a/llvm/unittests/AsmParser/AsmParserTest.cpp
+++ b/llvm/unittests/AsmParser/AsmParserTest.cpp
@@ -22,31 +22,67 @@ using namespace llvm;
 
 namespace {
 
+class HeapAllocatedStringRef {
+  std::unique_ptr<char[]> Data;
+  size_t Size;
+
+public:
+  HeapAllocatedStringRef(StringRef S) : Size(S.size()) {
+    Data = std::make_unique<char[]>(Size);
+    std::strncpy(Data.get(), S.data(), Size);
+  }
+  operator StringRef() const { return StringRef(Data.get(), Size); }
+};
+
 TEST(AsmParserTest, NullTerminatedInput) {
   LLVMContext Ctx;
-  StringRef Source = "; Empty module \n";
+  StringRef Source = "; Empty module\n";
   SMDiagnostic Error;
   auto Mod = parseAssemblyString(Source, Error, Ctx);
 
   EXPECT_TRUE(Mod != nullptr);
   EXPECT_TRUE(Error.getMessage().empty());
-}
 
-#ifdef GTEST_HAS_DEATH_TEST
-#ifndef NDEBUG
+  Mod = parseAssemblyString(HeapAllocatedStringRef(Source), Error, Ctx);
+
+  EXPECT_TRUE(Mod != nullptr);
+  EXPECT_TRUE(Error.getMessage().empty());
+}
 
 TEST(AsmParserTest, NonNullTerminatedInput) {
   LLVMContext Ctx;
-  StringRef Source = "; Empty module \n\1\2";
+  StringRef Source = "; Empty module\n\1\2";
+  SMDiagnostic Error;
+
+  auto Mod = parseAssemblyString(Source.drop_back(2), Error, Ctx);
+
+  EXPECT_TRUE(Mod != nullptr);
+  EXPECT_TRUE(Error.getMessage().empty());
+}
+
+TEST(AsmParserTest, CommentTerminatedWithEOF) {
+  LLVMContext Ctx;
+  StringRef Source = "; Empty module";
   SMDiagnostic Error;
-  std::unique_ptr<Module> Mod;
-  EXPECT_DEATH(Mod = parseAssemblyString(Source.substr(0, Source.size() - 2),
-                                         Error, Ctx),
-               "Buffer is not null terminated!");
+
+  auto Mod = parseAssemblyString(HeapAllocatedStringRef(Source), Error, Ctx);
+
+  EXPECT_TRUE(Mod != nullptr);
+  EXPECT_TRUE(Error.getMessage().empty());
 }
 
-#endif
-#endif
+TEST(AsmParserTest, LabelTailTerminatedWithEOF) {
+  LLVMContext Ctx;
+  StringRef Source = R"(
+  define void @main() {
+    .l12)";
+  SMDiagnostic Error;
+
+  auto Mod = parseAssemblyString(HeapAllocatedStringRef(Source), Error, Ctx);
+
+  EXPECT_EQ(Mod, nullptr);
+  EXPECT_EQ(Error.getMessage(), "Invalid token");
+}
 
 TEST(AsmParserTest, SlotMappingTest) {
   LLVMContext Ctx;
@@ -65,6 +101,12 @@ TEST(AsmParserTest, SlotMappingTest) {
   EXPECT_EQ(Mapping.MetadataNodes.count(0), 1u);
   EXPECT_EQ(Mapping.MetadataNodes.count(42), 1u);
   EXPECT_EQ(Mapping.MetadataNodes.count(1), 0u);
+
+  Mod =
+      parseAssemblyString(HeapAllocatedStringRef(Source), Error, Ctx, &Mapping);
+
+  EXPECT_TRUE(Mod != nullptr);
+  EXPECT_TRUE(Error.getMessage().empty());
 }
 
 TEST(AsmParserTest, TypeAndConstantValueParsing) {



More information about the llvm-commits mailing list