[llvm] [LLVM][IR] Switch from manual pointer incrementation to function in Lexer (PR #152103)

Tue Aug 5 01:29:06 PDT 2025

Albert =?utf-8?q?Havliček?= <ahavlicek at azul.com>,
Albert =?utf-8?q?Havliček?= <ahavlicek at azul.com>,
Albert =?utf-8?q?Havliček?= <ahavlicek at azul.com>
Message-ID: <llvm.org/llvm/llvm-project/pull/152103 at github.com>
In-Reply-To:


https://github.com/Bertik23 created https://github.com/llvm/llvm-project/pull/152103

The IR Lexer has a `getNextChar()` function implemented. But it is used only once, on other places the pointer pointing to the current position in the lexed string (`CurPtr`) is incremented manually, this can lead to some OOB errors (see #151556 and [Discourse thread](https://discourse.llvm.org/t/llvm-ir-lexer-not-using-getnextchar/87625/4)).

To mitigate the issue, this PR changes the `getNextChar()` to be more straight forward, changes all manual pointer increments to `getNextChar()` or adds checks, so that no OOB occurs.

Two helper functions were used `skipNChars` and `advancePositionTo`. Where `skipNChars(N)` calls `getNextChar()` N-times, while `advancePositionTo(Ptr)` is used insead of `CurPtr = Ptr`.

One helper function `isLabelTail` was moved into the Lexer class, to be able to check the bounds, and renamed to `getLabelTail` to better reflect it's function.

>From ecf591c76631957bea7f8d62191d4b99fe8fc4c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albert=20Havli=C4=8Dek?= <ahavlicek at azul.com>
Date: Tue, 29 Jul 2025 14:41:18 +0000
Subject: [PATCH 1/4] Update CurLineNum anc CurColNum in sync with movement in
 text

---
 llvm/include/llvm/AsmParser/LLLexer.h |  2 +
 llvm/lib/AsmParser/LLLexer.cpp        | 97 +++++++++++++++++----------
 2 files changed, 63 insertions(+), 36 deletions(-)

diff --git a/llvm/include/llvm/AsmParser/LLLexer.h b/llvm/include/llvm/AsmParser/LLLexer.h
index 501a7aefccd7f..5f6e32a4bf5e1 100644
--- a/llvm/include/llvm/AsmParser/LLLexer.h
+++ b/llvm/include/llvm/AsmParser/LLLexer.h
@@ -94,6 +94,8 @@ namespace llvm {
     lltok::Kind LexToken();
 
     int getNextChar();
+    const char *skipNChars(unsigned N);
+    void advancePositionTo(const char *Ptr);
     void SkipLineComment();
     bool SkipCComment();
     lltok::Kind ReadString(lltok::Kind kind);
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 520c6a00a9c07..7cefd4f6b4935 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -190,6 +190,23 @@ int LLLexer::getNextChar() {
   }
 }
 
+const char *LLLexer::skipNChars(unsigned N) {
+  while (N--)
+    getNextChar();
+  return CurPtr;
+}
+
+void LLLexer::advancePositionTo(const char *Ptr) {
+  while (CurPtr != Ptr) {
+    // FIXME: Assumes that if moving back, we stay in that line
+    if (CurPtr > Ptr) {
+      --CurPtr;
+      --CurColNum;
+    } else
+      getNextChar();
+  }
+}
+
 lltok::Kind LLLexer::LexToken() {
   while (true) {
     TokStart = CurPtr;
@@ -216,12 +233,12 @@ lltok::Kind LLLexer::LexToken() {
     case '"': return LexQuote();
     case '.':
       if (const char *Ptr = isLabelTail(CurPtr)) {
-        CurPtr = Ptr;
+        advancePositionTo(Ptr);
         StrVal.assign(TokStart, CurPtr-1);
         return lltok::LabelStr;
       }
       if (CurPtr[0] == '.' && CurPtr[1] == '.') {
-        CurPtr += 2;
+        skipNChars(2);
         return lltok::dotdotdot;
       }
       return lltok::Error;
@@ -299,14 +316,14 @@ lltok::Kind LLLexer::LexAt() {
 
 lltok::Kind LLLexer::LexDollar() {
   if (const char *Ptr = isLabelTail(TokStart)) {
-    CurPtr = Ptr;
+    advancePositionTo(Ptr);
     StrVal.assign(TokStart, CurPtr - 1);
     return lltok::LabelStr;
   }
 
   // Handle DollarStringConstant: $\"[^\"]*\"
   if (CurPtr[0] == '"') {
-    ++CurPtr;
+    getNextChar();
 
     while (true) {
       int CurChar = getNextChar();
@@ -358,11 +375,11 @@ bool LLLexer::ReadVarName() {
   if (isalpha(static_cast<unsigned char>(CurPtr[0])) ||
       CurPtr[0] == '-' || CurPtr[0] == '$' ||
       CurPtr[0] == '.' || CurPtr[0] == '_') {
-    ++CurPtr;
+    getNextChar();
     while (isalnum(static_cast<unsigned char>(CurPtr[0])) ||
            CurPtr[0] == '-' || CurPtr[0] == '$' ||
            CurPtr[0] == '.' || CurPtr[0] == '_')
-      ++CurPtr;
+      getNextChar();
 
     StrVal.assign(NameStart, CurPtr);
     return true;
@@ -376,7 +393,8 @@ lltok::Kind LLLexer::LexUIntID(lltok::Kind Token) {
   if (!isdigit(static_cast<unsigned char>(CurPtr[0])))
     return lltok::Error;
 
-  for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
+  for (getNextChar(); isdigit(static_cast<unsigned char>(CurPtr[0]));
+       getNextChar())
     /*empty*/;
 
   uint64_t Val = atoull(TokStart + 1, CurPtr);
@@ -389,7 +407,7 @@ lltok::Kind LLLexer::LexUIntID(lltok::Kind Token) {
 lltok::Kind LLLexer::LexVar(lltok::Kind Var, lltok::Kind VarID) {
   // Handle StringConstant: \"[^\"]*\"
   if (CurPtr[0] == '"') {
-    ++CurPtr;
+    getNextChar();
 
     while (true) {
       int CurChar = getNextChar();
@@ -435,7 +453,7 @@ lltok::Kind LLLexer::LexQuote() {
     return kind;
 
   if (CurPtr[0] == ':') {
-    ++CurPtr;
+    getNextChar();
     if (StringRef(StrVal).contains(0)) {
       LexError("NUL character is not allowed in names");
       kind = lltok::Error;
@@ -455,11 +473,11 @@ lltok::Kind LLLexer::LexExclaim() {
   if (isalpha(static_cast<unsigned char>(CurPtr[0])) ||
       CurPtr[0] == '-' || CurPtr[0] == '$' ||
       CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\') {
-    ++CurPtr;
+    getNextChar();
     while (isalnum(static_cast<unsigned char>(CurPtr[0])) ||
            CurPtr[0] == '-' || CurPtr[0] == '$' ||
            CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\')
-      ++CurPtr;
+      getNextChar();
 
     StrVal.assign(TokStart+1, CurPtr);   // Skip !
     UnEscapeLexed(StrVal);
@@ -495,7 +513,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   const char *IntEnd = CurPtr[-1] == 'i' ? nullptr : StartChar;
   const char *KeywordEnd = nullptr;
 
-  for (; isLabelChar(*CurPtr); ++CurPtr) {
+  for (; isLabelChar(*CurPtr); getNextChar()) {
     // If we decide this is an integer, remember the end of the sequence.
     if (!IntEnd && !isdigit(static_cast<unsigned char>(*CurPtr)))
       IntEnd = CurPtr;
@@ -507,7 +525,8 @@ lltok::Kind LLLexer::LexIdentifier() {
   // If we stopped due to a colon, unless we were directed to ignore it,
   // this really is a label.
   if (!IgnoreColonInIdentifiers && *CurPtr == ':') {
-    StrVal.assign(StartChar-1, CurPtr++);
+    StrVal.assign(StartChar - 1, CurPtr);
+    getNextChar();
     return lltok::LabelStr;
   }
 
@@ -515,7 +534,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   // return it.
   if (!IntEnd) IntEnd = CurPtr;
   if (IntEnd != StartChar) {
-    CurPtr = IntEnd;
+    advancePositionTo(IntEnd);
     uint64_t NumBits = atoull(StartChar, CurPtr);
     if (NumBits < IntegerType::MIN_INT_BITS ||
         NumBits > IntegerType::MAX_INT_BITS) {
@@ -528,7 +547,7 @@ lltok::Kind LLLexer::LexIdentifier() {
 
   // Otherwise, this was a letter sequence.  See which keyword this is.
   if (!KeywordEnd) KeywordEnd = CurPtr;
-  CurPtr = KeywordEnd;
+  advancePositionTo(KeywordEnd);
   --StartChar;
   StringRef Keyword(StartChar, CurPtr - StartChar);
 
@@ -1042,7 +1061,7 @@ lltok::Kind LLLexer::LexIdentifier() {
     StringRef HexStr(TokStart + 3, len);
     if (!all_of(HexStr, isxdigit)) {
       // Bad token, return it as an error.
-      CurPtr = TokStart+3;
+      advancePositionTo(TokStart + 3);
       return lltok::Error;
     }
     APInt Tmp(bits, HexStr, 16);
@@ -1055,12 +1074,12 @@ lltok::Kind LLLexer::LexIdentifier() {
 
   // If this is "cc1234", return this as just "cc".
   if (TokStart[0] == 'c' && TokStart[1] == 'c') {
-    CurPtr = TokStart+2;
+    advancePositionTo(TokStart + 2);
     return lltok::kw_cc;
   }
 
   // Finally, if this isn't known, return an error.
-  CurPtr = TokStart+1;
+  advancePositionTo(TokStart + 1);
   return lltok::Error;
 }
 
@@ -1073,24 +1092,25 @@ lltok::Kind LLLexer::LexIdentifier() {
 ///    HexHalfConstant   0xH[0-9A-Fa-f]+
 ///    HexBFloatConstant 0xR[0-9A-Fa-f]+
 lltok::Kind LLLexer::Lex0x() {
-  CurPtr = TokStart + 2;
+  advancePositionTo(TokStart + 2);
 
   char Kind;
   if ((CurPtr[0] >= 'K' && CurPtr[0] <= 'M') || CurPtr[0] == 'H' ||
       CurPtr[0] == 'R') {
-    Kind = *CurPtr++;
+    Kind = *CurPtr;
+    getNextChar();
   } else {
     Kind = 'J';
   }
 
   if (!isxdigit(static_cast<unsigned char>(CurPtr[0]))) {
     // Bad token, return it as an error.
-    CurPtr = TokStart+1;
+    advancePositionTo(TokStart + 1);
     return lltok::Error;
   }
 
   while (isxdigit(static_cast<unsigned char>(CurPtr[0])))
-    ++CurPtr;
+    getNextChar();
 
   if (Kind == 'J') {
     // HexFPConstant - Floating point constant represented in IEEE format as a
@@ -1147,7 +1167,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
     // Okay, this is not a number after the -, it's probably a label.
     if (const char *End = isLabelTail(CurPtr)) {
       StrVal.assign(TokStart, End-1);
-      CurPtr = End;
+      advancePositionTo(End);
       return lltok::LabelStr;
     }
 
@@ -1157,13 +1177,13 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
   // At this point, it is either a label, int or fp constant.
 
   // Skip digits, we have at least one.
-  for (; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
+  for (; isdigit(static_cast<unsigned char>(CurPtr[0])); getNextChar())
     /*empty*/;
 
   // Check if this is a fully-numeric label:
   if (isdigit(TokStart[0]) && CurPtr[0] == ':') {
     uint64_t Val = atoull(TokStart, CurPtr);
-    ++CurPtr; // Skip the colon.
+    getNextChar(); // Skip the colon.
     if ((unsigned)Val != Val)
       LexError("invalid value number (too large)");
     UIntVal = unsigned(Val);
@@ -1174,7 +1194,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
   if (isLabelChar(CurPtr[0]) || CurPtr[0] == ':') {
     if (const char *End = isLabelTail(CurPtr)) {
       StrVal.assign(TokStart, End-1);
-      CurPtr = End;
+      advancePositionTo(End);
       return lltok::LabelStr;
     }
   }
@@ -1188,17 +1208,19 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
     return lltok::APSInt;
   }
 
-  ++CurPtr;
+  getNextChar();
 
   // Skip over [0-9]*([eE][-+]?[0-9]+)?
-  while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+  while (isdigit(static_cast<unsigned char>(CurPtr[0])))
+    getNextChar();
 
   if (CurPtr[0] == 'e' || CurPtr[0] == 'E') {
     if (isdigit(static_cast<unsigned char>(CurPtr[1])) ||
         ((CurPtr[1] == '-' || CurPtr[1] == '+') &&
           isdigit(static_cast<unsigned char>(CurPtr[2])))) {
-      CurPtr += 2;
-      while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+      skipNChars(2);
+      while (isdigit(static_cast<unsigned char>(CurPtr[0])))
+        getNextChar();
     }
   }
 
@@ -1216,26 +1238,29 @@ lltok::Kind LLLexer::LexPositive() {
     return lltok::Error;
 
   // Skip digits.
-  for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
+  for (getNextChar(); isdigit(static_cast<unsigned char>(CurPtr[0]));
+       getNextChar())
     /*empty*/;
 
   // At this point, we need a '.'.
   if (CurPtr[0] != '.') {
-    CurPtr = TokStart+1;
+    advancePositionTo(TokStart + 1);
     return lltok::Error;
   }
 
-  ++CurPtr;
+  getNextChar();
 
   // Skip over [0-9]*([eE][-+]?[0-9]+)?
-  while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+  while (isdigit(static_cast<unsigned char>(CurPtr[0])))
+    getNextChar();
 
   if (CurPtr[0] == 'e' || CurPtr[0] == 'E') {
     if (isdigit(static_cast<unsigned char>(CurPtr[1])) ||
         ((CurPtr[1] == '-' || CurPtr[1] == '+') &&
         isdigit(static_cast<unsigned char>(CurPtr[2])))) {
-      CurPtr += 2;
-      while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+      skipNChars(2);
+      while (isdigit(static_cast<unsigned char>(CurPtr[0])))
+        getNextChar();
     }
   }
 

>From 06926e9fa28e10abbec7803c0cd8c196ea09daf5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albert=20Havli=C4=8Dek?= <ahavlicek at azul.com>
Date: Mon, 4 Aug 2025 09:36:51 +0000
Subject: [PATCH 2/4] Remove remains from cherry pick from LSP branch

---
 llvm/lib/AsmParser/LLLexer.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 7cefd4f6b4935..db4079975ad40 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -198,10 +198,8 @@ const char *LLLexer::skipNChars(unsigned N) {
 
 void LLLexer::advancePositionTo(const char *Ptr) {
   while (CurPtr != Ptr) {
-    // FIXME: Assumes that if moving back, we stay in that line
     if (CurPtr > Ptr) {
       --CurPtr;
-      --CurColNum;
     } else
       getNextChar();
   }

>From 1fdf13c326ba2f7889b27de5789c0339190fa0f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albert=20Havli=C4=8Dek?= <ahavlicek at azul.com>
Date: Mon, 4 Aug 2025 09:49:32 +0000
Subject: [PATCH 3/4] Make isLabelTail more safe and rename it to better show
 what it does

---
 llvm/include/llvm/AsmParser/LLLexer.h |  3 ++
 llvm/lib/AsmParser/LLLexer.cpp        | 43 +++++++++++----------------
 2 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/llvm/include/llvm/AsmParser/LLLexer.h b/llvm/include/llvm/AsmParser/LLLexer.h
index 5f6e32a4bf5e1..d0d6f72c197da 100644
--- a/llvm/include/llvm/AsmParser/LLLexer.h
+++ b/llvm/include/llvm/AsmParser/LLLexer.h
@@ -93,6 +93,9 @@ namespace llvm {
   private:
     lltok::Kind LexToken();
 
+    // Return closes pointer after `Ptr` that is an end of a label.
+    // Returns nullptr if `Ptr` doesn't point into a label.
+    const char *getLabelTail(const char *Ptr);
     int getNextChar();
     const char *skipNChars(unsigned N);
     void advancePositionTo(const char *Ptr);
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index db4079975ad40..bbd6b690a97c0 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -155,15 +155,6 @@ static bool isLabelChar(char C) {
          C == '.' || C == '_';
 }
 
-/// isLabelTail - Return true if this pointer points to a valid end of a label.
-static const char *isLabelTail(const char *CurPtr) {
-  while (true) {
-    if (CurPtr[0] == ':') return CurPtr+1;
-    if (!isLabelChar(CurPtr[0])) return nullptr;
-    ++CurPtr;
-  }
-}
-
 //===----------------------------------------------------------------------===//
 // Lexer definition.
 //===----------------------------------------------------------------------===//
@@ -174,20 +165,22 @@ LLLexer::LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &Err,
   CurPtr = CurBuf.begin();
 }
 
+/// getLabelTail - Return true if this pointer points to a valid end of a label.
+const char *LLLexer::getLabelTail(const char *Ptr) {
+  while (Ptr != CurBuf.end()) {
+    if (Ptr[0] == ':')
+      return Ptr + 1;
+    if (!isLabelChar(Ptr[0]))
+      return nullptr;
+    ++Ptr;
+  }
+  return nullptr;
+}
+
 int LLLexer::getNextChar() {
-  char CurChar = *CurPtr++;
-  switch (CurChar) {
-  default: return (unsigned char)CurChar;
-  case 0:
-    // A nul character in the stream is either the end of the current buffer or
-    // a random nul in the file.  Disambiguate that here.
-    if (CurPtr-1 != CurBuf.end())
-      return 0;  // Just whitespace.
-
-    // Otherwise, return end of file.
-    --CurPtr;  // Another call to lex will return EOF again.
+  if (CurPtr == CurBuf.end())
     return EOF;
-  }
+  return *CurPtr++;
 }
 
 const char *LLLexer::skipNChars(unsigned N) {
@@ -230,7 +223,7 @@ lltok::Kind LLLexer::LexToken() {
     case '%': return LexPercent();
     case '"': return LexQuote();
     case '.':
-      if (const char *Ptr = isLabelTail(CurPtr)) {
+      if (const char *Ptr = getLabelTail(CurPtr)) {
         advancePositionTo(Ptr);
         StrVal.assign(TokStart, CurPtr-1);
         return lltok::LabelStr;
@@ -313,7 +306,7 @@ lltok::Kind LLLexer::LexAt() {
 }
 
 lltok::Kind LLLexer::LexDollar() {
-  if (const char *Ptr = isLabelTail(TokStart)) {
+  if (const char *Ptr = getLabelTail(TokStart)) {
     advancePositionTo(Ptr);
     StrVal.assign(TokStart, CurPtr - 1);
     return lltok::LabelStr;
@@ -1163,7 +1156,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
   if (!isdigit(static_cast<unsigned char>(TokStart[0])) &&
       !isdigit(static_cast<unsigned char>(CurPtr[0]))) {
     // Okay, this is not a number after the -, it's probably a label.
-    if (const char *End = isLabelTail(CurPtr)) {
+    if (const char *End = getLabelTail(CurPtr)) {
       StrVal.assign(TokStart, End-1);
       advancePositionTo(End);
       return lltok::LabelStr;
@@ -1190,7 +1183,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
 
   // Check to see if this really is a string label, e.g. "-1:".
   if (isLabelChar(CurPtr[0]) || CurPtr[0] == ':') {
-    if (const char *End = isLabelTail(CurPtr)) {
+    if (const char *End = getLabelTail(CurPtr)) {
       StrVal.assign(TokStart, End-1);
       advancePositionTo(End);
       return lltok::LabelStr;

>From 2772cd8f679124daed8bae737451dcf54b637694 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albert=20Havli=C4=8Dek?= <ahavlicek at azul.com>
Date: Mon, 4 Aug 2025 09:51:38 +0000
Subject: [PATCH 4/4] Remove dangling comment

---
 llvm/lib/AsmParser/LLLexer.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index bbd6b690a97c0..578ac851e38c5 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -165,7 +165,6 @@ LLLexer::LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &Err,
   CurPtr = CurBuf.begin();
 }
 
-/// getLabelTail - Return true if this pointer points to a valid end of a label.
 const char *LLLexer::getLabelTail(const char *Ptr) {
   while (Ptr != CurBuf.end()) {
     if (Ptr[0] == ':')