[llvm] [LLVM][IR] Switch from manual pointer incrementation to function in Lexer (PR #152103)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 5 01:29:06 PDT 2025
Albert =?utf-8?q?Havliček?= <ahavlicek at azul.com>,
Albert =?utf-8?q?Havliček?= <ahavlicek at azul.com>,
Albert =?utf-8?q?Havliček?= <ahavlicek at azul.com>
Message-ID: <llvm.org/llvm/llvm-project/pull/152103 at github.com>
In-Reply-To:
https://github.com/Bertik23 created https://github.com/llvm/llvm-project/pull/152103
The IR Lexer has a `getNextChar()` function implemented. But it is used only once, on other places the pointer pointing to the current position in the lexed string (`CurPtr`) is incremented manually, this can lead to some OOB errors (see #151556 and [Discourse thread](https://discourse.llvm.org/t/llvm-ir-lexer-not-using-getnextchar/87625/4)).
To mitigate the issue, this PR changes the `getNextChar()` to be more straight forward, changes all manual pointer increments to `getNextChar()` or adds checks, so that no OOB occurs.
Two helper functions were used `skipNChars` and `advancePositionTo`. Where `skipNChars(N)` calls `getNextChar()` N-times, while `advancePositionTo(Ptr)` is used insead of `CurPtr = Ptr`.
One helper function `isLabelTail` was moved into the Lexer class, to be able to check the bounds, and renamed to `getLabelTail` to better reflect it's function.
>From ecf591c76631957bea7f8d62191d4b99fe8fc4c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albert=20Havli=C4=8Dek?= <ahavlicek at azul.com>
Date: Tue, 29 Jul 2025 14:41:18 +0000
Subject: [PATCH 1/4] Update CurLineNum anc CurColNum in sync with movement in
text
---
llvm/include/llvm/AsmParser/LLLexer.h | 2 +
llvm/lib/AsmParser/LLLexer.cpp | 97 +++++++++++++++++----------
2 files changed, 63 insertions(+), 36 deletions(-)
diff --git a/llvm/include/llvm/AsmParser/LLLexer.h b/llvm/include/llvm/AsmParser/LLLexer.h
index 501a7aefccd7f..5f6e32a4bf5e1 100644
--- a/llvm/include/llvm/AsmParser/LLLexer.h
+++ b/llvm/include/llvm/AsmParser/LLLexer.h
@@ -94,6 +94,8 @@ namespace llvm {
lltok::Kind LexToken();
int getNextChar();
+ const char *skipNChars(unsigned N);
+ void advancePositionTo(const char *Ptr);
void SkipLineComment();
bool SkipCComment();
lltok::Kind ReadString(lltok::Kind kind);
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 520c6a00a9c07..7cefd4f6b4935 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -190,6 +190,23 @@ int LLLexer::getNextChar() {
}
}
+const char *LLLexer::skipNChars(unsigned N) {
+ while (N--)
+ getNextChar();
+ return CurPtr;
+}
+
+void LLLexer::advancePositionTo(const char *Ptr) {
+ while (CurPtr != Ptr) {
+ // FIXME: Assumes that if moving back, we stay in that line
+ if (CurPtr > Ptr) {
+ --CurPtr;
+ --CurColNum;
+ } else
+ getNextChar();
+ }
+}
+
lltok::Kind LLLexer::LexToken() {
while (true) {
TokStart = CurPtr;
@@ -216,12 +233,12 @@ lltok::Kind LLLexer::LexToken() {
case '"': return LexQuote();
case '.':
if (const char *Ptr = isLabelTail(CurPtr)) {
- CurPtr = Ptr;
+ advancePositionTo(Ptr);
StrVal.assign(TokStart, CurPtr-1);
return lltok::LabelStr;
}
if (CurPtr[0] == '.' && CurPtr[1] == '.') {
- CurPtr += 2;
+ skipNChars(2);
return lltok::dotdotdot;
}
return lltok::Error;
@@ -299,14 +316,14 @@ lltok::Kind LLLexer::LexAt() {
lltok::Kind LLLexer::LexDollar() {
if (const char *Ptr = isLabelTail(TokStart)) {
- CurPtr = Ptr;
+ advancePositionTo(Ptr);
StrVal.assign(TokStart, CurPtr - 1);
return lltok::LabelStr;
}
// Handle DollarStringConstant: $\"[^\"]*\"
if (CurPtr[0] == '"') {
- ++CurPtr;
+ getNextChar();
while (true) {
int CurChar = getNextChar();
@@ -358,11 +375,11 @@ bool LLLexer::ReadVarName() {
if (isalpha(static_cast<unsigned char>(CurPtr[0])) ||
CurPtr[0] == '-' || CurPtr[0] == '$' ||
CurPtr[0] == '.' || CurPtr[0] == '_') {
- ++CurPtr;
+ getNextChar();
while (isalnum(static_cast<unsigned char>(CurPtr[0])) ||
CurPtr[0] == '-' || CurPtr[0] == '$' ||
CurPtr[0] == '.' || CurPtr[0] == '_')
- ++CurPtr;
+ getNextChar();
StrVal.assign(NameStart, CurPtr);
return true;
@@ -376,7 +393,8 @@ lltok::Kind LLLexer::LexUIntID(lltok::Kind Token) {
if (!isdigit(static_cast<unsigned char>(CurPtr[0])))
return lltok::Error;
- for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
+ for (getNextChar(); isdigit(static_cast<unsigned char>(CurPtr[0]));
+ getNextChar())
/*empty*/;
uint64_t Val = atoull(TokStart + 1, CurPtr);
@@ -389,7 +407,7 @@ lltok::Kind LLLexer::LexUIntID(lltok::Kind Token) {
lltok::Kind LLLexer::LexVar(lltok::Kind Var, lltok::Kind VarID) {
// Handle StringConstant: \"[^\"]*\"
if (CurPtr[0] == '"') {
- ++CurPtr;
+ getNextChar();
while (true) {
int CurChar = getNextChar();
@@ -435,7 +453,7 @@ lltok::Kind LLLexer::LexQuote() {
return kind;
if (CurPtr[0] == ':') {
- ++CurPtr;
+ getNextChar();
if (StringRef(StrVal).contains(0)) {
LexError("NUL character is not allowed in names");
kind = lltok::Error;
@@ -455,11 +473,11 @@ lltok::Kind LLLexer::LexExclaim() {
if (isalpha(static_cast<unsigned char>(CurPtr[0])) ||
CurPtr[0] == '-' || CurPtr[0] == '$' ||
CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\') {
- ++CurPtr;
+ getNextChar();
while (isalnum(static_cast<unsigned char>(CurPtr[0])) ||
CurPtr[0] == '-' || CurPtr[0] == '$' ||
CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\')
- ++CurPtr;
+ getNextChar();
StrVal.assign(TokStart+1, CurPtr); // Skip !
UnEscapeLexed(StrVal);
@@ -495,7 +513,7 @@ lltok::Kind LLLexer::LexIdentifier() {
const char *IntEnd = CurPtr[-1] == 'i' ? nullptr : StartChar;
const char *KeywordEnd = nullptr;
- for (; isLabelChar(*CurPtr); ++CurPtr) {
+ for (; isLabelChar(*CurPtr); getNextChar()) {
// If we decide this is an integer, remember the end of the sequence.
if (!IntEnd && !isdigit(static_cast<unsigned char>(*CurPtr)))
IntEnd = CurPtr;
@@ -507,7 +525,8 @@ lltok::Kind LLLexer::LexIdentifier() {
// If we stopped due to a colon, unless we were directed to ignore it,
// this really is a label.
if (!IgnoreColonInIdentifiers && *CurPtr == ':') {
- StrVal.assign(StartChar-1, CurPtr++);
+ StrVal.assign(StartChar - 1, CurPtr);
+ getNextChar();
return lltok::LabelStr;
}
@@ -515,7 +534,7 @@ lltok::Kind LLLexer::LexIdentifier() {
// return it.
if (!IntEnd) IntEnd = CurPtr;
if (IntEnd != StartChar) {
- CurPtr = IntEnd;
+ advancePositionTo(IntEnd);
uint64_t NumBits = atoull(StartChar, CurPtr);
if (NumBits < IntegerType::MIN_INT_BITS ||
NumBits > IntegerType::MAX_INT_BITS) {
@@ -528,7 +547,7 @@ lltok::Kind LLLexer::LexIdentifier() {
// Otherwise, this was a letter sequence. See which keyword this is.
if (!KeywordEnd) KeywordEnd = CurPtr;
- CurPtr = KeywordEnd;
+ advancePositionTo(KeywordEnd);
--StartChar;
StringRef Keyword(StartChar, CurPtr - StartChar);
@@ -1042,7 +1061,7 @@ lltok::Kind LLLexer::LexIdentifier() {
StringRef HexStr(TokStart + 3, len);
if (!all_of(HexStr, isxdigit)) {
// Bad token, return it as an error.
- CurPtr = TokStart+3;
+ advancePositionTo(TokStart + 3);
return lltok::Error;
}
APInt Tmp(bits, HexStr, 16);
@@ -1055,12 +1074,12 @@ lltok::Kind LLLexer::LexIdentifier() {
// If this is "cc1234", return this as just "cc".
if (TokStart[0] == 'c' && TokStart[1] == 'c') {
- CurPtr = TokStart+2;
+ advancePositionTo(TokStart + 2);
return lltok::kw_cc;
}
// Finally, if this isn't known, return an error.
- CurPtr = TokStart+1;
+ advancePositionTo(TokStart + 1);
return lltok::Error;
}
@@ -1073,24 +1092,25 @@ lltok::Kind LLLexer::LexIdentifier() {
/// HexHalfConstant 0xH[0-9A-Fa-f]+
/// HexBFloatConstant 0xR[0-9A-Fa-f]+
lltok::Kind LLLexer::Lex0x() {
- CurPtr = TokStart + 2;
+ advancePositionTo(TokStart + 2);
char Kind;
if ((CurPtr[0] >= 'K' && CurPtr[0] <= 'M') || CurPtr[0] == 'H' ||
CurPtr[0] == 'R') {
- Kind = *CurPtr++;
+ Kind = *CurPtr;
+ getNextChar();
} else {
Kind = 'J';
}
if (!isxdigit(static_cast<unsigned char>(CurPtr[0]))) {
// Bad token, return it as an error.
- CurPtr = TokStart+1;
+ advancePositionTo(TokStart + 1);
return lltok::Error;
}
while (isxdigit(static_cast<unsigned char>(CurPtr[0])))
- ++CurPtr;
+ getNextChar();
if (Kind == 'J') {
// HexFPConstant - Floating point constant represented in IEEE format as a
@@ -1147,7 +1167,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
// Okay, this is not a number after the -, it's probably a label.
if (const char *End = isLabelTail(CurPtr)) {
StrVal.assign(TokStart, End-1);
- CurPtr = End;
+ advancePositionTo(End);
return lltok::LabelStr;
}
@@ -1157,13 +1177,13 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
// At this point, it is either a label, int or fp constant.
// Skip digits, we have at least one.
- for (; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
+ for (; isdigit(static_cast<unsigned char>(CurPtr[0])); getNextChar())
/*empty*/;
// Check if this is a fully-numeric label:
if (isdigit(TokStart[0]) && CurPtr[0] == ':') {
uint64_t Val = atoull(TokStart, CurPtr);
- ++CurPtr; // Skip the colon.
+ getNextChar(); // Skip the colon.
if ((unsigned)Val != Val)
LexError("invalid value number (too large)");
UIntVal = unsigned(Val);
@@ -1174,7 +1194,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
if (isLabelChar(CurPtr[0]) || CurPtr[0] == ':') {
if (const char *End = isLabelTail(CurPtr)) {
StrVal.assign(TokStart, End-1);
- CurPtr = End;
+ advancePositionTo(End);
return lltok::LabelStr;
}
}
@@ -1188,17 +1208,19 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
return lltok::APSInt;
}
- ++CurPtr;
+ getNextChar();
// Skip over [0-9]*([eE][-+]?[0-9]+)?
- while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+ while (isdigit(static_cast<unsigned char>(CurPtr[0])))
+ getNextChar();
if (CurPtr[0] == 'e' || CurPtr[0] == 'E') {
if (isdigit(static_cast<unsigned char>(CurPtr[1])) ||
((CurPtr[1] == '-' || CurPtr[1] == '+') &&
isdigit(static_cast<unsigned char>(CurPtr[2])))) {
- CurPtr += 2;
- while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+ skipNChars(2);
+ while (isdigit(static_cast<unsigned char>(CurPtr[0])))
+ getNextChar();
}
}
@@ -1216,26 +1238,29 @@ lltok::Kind LLLexer::LexPositive() {
return lltok::Error;
// Skip digits.
- for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
+ for (getNextChar(); isdigit(static_cast<unsigned char>(CurPtr[0]));
+ getNextChar())
/*empty*/;
// At this point, we need a '.'.
if (CurPtr[0] != '.') {
- CurPtr = TokStart+1;
+ advancePositionTo(TokStart + 1);
return lltok::Error;
}
- ++CurPtr;
+ getNextChar();
// Skip over [0-9]*([eE][-+]?[0-9]+)?
- while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+ while (isdigit(static_cast<unsigned char>(CurPtr[0])))
+ getNextChar();
if (CurPtr[0] == 'e' || CurPtr[0] == 'E') {
if (isdigit(static_cast<unsigned char>(CurPtr[1])) ||
((CurPtr[1] == '-' || CurPtr[1] == '+') &&
isdigit(static_cast<unsigned char>(CurPtr[2])))) {
- CurPtr += 2;
- while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+ skipNChars(2);
+ while (isdigit(static_cast<unsigned char>(CurPtr[0])))
+ getNextChar();
}
}
>From 06926e9fa28e10abbec7803c0cd8c196ea09daf5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albert=20Havli=C4=8Dek?= <ahavlicek at azul.com>
Date: Mon, 4 Aug 2025 09:36:51 +0000
Subject: [PATCH 2/4] Remove remains from cherry pick from LSP branch
---
llvm/lib/AsmParser/LLLexer.cpp | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 7cefd4f6b4935..db4079975ad40 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -198,10 +198,8 @@ const char *LLLexer::skipNChars(unsigned N) {
void LLLexer::advancePositionTo(const char *Ptr) {
while (CurPtr != Ptr) {
- // FIXME: Assumes that if moving back, we stay in that line
if (CurPtr > Ptr) {
--CurPtr;
- --CurColNum;
} else
getNextChar();
}
>From 1fdf13c326ba2f7889b27de5789c0339190fa0f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albert=20Havli=C4=8Dek?= <ahavlicek at azul.com>
Date: Mon, 4 Aug 2025 09:49:32 +0000
Subject: [PATCH 3/4] Make isLabelTail more safe and rename it to better show
what it does
---
llvm/include/llvm/AsmParser/LLLexer.h | 3 ++
llvm/lib/AsmParser/LLLexer.cpp | 43 +++++++++++----------------
2 files changed, 21 insertions(+), 25 deletions(-)
diff --git a/llvm/include/llvm/AsmParser/LLLexer.h b/llvm/include/llvm/AsmParser/LLLexer.h
index 5f6e32a4bf5e1..d0d6f72c197da 100644
--- a/llvm/include/llvm/AsmParser/LLLexer.h
+++ b/llvm/include/llvm/AsmParser/LLLexer.h
@@ -93,6 +93,9 @@ namespace llvm {
private:
lltok::Kind LexToken();
+ // Return closes pointer after `Ptr` that is an end of a label.
+ // Returns nullptr if `Ptr` doesn't point into a label.
+ const char *getLabelTail(const char *Ptr);
int getNextChar();
const char *skipNChars(unsigned N);
void advancePositionTo(const char *Ptr);
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index db4079975ad40..bbd6b690a97c0 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -155,15 +155,6 @@ static bool isLabelChar(char C) {
C == '.' || C == '_';
}
-/// isLabelTail - Return true if this pointer points to a valid end of a label.
-static const char *isLabelTail(const char *CurPtr) {
- while (true) {
- if (CurPtr[0] == ':') return CurPtr+1;
- if (!isLabelChar(CurPtr[0])) return nullptr;
- ++CurPtr;
- }
-}
-
//===----------------------------------------------------------------------===//
// Lexer definition.
//===----------------------------------------------------------------------===//
@@ -174,20 +165,22 @@ LLLexer::LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &Err,
CurPtr = CurBuf.begin();
}
+/// getLabelTail - Return true if this pointer points to a valid end of a label.
+const char *LLLexer::getLabelTail(const char *Ptr) {
+ while (Ptr != CurBuf.end()) {
+ if (Ptr[0] == ':')
+ return Ptr + 1;
+ if (!isLabelChar(Ptr[0]))
+ return nullptr;
+ ++Ptr;
+ }
+ return nullptr;
+}
+
int LLLexer::getNextChar() {
- char CurChar = *CurPtr++;
- switch (CurChar) {
- default: return (unsigned char)CurChar;
- case 0:
- // A nul character in the stream is either the end of the current buffer or
- // a random nul in the file. Disambiguate that here.
- if (CurPtr-1 != CurBuf.end())
- return 0; // Just whitespace.
-
- // Otherwise, return end of file.
- --CurPtr; // Another call to lex will return EOF again.
+ if (CurPtr == CurBuf.end())
return EOF;
- }
+ return *CurPtr++;
}
const char *LLLexer::skipNChars(unsigned N) {
@@ -230,7 +223,7 @@ lltok::Kind LLLexer::LexToken() {
case '%': return LexPercent();
case '"': return LexQuote();
case '.':
- if (const char *Ptr = isLabelTail(CurPtr)) {
+ if (const char *Ptr = getLabelTail(CurPtr)) {
advancePositionTo(Ptr);
StrVal.assign(TokStart, CurPtr-1);
return lltok::LabelStr;
@@ -313,7 +306,7 @@ lltok::Kind LLLexer::LexAt() {
}
lltok::Kind LLLexer::LexDollar() {
- if (const char *Ptr = isLabelTail(TokStart)) {
+ if (const char *Ptr = getLabelTail(TokStart)) {
advancePositionTo(Ptr);
StrVal.assign(TokStart, CurPtr - 1);
return lltok::LabelStr;
@@ -1163,7 +1156,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
if (!isdigit(static_cast<unsigned char>(TokStart[0])) &&
!isdigit(static_cast<unsigned char>(CurPtr[0]))) {
// Okay, this is not a number after the -, it's probably a label.
- if (const char *End = isLabelTail(CurPtr)) {
+ if (const char *End = getLabelTail(CurPtr)) {
StrVal.assign(TokStart, End-1);
advancePositionTo(End);
return lltok::LabelStr;
@@ -1190,7 +1183,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
// Check to see if this really is a string label, e.g. "-1:".
if (isLabelChar(CurPtr[0]) || CurPtr[0] == ':') {
- if (const char *End = isLabelTail(CurPtr)) {
+ if (const char *End = getLabelTail(CurPtr)) {
StrVal.assign(TokStart, End-1);
advancePositionTo(End);
return lltok::LabelStr;
>From 2772cd8f679124daed8bae737451dcf54b637694 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albert=20Havli=C4=8Dek?= <ahavlicek at azul.com>
Date: Mon, 4 Aug 2025 09:51:38 +0000
Subject: [PATCH 4/4] Remove dangling comment
---
llvm/lib/AsmParser/LLLexer.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index bbd6b690a97c0..578ac851e38c5 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -165,7 +165,6 @@ LLLexer::LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &Err,
CurPtr = CurBuf.begin();
}
-/// getLabelTail - Return true if this pointer points to a valid end of a label.
const char *LLLexer::getLabelTail(const char *Ptr) {
while (Ptr != CurBuf.end()) {
if (Ptr[0] == ':')
More information about the llvm-commits
mailing list