[llvm] [SourceMgr] Clean up handling of line ending characters (PR #120605)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 19 09:05:14 PST 2024
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/120605
Change the offset cache to point to the start of each line, instead of
each terminating LF character, and consistently handle CR, LF and CRLF
(or LFCR) line ending sequences.
Previously some parts of the code were handling both CR and LF but
others were not. Also functions GetMessage, getLineAndColumn and
FindLocForLineAndColumn had to scan a whole line to find the column
number, or the extent of the line. These functions are now faster and
simpler.
>From aeb0249ebc15f1c5e315989433de953f874db939 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 17 Dec 2024 16:40:23 +0000
Subject: [PATCH] [SourceMgr] Clean up handling of line ending characters
Change the offset cache to point to the start of each line, instead of
each terminating LF character, and consistently handle CR, LF and CRLF
(or LFCR) line ending sequences.
Previously some parts of the code were handling both CR and LF but
others were not. Also functions GetMessage, getLineAndColumn and
FindLocForLineAndColumn had to scan a whole line to find the column
number, or the extent of the line. These functions are now faster and
simpler.
---
llvm/lib/Support/SourceMgr.cpp | 94 +++++++++++-------------
llvm/unittests/Support/SourceMgrTest.cpp | 41 +++++++++++
2 files changed, 85 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Support/SourceMgr.cpp b/llvm/lib/Support/SourceMgr.cpp
index 3f97213d86c055..43e18233661519 100644
--- a/llvm/lib/Support/SourceMgr.cpp
+++ b/llvm/lib/Support/SourceMgr.cpp
@@ -81,8 +81,8 @@ unsigned SourceMgr::FindBufferContainingLoc(SMLoc Loc) const {
}
template <typename T>
-static std::vector<T> &GetOrCreateOffsetCache(void *&OffsetCache,
- MemoryBuffer *Buffer) {
+static ArrayRef<T> GetOrCreateOffsetCache(void *&OffsetCache,
+ MemoryBuffer *Buffer) {
if (OffsetCache)
return *static_cast<std::vector<T> *>(OffsetCache);
@@ -91,10 +91,24 @@ static std::vector<T> &GetOrCreateOffsetCache(void *&OffsetCache,
size_t Sz = Buffer->getBufferSize();
assert(Sz <= std::numeric_limits<T>::max());
StringRef S = Buffer->getBuffer();
- for (size_t N = 0; N < Sz; ++N) {
- if (S[N] == '\n')
- Offsets->push_back(static_cast<T>(N));
+
+ // The cache always includes 0 (for the start of the first line) and Sz (so
+ // that you can always index by N+1 to find the end of line N, even if the
+ // last line has no terminating newline).
+ Offsets->push_back(0);
+ for (size_t N = 0; N != Sz;) {
+ while (N != Sz && S[N] != '\n' && S[N] != '\r')
+ ++N;
+ if (N == Sz)
+ break;
+
+ // Skip over CR, LF, CRLF or LFCR.
+ ++N;
+ if (N != Sz && (S[N - 1] ^ S[N]) == ('\r' ^ '\n'))
+ ++N;
+ Offsets->push_back(static_cast<T>(N));
}
+ Offsets->push_back(static_cast<T>(Sz));
OffsetCache = Offsets;
return *Offsets;
@@ -102,8 +116,7 @@ static std::vector<T> &GetOrCreateOffsetCache(void *&OffsetCache,
template <typename T>
unsigned SourceMgr::SrcBuffer::getLineNumberSpecialized(const char *Ptr) const {
- std::vector<T> &Offsets =
- GetOrCreateOffsetCache<T>(OffsetCache, Buffer.get());
+ ArrayRef<T> Offsets = GetOrCreateOffsetCache<T>(OffsetCache, Buffer.get());
const char *BufStart = Buffer->getBufferStart();
assert(Ptr >= BufStart && Ptr <= Buffer->getBufferEnd());
@@ -112,9 +125,9 @@ unsigned SourceMgr::SrcBuffer::getLineNumberSpecialized(const char *Ptr) const {
static_cast<size_t>(PtrDiff) <= std::numeric_limits<T>::max());
T PtrOffset = static_cast<T>(PtrDiff);
- // llvm::lower_bound gives the number of EOL before PtrOffset. Add 1 to get
- // the line number.
- return llvm::lower_bound(Offsets, PtrOffset) - Offsets.begin() + 1;
+ // upper_bound gives the number of start-of-lines before or equal to
+ // PtrOffset.
+ return upper_bound(Offsets.drop_back(), PtrOffset) - Offsets.begin();
}
/// Look up a given \p Ptr in the buffer, determining which line it came
@@ -134,8 +147,7 @@ unsigned SourceMgr::SrcBuffer::getLineNumber(const char *Ptr) const {
template <typename T>
const char *SourceMgr::SrcBuffer::getPointerForLineNumberSpecialized(
unsigned LineNo) const {
- std::vector<T> &Offsets =
- GetOrCreateOffsetCache<T>(OffsetCache, Buffer.get());
+ ArrayRef<T> Offsets = GetOrCreateOffsetCache<T>(OffsetCache, Buffer.get());
// We start counting line and column numbers from 1.
if (LineNo != 0)
@@ -143,13 +155,11 @@ const char *SourceMgr::SrcBuffer::getPointerForLineNumberSpecialized(
const char *BufStart = Buffer->getBufferStart();
- // The offset cache contains the location of the \n for the specified line,
- // we want the start of the line. As such, we look for the previous entry.
- if (LineNo == 0)
- return BufStart;
- if (LineNo > Offsets.size())
+ // This allows looking up one greater than the maximum line number to get the
+ // offset to the end of the buffer.
+ if (LineNo >= Offsets.size())
return nullptr;
- return BufStart + Offsets[LineNo - 1] + 1;
+ return BufStart + Offsets[LineNo];
}
/// Return a pointer to the first character of the specified line number or
@@ -198,11 +208,8 @@ SourceMgr::getLineAndColumn(SMLoc Loc, unsigned BufferID) const {
const char *Ptr = Loc.getPointer();
unsigned LineNo = SB.getLineNumber(Ptr);
- const char *BufStart = SB.Buffer->getBufferStart();
- size_t NewlineOffs = StringRef(BufStart, Ptr - BufStart).find_last_of("\n\r");
- if (NewlineOffs == StringRef::npos)
- NewlineOffs = ~(size_t)0;
- return std::make_pair(LineNo, Ptr - BufStart - NewlineOffs);
+ const char *LineStart = SB.getPointerForLineNumber(LineNo);
+ return std::make_pair(LineNo, Ptr - LineStart + 1);
}
// FIXME: Note that the formatting of source locations is spread between
@@ -241,19 +248,12 @@ SMLoc SourceMgr::FindLocForLineAndColumn(unsigned BufferID, unsigned LineNo,
// We start counting line and column numbers from 1.
if (ColNo != 0)
--ColNo;
+ Ptr += ColNo;
- // If we have a column number, validate it.
- if (ColNo) {
- // Make sure the location is within the current line.
- if (Ptr + ColNo > SB.Buffer->getBufferEnd())
- return SMLoc();
-
- // Make sure there is no newline in the way.
- if (StringRef(Ptr, ColNo).find_first_of("\n\r") != StringRef::npos)
- return SMLoc();
-
- Ptr += ColNo;
- }
+ // Make sure the location is still within the current line.
+ if (Ptr >= SB.getPointerForLineNumber(LineNo + 1) &&
+ Ptr != SB.Buffer->getBufferEnd())
+ return SMLoc();
return SMLoc::getFromPointer(Ptr);
}
@@ -285,21 +285,15 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
unsigned CurBuf = FindBufferContainingLoc(Loc);
assert(CurBuf && "Invalid or unspecified location!");
- const MemoryBuffer *CurMB = getMemoryBuffer(CurBuf);
- BufferID = CurMB->getBufferIdentifier();
-
- // Scan backward to find the start of the line.
- const char *LineStart = Loc.getPointer();
- const char *BufStart = CurMB->getBufferStart();
- while (LineStart != BufStart && LineStart[-1] != '\n' &&
- LineStart[-1] != '\r')
- --LineStart;
-
- // Get the end of the line.
- const char *LineEnd = Loc.getPointer();
- const char *BufEnd = CurMB->getBufferEnd();
- while (LineEnd != BufEnd && LineEnd[0] != '\n' && LineEnd[0] != '\r')
- ++LineEnd;
+ const auto &SB = getBufferInfo(CurBuf);
+ BufferID = SB.Buffer->getBufferIdentifier();
+
+ // Find the start and end of the line.
+ unsigned LineNo = SB.getLineNumber(Loc.getPointer());
+ const char *LineStart = SB.getPointerForLineNumber(LineNo);
+ const char *LineEnd = SB.getPointerForLineNumber(LineNo + 1);
+ while (LineEnd > LineStart && (LineEnd[-1] == '\n' || LineEnd[-1] == '\r'))
+ --LineEnd;
LineStr = StringRef(LineStart, LineEnd - LineStart);
// Convert any ranges to column ranges that only intersect the line of the
diff --git a/llvm/unittests/Support/SourceMgrTest.cpp b/llvm/unittests/Support/SourceMgrTest.cpp
index 301b64f36a49af..d9ae1771db09aa 100644
--- a/llvm/unittests/Support/SourceMgrTest.cpp
+++ b/llvm/unittests/Support/SourceMgrTest.cpp
@@ -39,6 +39,13 @@ class SourceMgrTest : public testing::Test {
void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
const Twine &Msg, ArrayRef<SMRange> Ranges,
ArrayRef<SMFixIt> FixIts) {
+ // Check that we can round-trip from SMLoc to buffer ID, line and column
+ // number and back.
+ unsigned BufferID = SM.FindBufferContainingLoc(Loc);
+ auto [LineNo, ColNo] = SM.getLineAndColumn(Loc, BufferID);
+ EXPECT_EQ(Loc, SM.FindLocForLineAndColumn(BufferID, LineNo, ColNo));
+
+ Output.clear();
raw_string_ostream OS(Output);
SM.PrintMessage(OS, Loc, Kind, Msg, Ranges, FixIts);
}
@@ -196,6 +203,40 @@ TEST_F(SourceMgrTest, LocationOnEOLOfSecondSecondLineOfMultiline) {
Output);
}
+TEST_F(SourceMgrTest, MixedLineEndings) {
+ setMainBuffer("CR\rLF\nCRLF\r\nLFCR\n\rEND", "file.in");
+
+ printMessage(getLoc(0), SourceMgr::DK_Error, "message", {}, {});
+ EXPECT_EQ("file.in:1:1: error: message\n"
+ "CR\n"
+ "^\n",
+ Output);
+
+ printMessage(getLoc(3), SourceMgr::DK_Error, "message", {}, {});
+ EXPECT_EQ("file.in:2:1: error: message\n"
+ "LF\n"
+ "^\n",
+ Output);
+
+ printMessage(getLoc(6), SourceMgr::DK_Error, "message", {}, {});
+ EXPECT_EQ("file.in:3:1: error: message\n"
+ "CRLF\n"
+ "^\n",
+ Output);
+
+ printMessage(getLoc(12), SourceMgr::DK_Error, "message", {}, {});
+ EXPECT_EQ("file.in:4:1: error: message\n"
+ "LFCR\n"
+ "^\n",
+ Output);
+
+ printMessage(getLoc(18), SourceMgr::DK_Error, "message", {}, {});
+ EXPECT_EQ("file.in:5:1: error: message\n"
+ "END\n"
+ "^\n",
+ Output);
+}
+
#define STRING_LITERAL_253_BYTES \
"1234567890\n1234567890\n" \
"1234567890\n1234567890\n" \
More information about the llvm-commits
mailing list