[PATCH] Fix crash in getStringSplit.
Alexander Kornienko
alexfh at google.com
Tue Nov 26 02:15:09 PST 2013
Hi djasper, klimek,
getStringSplit used to crash when trying to split a long string
literal that contains both printable and unprintable multi-byte UTF-8 characters.
http://llvm-reviews.chandlerc.com/D2268
Files:
lib/Format/BreakableToken.cpp
lib/Format/Encoding.h
unittests/Format/FormatTest.cpp
Index: lib/Format/BreakableToken.cpp
===================================================================
--- lib/Format/BreakableToken.cpp
+++ lib/Format/BreakableToken.cpp
@@ -92,9 +92,7 @@
return BreakableToken::Split(StringRef::npos, 0);
if (ColumnLimit <= UsedColumns)
return BreakableToken::Split(StringRef::npos, 0);
- unsigned MaxSplit = std::min<unsigned>(
- ColumnLimit - UsedColumns,
- encoding::columnWidthWithTabs(Text, UsedColumns, TabWidth, Encoding) - 1);
+ unsigned MaxSplit = ColumnLimit - UsedColumns;
StringRef::size_type SpaceOffset = 0;
StringRef::size_type SlashOffset = 0;
StringRef::size_type WordStartOffset = 0;
@@ -110,7 +108,7 @@
Text.substr(0, Advance), UsedColumns + Chars, TabWidth, Encoding);
}
- if (Chars > MaxSplit)
+ if (Chars > MaxSplit || Text.size() == Advance)
break;
if (IsBlank(Text[0]))
Index: lib/Format/Encoding.h
===================================================================
--- lib/Format/Encoding.h
+++ lib/Format/Encoding.h
@@ -64,6 +64,10 @@
inline unsigned columnWidth(StringRef Text, Encoding Encoding) {
if (Encoding == Encoding_UTF8) {
int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text);
+ // FIXME: Figure out the correct way to handle this in the presence of both
+ // printable and unprintable multi-byte UTF-8 characters. Falling back to
+ // returning the number of bytes may cause problems, as columnWidth suddenly
+ // becomes non-additive.
if (ContentWidth >= 0)
return ContentWidth;
}
@@ -81,9 +85,7 @@
StringRef::size_type TabPos = Tail.find('\t');
if (TabPos == StringRef::npos)
return TotalWidth + columnWidth(Tail, Encoding);
- int Width = columnWidth(Tail.substr(0, TabPos), Encoding);
- assert(Width >= 0);
- TotalWidth += Width;
+ TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding);
TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth;
Tail = Tail.substr(TabPos + 1);
}
Index: unittests/Format/FormatTest.cpp
===================================================================
--- unittests/Format/FormatTest.cpp
+++ unittests/Format/FormatTest.cpp
@@ -6991,6 +6991,16 @@
}
TEST_F(FormatTest, SplitsUTF8Strings) {
+ // Non-printable characters' width is currently considered to be the length in
+ // bytes in UTF8. The characters can be displayed in very different ways
+ // (zero-width, single width with a substitution glyph, expanded to their code
+ // (e.g. "<8d>")), so there's no single correct way to handle them.
+ EXPECT_EQ("\"aaaaÄ\"\n"
+ "\"\";",
+ format("\"aaaaÄ\";", getLLVMStyleWithColumns(10)));
+ EXPECT_EQ("\"aaaaaaaÄ\"\n"
+ "\"\";",
+ format("\"aaaaaaaÄ\";", getLLVMStyleWithColumns(10)));
EXPECT_EQ(
"\"Однажды, в \"\n"
"\"студёную \"\n"
@@ -7024,6 +7034,8 @@
}
TEST_F(FormatTest, SplitsUTF8LineComments) {
+ EXPECT_EQ("// aaaaÄ",
+ format("// aaaaÄ", getLLVMStyleWithColumns(10)));
EXPECT_EQ("// Я из лесу\n"
"// вышел; был\n"
"// сильный\n"
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D2268.1.patch
Type: text/x-patch
Size: 3192 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20131126/cfc39da5/attachment.bin>
More information about the cfe-commits
mailing list