r195728 - Fix crash in getStringSplit.

Alexander Kornienko alexfh at google.com
Tue Nov 26 02:38:53 PST 2013


Author: alexfh
Date: Tue Nov 26 04:38:53 2013
New Revision: 195728

URL: http://llvm.org/viewvc/llvm-project?rev=195728&view=rev
Log:
Fix crash in getStringSplit.

Summary:
getStringSplit used to crash, when trying to split a long string
literal containing both printable and unprintable multi-byte UTF-8 characters.

Reviewers: djasper, klimek

Reviewed By: djasper

CC: cfe-commits, klimek

Differential Revision: http://llvm-reviews.chandlerc.com/D2268

Modified:
    cfe/trunk/lib/Format/BreakableToken.cpp
    cfe/trunk/lib/Format/Encoding.h
    cfe/trunk/unittests/Format/FormatTest.cpp

Modified: cfe/trunk/lib/Format/BreakableToken.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Format/BreakableToken.cpp?rev=195728&r1=195727&r2=195728&view=diff
==============================================================================
--- cfe/trunk/lib/Format/BreakableToken.cpp (original)
+++ cfe/trunk/lib/Format/BreakableToken.cpp Tue Nov 26 04:38:53 2013
@@ -92,9 +92,7 @@ static BreakableToken::Split getStringSp
     return BreakableToken::Split(StringRef::npos, 0);
   if (ColumnLimit <= UsedColumns)
     return BreakableToken::Split(StringRef::npos, 0);
-  unsigned MaxSplit = std::min<unsigned>(
-      ColumnLimit - UsedColumns,
-      encoding::columnWidthWithTabs(Text, UsedColumns, TabWidth, Encoding) - 1);
+  unsigned MaxSplit = ColumnLimit - UsedColumns;
   StringRef::size_type SpaceOffset = 0;
   StringRef::size_type SlashOffset = 0;
   StringRef::size_type WordStartOffset = 0;
@@ -110,7 +108,7 @@ static BreakableToken::Split getStringSp
           Text.substr(0, Advance), UsedColumns + Chars, TabWidth, Encoding);
     }
 
-    if (Chars > MaxSplit)
+    if (Chars > MaxSplit || Text.size() == Advance)
       break;
 
     if (IsBlank(Text[0]))

Modified: cfe/trunk/lib/Format/Encoding.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Format/Encoding.h?rev=195728&r1=195727&r2=195728&view=diff
==============================================================================
--- cfe/trunk/lib/Format/Encoding.h (original)
+++ cfe/trunk/lib/Format/Encoding.h Tue Nov 26 04:38:53 2013
@@ -64,6 +64,10 @@ inline unsigned getCodePointCount(String
 inline unsigned columnWidth(StringRef Text, Encoding Encoding) {
   if (Encoding == Encoding_UTF8) {
     int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text);
+    // FIXME: Figure out the correct way to handle this in the presence of both
+    // printable and unprintable multi-byte UTF-8 characters. Falling back to
+    // returning the number of bytes may cause problems, as columnWidth suddenly
+    // becomes non-additive.
     if (ContentWidth >= 0)
       return ContentWidth;
   }
@@ -81,9 +85,7 @@ inline unsigned columnWidthWithTabs(Stri
     StringRef::size_type TabPos = Tail.find('\t');
     if (TabPos == StringRef::npos)
       return TotalWidth + columnWidth(Tail, Encoding);
-    int Width = columnWidth(Tail.substr(0, TabPos), Encoding);
-    assert(Width >= 0);
-    TotalWidth += Width;
+    TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding);
     TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth;
     Tail = Tail.substr(TabPos + 1);
   }

Modified: cfe/trunk/unittests/Format/FormatTest.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/unittests/Format/FormatTest.cpp?rev=195728&r1=195727&r2=195728&view=diff
==============================================================================
--- cfe/trunk/unittests/Format/FormatTest.cpp (original)
+++ cfe/trunk/unittests/Format/FormatTest.cpp Tue Nov 26 04:38:53 2013
@@ -6991,6 +6991,16 @@ TEST_F(FormatTest, CountsUTF8CharactersP
 }
 
 TEST_F(FormatTest, SplitsUTF8Strings) {
+  // Non-printable characters' width is currently considered to be the length in
+  // bytes in UTF8. The characters can be displayed in very different manner
+  // (zero-width, single width with a substitution glyph, expanded to their code
+  // (e.g. "<8d>"), so there's no single correct way to handle them.
+  EXPECT_EQ("\"aaaaÄ\"\n"
+            "\"\";",
+            format("\"aaaač\";", getLLVMStyleWithColumns(10)));
+  EXPECT_EQ("\"aaaaaaaÄ\"\n"
+            "\"\";",
+            format("\"aaaaaaač\";", getLLVMStyleWithColumns(10)));
   EXPECT_EQ(
       "\"Однажды, в \"\n"
       "\"студёную \"\n"
@@ -7024,6 +7034,8 @@ TEST_F(FormatTest, HandlesDoubleWidthCha
 }
 
 TEST_F(FormatTest, SplitsUTF8LineComments) {
+  EXPECT_EQ("// aaaač",
+            format("// aaaač", getLLVMStyleWithColumns(10)));
   EXPECT_EQ("// Я из лесу\n"
             "// вышел; был\n"
             "// сильный\n"





More information about the cfe-commits mailing list