[clang] e80b81d - [Support] Fix formatted_raw_ostream for UTF-8

Mon Jul 6 08:29:01 PDT 2020

Author: Oliver Stannard
Date: 2020-07-06T16:18:15+01:00
New Revision: e80b81d1cbf85dcd427759369978afdb48f0998f

URL: https://github.com/llvm/llvm-project/commit/e80b81d1cbf85dcd427759369978afdb48f0998f
DIFF: https://github.com/llvm/llvm-project/commit/e80b81d1cbf85dcd427759369978afdb48f0998f.diff

LOG: [Support] Fix formatted_raw_ostream for UTF-8

* The getLine and getColumn functions need to update the position, or
  they will return stale data for buffered streams. This fixes a bug in
  the clang -analyzer-checker-option-help option, which was not wrapping
  the help text correctly when stdout is not a TTY.
* If the stream contains multi-byte UTF-8 sequences, then the whole
  sequence needs to be considered to be a single character. This has the
  edge case that the buffer might fill up and be flushed part way
  through a character.
* If the stream contains East Asian wide characters, these will be
  rendered twice as wide as other characters, so we need to increase the
  column count to match.

This doesn't attempt to handle everything unicode can do (combining
characters, right-to-left markers, ...), but hopefully covers most
things likely to be common in messages and source code we might want to
print.

Differential revision: https://reviews.llvm.org/D76291

Added: 
    

Modified: 
    clang/test/Analysis/checker-plugins.c
    llvm/include/llvm/Support/FormattedStream.h
    llvm/lib/Support/FormattedStream.cpp
    llvm/test/MC/ARM/lsl-zero.s
    llvm/unittests/Support/formatted_raw_ostream_test.cpp

Removed: 
    


################################################################################
diff  --git a/clang/test/Analysis/checker-plugins.c b/clang/test/Analysis/checker-plugins.c
index fbc9c9bd1c22..69fab8fa6eed 100644

--- a/clang/test/Analysis/checker-plugins.c
+++ b/clang/test/Analysis/checker-plugins.c
@@ -116,4 +116,5 @@ void caller() {
 // RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-CHECKER-OPTION-HELP
 
 // CHECK-CHECKER-OPTION-HELP: example.MyChecker:ExampleOption  (bool) This is an
-// CHECK-CHECKER-OPTION-HELP-SAME: example checker opt. (default: false)
+// CHECK-CHECKER-OPTION-HELP-SAME: example checker opt. (default:
+// CHECK-CHECKER-OPTION-HELP-NEXT: false)

diff  --git a/llvm/include/llvm/Support/FormattedStream.h b/llvm/include/llvm/Support/FormattedStream.h
index b49c8d86531d..5f937cfa7984 100644
--- a/llvm/include/llvm/Support/FormattedStream.h
+++ b/llvm/include/llvm/Support/FormattedStream.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_SUPPORT_FORMATTEDSTREAM_H
 #define LLVM_SUPPORT_FORMATTEDSTREAM_H
 
+#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/raw_ostream.h"
 #include <utility>
 
@@ -21,8 +22,11 @@ namespace llvm {
 
 /// formatted_raw_ostream - A raw_ostream that wraps another one and keeps track
 /// of line and column position, allowing padding out to specific column
-/// boundaries and querying the number of lines written to the stream.
-///
+/// boundaries and querying the number of lines written to the stream. This
+/// assumes that the contents of the stream is valid UTF-8 encoded text. This
+/// doesn't attempt to handle everything Unicode can do (combining characters,
+/// right-to-left markers, etc), but should cover the cases likely to appear in
+/// source code or diagnostic messages.
 class formatted_raw_ostream : public raw_ostream {
   /// TheStream - The real stream we output to. We set it to be
   /// unbuffered, since we're already doing our own buffering.
@@ -40,6 +44,14 @@ class formatted_raw_ostream : public raw_ostream {
   ///
   const char *Scanned;
 
+  /// PartialUTF8Char - Either empty or a prefix of a UTF-8 code unit sequence
+  /// for a Unicode scalar value which should be prepended to the buffer for the
+  /// next call to ComputePosition. This is needed when the buffer is flushed
+  /// when it ends part-way through the UTF-8 encoding of a Unicode scalar
+  /// value, so that we can compute the display width of the character once we
+  /// have the rest of it.
+  SmallString<4> PartialUTF8Char;
+
   void write_impl(const char *Ptr, size_t Size) override;
 
   /// current_pos - Return the current position within the stream,
@@ -52,10 +64,16 @@ class formatted_raw_ostream : public raw_ostream {
   }
 
   /// ComputePosition - Examine the given output buffer and figure out the new
-  /// position after output.
-  ///
+  /// position after output. This is safe to call multiple times on the same
+  /// buffer, as it records the most recently scanned character and resumes from
+  /// there when the buffer has not been flushed.
   void ComputePosition(const char *Ptr, size_t size);
 
+  /// UpdatePosition - scan the characters in [Ptr, Ptr+Size), and update the
+  /// line and column numbers. Unlike ComputePosition, this must be called
+  /// exactly once on each region of the buffer.
+  void UpdatePosition(const char *Ptr, size_t Size);
+
   void setStream(raw_ostream &Stream) {
     releaseStream();
 
@@ -105,11 +123,17 @@ class formatted_raw_ostream : public raw_ostream {
   /// \param NewCol - The column to move to.
   formatted_raw_ostream &PadToColumn(unsigned NewCol);
 
-  /// getColumn - Return the column number
-  unsigned getColumn() { return Position.first; }
+  unsigned getColumn() {
+    // Calculate current position, taking buffer contents into account.
+    ComputePosition(getBufferStart(), GetNumBytesInBuffer());
+    return Position.first;
+  }
 
-  /// getLine - Return the line number
-  unsigned getLine() { return Position.second; }
+  unsigned getLine() {
+    // Calculate current position, taking buffer contents into account.
+    ComputePosition(getBufferStart(), GetNumBytesInBuffer());
+    return Position.second;
+  }
 
   raw_ostream &resetColor() override {
     TheStream->resetColor();

diff  --git a/llvm/lib/Support/FormattedStream.cpp b/llvm/lib/Support/FormattedStream.cpp
index 4eb747038bb9..081b8bf2cc19 100644
--- a/llvm/lib/Support/FormattedStream.cpp
+++ b/llvm/lib/Support/FormattedStream.cpp
@@ -11,7 +11,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Unicode.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 
@@ -19,16 +21,22 @@ using namespace llvm;
 
 /// UpdatePosition - Examine the given char sequence and figure out which
 /// column we end up in after output, and how many line breaks are contained.
-///
-static void UpdatePosition(std::pair<unsigned, unsigned> &Position, const char *Ptr, size_t Size) {
+/// This assumes that the input string is well-formed UTF-8, and takes into
+/// account Unicode characters which render as multiple columns wide.
+void formatted_raw_ostream::UpdatePosition(const char *Ptr, size_t Size) {
   unsigned &Column = Position.first;
   unsigned &Line = Position.second;
 
-  // Keep track of the current column and line by scanning the string for
-  // special characters
-  for (const char *End = Ptr + Size; Ptr != End; ++Ptr) {
-    ++Column;
-    switch (*Ptr) {
+  auto ProcessUTF8CodePoint = [&Line, &Column](StringRef CP) {
+    int Width = sys::unicode::columnWidthUTF8(CP);
+    if (Width != sys::unicode::ErrorNonPrintableCharacter)
+      Column += Width;
+
+    // The only special whitespace characters we care about are single-byte.
+    if (CP.size() > 1)
+      return;
+
+    switch (CP[0]) {
     case '\n':
       Line += 1;
       LLVM_FALLTHROUGH;
@@ -40,6 +48,46 @@ static void UpdatePosition(std::pair<unsigned, unsigned> &Position, const char *
       Column += (8 - (Column & 0x7)) & 0x7;
       break;
     }
+  };
+
+  // If we have a partial UTF-8 sequence from the previous buffer, check that
+  // first.
+  if (PartialUTF8Char.size()) {
+    size_t BytesFromBuffer =
+        getNumBytesForUTF8(PartialUTF8Char[0]) - PartialUTF8Char.size();
+    if (Size < BytesFromBuffer) {
+      // If we still don't have enough bytes for a complete code point, just
+      // append what we have.
+      PartialUTF8Char.append(StringRef(Ptr, Size));
+      return;
+    } else {
+      // The first few bytes from the buffer will complete the code point.
+      // Concatenate them and process their effect on the line and column
+      // numbers.
+      PartialUTF8Char.append(StringRef(Ptr, BytesFromBuffer));
+      ProcessUTF8CodePoint(PartialUTF8Char);
+      PartialUTF8Char.clear();
+      Ptr += BytesFromBuffer;
+      Size -= BytesFromBuffer;
+    }
+  }
+
+  // Now scan the rest of the buffer.
+  unsigned NumBytes;
+  for (const char *End = Ptr + Size; Ptr < End; Ptr += NumBytes) {
+    NumBytes = getNumBytesForUTF8(*Ptr);
+
+    // The buffer might end part way through a UTF-8 code unit sequence for a
+    // Unicode scalar value if it got flushed. If this happens, we can't know
+    // the display width until we see the rest of the code point. Stash the
+    // bytes we do have, so that we can reconstruct the whole code point later,
+    // even if the buffer is being flushed.
+    if ((End - Ptr) < NumBytes) {
+      PartialUTF8Char = StringRef(Ptr, End - Ptr);
+      return;
+    }
+
+    ProcessUTF8CodePoint(StringRef(Ptr, NumBytes));
   }
 }
 
@@ -52,9 +100,9 @@ void formatted_raw_ostream::ComputePosition(const char *Ptr, size_t Size) {
   if (Ptr <= Scanned && Scanned <= Ptr + Size)
     // Scan all characters added since our last scan to determine the new
     // column.
-    UpdatePosition(Position, Scanned, Size - (Scanned - Ptr));
+    UpdatePosition(Scanned, Size - (Scanned - Ptr));
   else
-    UpdatePosition(Position, Ptr, Size);
+    UpdatePosition(Ptr, Size);
 
   // Update the scanning pointer.
   Scanned = Ptr + Size;

diff  --git a/llvm/test/MC/ARM/lsl-zero.s b/llvm/test/MC/ARM/lsl-zero.s
index 5d097115448f..6e64e0012362 100644
--- a/llvm/test/MC/ARM/lsl-zero.s
+++ b/llvm/test/MC/ARM/lsl-zero.s
@@ -1,6 +1,6 @@
-// RUN: llvm-mc -triple=thumbv7 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV7 %s
-// RUN: llvm-mc -triple=thumbv8 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV8 %s
-// RUN: llvm-mc -triple=armv7 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARM %s
+// RUN: llvm-mc -triple=thumbv7 -show-encoding < %s 2>/dev/null | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV7 %s
+// RUN: llvm-mc -triple=thumbv8 -show-encoding < %s 2>/dev/null | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV8 %s
+// RUN: llvm-mc -triple=armv7 -show-encoding < %s 2>/dev/null | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARM %s
 
         // lsl #0 is actually mov, so here we check that it behaves the same as
         // mov with regards to the permitted registers and how it behaves in an

diff  --git a/llvm/unittests/Support/formatted_raw_ostream_test.cpp b/llvm/unittests/Support/formatted_raw_ostream_test.cpp
index 0fe0869922a1..5c57f1f18700 100644
--- a/llvm/unittests/Support/formatted_raw_ostream_test.cpp
+++ b/llvm/unittests/Support/formatted_raw_ostream_test.cpp
@@ -29,4 +29,143 @@ TEST(formatted_raw_ostreamTest, Test_Tell) {
   }
 }
 
+TEST(formatted_raw_ostreamTest, Test_LineColumn) {
+  // Test tracking of line and column numbers in a stream.
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  formatted_raw_ostream C(B);
+
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(0U, C.getColumn());
+
+  C << "a";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(1U, C.getColumn());
+
+  C << "bcdef";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(6U, C.getColumn());
+
+  // '\n' increments line number, sets column to zero.
+  C << "\n";
+  EXPECT_EQ(1U, C.getLine());
+  EXPECT_EQ(0U, C.getColumn());
+
+  // '\r sets column to zero without changing line number
+  C << "foo\r";
+  EXPECT_EQ(1U, C.getLine());
+  EXPECT_EQ(0U, C.getColumn());
+
+  // '\t' advances column to the next multiple of 8.
+  // FIXME: If the column number is already a multiple of 8 this will do
+  // nothing, is this behaviour correct?
+  C << "1\t";
+  EXPECT_EQ(8U, C.getColumn());
+  C << "\t";
+  EXPECT_EQ(8U, C.getColumn());
+  C << "1234567\t";
+  EXPECT_EQ(16U, C.getColumn());
+  EXPECT_EQ(1U, C.getLine());
+}
+
+TEST(formatted_raw_ostreamTest, Test_Flush) {
+  // Flushing the buffer causes the characters in the buffer to be scanned
+  // before the buffer is emptied, so line and column numbers will still be
+  // tracked properly.
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(32);
+  formatted_raw_ostream C(B);
+
+  C << "\nabc";
+  EXPECT_EQ(4U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(1U, C.getLine());
+  EXPECT_EQ(3U, C.getColumn());
+  EXPECT_EQ(0U, C.GetNumBytesInBuffer());
+}
+
+TEST(formatted_raw_ostreamTest, Test_UTF8) {
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(32);
+  formatted_raw_ostream C(B);
+
+  // U+00A0 Non-breaking space: encoded as two bytes, but only one column wide.
+  C << u8"\u00a0";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(1U, C.getColumn());
+  EXPECT_EQ(2U, C.GetNumBytesInBuffer());
+
+  // U+2468 CIRCLED DIGIT NINE: encoded as three bytes, but only one column
+  // wide.
+  C << u8"\u2468";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(2U, C.getColumn());
+  EXPECT_EQ(5U, C.GetNumBytesInBuffer());
+
+  // U+00010000 LINEAR B SYLLABLE B008 A: encoded as four bytes, but only one
+  // column wide.
+  C << u8"\U00010000";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(3U, C.getColumn());
+  EXPECT_EQ(9U, C.GetNumBytesInBuffer());
+
+  // U+55B5, CJK character, encodes as three bytes, takes up two columns.
+  C << u8"\u55b5";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(5U, C.getColumn());
+  EXPECT_EQ(12U, C.GetNumBytesInBuffer());
+
+  // U+200B, zero-width space, encoded as three bytes but has no effect on the
+  // column or line number.
+  C << u8"\u200b";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(5U, C.getColumn());
+  EXPECT_EQ(15U, C.GetNumBytesInBuffer());
+}
+
+TEST(formatted_raw_ostreamTest, Test_UTF8Buffered) {
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(4);
+  formatted_raw_ostream C(B);
+
+  // U+2468 encodes as three bytes, so will cause the buffer to be flushed after
+  // the first byte (4 byte buffer, 3 bytes already written). We need to save
+  // the first part of the UTF-8 encoding until after the buffer is cleared and
+  // the remaining two bytes are written, at which point we can check the
+  // display width. In this case the display width is 1, so we end at column 4,
+  // with 6 bytes written into total, 2 of which are in the buffer.
+  C << u8"123\u2468";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(4U, C.getColumn());
+  EXPECT_EQ(2U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(6U, A.size());
+
+  // Same as above, but with a CJK character which displays as two columns.
+  C << u8"123\u55b5";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(9U, C.getColumn());
+  EXPECT_EQ(2U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(12U, A.size());
+}
+
+TEST(formatted_raw_ostreamTest, Test_UTF8TinyBuffer) {
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(1);
+  formatted_raw_ostream C(B);
+
+  // The stream has a one-byte buffer, so it gets flushed multiple times while
+  // printing a single Unicode character.
+  C << u8"\u2468";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(1U, C.getColumn());
+  EXPECT_EQ(0U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(3U, A.size());
+}
 }