[clang-tools-extra] r357173 - [clangd] Support UTF-32 (i.e. codepoint) offsets.

Thu Mar 28 07:37:51 PDT 2019

Author: sammccall
Date: Thu Mar 28 07:37:51 2019
New Revision: 357173

URL: http://llvm.org/viewvc/llvm-project?rev=357173&view=rev
Log:
[clangd] Support UTF-32 (i.e. codepoint) offsets.

Summary:
(Changes to UTF-8/UTF-16 here are NFC, moving things around to make the
cases more symmetrical)

Reviewers: ilya-biryukov

Subscribers: ioeric, MaskRay, jkorous, arphaman, kadircet, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D59927

Modified:
    clang-tools-extra/trunk/clangd/Protocol.cpp
    clang-tools-extra/trunk/clangd/Protocol.h
    clang-tools-extra/trunk/clangd/SourceCode.cpp
    clang-tools-extra/trunk/unittests/clangd/SourceCodeTests.cpp

Modified: clang-tools-extra/trunk/clangd/Protocol.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/Protocol.cpp?rev=357173&r1=357172&r2=357173&view=diff
==============================================================================

--- clang-tools-extra/trunk/clangd/Protocol.cpp (original)
+++ clang-tools-extra/trunk/clangd/Protocol.cpp Thu Mar 28 07:37:51 2019
@@ -938,16 +938,19 @@ bool fromJSON(const llvm::json::Value &P
   return fromJSON(Params, Base);
 }
 
-llvm::json::Value toJSON(const OffsetEncoding &OE) {
+static const char *toString(OffsetEncoding OE) {
   switch (OE) {
-    case OffsetEncoding::UTF8:
-      return "utf-8";
-    case OffsetEncoding::UTF16:
-      return "utf-16";
-    case OffsetEncoding::UnsupportedEncoding:
-      return "unknown";
+  case OffsetEncoding::UTF8:
+    return "utf-8";
+  case OffsetEncoding::UTF16:
+    return "utf-16";
+  case OffsetEncoding::UTF32:
+    return "utf-32";
+  case OffsetEncoding::UnsupportedEncoding:
+    return "unknown";
   }
 }
+llvm::json::Value toJSON(const OffsetEncoding &OE) { return toString(OE); }
 bool fromJSON(const llvm::json::Value &V, OffsetEncoding &OE) {
   auto Str = V.getAsString();
   if (!Str)
@@ -955,9 +958,13 @@ bool fromJSON(const llvm::json::Value &V
   OE = llvm::StringSwitch<OffsetEncoding>(*Str)
            .Case("utf-8", OffsetEncoding::UTF8)
            .Case("utf-16", OffsetEncoding::UTF16)
+           .Case("utf-32", OffsetEncoding::UTF32)
            .Default(OffsetEncoding::UnsupportedEncoding);
   return true;
 }
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, OffsetEncoding Enc) {
+  return OS << toString(Enc);
+}
 
 } // namespace clangd
 } // namespace clang

Modified: clang-tools-extra/trunk/clangd/Protocol.h
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/Protocol.h?rev=357173&r1=357172&r2=357173&view=diff
==============================================================================
--- clang-tools-extra/trunk/clangd/Protocol.h (original)
+++ clang-tools-extra/trunk/clangd/Protocol.h Thu Mar 28 07:37:51 2019
@@ -28,6 +28,7 @@
 #include "clang/Index/IndexSymbol.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/Support/JSON.h"
+#include "llvm/Support/raw_ostream.h"
 #include <bitset>
 #include <string>
 #include <vector>
@@ -346,9 +347,12 @@ enum class OffsetEncoding {
   UTF16,
   // Length counts bytes of UTF-8 encoded text. (Clangd extension).
   UTF8,
+  // Length counts codepoints in unicode text. (Clangd extension).
+  UTF32,
 };
 llvm::json::Value toJSON(const OffsetEncoding &);
 bool fromJSON(const llvm::json::Value &, OffsetEncoding &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, OffsetEncoding OS);
 
 // This struct doesn't mirror LSP!
 // The protocol defines deeply nested structures for client capabilities.

Modified: clang-tools-extra/trunk/clangd/SourceCode.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/SourceCode.cpp?rev=357173&r1=357172&r2=357173&view=diff
==============================================================================
--- clang-tools-extra/trunk/clangd/SourceCode.cpp (original)
+++ clang-tools-extra/trunk/clangd/SourceCode.cpp Thu Mar 28 07:37:51 2019
@@ -17,6 +17,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Path.h"
 
 namespace clang {
@@ -30,6 +31,8 @@ namespace clangd {
 // Returns true if CB returned true, false if we hit the end of string.
 template <typename Callback>
 static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) {
+  // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
+  // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
   for (size_t I = 0; I < U8.size();) {
     unsigned char C = static_cast<unsigned char>(U8[I]);
     if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character.
@@ -53,46 +56,75 @@ static bool iterateCodepoints(llvm::Stri
   return false;
 }
 
-// Returns the offset into the string that matches \p Units UTF-16 code units.
-// Conceptually, this converts to UTF-16, truncates to CodeUnits, converts back
-// to UTF-8, and returns the length in bytes.
-static size_t measureUTF16(llvm::StringRef U8, int U16Units, bool &Valid) {
+// Returns the byte offset into the string that is an offset of \p Units in
+// the specified encoding.
+// Conceptually, this converts to the encoding, truncates to CodeUnits,
+// converts back to UTF-8, and returns the length in bytes.
+static size_t measureUnits(llvm::StringRef U8, int Units, OffsetEncoding Enc,
+                           bool &Valid) {
+  Valid = Units >= 0;
+  if (Units <= 0)
+    return 0;
   size_t Result = 0;
-  Valid = U16Units == 0 || iterateCodepoints(U8, [&](int U8Len, int U16Len) {
-            Result += U8Len;
-            U16Units -= U16Len;
-            return U16Units <= 0;
-          });
-  if (U16Units < 0) // Offset was into the middle of a surrogate pair.
-    Valid = false;
+  switch (Enc) {
+  case OffsetEncoding::UTF8:
+    Result = Units;
+    break;
+  case OffsetEncoding::UTF16:
+    Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) {
+      Result += U8Len;
+      Units -= U16Len;
+      return Units <= 0;
+    });
+    if (Units < 0) // Offset in the middle of a surrogate pair.
+      Valid = false;
+    break;
+  case OffsetEncoding::UTF32:
+    Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) {
+      Result += U8Len;
+      Units--;
+      return Units <= 0;
+    });
+    break;
+  case OffsetEncoding::UnsupportedEncoding:
+    llvm_unreachable("unsupported encoding");
+  }
   // Don't return an out-of-range index if we overran.
-  return std::min(Result, U8.size());
+  if (Result > U8.size()) {
+    Valid = false;
+    return U8.size();
+  }
+  return Result;
 }
 
 Key<OffsetEncoding> kCurrentOffsetEncoding;
-static bool useUTF16ForLSP() {
+static OffsetEncoding lspEncoding() {
   auto *Enc = Context::current().get(kCurrentOffsetEncoding);
-  switch (Enc ? *Enc : OffsetEncoding::UTF16) {
-    case OffsetEncoding::UTF16:
-      return true;
-    case OffsetEncoding::UTF8:
-      return false;
-    case OffsetEncoding::UnsupportedEncoding:
-      llvm_unreachable("cannot use an unsupported encoding");
-  }
+  return Enc ? *Enc : OffsetEncoding::UTF16;
 }
 
 // Like most strings in clangd, the input is UTF-8 encoded.
 size_t lspLength(llvm::StringRef Code) {
-  if (!useUTF16ForLSP())
-    return Code.size();
-  // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
-  // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
   size_t Count = 0;
-  iterateCodepoints(Code, [&](int U8Len, int U16Len) {
-    Count += U16Len;
-    return false;
-  });
+  switch (lspEncoding()) {
+  case OffsetEncoding::UTF8:
+    Count = Code.size();
+    break;
+  case OffsetEncoding::UTF16:
+    iterateCodepoints(Code, [&](int U8Len, int U16Len) {
+      Count += U16Len;
+      return false;
+    });
+    break;
+  case OffsetEncoding::UTF32:
+    iterateCodepoints(Code, [&](int U8Len, int U16Len) {
+      ++Count;
+      return false;
+    });
+    break;
+  case OffsetEncoding::UnsupportedEncoding:
+    llvm_unreachable("unsupported encoding");
+  }
   return Count;
 }
 
@@ -118,28 +150,15 @@ llvm::Expected<size_t> positionToOffset(
   StringRef Line =
       Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; });
 
-  if (!useUTF16ForLSP()) {
-    // Bounds-checking only.
-    if (P.character > int(Line.size())) {
-      if (AllowColumnsBeyondLineLength)
-        return StartOfLine + Line.size();
-      else
-        return llvm::make_error<llvm::StringError>(
-            llvm::formatv("UTF-8 offset {0} overruns line {1}", P.character,
-                          P.line),
-            llvm::errc::invalid_argument);
-    }
-    return StartOfLine + P.character;
-  }
-  // P.character is in UTF-16 code units, so we have to transcode.
+  // P.character may be in UTF-16, transcode if necessary.
   bool Valid;
-  size_t ByteOffsetInLine = measureUTF16(Line, P.character, Valid);
+  size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid);
   if (!Valid && !AllowColumnsBeyondLineLength)
     return llvm::make_error<llvm::StringError>(
-        llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character,
-                      P.line),
+        llvm::formatv("{0} offset {1} is invalid for line {2}", lspEncoding(),
+                      P.character, P.line),
         llvm::errc::invalid_argument);
-  return StartOfLine + ByteOffsetInLine;
+  return StartOfLine + ByteInLine;
 }
 
 Position offsetToPosition(llvm::StringRef Code, size_t Offset) {

Modified: clang-tools-extra/trunk/unittests/clangd/SourceCodeTests.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/unittests/clangd/SourceCodeTests.cpp?rev=357173&r1=357172&r2=357173&view=diff
==============================================================================
--- clang-tools-extra/trunk/unittests/clangd/SourceCodeTests.cpp (original)
+++ clang-tools-extra/trunk/unittests/clangd/SourceCodeTests.cpp Thu Mar 28 07:37:51 2019
@@ -58,6 +58,15 @@ TEST(SourceCodeTests, lspLength) {
   EXPECT_EQ(lspLength("Â¥"), 2UL);
   // astral
   EXPECT_EQ(lspLength("ð"), 4UL);
+
+  WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32);
+  EXPECT_EQ(lspLength(""), 0UL);
+  EXPECT_EQ(lspLength("ascii"), 5UL);
+  // BMP
+  EXPECT_EQ(lspLength("â"), 1UL);
+  EXPECT_EQ(lspLength("Â¥"), 1UL);
+  // astral
+  EXPECT_EQ(lspLength("ð"), 1UL);
 }
 
 // The = â ð¡ below are ASCII (1 byte), BMP (3 bytes), and astral (4 bytes).
@@ -131,6 +140,63 @@ TEST(SourceCodeTests, PositionToOffset)
   EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 0)), llvm::Failed());
   EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 1)), llvm::Failed());
 
+  // Codepoints are similar, except near astral characters.
+  WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32);
+  // line out of bounds
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(-1, 2)), llvm::Failed());
+  // first line
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, -1)),
+                       llvm::Failed()); // out of range
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 0)),
+                       llvm::HasValue(0)); // first character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 3)),
+                       llvm::HasValue(3)); // middle character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 6)),
+                       llvm::HasValue(6)); // last character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 7)),
+                       llvm::HasValue(7)); // the newline itself
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 7), false),
+                       llvm::HasValue(7));
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 8)),
+                       llvm::HasValue(7)); // out of range
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 8), false),
+                       llvm::Failed()); // out of range
+  // middle line
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, -1)),
+                       llvm::Failed()); // out of range
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 0)),
+                       llvm::HasValue(8)); // first character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 3)),
+                       llvm::HasValue(11)); // middle character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 3), false),
+                       llvm::HasValue(11));
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 6)),
+                       llvm::HasValue(16)); // last character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 7)),
+                       llvm::HasValue(17)); // the newline itself
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 8)),
+                       llvm::HasValue(17)); // out of range
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 8), false),
+                       llvm::Failed()); // out of range
+  // last line
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, -1)),
+                       llvm::Failed()); // out of range
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 0)),
+                       llvm::HasValue(18)); // first character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 4)),
+                       llvm::HasValue(22)); // Before astral character.
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 5), false),
+                       llvm::HasValue(26)); // after astral character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 7)),
+                       llvm::HasValue(28)); // last character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 8)),
+                       llvm::HasValue(29)); // EOF
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 9), false),
+                       llvm::Failed()); // out of range
+  // line out of bounds
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 0)), llvm::Failed());
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 1)), llvm::Failed());
+
   // Test UTF-8, where transformations are trivial.
   WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8);
   EXPECT_THAT_EXPECTED(positionToOffset(File, position(-1, 2)), llvm::Failed());
@@ -169,6 +235,27 @@ TEST(SourceCodeTests, OffsetToPosition)
   EXPECT_THAT(offsetToPosition(File, 29), Pos(2, 9)) << "EOF";
   EXPECT_THAT(offsetToPosition(File, 30), Pos(2, 9)) << "out of bounds";
 
+  // Codepoints are similar, except near astral characters.
+  WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32);
+  EXPECT_THAT(offsetToPosition(File, 0), Pos(0, 0)) << "start of file";
+  EXPECT_THAT(offsetToPosition(File, 3), Pos(0, 3)) << "in first line";
+  EXPECT_THAT(offsetToPosition(File, 6), Pos(0, 6)) << "end of first line";
+  EXPECT_THAT(offsetToPosition(File, 7), Pos(0, 7)) << "first newline";
+  EXPECT_THAT(offsetToPosition(File, 8), Pos(1, 0)) << "start of second line";
+  EXPECT_THAT(offsetToPosition(File, 12), Pos(1, 4)) << "before BMP char";
+  EXPECT_THAT(offsetToPosition(File, 13), Pos(1, 5)) << "in BMP char";
+  EXPECT_THAT(offsetToPosition(File, 15), Pos(1, 5)) << "after BMP char";
+  EXPECT_THAT(offsetToPosition(File, 16), Pos(1, 6)) << "end of second line";
+  EXPECT_THAT(offsetToPosition(File, 17), Pos(1, 7)) << "second newline";
+  EXPECT_THAT(offsetToPosition(File, 18), Pos(2, 0)) << "start of last line";
+  EXPECT_THAT(offsetToPosition(File, 21), Pos(2, 3)) << "in last line";
+  EXPECT_THAT(offsetToPosition(File, 22), Pos(2, 4)) << "before astral char";
+  EXPECT_THAT(offsetToPosition(File, 24), Pos(2, 5)) << "in astral char";
+  EXPECT_THAT(offsetToPosition(File, 26), Pos(2, 5)) << "after astral char";
+  EXPECT_THAT(offsetToPosition(File, 28), Pos(2, 7)) << "end of last line";
+  EXPECT_THAT(offsetToPosition(File, 29), Pos(2, 8)) << "EOF";
+  EXPECT_THAT(offsetToPosition(File, 30), Pos(2, 8)) << "out of bounds";
+
   WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8);
   for (Line L : FileLines) {
     for (unsigned I = 0; I <= L.Length; ++I)