[clang-tools-extra] r357173 - [clangd] Support UTF-32 (i.e. codepoint) offsets.
Sam McCall via cfe-commits
cfe-commits at lists.llvm.org
Thu Mar 28 07:37:51 PDT 2019
Author: sammccall
Date: Thu Mar 28 07:37:51 2019
New Revision: 357173
URL: http://llvm.org/viewvc/llvm-project?rev=357173&view=rev
Log:
[clangd] Support UTF-32 (i.e. codepoint) offsets.
Summary:
(Changes to UTF-8/UTF-16 here are NFC, moving things around to make the
cases more symmetrical)
Reviewers: ilya-biryukov
Subscribers: ioeric, MaskRay, jkorous, arphaman, kadircet, cfe-commits
Tags: #clang
Differential Revision: https://reviews.llvm.org/D59927
Modified:
clang-tools-extra/trunk/clangd/Protocol.cpp
clang-tools-extra/trunk/clangd/Protocol.h
clang-tools-extra/trunk/clangd/SourceCode.cpp
clang-tools-extra/trunk/unittests/clangd/SourceCodeTests.cpp
Modified: clang-tools-extra/trunk/clangd/Protocol.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/Protocol.cpp?rev=357173&r1=357172&r2=357173&view=diff
==============================================================================
--- clang-tools-extra/trunk/clangd/Protocol.cpp (original)
+++ clang-tools-extra/trunk/clangd/Protocol.cpp Thu Mar 28 07:37:51 2019
@@ -938,16 +938,19 @@ bool fromJSON(const llvm::json::Value &P
return fromJSON(Params, Base);
}
-llvm::json::Value toJSON(const OffsetEncoding &OE) {
+static const char *toString(OffsetEncoding OE) {
switch (OE) {
- case OffsetEncoding::UTF8:
- return "utf-8";
- case OffsetEncoding::UTF16:
- return "utf-16";
- case OffsetEncoding::UnsupportedEncoding:
- return "unknown";
+ case OffsetEncoding::UTF8:
+ return "utf-8";
+ case OffsetEncoding::UTF16:
+ return "utf-16";
+ case OffsetEncoding::UTF32:
+ return "utf-32";
+ case OffsetEncoding::UnsupportedEncoding:
+ return "unknown";
}
}
+llvm::json::Value toJSON(const OffsetEncoding &OE) { return toString(OE); }
bool fromJSON(const llvm::json::Value &V, OffsetEncoding &OE) {
auto Str = V.getAsString();
if (!Str)
@@ -955,9 +958,13 @@ bool fromJSON(const llvm::json::Value &V
OE = llvm::StringSwitch<OffsetEncoding>(*Str)
.Case("utf-8", OffsetEncoding::UTF8)
.Case("utf-16", OffsetEncoding::UTF16)
+ .Case("utf-32", OffsetEncoding::UTF32)
.Default(OffsetEncoding::UnsupportedEncoding);
return true;
}
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, OffsetEncoding Enc) {
+ return OS << toString(Enc);
+}
} // namespace clangd
} // namespace clang
Modified: clang-tools-extra/trunk/clangd/Protocol.h
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/Protocol.h?rev=357173&r1=357172&r2=357173&view=diff
==============================================================================
--- clang-tools-extra/trunk/clangd/Protocol.h (original)
+++ clang-tools-extra/trunk/clangd/Protocol.h Thu Mar 28 07:37:51 2019
@@ -28,6 +28,7 @@
#include "clang/Index/IndexSymbol.h"
#include "llvm/ADT/Optional.h"
#include "llvm/Support/JSON.h"
+#include "llvm/Support/raw_ostream.h"
#include <bitset>
#include <string>
#include <vector>
@@ -346,9 +347,12 @@ enum class OffsetEncoding {
UTF16,
// Length counts bytes of UTF-8 encoded text. (Clangd extension).
UTF8,
+ // Length counts codepoints in unicode text. (Clangd extension).
+ UTF32,
};
llvm::json::Value toJSON(const OffsetEncoding &);
bool fromJSON(const llvm::json::Value &, OffsetEncoding &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, OffsetEncoding OS);
// This struct doesn't mirror LSP!
// The protocol defines deeply nested structures for client capabilities.
Modified: clang-tools-extra/trunk/clangd/SourceCode.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/SourceCode.cpp?rev=357173&r1=357172&r2=357173&view=diff
==============================================================================
--- clang-tools-extra/trunk/clangd/SourceCode.cpp (original)
+++ clang-tools-extra/trunk/clangd/SourceCode.cpp Thu Mar 28 07:37:51 2019
@@ -17,6 +17,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Path.h"
namespace clang {
@@ -30,6 +31,8 @@ namespace clangd {
// Returns true if CB returned true, false if we hit the end of string.
template <typename Callback>
static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) {
+ // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
+ // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
for (size_t I = 0; I < U8.size();) {
unsigned char C = static_cast<unsigned char>(U8[I]);
if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character.
@@ -53,46 +56,75 @@ static bool iterateCodepoints(llvm::Stri
return false;
}
-// Returns the offset into the string that matches \p Units UTF-16 code units.
-// Conceptually, this converts to UTF-16, truncates to CodeUnits, converts back
-// to UTF-8, and returns the length in bytes.
-static size_t measureUTF16(llvm::StringRef U8, int U16Units, bool &Valid) {
+// Returns the byte offset into the string that is an offset of \p Units in
+// the specified encoding.
+// Conceptually, this converts to the encoding, truncates to CodeUnits,
+// converts back to UTF-8, and returns the length in bytes.
+static size_t measureUnits(llvm::StringRef U8, int Units, OffsetEncoding Enc,
+ bool &Valid) {
+ Valid = Units >= 0;
+ if (Units <= 0)
+ return 0;
size_t Result = 0;
- Valid = U16Units == 0 || iterateCodepoints(U8, [&](int U8Len, int U16Len) {
- Result += U8Len;
- U16Units -= U16Len;
- return U16Units <= 0;
- });
- if (U16Units < 0) // Offset was into the middle of a surrogate pair.
- Valid = false;
+ switch (Enc) {
+ case OffsetEncoding::UTF8:
+ Result = Units;
+ break;
+ case OffsetEncoding::UTF16:
+ Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) {
+ Result += U8Len;
+ Units -= U16Len;
+ return Units <= 0;
+ });
+ if (Units < 0) // Offset in the middle of a surrogate pair.
+ Valid = false;
+ break;
+ case OffsetEncoding::UTF32:
+ Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) {
+ Result += U8Len;
+ Units--;
+ return Units <= 0;
+ });
+ break;
+ case OffsetEncoding::UnsupportedEncoding:
+ llvm_unreachable("unsupported encoding");
+ }
// Don't return an out-of-range index if we overran.
- return std::min(Result, U8.size());
+ if (Result > U8.size()) {
+ Valid = false;
+ return U8.size();
+ }
+ return Result;
}
Key<OffsetEncoding> kCurrentOffsetEncoding;
-static bool useUTF16ForLSP() {
+static OffsetEncoding lspEncoding() {
auto *Enc = Context::current().get(kCurrentOffsetEncoding);
- switch (Enc ? *Enc : OffsetEncoding::UTF16) {
- case OffsetEncoding::UTF16:
- return true;
- case OffsetEncoding::UTF8:
- return false;
- case OffsetEncoding::UnsupportedEncoding:
- llvm_unreachable("cannot use an unsupported encoding");
- }
+ return Enc ? *Enc : OffsetEncoding::UTF16;
}
// Like most strings in clangd, the input is UTF-8 encoded.
size_t lspLength(llvm::StringRef Code) {
- if (!useUTF16ForLSP())
- return Code.size();
- // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
- // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
size_t Count = 0;
- iterateCodepoints(Code, [&](int U8Len, int U16Len) {
- Count += U16Len;
- return false;
- });
+ switch (lspEncoding()) {
+ case OffsetEncoding::UTF8:
+ Count = Code.size();
+ break;
+ case OffsetEncoding::UTF16:
+ iterateCodepoints(Code, [&](int U8Len, int U16Len) {
+ Count += U16Len;
+ return false;
+ });
+ break;
+ case OffsetEncoding::UTF32:
+ iterateCodepoints(Code, [&](int U8Len, int U16Len) {
+ ++Count;
+ return false;
+ });
+ break;
+ case OffsetEncoding::UnsupportedEncoding:
+ llvm_unreachable("unsupported encoding");
+ }
return Count;
}
@@ -118,28 +150,15 @@ llvm::Expected<size_t> positionToOffset(
StringRef Line =
Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; });
- if (!useUTF16ForLSP()) {
- // Bounds-checking only.
- if (P.character > int(Line.size())) {
- if (AllowColumnsBeyondLineLength)
- return StartOfLine + Line.size();
- else
- return llvm::make_error<llvm::StringError>(
- llvm::formatv("UTF-8 offset {0} overruns line {1}", P.character,
- P.line),
- llvm::errc::invalid_argument);
- }
- return StartOfLine + P.character;
- }
- // P.character is in UTF-16 code units, so we have to transcode.
+ // P.character may be in UTF-16, transcode if necessary.
bool Valid;
- size_t ByteOffsetInLine = measureUTF16(Line, P.character, Valid);
+ size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid);
if (!Valid && !AllowColumnsBeyondLineLength)
return llvm::make_error<llvm::StringError>(
- llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character,
- P.line),
+ llvm::formatv("{0} offset {1} is invalid for line {2}", lspEncoding(),
+ P.character, P.line),
llvm::errc::invalid_argument);
- return StartOfLine + ByteOffsetInLine;
+ return StartOfLine + ByteInLine;
}
Position offsetToPosition(llvm::StringRef Code, size_t Offset) {
Modified: clang-tools-extra/trunk/unittests/clangd/SourceCodeTests.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/unittests/clangd/SourceCodeTests.cpp?rev=357173&r1=357172&r2=357173&view=diff
==============================================================================
--- clang-tools-extra/trunk/unittests/clangd/SourceCodeTests.cpp (original)
+++ clang-tools-extra/trunk/unittests/clangd/SourceCodeTests.cpp Thu Mar 28 07:37:51 2019
@@ -58,6 +58,15 @@ TEST(SourceCodeTests, lspLength) {
EXPECT_EQ(lspLength("¥"), 2UL);
// astral
EXPECT_EQ(lspLength("😂"), 4UL);
+
+ WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32);
+ EXPECT_EQ(lspLength(""), 0UL);
+ EXPECT_EQ(lspLength("ascii"), 5UL);
+ // BMP
+ EXPECT_EQ(lspLength("↓"), 1UL);
+ EXPECT_EQ(lspLength("¥"), 1UL);
+ // astral
+ EXPECT_EQ(lspLength("😂"), 1UL);
}
// The = → 🡆 below are ASCII (1 byte), BMP (3 bytes), and astral (4 bytes).
@@ -131,6 +140,63 @@ TEST(SourceCodeTests, PositionToOffset)
EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 0)), llvm::Failed());
EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 1)), llvm::Failed());
+ // Codepoints are similar, except near astral characters.
+ WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32);
+ // line out of bounds
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(-1, 2)), llvm::Failed());
+ // first line
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, -1)),
+ llvm::Failed()); // out of range
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 0)),
+ llvm::HasValue(0)); // first character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 3)),
+ llvm::HasValue(3)); // middle character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 6)),
+ llvm::HasValue(6)); // last character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 7)),
+ llvm::HasValue(7)); // the newline itself
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 7), false),
+ llvm::HasValue(7));
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 8)),
+ llvm::HasValue(7)); // out of range
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 8), false),
+ llvm::Failed()); // out of range
+ // middle line
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, -1)),
+ llvm::Failed()); // out of range
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 0)),
+ llvm::HasValue(8)); // first character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 3)),
+ llvm::HasValue(11)); // middle character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 3), false),
+ llvm::HasValue(11));
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 6)),
+ llvm::HasValue(16)); // last character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 7)),
+ llvm::HasValue(17)); // the newline itself
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 8)),
+ llvm::HasValue(17)); // out of range
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 8), false),
+ llvm::Failed()); // out of range
+ // last line
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, -1)),
+ llvm::Failed()); // out of range
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 0)),
+ llvm::HasValue(18)); // first character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 4)),
+ llvm::HasValue(22)); // Before astral character.
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 5), false),
+ llvm::HasValue(26)); // after astral character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 7)),
+ llvm::HasValue(28)); // last character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 8)),
+ llvm::HasValue(29)); // EOF
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 9), false),
+ llvm::Failed()); // out of range
+ // line out of bounds
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 0)), llvm::Failed());
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 1)), llvm::Failed());
+
// Test UTF-8, where transformations are trivial.
WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8);
EXPECT_THAT_EXPECTED(positionToOffset(File, position(-1, 2)), llvm::Failed());
@@ -169,6 +235,27 @@ TEST(SourceCodeTests, OffsetToPosition)
EXPECT_THAT(offsetToPosition(File, 29), Pos(2, 9)) << "EOF";
EXPECT_THAT(offsetToPosition(File, 30), Pos(2, 9)) << "out of bounds";
+ // Codepoints are similar, except near astral characters.
+ WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32);
+ EXPECT_THAT(offsetToPosition(File, 0), Pos(0, 0)) << "start of file";
+ EXPECT_THAT(offsetToPosition(File, 3), Pos(0, 3)) << "in first line";
+ EXPECT_THAT(offsetToPosition(File, 6), Pos(0, 6)) << "end of first line";
+ EXPECT_THAT(offsetToPosition(File, 7), Pos(0, 7)) << "first newline";
+ EXPECT_THAT(offsetToPosition(File, 8), Pos(1, 0)) << "start of second line";
+ EXPECT_THAT(offsetToPosition(File, 12), Pos(1, 4)) << "before BMP char";
+ EXPECT_THAT(offsetToPosition(File, 13), Pos(1, 5)) << "in BMP char";
+ EXPECT_THAT(offsetToPosition(File, 15), Pos(1, 5)) << "after BMP char";
+ EXPECT_THAT(offsetToPosition(File, 16), Pos(1, 6)) << "end of second line";
+ EXPECT_THAT(offsetToPosition(File, 17), Pos(1, 7)) << "second newline";
+ EXPECT_THAT(offsetToPosition(File, 18), Pos(2, 0)) << "start of last line";
+ EXPECT_THAT(offsetToPosition(File, 21), Pos(2, 3)) << "in last line";
+ EXPECT_THAT(offsetToPosition(File, 22), Pos(2, 4)) << "before astral char";
+ EXPECT_THAT(offsetToPosition(File, 24), Pos(2, 5)) << "in astral char";
+ EXPECT_THAT(offsetToPosition(File, 26), Pos(2, 5)) << "after astral char";
+ EXPECT_THAT(offsetToPosition(File, 28), Pos(2, 7)) << "end of last line";
+ EXPECT_THAT(offsetToPosition(File, 29), Pos(2, 8)) << "EOF";
+ EXPECT_THAT(offsetToPosition(File, 30), Pos(2, 8)) << "out of bounds";
+
WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8);
for (Line L : FileLines) {
for (unsigned I = 0; I <= L.Length; ++I)
More information about the cfe-commits
mailing list