[llvm] r336657 - [Support] Hardened JSON against invalid UTF-8.
Sam McCall via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 10 04:51:26 PDT 2018
Author: sammccall
Date: Tue Jul 10 04:51:26 2018
New Revision: 336657
URL: http://llvm.org/viewvc/llvm-project?rev=336657&view=rev
Log:
[Support] Hardened JSON against invalid UTF-8.
Parsing invalid UTF-8 input is now a parse error.
Creating JSON values from invalid UTF-8 now triggers an assertion, and
(in no-assert builds) substitutes the Unicode replacement character (U+FFFD).
Strings retrieved from json::Value are always valid UTF-8.
Modified:
llvm/trunk/include/llvm/ADT/StringExtras.h
llvm/trunk/include/llvm/Support/JSON.h
llvm/trunk/lib/Support/JSON.cpp
llvm/trunk/unittests/Support/JSONTest.cpp
Modified: llvm/trunk/include/llvm/ADT/StringExtras.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/ADT/StringExtras.h?rev=336657&r1=336656&r2=336657&view=diff
==============================================================================
--- llvm/trunk/include/llvm/ADT/StringExtras.h (original)
+++ llvm/trunk/include/llvm/ADT/StringExtras.h Tue Jul 10 04:51:26 2018
@@ -88,6 +88,17 @@ inline bool isAlpha(char C) {
/// lowercase letter as classified by "C" locale.
inline bool isAlnum(char C) { return isAlpha(C) || isDigit(C); }
+/// Checks whether character \p C is valid ASCII (high bit is zero).
+inline bool isASCII(char C) { return static_cast<unsigned char>(C) <= 127; }
+
+/// Checks whether all characters in S are ASCII.
+inline bool isASCII(llvm::StringRef S) {
+ for (char C : S)
+ if (LLVM_UNLIKELY(!isASCII(C)))
+ return false;
+ return true;
+}
+
/// Returns the corresponding lowercase character if \p x is uppercase.
inline char toLower(char x) {
if (x >= 'A' && x <= 'Z')
Modified: llvm/trunk/include/llvm/Support/JSON.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Support/JSON.h?rev=336657&r1=336656&r2=336657&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Support/JSON.h (original)
+++ llvm/trunk/include/llvm/Support/JSON.h Tue Jul 10 04:51:26 2018
@@ -54,6 +54,30 @@
namespace llvm {
namespace json {
+
+// === String encodings ===
+//
+// JSON strings are character sequences (not byte sequences like std::string).
+// We need to know the encoding, and for simplicity only support UTF-8.
+//
+// - When parsing, invalid UTF-8 is a syntax error like any other
+//
+// - When creating Values from strings, callers must ensure they are UTF-8.
+// with asserts on, invalid UTF-8 will crash the program
+// with asserts off, we'll substitute the replacement character (U+FFFD)
+// Callers can use json::isUTF8() and json::fixUTF8() for validation.
+//
+// - When retrieving strings from Values (e.g. asString()), the result will
+// always be valid UTF-8.
+
+/// Returns true if \p S is valid UTF-8, which is required for use as JSON.
+/// If it returns false and \p ErrOffset is non-null, *ErrOffset is set to a
+/// byte offset near the first error.
+bool isUTF8(llvm::StringRef S, size_t *ErrOffset = nullptr);
+/// Replaces invalid UTF-8 sequences in \p S with the replacement character
+/// (U+FFFD). The returned string is valid UTF-8.
+/// This is much slower than isUTF8, so test that first.
+std::string fixUTF8(llvm::StringRef S);
+
class Array;
class ObjectKey;
class Value;
@@ -273,16 +297,26 @@ public:
Value(json::Object &&Properties) : Type(T_Object) {
create<json::Object>(std::move(Properties));
}
- // Strings: types with value semantics.
- Value(std::string &&V) : Type(T_String) { create<std::string>(std::move(V)); }
- Value(const std::string &V) : Type(T_String) { create<std::string>(V); }
- Value(const llvm::SmallVectorImpl<char> &V) : Type(T_String) {
- create<std::string>(V.begin(), V.end());
+ // Strings: types with value semantics. Must be valid UTF-8.
+ Value(std::string V) : Type(T_String) {
+ if (LLVM_UNLIKELY(!isUTF8(V))) {
+ assert(false && "Invalid UTF-8 in value used as JSON");
+ V = fixUTF8(std::move(V));
+ }
+ create<std::string>(std::move(V));
}
+ Value(const llvm::SmallVectorImpl<char> &V)
+ : Value(std::string(V.begin(), V.end())){};
Value(const llvm::formatv_object_base &V) : Value(V.str()){};
- // Strings: types with reference semantics.
- Value(llvm::StringRef V) : Type(T_StringRef) { create<llvm::StringRef>(V); }
- Value(const char *V) : Type(T_StringRef) { create<llvm::StringRef>(V); }
+ // Strings: types with reference semantics. Must be valid UTF-8.
+ Value(StringRef V) : Type(T_StringRef) {
+ create<llvm::StringRef>(V);
+ if (LLVM_UNLIKELY(!isUTF8(V))) {
+ assert(false && "Invalid UTF-8 in value used as JSON");
+ *this = Value(fixUTF8(V));
+ }
+ }
+ Value(const char *V) : Value(StringRef(V)) {}
Value(std::nullptr_t) : Type(T_Null) {}
// Boolean (disallow implicit conversions).
// (The last template parameter is a dummy to keep templates distinct.)
@@ -449,13 +483,23 @@ llvm::raw_ostream &operator<<(llvm::raw_
/// ObjectKey is used to capture keys in Object. Like Value but:
/// - only strings are allowed
/// - it's optimized for the string literal case (Owned == nullptr)
+/// Like Value, strings must be UTF-8. See isUTF8 documentation for details.
class ObjectKey {
public:
- ObjectKey(const char *S) : Data(S) {}
- ObjectKey(llvm::StringRef S) : Data(S) {}
- ObjectKey(std::string &&V)
- : Owned(new std::string(std::move(V))), Data(*Owned) {}
- ObjectKey(const std::string &V) : Owned(new std::string(V)), Data(*Owned) {}
+ ObjectKey(const char *S) : ObjectKey(StringRef(S)) {}
+ ObjectKey(std::string S) : Owned(new std::string(std::move(S))) {
+ if (LLVM_UNLIKELY(!isUTF8(*Owned))) {
+ assert(false && "Invalid UTF-8 in value used as JSON");
+ *Owned = fixUTF8(std::move(*Owned));
+ }
+ Data = *Owned;
+ }
+ ObjectKey(llvm::StringRef S) : Data(S) {
+ if (LLVM_UNLIKELY(!isUTF8(Data))) {
+ assert(false && "Invalid UTF-8 in value used as JSON");
+ *this = ObjectKey(fixUTF8(S));
+ }
+ }
ObjectKey(const llvm::SmallVectorImpl<char> &V)
: ObjectKey(std::string(V.begin(), V.end())) {}
ObjectKey(const llvm::formatv_object_base &V) : ObjectKey(V.str()) {}
Modified: llvm/trunk/lib/Support/JSON.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Support/JSON.cpp?rev=336657&r1=336656&r2=336657&view=diff
==============================================================================
--- llvm/trunk/lib/Support/JSON.cpp (original)
+++ llvm/trunk/lib/Support/JSON.cpp Tue Jul 10 04:51:26 2018
@@ -8,6 +8,7 @@
//===---------------------------------------------------------------------===//
#include "llvm/Support/JSON.h"
+#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Format.h"
#include <cctype>
@@ -199,6 +200,14 @@ public:
Parser(StringRef JSON)
: Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
+ bool checkUTF8() {
+ size_t ErrOffset;
+ if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
+ return true;
+ P = Start + ErrOffset; // For line/column calculation.
+ return parseError("Invalid UTF-8 sequence");
+ }
+
bool parseValue(Value &Out);
bool assertEnd() {
@@ -458,7 +467,7 @@ bool Parser::parseUnicode(std::string &O
// Case 3: it's a leading surrogate. We expect a trailing one next.
// Case 3a: there's no trailing \u escape. Don't advance in the stream.
- if (!LLVM_LIKELY(P + 2 <= End && *P == '\\' && *(P + 1) == 'u')) {
+ if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
Invalid(); // Leading surrogate was unpaired.
return true;
}
@@ -496,9 +505,10 @@ bool Parser::parseError(const char *Msg)
Expected<Value> parse(StringRef JSON) {
Parser P(JSON);
Value E = nullptr;
- if (P.parseValue(E))
- if (P.assertEnd())
- return std::move(E);
+ if (P.checkUTF8())
+ if (P.parseValue(E))
+ if (P.assertEnd())
+ return std::move(E);
return P.takeError();
}
char ParseError::ID = 0;
@@ -514,6 +524,37 @@ static std::vector<const Object::value_t
return Elements;
}
+bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
+ // Fast-path for ASCII, which is valid UTF-8.
+ if (LLVM_LIKELY(isASCII(S)))
+ return true;
+
+ const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data;
+ if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size())))
+ return true;
+
+ if (ErrOffset)
+ *ErrOffset = Rest - Data;
+ return false;
+}
+
+std::string fixUTF8(llvm::StringRef S) {
+ // This isn't particularly efficient, but is only for error-recovery.
+ std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices.
+ const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data());
+ UTF32 *Out32 = Codepoints.data();
+ ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(),
+ lenientConversion);
+ Codepoints.resize(Out32 - Codepoints.data());
+ std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice
+ const UTF32 *In32 = Codepoints.data();
+ UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]);
+ ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(),
+ strictConversion);
+ Res.resize(reinterpret_cast<char *>(Out8) - Res.data());
+ return Res;
+}
+
} // namespace json
} // namespace llvm
Modified: llvm/trunk/unittests/Support/JSONTest.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/unittests/Support/JSONTest.cpp?rev=336657&r1=336656&r2=336657&view=diff
==============================================================================
--- llvm/trunk/unittests/Support/JSONTest.cpp (original)
+++ llvm/trunk/unittests/Support/JSONTest.cpp Tue Jul 10 04:51:26 2018
@@ -27,6 +27,14 @@ TEST(JSONTest, Types) {
EXPECT_EQ(R"("foo")", s("foo"));
EXPECT_EQ("[1,2,3]", s({1, 2, 3}));
EXPECT_EQ(R"({"x":10,"y":20})", s(Object{{"x", 10}, {"y", 20}}));
+
+#ifdef NDEBUG
+ EXPECT_EQ(R"("��")", s("\xC0\x80"));
+ EXPECT_EQ(R"({"��":0})", s(Object{{"\xC0\x80", 0}}));
+#else
+ EXPECT_DEATH(s("\xC0\x80"), "Invalid UTF-8");
+ EXPECT_DEATH(s(Object{{"\xC0\x80", 0}}), "Invalid UTF-8");
+#endif
}
TEST(JSONTest, Constructors) {
@@ -181,6 +189,31 @@ TEST(JSONTest, ParseErrors) {
"valid": 1,
invalid: 2
})");
+ ExpectErr("Invalid UTF-8 sequence", "\"\xC0\x80\""); // Overlong NUL (Modified UTF-8)
+}
+
+// Direct tests of isUTF8 and fixUTF8. Internal uses are also tested elsewhere.
+TEST(JSONTest, UTF8) {
+ for (const char *Valid : {
+ "this is ASCII text",
+ "thïs tëxt häs BMP chäräctërs",
+ "𐌶𐌰L𐌾𐍈 C𐍈𐌼𐌴𐍃",
+ }) {
+ EXPECT_TRUE(isUTF8(Valid)) << Valid;
+ EXPECT_EQ(fixUTF8(Valid), Valid);
+ }
+ for (auto Invalid : std::vector<std::pair<const char *, const char *>>{
+ {"lone trailing \x81\x82 bytes", "lone trailing �� bytes"},
+ {"missing trailing \xD0 bytes", "missing trailing � bytes"},
+ {"truncated character \xD0", "truncated character �"},
+ {"not \xC1\x80 the \xE0\x9f\xBF shortest \xF0\x83\x83\x83 encoding",
+ "not �� the ��� shortest ���� encoding"},
+ {"too \xF9\x80\x80\x80\x80 long", "too ����� long"},
+ {"surrogate \xED\xA0\x80 invalid \xF4\x90\x80\x80",
+ "surrogate ��� invalid ����"}}) {
+ EXPECT_FALSE(isUTF8(Invalid.first)) << Invalid.first;
+ EXPECT_EQ(fixUTF8(Invalid.first), Invalid.second);
+ }
}
TEST(JSONTest, Inspection) {
More information about the llvm-commits
mailing list