[llvm] r336657 - [Support] Hardened JSON against invalid UTF-8.

Tue Jul 10 04:51:26 PDT 2018

Author: sammccall
Date: Tue Jul 10 04:51:26 2018
New Revision: 336657

URL: http://llvm.org/viewvc/llvm-project?rev=336657&view=rev
Log:
[Support] Hardened JSON against invalid UTF-8.

Parsing invalid UTF-8 input is now a parse error.
Creating JSON values from invalid UTF-8 now triggers an assertion, and
(in no-assert builds) substitutes the unicode replacement character.
Strings retrieved from json::Value are always valid UTF-8.

Modified:
    llvm/trunk/include/llvm/ADT/StringExtras.h
    llvm/trunk/include/llvm/Support/JSON.h
    llvm/trunk/lib/Support/JSON.cpp
    llvm/trunk/unittests/Support/JSONTest.cpp

Modified: llvm/trunk/include/llvm/ADT/StringExtras.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/ADT/StringExtras.h?rev=336657&r1=336656&r2=336657&view=diff
==============================================================================

--- llvm/trunk/include/llvm/ADT/StringExtras.h (original)
+++ llvm/trunk/include/llvm/ADT/StringExtras.h Tue Jul 10 04:51:26 2018
@@ -88,6 +88,17 @@ inline bool isAlpha(char C) {
 /// lowercase letter as classified by "C" locale.
 inline bool isAlnum(char C) { return isAlpha(C) || isDigit(C); }
 
+/// Checks whether character \p C is valid ASCII (high bit is zero).
+inline bool isASCII(char C) { return static_cast<unsigned char>(C) <= 127; }
+
+/// Checks whether all characters in S are ASCII.
+inline bool isASCII(llvm::StringRef S) {
+  for (char C : S)
+    if (LLVM_UNLIKELY(!isASCII(C)))
+      return false;
+  return true;
+}
+
 /// Returns the corresponding lowercase character if \p x is uppercase.
 inline char toLower(char x) {
   if (x >= 'A' && x <= 'Z')

Modified: llvm/trunk/include/llvm/Support/JSON.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Support/JSON.h?rev=336657&r1=336656&r2=336657&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Support/JSON.h (original)
+++ llvm/trunk/include/llvm/Support/JSON.h Tue Jul 10 04:51:26 2018
@@ -54,6 +54,30 @@
 
 namespace llvm {
 namespace json {
+
+// === String encodings ===
+//
+// JSON strings are character sequences (not byte sequences like std::string).
+// We need to know the encoding, and for simplicity only support UTF-8.
+//
+//   - When parsing, invalid UTF-8 is a syntax error like any other
+//
+//   - When creating Values from strings, callers must ensure they are UTF-8.
+//        with asserts on, invalid UTF-8 will crash the program
+//        with asserts off, we'll substitute the replacement character (U+FFFD)
+//     Callers can use json::isUTF8() and json::fixUTF8() for validation.
+//
+//   - When retrieving strings from Values (e.g. asString()), the result will
+//     always be valid UTF-8.
+
+/// Returns true if \p S is valid UTF-8, which is required for use as JSON.
+/// If it returns false, \p ErrOffset is set to a byte offset near the first error.
+bool isUTF8(llvm::StringRef S, size_t *ErrOffset = nullptr);
+/// Replaces invalid UTF-8 sequences in \p S with the replacement character
+/// (U+FFFD). The returned string is valid UTF-8.
+/// This is much slower than isUTF8, so test that first.
+std::string fixUTF8(llvm::StringRef S);
+
 class Array;
 class ObjectKey;
 class Value;
@@ -273,16 +297,26 @@ public:
   Value(json::Object &&Properties) : Type(T_Object) {
     create<json::Object>(std::move(Properties));
   }
-  // Strings: types with value semantics.
-  Value(std::string &&V) : Type(T_String) { create<std::string>(std::move(V)); }
-  Value(const std::string &V) : Type(T_String) { create<std::string>(V); }
-  Value(const llvm::SmallVectorImpl<char> &V) : Type(T_String) {
-    create<std::string>(V.begin(), V.end());
+  // Strings: types with value semantics. Must be valid UTF-8.
+  Value(std::string V) : Type(T_String) {
+    if (LLVM_UNLIKELY(!isUTF8(V))) {
+      assert(false && "Invalid UTF-8 in value used as JSON");
+      V = fixUTF8(std::move(V));
+    }
+    create<std::string>(std::move(V));
   }
+  Value(const llvm::SmallVectorImpl<char> &V)
+      : Value(std::string(V.begin(), V.end())){};
   Value(const llvm::formatv_object_base &V) : Value(V.str()){};
-  // Strings: types with reference semantics.
-  Value(llvm::StringRef V) : Type(T_StringRef) { create<llvm::StringRef>(V); }
-  Value(const char *V) : Type(T_StringRef) { create<llvm::StringRef>(V); }
+  // Strings: types with reference semantics. Must be valid UTF-8.
+  Value(StringRef V) : Type(T_StringRef) {
+    create<llvm::StringRef>(V);
+    if (LLVM_UNLIKELY(!isUTF8(V))) {
+      assert(false && "Invalid UTF-8 in value used as JSON");
+      *this = Value(fixUTF8(V));
+    }
+  }
+  Value(const char *V) : Value(StringRef(V)) {}
   Value(std::nullptr_t) : Type(T_Null) {}
   // Boolean (disallow implicit conversions).
   // (The last template parameter is a dummy to keep templates distinct.)
@@ -449,13 +483,23 @@ llvm::raw_ostream &operator<<(llvm::raw_
 /// ObjectKey is used to capture keys in Object. Like Value but:
 ///   - only strings are allowed
 ///   - it's optimized for the string literal case (Owned == nullptr)
+/// Like Value, strings must be UTF-8. See isUTF8 documentation for details.
 class ObjectKey {
 public:
-  ObjectKey(const char *S) : Data(S) {}
-  ObjectKey(llvm::StringRef S) : Data(S) {}
-  ObjectKey(std::string &&V)
-      : Owned(new std::string(std::move(V))), Data(*Owned) {}
-  ObjectKey(const std::string &V) : Owned(new std::string(V)), Data(*Owned) {}
+  ObjectKey(const char *S) : ObjectKey(StringRef(S)) {}
+  ObjectKey(std::string S) : Owned(new std::string(std::move(S))) {
+    if (LLVM_UNLIKELY(!isUTF8(*Owned))) {
+      assert(false && "Invalid UTF-8 in value used as JSON");
+      *Owned = fixUTF8(std::move(*Owned));
+    }
+    Data = *Owned;
+  }
+  ObjectKey(llvm::StringRef S) : Data(S) {
+    if (LLVM_UNLIKELY(!isUTF8(Data))) {
+      assert(false && "Invalid UTF-8 in value used as JSON");
+      *this = ObjectKey(fixUTF8(S));
+    }
+  }
   ObjectKey(const llvm::SmallVectorImpl<char> &V)
       : ObjectKey(std::string(V.begin(), V.end())) {}
   ObjectKey(const llvm::formatv_object_base &V) : ObjectKey(V.str()) {}

Modified: llvm/trunk/lib/Support/JSON.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Support/JSON.cpp?rev=336657&r1=336656&r2=336657&view=diff
==============================================================================
--- llvm/trunk/lib/Support/JSON.cpp (original)
+++ llvm/trunk/lib/Support/JSON.cpp Tue Jul 10 04:51:26 2018
@@ -8,6 +8,7 @@
 //===---------------------------------------------------------------------===//
 
 #include "llvm/Support/JSON.h"
+#include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Format.h"
 #include <cctype>
 
@@ -199,6 +200,14 @@ public:
   Parser(StringRef JSON)
       : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
 
+  bool checkUTF8() {
+    size_t ErrOffset;
+    if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
+      return true;
+    P = Start + ErrOffset; // For line/column calculation.
+    return parseError("Invalid UTF-8 sequence");
+  }
+
   bool parseValue(Value &Out);
 
   bool assertEnd() {
@@ -458,7 +467,7 @@ bool Parser::parseUnicode(std::string &O
 
     // Case 3: it's a leading surrogate. We expect a trailing one next.
     // Case 3a: there's no trailing \u escape. Don't advance in the stream.
-    if (!LLVM_LIKELY(P + 2 <= End && *P == '\\' && *(P + 1) == 'u')) {
+    if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
       Invalid(); // Leading surrogate was unpaired.
       return true;
     }
@@ -496,9 +505,10 @@ bool Parser::parseError(const char *Msg)
 Expected<Value> parse(StringRef JSON) {
   Parser P(JSON);
   Value E = nullptr;
-  if (P.parseValue(E))
-    if (P.assertEnd())
-      return std::move(E);
+  if (P.checkUTF8())
+    if (P.parseValue(E))
+      if (P.assertEnd())
+        return std::move(E);
   return P.takeError();
 }
 char ParseError::ID = 0;
@@ -514,6 +524,37 @@ static std::vector<const Object::value_t
   return Elements;
 }
 
+bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
+  // Fast-path for ASCII, which is valid UTF-8.
+  if (LLVM_LIKELY(isASCII(S)))
+    return true;
+
+  const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data;
+  if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size())))
+    return true;
+
+  if (ErrOffset)
+    *ErrOffset = Rest - Data;
+  return false;
+}
+
+std::string fixUTF8(llvm::StringRef S) {
+  // This isn't particularly efficient, but is only for error-recovery.
+  std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices.
+  const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data());
+  UTF32 *Out32 = Codepoints.data();
+  ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(),
+                     lenientConversion);
+  Codepoints.resize(Out32 - Codepoints.data());
+  std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice
+  const UTF32 *In32 = Codepoints.data();
+  UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]);
+  ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(),
+                     strictConversion);
+  Res.resize(reinterpret_cast<char *>(Out8) - Res.data());
+  return Res;
+}
+
 } // namespace json
 } // namespace llvm
 

Modified: llvm/trunk/unittests/Support/JSONTest.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/unittests/Support/JSONTest.cpp?rev=336657&r1=336656&r2=336657&view=diff
==============================================================================
--- llvm/trunk/unittests/Support/JSONTest.cpp (original)
+++ llvm/trunk/unittests/Support/JSONTest.cpp Tue Jul 10 04:51:26 2018
@@ -27,6 +27,14 @@ TEST(JSONTest, Types) {
   EXPECT_EQ(R"("foo")", s("foo"));
   EXPECT_EQ("[1,2,3]", s({1, 2, 3}));
   EXPECT_EQ(R"({"x":10,"y":20})", s(Object{{"x", 10}, {"y", 20}}));
+
+#ifdef NDEBUG
+  EXPECT_EQ(R"("ï¿½ï¿½")", s("\xC0\x80"));
+  EXPECT_EQ(R"({"ï¿½ï¿½":0})", s(Object{{"\xC0\x80", 0}}));
+#else
+  EXPECT_DEATH(s("\xC0\x80"), "Invalid UTF-8");
+  EXPECT_DEATH(s(Object{{"\xC0\x80", 0}}), "Invalid UTF-8");
+#endif
 }
 
 TEST(JSONTest, Constructors) {
@@ -181,6 +189,31 @@ TEST(JSONTest, ParseErrors) {
   "valid": 1,
   invalid: 2
 })");
+  ExpectErr("Invalid UTF-8 sequence", "\"\xC0\x80\""); // Overlong (Modified UTF-8) null
+}
+
+// Direct tests of isUTF8 and fixUTF8. Internal uses are also tested elsewhere.
+TEST(JSONTest, UTF8) {
+  for (const char *Valid : {
+           "this is ASCII text",
+           "thïs tëxt häs BMP chäräctërs",
+           "𐌶𐌰L𐌾𐍈 C𐍈𐌼𐌴𐍃",
+       }) {
+    EXPECT_TRUE(isUTF8(Valid)) << Valid;
+    EXPECT_EQ(fixUTF8(Valid), Valid);
+  }
+  for (auto Invalid : std::vector<std::pair<const char *, const char *>>{
+           {"lone trailing \x81\x82 bytes", "lone trailing ï¿½ï¿½ bytes"},
+           {"missing trailing \xD0 bytes", "missing trailing ï¿½ bytes"},
+           {"truncated character \xD0", "truncated character ï¿½"},
+           {"not \xC1\x80 the \xE0\x9f\xBF shortest \xF0\x83\x83\x83 encoding",
+            "not ï¿½ï¿½ the ï¿½ï¿½ï¿½ shortest ï¿½ï¿½ï¿½ï¿½ encoding"},
+           {"too \xF9\x80\x80\x80\x80 long", "too ï¿½ï¿½ï¿½ï¿½ï¿½ long"},
+           {"surrogate \xED\xA0\x80 invalid \xF4\x90\x80\x80",
+            "surrogate ï¿½ï¿½ï¿½ invalid ï¿½ï¿½ï¿½ï¿½"}}) {
+    EXPECT_FALSE(isUTF8(Invalid.first)) << Invalid.first;
+    EXPECT_EQ(fixUTF8(Invalid.first), Invalid.second);
+  }
 }
 
 TEST(JSONTest, Inspection) {