[clang-tools-extra] [llvm] [lld] [compiler-rt] [libcxx] [flang] [lldb] [clang] [YAMLParser] Unfold multi-line scalar values (PR #70898)

Mon Nov 6 14:15:36 PST 2023

================
@@ -2030,187 +2030,219 @@ bool Node::failed() const {
 }
 
 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
-  // TODO: Handle newlines properly. We need to remove leading whitespace.
-  if (Value[0] == '"') { // Double quoted.
-    // Pull off the leading and trailing "s.
-    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
-    // Search for characters that would require unescaping the value.
-    StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
-    if (i != StringRef::npos)
-      return unescapeDoubleQuoted(UnquotedValue, i, Storage);
+  if (Value[0] == '"')
+    return getDoubleQuotedValue(Value, Storage);
+  if (Value[0] == '\'')
+    return getSingleQuotedValue(Value, Storage);
+  return getPlainValue(Value, Storage);
+}
+
+static StringRef
+parseScalarValue(StringRef UnquotedValue, SmallVectorImpl<char> &Storage,
+                 StringRef LookupChars,
+                 std::function<StringRef(StringRef, SmallVectorImpl<char> &)>
+                     UnescapeCallback) {
+  size_t I = UnquotedValue.find_first_of(LookupChars);
+  if (I == StringRef::npos)
     return UnquotedValue;
-  } else if (Value[0] == '\'') { // Single quoted.
-    // Pull off the leading and trailing 's.
-    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
-    StringRef::size_type i = UnquotedValue.find('\'');
-    if (i != StringRef::npos) {
-      // We're going to need Storage.
-      Storage.clear();
-      Storage.reserve(UnquotedValue.size());
-      for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
-        StringRef Valid(UnquotedValue.begin(), i);
-        llvm::append_range(Storage, Valid);
-        Storage.push_back('\'');
-        UnquotedValue = UnquotedValue.substr(i + 2);
-      }
-      llvm::append_range(Storage, UnquotedValue);
-      return StringRef(Storage.begin(), Storage.size());
-    }
-    return UnquotedValue;
-  }
-  // Plain.
-  // Trim whitespace ('b-char' and 's-white').
-  // NOTE: Alternatively we could change the scanner to not include whitespace
-  //       here in the first place.
-  return Value.rtrim("\x0A\x0D\x20\x09");
-}
 
-StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
-                                          , StringRef::size_type i
-                                          , SmallVectorImpl<char> &Storage)
-                                          const {
-  // Use Storage to build proper value.
   Storage.clear();
   Storage.reserve(UnquotedValue.size());
-  for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
-    // Insert all previous chars into Storage.
-    StringRef Valid(UnquotedValue.begin(), i);
-    llvm::append_range(Storage, Valid);
-    // Chop off inserted chars.
-    UnquotedValue = UnquotedValue.substr(i);
-
-    assert(!UnquotedValue.empty() && "Can't be empty!");
-
-    // Parse escape or line break.
-    switch (UnquotedValue[0]) {
-    case '\r':
-    case '\n':
-      Storage.push_back('\n');
-      if (   UnquotedValue.size() > 1
-          && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
-        UnquotedValue = UnquotedValue.substr(1);
-      UnquotedValue = UnquotedValue.substr(1);
-      break;
-    default:
-      if (UnquotedValue.size() == 1) {
-        Token T;
-        T.Range = StringRef(UnquotedValue.begin(), 1);
-        setError("Unrecognized escape code", T);
-        return "";
-      }
-      UnquotedValue = UnquotedValue.substr(1);
-      switch (UnquotedValue[0]) {
-      default: {
-          Token T;
-          T.Range = StringRef(UnquotedValue.begin(), 1);
-          setError("Unrecognized escape code", T);
-          return "";
-        }
-      case '\r':
-      case '\n':
-        // Remove the new line.
-        if (   UnquotedValue.size() > 1
-            && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
-          UnquotedValue = UnquotedValue.substr(1);
-        // If this was just a single byte newline, it will get skipped
-        // below.
-        break;
-      case '0':
-        Storage.push_back(0x00);
-        break;
-      case 'a':
-        Storage.push_back(0x07);
-        break;
-      case 'b':
-        Storage.push_back(0x08);
-        break;
-      case 't':
-      case 0x09:
-        Storage.push_back(0x09);
-        break;
-      case 'n':
-        Storage.push_back(0x0A);
-        break;
-      case 'v':
-        Storage.push_back(0x0B);
-        break;
-      case 'f':
-        Storage.push_back(0x0C);
-        break;
-      case 'r':
-        Storage.push_back(0x0D);
-        break;
-      case 'e':
-        Storage.push_back(0x1B);
-        break;
+  char LastNewLineAddedAs = '\0';
+  for (; I != StringRef::npos; I = UnquotedValue.find_first_of(LookupChars)) {
+    if (UnquotedValue[I] != '\x0D' && UnquotedValue[I] != '\x0A') {
+      llvm::append_range(Storage, UnquotedValue.take_front(I));
+      UnquotedValue = UnescapeCallback(UnquotedValue.drop_front(I), Storage);
+      LastNewLineAddedAs = '\0';
+      continue;
+    }
+    if (size_t LastNonSWhite = UnquotedValue.find_last_not_of("\x20\x09", I);
+        LastNonSWhite != StringRef::npos) {
+      llvm::append_range(Storage, UnquotedValue.take_front(LastNonSWhite + 1));
+      Storage.push_back(' ');
+      LastNewLineAddedAs = ' ';
+    } else {
+      // Note: we can't just check if the last character in Storage is ' ',
+      // '\n', or something else; that would give a wrong result for double
+      // quoted values containing an escaped space character before a new-line
+      // character.
+      switch (LastNewLineAddedAs) {
       case ' ':
-        Storage.push_back(0x20);
-        break;
-      case '"':
-        Storage.push_back(0x22);
-        break;
-      case '/':
-        Storage.push_back(0x2F);
-        break;
-      case '\\':
-        Storage.push_back(0x5C);
-        break;
-      case 'N':
-        encodeUTF8(0x85, Storage);
+        assert(!Storage.empty() && Storage.back() == ' ');
+        Storage.back() = '\n';
+        LastNewLineAddedAs = '\n';
         break;
-      case '_':
-        encodeUTF8(0xA0, Storage);
-        break;
-      case 'L':
-        encodeUTF8(0x2028, Storage);
+      case '\n':
+        assert(!Storage.empty() && Storage.back() == '\n');
+        Storage.push_back('\n');
         break;
-      case 'P':
-        encodeUTF8(0x2029, Storage);
+      default:
+        Storage.push_back(' ');
+        LastNewLineAddedAs = ' ';
         break;
-      case 'x': {
-          if (UnquotedValue.size() < 3)
-            // TODO: Report error.
-            break;
-          unsigned int UnicodeScalarValue;
-          if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
-            // TODO: Report error.
-            UnicodeScalarValue = 0xFFFD;
-          encodeUTF8(UnicodeScalarValue, Storage);
-          UnquotedValue = UnquotedValue.substr(2);
-          break;
-        }
-      case 'u': {
-          if (UnquotedValue.size() < 5)
-            // TODO: Report error.
-            break;
-          unsigned int UnicodeScalarValue;
-          if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
-            // TODO: Report error.
-            UnicodeScalarValue = 0xFFFD;
-          encodeUTF8(UnicodeScalarValue, Storage);
-          UnquotedValue = UnquotedValue.substr(4);
-          break;
-        }
-      case 'U': {
-          if (UnquotedValue.size() < 9)
-            // TODO: Report error.
-            break;
-          unsigned int UnicodeScalarValue;
-          if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
-            // TODO: Report error.
-            UnicodeScalarValue = 0xFFFD;
-          encodeUTF8(UnicodeScalarValue, Storage);
-          UnquotedValue = UnquotedValue.substr(8);
-          break;
-        }
       }
-      UnquotedValue = UnquotedValue.substr(1);
     }
+    // Handle Windows-style EOL
+    if (UnquotedValue.substr(I, 2) == "\x0D\x0A")
+      I++;
+    UnquotedValue = UnquotedValue.drop_front(I + 1).ltrim("\x20\x09");
   }
   llvm::append_range(Storage, UnquotedValue);
   return StringRef(Storage.begin(), Storage.size());
 }
 
+StringRef
+ScalarNode::getDoubleQuotedValue(StringRef RawValue,
+                                 SmallVectorImpl<char> &Storage) const {
+  assert(RawValue.size() >= 2 && RawValue.front() == '"' &&
+         RawValue.back() == '"');
+  StringRef UnquotedValue = RawValue.substr(1, RawValue.size() - 2);
+
+  auto UnescapeFunc = [this](StringRef UnquotedValue,
+                             SmallVectorImpl<char> &Storage) {
+    assert(UnquotedValue.take_front(1) == "\\");
+    if (UnquotedValue.size() == 1) {
+      Token T;
+      T.Range = UnquotedValue;
+      this->setError("Unrecognized escape code", T);
+      Storage.clear();
+      return StringRef();
+    }
+    UnquotedValue = UnquotedValue.drop_front(1);
+    switch (UnquotedValue[0]) {
+    default: {
+      Token T;
+      T.Range = UnquotedValue.take_front(1);
+      setError("Unrecognized escape code", T);
----------------
slinder1 wrote:

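I'd just standardize on the implicit form you used here (a plain `setError(...)` call), rather than mixing it with the explicit `this->setError(...)` used a few lines above in the same lambda.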

https://github.com/llvm/llvm-project/pull/70898