[libcxx-commits] [flang] [lldb] [clang] [llvm] [lld] [clang-tools-extra] [libcxx] [compiler-rt] [YAMLParser] Unfold multi-line scalar values (PR #70898)

Mon Nov 6 14:15:38 PST 2023

================
@@ -2030,187 +2030,219 @@ bool Node::failed() const {
 }
 
 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
-  // TODO: Handle newlines properly. We need to remove leading whitespace.
-  if (Value[0] == '"') { // Double quoted.
-    // Pull off the leading and trailing "s.
-    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
-    // Search for characters that would require unescaping the value.
-    StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
-    if (i != StringRef::npos)
-      return unescapeDoubleQuoted(UnquotedValue, i, Storage);
+  if (Value[0] == '"')
+    return getDoubleQuotedValue(Value, Storage);
+  if (Value[0] == '\'')
+    return getSingleQuotedValue(Value, Storage);
+  return getPlainValue(Value, Storage);
+}
+
+static StringRef
+parseScalarValue(StringRef UnquotedValue, SmallVectorImpl<char> &Storage,
+                 StringRef LookupChars,
+                 std::function<StringRef(StringRef, SmallVectorImpl<char> &)>
+                     UnescapeCallback) {
+  size_t I = UnquotedValue.find_first_of(LookupChars);
+  if (I == StringRef::npos)
     return UnquotedValue;
-  } else if (Value[0] == '\'') { // Single quoted.
-    // Pull off the leading and trailing 's.
-    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
-    StringRef::size_type i = UnquotedValue.find('\'');
-    if (i != StringRef::npos) {
-      // We're going to need Storage.
-      Storage.clear();
-      Storage.reserve(UnquotedValue.size());
-      for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
-        StringRef Valid(UnquotedValue.begin(), i);
-        llvm::append_range(Storage, Valid);
-        Storage.push_back('\'');
-        UnquotedValue = UnquotedValue.substr(i + 2);
-      }
-      llvm::append_range(Storage, UnquotedValue);
-      return StringRef(Storage.begin(), Storage.size());
-    }
-    return UnquotedValue;
-  }
-  // Plain.
-  // Trim whitespace ('b-char' and 's-white').
-  // NOTE: Alternatively we could change the scanner to not include whitespace
-  //       here in the first place.
-  return Value.rtrim("\x0A\x0D\x20\x09");
-}
 
-StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
-                                          , StringRef::size_type i
-                                          , SmallVectorImpl<char> &Storage)
-                                          const {
-  // Use Storage to build proper value.
   Storage.clear();
   Storage.reserve(UnquotedValue.size());
-  for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
-    // Insert all previous chars into Storage.
-    StringRef Valid(UnquotedValue.begin(), i);
-    llvm::append_range(Storage, Valid);
-    // Chop off inserted chars.
-    UnquotedValue = UnquotedValue.substr(i);
-
-    assert(!UnquotedValue.empty() && "Can't be empty!");
-
-    // Parse escape or line break.
-    switch (UnquotedValue[0]) {
-    case '\r':
-    case '\n':
-      Storage.push_back('\n');
-      if (   UnquotedValue.size() > 1
-          && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
-        UnquotedValue = UnquotedValue.substr(1);
-      UnquotedValue = UnquotedValue.substr(1);
-      break;
-    default:
-      if (UnquotedValue.size() == 1) {
-        Token T;
-        T.Range = StringRef(UnquotedValue.begin(), 1);
-        setError("Unrecognized escape code", T);
-        return "";
-      }
-      UnquotedValue = UnquotedValue.substr(1);
-      switch (UnquotedValue[0]) {
-      default: {
-          Token T;
-          T.Range = StringRef(UnquotedValue.begin(), 1);
-          setError("Unrecognized escape code", T);
-          return "";
-        }
-      case '\r':
-      case '\n':
-        // Remove the new line.
-        if (   UnquotedValue.size() > 1
-            && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
-          UnquotedValue = UnquotedValue.substr(1);
-        // If this was just a single byte newline, it will get skipped
-        // below.
-        break;
-      case '0':
-        Storage.push_back(0x00);
-        break;
-      case 'a':
-        Storage.push_back(0x07);
-        break;
-      case 'b':
-        Storage.push_back(0x08);
-        break;
-      case 't':
-      case 0x09:
-        Storage.push_back(0x09);
-        break;
-      case 'n':
-        Storage.push_back(0x0A);
-        break;
-      case 'v':
-        Storage.push_back(0x0B);
-        break;
-      case 'f':
-        Storage.push_back(0x0C);
-        break;
-      case 'r':
-        Storage.push_back(0x0D);
-        break;
-      case 'e':
-        Storage.push_back(0x1B);
-        break;
+  char LastNewLineAddedAs = '\0';
+  for (; I != StringRef::npos; I = UnquotedValue.find_first_of(LookupChars)) {
+    if (UnquotedValue[I] != '\x0D' && UnquotedValue[I] != '\x0A') {
----------------
slinder1 wrote:

Is there a reason to change from the "mnemonic" escape to a numeric code-point escape? E.g. is there a case where `'\r' != '\x0D'`, or is this just a stylistic change?

If there is a technical rationale, can we at least make the value symbolic with a `constexpr char CarriageReturn = '\x0D';` somewhere? The same goes for all the other magic numbers here.

https://github.com/llvm/llvm-project/pull/70898