[llvm-branch-commits] [llvm] [YAMLParser] Unfold multi-line scalar values (PR #70898)

Igor Kudrin via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Nov 9 15:56:56 PST 2023


https://github.com/igorkudrin updated https://github.com/llvm/llvm-project/pull/70898

>From 37ab3fff62b1a3aa373fd513745b1c2b91b1b865 Mon Sep 17 00:00:00 2001
From: Igor Kudrin <ikudrin at accesssoftek.com>
Date: Tue, 7 Nov 2023 18:42:02 -0800
Subject: [PATCH] [YAMLParser] Unfold multi-line scalar values

Long scalar values can be split into multiple lines to improve
readability. The rules are described in Section 6.5. "Line Folding",
https://yaml.org/spec/1.2.2/#65-line-folding. In addition, for flow
scalar styles, the Spec states that "All leading and trailing white
space characters on each line are excluded from the content",
https://yaml.org/spec/1.2.2/#73-flow-scalar-styles.

The patch implements these unfolding rules for double-quoted,
single-quoted, and plain scalars.
---
 llvm/include/llvm/Support/YAMLParser.h    |   9 +-
 llvm/lib/Support/YAMLParser.cpp           | 373 ++++++++++++----------
 llvm/test/YAMLParser/spec-05-13.test      |   2 +-
 llvm/test/YAMLParser/spec-05-14.test      |   2 +-
 llvm/test/YAMLParser/spec-09-01.test      |   4 +-
 llvm/test/YAMLParser/spec-09-02.test      |  18 +-
 llvm/test/YAMLParser/spec-09-03.test      |   6 +-
 llvm/test/YAMLParser/spec-09-04.test      |   2 +-
 llvm/test/YAMLParser/spec-09-05.test      |   6 +-
 llvm/test/YAMLParser/spec-09-07.test      |   4 +-
 llvm/test/YAMLParser/spec-09-08.test      |   8 +-
 llvm/test/YAMLParser/spec-09-09.test      |   6 +-
 llvm/test/YAMLParser/spec-09-10.test      |   2 +-
 llvm/test/YAMLParser/spec-09-11.test      |   4 +-
 llvm/test/YAMLParser/spec-09-13.test      |   4 +-
 llvm/test/YAMLParser/spec-09-16.test      |   8 +-
 llvm/test/YAMLParser/spec-09-17.test      |   2 +-
 llvm/test/YAMLParser/spec-10-02.test      |   6 +-
 llvm/test/YAMLParser/spec1.2-07-05.test   |   2 +-
 llvm/test/YAMLParser/spec1.2-07-06.test   |   2 +-
 llvm/test/YAMLParser/spec1.2-07-09.test   |   2 +-
 llvm/test/YAMLParser/spec1.2-07-12.test   |   2 +-
 llvm/unittests/Support/YAMLParserTest.cpp | 102 ++++++
 23 files changed, 376 insertions(+), 200 deletions(-)

diff --git a/llvm/include/llvm/Support/YAMLParser.h b/llvm/include/llvm/Support/YAMLParser.h
index f4767641647c217..9d95a1e13a0dff4 100644
--- a/llvm/include/llvm/Support/YAMLParser.h
+++ b/llvm/include/llvm/Support/YAMLParser.h
@@ -240,9 +240,14 @@ class ScalarNode final : public Node {
 private:
   StringRef Value;
 
-  StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
-                                 StringRef::size_type Start,
+  StringRef getDoubleQuotedValue(StringRef UnquotedValue,
                                  SmallVectorImpl<char> &Storage) const;
+
+  static StringRef getSingleQuotedValue(StringRef RawValue,
+                                        SmallVectorImpl<char> &Storage);
+
+  static StringRef getPlainValue(StringRef RawValue,
+                                 SmallVectorImpl<char> &Storage);
 };
 
 /// A block scalar node is an opaque datum that can be presented as a
diff --git a/llvm/lib/Support/YAMLParser.cpp b/llvm/lib/Support/YAMLParser.cpp
index b47cb3ae3b44a75..fdd0ed6e682eb5e 100644
--- a/llvm/lib/Support/YAMLParser.cpp
+++ b/llvm/lib/Support/YAMLParser.cpp
@@ -2030,184 +2030,229 @@ bool Node::failed() const {
 }
 
 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
-  // TODO: Handle newlines properly. We need to remove leading whitespace.
-  if (Value[0] == '"') { // Double quoted.
-    // Pull off the leading and trailing "s.
-    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
-    // Search for characters that would require unescaping the value.
-    StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
-    if (i != StringRef::npos)
-      return unescapeDoubleQuoted(UnquotedValue, i, Storage);
+  if (Value[0] == '"')
+    return getDoubleQuotedValue(Value, Storage);
+  if (Value[0] == '\'')
+    return getSingleQuotedValue(Value, Storage);
+  return getPlainValue(Value, Storage);
+}
+
+/// parseScalarValue - A common parsing routine for all flow scalar styles.
+/// It handles line break characters by itself, adds regular content characters
+/// to the result, and forwards escape sequences to the provided routine for
+/// style-specific processing.
+///
+/// \param UnquotedValue - An input value without quotation marks.
+/// \param Storage - A storage for the result if the input value is multiline or
+/// contains escaped characters.
+/// \param LookupChars - A set of special characters to search in the input
+/// string. Should include line break characters and the escape character
+/// specific to the scalar style being processed, if any.
+/// \param UnescapeCallback - This is called when the escape character is found
+/// in the input.
+/// \returns - The unfolded and unescaped value.
+static StringRef
+parseScalarValue(StringRef UnquotedValue, SmallVectorImpl<char> &Storage,
+                 StringRef LookupChars,
+                 std::function<StringRef(StringRef, SmallVectorImpl<char> &)>
+                     UnescapeCallback) {
+  size_t I = UnquotedValue.find_first_of(LookupChars);
+  if (I == StringRef::npos)
     return UnquotedValue;
-  } else if (Value[0] == '\'') { // Single quoted.
-    // Pull off the leading and trailing 's.
-    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
-    StringRef::size_type i = UnquotedValue.find('\'');
-    if (i != StringRef::npos) {
-      // We're going to need Storage.
-      Storage.clear();
-      Storage.reserve(UnquotedValue.size());
-      for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
-        StringRef Valid(UnquotedValue.begin(), i);
-        llvm::append_range(Storage, Valid);
-        Storage.push_back('\'');
-        UnquotedValue = UnquotedValue.substr(i + 2);
+
+  Storage.clear();
+  Storage.reserve(UnquotedValue.size());
+  char LastNewLineAddedAs = '\0';
+  for (; I != StringRef::npos; I = UnquotedValue.find_first_of(LookupChars)) {
+    if (UnquotedValue[I] != '\r' && UnquotedValue[I] != '\n') {
+      llvm::append_range(Storage, UnquotedValue.take_front(I));
+      UnquotedValue = UnescapeCallback(UnquotedValue.drop_front(I), Storage);
+      LastNewLineAddedAs = '\0';
+      continue;
+    }
+    if (size_t LastNonSWhite = UnquotedValue.find_last_not_of(" \t", I);
+        LastNonSWhite != StringRef::npos) {
+      llvm::append_range(Storage, UnquotedValue.take_front(LastNonSWhite + 1));
+      Storage.push_back(' ');
+      LastNewLineAddedAs = ' ';
+    } else {
+      // Note: we can't just check if the last character in Storage is ' ',
+      // '\n', or something else; that would give a wrong result for double
+      // quoted values containing an escaped space character before a new-line
+      // character.
+      switch (LastNewLineAddedAs) {
+      case ' ':
+        assert(!Storage.empty() && Storage.back() == ' ');
+        Storage.back() = '\n';
+        LastNewLineAddedAs = '\n';
+        break;
+      case '\n':
+        assert(!Storage.empty() && Storage.back() == '\n');
+        Storage.push_back('\n');
+        break;
+      default:
+        Storage.push_back(' ');
+        LastNewLineAddedAs = ' ';
+        break;
       }
-      llvm::append_range(Storage, UnquotedValue);
-      return StringRef(Storage.begin(), Storage.size());
     }
-    return UnquotedValue;
+    // Handle Windows-style EOL
+    if (UnquotedValue.substr(I, 2) == "\r\n")
+      I++;
+    UnquotedValue = UnquotedValue.drop_front(I + 1).ltrim(" \t");
   }
-  // Plain.
-  // Trim whitespace ('b-char' and 's-white').
-  // NOTE: Alternatively we could change the scanner to not include whitespace
-  //       here in the first place.
-  return Value.rtrim("\x0A\x0D\x20\x09");
+  llvm::append_range(Storage, UnquotedValue);
+  return StringRef(Storage.begin(), Storage.size());
 }
 
-StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
-                                          , StringRef::size_type i
-                                          , SmallVectorImpl<char> &Storage)
-                                          const {
-  // Use Storage to build proper value.
-  Storage.clear();
-  Storage.reserve(UnquotedValue.size());
-  for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
-    // Insert all previous chars into Storage.
-    StringRef Valid(UnquotedValue.begin(), i);
-    llvm::append_range(Storage, Valid);
-    // Chop off inserted chars.
-    UnquotedValue = UnquotedValue.substr(i);
-
-    assert(!UnquotedValue.empty() && "Can't be empty!");
-
-    // Parse escape or line break.
+StringRef
+ScalarNode::getDoubleQuotedValue(StringRef RawValue,
+                                 SmallVectorImpl<char> &Storage) const {
+  assert(RawValue.size() >= 2 && RawValue.front() == '"' &&
+         RawValue.back() == '"');
+  StringRef UnquotedValue = RawValue.substr(1, RawValue.size() - 2);
+
+  auto UnescapeFunc = [this](StringRef UnquotedValue,
+                             SmallVectorImpl<char> &Storage) {
+    assert(UnquotedValue.take_front(1) == "\\");
+    if (UnquotedValue.size() == 1) {
+      Token T;
+      T.Range = UnquotedValue;
+      setError("Unrecognized escape code", T);
+      Storage.clear();
+      return StringRef();
+    }
+    UnquotedValue = UnquotedValue.drop_front(1);
     switch (UnquotedValue[0]) {
+    default: {
+      Token T;
+      T.Range = UnquotedValue.take_front(1);
+      setError("Unrecognized escape code", T);
+      Storage.clear();
+      return StringRef();
+    }
     case '\r':
+      // Shrink the Windows-style EOL.
+      if (UnquotedValue.size() >= 2 && UnquotedValue[1] == '\n')
+        UnquotedValue = UnquotedValue.drop_front(1);
+      [[fallthrough]];
     case '\n':
-      Storage.push_back('\n');
-      if (   UnquotedValue.size() > 1
-          && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
-        UnquotedValue = UnquotedValue.substr(1);
-      UnquotedValue = UnquotedValue.substr(1);
+      return UnquotedValue.drop_front(1).ltrim(" \t");
+    case '0':
+      Storage.push_back(0x00);
       break;
-    default:
-      if (UnquotedValue.size() == 1) {
-        Token T;
-        T.Range = StringRef(UnquotedValue.begin(), 1);
-        setError("Unrecognized escape code", T);
-        return "";
-      }
-      UnquotedValue = UnquotedValue.substr(1);
-      switch (UnquotedValue[0]) {
-      default: {
-          Token T;
-          T.Range = StringRef(UnquotedValue.begin(), 1);
-          setError("Unrecognized escape code", T);
-          return "";
-        }
-      case '\r':
-        // Shrink the Windows-style EOL.
-        if (UnquotedValue.size() >= 2 && UnquotedValue[1] == '\n')
-          UnquotedValue = UnquotedValue.drop_front(1);
-        [[fallthrough]];
-      case '\n':
-        UnquotedValue = UnquotedValue.drop_front(1).ltrim(" \t");
-        continue;
-      case '0':
-        Storage.push_back(0x00);
-        break;
-      case 'a':
-        Storage.push_back(0x07);
-        break;
-      case 'b':
-        Storage.push_back(0x08);
-        break;
-      case 't':
-      case 0x09:
-        Storage.push_back(0x09);
-        break;
-      case 'n':
-        Storage.push_back(0x0A);
-        break;
-      case 'v':
-        Storage.push_back(0x0B);
-        break;
-      case 'f':
-        Storage.push_back(0x0C);
-        break;
-      case 'r':
-        Storage.push_back(0x0D);
-        break;
-      case 'e':
-        Storage.push_back(0x1B);
-        break;
-      case ' ':
-        Storage.push_back(0x20);
-        break;
-      case '"':
-        Storage.push_back(0x22);
-        break;
-      case '/':
-        Storage.push_back(0x2F);
-        break;
-      case '\\':
-        Storage.push_back(0x5C);
-        break;
-      case 'N':
-        encodeUTF8(0x85, Storage);
-        break;
-      case '_':
-        encodeUTF8(0xA0, Storage);
+    case 'a':
+      Storage.push_back(0x07);
+      break;
+    case 'b':
+      Storage.push_back(0x08);
+      break;
+    case 't':
+    case 0x09:
+      Storage.push_back(0x09);
+      break;
+    case 'n':
+      Storage.push_back(0x0A);
+      break;
+    case 'v':
+      Storage.push_back(0x0B);
+      break;
+    case 'f':
+      Storage.push_back(0x0C);
+      break;
+    case 'r':
+      Storage.push_back(0x0D);
+      break;
+    case 'e':
+      Storage.push_back(0x1B);
+      break;
+    case ' ':
+      Storage.push_back(0x20);
+      break;
+    case '"':
+      Storage.push_back(0x22);
+      break;
+    case '/':
+      Storage.push_back(0x2F);
+      break;
+    case '\\':
+      Storage.push_back(0x5C);
+      break;
+    case 'N':
+      encodeUTF8(0x85, Storage);
+      break;
+    case '_':
+      encodeUTF8(0xA0, Storage);
+      break;
+    case 'L':
+      encodeUTF8(0x2028, Storage);
+      break;
+    case 'P':
+      encodeUTF8(0x2029, Storage);
+      break;
+    case 'x': {
+      if (UnquotedValue.size() < 3)
+        // TODO: Report error.
         break;
-      case 'L':
-        encodeUTF8(0x2028, Storage);
+      unsigned int UnicodeScalarValue;
+      if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
+        // TODO: Report error.
+        UnicodeScalarValue = 0xFFFD;
+      encodeUTF8(UnicodeScalarValue, Storage);
+      return UnquotedValue.drop_front(3);
+    }
+    case 'u': {
+      if (UnquotedValue.size() < 5)
+        // TODO: Report error.
         break;
-      case 'P':
-        encodeUTF8(0x2029, Storage);
+      unsigned int UnicodeScalarValue;
+      if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
+        // TODO: Report error.
+        UnicodeScalarValue = 0xFFFD;
+      encodeUTF8(UnicodeScalarValue, Storage);
+      return UnquotedValue.drop_front(5);
+    }
+    case 'U': {
+      if (UnquotedValue.size() < 9)
+        // TODO: Report error.
         break;
-      case 'x': {
-          if (UnquotedValue.size() < 3)
-            // TODO: Report error.
-            break;
-          unsigned int UnicodeScalarValue;
-          if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
-            // TODO: Report error.
-            UnicodeScalarValue = 0xFFFD;
-          encodeUTF8(UnicodeScalarValue, Storage);
-          UnquotedValue = UnquotedValue.substr(2);
-          break;
-        }
-      case 'u': {
-          if (UnquotedValue.size() < 5)
-            // TODO: Report error.
-            break;
-          unsigned int UnicodeScalarValue;
-          if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
-            // TODO: Report error.
-            UnicodeScalarValue = 0xFFFD;
-          encodeUTF8(UnicodeScalarValue, Storage);
-          UnquotedValue = UnquotedValue.substr(4);
-          break;
-        }
-      case 'U': {
-          if (UnquotedValue.size() < 9)
-            // TODO: Report error.
-            break;
-          unsigned int UnicodeScalarValue;
-          if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
-            // TODO: Report error.
-            UnicodeScalarValue = 0xFFFD;
-          encodeUTF8(UnicodeScalarValue, Storage);
-          UnquotedValue = UnquotedValue.substr(8);
-          break;
-        }
-      }
-      UnquotedValue = UnquotedValue.substr(1);
+      unsigned int UnicodeScalarValue;
+      if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
+        // TODO: Report error.
+        UnicodeScalarValue = 0xFFFD;
+      encodeUTF8(UnicodeScalarValue, Storage);
+      return UnquotedValue.drop_front(9);
     }
-  }
-  llvm::append_range(Storage, UnquotedValue);
-  return StringRef(Storage.begin(), Storage.size());
+    }
+    return UnquotedValue.drop_front(1);
+  };
+
+  return parseScalarValue(UnquotedValue, Storage, "\\\r\n", UnescapeFunc);
+}
+
+StringRef ScalarNode::getSingleQuotedValue(StringRef RawValue,
+                                           SmallVectorImpl<char> &Storage) {
+  assert(RawValue.size() >= 2 && RawValue.front() == '\'' &&
+         RawValue.back() == '\'');
+  StringRef UnquotedValue = RawValue.substr(1, RawValue.size() - 2);
+
+  auto UnescapeFunc = [](StringRef UnquotedValue,
+                         SmallVectorImpl<char> &Storage) {
+    assert(UnquotedValue.take_front(2) == "''");
+    Storage.push_back('\'');
+    return UnquotedValue.drop_front(2);
+  };
+
+  return parseScalarValue(UnquotedValue, Storage, "'\r\n", UnescapeFunc);
+}
+
+StringRef ScalarNode::getPlainValue(StringRef RawValue,
+                                    SmallVectorImpl<char> &Storage) {
+  // Trim trailing whitespace ('b-char' and 's-white').
+  // NOTE: Alternatively we could change the scanner to not include whitespace
+  //       here in the first place.
+  RawValue = RawValue.rtrim("\r\n \t");
+  return parseScalarValue(RawValue, Storage, "\r\n", nullptr);
 }
 
 Node *KeyValueNode::getKey() {
diff --git a/llvm/test/YAMLParser/spec-05-13.test b/llvm/test/YAMLParser/spec-05-13.test
index e7ec42a4aaa80d7..b2367a373ee454a 100644
--- a/llvm/test/YAMLParser/spec-05-13.test
+++ b/llvm/test/YAMLParser/spec-05-13.test
@@ -1,5 +1,5 @@
 # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
-# CHECK: "Text containing   \n  both space and\t\n  \ttab\tcharacters"
+# CHECK: "Text containing both space and tab\tcharacters"
 
   "Text containing   
   both space and	
diff --git a/llvm/test/YAMLParser/spec-05-14.test b/llvm/test/YAMLParser/spec-05-14.test
index 984f3721312ab63..87d699dbc027b8d 100644
--- a/llvm/test/YAMLParser/spec-05-14.test
+++ b/llvm/test/YAMLParser/spec-05-14.test
@@ -6,4 +6,4 @@
 \  \_ \N \L \P \
 \x41 \u0041 \U00000041"
 
-# CHECK: !!str "Fun with \\\n\" \a \b \e \f \n \r \t \v \0   \_ \N \L \P A A A"
+# CHECK: !!str "Fun with \\ \" \a \b \e \f \n \r \t \v \0   \_ \N \L \P A A A"
diff --git a/llvm/test/YAMLParser/spec-09-01.test b/llvm/test/YAMLParser/spec-09-01.test
index 2b5a6f31166ddf1..e552e7ca264404c 100644
--- a/llvm/test/YAMLParser/spec-09-01.test
+++ b/llvm/test/YAMLParser/spec-09-01.test
@@ -4,8 +4,8 @@
 # CHECK-NEXT:   : !!map {
 # CHECK-NEXT:     ? !!str "also simple"
 # CHECK-NEXT:     : !!str "value",
-# CHECK-NEXT:     ? !!str "not a\n  simple key"
-# CHECK-NEXT:     : !!str "any\n  value",
+# CHECK-NEXT:     ? !!str "not a simple key"
+# CHECK-NEXT:     : !!str "any value",
 # CHECK-NEXT:   },
 # CHECK-NEXT: }
 
diff --git a/llvm/test/YAMLParser/spec-09-02.test b/llvm/test/YAMLParser/spec-09-02.test
index 51ea61dd23273d3..99c836bf0047536 100644
--- a/llvm/test/YAMLParser/spec-09-02.test
+++ b/llvm/test/YAMLParser/spec-09-02.test
@@ -1,12 +1,24 @@
 # RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s --strict-whitespace
-# CHECK: "as space\n trimmed \n specific\L\n escaped\t\n none"
+# CHECK: "as space trimmed\nspecific\L escaped\t none"
 
 ## Note: The example was originally taken from Spec 1.1, but the parsing rules
 ## have been changed since then.
-## * The paragraph-separator character '\u2029' is excluded from line-break
+## * The line-separator character '\u2028' is no longer considered a line-break
+##   character, so the line "...specific\u2028\nescaped..." is now parsed as 
+##   "...specific\L escaped...".
+## * The paragraph-separator character '\u2029' is also excluded from line-break
 ##   characters, so the original sequence "escaped\t\\\u2029" is no longer
-##   considered valid. This is replaced by "escaped\t\\\n" in the test source.
+##   considered valid. This is replaced by "escaped\t\\\n" in the test source,
+##   so the output has changed as well.
 ## See https://yaml.org/spec/1.2.2/ext/changes/ for details.
+##
+## Note 2: Different parsers handle this corner case example differently.
+## * https://github.com/yaml/libyaml:
+##   "as space trimmed\nspecific\L\nescaped\t\nnone"
+## * https://github.com/yaml/yaml-reference-parser (parser-1.2):
+##   "as space trimmed\nspecific\L escaped\t none"
+## * https://github.com/yaml/yaml-reference-parser (parser-1.3):
+##   "as space trimmed\nspecific
 escaped\t none"
 
  "as space
  trimmed 
diff --git a/llvm/test/YAMLParser/spec-09-03.test b/llvm/test/YAMLParser/spec-09-03.test
index c656058b7ff8b3e..f067d1366f06918 100644
--- a/llvm/test/YAMLParser/spec-09-03.test
+++ b/llvm/test/YAMLParser/spec-09-03.test
@@ -1,8 +1,8 @@
 # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
 # CHECK:      !!seq [
-# CHECK-NEXT:   !!str "\n  last",
-# CHECK-NEXT:   !!str " \t\n  last",
-# CHECK-NEXT:   !!str " \tfirst\n  last",
+# CHECK-NEXT:   !!str " last",
+# CHECK-NEXT:   !!str " last",
+# CHECK-NEXT:   !!str " \tfirst last",
 # CHECK-NEXT: ]
 
 - "
diff --git a/llvm/test/YAMLParser/spec-09-04.test b/llvm/test/YAMLParser/spec-09-04.test
index e4f77ea83c7ac5f..79af877b38c8361 100644
--- a/llvm/test/YAMLParser/spec-09-04.test
+++ b/llvm/test/YAMLParser/spec-09-04.test
@@ -1,5 +1,5 @@
 # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
-# CHECK: "first\n \tinner 1\t\n  inner 2 last"
+# CHECK: "first inner 1  inner 2 last"
 
  "first
  	inner 1	
diff --git a/llvm/test/YAMLParser/spec-09-05.test b/llvm/test/YAMLParser/spec-09-05.test
index 5eb5b22f421d64b..4a748e609f1d692 100644
--- a/llvm/test/YAMLParser/spec-09-05.test
+++ b/llvm/test/YAMLParser/spec-09-05.test
@@ -1,8 +1,8 @@
 # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
 # CHECK:      !!seq [
-# CHECK-NEXT:   !!str "first\n  \t",
-# CHECK-NEXT:   !!str "first\n  \tlast",
-# CHECK-NEXT:   !!str "first\n inner\n  \tlast",
+# CHECK-NEXT:   !!str "first ",
+# CHECK-NEXT:   !!str "first\nlast",
+# CHECK-NEXT:   !!str "first inner  \tlast",
 # CHECK-NEXT: ]
 
 - "first
diff --git a/llvm/test/YAMLParser/spec-09-07.test b/llvm/test/YAMLParser/spec-09-07.test
index 71007e79b79d208..f397e2ca5f41672 100644
--- a/llvm/test/YAMLParser/spec-09-07.test
+++ b/llvm/test/YAMLParser/spec-09-07.test
@@ -4,8 +4,8 @@
 # CHECK-NEXT:   : !!map {
 # CHECK-NEXT:     ? !!str "also simple"
 # CHECK-NEXT:     : !!str "value",
-# CHECK-NEXT:     ? !!str "not a\n  simple key"
-# CHECK-NEXT:     : !!str "any\n  value",
+# CHECK-NEXT:     ? !!str "not a simple key"
+# CHECK-NEXT:     : !!str "any value",
 # CHECK-NEXT:   },
 # CHECK-NEXT: }
 
diff --git a/llvm/test/YAMLParser/spec-09-08.test b/llvm/test/YAMLParser/spec-09-08.test
index 5d1f13b0e31dfc0..7ed436ecb7cea7d 100644
--- a/llvm/test/YAMLParser/spec-09-08.test
+++ b/llvm/test/YAMLParser/spec-09-08.test
@@ -1,5 +1,11 @@
 # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
-# CHECK: "as space\t\n trimmed \n \n specific\L\n none"
+# CHECK: "as space trimmed\nspecific\L none"
+
+## Note: The parsing rules were changed in version 1.2 and the line-separator
+## character is no longer considered a line-break character. The example is
+## taken from Spec 1.1 and is now parsed as "..\L .." instead of "..\L\n.." as
+## in the original edition.
+## See https://yaml.org/spec/1.2.2/ext/changes/ for details.
 
  'as space	
  trimmed 
diff --git a/llvm/test/YAMLParser/spec-09-09.test b/llvm/test/YAMLParser/spec-09-09.test
index 181971bd1349530..4910b66c24b1c9b 100644
--- a/llvm/test/YAMLParser/spec-09-09.test
+++ b/llvm/test/YAMLParser/spec-09-09.test
@@ -1,8 +1,8 @@
 # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
 # CHECK:      !!seq [
-# CHECK-NEXT:   !!str "\n  last",
-# CHECK-NEXT:   !!str " \t\n  last",
-# CHECK-NEXT:   !!str " \tfirst\n  last",
+# CHECK-NEXT:   !!str " last",
+# CHECK-NEXT:   !!str " last",
+# CHECK-NEXT:   !!str " \tfirst last",
 # CHECK-NEXT: ]
 
 - '
diff --git a/llvm/test/YAMLParser/spec-09-10.test b/llvm/test/YAMLParser/spec-09-10.test
index f75834fa4dda544..3e21afe22d349f1 100644
--- a/llvm/test/YAMLParser/spec-09-10.test
+++ b/llvm/test/YAMLParser/spec-09-10.test
@@ -1,5 +1,5 @@
 # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
-# CHECK: "first\n \tinner\t\n last"
+# CHECK: "first inner last"
 
  'first
  	inner	
diff --git a/llvm/test/YAMLParser/spec-09-11.test b/llvm/test/YAMLParser/spec-09-11.test
index b1f8f45f954af22..62bc1927998b3e0 100644
--- a/llvm/test/YAMLParser/spec-09-11.test
+++ b/llvm/test/YAMLParser/spec-09-11.test
@@ -1,7 +1,7 @@
 # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
 # CHECK:      !!seq [
-# CHECK-NEXT:   !!str "first\n  \t",
-# CHECK-NEXT:   !!str "first\n\n  \tlast",
+# CHECK-NEXT:   !!str "first ",
+# CHECK-NEXT:   !!str "first\nlast",
 # CHECK-NEXT: ]
 
 - 'first
diff --git a/llvm/test/YAMLParser/spec-09-13.test b/llvm/test/YAMLParser/spec-09-13.test
index 015f38951ebbd64..f2a5f49ea0c6632 100644
--- a/llvm/test/YAMLParser/spec-09-13.test
+++ b/llvm/test/YAMLParser/spec-09-13.test
@@ -4,8 +4,8 @@
 # CHECK-NEXT:   : !!map {
 # CHECK-NEXT:     ? !!str "also simple"
 # CHECK-NEXT:     : !!str "value",
-# CHECK-NEXT:     ? !!str "not a\n  simple key"
-# CHECK-NEXT:     : !!str "any\n  value",
+# CHECK-NEXT:     ? !!str "not a simple key"
+# CHECK-NEXT:     : !!str "any value",
 # CHECK-NEXT:   },
 # CHECK-NEXT: }
 
diff --git a/llvm/test/YAMLParser/spec-09-16.test b/llvm/test/YAMLParser/spec-09-16.test
index b1f52ce194f11af..b6c92e3ec63c17d 100644
--- a/llvm/test/YAMLParser/spec-09-16.test
+++ b/llvm/test/YAMLParser/spec-09-16.test
@@ -1,5 +1,11 @@
 # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
-# CHECK: "as space\t\n trimmed \n\n specific\L\n none"
+# CHECK: "as space trimmed\nspecific\L none"
+
+## Note: The parsing rules were changed in version 1.2 and the line-separator
+## character is no longer considered a line-break character. The example is
+## taken from Spec 1.1 and is now parsed as "..\L .." instead of "..\L\n.." as
+## in the original edition.
+## See https://yaml.org/spec/1.2.2/ext/changes/ for details.
 
  as space	
  trimmed 
diff --git a/llvm/test/YAMLParser/spec-09-17.test b/llvm/test/YAMLParser/spec-09-17.test
index 425925774d92fd1..06f1db21202753b 100644
--- a/llvm/test/YAMLParser/spec-09-17.test
+++ b/llvm/test/YAMLParser/spec-09-17.test
@@ -1,5 +1,5 @@
 # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
-# CHECK: "first line \n   \n  more line"
+# CHECK: "first line\nmore line"
 
  first line 
    
diff --git a/llvm/test/YAMLParser/spec-10-02.test b/llvm/test/YAMLParser/spec-10-02.test
index 9adddc9237d51de..2fd91040af26ccd 100644
--- a/llvm/test/YAMLParser/spec-10-02.test
+++ b/llvm/test/YAMLParser/spec-10-02.test
@@ -1,8 +1,8 @@
 # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
 # CHECK:      !!seq [
-# CHECK-NEXT:   !!str "double\n quoted",
-# CHECK-NEXT:   !!str "single\n           quoted",
-# CHECK-NEXT:   !!str "plain\n text",
+# CHECK-NEXT:   !!str "double quoted",
+# CHECK-NEXT:   !!str "single quoted",
+# CHECK-NEXT:   !!str "plain text",
 # CHECK-NEXT:   !!seq [
 # CHECK-NEXT:     !!str "nested",
 # CHECK-NEXT:   ],
diff --git a/llvm/test/YAMLParser/spec1.2-07-05.test b/llvm/test/YAMLParser/spec1.2-07-05.test
index f923f68d04295f9..a273e79acef6551 100644
--- a/llvm/test/YAMLParser/spec1.2-07-05.test
+++ b/llvm/test/YAMLParser/spec1.2-07-05.test
@@ -1,5 +1,5 @@
 # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
-# CHECK: "folded \nto a space,\t\n \nto a line feed, or \t \tnon-content"
+# CHECK: "folded to a space,\nto a line feed, or \t \tnon-content"
 
 "folded 
 to a space,	
diff --git a/llvm/test/YAMLParser/spec1.2-07-06.test b/llvm/test/YAMLParser/spec1.2-07-06.test
index 8982c1ed2a7b18d..7008bbcf1516c5a 100644
--- a/llvm/test/YAMLParser/spec1.2-07-06.test
+++ b/llvm/test/YAMLParser/spec1.2-07-06.test
@@ -1,5 +1,5 @@
 # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
-# CHECK: " 1st non-empty\n 2nd non-empty \n\t3rd non-empty "
+# CHECK: " 1st non-empty\n2nd non-empty 3rd non-empty "
 
 " 1st non-empty
 
diff --git a/llvm/test/YAMLParser/spec1.2-07-09.test b/llvm/test/YAMLParser/spec1.2-07-09.test
index 38d541973bc43fc..6a71f8c8ad890e7 100644
--- a/llvm/test/YAMLParser/spec1.2-07-09.test
+++ b/llvm/test/YAMLParser/spec1.2-07-09.test
@@ -1,5 +1,5 @@
 # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
-# CHECK: " 1st non-empty\n\n 2nd non-empty \n\t3rd non-empty "
+# CHECK: " 1st non-empty\n2nd non-empty 3rd non-empty "
 
 ' 1st non-empty
 
diff --git a/llvm/test/YAMLParser/spec1.2-07-12.test b/llvm/test/YAMLParser/spec1.2-07-12.test
index 84d986e29d510c5..b5d0cb91f3023d4 100644
--- a/llvm/test/YAMLParser/spec1.2-07-12.test
+++ b/llvm/test/YAMLParser/spec1.2-07-12.test
@@ -1,5 +1,5 @@
 # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
-# CHECK: "1st non-empty\n\n 2nd non-empty \n\t3rd non-empty"
+# CHECK: "1st non-empty\n2nd non-empty 3rd non-empty"
 
 1st non-empty
 
diff --git a/llvm/unittests/Support/YAMLParserTest.cpp b/llvm/unittests/Support/YAMLParserTest.cpp
index 247e70756861df1..7bd11e748155d8b 100644
--- a/llvm/unittests/Support/YAMLParserTest.cpp
+++ b/llvm/unittests/Support/YAMLParserTest.cpp
@@ -441,4 +441,106 @@ TEST(YAMLParser, ParsesBools) {
   expectCannotParseBool("0");
 }
 
+// Checks that the given string can be parsed into an expected scalar value.
+static void expectCanParseScalar(StringRef Input, StringRef Expected) {
+  SourceMgr SM;
+  yaml::Stream Stream(Input, SM);
+  yaml::Node *Root = Stream.begin()->getRoot();
+  ASSERT_NE(Root, nullptr);
+  auto *ScalarNode = dyn_cast<yaml::ScalarNode>(Root);
+  ASSERT_NE(ScalarNode, nullptr);
+  SmallVector<char> Storage;
+  StringRef Result = ScalarNode->getValue(Storage);
+  EXPECT_EQ(Result, Expected);
+}
+
+TEST(YAMLParser, UnfoldsScalarValue) {
+  // Double-quoted values
+  expectCanParseScalar("\"\"", "");
+  expectCanParseScalar("\"  \t\t  \t\t  \"", "  \t\t  \t\t  ");
+  expectCanParseScalar("\"\n\"", " ");
+  expectCanParseScalar("\"\r\"", " ");
+  expectCanParseScalar("\"\r\n\"", " ");
+  expectCanParseScalar("\"\n\n\"", "\n");
+  expectCanParseScalar("\"\r\r\"", "\n");
+  expectCanParseScalar("\"\n\r\"", "\n");
+  expectCanParseScalar("\"\r\n\r\n\"", "\n");
+  expectCanParseScalar("\"\n\n\n\"", "\n\n");
+  expectCanParseScalar("\"\r\r\r\"", "\n\n");
+  expectCanParseScalar("\"\r\n\r\n\r\n\"", "\n\n");
+  expectCanParseScalar("\" \t \t \n\t \t \t\r \t \t \"", "\n");
+  expectCanParseScalar("\" \t A \t \n \t B \t \"", " \t A B \t ");
+  expectCanParseScalar("\" \t \\ \r\r\t \\  \t \"", " \t  \n  \t ");
+  expectCanParseScalar("\"A\nB\"", "A B");
+  expectCanParseScalar("\"A\rB\"", "A B");
+  expectCanParseScalar("\"A\r\nB\"", "A B");
+  expectCanParseScalar("\"A\n\nB\"", "A\nB");
+  expectCanParseScalar("\"A\r\rB\"", "A\nB");
+  expectCanParseScalar("\"A\n\rB\"", "A\nB");
+  expectCanParseScalar("\"A\r\n\r\nB\"", "A\nB");
+  expectCanParseScalar("\"A\n\n\nB\"", "A\n\nB");
+  expectCanParseScalar("\"A\r\r\rB\"", "A\n\nB");
+  expectCanParseScalar("\"A\r\n\r\n\r\nB\"", "A\n\nB");
+  expectCanParseScalar("\"A \t \t \n\t \t \t B\"", "A B");
+  expectCanParseScalar("\"A \t \t \n\t \t \t\r \t \t B\"", "A\nB");
+  expectCanParseScalar("\"A \t \t \n\t \t \t\r\n \t \r  \t B\"", "A\n\nB");
+  expectCanParseScalar("\"A\\\rB\"", "AB");
+  expectCanParseScalar("\"A\\\nB\"", "AB");
+  expectCanParseScalar("\"A\\\r\nB\"", "AB");
+  expectCanParseScalar("\"A \t \\\rB\"", "A \t B");
+  expectCanParseScalar("\"A  \t\\\nB\"", "A  \tB");
+  expectCanParseScalar("\"A\t  \\\r\nB\"", "A\t  B");
+  expectCanParseScalar("\"A\\\r\rB\"", "A B");
+  expectCanParseScalar("\"A\\\n\nB\"", "A B");
+  expectCanParseScalar("\"A\\\r\n\r\nB\"", "A B");
+  expectCanParseScalar("\"A\\\r\r\rB\"", "A\nB");
+  expectCanParseScalar("\"A\\\n\n\nB\"", "A\nB");
+  expectCanParseScalar("\"A\\\r\n\r\n\r\nB\"", "A\nB");
+  expectCanParseScalar("\"A\r\\ \rB\"", "A   B");
+  // Single-quoted values
+  expectCanParseScalar("''", "");
+  expectCanParseScalar("'  \t\t  \t\t  '", "  \t\t  \t\t  ");
+  expectCanParseScalar("'\n'", " ");
+  expectCanParseScalar("'\r'", " ");
+  expectCanParseScalar("'\r\n'", " ");
+  expectCanParseScalar("'\n\n'", "\n");
+  expectCanParseScalar("'\r\r'", "\n");
+  expectCanParseScalar("'\n\r'", "\n");
+  expectCanParseScalar("'\r\n\r\n'", "\n");
+  expectCanParseScalar("'\n\n\n'", "\n\n");
+  expectCanParseScalar("'\r\r\r'", "\n\n");
+  expectCanParseScalar("'\r\n\r\n\r\n'", "\n\n");
+  expectCanParseScalar("' \t \t \n\t \t \t\r \t \t '", "\n");
+  expectCanParseScalar("' \t A \t \n \t B \t '", " \t A B \t ");
+  expectCanParseScalar("'A\nB'", "A B");
+  expectCanParseScalar("'A\rB'", "A B");
+  expectCanParseScalar("'A\r\nB'", "A B");
+  expectCanParseScalar("'A\n\nB'", "A\nB");
+  expectCanParseScalar("'A\r\rB'", "A\nB");
+  expectCanParseScalar("'A\n\rB'", "A\nB");
+  expectCanParseScalar("'A\r\n\r\nB'", "A\nB");
+  expectCanParseScalar("'A\n\n\nB'", "A\n\nB");
+  expectCanParseScalar("'A\r\r\rB'", "A\n\nB");
+  expectCanParseScalar("'A\r\n\r\n\r\nB'", "A\n\nB");
+  expectCanParseScalar("'A \t \t \n\t \t \t B'", "A B");
+  expectCanParseScalar("'A \t \t \n\t \t \t\r \t \t B'", "A\nB");
+  expectCanParseScalar("'A \t \t \n\t \t \t\r\n \t \r  \t B'", "A\n\nB");
+  // Plain values
+  expectCanParseScalar("A  \t \r \n \t \r\n \t\r\r\t  ", "A");
+  expectCanParseScalar("A \t \n \t B", "A B");
+  expectCanParseScalar("A\nB", "A B");
+  expectCanParseScalar("A\rB", "A B");
+  expectCanParseScalar("A\r\nB", "A B");
+  expectCanParseScalar("A\n\nB", "A\nB");
+  expectCanParseScalar("A\r\rB", "A\nB");
+  expectCanParseScalar("A\n\rB", "A\nB");
+  expectCanParseScalar("A\r\n\r\nB", "A\nB");
+  expectCanParseScalar("A\n\n\nB", "A\n\nB");
+  expectCanParseScalar("A\r\r\rB", "A\n\nB");
+  expectCanParseScalar("A\r\n\r\n\r\nB", "A\n\nB");
+  expectCanParseScalar("A \t \t \n\t \t \t B", "A B");
+  expectCanParseScalar("A \t \t \n\t \t \t\r \t \t B", "A\nB");
+  expectCanParseScalar("A \t \t \n\t \t \t\r\n \t \r  \t B", "A\n\nB");
+}
+
 } // end namespace llvm



More information about the llvm-branch-commits mailing list