[llvm] 4480e65 - [YAMLParser] Improve plain scalar spec compliance (#68946)

via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 17 10:28:19 PDT 2023


Author: akirchhoff-modular
Date: 2023-10-17T11:28:14-06:00
New Revision: 4480e650b3cf7cc63cfd3767cd6b120f8bfad2ac

URL: https://github.com/llvm/llvm-project/commit/4480e650b3cf7cc63cfd3767cd6b120f8bfad2ac
DIFF: https://github.com/llvm/llvm-project/commit/4480e650b3cf7cc63cfd3767cd6b120f8bfad2ac.diff

LOG: [YAMLParser] Improve plain scalar spec compliance (#68946)

The `YAMLParser.h` header file claims support for YAML 1.2 with a few
deviations, but our plain scalar parsing failed to parse some valid YAML
according to the spec. This change puts us more in compliance with the
YAML spec, now letting us parse plain scalars containing additional
special characters in cases where they are not ambiguous.

Added: 
    llvm/test/YAMLParser/plain-characters.test

Modified: 
    llvm/lib/Support/YAMLParser.cpp
    llvm/test/CodeGen/MIR/Generic/first-character-parse-error.mir
    llvm/unittests/Support/YAMLIOTest.cpp
    llvm/unittests/Support/YAMLParserTest.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Support/YAMLParser.cpp b/llvm/lib/Support/YAMLParser.cpp
index 6ac2c6aeeb46ad5..1422e40f91944ae 100644
--- a/llvm/lib/Support/YAMLParser.cpp
+++ b/llvm/lib/Support/YAMLParser.cpp
@@ -392,6 +392,10 @@ class Scanner {
   ///        Pos is whitespace or a new line
   bool isBlankOrBreak(StringRef::iterator Position);
 
+  /// Return true if the minimal well-formed code unit subsequence at
+  ///        Pos is considered a "safe" character for plain scalars.
+  bool isPlainSafeNonBlank(StringRef::iterator Position);
+
   /// Return true if the line is a line break, false otherwise.
   bool isLineEmpty(StringRef Line);
 
@@ -545,6 +549,10 @@ class Scanner {
   /// Can the next token be the start of a simple key?
   bool IsSimpleKeyAllowed;
 
+  /// Can the next token be a value indicator even if it does not have a
+  /// trailing space?
+  bool IsAdjacentValueAllowedInFlow;
+
   /// True if an error has occurred.
   bool Failed;
 
@@ -868,6 +876,7 @@ void Scanner::init(MemoryBufferRef Buffer) {
   FlowLevel = 0;
   IsStartOfStream = true;
   IsSimpleKeyAllowed = true;
+  IsAdjacentValueAllowedInFlow = false;
   Failed = false;
   std::unique_ptr<MemoryBuffer> InputBufferOwner =
       MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false);
@@ -1049,6 +1058,15 @@ bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
          *Position == '\n';
 }
 
+bool Scanner::isPlainSafeNonBlank(StringRef::iterator Position) {
+  if (Position == End || isBlankOrBreak(Position))
+    return false;
+  if (FlowLevel &&
+      StringRef(Position, 1).find_first_of(",[]{}") != StringRef::npos)
+    return false;
+  return true;
+}
+
 bool Scanner::isLineEmpty(StringRef Line) {
   for (const auto *Position = Line.begin(); Position != Line.end(); ++Position)
     if (!isBlankOrBreak(Position))
@@ -1189,6 +1207,7 @@ bool Scanner::scanStreamEnd() {
   unrollIndent(-1);
   SimpleKeys.clear();
   IsSimpleKeyAllowed = false;
+  IsAdjacentValueAllowedInFlow = false;
 
   Token T;
   T.Kind = Token::TK_StreamEnd;
@@ -1202,6 +1221,7 @@ bool Scanner::scanDirective() {
   unrollIndent(-1);
   SimpleKeys.clear();
   IsSimpleKeyAllowed = false;
+  IsAdjacentValueAllowedInFlow = false;
 
   StringRef::iterator Start = Current;
   consume('%');
@@ -1233,6 +1253,7 @@ bool Scanner::scanDocumentIndicator(bool IsStart) {
   unrollIndent(-1);
   SimpleKeys.clear();
   IsSimpleKeyAllowed = false;
+  IsAdjacentValueAllowedInFlow = false;
 
   Token T;
   T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
@@ -1255,6 +1276,8 @@ bool Scanner::scanFlowCollectionStart(bool IsSequence) {
 
   // And may also be followed by a simple key.
   IsSimpleKeyAllowed = true;
+  // Adjacent values are allowed in flows only after JSON-style keys.
+  IsAdjacentValueAllowedInFlow = false;
   ++FlowLevel;
   return true;
 }
@@ -1262,6 +1285,7 @@ bool Scanner::scanFlowCollectionStart(bool IsSequence) {
 bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
   IsSimpleKeyAllowed = false;
+  IsAdjacentValueAllowedInFlow = true;
   Token T;
   T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
                       : Token::TK_FlowMappingEnd;
@@ -1276,6 +1300,7 @@ bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
 bool Scanner::scanFlowEntry() {
   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
   IsSimpleKeyAllowed = true;
+  IsAdjacentValueAllowedInFlow = false;
   Token T;
   T.Kind = Token::TK_FlowEntry;
   T.Range = StringRef(Current, 1);
@@ -1288,6 +1313,7 @@ bool Scanner::scanBlockEntry() {
   rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
   IsSimpleKeyAllowed = true;
+  IsAdjacentValueAllowedInFlow = false;
   Token T;
   T.Kind = Token::TK_BlockEntry;
   T.Range = StringRef(Current, 1);
@@ -1302,6 +1328,7 @@ bool Scanner::scanKey() {
 
   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
   IsSimpleKeyAllowed = !FlowLevel;
+  IsAdjacentValueAllowedInFlow = false;
 
   Token T;
   T.Kind = Token::TK_Key;
@@ -1339,6 +1366,7 @@ bool Scanner::scanValue() {
       rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
     IsSimpleKeyAllowed = !FlowLevel;
   }
+  IsAdjacentValueAllowedInFlow = false;
 
   Token T;
   T.Kind = Token::TK_Value;
@@ -1420,6 +1448,7 @@ bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
 
   IsSimpleKeyAllowed = false;
+  IsAdjacentValueAllowedInFlow = true;
 
   return true;
 }
@@ -1434,21 +1463,9 @@ bool Scanner::scanPlainScalar() {
     if (*Current == '#')
       break;
 
-    while (Current != End && !isBlankOrBreak(Current)) {
-      if (FlowLevel && *Current == ':' &&
-          (Current + 1 == End ||
-           !(isBlankOrBreak(Current + 1) || *(Current + 1) == ','))) {
-        setError("Found unexpected ':' while scanning a plain scalar", Current);
-        return false;
-      }
-
-      // Check for the end of the plain scalar.
-      if (  (*Current == ':' && isBlankOrBreak(Current + 1))
-          || (  FlowLevel
-          && (StringRef(Current, 1).find_first_of(",:?[]{}")
-              != StringRef::npos)))
-        break;
-
+    while (Current != End &&
+           ((*Current != ':' && isPlainSafeNonBlank(Current)) ||
+            (*Current == ':' && isPlainSafeNonBlank(Current + 1)))) {
       StringRef::iterator i = skip_nb_char(Current);
       if (i == Current)
         break;
@@ -1499,6 +1516,7 @@ bool Scanner::scanPlainScalar() {
   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
 
   IsSimpleKeyAllowed = false;
+  IsAdjacentValueAllowedInFlow = false;
 
   return true;
 }
@@ -1534,6 +1552,7 @@ bool Scanner::scanAliasOrAnchor(bool IsAlias) {
   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
 
   IsSimpleKeyAllowed = false;
+  IsAdjacentValueAllowedInFlow = false;
 
   return true;
 }
@@ -1766,6 +1785,7 @@ bool Scanner::scanBlockScalar(bool IsLiteral) {
   // New lines may start a simple key.
   if (!FlowLevel)
     IsSimpleKeyAllowed = true;
+  IsAdjacentValueAllowedInFlow = false;
 
   Token T;
   T.Kind = Token::TK_BlockScalar;
@@ -1799,6 +1819,7 @@ bool Scanner::scanTag() {
   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
 
   IsSimpleKeyAllowed = false;
+  IsAdjacentValueAllowedInFlow = false;
 
   return true;
 }
@@ -1848,13 +1869,14 @@ bool Scanner::fetchMoreTokens() {
   if (*Current == ',')
     return scanFlowEntry();
 
-  if (*Current == '-' && isBlankOrBreak(Current + 1))
+  if (*Current == '-' && (isBlankOrBreak(Current + 1) || Current + 1 == End))
     return scanBlockEntry();
 
-  if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
+  if (*Current == '?' && (Current + 1 == End || isBlankOrBreak(Current + 1)))
     return scanKey();
 
-  if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
+  if (*Current == ':' &&
+      (!isPlainSafeNonBlank(Current + 1) || IsAdjacentValueAllowedInFlow))
     return scanValue();
 
   if (*Current == '*')
@@ -1880,15 +1902,10 @@ bool Scanner::fetchMoreTokens() {
 
   // Get a plain scalar.
   StringRef FirstChar(Current, 1);
-  if (!(isBlankOrBreak(Current)
-        || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
-      || (*Current == '-' && !isBlankOrBreak(Current + 1))
-      || (!FlowLevel && (*Current == '?' || *Current == ':')
-          && isBlankOrBreak(Current + 1))
-      || (!FlowLevel && *Current == ':'
-                      && Current + 2 < End
-                      && *(Current + 1) == ':'
-                      && !isBlankOrBreak(Current + 2)))
+  if ((!isBlankOrBreak(Current) &&
+       FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") == StringRef::npos) ||
+      (FirstChar.find_first_of("?:-") != StringRef::npos &&
+       isPlainSafeNonBlank(Current + 1)))
     return scanPlainScalar();
 
   setError("Unrecognized character while tokenizing.", Current);

diff  --git a/llvm/test/CodeGen/MIR/Generic/first-character-parse-error.mir b/llvm/test/CodeGen/MIR/Generic/first-character-parse-error.mir
index 00a01058dc8cb89..869392f3e4bb6fb 100644
--- a/llvm/test/CodeGen/MIR/Generic/first-character-parse-error.mir
+++ b/llvm/test/CodeGen/MIR/Generic/first-character-parse-error.mir
@@ -1,6 +1,6 @@
-:# RUN: not llc -run-pass=none %s -o - 2>&1 | FileCheck %s
+@# RUN: not llc -run-pass=none %s -o - 2>&1 | FileCheck %s
 
-# The : before the run comment is syntactically invalid. This used to
+# The @ before the run comment is syntactically invalid. This used to
 # crash in the SourceMgr diagnostic printer because it was called
 # before the LLVMContext was initialized.
 

diff  --git a/llvm/test/YAMLParser/plain-characters.test b/llvm/test/YAMLParser/plain-characters.test
new file mode 100644
index 000000000000000..f22016bcb9bca4a
--- /dev/null
+++ b/llvm/test/YAMLParser/plain-characters.test
@@ -0,0 +1,30 @@
+# RUN: yaml-bench -canonical %s | FileCheck %s
+# Example from https://yaml.org/spec/1.2.2/#example-plain-characters
+
+# Outside flow collection:
+- ::vector
+- ": - ()"
+- Up, up, and away!
+- -123
+- https://example.com/foo#bar
+# Inside flow collection:
+- [ ::vector,
+  ": - ()",
+  "Up, up and away!",
+  -123,
+  https://example.com/foo#bar ]
+
+# CHECK: !!seq [
+# CHECK-NEXT:   !!str "::vector",
+# CHECK-NEXT:   !!str ": - ()",
+# CHECK-NEXT:   !!str "Up, up, and away!",
+# CHECK-NEXT:   !!str "-123",
+# CHECK-NEXT:   !!str "https://example.com/foo#bar",
+# CHECK-NEXT:   !!seq [
+# CHECK-NEXT:     !!str "::vector",
+# CHECK-NEXT:     !!str ": - ()",
+# CHECK-NEXT:     !!str "Up, up and away!",
+# CHECK-NEXT:     !!str "-123",
+# CHECK-NEXT:     !!str "https://example.com/foo#bar",
+# CHECK-NEXT:   ],
+# CHECK-NEXT: ]

diff  --git a/llvm/unittests/Support/YAMLIOTest.cpp b/llvm/unittests/Support/YAMLIOTest.cpp
index 745d743b2b24498..488746764ae6550 100644
--- a/llvm/unittests/Support/YAMLIOTest.cpp
+++ b/llvm/unittests/Support/YAMLIOTest.cpp
@@ -3156,7 +3156,7 @@ TEST(YAMLIO, TestFlowSequenceTokenErrors) {
 
 TEST(YAMLIO, TestDirectiveMappingNoValue) {
   Input yin("%YAML\n{5:");
-  EXPECT_FALSE(yin.setCurrentDocument());
+  yin.setCurrentDocument();
   EXPECT_TRUE(yin.error());
 
   Input yin2("%TAG\n'\x98!< :\n");

diff  --git a/llvm/unittests/Support/YAMLParserTest.cpp b/llvm/unittests/Support/YAMLParserTest.cpp
index b52a3850c02b7c0..247e70756861df1 100644
--- a/llvm/unittests/Support/YAMLParserTest.cpp
+++ b/llvm/unittests/Support/YAMLParserTest.cpp
@@ -47,6 +47,10 @@ TEST(YAMLParser, ParsesEmptyArray) {
   ExpectParseSuccess("Empty array", "[]");
 }
 
+TEST(YAMLParser, ParsesComplexMap) {
+  ExpectParseSuccess("Complex block map", "? a\n: b");
+}
+
 TEST(YAMLParser, FailsIfNotClosingArray) {
   ExpectParseError("Not closing array", "[");
   ExpectParseError("Not closing array", "  [  ");
@@ -82,7 +86,10 @@ TEST(YAMLParser, FailsIfMissingColon) {
 }
 
 TEST(YAMLParser, FailsOnMissingQuote) {
-  ExpectParseError("Missing open quote", "[{a\":\"b\"}]");
+  // Missing open quote counts as a plain scalar per YAML spec
+  // (Following is equivalent to JSON [{"a\":\"b\"": null}])
+  ExpectParseSuccess("Missing open quote", "[{a\":\"b\"}]");
+  // Closing quote is more strict -- plain scalars cannot start with a quote
   ExpectParseError("Missing closing quote", "[{\"a\":\"b}]");
 }
 
@@ -128,6 +135,48 @@ TEST(YAMLParser, ParsesArrayOfArrays) {
   ExpectParseSuccess("Array of arrays", "[[]]");
 }
 
+TEST(YAMLParser, ParsesPlainScalars) {
+  ExpectParseSuccess("Plain scalar", "hello");
+  ExpectParseSuccess("Plain scalar beginning with a question mark", "?hello");
+  ExpectParseSuccess("Plain scalar beginning with a colon", ":hello");
+  ExpectParseSuccess("Plain scalar beginning with two colons", "::hello");
+  ExpectParseSuccess("Plain scalar beginning with a hyphen", "-hello");
+  ExpectParseSuccess("Multi-line plain scalar", "Hello\nworld");
+  ExpectParseSuccess("Plain scalar with indicator characters",
+                     "He-!l*lo, []world{}");
+  ExpectParseSuccess("Plain scalar with indicator characters used as block key",
+                     "He-!l*lo, []world{}: value");
+  ExpectParseSuccess("Plain scalar in flow sequence", "hello");
+  ExpectParseSuccess(
+      "Plain scalar beginning with a question mark in flow sequence",
+      "[ ?hello ]");
+  ExpectParseSuccess("Plain scalar beginning with a colon in flow sequence",
+                     "[ :hello ]");
+  ExpectParseSuccess("Plain scalar beginning with two colons in flow sequence",
+                     "[ ::hello ]");
+  ExpectParseSuccess("Plain scalar beginning with a hyphen in flow sequence",
+                     "[ -hello ]");
+  ExpectParseSuccess("Multi-line plain scalar in flow sequence",
+                     "[ Hello\nworld ]");
+  ExpectParseSuccess(
+      "Plain scalar with non-flow indicator characters in flow sequence",
+      "[ He-!l*lo, world ]");
+  ExpectParseSuccess(
+      "Plain scalar with non-flow indicator characters used as flow key",
+      "{ He-!l*lo, world: value } ");
+  ExpectParseError(
+      "Plain scalar with flow indicator characters inside flow sequence",
+      "[ Hello[world ]");
+  ExpectParseError(
+      "Plain scalar with flow indicator characters inside flow key",
+      "{ Hello[world: value }");
+  // Multi-line plain scalar in keys is strictly invalid per the spec, but many
+  // implementations accept it in flow keys nonetheless.  Block keys are not
+  // accepted by any other implementation I can find.
+  ExpectParseSuccess("Multi-line plain scalar in block key", "a\nb: c");
+  ExpectParseSuccess("Multi-line plain scalar in flow key", "{\na\nb: c\n}");
+}
+
 TEST(YAMLParser, ParsesBlockLiteralScalars) {
   ExpectParseSuccess("Block literal scalar", "test: |\n  Hello\n  World\n");
   ExpectParseSuccess("Block literal scalar EOF", "test: |\n  Hello\n  World");
@@ -176,6 +225,10 @@ TEST(YAMLParser, HandlesEndOfFileGracefully) {
   ExpectParseError("In array hitting EOF", "[[] ");
   ExpectParseError("In array hitting EOF", "[[]");
   ExpectParseError("In object hitting EOF", "{\"\"");
+  // This one is valid, equivalent to the JSON {"": null}
+  ExpectParseSuccess("In complex block map hitting EOF", "?");
+  // Equivalent to JSON [null]
+  ExpectParseSuccess("In block sequence hitting EOF", "-");
 }
 
 TEST(YAMLParser, HandlesNullValuesInKeyValueNodesGracefully) {
@@ -183,6 +236,12 @@ TEST(YAMLParser, HandlesNullValuesInKeyValueNodesGracefully) {
   ExpectParseError("KeyValueNode with null value", "test: '");
 }
 
+TEST(YAMLParser, BlockSequenceEOF) {
+  SourceMgr SM;
+  yaml::Stream Stream("-", SM);
+  EXPECT_TRUE(isa_and_present<yaml::SequenceNode>(Stream.begin()->getRoot()));
+}
+
 // Checks that the given string can be parsed into an identical string inside
 // of an array.
 static void ExpectCanParseString(StringRef String) {


        


More information about the llvm-commits mailing list