[PATCH] YAML: Implement block scalar parsing

Wed May 6 13:27:27 PDT 2015

> On 2015 May 5, at 15:57, Alex Lorenz <arphaman at gmail.com> wrote:
> 
> Hi bogner, bob.wilson, dexonsmith,
> 
> This patch implements the parsing of YAML block scalars. Some code existed there before, but it couldn't parse block scalars.
> This patch is based on the patch 'http://reviews.llvm.org/D9448' that enables the YAMLParser tests.
> I've added a new yaml node type to represent the block scalar values.
> I've deleted the files 'spec-09-27.test' and 'spec-09-28.test' as they are identical to the file 'spec-09-26.test'.
> 
> REPOSITORY
>  rL LLVM
> 
> http://reviews.llvm.org/D9503
> 
> Files:
>  include/llvm/Support/YAMLParser.h
>  lib/Support/YAMLParser.cpp
>  test/YAMLParser/spec-09-14.test
>  test/YAMLParser/spec-09-18.test
>  test/YAMLParser/spec-09-19.test
>  test/YAMLParser/spec-09-20.test
>  test/YAMLParser/spec-09-21.test
>  test/YAMLParser/spec-09-22.test
>  test/YAMLParser/spec-09-24.test
>  test/YAMLParser/spec-09-25.test
>  test/YAMLParser/spec-09-26.test
>  test/YAMLParser/spec-09-27.test
>  test/YAMLParser/spec-09-28.test
>  test/YAMLParser/spec-09-29.test
>  test/YAMLParser/spec-09-30.test
>  test/YAMLParser/spec-09-31.test
>  test/YAMLParser/spec-09-32.test
>  test/YAMLParser/spec-09-33.test
>  unittests/Support/YAMLParserTest.cpp
>  utils/yaml-bench/YAMLBench.cpp
> 
> EMAIL PREFERENCES
>  http://reviews.llvm.org/settings/panel/emailpreferences/
> <D9503.24963.patch>
> Index: include/llvm/Support/YAMLParser.h
> ===================================================================
> --- include/llvm/Support/YAMLParser.h
> +++ include/llvm/Support/YAMLParser.h
> @@ -107,6 +107,7 @@
>    enum NodeKind {
>      NK_Null,
>      NK_Scalar,
> +    NK_BlockScalar,
>      NK_KeyValue,
>      NK_Mapping,
>      NK_Sequence,
> @@ -222,6 +223,36 @@
>                                   SmallVectorImpl<char> &Storage) const;
>  };
>  
> +/// \brief A block scalar node is an opaque datum that can be presented as a
> +///        series of zero or more Unicode scalar values.
> +///
> +/// Example:
> +///   |
> +///     Hello
> +///     World
> +class BlockScalarNode : public Node {
> +  void anchor() override;
> +
> +public:
> +  BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
> +                  std::string &Value, StringRef RawVal)
> +      : Node(NK_BlockScalar, D, Anchor, Tag), Value(std::move(Value)) {
> +    SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
> +    SMLoc End = SMLoc::getFromPointer(RawVal.end());
> +    SourceRange = SMRange(Start, End);
> +  }
> +
> +  /// \brief Gets the value of this node as a StringRef.
> +  StringRef getValue() const { return Value; }
> +
> +  static inline bool classof(const Node *N) {
> +    return N->getType() == NK_BlockScalar;
> +  }
> +
> +private:
> +  std::string Value;
> +};
> +
>  /// \brief A key and value pair. While not technically a Node under the YAML
>  ///        representation graph, it is easier to treat them this way.
>  ///
> @@ -252,8 +283,10 @@
>    Node *getValue();
>  
>    void skip() override {
> -    getKey()->skip();
> -    getValue()->skip();
> +    if (Node *Key = getKey())
> +      Key->skip();
> +    if (Node *Val = getValue())
> +      Val->skip();

I'm probably missing something obvious, but why do you need to
conditionalize these?  Do they both need to be conditional?  (If not,
can you change the other to an `assert()`?)

If it's possible to commit this separately along with a testcase (or
unit test) that exposes the bug, please do.  Otherwise please indicate
in this commit message which test required the change.

>    }
>  
>    static inline bool classof(const Node *N) {
> Index: lib/Support/YAMLParser.cpp
> ===================================================================
> --- lib/Support/YAMLParser.cpp
> +++ lib/Support/YAMLParser.cpp
> @@ -101,6 +101,7 @@
>  void Node::anchor() {}
>  void NullNode::anchor() {}
>  void ScalarNode::anchor() {}
> +void BlockScalarNode::anchor() {}
>  void KeyValueNode::anchor() {}
>  void MappingNode::anchor() {}
>  void SequenceNode::anchor() {}
> @@ -128,6 +129,7 @@
>      TK_Key,
>      TK_Value,
>      TK_Scalar,
> +    TK_BlockScalar,
>      TK_Alias,
>      TK_Anchor,
>      TK_Tag
> @@ -137,6 +139,9 @@
>    /// of the token in the input.
>    StringRef Range;
>  
> +  /// The value of a block scalar node.
> +  std::string Value;
> +
>    Token() : Kind(TK_Error) {}
>  };
>  }
> @@ -348,6 +353,14 @@
>    ///          b-break.
>    StringRef::iterator skip_b_break(StringRef::iterator Position);
>  
> +  /// @brief Skip a single s-space[31] starting at Position.
> +  ///
> +  /// An s-space is 0x20
> +  ///
> +  /// @returns The code unit after the s-space, or Position if it's not a
> +  ///          s-space.
> +  StringRef::iterator skip_s_space(StringRef::iterator Position);
> +
>    /// @brief Skip a single s-white[33] starting at Position.
>    ///
>    /// A s-white is 0x20 | 0x9
> @@ -417,6 +430,9 @@
>                   , Token::TokenKind Kind
>                   , TokenQueueT::iterator InsertPoint);
>  
> +  /// @brief Skip a single-line comment if it's present on the current line.
> +  void skipComment();
> +
>    /// @brief Skip whitespace and comments until the start of the next token.
>    void scanToNextToken();
>  
> @@ -608,6 +624,9 @@
>      case Token::TK_Scalar:
>        OS << "Scalar: ";
>        break;
> +    case Token::TK_BlockScalar:
> +      OS << "Block Scalar: ";
> +      break;
>      case Token::TK_Alias:
>        OS << "Alias: ";
>        break;
> @@ -812,6 +831,13 @@
>    return Position;
>  }
>  
> +StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) {
> +  if (Position == End)
> +    return Position;
> +  if (*Position == ' ')
> +    return Position + 1;
> +  return Position;
> +}
>  
>  StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
>    if (Position == End)
> @@ -967,24 +993,27 @@
>    return true;
>  }
>  
> +void Scanner::skipComment() {
> +  if (*Current != '#')
> +    return;
> +  while (true) {
> +    // This may skip more than one byte, thus Column is only incremented
> +    // for code points.
> +    StringRef::iterator i = skip_nb_char(Current);

LLVM style would spell this with an uppercase, as in `I`.  I know you
copy/pasted, but you can fix it in transit ;).

> +    if (i == Current)
> +      break;
> +    Current = i;
> +    ++Column;
> +  }
> +}

Looks like you pulled this directly from `scanToNextToken()`.  Please do
that in a prep-commit.

Also see my note below about whether you pulled the right bits.

> +
>  void Scanner::scanToNextToken() {
>    while (true) {
>      while (*Current == ' ' || *Current == '\t') {
>        skip(1);
>      }
>  
> -    // Skip comment.
> -    if (*Current == '#') {
> -      while (true) {
> -        // This may skip more than one byte, thus Column is only incremented
> -        // for code points.
> -        StringRef::iterator i = skip_nb_char(Current);
> -        if (i == Current)
> -          break;
> -        Current = i;
> -        ++Column;
> -      }
> -    }
> +    skipComment();
>  
>      // Skip EOL.
>      StringRef::iterator i = skip_b_break(Current);
> @@ -1368,37 +1397,196 @@
>  }
>  
>  bool Scanner::scanBlockScalar(bool IsLiteral) {
> -  StringRef::iterator Start = Current;
> -  skip(1); // Eat | or >
> -  while(true) {
> -    StringRef::iterator i = skip_nb_char(Current);
> -    if (i == Current) {
> -      if (Column == 0)
> -        break;
> -      i = skip_b_break(Current);
> -      if (i != Current) {
> -        // We got a line break.
> -        Column = 0;
> -        ++Line;
> -        Current = i;
> -        continue;
> -      } else {
> -        // There was an error, which should already have been printed out.
> -        return false;
> -      }
> -    }
> -    Current = i;
> -    ++Column;
> +  std::string Str;
> +  raw_string_ostream OS(Str);
> +  auto BlockStart = Current;
> +  // If this is true, then the scanner has to look for a first non-blank line
> +  // to determine the indenation of the block scalar.
> +  bool LookingForIndent = true;
> +  unsigned BlockIndent = 0;
> +  unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
> +  unsigned LineBreaks = 0;
> +
> +  unsigned MaxAllSpaceLineCharacters = 0;
> +  StringRef::iterator LongestAllSpaceLine;
> +
> +  bool FoldNextLine = false;
> +
> +  skip(1); // Eat '|' or '>'
> +
> +  // Block Header ([162] c-b-block-header)
> +  // Chomping indicator ([164] c-chomping-indicator)
> +  char ChompingIndicator = ' ';
> +  if (Current != End && (*Current == '+' || *Current == '-')) {
> +    ChompingIndicator = *Current;
> +    skip(1);
> +  }
> +  // Indentation indicator ([163] c-indentation-indicator)
> +  if (Current != End && (*Current >= '1' && *Current <= '9')) {
> +    BlockIndent = unsigned(*Current - '0');
> +    LookingForIndent = false;
> +    skip(1);
>    }
> +  // Check for chomping indicator once again.
> +  if (Current != End && (*Current == '+' || *Current == '-')) {
> +    ChompingIndicator = *Current;
> +    skip(1);
> +  }
> +  Current = skip_while(&Scanner::skip_s_white, Current);
> +  skipComment();

These two lines are both part of `scanToNextToken()`.  I wonder if you
picked the right thing to extract?

>  
> -  if (Start == Current) {
> -    setError("Got empty block scalar", Start);
> +  if (Current == End) {
> +    // End of file after block header, we have an empty scalar.
> +    Token T;
> +    T.Kind = Token::TK_BlockScalar;
> +    T.Range = StringRef(BlockStart, Current - BlockStart);
> +    TokenQueue.push_back(T);
> +    return true;
> +  }
> +  auto I = skip_b_break(Current);
> +  if (I == Current) {
> +    setError("Expected a line break after block scalar header", Current);
>      return false;
>    }

...

A few general questions about the rest (which I haven't taken the time
to read carefully yet):

  - Is it possible to separate out the restructuring you've done into a
    series of NFC ("no functionality change") commits?  That would make
    the functional changes easier to review.
  - How many new features/bugfixes are you rolling into this?  Can any
    of them be split apart into separate commits?  If not, can you
    enumerate them on the list (and, of course, in the commit message)?
  - You haven't added any new tests (although I see you've modified the
    old ones).  Do the current tests really cover everything you've
    changed here?  If not, please add some targeted ones.

> +  Current = I;
> +  Column = 0;
> +  ++Line;
> +  BlockStart = Current;
> +
> +  while (true) {
> +    // Handle the block's indentation
> +    if (LookingForIndent) {
> +      while (true) {
> +        auto I = skip_s_space(Current);
> +        if (I == Current)
> +          break;
> +        Current = I;
> +        ++Column;
> +      }
> +      // If there is a non breaking character next, then this is a non empty
> +      // line, thus we can use the discovered indentation as the block's
> +      // indentation.
> +      if (skip_nb_char(Current) != Current) {
> +        if (Column <= BlockExitIndent) {
> +          // This is the end of the block literal, so we exit the loop.
> +          break;
> +        } else {
> +          BlockIndent = Column;
> +          LookingForIndent = false;
> +          // So here we enforce the rule that the leading all space lines can't
> +          // have more characters than the block's indentation level
> +          if (MaxAllSpaceLineCharacters > BlockIndent) {
> +            setError("A leading all-space line must not have too many spaces",
> +                     LongestAllSpaceLine);
> +          }
> +        }
> +      } else if (skip_b_break(Current) != Current) {
> +        // This is an all space line, so we have to record the amount of
> +        // space characters it has so that later when we find the first
> +        // text line that defines the indentation level we can make sure
> +        // that all previous all space lines don't have more space
> +        // characters than the indentation level.
> +        if (Column > MaxAllSpaceLineCharacters) {
> +          MaxAllSpaceLineCharacters = Column;
> +          LongestAllSpaceLine = Current;
> +        }
> +      }
> +    } else {
> +      while (Column < BlockIndent) {
> +        auto I = skip_s_space(Current);
> +        if (I == Current)
> +          break;
> +        Current = I;
> +        ++Column;
> +      }
> +      // If this line isn't empty then we have to check the indentation to
> +      // see if the block scalar ends.
> +      if (skip_nb_char(Current) != Current) {
> +        if (Column <= BlockExitIndent) {
> +          // This is the end of the block literal, exit the loop.
> +          break;
> +        } else if (Column < BlockIndent) {
> +          if (Current != End && *Current == '#') {
> +            // This is a trailing comment, exit the loop.
> +            break;
> +          } else {
> +            setError("A text line is less indented than the block scalar",
> +                     Current);
> +          }
> +        }
> +      }
> +    }
> +
> +    // Fold this line if necessary, by removing an additional line break.
> +    // Folding is applied to text lines which follow other text lines and don't
> +    // have any extra spaces (or tabs) after the indentation spaces.
> +    bool IsLineFolded = false;
> +    if (!IsLiteral && FoldNextLine && Column == BlockIndent &&
> +        skip_s_white(Current) == Current && skip_nb_char(Current) != Current &&
> +        LineBreaks) {
> +      IsLineFolded = true;
> +      --LineBreaks;
> +    }
> +
> +    // Parse the current line.
> +    auto Start = Current;
> +    Current = skip_while(&Scanner::skip_nb_char, Current);
> +    Column += (Current - Start);
> +    if (Start != Current) {
> +      for (unsigned I = 0; I < LineBreaks; ++I)
> +        OS << "\n";
> +      if (IsLineFolded && !LineBreaks)
> +        OS << ' ';
> +      LineBreaks = 0;
> +      auto Line = StringRef(Start, Current - Start);
> +      OS << Line;
> +      // Don't fold the next line when this line ends with a whitespace
> +      // character.
> +      FoldNextLine = Line.back() != ' ' && Line.back() != '\t';
> +    } else {
> +      FoldNextLine = false;
> +    }
> +
> +    // Check for EOF.
> +    if (Current == End) {
> +      // Ensure that there is at least one line break before the end of file.
> +      if (!LineBreaks)
> +        LineBreaks = 1;
> +      break;
> +    }
> +    auto I = skip_b_break(Current);
> +    if (I == Current) {
> +      // There was an error, which should already have been printed out.
> +      return false;
> +    }
> +    // We've got a line break
> +    Column = 0;
> +    ++Line;
> +    Current = I;
> +    ++LineBreaks;
> +  }
> +
> +  // Don't output any trailing new lines if the stripping chomping behaviour is
> +  // specified.
> +  if (ChompingIndicator == '-')
> +    LineBreaks = 0;
> +  // Clip trailing lines (default chomping behaviour). The final line break
> +  // is preserved as long as the string isn't empty, but the other trailing
> +  // lines aren't kept.
> +  else if (ChompingIndicator != '+')
> +    LineBreaks = OS.str().empty() ? 0 : 1;
> +
> +  for (unsigned I = 0; I < LineBreaks; ++I)
> +    OS << "\n";
> +
> +  // New lines may start a simple key.
> +  if (!FlowLevel)
> +    IsSimpleKeyAllowed = true;
>  
>    Token T;
> -  T.Kind = Token::TK_Scalar;
> -  T.Range = StringRef(Start, Current - Start);
> +  T.Kind = Token::TK_BlockScalar;
> +  T.Range = StringRef(BlockStart, Current - BlockStart);
> +  T.Value = std::move(OS.str());
>    TokenQueue.push_back(T);
>    return true;
>  }
> @@ -1600,6 +1788,7 @@
>    case NK_Null:
>      return "tag:yaml.org,2002:null";
>    case NK_Scalar:
> +  case NK_BlockScalar:
>      // TODO: Tag resolution.
>      return "tag:yaml.org,2002:str";
>    case NK_Mapping:
> @@ -2131,6 +2320,11 @@
>                  , AnchorInfo.Range.substr(1)
>                  , TagInfo.Range
>                  , T.Range);
> +  case Token::TK_BlockScalar:
> +    getNext();
> +    return new (NodeAllocator)
> +        BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1),
> +                        TagInfo.Range, T.Value, T.Range);
>    case Token::TK_Key:
>      // Don't eat the TK_Key, KeyValueNode expects it.
>      return new (NodeAllocator)