[libcxx-commits] [flang] [lldb] [clang] [llvm] [lld] [clang-tools-extra] [libcxx] [compiler-rt] [YAMLParser] Unfold multi-line scalar values (PR #70898)
Scott Linder via libcxx-commits
libcxx-commits at lists.llvm.org
Mon Nov 6 14:15:38 PST 2023
================
@@ -2030,187 +2030,219 @@ bool Node::failed() const {
}
StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
- // TODO: Handle newlines properly. We need to remove leading whitespace.
- if (Value[0] == '"') { // Double quoted.
- // Pull off the leading and trailing "s.
- StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
- // Search for characters that would require unescaping the value.
- StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
- if (i != StringRef::npos)
- return unescapeDoubleQuoted(UnquotedValue, i, Storage);
+ if (Value[0] == '"')
+ return getDoubleQuotedValue(Value, Storage);
+ if (Value[0] == '\'')
+ return getSingleQuotedValue(Value, Storage);
+ return getPlainValue(Value, Storage);
+}
+
+static StringRef
+parseScalarValue(StringRef UnquotedValue, SmallVectorImpl<char> &Storage,
+ StringRef LookupChars,
+ std::function<StringRef(StringRef, SmallVectorImpl<char> &)>
+ UnescapeCallback) {
+ size_t I = UnquotedValue.find_first_of(LookupChars);
+ if (I == StringRef::npos)
return UnquotedValue;
- } else if (Value[0] == '\'') { // Single quoted.
- // Pull off the leading and trailing 's.
- StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
- StringRef::size_type i = UnquotedValue.find('\'');
- if (i != StringRef::npos) {
- // We're going to need Storage.
- Storage.clear();
- Storage.reserve(UnquotedValue.size());
- for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
- StringRef Valid(UnquotedValue.begin(), i);
- llvm::append_range(Storage, Valid);
- Storage.push_back('\'');
- UnquotedValue = UnquotedValue.substr(i + 2);
- }
- llvm::append_range(Storage, UnquotedValue);
- return StringRef(Storage.begin(), Storage.size());
- }
- return UnquotedValue;
- }
- // Plain.
- // Trim whitespace ('b-char' and 's-white').
- // NOTE: Alternatively we could change the scanner to not include whitespace
- // here in the first place.
- return Value.rtrim("\x0A\x0D\x20\x09");
-}
-StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
- , StringRef::size_type i
- , SmallVectorImpl<char> &Storage)
- const {
- // Use Storage to build proper value.
Storage.clear();
Storage.reserve(UnquotedValue.size());
- for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
- // Insert all previous chars into Storage.
- StringRef Valid(UnquotedValue.begin(), i);
- llvm::append_range(Storage, Valid);
- // Chop off inserted chars.
- UnquotedValue = UnquotedValue.substr(i);
-
- assert(!UnquotedValue.empty() && "Can't be empty!");
-
- // Parse escape or line break.
- switch (UnquotedValue[0]) {
- case '\r':
- case '\n':
- Storage.push_back('\n');
- if ( UnquotedValue.size() > 1
- && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
- UnquotedValue = UnquotedValue.substr(1);
- UnquotedValue = UnquotedValue.substr(1);
- break;
- default:
- if (UnquotedValue.size() == 1) {
- Token T;
- T.Range = StringRef(UnquotedValue.begin(), 1);
- setError("Unrecognized escape code", T);
- return "";
- }
- UnquotedValue = UnquotedValue.substr(1);
- switch (UnquotedValue[0]) {
- default: {
- Token T;
- T.Range = StringRef(UnquotedValue.begin(), 1);
- setError("Unrecognized escape code", T);
- return "";
- }
- case '\r':
- case '\n':
- // Remove the new line.
- if ( UnquotedValue.size() > 1
- && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
- UnquotedValue = UnquotedValue.substr(1);
- // If this was just a single byte newline, it will get skipped
- // below.
- break;
- case '0':
- Storage.push_back(0x00);
- break;
- case 'a':
- Storage.push_back(0x07);
- break;
- case 'b':
- Storage.push_back(0x08);
- break;
- case 't':
- case 0x09:
- Storage.push_back(0x09);
- break;
- case 'n':
- Storage.push_back(0x0A);
- break;
- case 'v':
- Storage.push_back(0x0B);
- break;
- case 'f':
- Storage.push_back(0x0C);
- break;
- case 'r':
- Storage.push_back(0x0D);
- break;
- case 'e':
- Storage.push_back(0x1B);
- break;
+ char LastNewLineAddedAs = '\0';
+ for (; I != StringRef::npos; I = UnquotedValue.find_first_of(LookupChars)) {
+ if (UnquotedValue[I] != '\x0D' && UnquotedValue[I] != '\x0A') {
----------------
slinder1 wrote:
Is there a reason to change from the "mnemonic" escape to a codepoint value one? E.g. is there a case where `'\r' != '\x0D'`, or is this just a stylistic change?
If there is a technical rationale, can we at least make the value symbolic with a `constexpr char CarriageReturn = '\x0D';` somewhere? And then ditto for all other magic numbers here
https://github.com/llvm/llvm-project/pull/70898
More information about the libcxx-commits
mailing list