[llvm] r328661 - [YAML] Escape non-printable multibyte UTF8 in Output::scalarString.
Graydon Hoare via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 27 12:52:45 PDT 2018
Author: graydon
Date: Tue Mar 27 12:52:45 2018
New Revision: 328661
URL: http://llvm.org/viewvc/llvm-project?rev=328661&view=rev
Log:
[YAML] Escape non-printable multibyte UTF8 in Output::scalarString.
The existing YAML Output::scalarString code path includes a partial and
incorrect implementation of YAML escaping logic. In particular, the logic put
in place in rL321283 escapes non-printable bytes only if they are not part of a
multibyte UTF8 sequence; implicitly this means that all multibyte UTF8
sequences -- printable and non -- are passed through verbatim.
The simplest solution to this is to direct the Output::scalarString method to
use the standalone yaml::escape function, and this _almost_ works, except that
the existing code in that function _over_ escapes: any multibyte UTF8 sequence
is escaped, even printable ones. While this is permitted for YAML, it is also
more aggressive (and hard to read for non-English locales) than necessary,
and the entire point of rL321283 was to back off such aggressive over-escaping.
So in this change, I have both redirected Output::scalarString to use
yaml::escape _and_ modified yaml::escape to optionally restrict its escaping to
non-printables. This preserves behaviour of any existing clients while giving
them a path to more moderate escaping should they desire.
Reviewers: JDevlieghere, thegameg, MatzeB, vladimir.plyashkun
Reviewed By: thegameg
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D44863
Modified:
llvm/trunk/include/llvm/Support/YAMLParser.h
llvm/trunk/lib/Support/YAMLParser.cpp
llvm/trunk/lib/Support/YAMLTraits.cpp
llvm/trunk/unittests/Support/YAMLIOTest.cpp
Modified: llvm/trunk/include/llvm/Support/YAMLParser.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Support/YAMLParser.h?rev=328661&r1=328660&r2=328661&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Support/YAMLParser.h (original)
+++ llvm/trunk/include/llvm/Support/YAMLParser.h Tue Mar 27 12:52:45 2018
@@ -73,8 +73,11 @@ bool dumpTokens(StringRef Input, raw_ost
/// \returns true if there was an error, false otherwise.
bool scanTokens(StringRef Input);
-/// \brief Escape \a Input for a double quoted scalar.
-std::string escape(StringRef Input);
+/// \brief Escape \a Input for a double quoted scalar; if \p EscapePrintable
+/// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is
+/// false, those UTF8 sequences encoding printable unicode scalars will not be
+/// escaped, but emitted verbatim.
+std::string escape(StringRef Input, bool EscapePrintable = true);
/// \brief This class represents a YAML stream potentially containing multiple
/// documents.
Modified: llvm/trunk/lib/Support/YAMLParser.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Support/YAMLParser.cpp?rev=328661&r1=328660&r2=328661&view=diff
==============================================================================
--- llvm/trunk/lib/Support/YAMLParser.cpp (original)
+++ llvm/trunk/lib/Support/YAMLParser.cpp Tue Mar 27 12:52:45 2018
@@ -26,6 +26,7 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/Unicode.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -687,7 +688,7 @@ bool yaml::scanTokens(StringRef Input) {
return true;
}
-std::string yaml::escape(StringRef Input) {
+std::string yaml::escape(StringRef Input, bool EscapePrintable) {
std::string EscapedInput;
for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
if (*i == '\\')
@@ -734,6 +735,9 @@ std::string yaml::escape(StringRef Input
EscapedInput += "\\L";
else if (UnicodeScalarValue.first == 0x2029)
EscapedInput += "\\P";
+ else if (!EscapePrintable &&
+ sys::unicode::isPrintable(UnicodeScalarValue.first))
+ EscapedInput += StringRef(i, UnicodeScalarValue.second);
else {
std::string HexStr = utohexstr(UnicodeScalarValue.first);
if (HexStr.size() <= 2)
Modified: llvm/trunk/lib/Support/YAMLTraits.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Support/YAMLTraits.cpp?rev=328661&r1=328660&r2=328661&view=diff
==============================================================================
--- llvm/trunk/lib/Support/YAMLTraits.cpp (original)
+++ llvm/trunk/lib/Support/YAMLTraits.cpp Tue Mar 27 12:52:45 2018
@@ -638,39 +638,22 @@ void Output::scalarString(StringRef &S,
const char *Base = S.data();
const char *const Quote = MustQuote == QuotingType::Single ? "'" : "\"";
- const char QuoteChar = MustQuote == QuotingType::Single ? '\'' : '"';
-
output(Quote); // Starting quote.
- // When using single-quoted strings, any single quote ' must be doubled to be
- // escaped.
- // When using double-quoted strings, print \x + hex for non-printable ASCII
- // characters, and escape double quotes.
- while (j < End) {
- if (S[j] == QuoteChar) { // Escape quotes.
- output(StringRef(&Base[i], j - i)); // "flush".
- if (MustQuote == QuotingType::Double) { // Print it as \"
- output(StringLiteral("\\"));
- output(StringRef(Quote, 1));
- } else { // Single
- output(StringLiteral("''")); // Print it as ''
- }
- i = j + 1;
- } else if (MustQuote == QuotingType::Double &&
- !sys::unicode::isPrintable(S[j]) && (S[j] & 0x80) == 0) {
- // If we're double quoting non-printable characters, we prefer printing
- // them as "\x" + their hex representation. Note that special casing is
- // needed for UTF-8, where a byte may be part of a UTF-8 sequence and
- // appear as non-printable, in which case we want to print the correct
- // unicode character and not its hex representation.
- output(StringRef(&Base[i], j - i)); // "flush"
- output(StringLiteral("\\x"));
-
- // Output the byte 0x0F as \x0f.
- auto FormattedHex = format_hex_no_prefix(S[j], 2);
- Out << FormattedHex;
- Column += 4; // one for the '\', one for the 'x', and two for the hex
+ // When using double-quoted strings (and only in that case), non-printable characters may be
+ // present, and will be escaped using a variety of unicode-scalar and special short-form
+ // escapes. This is handled in yaml::escape.
+ if (MustQuote == QuotingType::Double) {
+ output(yaml::escape(Base, /* EscapePrintable= */ false));
+ this->outputUpToEndOfLine(Quote);
+ return;
+ }
+ // When using single-quoted strings, any single quote ' must be doubled to be escaped.
+ while (j < End) {
+ if (S[j] == '\'') { // Escape quotes.
+ output(StringRef(&Base[i], j - i)); // "flush".
+ output(StringLiteral("''")); // Print it as ''
i = j + 1;
}
++j;
Modified: llvm/trunk/unittests/Support/YAMLIOTest.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/unittests/Support/YAMLIOTest.cpp?rev=328661&r1=328660&r2=328661&view=diff
==============================================================================
--- llvm/trunk/unittests/Support/YAMLIOTest.cpp (original)
+++ llvm/trunk/unittests/Support/YAMLIOTest.cpp Tue Mar 27 12:52:45 2018
@@ -2464,7 +2464,10 @@ static void TestEscaped(llvm::StringRef
yamlize(xout, Input, true, Ctx);
ostr.flush();
- EXPECT_EQ(Expected, out);
+
+ // Make a separate StringRef so we get nice byte-by-byte output.
+ llvm::StringRef Got(out);
+ EXPECT_EQ(Expected, Got);
}
TEST(YAMLIO, TestEscaped) {
@@ -2485,4 +2488,17 @@ TEST(YAMLIO, TestEscaped) {
// UTF8 with single quote inside double quote
TestEscaped("parameter 'паÑамеÑÑ' is unused",
"\"parameter 'паÑамеÑÑ' is unused\"");
+
+ // String with embedded non-printable multibyte UTF-8 sequence (U+200B
+ // zero-width space). The thing to test here is that we emit a
+ // unicode-scalar level escape like \uNNNN (at the YAML level), and don't
+ // just pass the UTF-8 byte sequence through as with quoted printables.
+ TestEscaped("foo\u200Bbar", "\"foo\\u200Bbar\"");
+ {
+ const unsigned char foobar[10] = {'f', 'o', 'o',
+ 0xE2, 0x80, 0x8B, // UTF-8 of U+200B
+ 'b', 'a', 'r',
+ 0x0};
+ TestEscaped((char const *)foobar, "\"foo\\u200Bbar\"");
+ }
}
More information about the llvm-commits
mailing list