[llvm] 2980933 - [YAMLIO] Support non-null-terminated inputs

Scott Linder via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 18 15:15:58 PST 2020


Author: Scott Linder
Date: 2020-11-18T23:06:03Z
New Revision: 2980933d850b7506a1a96f8d11588b71956f4089

URL: https://github.com/llvm/llvm-project/commit/2980933d850b7506a1a96f8d11588b71956f4089
DIFF: https://github.com/llvm/llvm-project/commit/2980933d850b7506a1a96f8d11588b71956f4089.diff

LOG: [YAMLIO] Support non-null-terminated inputs

In some places the parser guards against dereferencing `End`, while in
others it relies on the presence of a trailing `'\0'` to elide checks.

Add the remaining guards needed to ensure the parser never attempts to
dereference `End`, making it safe to not require a null-terminated input
buffer.

Update the parser fuzzer harness so that it tests with buffers that are
guaranteed to be non-null-terminated, null-terminated, and 1-terminated,
additionally ensuring the result of the parse is the same in each case.

Some of the regression tests were written by inspection, and some are
cases caught by the fuzzer which required additional fixes in the
parser.

Differential Revision: https://reviews.llvm.org/D84050

Added: 
    

Modified: 
    llvm/lib/Support/YAMLParser.cpp
    llvm/tools/llvm-yaml-parser-fuzzer/yaml-parser-fuzzer.cpp
    llvm/unittests/Support/YAMLIOTest.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Support/YAMLParser.cpp b/llvm/lib/Support/YAMLParser.cpp
index 06c3120bca02..755f93cde171 100644
--- a/llvm/lib/Support/YAMLParser.cpp
+++ b/llvm/lib/Support/YAMLParser.cpp
@@ -200,13 +200,12 @@ static UTF8Decoded decodeUTF8(StringRef Range) {
   StringRef::iterator End = Range.end();
   // 1 byte: [0x00, 0x7f]
   // Bit pattern: 0xxxxxxx
-  if ((*Position & 0x80) == 0) {
-     return std::make_pair(*Position, 1);
+  if (Position < End && (*Position & 0x80) == 0) {
+    return std::make_pair(*Position, 1);
   }
   // 2 bytes: [0x80, 0x7ff]
   // Bit pattern: 110xxxxx 10xxxxxx
-  if (Position + 1 != End &&
-      ((*Position & 0xE0) == 0xC0) &&
+  if (Position + 1 < End && ((*Position & 0xE0) == 0xC0) &&
       ((*(Position + 1) & 0xC0) == 0x80)) {
     uint32_t codepoint = ((*Position & 0x1F) << 6) |
                           (*(Position + 1) & 0x3F);
@@ -215,8 +214,7 @@ static UTF8Decoded decodeUTF8(StringRef Range) {
   }
   // 3 bytes: [0x8000, 0xffff]
   // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
-  if (Position + 2 != End &&
-      ((*Position & 0xF0) == 0xE0) &&
+  if (Position + 2 < End && ((*Position & 0xF0) == 0xE0) &&
       ((*(Position + 1) & 0xC0) == 0x80) &&
       ((*(Position + 2) & 0xC0) == 0x80)) {
     uint32_t codepoint = ((*Position & 0x0F) << 12) |
@@ -230,8 +228,7 @@ static UTF8Decoded decodeUTF8(StringRef Range) {
   }
   // 4 bytes: [0x10000, 0x10FFFF]
   // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-  if (Position + 3 != End &&
-      ((*Position & 0xF8) == 0xF0) &&
+  if (Position + 3 < End && ((*Position & 0xF8) == 0xF0) &&
       ((*(Position + 1) & 0xC0) == 0x80) &&
       ((*(Position + 2) & 0xC0) == 0x80) &&
       ((*(Position + 3) & 0xC0) == 0x80)) {
@@ -773,7 +770,7 @@ void Scanner::init(MemoryBufferRef Buffer) {
   IsSimpleKeyAllowed = true;
   Failed = false;
   std::unique_ptr<MemoryBuffer> InputBufferOwner =
-      MemoryBuffer::getMemBuffer(Buffer);
+      MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false);
   SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc());
 }
 
@@ -1036,7 +1033,7 @@ bool Scanner::rollIndent( int ToColumn
 }
 
 void Scanner::skipComment() {
-  if (*Current != '#')
+  if (Current == End || *Current != '#')
     return;
   while (true) {
     // This may skip more than one byte, thus Column is only incremented
@@ -1051,7 +1048,7 @@ void Scanner::skipComment() {
 
 void Scanner::scanToNextToken() {
   while (true) {
-    while (*Current == ' ' || *Current == '\t') {
+    while (Current != End && (*Current == ' ' || *Current == '\t')) {
       skip(1);
     }
 
@@ -1286,7 +1283,7 @@ bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
              && wasEscaped(Start + 1, Current));
   } else {
     skip(1);
-    while (true) {
+    while (Current != End) {
       // Skip a ' followed by another '.
       if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
         skip(2);
@@ -1334,13 +1331,14 @@ bool Scanner::scanPlainScalar() {
   unsigned LeadingBlanks = 0;
   assert(Indent >= -1 && "Indent must be >= -1 !");
   unsigned indent = static_cast<unsigned>(Indent + 1);
-  while (true) {
+  while (Current != End) {
     if (*Current == '#')
       break;
 
-    while (!isBlankOrBreak(Current)) {
-      if (  FlowLevel && *Current == ':'
-          && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) {
+    while (Current != End && !isBlankOrBreak(Current)) {
+      if (FlowLevel && *Current == ':' &&
+          (Current + 1 == End ||
+           !(isBlankOrBreak(Current + 1) || *(Current + 1) == ','))) {
         setError("Found unexpected ':' while scanning a plain scalar", Current);
         return false;
       }
@@ -1410,7 +1408,7 @@ bool Scanner::scanAliasOrAnchor(bool IsAlias) {
   StringRef::iterator Start = Current;
   unsigned ColStart = Column;
   skip(1);
-  while(true) {
+  while (Current != End) {
     if (   *Current == '[' || *Current == ']'
         || *Current == '{' || *Current == '}'
         || *Current == ','

diff  --git a/llvm/tools/llvm-yaml-parser-fuzzer/yaml-parser-fuzzer.cpp b/llvm/tools/llvm-yaml-parser-fuzzer/yaml-parser-fuzzer.cpp
index edce19f7b1b3..3aeaf02f3e75 100644
--- a/llvm/tools/llvm-yaml-parser-fuzzer/yaml-parser-fuzzer.cpp
+++ b/llvm/tools/llvm-yaml-parser-fuzzer/yaml-parser-fuzzer.cpp
@@ -21,12 +21,32 @@ static bool isValidYaml(const uint8_t *Data, size_t Size) {
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
   std::vector<uint8_t> Input(Data, Data + Size);
 
+  // Ensure we don't crash on any arbitrary byte string.
+  isValidYaml(Input.data(), Input.size());
+
+  // Ensure we don't crash on byte strings with no null characters.
+  Input.erase(std::remove(Input.begin(), Input.end(), 0), Input.end());
+  Input.shrink_to_fit();
+  bool IsValidWithout0s = isValidYaml(Input.data(), Input.size());
+
   // Ensure we don't crash on byte strings where the only null character is
   // one-past-the-end of the actual input to the parser.
-  Input.erase(std::remove(Input.begin(), Input.end(), 0), Input.end());
   Input.push_back(0);
   Input.shrink_to_fit();
-  isValidYaml(Input.data(), Input.size() - 1);
+  bool IsValidWhen0Terminated = isValidYaml(Input.data(), Input.size() - 1);
+
+  // Ensure we don't crash on byte strings with no null characters, but with
+  // an invalid character one-past-the-end of the actual input to the parser.
+  Input.back() = 1;
+  bool IsValidWhen1Terminated = isValidYaml(Input.data(), Input.size() - 1);
+
+  // The parser should either accept all of these inputs, or reject all of
+  // them, because the parser sees an identical byte string in each case. This
+  // should hopefully catch some cases where the parser is sensitive to what is
+  // present one-past-the-end of the actual input.
+  if (IsValidWithout0s != IsValidWhen0Terminated ||
+      IsValidWhen0Terminated != IsValidWhen1Terminated)
+    LLVM_BUILTIN_TRAP;
 
   return 0;
 }

diff  --git a/llvm/unittests/Support/YAMLIOTest.cpp b/llvm/unittests/Support/YAMLIOTest.cpp
index e4e3fe09d14c..863861fc3227 100644
--- a/llvm/unittests/Support/YAMLIOTest.cpp
+++ b/llvm/unittests/Support/YAMLIOTest.cpp
@@ -3111,5 +3111,95 @@ TEST(YAMLIO, TestEmptyAlias) {
 TEST(YAMLIO, TestEmptyAnchor) {
   Input yin("*");
   EXPECT_FALSE(yin.setCurrentDocument());
+}
+
+TEST(YAMLIO, TestScannerNoNullEmpty) {
+  std::vector<char> str{};
+  Input yin(llvm::StringRef(str.data(), str.size()));
+  yin.setCurrentDocument();
+  EXPECT_FALSE(yin.error());
+}
+
+TEST(YAMLIO, TestScannerNoNullSequenceOfNull) {
+  std::vector<char> str{'-'};
+  Input yin(llvm::StringRef(str.data(), str.size()));
+  yin.setCurrentDocument();
+  EXPECT_FALSE(yin.error());
+}
+
+TEST(YAMLIO, TestScannerNoNullSimpleSequence) {
+  std::vector<char> str{'-', ' ', 'a'};
+  Input yin(llvm::StringRef(str.data(), str.size()));
+  yin.setCurrentDocument();
+  EXPECT_FALSE(yin.error());
+}
+
+TEST(YAMLIO, TestScannerNoNullUnbalancedMap) {
+  std::vector<char> str{'{'};
+  Input yin(llvm::StringRef(str.data(), str.size()));
+  yin.setCurrentDocument();
+  EXPECT_TRUE(yin.error());
+}
+
+TEST(YAMLIO, TestScannerNoNullEmptyMap) {
+  std::vector<char> str{'{', '}'};
+  Input yin(llvm::StringRef(str.data(), str.size()));
+  yin.setCurrentDocument();
+  EXPECT_FALSE(yin.error());
+}
+
+TEST(YAMLIO, TestScannerNoNullUnbalancedSequence) {
+  std::vector<char> str{'['};
+  Input yin(llvm::StringRef(str.data(), str.size()));
+  yin.setCurrentDocument();
+  EXPECT_TRUE(yin.error());
+}
+
+TEST(YAMLIO, TestScannerNoNullEmptySequence) {
+  std::vector<char> str{'[', ']'};
+  Input yin(llvm::StringRef(str.data(), str.size()));
+  yin.setCurrentDocument();
+  EXPECT_FALSE(yin.error());
+}
+
+TEST(YAMLIO, TestScannerNoNullScalarUnbalancedDoubleQuote) {
+  std::vector<char> str{'"'};
+  Input yin(llvm::StringRef(str.data(), str.size()));
+  yin.setCurrentDocument();
+  EXPECT_TRUE(yin.error());
+}
+
+TEST(YAMLIO, TestScannerNoNullScalarUnbalancedSingleQuote) {
+  std::vector<char> str{'\''};
+  Input yin(llvm::StringRef(str.data(), str.size()));
+  yin.setCurrentDocument();
+  EXPECT_TRUE(yin.error());
+}
+
+TEST(YAMLIO, TestScannerNoNullEmptyAlias) {
+  std::vector<char> str{'&'};
+  Input yin(llvm::StringRef(str.data(), str.size()));
+  yin.setCurrentDocument();
+  EXPECT_TRUE(yin.error());
+}
+
+TEST(YAMLIO, TestScannerNoNullEmptyAnchor) {
+  std::vector<char> str{'*'};
+  Input yin(llvm::StringRef(str.data(), str.size()));
+  yin.setCurrentDocument();
+  EXPECT_TRUE(yin.error());
+}
+
+TEST(YAMLIO, TestScannerNoNullDecodeInvalidUTF8) {
+  std::vector<char> str{'\xef'};
+  Input yin(llvm::StringRef(str.data(), str.size()));
+  yin.setCurrentDocument();
+  EXPECT_TRUE(yin.error());
+}
+
+TEST(YAMLIO, TestScannerNoNullScanPlainScalarInFlow) {
+  std::vector<char> str{'{', 'a', ':'};
+  Input yin(llvm::StringRef(str.data(), str.size()));
+  yin.setCurrentDocument();
   EXPECT_TRUE(yin.error());
 }


        


More information about the llvm-commits mailing list