[libc-commits] [libc] 7a129f0 - [libc] add scanf parser and core utilities

Michael Jones via libc-commits libc-commits at lists.llvm.org
Fri Oct 28 10:53:01 PDT 2022


Author: Michael Jones
Date: 2022-10-28T10:52:51-07:00
New Revision: 7a129f07562d4b7561bf0ff055e864f6e55982ed

URL: https://github.com/llvm/llvm-project/commit/7a129f07562d4b7561bf0ff055e864f6e55982ed
DIFF: https://github.com/llvm/llvm-project/commit/7a129f07562d4b7561bf0ff055e864f6e55982ed.diff

LOG: [libc] add scanf parser and core utilities

This is the first piece of scanf. It's very similar in design to printf,
and so much of the code is copied from that. There were potential issues
with conflicting macros so I've also renamed the "ASSERT_FORMAT_EQ"
macro for printf to "ASSERT_PFORMAT_EQ".

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D136288

Added: 
    libc/src/stdio/scanf_core/CMakeLists.txt
    libc/src/stdio/scanf_core/core_structs.h
    libc/src/stdio/scanf_core/parser.cpp
    libc/src/stdio/scanf_core/parser.h
    libc/src/stdio/scanf_core/scanf_config.h
    libc/test/src/stdio/scanf_core/CMakeLists.txt
    libc/test/src/stdio/scanf_core/parser_test.cpp
    libc/utils/UnitTest/ScanfMatcher.cpp
    libc/utils/UnitTest/ScanfMatcher.h

Modified: 
    libc/src/stdio/CMakeLists.txt
    libc/test/src/stdio/CMakeLists.txt
    libc/test/src/stdio/printf_core/parser_test.cpp
    libc/utils/UnitTest/CMakeLists.txt
    libc/utils/UnitTest/PrintfMatcher.h

Removed: 
    


################################################################################
diff  --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt
index 6fe3c59311db4..22536a515bd58 100644
--- a/libc/src/stdio/CMakeLists.txt
+++ b/libc/src/stdio/CMakeLists.txt
@@ -3,6 +3,7 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
 endif()
 
 add_subdirectory(printf_core)
+add_subdirectory(scanf_core)
 
 add_entrypoint_object(
   fopen

diff  --git a/libc/src/stdio/scanf_core/CMakeLists.txt b/libc/src/stdio/scanf_core/CMakeLists.txt
new file mode 100644
index 0000000000000..3941d40a838c7
--- /dev/null
+++ b/libc/src/stdio/scanf_core/CMakeLists.txt
@@ -0,0 +1,25 @@
+add_header_library( # Header-only target exposing core_structs.h.
+  core_structs
+  HDRS
+    core_structs.h
+  DEPENDS
+    libc.src.__support.CPP.string_view
+    libc.src.__support.CPP.bitset
+    libc.src.__support.FPUtil.fp_bits
+)
+
+add_object_library( # Object target for the format string parser.
+  parser
+  SRCS
+    parser.cpp
+  HDRS
+    parser.h
+  DEPENDS
+    .core_structs # NOTE(review): parser.cpp also includes FPBits.h directly.
+    libc.src.__support.arg_list
+    libc.src.__support.ctype_utils
+    libc.src.__support.str_to_integer
+    libc.src.__support.CPP.bit
+    libc.src.__support.CPP.bitset
+    libc.src.__support.CPP.string_view
+)

diff  --git a/libc/src/stdio/scanf_core/core_structs.h b/libc/src/stdio/scanf_core/core_structs.h
new file mode 100644
index 0000000000000..213a5e1a2b59a
--- /dev/null
+++ b/libc/src/stdio/scanf_core/core_structs.h
@@ -0,0 +1,91 @@
+//===-- Core Structures for scanf ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_CORE_STRUCTS_H
+#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_CORE_STRUCTS_H
+
+#include "src/__support/CPP/bitset.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/FPUtil/FPBits.h"
+
+#include <inttypes.h>
+#include <stddef.h>
+
+namespace __llvm_libc {
+namespace scanf_core {
+
+// These length modifiers match the length modifiers in the format string, which
+// is why they are formatted differently from the rest of the file.
+enum class LengthModifier { hh, h, l, ll, j, z, t, L, NONE }; // NONE: absent
+
+enum FormatFlags : uint8_t { // Bit flags taken from a conversion specifier.
+  NONE = 0x00,     // no flag present
+  NO_WRITE = 0x01, // *: the parser reads no output pointer for the section
+  ALLOCATE = 0x02, // m: POSIX allocate flag; parsing it is still a TODO
+};
+
+struct FormatSection { // One parsed piece of a scanf format string.
+  bool has_conv = false; // true when the section is a conversion specifier
+  // raw_string is the slice of the format string that this section covers.
+  cpp::string_view raw_string;
+
+  // Format Specifier Values
+  FormatFlags flags = FormatFlags::NONE;
+  LengthModifier length_modifier = LengthModifier::NONE;
+  int max_width = -1; // stays -1 when the specifier gives no width
+
+  // output_ptr is the argument read for the conversion; nullptr under NO_WRITE.
+  void *output_ptr = nullptr;
+
+  char conv_name = '\0'; // conversion character; only set for conversions
+  // scan_set is the set of accepted characters for a '[' conversion.
+  cpp::bitset<256> scan_set;
+  // Compares only the fields that are meaningful for this kind of section.
+  bool operator==(const FormatSection &other) {
+    if (has_conv != other.has_conv)
+      return false;
+    // NOTE(review): could be const if cpp::bitset::operator== is const too.
+    if (raw_string != other.raw_string)
+      return false;
+
+    if (has_conv) {
+      if (!((static_cast<uint8_t>(flags) ==
+             static_cast<uint8_t>(other.flags)) &&
+            (max_width == other.max_width) &&
+            (length_modifier == other.length_modifier) &&
+            (conv_name == other.conv_name)))
+        return false;
+
+      // If the pointers are used, then they should be equal. If the NO_WRITE
+      // flag is set or the conversion is %, then the pointers are not used.
+      // If the pointers are used and they are not equal, return false.
+
+      if (!(((flags & FormatFlags::NO_WRITE) != 0) || (conv_name == '%') ||
+            (output_ptr == other.output_ptr)))
+        return false;
+
+      if (conv_name == '[')
+        return scan_set == other.scan_set;
+    }
+    return true;
+  }
+};
+
+enum ErrorCodes : int { // Status values for scanf conversions.
+  // This is the value to be returned by conversions when no error has occurred.
+  WRITE_OK = 0,
+  // These are the scanf return values for when an error has occurred. They are
+  // all negative, and should be distinct.
+  FILE_READ_ERROR = -1,
+  FILE_STATUS_ERROR = -2,
+  MATCHING_FAILURE = -3,
+};
+} // namespace scanf_core
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_CORE_STRUCTS_H

diff  --git a/libc/src/stdio/scanf_core/parser.cpp b/libc/src/stdio/scanf_core/parser.cpp
new file mode 100644
index 0000000000000..31dd118ad17a7
--- /dev/null
+++ b/libc/src/stdio/scanf_core/parser.cpp
@@ -0,0 +1,220 @@
+//===-- Format string parser implementation for scanf ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// #define LLVM_LIBC_SCANF_DISABLE_INDEX_MODE 1 // This will be a compile flag.
+
+#include "src/stdio/scanf_core/parser.h"
+
+#include "src/__support/arg_list.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/bitset.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/ctype_utils.h"
+#include "src/__support/str_to_integer.h"
+
+namespace __llvm_libc {
+namespace scanf_core {
+
+#ifndef LLVM_LIBC_SCANF_DISABLE_INDEX_MODE // index mode: fetch by index
+#define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index)
+#else // sequential mode: the index argument is ignored
+#define GET_ARG_VAL_SIMPLEST(arg_type, _) get_next_arg_value<arg_type>()
+#endif // LLVM_LIBC_SCANF_DISABLE_INDEX_MODE
+
+// Parses and returns the section of the format string that starts at cur_pos,
+// leaving cur_pos just past it. A section is either a run of raw characters
+// or one conversion; once the string is exhausted raw_string comes back empty.
+FormatSection Parser::get_next_section() {
+  FormatSection section;
+  size_t starting_pos = cur_pos;
+  if (str[cur_pos] == '%') {
+    // format section
+    section.has_conv = true;
+    ++cur_pos;
+    [[maybe_unused]] size_t conv_index = 0;
+
+#ifndef LLVM_LIBC_SCANF_DISABLE_INDEX_MODE
+    conv_index = parse_index(&cur_pos);
+#endif // LLVM_LIBC_SCANF_DISABLE_INDEX_MODE
+
+    if (str[cur_pos] == '*') {
+      ++cur_pos;
+      section.flags = FormatFlags::NO_WRITE;
+    }
+
+    // handle width
+    section.max_width = -1;
+    if (internal::isdigit(str[cur_pos])) {
+      char *int_end;
+      section.max_width =
+          internal::strtointeger<int>(str + cur_pos, &int_end, 10);
+      cur_pos = int_end - str;
+    }
+
+    // TODO(michaelrj): add posix allocate flag support.
+    // if (str[cur_pos] == 'm') {
+    //   ++cur_pos;
+    //   section.flags = FormatFlags::ALLOCATE;
+    // }
+
+    section.length_modifier = parse_length_modifier(&cur_pos);
+    section.conv_name = str[cur_pos];
+
+    // Read the output pointer (every output is a pointer, so void * fits all)
+    // unless NO_WRITE is set or nothing is converted ("%%" or a cut-off spec).
+    if ((section.flags & FormatFlags::NO_WRITE) == 0 &&
+        section.conv_name != '%' && section.conv_name != '\0')
+      section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index);
+
+    if (section.conv_name != '\0') // never step over the terminating NUL
+      ++cur_pos;
+
+    // If the format is a bracketed one, then we need to parse out the insides
+    // of the brackets.
+    if (section.conv_name == '[') {
+      constexpr char CLOSING_BRACKET = ']';
+      constexpr char INVERT_FLAG = '^';
+      constexpr char RANGE_OPERATOR = '-';
+
+      cpp::bitset<256> scan_set;
+      bool invert = false;
+
+      // The circumflex in the first position represents the inversion flag, but
+      // it's easier to apply that at the end so we just store it for now.
+      if (str[cur_pos] == INVERT_FLAG) {
+        invert = true;
+        ++cur_pos;
+      }
+
+      // This is used to determine if a hyphen is being used as a literal or as
+      // a range operator.
+      size_t set_start_pos = cur_pos;
+
+      // Normally the right bracket closes the set, but if it's the first
+      // character (possibly after the inversion flag) then it's instead
+      // included as a character in the set and the second right bracket closes
+      // the set.
+      if (str[cur_pos] == CLOSING_BRACKET) {
+        scan_set.set(CLOSING_BRACKET);
+        ++cur_pos;
+      }
+
+      while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) {
+        // A hyphen is a range operator when it is neither at the beginning
+        // nor at the end of the set.
+        if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos &&
+            str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') {
+          // The range operator is implementation defined, so as a convenience
+          // a reversed range is reordered. The endpoints are read as unsigned
+          // char so bytes above 127 stay valid indices where char is signed.
+          unsigned char a = static_cast<unsigned char>(str[cur_pos - 1]);
+          unsigned char b = static_cast<unsigned char>(str[cur_pos + 1]);
+          unsigned char start = (a < b ? a : b);
+          unsigned char end = (a < b ? b : a);
+          scan_set.set_range(start, end);
+          cur_pos += 2;
+        } else {
+          scan_set.set(static_cast<unsigned char>(str[cur_pos]));
+          ++cur_pos;
+        }
+      }
+      if (invert)
+        scan_set.flip();
+
+      if (str[cur_pos] == CLOSING_BRACKET) {
+        ++cur_pos;
+        section.scan_set = scan_set;
+      } else {
+        // if the end of the string was encountered, this is not a valid set.
+        section.has_conv = false;
+      }
+    }
+  } else {
+    // raw section
+    section.has_conv = false;
+    while (str[cur_pos] != '%' && str[cur_pos] != '\0')
+      ++cur_pos;
+  }
+  section.raw_string = {str + starting_pos, cur_pos - starting_pos};
+  return section;
+}
+
+LengthModifier Parser::parse_length_modifier(size_t *local_pos) {
+  switch (str[*local_pos]) { // *local_pos only moves when a modifier matches
+  case ('l'):
+    if (str[*local_pos + 1] == 'l') { // "ll" takes two characters
+      *local_pos += 2;
+      return LengthModifier::ll;
+    } else {
+      ++*local_pos;
+      return LengthModifier::l;
+    }
+  case ('h'):
+    if (str[*local_pos + 1] == 'h') { // "hh" takes two characters
+      *local_pos += 2;
+      return LengthModifier::hh;
+    } else {
+      ++*local_pos;
+      return LengthModifier::h;
+    }
+  case ('L'):
+    ++*local_pos;
+    return LengthModifier::L;
+  case ('j'):
+    ++*local_pos;
+    return LengthModifier::j;
+  case ('z'):
+    ++*local_pos;
+    return LengthModifier::z;
+  case ('t'):
+    ++*local_pos;
+    return LengthModifier::t;
+  default: // no length modifier here; leave *local_pos untouched
+    return LengthModifier::NONE;
+  }
+}
+
+//----------------------------------------------------
+// INDEX MODE ONLY FUNCTIONS AFTER HERE:
+//----------------------------------------------------
+
+#ifndef LLVM_LIBC_SCANF_DISABLE_INDEX_MODE
+
+size_t Parser::parse_index(size_t *local_pos) { // Parses an optional "n$".
+  if (internal::isdigit(str[*local_pos])) {
+    char *int_end;
+    size_t index =
+        internal::strtointeger<size_t>(str + *local_pos, &int_end, 10);
+    if (int_end[0] != '$') // no '$' follows the digits, so consume nothing
+      return 0;
+    *local_pos = 1 + int_end - str; // step past the '$'
+    return index;
+  }
+  return 0;
+}
+
+void Parser::args_to_index(size_t index) { // Makes index the next argument.
+  if (args_index > index) { // already past it: restart from the first argument
+    args_index = 1;
+    args_cur = args_start;
+  }
+
+  while (args_index < index) {
+    // Since all arguments must be pointers, we can just read all of them as
+    // void * and not worry about type issues.
+    args_cur.next_var<void *>();
+    ++args_index;
+  }
+}
+
+#endif // LLVM_LIBC_SCANF_DISABLE_INDEX_MODE
+
+} // namespace scanf_core
+} // namespace __llvm_libc

diff  --git a/libc/src/stdio/scanf_core/parser.h b/libc/src/stdio/scanf_core/parser.h
new file mode 100644
index 0000000000000..ced841f1d6ca1
--- /dev/null
+++ b/libc/src/stdio/scanf_core/parser.h
@@ -0,0 +1,99 @@
+//===-- Format string parser for scanf -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
+#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
+
+#include "src/__support/arg_list.h"
+#include "src/stdio/scanf_core/core_structs.h"
+#include "src/stdio/scanf_core/scanf_config.h"
+
+#include <stddef.h>
+
+namespace __llvm_libc {
+namespace scanf_core {
+
+class Parser { // Splits a scanf format string into FormatSections.
+  const char *__restrict str;
+
+  size_t cur_pos = 0; // index into str of the next unparsed character
+  internal::ArgList args_cur;
+
+#ifndef LLVM_LIBC_SCANF_DISABLE_INDEX_MODE
+  // args_start stores the start of the va_args, which is used when a previous
+  // argument is needed. In that case, we have to read the arguments from the
+  // beginning since they don't support reading backwards.
+  internal::ArgList args_start;
+  size_t args_index = 1; // 1-based index of the argument args_cur yields next
+#endif // LLVM_LIBC_SCANF_DISABLE_INDEX_MODE
+
+public:
+#ifndef LLVM_LIBC_SCANF_DISABLE_INDEX_MODE
+  Parser(const char *__restrict new_str, internal::ArgList &args)
+      : str(new_str), args_cur(args), args_start(args) {}
+#else
+  Parser(const char *__restrict new_str, internal::ArgList &args)
+      : str(new_str), args_cur(args) {}
+#endif // LLVM_LIBC_SCANF_DISABLE_INDEX_MODE
+
+  // get_next_section will parse the format string until it has a fully
+  // specified format section. This can either be a raw format section with no
+  // conversion, or a format section with a conversion that has all of its
+  // variables stored in the format section.
+  FormatSection get_next_section();
+
+private:
+  // parse_length_modifier parses the length modifier inside a format string. It
+  // assumes that str[*local_pos] is inside a format specifier. It returns a
+  // LengthModifier with the length modifier it found. It will advance local_pos
+  // past the length modifier if one is found.
+  LengthModifier parse_length_modifier(size_t *local_pos);
+
+  // get_next_arg_value gets the next value from the arg list as type T.
+  template <class T> T inline get_next_arg_value() {
+    return args_cur.next_var<T>();
+  }
+
+  //----------------------------------------------------
+  // INDEX MODE ONLY FUNCTIONS AFTER HERE:
+  //----------------------------------------------------
+
+#ifndef LLVM_LIBC_SCANF_DISABLE_INDEX_MODE
+
+  // parse_index parses the index of a value inside a format string. It
+  // assumes that str[*local_pos] points to the character after a '%', and
+  // returns 0 if there is no closing $, or if it finds no number. If it finds a
+  // number followed by a $, it will move local_pos past the $, else it will
+  // not move local_pos.
+  size_t parse_index(size_t *local_pos);
+
+  // get_arg_value gets the value from the arg list at index (starting at 1).
+  // This may require skipping ahead or rewinding in the argument list. An index
+  // of 0 is interpreted as the next value.
+  template <class T> T inline get_arg_value(size_t index) {
+    if (!(index == 0 || index == args_index))
+      args_to_index(index);
+
+    ++args_index;
+    return get_next_arg_value<T>();
+  }
+
+  // the ArgList can only return the next item in the list. This function is
+  // used in index mode when the item that needs to be read is not the next one.
+  // It moves args_cur to the index requested so the appropriate value may be
+  // read, rewinding to args_start first when the index is behind args_cur. All
+  // arguments are read as void *, so the format string is never re-parsed.
+  void args_to_index(size_t index);
+
+#endif // LLVM_LIBC_SCANF_DISABLE_INDEX_MODE
+};
+
+} // namespace scanf_core
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H

diff  --git a/libc/src/stdio/scanf_core/scanf_config.h b/libc/src/stdio/scanf_core/scanf_config.h
new file mode 100644
index 0000000000000..81c9c7a3af52d
--- /dev/null
+++ b/libc/src/stdio/scanf_core/scanf_config.h
@@ -0,0 +1,24 @@
+//===-- Scanf Configuration Handler ----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_SCANF_CONFIG_H
+#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_SCANF_CONFIG_H
+
+// These macros can be set or unset to adjust scanf behavior at compile time.
+
+// This flag disables all functionality relating to floating point numbers. This
+// can be useful for embedded systems or other situations where binary size is
+// important.
+// #define LLVM_LIBC_SCANF_DISABLE_FLOAT
+
+// This flag disables index mode, a posix extension often used for
+// internationalization of format strings. Supporting it takes up additional
+// memory and parsing time, so it can be disabled if it's not used.
+// #define LLVM_LIBC_SCANF_DISABLE_INDEX_MODE
+
+#endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_SCANF_CONFIG_H

diff  --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 1ab174290a1b7..515619e2aa822 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -188,4 +188,5 @@ add_libc_unittest(
 )
 
 add_subdirectory(printf_core)
+add_subdirectory(scanf_core)
 add_subdirectory(testdata)

diff  --git a/libc/test/src/stdio/printf_core/parser_test.cpp b/libc/test/src/stdio/printf_core/parser_test.cpp
index 643c32bd69c91..0684ebc8d444d 100644
--- a/libc/test/src/stdio/printf_core/parser_test.cpp
+++ b/libc/test/src/stdio/printf_core/parser_test.cpp
@@ -56,7 +56,7 @@ TEST(LlvmLibcPrintfParserTest, EvalRaw) {
 
   expected.raw_string = {str, 4};
 
-  ASSERT_FORMAT_EQ(expected, format_arr[0]);
+  ASSERT_PFORMAT_EQ(expected, format_arr[0]);
   // TODO: add checks that the format_arr after the last one has length 0
 }
 
@@ -70,20 +70,20 @@ TEST(LlvmLibcPrintfParserTest, EvalSimple) {
 
   expected0.raw_string = {str, 5};
 
-  ASSERT_FORMAT_EQ(expected0, format_arr[0]);
+  ASSERT_PFORMAT_EQ(expected0, format_arr[0]);
 
   expected1.has_conv = true;
 
   expected1.raw_string = {str + 5, 2};
   expected1.conv_name = '%';
 
-  ASSERT_FORMAT_EQ(expected1, format_arr[1]);
+  ASSERT_PFORMAT_EQ(expected1, format_arr[1]);
 
   expected2.has_conv = false;
 
   expected2.raw_string = {str + 7, 5};
 
-  ASSERT_FORMAT_EQ(expected2, format_arr[2]);
+  ASSERT_PFORMAT_EQ(expected2, format_arr[2]);
 }
 
 TEST(LlvmLibcPrintfParserTest, EvalOneArg) {
@@ -99,7 +99,7 @@ TEST(LlvmLibcPrintfParserTest, EvalOneArg) {
   expected.conv_val_raw = arg1;
   expected.conv_name = 'd';
 
-  ASSERT_FORMAT_EQ(expected, format_arr[0]);
+  ASSERT_PFORMAT_EQ(expected, format_arr[0]);
 }
 
 TEST(LlvmLibcPrintfParserTest, EvalOneArgWithFlags) {
@@ -121,7 +121,7 @@ TEST(LlvmLibcPrintfParserTest, EvalOneArgWithFlags) {
   expected.conv_val_raw = arg1;
   expected.conv_name = 'd';
 
-  ASSERT_FORMAT_EQ(expected, format_arr[0]);
+  ASSERT_PFORMAT_EQ(expected, format_arr[0]);
 }
 
 TEST(LlvmLibcPrintfParserTest, EvalOneArgWithWidth) {
@@ -138,7 +138,7 @@ TEST(LlvmLibcPrintfParserTest, EvalOneArgWithWidth) {
   expected.conv_val_raw = arg1;
   expected.conv_name = 'd';
 
-  ASSERT_FORMAT_EQ(expected, format_arr[0]);
+  ASSERT_PFORMAT_EQ(expected, format_arr[0]);
 }
 
 TEST(LlvmLibcPrintfParserTest, EvalOneArgWithPrecision) {
@@ -155,7 +155,7 @@ TEST(LlvmLibcPrintfParserTest, EvalOneArgWithPrecision) {
   expected.conv_val_raw = arg1;
   expected.conv_name = 'd';
 
-  ASSERT_FORMAT_EQ(expected, format_arr[0]);
+  ASSERT_PFORMAT_EQ(expected, format_arr[0]);
 }
 
 TEST(LlvmLibcPrintfParserTest, EvalOneArgWithTrivialPrecision) {
@@ -172,7 +172,7 @@ TEST(LlvmLibcPrintfParserTest, EvalOneArgWithTrivialPrecision) {
   expected.conv_val_raw = arg1;
   expected.conv_name = 'd';
 
-  ASSERT_FORMAT_EQ(expected, format_arr[0]);
+  ASSERT_PFORMAT_EQ(expected, format_arr[0]);
 }
 
 TEST(LlvmLibcPrintfParserTest, EvalOneArgWithShortLengthModifier) {
@@ -189,7 +189,7 @@ TEST(LlvmLibcPrintfParserTest, EvalOneArgWithShortLengthModifier) {
   expected.conv_val_raw = arg1;
   expected.conv_name = 'd';
 
-  ASSERT_FORMAT_EQ(expected, format_arr[0]);
+  ASSERT_PFORMAT_EQ(expected, format_arr[0]);
 }
 
 TEST(LlvmLibcPrintfParserTest, EvalOneArgWithLongLengthModifier) {
@@ -206,7 +206,7 @@ TEST(LlvmLibcPrintfParserTest, EvalOneArgWithLongLengthModifier) {
   expected.conv_val_raw = arg1;
   expected.conv_name = 'd';
 
-  ASSERT_FORMAT_EQ(expected, format_arr[0]);
+  ASSERT_PFORMAT_EQ(expected, format_arr[0]);
 }
 
 TEST(LlvmLibcPrintfParserTest, EvalOneArgWithAllOptions) {
@@ -229,7 +229,7 @@ TEST(LlvmLibcPrintfParserTest, EvalOneArgWithAllOptions) {
   expected.conv_val_raw = arg1;
   expected.conv_name = 'd';
 
-  ASSERT_FORMAT_EQ(expected, format_arr[0]);
+  ASSERT_PFORMAT_EQ(expected, format_arr[0]);
 }
 
 TEST(LlvmLibcPrintfParserTest, EvalThreeArgs) {
@@ -247,7 +247,7 @@ TEST(LlvmLibcPrintfParserTest, EvalThreeArgs) {
   expected0.conv_val_raw = arg1;
   expected0.conv_name = 'd';
 
-  ASSERT_FORMAT_EQ(expected0, format_arr[0]);
+  ASSERT_PFORMAT_EQ(expected0, format_arr[0]);
 
   expected1.has_conv = true;
 
@@ -255,7 +255,7 @@ TEST(LlvmLibcPrintfParserTest, EvalThreeArgs) {
   expected1.conv_val_raw = __llvm_libc::cpp::bit_cast<uint64_t>(arg2);
   expected1.conv_name = 'f';
 
-  ASSERT_FORMAT_EQ(expected1, format_arr[1]);
+  ASSERT_PFORMAT_EQ(expected1, format_arr[1]);
 
   expected2.has_conv = true;
 
@@ -263,7 +263,7 @@ TEST(LlvmLibcPrintfParserTest, EvalThreeArgs) {
   expected2.conv_val_ptr = const_cast<char *>(arg3);
   expected2.conv_name = 's';
 
-  ASSERT_FORMAT_EQ(expected2, format_arr[2]);
+  ASSERT_PFORMAT_EQ(expected2, format_arr[2]);
 }
 
 #ifndef LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
@@ -281,7 +281,7 @@ TEST(LlvmLibcPrintfParserTest, IndexModeOneArg) {
   expected.conv_val_raw = arg1;
   expected.conv_name = 'd';
 
-  ASSERT_FORMAT_EQ(expected, format_arr[0]);
+  ASSERT_PFORMAT_EQ(expected, format_arr[0]);
 }
 
 TEST(LlvmLibcPrintfParserTest, IndexModeThreeArgsSequential) {
@@ -299,7 +299,7 @@ TEST(LlvmLibcPrintfParserTest, IndexModeThreeArgsSequential) {
   expected0.conv_val_raw = arg1;
   expected0.conv_name = 'd';
 
-  ASSERT_FORMAT_EQ(expected0, format_arr[0]);
+  ASSERT_PFORMAT_EQ(expected0, format_arr[0]);
 
   expected1.has_conv = true;
 
@@ -307,7 +307,7 @@ TEST(LlvmLibcPrintfParserTest, IndexModeThreeArgsSequential) {
   expected1.conv_val_raw = __llvm_libc::cpp::bit_cast<uint64_t>(arg2);
   expected1.conv_name = 'f';
 
-  ASSERT_FORMAT_EQ(expected1, format_arr[1]);
+  ASSERT_PFORMAT_EQ(expected1, format_arr[1]);
 
   expected2.has_conv = true;
 
@@ -315,7 +315,7 @@ TEST(LlvmLibcPrintfParserTest, IndexModeThreeArgsSequential) {
   expected2.conv_val_ptr = const_cast<char *>(arg3);
   expected2.conv_name = 's';
 
-  ASSERT_FORMAT_EQ(expected2, format_arr[2]);
+  ASSERT_PFORMAT_EQ(expected2, format_arr[2]);
 }
 
 TEST(LlvmLibcPrintfParserTest, IndexModeThreeArgsReverse) {
@@ -333,7 +333,7 @@ TEST(LlvmLibcPrintfParserTest, IndexModeThreeArgsReverse) {
   expected0.conv_val_raw = arg1;
   expected0.conv_name = 'd';
 
-  ASSERT_FORMAT_EQ(expected0, format_arr[0]);
+  ASSERT_PFORMAT_EQ(expected0, format_arr[0]);
 
   expected1.has_conv = true;
 
@@ -341,7 +341,7 @@ TEST(LlvmLibcPrintfParserTest, IndexModeThreeArgsReverse) {
   expected1.conv_val_raw = __llvm_libc::cpp::bit_cast<uint64_t>(arg2);
   expected1.conv_name = 'f';
 
-  ASSERT_FORMAT_EQ(expected1, format_arr[1]);
+  ASSERT_PFORMAT_EQ(expected1, format_arr[1]);
 
   expected2.has_conv = true;
 
@@ -349,7 +349,7 @@ TEST(LlvmLibcPrintfParserTest, IndexModeThreeArgsReverse) {
   expected2.conv_val_ptr = const_cast<char *>(arg3);
   expected2.conv_name = 's';
 
-  ASSERT_FORMAT_EQ(expected2, format_arr[2]);
+  ASSERT_PFORMAT_EQ(expected2, format_arr[2]);
 }
 
 TEST(LlvmLibcPrintfParserTest, IndexModeTenArgsRandom) {
@@ -367,7 +367,7 @@ TEST(LlvmLibcPrintfParserTest, IndexModeTenArgsRandom) {
                            static_cast<size_t>(4 + (i >= 9 ? 1 : 0))};
     expected.conv_val_raw = i + 1;
     expected.conv_name = 'd';
-    EXPECT_FORMAT_EQ(expected, format_arr[i]);
+    EXPECT_PFORMAT_EQ(expected, format_arr[i]);
   }
 }
 
@@ -388,7 +388,7 @@ TEST(LlvmLibcPrintfParserTest, IndexModeComplexParsing) {
 
   expected0.raw_string = {str, 12};
 
-  EXPECT_FORMAT_EQ(expected0, format_arr[0]);
+  EXPECT_PFORMAT_EQ(expected0, format_arr[0]);
 
   expected1.has_conv = true;
 
@@ -397,26 +397,26 @@ TEST(LlvmLibcPrintfParserTest, IndexModeComplexParsing) {
   expected1.conv_val_raw = arg3;
   expected1.conv_name = 'u';
 
-  EXPECT_FORMAT_EQ(expected1, format_arr[1]);
+  EXPECT_PFORMAT_EQ(expected1, format_arr[1]);
 
   expected2.has_conv = false;
 
   expected2.raw_string = {str + 18, 1};
 
-  EXPECT_FORMAT_EQ(expected2, format_arr[2]);
+  EXPECT_PFORMAT_EQ(expected2, format_arr[2]);
 
   expected3.has_conv = true;
 
   expected3.raw_string = {str + 19, 2};
   expected3.conv_name = '%';
 
-  EXPECT_FORMAT_EQ(expected3, format_arr[3]);
+  EXPECT_PFORMAT_EQ(expected3, format_arr[3]);
 
   expected4.has_conv = false;
 
   expected4.raw_string = {str + 21, 1};
 
-  EXPECT_FORMAT_EQ(expected4, format_arr[4]);
+  EXPECT_PFORMAT_EQ(expected4, format_arr[4]);
 
   expected5.has_conv = true;
 
@@ -426,13 +426,13 @@ TEST(LlvmLibcPrintfParserTest, IndexModeComplexParsing) {
   expected5.conv_val_raw = __llvm_libc::cpp::bit_cast<uint64_t>(arg2);
   expected5.conv_name = 'f';
 
-  EXPECT_FORMAT_EQ(expected5, format_arr[5]);
+  EXPECT_PFORMAT_EQ(expected5, format_arr[5]);
 
   expected6.has_conv = false;
 
   expected6.raw_string = {str + 30, 1};
 
-  EXPECT_FORMAT_EQ(expected6, format_arr[6]);
+  EXPECT_PFORMAT_EQ(expected6, format_arr[6]);
 
   expected7.has_conv = true;
 
@@ -442,13 +442,13 @@ TEST(LlvmLibcPrintfParserTest, IndexModeComplexParsing) {
   expected7.conv_val_raw = __llvm_libc::cpp::bit_cast<uint64_t>(arg2);
   expected7.conv_name = 'f';
 
-  EXPECT_FORMAT_EQ(expected7, format_arr[7]);
+  EXPECT_PFORMAT_EQ(expected7, format_arr[7]);
 
   expected8.has_conv = false;
 
   expected8.raw_string = {str + 40, 1};
 
-  EXPECT_FORMAT_EQ(expected8, format_arr[8]);
+  EXPECT_PFORMAT_EQ(expected8, format_arr[8]);
 
   expected9.has_conv = true;
 
@@ -458,7 +458,7 @@ TEST(LlvmLibcPrintfParserTest, IndexModeComplexParsing) {
   expected9.conv_val_raw = arg1;
   expected9.conv_name = 'c';
 
-  EXPECT_FORMAT_EQ(expected9, format_arr[9]);
+  EXPECT_PFORMAT_EQ(expected9, format_arr[9]);
 }
 
 #endif // LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE

diff  --git a/libc/test/src/stdio/scanf_core/CMakeLists.txt b/libc/test/src/stdio/scanf_core/CMakeLists.txt
new file mode 100644
index 0000000000000..3235a0e53e010
--- /dev/null
+++ b/libc/test/src/stdio/scanf_core/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_libc_unittest( # Unit test target for the scanf format string parser.
+  parser_test
+  SUITE
+    libc_stdio_unittests
+  SRCS
+    parser_test.cpp
+  LINK_LIBRARIES
+    LibcScanfHelpers
+  DEPENDS
+    libc.src.stdio.scanf_core.parser
+    libc.src.stdio.scanf_core.core_structs
+    libc.src.__support.CPP.string_view
+    libc.src.__support.arg_list
+)

diff  --git a/libc/test/src/stdio/scanf_core/parser_test.cpp b/libc/test/src/stdio/scanf_core/parser_test.cpp
new file mode 100644
index 0000000000000..7321adb51ce1d
--- /dev/null
+++ b/libc/test/src/stdio/scanf_core/parser_test.cpp
@@ -0,0 +1,754 @@
+//===-- Unittests for the scanf Parser -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/bitset.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/arg_list.h"
+#include "src/stdio/scanf_core/parser.h"
+
+#include <stdarg.h>
+
+#include "utils/UnitTest/ScanfMatcher.h"
+#include "utils/UnitTest/Test.h"
+
+using __llvm_libc::cpp::string_view;
+
+void init(const char *__restrict str, ...) {
+  va_list vlist;
+  va_start(vlist, str);
+  __llvm_libc::internal::ArgList v(vlist);
+  va_end(vlist);
+
+  __llvm_libc::scanf_core::Parser parser(str, v);
+}
+
+void evaluate(__llvm_libc::scanf_core::FormatSection *format_arr,
+              const char *__restrict str, ...) {
+  va_list vlist;
+  va_start(vlist, str);
+  __llvm_libc::internal::ArgList v(vlist);
+  va_end(vlist);
+
+  __llvm_libc::scanf_core::Parser parser(str, v);
+
+  for (auto cur_section = parser.get_next_section();
+       !cur_section.raw_string.empty();
+       cur_section = parser.get_next_section()) {
+    *format_arr = cur_section;
+    ++format_arr;
+  }
+}
+
+TEST(LlvmLibcScanfParserTest, Constructor) { init("test", 1, 2); }
+
+TEST(LlvmLibcScanfParserTest, EvalRaw) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "test";
+  evaluate(format_arr, str);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = false;
+
+  expected.raw_string = str;
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+  // TODO: add checks that the format_arr after the last one has length 0
+}
+
+TEST(LlvmLibcScanfParserTest, EvalSimple) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "test %% test";
+  evaluate(format_arr, str);
+
+  __llvm_libc::scanf_core::FormatSection expected0, expected1, expected2;
+  expected0.has_conv = false;
+
+  expected0.raw_string = {str, 5};
+
+  ASSERT_SFORMAT_EQ(expected0, format_arr[0]);
+
+  expected1.has_conv = true;
+
+  expected1.raw_string = {str + 5, 2};
+  expected1.conv_name = '%';
+
+  ASSERT_SFORMAT_EQ(expected1, format_arr[1]);
+
+  expected2.has_conv = false;
+
+  expected2.raw_string = {str + 7, 5};
+
+  ASSERT_SFORMAT_EQ(expected2, format_arr[2]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalOneArg) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%d";
+  int arg1 = 12345;
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.output_ptr = &arg1;
+  expected.conv_name = 'd';
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalOneArgWithFlag) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%*d";
+  // Since NO_WRITE is set, the argument shouldn't be used, but I've included
+  // one anyways because in the case that it doesn't work it's better for it to
+  // have a real argument to check against.
+  int arg1 = 12345;
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.flags = __llvm_libc::scanf_core::FormatFlags::NO_WRITE;
+  expected.conv_name = 'd';
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+
+  // If NO_WRITE is set, then the equality check ignores the pointer since it's
+  // irrelevant, but in this case I want to make sure that it hasn't been set
+  // and check it separately.
+  ASSERT_EQ(expected.output_ptr, format_arr[0].output_ptr);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalOneArgWithWidth) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%12d";
+  int arg1 = 12345;
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.max_width = 12;
+  expected.output_ptr = &arg1;
+  expected.conv_name = 'd';
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalOneArgWithShortLengthModifier) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%hd";
+  int arg1 = 12345;
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.length_modifier = __llvm_libc::scanf_core::LengthModifier::h;
+  expected.output_ptr = &arg1;
+  expected.conv_name = 'd';
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalOneArgWithLongLengthModifier) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%lld";
+  long long arg1 = 12345;
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.length_modifier = __llvm_libc::scanf_core::LengthModifier::ll;
+  expected.output_ptr = &arg1;
+  expected.conv_name = 'd';
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalOneArgWithAllOptions) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%*56jd";
+  intmax_t arg1 = 12345;
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.flags = __llvm_libc::scanf_core::FormatFlags::NO_WRITE;
+  expected.max_width = 56;
+  expected.length_modifier = __llvm_libc::scanf_core::LengthModifier::j;
+  expected.conv_name = 'd';
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalSimpleBracketArg) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%[abc]";
+  char arg1 = 'a';
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.conv_name = '[';
+  expected.output_ptr = &arg1;
+
+  __llvm_libc::cpp::bitset<256> scan_set;
+
+  scan_set.set('a');
+  scan_set.set('b');
+  scan_set.set('c');
+
+  expected.scan_set = scan_set;
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalBracketArgRange) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%[A-D]";
+  char arg1 = 'a';
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.conv_name = '[';
+  expected.output_ptr = &arg1;
+
+  __llvm_libc::cpp::bitset<256> scan_set;
+
+  scan_set.set('A');
+  scan_set.set('B');
+  scan_set.set('C');
+  scan_set.set('D');
+
+  expected.scan_set = scan_set;
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalBracketArgTwoRanges) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%[A-De-g]";
+  char arg1 = 'a';
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.conv_name = '[';
+  expected.output_ptr = &arg1;
+
+  __llvm_libc::cpp::bitset<256> scan_set;
+
+  scan_set.set('A');
+  scan_set.set('B');
+  scan_set.set('C');
+  scan_set.set('D');
+  scan_set.set_range('e', 'g');
+
+  expected.scan_set = scan_set;
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalBracketArgJustHyphen) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%[-]";
+  char arg1 = 'a';
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.conv_name = '[';
+  expected.output_ptr = &arg1;
+
+  __llvm_libc::cpp::bitset<256> scan_set;
+
+  scan_set.set('-');
+
+  expected.scan_set = scan_set;
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalBracketArgLeftHyphen) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%[-A]";
+  char arg1 = 'a';
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.conv_name = '[';
+  expected.output_ptr = &arg1;
+
+  __llvm_libc::cpp::bitset<256> scan_set;
+
+  scan_set.set('-');
+  scan_set.set('A');
+
+  expected.scan_set = scan_set;
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalBracketArgRightHyphen) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%[Z-]";
+  char arg1 = 'a';
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.conv_name = '[';
+  expected.output_ptr = &arg1;
+
+  __llvm_libc::cpp::bitset<256> scan_set;
+
+  scan_set.set('-');
+  scan_set.set('Z');
+
+  expected.scan_set = scan_set;
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalBracketArgInvertSimple) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%[^abc]";
+  char arg1 = 'a';
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.conv_name = '[';
+  expected.output_ptr = &arg1;
+
+  __llvm_libc::cpp::bitset<256> scan_set;
+
+  scan_set.set('a');
+  scan_set.set('b');
+  scan_set.set('c');
+  scan_set.flip();
+
+  expected.scan_set = scan_set;
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalBracketArgInvertRange) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%[^0-9]";
+  char arg1 = 'a';
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.conv_name = '[';
+  expected.output_ptr = &arg1;
+
+  __llvm_libc::cpp::bitset<256> scan_set;
+
+  scan_set.set_range('0', '9');
+  scan_set.flip();
+
+  expected.scan_set = scan_set;
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalBracketArgRightBracket) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%[]]";
+  char arg1 = 'a';
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.conv_name = '[';
+  expected.output_ptr = &arg1;
+
+  __llvm_libc::cpp::bitset<256> scan_set;
+
+  scan_set.set(']');
+
+  expected.scan_set = scan_set;
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalBracketArgRightBracketRange) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%[]-a]";
+  char arg1 = 'a';
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.conv_name = '[';
+  expected.output_ptr = &arg1;
+
+  __llvm_libc::cpp::bitset<256> scan_set;
+
+  scan_set.set_range(']', 'a');
+
+  expected.scan_set = scan_set;
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalBracketArgRightBracketInvert) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%[^]]";
+  char arg1 = 'a';
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.conv_name = '[';
+  expected.output_ptr = &arg1;
+
+  __llvm_libc::cpp::bitset<256> scan_set;
+
+  scan_set.set(']');
+  scan_set.flip();
+
+  expected.scan_set = scan_set;
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalBracketArgRightBracketInvertRange) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%[^]-^]";
+  char arg1 = 'a';
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.conv_name = '[';
+  expected.output_ptr = &arg1;
+
+  __llvm_libc::cpp::bitset<256> scan_set;
+
+  scan_set.set_range(']', '^');
+  scan_set.flip();
+
+  expected.scan_set = scan_set;
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+// This is not part of the standard, but the hyphen's effect is always
+// implementation defined, and I have defined it such that it will capture the
+// correct range regardless of the order of the characters.
+TEST(LlvmLibcScanfParserTest, EvalBracketArgBackwardsRange) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%[9-0]";
+  char arg1 = 'a';
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = str;
+  expected.conv_name = '[';
+  expected.output_ptr = &arg1;
+
+  __llvm_libc::cpp::bitset<256> scan_set;
+
+  scan_set.set_range('0', '9');
+
+  expected.scan_set = scan_set;
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, EvalThreeArgs) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%d%f%s";
+  int arg1 = 12345;
+  double arg2 = 123.45;
+  const char *arg3 = "12345";
+  evaluate(format_arr, str, &arg1, &arg2, &arg3);
+
+  __llvm_libc::scanf_core::FormatSection expected0, expected1, expected2;
+  expected0.has_conv = true;
+
+  expected0.raw_string = {str, 2};
+  expected0.output_ptr = &arg1;
+  expected0.conv_name = 'd';
+
+  ASSERT_SFORMAT_EQ(expected0, format_arr[0]);
+
+  expected1.has_conv = true;
+
+  expected1.raw_string = {str + 2, 2};
+  expected1.output_ptr = &arg2;
+  expected1.conv_name = 'f';
+
+  ASSERT_SFORMAT_EQ(expected1, format_arr[1]);
+
+  expected2.has_conv = true;
+
+  expected2.raw_string = {str + 4, 2};
+  expected2.output_ptr = &arg3;
+  expected2.conv_name = 's';
+
+  ASSERT_SFORMAT_EQ(expected2, format_arr[2]);
+}
+
+#ifndef LLVM_LIBC_SCANF_DISABLE_INDEX_MODE
+
+TEST(LlvmLibcScanfParserTest, IndexModeOneArg) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%1$d";
+  int arg1 = 12345;
+  evaluate(format_arr, str, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected;
+  expected.has_conv = true;
+
+  expected.raw_string = {str, 4};
+  expected.output_ptr = &arg1;
+  expected.conv_name = 'd';
+
+  ASSERT_SFORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcScanfParserTest, IndexModeThreeArgsSequential) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%1$d%2$f%3$s";
+  int arg1 = 12345;
+  double arg2 = 123.45;
+  const char *arg3 = "12345";
+  evaluate(format_arr, str, &arg1, &arg2, &arg3);
+
+  __llvm_libc::scanf_core::FormatSection expected0, expected1, expected2;
+  expected0.has_conv = true;
+
+  expected0.raw_string = {str, 4};
+  expected0.output_ptr = &arg1;
+  expected0.conv_name = 'd';
+
+  ASSERT_SFORMAT_EQ(expected0, format_arr[0]);
+
+  expected1.has_conv = true;
+
+  expected1.raw_string = {str + 4, 4};
+  expected1.output_ptr = &arg2;
+  expected1.conv_name = 'f';
+
+  ASSERT_SFORMAT_EQ(expected1, format_arr[1]);
+
+  expected2.has_conv = true;
+
+  expected2.raw_string = {str + 8, 4};
+  expected2.output_ptr = &arg3;
+  expected2.conv_name = 's';
+
+  ASSERT_SFORMAT_EQ(expected2, format_arr[2]);
+}
+
+TEST(LlvmLibcScanfParserTest, IndexModeThreeArgsReverse) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%3$d%2$f%1$s";
+  int arg1 = 12345;
+  double arg2 = 123.45;
+  const char *arg3 = "12345";
+  evaluate(format_arr, str, &arg3, &arg2, &arg1);
+
+  __llvm_libc::scanf_core::FormatSection expected0, expected1, expected2;
+  expected0.has_conv = true;
+
+  expected0.raw_string = {str, 4};
+  expected0.output_ptr = &arg1;
+  expected0.conv_name = 'd';
+
+  ASSERT_SFORMAT_EQ(expected0, format_arr[0]);
+
+  expected1.has_conv = true;
+
+  expected1.raw_string = {str + 4, 4};
+  expected1.output_ptr = &arg2;
+  expected1.conv_name = 'f';
+
+  ASSERT_SFORMAT_EQ(expected1, format_arr[1]);
+
+  expected2.has_conv = true;
+
+  expected2.raw_string = {str + 8, 4};
+  expected2.output_ptr = &arg3;
+  expected2.conv_name = 's';
+
+  ASSERT_SFORMAT_EQ(expected2, format_arr[2]);
+}
+
+TEST(LlvmLibcScanfParserTest, IndexModeTenArgsRandom) {
+  __llvm_libc::scanf_core::FormatSection format_arr[10];
+  const char *str = "%6$d%3$d%7$d%2$d%8$d%1$d%4$d%9$d%5$d%10$d";
+  uintptr_t args[10] = {6, 4, 2, 7, 9, 1, 3, 5, 8, 10};
+  evaluate(format_arr, str, args[0], args[1], args[2], args[3], args[4],
+           args[5], args[6], args[7], args[8], args[9]);
+
+  for (size_t i = 0; i < 10; ++i) {
+    __llvm_libc::scanf_core::FormatSection expected;
+    expected.has_conv = true;
+
+    expected.raw_string = {str + (4 * i),
+                           static_cast<size_t>(4 + (i >= 9 ? 1 : 0))};
+    expected.output_ptr = reinterpret_cast<void *>(i + 1);
+    expected.conv_name = 'd';
+    EXPECT_SFORMAT_EQ(expected, format_arr[i]);
+  }
+}
+
+TEST(LlvmLibcScanfParserTest, IndexModeComplexParsing) {
+  __llvm_libc::scanf_core::FormatSection format_arr[11];
+  const char *str = "normal text %3$llu %% %2$*f %4$d %1$1c%5$[123]";
+  char arg1 = '1';
+  double arg2 = 123.45;
+  unsigned long long arg3 = 12345;
+  int arg4 = 10;
+  char arg5 = 'A';
+  evaluate(format_arr, str, &arg1, &arg2, &arg3, &arg4, &arg5);
+
+  __llvm_libc::scanf_core::FormatSection expected0, expected1, expected2,
+      expected3, expected4, expected5, expected6, expected7, expected8,
+      expected9, expected10;
+
+  expected0.has_conv = false;
+
+  // "normal text "
+  expected0.raw_string = {str, 12};
+
+  EXPECT_SFORMAT_EQ(expected0, format_arr[0]);
+
+  expected1.has_conv = true;
+
+  // "%3$llu"
+  expected1.raw_string = {str + 12, 6};
+  expected1.length_modifier = __llvm_libc::scanf_core::LengthModifier::ll;
+  expected1.output_ptr = &arg3;
+  expected1.conv_name = 'u';
+
+  EXPECT_SFORMAT_EQ(expected1, format_arr[1]);
+
+  expected2.has_conv = false;
+
+  // " "
+  expected2.raw_string = {str + 18, 1};
+
+  EXPECT_SFORMAT_EQ(expected2, format_arr[2]);
+
+  expected3.has_conv = true;
+
+  expected3.raw_string = {str + 19, 2}; // "%%"
+  expected3.conv_name = '%';
+
+  EXPECT_SFORMAT_EQ(expected3, format_arr[3]);
+
+  expected4.has_conv = false;
+
+  // " "
+  expected4.raw_string = {str + 21, 1};
+
+  EXPECT_SFORMAT_EQ(expected4, format_arr[4]);
+
+  expected5.has_conv = true;
+
+  // "%2$*f"
+  expected5.raw_string = {str + 22, 5};
+  expected5.flags = __llvm_libc::scanf_core::FormatFlags::NO_WRITE;
+  expected5.conv_name = 'f';
+
+  EXPECT_SFORMAT_EQ(expected5, format_arr[5]);
+
+  expected6.has_conv = false;
+
+  // " "
+  expected6.raw_string = {str + 27, 1};
+
+  EXPECT_SFORMAT_EQ(expected6, format_arr[6]);
+
+  expected7.has_conv = true;
+
+  // "%4$d"
+  expected7.raw_string = {str + 28, 4};
+  expected7.output_ptr = &arg4;
+  expected7.conv_name = 'd';
+
+  EXPECT_SFORMAT_EQ(expected7, format_arr[7]);
+
+  expected8.has_conv = false;
+
+  // " "
+  expected8.raw_string = {str + 32, 1};
+
+  EXPECT_SFORMAT_EQ(expected8, format_arr[8]);
+
+  expected9.has_conv = true;
+
+  // "%1$1c"
+  expected9.raw_string = {str + 33, 5};
+  expected9.max_width = 1;
+  expected9.output_ptr = &arg1;
+  expected9.conv_name = 'c';
+
+  EXPECT_SFORMAT_EQ(expected9, format_arr[9]);
+
+  expected10.has_conv = true;
+
+  // "%5$[123]"
+  expected10.raw_string = {str + 38, 8};
+  expected10.output_ptr = &arg5;
+  expected10.conv_name = '[';
+
+  __llvm_libc::cpp::bitset<256> scan_set;
+
+  scan_set.set_range('1', '3');
+
+  expected10.scan_set = scan_set;
+
+  EXPECT_SFORMAT_EQ(expected10, format_arr[10]);
+}
+
+#endif // LLVM_LIBC_SCANF_DISABLE_INDEX_MODE

diff  --git a/libc/utils/UnitTest/CMakeLists.txt b/libc/utils/UnitTest/CMakeLists.txt
index c4dffeaedd0ab..f2ec4e0ae6937 100644
--- a/libc/utils/UnitTest/CMakeLists.txt
+++ b/libc/utils/UnitTest/CMakeLists.txt
@@ -69,3 +69,18 @@ add_dependencies(
   libc.src.stdio.printf_core.core_structs
   libc.utils.UnitTest.string_utils
 )
+
+add_library(
+  LibcScanfHelpers
+    ScanfMatcher.h
+    ScanfMatcher.cpp
+)
+target_include_directories(LibcScanfHelpers PUBLIC ${LIBC_SOURCE_DIR})
+target_link_libraries(LibcScanfHelpers LibcUnitTest)
+add_dependencies(
+  LibcScanfHelpers
+  LibcUnitTest
+  libc.src.__support.FPUtil.fp_bits
+  libc.src.stdio.scanf_core.core_structs
+  libc.utils.UnitTest.string_utils
+)

diff  --git a/libc/utils/UnitTest/PrintfMatcher.h b/libc/utils/UnitTest/PrintfMatcher.h
index 004b9721f1d36..81523c0751ef6 100644
--- a/libc/utils/UnitTest/PrintfMatcher.h
+++ b/libc/utils/UnitTest/PrintfMatcher.h
@@ -35,11 +35,11 @@ class FormatSectionMatcher
 } // namespace printf_core
 } // namespace __llvm_libc
 
-#define EXPECT_FORMAT_EQ(expected, actual)                                     \
+#define EXPECT_PFORMAT_EQ(expected, actual)                                    \
   EXPECT_THAT(actual, __llvm_libc::printf_core::testing::FormatSectionMatcher( \
                           expected))
 
-#define ASSERT_FORMAT_EQ(expected, actual)                                     \
+#define ASSERT_PFORMAT_EQ(expected, actual)                                    \
   ASSERT_THAT(actual, __llvm_libc::printf_core::testing::FormatSectionMatcher( \
                           expected))
 

diff  --git a/libc/utils/UnitTest/ScanfMatcher.cpp b/libc/utils/UnitTest/ScanfMatcher.cpp
new file mode 100644
index 0000000000000..bd9e6ac7907d4
--- /dev/null
+++ b/libc/utils/UnitTest/ScanfMatcher.cpp
@@ -0,0 +1,99 @@
+//===-- ScanfMatcher.cpp ----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ScanfMatcher.h"
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/stdio/scanf_core/core_structs.h"
+
+#include "utils/UnitTest/StringUtils.h"
+
+#include <stdint.h>
+
+namespace __llvm_libc {
+namespace scanf_core {
+namespace testing {
+
+bool FormatSectionMatcher::match(FormatSection actualValue) {
+  actual = actualValue;
+  return expected == actual;
+}
+
+namespace {
+
+#define IF_FLAG_SHOW_FLAG(flag_name)                                           \
+  do {                                                                         \
+    if ((form.flags & FormatFlags::flag_name) == FormatFlags::flag_name)       \
+      stream << "\n\t\t" << #flag_name;                                        \
+  } while (false)
+#define CASE_LM(lm)                                                            \
+  case (LengthModifier::lm):                                                   \
+    stream << #lm;                                                             \
+    break
+
+void display(testutils::StreamWrapper &stream, FormatSection form) {
+  stream << "Raw String (len " << form.raw_string.size() << "): \"";
+  for (size_t i = 0; i < form.raw_string.size(); ++i) {
+    stream << form.raw_string[i];
+  }
+  stream << "\"";
+  if (form.has_conv) {
+    stream << "\n\tHas Conv\n\tFlags:";
+    IF_FLAG_SHOW_FLAG(NO_WRITE);
+    IF_FLAG_SHOW_FLAG(ALLOCATE);
+    stream << "\n";
+    stream << "\tmax width: " << form.max_width << "\n";
+    stream << "\tlength modifier: ";
+    switch (form.length_modifier) {
+      CASE_LM(NONE);
+      CASE_LM(l);
+      CASE_LM(ll);
+      CASE_LM(h);
+      CASE_LM(hh);
+      CASE_LM(j);
+      CASE_LM(z);
+      CASE_LM(t);
+      CASE_LM(L);
+    }
+    stream << "\n";
+    // If the pointer is used (NO_WRITE is not set and the conversion isn't %).
+    if (((form.flags & FormatFlags::NO_WRITE) == 0) &&
+        (form.conv_name != '%')) {
+      stream << "\tpointer value: "
+             << int_to_hex<uintptr_t>(
+                    reinterpret_cast<uintptr_t>(form.output_ptr))
+             << "\n";
+    }
+
+    stream << "\tconversion name: " << form.conv_name << "\n";
+
+    if (form.conv_name == '[') {
+      stream << "\t\t";
+      for (size_t i = 0; i < 256 /* char max */; ++i) {
+        if (form.scan_set.test(i)) {
+          stream << static_cast<char>(i);
+        }
+      }
+      stream << "\n\t]\n";
+    }
+  }
+}
+} // anonymous namespace
+
+void FormatSectionMatcher::explainError(testutils::StreamWrapper &stream) {
+  stream << "expected format section: ";
+  display(stream, expected);
+  stream << '\n';
+  stream << "actual format section  : ";
+  display(stream, actual);
+  stream << '\n';
+}
+
+} // namespace testing
+} // namespace scanf_core
+} // namespace __llvm_libc

diff  --git a/libc/utils/UnitTest/ScanfMatcher.h b/libc/utils/UnitTest/ScanfMatcher.h
new file mode 100644
index 0000000000000..b5079fe6b230e
--- /dev/null
+++ b/libc/utils/UnitTest/ScanfMatcher.h
@@ -0,0 +1,46 @@
+//===-- ScanfMatcher.h ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_UNITTEST_SCANF_MATCHER_H
+#define LLVM_LIBC_UTILS_UNITTEST_SCANF_MATCHER_H
+
+#include "src/stdio/scanf_core/core_structs.h"
+#include "utils/UnitTest/Test.h"
+
+#include <errno.h>
+
+namespace __llvm_libc {
+namespace scanf_core {
+namespace testing {
+
+class FormatSectionMatcher
+    : public __llvm_libc::testing::Matcher<FormatSection> {
+  FormatSection expected;
+  FormatSection actual;
+
+public:
+  FormatSectionMatcher(FormatSection expectedValue) : expected(expectedValue) {}
+
+  bool match(FormatSection actualValue);
+
+  void explainError(testutils::StreamWrapper &stream) override;
+};
+
+} // namespace testing
+} // namespace scanf_core
+} // namespace __llvm_libc
+
+#define EXPECT_SFORMAT_EQ(expected, actual)                                    \
+  EXPECT_THAT(actual, __llvm_libc::scanf_core::testing::FormatSectionMatcher(  \
+                          expected))
+
+#define ASSERT_SFORMAT_EQ(expected, actual)                                    \
+  ASSERT_THAT(actual, __llvm_libc::scanf_core::testing::FormatSectionMatcher(  \
+                          expected))
+
+#endif // LLVM_LIBC_UTILS_UNITTEST_SCANF_MATCHER_H


        


More information about the libc-commits mailing list