[libc-commits] [libc] [libc] Template the printf / scanf parser class (PR #66277)

Joseph Huber via libc-commits libc-commits at lists.llvm.org
Thu Sep 14 11:09:11 PDT 2023


https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/66277:

>From ddcf5d891f3fe0702be01a81d0e7e7c3461b2e97 Mon Sep 17 00:00:00 2001
From: Joseph Huber <jhuber6 at vols.utk.edu>
Date: Wed, 13 Sep 2023 14:08:23 -0500
Subject: [PATCH 1/2] [libc] Template the printf / scanf parser class

Summary:
The parser class for stdio currently accepts different argument
providers. In-tree this is only used for a fuzzer test; however, the
proposed implementation of the GPU handling of printf / scanf will
require custom argument handlers. This makes the current approach of
using a preprocessor macro messier. This patch proposes folding this
logic into a template instantiation. The downside to this is that
because the implementation of the parser class is placed into an
implementation file we need to manually instantiate the needed templates
which will slightly bloat binary size. Alternatively we could remove the
implementation file, or key off of the `libc` external packaging macro
so it is not present in the installed version.
---
 libc/fuzzing/stdio/CMakeLists.txt             |   4 +-
 libc/fuzzing/stdio/printf_parser_fuzz.cpp     |   7 +-
 libc/src/stdio/printf_core/CMakeLists.txt     |  24 +-
 libc/src/stdio/printf_core/parser.cpp         | 466 ------------------
 libc/src/stdio/printf_core/parser.h           | 453 ++++++++++++++++-
 libc/src/stdio/printf_core/printf_main.cpp    |   2 +-
 libc/src/stdio/scanf_core/CMakeLists.txt      |   4 +-
 libc/src/stdio/scanf_core/parser.cpp          | 225 ---------
 libc/src/stdio/scanf_core/parser.h            | 213 +++++++-
 libc/src/stdio/scanf_core/scanf_main.cpp      |   2 +-
 .../src/stdio/printf_core/parser_test.cpp     |   9 +-
 .../test/src/stdio/scanf_core/parser_test.cpp |   7 +-
 12 files changed, 671 insertions(+), 745 deletions(-)
 delete mode 100644 libc/src/stdio/printf_core/parser.cpp
 delete mode 100644 libc/src/stdio/scanf_core/parser.cpp

diff --git a/libc/fuzzing/stdio/CMakeLists.txt b/libc/fuzzing/stdio/CMakeLists.txt
index bd7e38bc1401e56..22de67d42747fa9 100644
--- a/libc/fuzzing/stdio/CMakeLists.txt
+++ b/libc/fuzzing/stdio/CMakeLists.txt
@@ -3,9 +3,7 @@ add_libc_fuzzer(
   SRCS
     printf_parser_fuzz.cpp
   DEPENDS
-    libc.src.stdio.printf_core.mock_parser
-  COMPILE_OPTIONS
-    -DLIBC_COPT_MOCK_ARG_LIST
+    libc.src.stdio.printf_core.parser
 )
 
 add_libc_fuzzer(
diff --git a/libc/fuzzing/stdio/printf_parser_fuzz.cpp b/libc/fuzzing/stdio/printf_parser_fuzz.cpp
index 05cd616ca48b0e4..86f8c1e0a55f818 100644
--- a/libc/fuzzing/stdio/printf_parser_fuzz.cpp
+++ b/libc/fuzzing/stdio/printf_parser_fuzz.cpp
@@ -10,10 +10,6 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LIBC_COPT_MOCK_ARG_LIST
-#error The printf Parser Fuzzer must be compiled with LIBC_COPT_MOCK_ARG_LIST, and the parser itself must also be compiled with that option when it's linked against the fuzzer.
-#endif
-
 #include "src/__support/arg_list.h"
 #include "src/stdio/printf_core/parser.h"
 
@@ -37,7 +33,8 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
 
   auto mock_arg_list = internal::MockArgList();
 
-  auto parser = printf_core::Parser(in_str, mock_arg_list);
+  auto parser =
+      printf_core::Parser<internal::MockArgList>(in_str, mock_arg_list);
 
   int str_percent_count = 0;
 
diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt
index 7087d28ede66e8b..c1a4b8cda85c90d 100644
--- a/libc/src/stdio/printf_core/CMakeLists.txt
+++ b/libc/src/stdio/printf_core/CMakeLists.txt
@@ -8,28 +8,8 @@ add_header_library(
     libc.src.__support.FPUtil.fp_bits
 )
 
-add_object_library(
+add_header_library(
   parser
-  SRCS
-    parser.cpp
-  HDRS
-    parser.h
-  DEPENDS
-    .core_structs
-    libc.src.__support.arg_list
-    libc.src.__support.ctype_utils
-    libc.src.__support.str_to_integer
-    libc.src.__support.CPP.bit
-    libc.src.__support.CPP.optional
-    libc.src.__support.CPP.string_view
-    libc.src.__support.CPP.type_traits
-    libc.src.__support.common
-)
-
-add_object_library(
-  mock_parser
-  SRCS
-    parser.cpp
   HDRS
     parser.h
   DEPENDS
@@ -42,8 +22,6 @@ add_object_library(
     libc.src.__support.CPP.string_view
     libc.src.__support.CPP.type_traits
     libc.src.__support.common
-  COMPILE_OPTIONS
-    -DLIBC_COPT_MOCK_ARG_LIST
 )
 
 add_object_library(
diff --git a/libc/src/stdio/printf_core/parser.cpp b/libc/src/stdio/printf_core/parser.cpp
deleted file mode 100644
index 6b2c174c3f23329..000000000000000
--- a/libc/src/stdio/printf_core/parser.cpp
+++ /dev/null
@@ -1,466 +0,0 @@
-//===-- Format string parser implementation for printf ----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// #define LIBC_COPT_PRINTF_DISABLE_INDEX_MODE 1 // This will be a compile flag.
-
-#include "parser.h"
-
-#include "src/__support/arg_list.h"
-
-#include "src/__support/CPP/bit.h"
-#include "src/__support/CPP/optional.h"
-#include "src/__support/CPP/string_view.h"
-#include "src/__support/CPP/type_traits.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/ctype_utils.h"
-#include "src/__support/str_to_integer.h"
-#include "src/stdio/printf_core/core_structs.h"
-
-namespace __llvm_libc {
-namespace printf_core {
-
-template <typename T> struct int_type_of {
-  using type = T;
-};
-template <> struct int_type_of<double> {
-  using type = fputil::FPBits<double>::UIntType;
-};
-template <> struct int_type_of<long double> {
-  using type = fputil::FPBits<long double>::UIntType;
-};
-template <typename T> using int_type_of_v = typename int_type_of<T>::type;
-
-#ifndef LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
-#define WRITE_ARG_VAL_SIMPLEST(dst, arg_type, index)                           \
-  {                                                                            \
-    auto temp = get_arg_value<arg_type>(index);                                \
-    if (!temp.has_value()) {                                                   \
-      section.has_conv = false;                                                \
-    } else {                                                                   \
-      dst = cpp::bit_cast<int_type_of_v<arg_type>>(temp.value());              \
-    }                                                                          \
-  }
-#else
-#define WRITE_ARG_VAL_SIMPLEST(dst, arg_type, _)                               \
-  dst = cpp::bit_cast<int_type_of_v<arg_type>>(get_next_arg_value<arg_type>())
-#endif // LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
-
-FormatSection Parser::get_next_section() {
-  FormatSection section;
-  size_t starting_pos = cur_pos;
-  if (str[cur_pos] == '%') {
-    // format section
-    section.has_conv = true;
-
-    ++cur_pos;
-    [[maybe_unused]] size_t conv_index = 0;
-
-#ifndef LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
-    conv_index = parse_index(&cur_pos);
-#endif // LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
-
-    section.flags = parse_flags(&cur_pos);
-
-    // handle width
-    section.min_width = 0;
-    if (str[cur_pos] == '*') {
-      ++cur_pos;
-
-      WRITE_ARG_VAL_SIMPLEST(section.min_width, int, parse_index(&cur_pos));
-    } else if (internal::isdigit(str[cur_pos])) {
-      auto result = internal::strtointeger<int>(str + cur_pos, 10);
-      section.min_width = result.value;
-      cur_pos = cur_pos + result.parsed_len;
-    }
-    if (section.min_width < 0) {
-      section.min_width = -section.min_width;
-      section.flags =
-          static_cast<FormatFlags>(section.flags | FormatFlags::LEFT_JUSTIFIED);
-    }
-
-    // handle precision
-    section.precision = -1; // negative precisions are ignored.
-    if (str[cur_pos] == '.') {
-      ++cur_pos;
-      section.precision = 0; // if there's a . but no specified precision, the
-                             // precision is implicitly 0.
-      if (str[cur_pos] == '*') {
-        ++cur_pos;
-
-        WRITE_ARG_VAL_SIMPLEST(section.precision, int, parse_index(&cur_pos));
-
-      } else if (internal::isdigit(str[cur_pos])) {
-        auto result = internal::strtointeger<int>(str + cur_pos, 10);
-        section.precision = result.value;
-        cur_pos = cur_pos + result.parsed_len;
-      }
-    }
-
-    LengthModifier lm = parse_length_modifier(&cur_pos);
-
-    section.length_modifier = lm;
-    section.conv_name = str[cur_pos];
-    switch (str[cur_pos]) {
-    case ('%'):
-      // Regardless of options, a % conversion is always safe. The standard says
-      // that "The complete conversion specification shall be %%" but it also
-      // says that "If a conversion specification is invalid, the behavior is
-      // undefined." Based on that we define that any conversion specification
-      // ending in '%' shall display as '%' regardless of any valid or invalid
-      // options.
-      section.has_conv = true;
-      break;
-    case ('c'):
-      WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, int, conv_index);
-      break;
-    case ('d'):
-    case ('i'):
-    case ('o'):
-    case ('x'):
-    case ('X'):
-    case ('u'):
-      switch (lm) {
-      case (LengthModifier::hh):
-      case (LengthModifier::h):
-      case (LengthModifier::none):
-        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, int, conv_index);
-        break;
-      case (LengthModifier::l):
-        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, long, conv_index);
-        break;
-      case (LengthModifier::ll):
-      case (LengthModifier::L): // This isn't in the standard, but is in other
-                                // libc implementations.
-
-        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, long long, conv_index);
-        break;
-      case (LengthModifier::j):
-
-        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, intmax_t, conv_index);
-        break;
-      case (LengthModifier::z):
-
-        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, size_t, conv_index);
-        break;
-      case (LengthModifier::t):
-
-        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, ptrdiff_t, conv_index);
-        break;
-      }
-      break;
-#ifndef LIBC_COPT_PRINTF_DISABLE_FLOAT
-    case ('f'):
-    case ('F'):
-    case ('e'):
-    case ('E'):
-    case ('a'):
-    case ('A'):
-    case ('g'):
-    case ('G'):
-      if (lm != LengthModifier::L) {
-        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, double, conv_index);
-      } else {
-        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, long double, conv_index);
-      }
-      break;
-#endif // LIBC_COPT_PRINTF_DISABLE_FLOAT
-#ifndef LIBC_COPT_PRINTF_DISABLE_WRITE_INT
-    case ('n'):
-#endif // LIBC_COPT_PRINTF_DISABLE_WRITE_INT
-    case ('p'):
-    case ('s'):
-      WRITE_ARG_VAL_SIMPLEST(section.conv_val_ptr, void *, conv_index);
-      break;
-    default:
-      // if the conversion is undefined, change this to a raw section.
-      section.has_conv = false;
-      break;
-    }
-    // If the end of the format section is on the '\0'. This means we need to
-    // not advance the cur_pos.
-    if (str[cur_pos] != '\0')
-      ++cur_pos;
-
-  } else {
-    // raw section
-    section.has_conv = false;
-    while (str[cur_pos] != '%' && str[cur_pos] != '\0')
-      ++cur_pos;
-  }
-  section.raw_string = {str + starting_pos, cur_pos - starting_pos};
-  return section;
-}
-
-FormatFlags Parser::parse_flags(size_t *local_pos) {
-  bool found_flag = true;
-  FormatFlags flags = FormatFlags(0);
-  while (found_flag) {
-    switch (str[*local_pos]) {
-    case '-':
-      flags = static_cast<FormatFlags>(flags | FormatFlags::LEFT_JUSTIFIED);
-      break;
-    case '+':
-      flags = static_cast<FormatFlags>(flags | FormatFlags::FORCE_SIGN);
-      break;
-    case ' ':
-      flags = static_cast<FormatFlags>(flags | FormatFlags::SPACE_PREFIX);
-      break;
-    case '#':
-      flags = static_cast<FormatFlags>(flags | FormatFlags::ALTERNATE_FORM);
-      break;
-    case '0':
-      flags = static_cast<FormatFlags>(flags | FormatFlags::LEADING_ZEROES);
-      break;
-    default:
-      found_flag = false;
-    }
-    if (found_flag)
-      ++*local_pos;
-  }
-  return flags;
-}
-
-LengthModifier Parser::parse_length_modifier(size_t *local_pos) {
-  switch (str[*local_pos]) {
-  case ('l'):
-    if (str[*local_pos + 1] == 'l') {
-      *local_pos += 2;
-      return LengthModifier::ll;
-    } else {
-      ++*local_pos;
-      return LengthModifier::l;
-    }
-  case ('h'):
-    if (str[*local_pos + 1] == 'h') {
-      *local_pos += 2;
-      return LengthModifier::hh;
-    } else {
-      ++*local_pos;
-      return LengthModifier::h;
-    }
-  case ('L'):
-    ++*local_pos;
-    return LengthModifier::L;
-  case ('j'):
-    ++*local_pos;
-    return LengthModifier::j;
-  case ('z'):
-    ++*local_pos;
-    return LengthModifier::z;
-  case ('t'):
-    ++*local_pos;
-    return LengthModifier::t;
-  default:
-    return LengthModifier::none;
-  }
-}
-
-//----------------------------------------------------
-// INDEX MODE ONLY FUNCTIONS AFTER HERE:
-//----------------------------------------------------
-
-#ifndef LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
-
-size_t Parser::parse_index(size_t *local_pos) {
-  if (internal::isdigit(str[*local_pos])) {
-    auto result = internal::strtointeger<int>(str + *local_pos, 10);
-    size_t index = result.value;
-    if (str[*local_pos + result.parsed_len] != '$')
-      return 0;
-    *local_pos = 1 + result.parsed_len + *local_pos;
-    return index;
-  }
-  return 0;
-}
-
-TypeDesc Parser::get_type_desc(size_t index) {
-  // index mode is assumed, and the indicies start at 1, so an index
-  // of 0 is invalid.
-  size_t local_pos = 0;
-
-  while (str[local_pos]) {
-    if (str[local_pos] == '%') {
-      ++local_pos;
-
-      size_t conv_index = parse_index(&local_pos);
-
-      // the flags aren't relevant for this situation, but I need to skip past
-      // them so they're parsed but the result is discarded.
-      parse_flags(&local_pos);
-
-      // handle width
-      if (str[local_pos] == '*') {
-        ++local_pos;
-
-        size_t width_index = parse_index(&local_pos);
-        set_type_desc(width_index, type_desc_from_type<int>());
-        if (width_index == index)
-          return type_desc_from_type<int>();
-
-      } else if (internal::isdigit(str[local_pos])) {
-        while (internal::isdigit(str[local_pos]))
-          ++local_pos;
-      }
-
-      // handle precision
-      if (str[local_pos] == '.') {
-        ++local_pos;
-        if (str[local_pos] == '*') {
-          ++local_pos;
-
-          size_t precision_index = parse_index(&local_pos);
-          set_type_desc(precision_index, type_desc_from_type<int>());
-          if (precision_index == index)
-            return type_desc_from_type<int>();
-
-        } else if (internal::isdigit(str[local_pos])) {
-          while (internal::isdigit(str[local_pos]))
-            ++local_pos;
-        }
-      }
-
-      LengthModifier lm = parse_length_modifier(&local_pos);
-
-      // if we don't have an index for this conversion, then its position is
-      // unknown and all this information is irrelevant. The rest of this logic
-      // has been for skipping past this conversion properly to avoid
-      // weirdness with %%.
-      if (conv_index == 0) {
-        if (str[local_pos] != '\0')
-          ++local_pos;
-        continue;
-      }
-
-      TypeDesc conv_size = type_desc_from_type<void>();
-      switch (str[local_pos]) {
-      case ('%'):
-        conv_size = type_desc_from_type<void>();
-        break;
-      case ('c'):
-        conv_size = type_desc_from_type<int>();
-        break;
-      case ('d'):
-      case ('i'):
-      case ('o'):
-      case ('x'):
-      case ('X'):
-      case ('u'):
-        switch (lm) {
-        case (LengthModifier::hh):
-        case (LengthModifier::h):
-        case (LengthModifier::none):
-          conv_size = type_desc_from_type<int>();
-          break;
-        case (LengthModifier::l):
-          conv_size = type_desc_from_type<long>();
-          break;
-        case (LengthModifier::ll):
-        case (LengthModifier::L): // This isn't in the standard, but is in other
-                                  // libc implementations.
-          conv_size = type_desc_from_type<long long>();
-          break;
-        case (LengthModifier::j):
-          conv_size = type_desc_from_type<intmax_t>();
-          break;
-        case (LengthModifier::z):
-          conv_size = type_desc_from_type<size_t>();
-          break;
-        case (LengthModifier::t):
-          conv_size = type_desc_from_type<ptrdiff_t>();
-          break;
-        }
-        break;
-#ifndef LIBC_COPT_PRINTF_DISABLE_FLOAT
-      case ('f'):
-      case ('F'):
-      case ('e'):
-      case ('E'):
-      case ('a'):
-      case ('A'):
-      case ('g'):
-      case ('G'):
-        if (lm != LengthModifier::L)
-          conv_size = type_desc_from_type<double>();
-        else
-          conv_size = type_desc_from_type<long double>();
-        break;
-#endif // LIBC_COPT_PRINTF_DISABLE_FLOAT
-#ifndef LIBC_COPT_PRINTF_DISABLE_WRITE_INT
-      case ('n'):
-#endif // LIBC_COPT_PRINTF_DISABLE_WRITE_INT
-      case ('p'):
-      case ('s'):
-        conv_size = type_desc_from_type<void *>();
-        break;
-      default:
-        conv_size = type_desc_from_type<int>();
-        break;
-      }
-
-      set_type_desc(conv_index, conv_size);
-      if (conv_index == index)
-        return conv_size;
-    }
-    // If the end of the format section is on the '\0'. This means we need to
-    // not advance the local_pos.
-    if (str[local_pos] != '\0')
-      ++local_pos;
-  }
-
-  // If there is no size for the requested index, then it's unknown. Return
-  // void.
-  return type_desc_from_type<void>();
-}
-
-bool Parser::args_to_index(size_t index) {
-  if (args_index > index) {
-    args_index = 1;
-    args_cur = args_start;
-  }
-
-  while (args_index < index) {
-    TypeDesc cur_type_desc = type_desc_from_type<void>();
-    if (args_index <= DESC_ARR_LEN)
-      cur_type_desc = desc_arr[args_index - 1];
-
-    if (cur_type_desc == type_desc_from_type<void>())
-      cur_type_desc = get_type_desc(args_index);
-
-    // A type of void represents the type being unknown. If the type for the
-    // requested index isn't in the desc_arr and isn't found by parsing the
-    // string, then then advancing to the requested index is impossible. In that
-    // case the function returns false.
-    if (cur_type_desc == type_desc_from_type<void>())
-      return false;
-
-    if (cur_type_desc == type_desc_from_type<uint32_t>())
-      args_cur.next_var<uint32_t>();
-    else if (cur_type_desc == type_desc_from_type<uint64_t>())
-      args_cur.next_var<uint64_t>();
-#ifndef LIBC_COPT_PRINTF_DISABLE_FLOAT
-    // Floating point numbers are stored separately from the other arguments.
-    else if (cur_type_desc == type_desc_from_type<double>())
-      args_cur.next_var<double>();
-    else if (cur_type_desc == type_desc_from_type<long double>())
-      args_cur.next_var<long double>();
-#endif // LIBC_COPT_PRINTF_DISABLE_FLOAT
-    // pointers may be stored separately from normal values.
-    else if (cur_type_desc == type_desc_from_type<void *>())
-      args_cur.next_var<void *>();
-    else
-      args_cur.next_var<uint32_t>();
-
-    ++args_index;
-  }
-  return true;
-}
-
-#endif // LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
-
-} // namespace printf_core
-} // namespace __llvm_libc
diff --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h
index a376af99ad8d7f7..5507f5e75821f11 100644
--- a/libc/src/stdio/printf_core/parser.h
+++ b/libc/src/stdio/printf_core/parser.h
@@ -13,6 +13,7 @@
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/arg_list.h"
 #include "src/__support/common.h"
+#include "src/__support/str_to_integer.h"
 #include "src/stdio/printf_core/core_structs.h"
 #include "src/stdio/printf_core/printf_config.h"
 
@@ -21,13 +22,18 @@
 namespace __llvm_libc {
 namespace printf_core {
 
-#ifndef LIBC_COPT_MOCK_ARG_LIST
-using ArgProvider = internal::ArgList;
-#else  // not defined LIBC_COPT_MOCK_ARG_LIST
-using ArgProvider = internal::MockArgList;
-#endif // LIBC_COPT_MOCK_ARG_LIST
+template <typename T> struct int_type_of {
+  using type = T;
+};
+template <> struct int_type_of<double> {
+  using type = fputil::FPBits<double>::UIntType;
+};
+template <> struct int_type_of<long double> {
+  using type = fputil::FPBits<long double>::UIntType;
+};
+template <typename T> using int_type_of_v = typename int_type_of<T>::type;
 
-class Parser {
+template <typename ArgProvider> class Parser {
   const char *__restrict str;
 
   size_t cur_pos = 0;
@@ -84,7 +90,7 @@ class Parser {
 
   // get_next_arg_value gets the next value from the arg list as type T.
   template <class T> LIBC_INLINE T get_next_arg_value() {
-    return args_cur.next_var<T>();
+    return args_cur.template next_var<T>();
   }
 
   //----------------------------------------------------
@@ -142,6 +148,439 @@ class Parser {
 #endif // LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
 };
 
+#ifndef LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
+#define WRITE_ARG_VAL_SIMPLEST(dst, arg_type, index)                           \
+  {                                                                            \
+    auto temp = get_arg_value<arg_type>(index);                                \
+    if (!temp.has_value()) {                                                   \
+      section.has_conv = false;                                                \
+    } else {                                                                   \
+      dst = cpp::bit_cast<int_type_of_v<arg_type>>(temp.value());              \
+    }                                                                          \
+  }
+#else
+#define WRITE_ARG_VAL_SIMPLEST(dst, arg_type, _)                               \
+  dst = cpp::bit_cast<int_type_of_v<arg_type>>(get_next_arg_value<arg_type>())
+#endif // LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
+
+template <typename ArgProvider>
+FormatSection Parser<ArgProvider>::get_next_section() {
+  FormatSection section;
+  size_t starting_pos = cur_pos;
+  if (str[cur_pos] == '%') {
+    // format section
+    section.has_conv = true;
+
+    ++cur_pos;
+    [[maybe_unused]] size_t conv_index = 0;
+
+#ifndef LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
+    conv_index = parse_index(&cur_pos);
+#endif // LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
+
+    section.flags = parse_flags(&cur_pos);
+
+    // handle width
+    section.min_width = 0;
+    if (str[cur_pos] == '*') {
+      ++cur_pos;
+
+      WRITE_ARG_VAL_SIMPLEST(section.min_width, int, parse_index(&cur_pos));
+    } else if (internal::isdigit(str[cur_pos])) {
+      auto result = internal::strtointeger<int>(str + cur_pos, 10);
+      section.min_width = result.value;
+      cur_pos = cur_pos + result.parsed_len;
+    }
+    if (section.min_width < 0) {
+      section.min_width = -section.min_width;
+      section.flags =
+          static_cast<FormatFlags>(section.flags | FormatFlags::LEFT_JUSTIFIED);
+    }
+
+    // handle precision
+    section.precision = -1; // negative precisions are ignored.
+    if (str[cur_pos] == '.') {
+      ++cur_pos;
+      section.precision = 0; // if there's a . but no specified precision, the
+                             // precision is implicitly 0.
+      if (str[cur_pos] == '*') {
+        ++cur_pos;
+
+        WRITE_ARG_VAL_SIMPLEST(section.precision, int, parse_index(&cur_pos));
+
+      } else if (internal::isdigit(str[cur_pos])) {
+        auto result = internal::strtointeger<int>(str + cur_pos, 10);
+        section.precision = result.value;
+        cur_pos = cur_pos + result.parsed_len;
+      }
+    }
+
+    LengthModifier lm = parse_length_modifier(&cur_pos);
+
+    section.length_modifier = lm;
+    section.conv_name = str[cur_pos];
+    switch (str[cur_pos]) {
+    case ('%'):
+      // Regardless of options, a % conversion is always safe. The standard says
+      // that "The complete conversion specification shall be %%" but it also
+      // says that "If a conversion specification is invalid, the behavior is
+      // undefined." Based on that we define that any conversion specification
+      // ending in '%' shall display as '%' regardless of any valid or invalid
+      // options.
+      section.has_conv = true;
+      break;
+    case ('c'):
+      WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, int, conv_index);
+      break;
+    case ('d'):
+    case ('i'):
+    case ('o'):
+    case ('x'):
+    case ('X'):
+    case ('u'):
+      switch (lm) {
+      case (LengthModifier::hh):
+      case (LengthModifier::h):
+      case (LengthModifier::none):
+        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, int, conv_index);
+        break;
+      case (LengthModifier::l):
+        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, long, conv_index);
+        break;
+      case (LengthModifier::ll):
+      case (LengthModifier::L): // This isn't in the standard, but is in other
+                                // libc implementations.
+
+        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, long long, conv_index);
+        break;
+      case (LengthModifier::j):
+
+        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, intmax_t, conv_index);
+        break;
+      case (LengthModifier::z):
+
+        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, size_t, conv_index);
+        break;
+      case (LengthModifier::t):
+
+        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, ptrdiff_t, conv_index);
+        break;
+      }
+      break;
+#ifndef LIBC_COPT_PRINTF_DISABLE_FLOAT
+    case ('f'):
+    case ('F'):
+    case ('e'):
+    case ('E'):
+    case ('a'):
+    case ('A'):
+    case ('g'):
+    case ('G'):
+      if (lm != LengthModifier::L) {
+        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, double, conv_index);
+      } else {
+        WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, long double, conv_index);
+      }
+      break;
+#endif // LIBC_COPT_PRINTF_DISABLE_FLOAT
+#ifndef LIBC_COPT_PRINTF_DISABLE_WRITE_INT
+    case ('n'):
+#endif // LIBC_COPT_PRINTF_DISABLE_WRITE_INT
+    case ('p'):
+    case ('s'):
+      WRITE_ARG_VAL_SIMPLEST(section.conv_val_ptr, void *, conv_index);
+      break;
+    default:
+      // if the conversion is undefined, change this to a raw section.
+      section.has_conv = false;
+      break;
+    }
+    // If the end of the format section is on the '\0'. This means we need to
+    // not advance the cur_pos.
+    if (str[cur_pos] != '\0')
+      ++cur_pos;
+
+  } else {
+    // raw section
+    section.has_conv = false;
+    while (str[cur_pos] != '%' && str[cur_pos] != '\0')
+      ++cur_pos;
+  }
+  section.raw_string = {str + starting_pos, cur_pos - starting_pos};
+  return section;
+}
+
+template <typename ArgProvider>
+FormatFlags Parser<ArgProvider>::parse_flags(size_t *local_pos) {
+  bool found_flag = true;
+  FormatFlags flags = FormatFlags(0);
+  while (found_flag) {
+    switch (str[*local_pos]) {
+    case '-':
+      flags = static_cast<FormatFlags>(flags | FormatFlags::LEFT_JUSTIFIED);
+      break;
+    case '+':
+      flags = static_cast<FormatFlags>(flags | FormatFlags::FORCE_SIGN);
+      break;
+    case ' ':
+      flags = static_cast<FormatFlags>(flags | FormatFlags::SPACE_PREFIX);
+      break;
+    case '#':
+      flags = static_cast<FormatFlags>(flags | FormatFlags::ALTERNATE_FORM);
+      break;
+    case '0':
+      flags = static_cast<FormatFlags>(flags | FormatFlags::LEADING_ZEROES);
+      break;
+    default:
+      found_flag = false;
+    }
+    if (found_flag)
+      ++*local_pos;
+  }
+  return flags;
+}
+
+template <typename ArgProvider>
+LengthModifier Parser<ArgProvider>::parse_length_modifier(size_t *local_pos) {
+  switch (str[*local_pos]) {
+  case ('l'):
+    if (str[*local_pos + 1] == 'l') {
+      *local_pos += 2;
+      return LengthModifier::ll;
+    } else {
+      ++*local_pos;
+      return LengthModifier::l;
+    }
+  case ('h'):
+    if (str[*local_pos + 1] == 'h') {
+      *local_pos += 2;
+      return LengthModifier::hh;
+    } else {
+      ++*local_pos;
+      return LengthModifier::h;
+    }
+  case ('L'):
+    ++*local_pos;
+    return LengthModifier::L;
+  case ('j'):
+    ++*local_pos;
+    return LengthModifier::j;
+  case ('z'):
+    ++*local_pos;
+    return LengthModifier::z;
+  case ('t'):
+    ++*local_pos;
+    return LengthModifier::t;
+  default:
+    return LengthModifier::none;
+  }
+}
+
+//----------------------------------------------------
+// INDEX MODE ONLY FUNCTIONS AFTER HERE:
+//----------------------------------------------------
+
+#ifndef LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
+
+template <typename ArgProvider>
+size_t Parser<ArgProvider>::parse_index(size_t *local_pos) {
+  if (internal::isdigit(str[*local_pos])) {
+    auto result = internal::strtointeger<int>(str + *local_pos, 10);
+    size_t index = result.value;
+    if (str[*local_pos + result.parsed_len] != '$')
+      return 0;
+    *local_pos = 1 + result.parsed_len + *local_pos;
+    return index;
+  }
+  return 0;
+}
+
+template <typename ArgProvider>
+TypeDesc Parser<ArgProvider>::get_type_desc(size_t index) {
+  // index mode is assumed, and the indicies start at 1, so an index
+  // of 0 is invalid.
+  size_t local_pos = 0;
+
+  while (str[local_pos]) {
+    if (str[local_pos] == '%') {
+      ++local_pos;
+
+      size_t conv_index = parse_index(&local_pos);
+
+      // the flags aren't relevant for this situation, but I need to skip past
+      // them so they're parsed but the result is discarded.
+      parse_flags(&local_pos);
+
+      // handle width
+      if (str[local_pos] == '*') {
+        ++local_pos;
+
+        size_t width_index = parse_index(&local_pos);
+        set_type_desc(width_index, type_desc_from_type<int>());
+        if (width_index == index)
+          return type_desc_from_type<int>();
+
+      } else if (internal::isdigit(str[local_pos])) {
+        while (internal::isdigit(str[local_pos]))
+          ++local_pos;
+      }
+
+      // handle precision
+      if (str[local_pos] == '.') {
+        ++local_pos;
+        if (str[local_pos] == '*') {
+          ++local_pos;
+
+          size_t precision_index = parse_index(&local_pos);
+          set_type_desc(precision_index, type_desc_from_type<int>());
+          if (precision_index == index)
+            return type_desc_from_type<int>();
+
+        } else if (internal::isdigit(str[local_pos])) {
+          while (internal::isdigit(str[local_pos]))
+            ++local_pos;
+        }
+      }
+
+      LengthModifier lm = parse_length_modifier(&local_pos);
+
+      // if we don't have an index for this conversion, then its position is
+      // unknown and all this information is irrelevant. The rest of this logic
+      // has been for skipping past this conversion properly to avoid
+      // weirdness with %%.
+      if (conv_index == 0) {
+        if (str[local_pos] != '\0')
+          ++local_pos;
+        continue;
+      }
+
+      TypeDesc conv_size = type_desc_from_type<void>();
+      switch (str[local_pos]) {
+      case ('%'):
+        conv_size = type_desc_from_type<void>();
+        break;
+      case ('c'):
+        conv_size = type_desc_from_type<int>();
+        break;
+      case ('d'):
+      case ('i'):
+      case ('o'):
+      case ('x'):
+      case ('X'):
+      case ('u'):
+        switch (lm) {
+        case (LengthModifier::hh):
+        case (LengthModifier::h):
+        case (LengthModifier::none):
+          conv_size = type_desc_from_type<int>();
+          break;
+        case (LengthModifier::l):
+          conv_size = type_desc_from_type<long>();
+          break;
+        case (LengthModifier::ll):
+        case (LengthModifier::L): // This isn't in the standard, but is in other
+                                  // libc implementations.
+          conv_size = type_desc_from_type<long long>();
+          break;
+        case (LengthModifier::j):
+          conv_size = type_desc_from_type<intmax_t>();
+          break;
+        case (LengthModifier::z):
+          conv_size = type_desc_from_type<size_t>();
+          break;
+        case (LengthModifier::t):
+          conv_size = type_desc_from_type<ptrdiff_t>();
+          break;
+        }
+        break;
+#ifndef LIBC_COPT_PRINTF_DISABLE_FLOAT
+      case ('f'):
+      case ('F'):
+      case ('e'):
+      case ('E'):
+      case ('a'):
+      case ('A'):
+      case ('g'):
+      case ('G'):
+        if (lm != LengthModifier::L)
+          conv_size = type_desc_from_type<double>();
+        else
+          conv_size = type_desc_from_type<long double>();
+        break;
+#endif // LIBC_COPT_PRINTF_DISABLE_FLOAT
+#ifndef LIBC_COPT_PRINTF_DISABLE_WRITE_INT
+      case ('n'):
+#endif // LIBC_COPT_PRINTF_DISABLE_WRITE_INT
+      case ('p'):
+      case ('s'):
+        conv_size = type_desc_from_type<void *>();
+        break;
+      default:
+        conv_size = type_desc_from_type<int>();
+        break;
+      }
+
+      set_type_desc(conv_index, conv_size);
+      if (conv_index == index)
+        return conv_size;
+    }
+    // If the end of the format section is on the '\0'. This means we need to
+    // not advance the local_pos.
+    if (str[local_pos] != '\0')
+      ++local_pos;
+  }
+
+  // If there is no size for the requested index, then it's unknown. Return
+  // void.
+  return type_desc_from_type<void>();
+}
+
+template <typename ArgProvider> // Advance args_cur so arg `index` is read next.
+bool Parser<ArgProvider>::args_to_index(size_t index) {
+  if (args_index > index) { // Target is behind us: restart from the start.
+    args_index = 1;
+    args_cur = args_start;
+  }
+
+  while (args_index < index) {
+    TypeDesc cur_type_desc = type_desc_from_type<void>();
+    if (args_index <= DESC_ARR_LEN) // Try desc_arr before parsing.
+      cur_type_desc = desc_arr[args_index - 1];
+
+    if (cur_type_desc == type_desc_from_type<void>())
+      cur_type_desc = get_type_desc(args_index);
+
+    // A type of void represents the type being unknown. If the type for the
+    // requested index isn't in the desc_arr and isn't found by parsing the
+    // string, then advancing to the requested index is impossible. In that
+    // case the function returns false.
+    if (cur_type_desc == type_desc_from_type<void>())
+      return false;
+
+    if (cur_type_desc == type_desc_from_type<uint32_t>())
+      args_cur.template next_var<uint32_t>();
+    else if (cur_type_desc == type_desc_from_type<uint64_t>())
+      args_cur.template next_var<uint64_t>();
+#ifndef LIBC_COPT_PRINTF_DISABLE_FLOAT
+    // Floating point numbers are stored separately from the other arguments.
+    else if (cur_type_desc == type_desc_from_type<double>())
+      args_cur.template next_var<double>();
+    else if (cur_type_desc == type_desc_from_type<long double>())
+      args_cur.template next_var<long double>();
+#endif // LIBC_COPT_PRINTF_DISABLE_FLOAT
+    // Pointers may be stored separately from normal values.
+    else if (cur_type_desc == type_desc_from_type<void *>())
+      args_cur.template next_var<void *>();
+    else // Any other known type is consumed as a uint32_t.
+      args_cur.template next_var<uint32_t>();
+
+    ++args_index;
+  }
+  return true;
+}
+
+#endif // LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
+
 } // namespace printf_core
 } // namespace __llvm_libc
 
diff --git a/libc/src/stdio/printf_core/printf_main.cpp b/libc/src/stdio/printf_core/printf_main.cpp
index b7684cdf1e74fc0..60d1e210eee4cf8 100644
--- a/libc/src/stdio/printf_core/printf_main.cpp
+++ b/libc/src/stdio/printf_core/printf_main.cpp
@@ -21,7 +21,7 @@ namespace printf_core {
 
 int printf_main(Writer *writer, const char *__restrict str,
                 internal::ArgList &args) {
-  Parser parser(str, args);
+  Parser<internal::ArgList> parser(str, args);
   int result = 0;
   for (FormatSection cur_section = parser.get_next_section();
        !cur_section.raw_string.empty();
diff --git a/libc/src/stdio/scanf_core/CMakeLists.txt b/libc/src/stdio/scanf_core/CMakeLists.txt
index 9f6cb9c386eb226..8cdd33e5c2c0f94 100644
--- a/libc/src/stdio/scanf_core/CMakeLists.txt
+++ b/libc/src/stdio/scanf_core/CMakeLists.txt
@@ -8,10 +8,8 @@ add_header_library(
     libc.src.__support.FPUtil.fp_bits
 )
 
-add_object_library(
+add_header_library(
   parser
-  SRCS
-    parser.cpp
   HDRS
     parser.h
   DEPENDS
diff --git a/libc/src/stdio/scanf_core/parser.cpp b/libc/src/stdio/scanf_core/parser.cpp
deleted file mode 100644
index 44e853c8a8de8fe..000000000000000
--- a/libc/src/stdio/scanf_core/parser.cpp
+++ /dev/null
@@ -1,225 +0,0 @@
-//===-- Format string parser implementation for scanf ----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// #define LIBC_COPT_SCANF_DISABLE_INDEX_MODE 1 // This will be a compile flag.
-
-#include "src/stdio/scanf_core/parser.h"
-
-#include "src/__support/arg_list.h"
-
-#include "src/__support/CPP/bit.h"
-#include "src/__support/CPP/bitset.h"
-#include "src/__support/CPP/string_view.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/ctype_utils.h"
-#include "src/__support/str_to_integer.h"
-
-namespace __llvm_libc {
-namespace scanf_core {
-
-#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
-#define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index)
-#else
-#define GET_ARG_VAL_SIMPLEST(arg_type, _) get_next_arg_value<arg_type>()
-#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
-
-FormatSection Parser::get_next_section() {
-  FormatSection section;
-  size_t starting_pos = cur_pos;
-  if (str[cur_pos] == '%') {
-    // format section
-    section.has_conv = true;
-
-    ++cur_pos;
-    [[maybe_unused]] size_t conv_index = 0;
-
-#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
-    conv_index = parse_index(&cur_pos);
-#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
-
-    if (str[cur_pos] == '*') {
-      ++cur_pos;
-      section.flags = FormatFlags::NO_WRITE;
-    }
-
-    // handle width
-    section.max_width = -1;
-    if (internal::isdigit(str[cur_pos])) {
-      auto result = internal::strtointeger<int>(str + cur_pos, 10);
-      section.max_width = result.value;
-      cur_pos = cur_pos + result.parsed_len;
-    }
-
-    // TODO(michaelrj): add posix allocate flag support.
-    // if (str[cur_pos] == 'm') {
-    //   ++cur_pos;
-    //   section.flags = FormatFlags::ALLOCATE;
-    // }
-
-    LengthModifier lm = parse_length_modifier(&cur_pos);
-    section.length_modifier = lm;
-
-    section.conv_name = str[cur_pos];
-
-    // If NO_WRITE is not set, then read the next arg as the output pointer.
-    if ((section.flags & FormatFlags::NO_WRITE) == 0) {
-      // Since all outputs are pointers, there's no need to distinguish when
-      // reading from va_args. They're all the same size and stored the same.
-      section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index);
-    }
-
-    // If the end of the format section is on the '\0'. This means we need to
-    // not advance the cur_pos and we should not count this has having a
-    // conversion.
-    if (str[cur_pos] != '\0') {
-      ++cur_pos;
-    } else {
-      section.has_conv = false;
-    }
-
-    // If the format is a bracketed one, then we need to parse out the insides
-    // of the brackets.
-    if (section.conv_name == '[') {
-      constexpr char CLOSING_BRACKET = ']';
-      constexpr char INVERT_FLAG = '^';
-      constexpr char RANGE_OPERATOR = '-';
-
-      cpp::bitset<256> scan_set;
-      bool invert = false;
-
-      // The circumflex in the first position represents the inversion flag, but
-      // it's easier to apply that at the end so we just store it for now.
-      if (str[cur_pos] == INVERT_FLAG) {
-        invert = true;
-        ++cur_pos;
-      }
-
-      // This is used to determine if a hyphen is being used as a literal or as
-      // a range operator.
-      size_t set_start_pos = cur_pos;
-
-      // Normally the right bracket closes the set, but if it's the first
-      // character (possibly after the inversion flag) then it's instead
-      // included as a character in the set and the second right bracket closes
-      // the set.
-      if (str[cur_pos] == CLOSING_BRACKET) {
-        scan_set.set(CLOSING_BRACKET);
-        ++cur_pos;
-      }
-
-      while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) {
-        // If a hyphen is being used as a range operator, since it's neither at
-        // the beginning nor end of the set.
-        if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos &&
-            str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') {
-          // Technically there is no requirement to correct the ordering of the
-          // range, but since the range operator is entirely implementation
-          // defined it seems like a good convenience.
-          char a = str[cur_pos - 1];
-          char b = str[cur_pos + 1];
-          char start = (a < b ? a : b);
-          char end = (a < b ? b : a);
-          scan_set.set_range(start, end);
-          cur_pos += 2;
-        } else {
-          scan_set.set(str[cur_pos]);
-          ++cur_pos;
-        }
-      }
-      if (invert)
-        scan_set.flip();
-
-      if (str[cur_pos] == CLOSING_BRACKET) {
-        ++cur_pos;
-        section.scan_set = scan_set;
-      } else {
-        // if the end of the string was encountered, this is not a valid set.
-        section.has_conv = false;
-      }
-    }
-  } else {
-    // raw section
-    section.has_conv = false;
-    while (str[cur_pos] != '%' && str[cur_pos] != '\0')
-      ++cur_pos;
-  }
-  section.raw_string = {str + starting_pos, cur_pos - starting_pos};
-  return section;
-}
-
-LengthModifier Parser::parse_length_modifier(size_t *local_pos) {
-  switch (str[*local_pos]) {
-  case ('l'):
-    if (str[*local_pos + 1] == 'l') {
-      *local_pos += 2;
-      return LengthModifier::ll;
-    } else {
-      ++*local_pos;
-      return LengthModifier::l;
-    }
-  case ('h'):
-    if (str[*local_pos + 1] == 'h') {
-      *local_pos += 2;
-      return LengthModifier::hh;
-    } else {
-      ++*local_pos;
-      return LengthModifier::h;
-    }
-  case ('L'):
-    ++*local_pos;
-    return LengthModifier::L;
-  case ('j'):
-    ++*local_pos;
-    return LengthModifier::j;
-  case ('z'):
-    ++*local_pos;
-    return LengthModifier::z;
-  case ('t'):
-    ++*local_pos;
-    return LengthModifier::t;
-  default:
-    return LengthModifier::NONE;
-  }
-}
-
-//----------------------------------------------------
-// INDEX MODE ONLY FUNCTIONS AFTER HERE:
-//----------------------------------------------------
-
-#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
-
-size_t Parser::parse_index(size_t *local_pos) {
-  if (internal::isdigit(str[*local_pos])) {
-    auto result = internal::strtointeger<int>(str + *local_pos, 10);
-    size_t index = result.value;
-    if (str[*local_pos + result.parsed_len] != '$')
-      return 0;
-    *local_pos = 1 + result.parsed_len + *local_pos;
-    return index;
-  }
-  return 0;
-}
-
-void Parser::args_to_index(size_t index) {
-  if (args_index > index) {
-    args_index = 1;
-    args_cur = args_start;
-  }
-
-  while (args_index < index) {
-    // Since all arguments must be pointers, we can just read all of them as
-    // void * and not worry about type issues.
-    args_cur.next_var<void *>();
-    ++args_index;
-  }
-}
-
-#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
-
-} // namespace scanf_core
-} // namespace __llvm_libc
diff --git a/libc/src/stdio/scanf_core/parser.h b/libc/src/stdio/scanf_core/parser.h
index 4b9f0b4dd95b94b..5554d34c0c95d46 100644
--- a/libc/src/stdio/scanf_core/parser.h
+++ b/libc/src/stdio/scanf_core/parser.h
@@ -11,6 +11,8 @@
 
 #include "src/__support/arg_list.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
+#include "src/__support/str_to_integer.h"
 #include "src/stdio/scanf_core/core_structs.h"
 #include "src/stdio/scanf_core/scanf_config.h"
 
@@ -19,17 +21,23 @@
 namespace __llvm_libc {
 namespace scanf_core {
 
-class Parser {
+#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
+#define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index)
+#else
+#define GET_ARG_VAL_SIMPLEST(arg_type, _) get_next_arg_value<arg_type>()
+#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
+
+template <typename ArgProvider> class Parser {
   const char *__restrict str;
 
   size_t cur_pos = 0;
-  internal::ArgList args_cur;
+  ArgProvider args_cur;
 
 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
   // args_start stores the start of the va_args, which is used when a previous
   // argument is needed. In that case, we have to read the arguments from the
   // beginning since they don't support reading backwards.
-  internal::ArgList args_start;
+  ArgProvider args_start;
   size_t args_index = 1;
 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
 
@@ -57,7 +65,7 @@ class Parser {
 
   // get_next_arg_value gets the next value from the arg list as type T.
   template <class T> LIBC_INLINE T get_next_arg_value() {
-    return args_cur.next_var<T>();
+    return args_cur.template next_var<T>();
   }
 
   //----------------------------------------------------
@@ -94,6 +102,203 @@ class Parser {
 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
 };
 
+template <typename ArgProvider> // Parses the next raw or '%' section of str.
+FormatSection Parser<ArgProvider>::get_next_section() {
+  FormatSection section;
+  size_t starting_pos = cur_pos;
+  if (str[cur_pos] == '%') {
+    // format section (begins with '%')
+    section.has_conv = true;
+
+    ++cur_pos;
+    [[maybe_unused]] size_t conv_index = 0;
+
+#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
+    conv_index = parse_index(&cur_pos);
+#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
+
+    if (str[cur_pos] == '*') { // '*' sets NO_WRITE: no output pointer is read.
+      ++cur_pos;
+      section.flags = FormatFlags::NO_WRITE;
+    }
+
+    // handle width
+    section.max_width = -1; // Stays -1 if no width digits follow.
+    if (internal::isdigit(str[cur_pos])) {
+      auto result = internal::strtointeger<int>(str + cur_pos, 10);
+      section.max_width = result.value;
+      cur_pos = cur_pos + result.parsed_len;
+    }
+
+    // TODO(michaelrj): add posix allocate flag support.
+    // if (str[cur_pos] == 'm') {
+    //   ++cur_pos;
+    //   section.flags = FormatFlags::ALLOCATE;
+    // }
+
+    LengthModifier lm = parse_length_modifier(&cur_pos);
+    section.length_modifier = lm;
+
+    section.conv_name = str[cur_pos];
+
+    // If NO_WRITE is not set, then read the next arg as the output pointer.
+    if ((section.flags & FormatFlags::NO_WRITE) == 0) {
+      // Since all outputs are pointers, there's no need to distinguish when
+      // reading from va_args. They're all the same size and stored the same.
+      section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index);
+    }
+
+    // If the format section ends on the '\0' terminator, then cur_pos must
+    // not be advanced past it, and the section is not counted as having a
+    // conversion.
+    if (str[cur_pos] != '\0') {
+      ++cur_pos;
+    } else {
+      section.has_conv = false;
+    }
+
+    // If the format is a bracketed one, then we need to parse out the insides
+    // of the brackets.
+    if (section.conv_name == '[') {
+      constexpr char CLOSING_BRACKET = ']';
+      constexpr char INVERT_FLAG = '^';
+      constexpr char RANGE_OPERATOR = '-';
+
+      cpp::bitset<256> scan_set;
+      bool invert = false;
+
+      // The circumflex in the first position represents the inversion flag, but
+      // it's easier to apply that at the end so we just store it for now.
+      if (str[cur_pos] == INVERT_FLAG) {
+        invert = true;
+        ++cur_pos;
+      }
+
+      // This is used to determine if a hyphen is being used as a literal or as
+      // a range operator.
+      size_t set_start_pos = cur_pos;
+
+      // Normally the right bracket closes the set, but if it's the first
+      // character (possibly after the inversion flag) then it's instead
+      // included as a character in the set and the second right bracket closes
+      // the set.
+      if (str[cur_pos] == CLOSING_BRACKET) {
+        scan_set.set(CLOSING_BRACKET);
+        ++cur_pos;
+      }
+
+      while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) {
+        // A hyphen is treated as the range operator only when it is neither
+        // the first nor the last character of the set.
+        if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos &&
+            str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') {
+          // Technically there is no requirement to correct the ordering of the
+          // range, but since the range operator is entirely implementation
+          // defined it seems like a good convenience.
+          char a = str[cur_pos - 1];
+          char b = str[cur_pos + 1];
+          char start = (a < b ? a : b);
+          char end = (a < b ? b : a);
+          scan_set.set_range(start, end);
+          cur_pos += 2;
+        } else {
+          scan_set.set(str[cur_pos]); // NOTE(review): char may be negative.
+          ++cur_pos;
+        }
+      }
+      if (invert)
+        scan_set.flip();
+
+      if (str[cur_pos] == CLOSING_BRACKET) {
+        ++cur_pos;
+        section.scan_set = scan_set;
+      } else {
+        // If the end of the string was encountered, this is not a valid set.
+        section.has_conv = false;
+      }
+    }
+  } else {
+    // Raw section: everything up to the next '%' or the end of the string.
+    section.has_conv = false;
+    while (str[cur_pos] != '%' && str[cur_pos] != '\0')
+      ++cur_pos;
+  }
+  section.raw_string = {str + starting_pos, cur_pos - starting_pos};
+  return section;
+}
+
+template <typename ArgProvider> // Reads the length modifier, if any.
+LengthModifier Parser<ArgProvider>::parse_length_modifier(size_t *local_pos) {
+  switch (str[*local_pos]) {
+  case ('l'): // Either "l" or "ll"; decided by the next character.
+    if (str[*local_pos + 1] == 'l') {
+      *local_pos += 2;
+      return LengthModifier::ll;
+    } else {
+      ++*local_pos;
+      return LengthModifier::l;
+    }
+  case ('h'): // Either "h" or "hh"; decided by the next character.
+    if (str[*local_pos + 1] == 'h') {
+      *local_pos += 2;
+      return LengthModifier::hh;
+    } else {
+      ++*local_pos;
+      return LengthModifier::h;
+    }
+  case ('L'):
+    ++*local_pos;
+    return LengthModifier::L;
+  case ('j'):
+    ++*local_pos;
+    return LengthModifier::j;
+  case ('z'):
+    ++*local_pos;
+    return LengthModifier::z;
+  case ('t'):
+    ++*local_pos;
+    return LengthModifier::t;
+  default: // No modifier here; *local_pos is left untouched.
+    return LengthModifier::NONE;
+  }
+}
+
+//----------------------------------------------------
+// INDEX MODE ONLY FUNCTIONS AFTER HERE:
+//----------------------------------------------------
+
+#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
+
+template <typename ArgProvider> // Parses an optional "<number>$" index.
+size_t Parser<ArgProvider>::parse_index(size_t *local_pos) {
+  if (internal::isdigit(str[*local_pos])) {
+    auto result = internal::strtointeger<int>(str + *local_pos, 10);
+    size_t index = result.value;
+    if (str[*local_pos + result.parsed_len] != '$')
+      return 0; // Digits without '$' are not an index; consume nothing.
+    *local_pos = 1 + result.parsed_len + *local_pos; // Skip digits and '$'.
+    return index;
+  }
+  return 0; // 0 signals that no index was present.
+}
+
+template <typename ArgProvider> // Advance args_cur so arg `index` is read next.
+void Parser<ArgProvider>::args_to_index(size_t index) {
+  if (args_index > index) { // Target is behind us: restart from the start.
+    args_index = 1;
+    args_cur = args_start;
+  }
+
+  while (args_index < index) {
+    // Since all arguments must be pointers, we can just read all of them as
+    // void * and not worry about type issues.
+    args_cur.template next_var<void *>();
+    ++args_index;
+  }
+}
+
+#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
+
 } // namespace scanf_core
 } // namespace __llvm_libc
 
diff --git a/libc/src/stdio/scanf_core/scanf_main.cpp b/libc/src/stdio/scanf_core/scanf_main.cpp
index 5a79d2e624ab0aa..e7e41cbe899d720 100644
--- a/libc/src/stdio/scanf_core/scanf_main.cpp
+++ b/libc/src/stdio/scanf_core/scanf_main.cpp
@@ -21,7 +21,7 @@ namespace scanf_core {
 
 int scanf_main(Reader *reader, const char *__restrict str,
                internal::ArgList &args) {
-  Parser parser(str, args);
+  Parser<internal::ArgList> parser(str, args);
   int ret_val = READ_OK;
   int conversions = 0;
   for (FormatSection cur_section = parser.get_next_section();
diff --git a/libc/test/src/stdio/printf_core/parser_test.cpp b/libc/test/src/stdio/printf_core/parser_test.cpp
index 61f8c7cbe580e74..910b611f5194939 100644
--- a/libc/test/src/stdio/printf_core/parser_test.cpp
+++ b/libc/test/src/stdio/printf_core/parser_test.cpp
@@ -17,24 +17,25 @@
 #include "test/UnitTest/Test.h"
 
 using __llvm_libc::cpp::string_view;
+using __llvm_libc::internal::ArgList;
 
 void init(const char *__restrict str, ...) {
   va_list vlist;
   va_start(vlist, str);
-  __llvm_libc::internal::ArgList v(vlist);
+  ArgList v(vlist);
   va_end(vlist);
 
-  __llvm_libc::printf_core::Parser parser(str, v);
+  __llvm_libc::printf_core::Parser<ArgList> parser(str, v);
 }
 
 void evaluate(__llvm_libc::printf_core::FormatSection *format_arr,
               const char *__restrict str, ...) {
   va_list vlist;
   va_start(vlist, str);
-  __llvm_libc::internal::ArgList v(vlist);
+  ArgList v(vlist);
   va_end(vlist);
 
-  __llvm_libc::printf_core::Parser parser(str, v);
+  __llvm_libc::printf_core::Parser<ArgList> parser(str, v);
 
   for (auto cur_section = parser.get_next_section();
        !cur_section.raw_string.empty();
diff --git a/libc/test/src/stdio/scanf_core/parser_test.cpp b/libc/test/src/stdio/scanf_core/parser_test.cpp
index 2ccaf84c6755233..b1f9efa0f8a2bc7 100644
--- a/libc/test/src/stdio/scanf_core/parser_test.cpp
+++ b/libc/test/src/stdio/scanf_core/parser_test.cpp
@@ -18,14 +18,15 @@
 #include "test/UnitTest/Test.h"
 
 using __llvm_libc::cpp::string_view;
+using __llvm_libc::internal::ArgList;
 
 void init(const char *__restrict str, ...) {
   va_list vlist;
   va_start(vlist, str);
-  __llvm_libc::internal::ArgList v(vlist);
+  ArgList v(vlist);
   va_end(vlist);
 
-  __llvm_libc::scanf_core::Parser parser(str, v);
+  __llvm_libc::scanf_core::Parser<ArgList> parser(str, v);
 }
 
 void evaluate(__llvm_libc::scanf_core::FormatSection *format_arr,
@@ -35,7 +36,7 @@ void evaluate(__llvm_libc::scanf_core::FormatSection *format_arr,
   __llvm_libc::internal::ArgList v(vlist);
   va_end(vlist);
 
-  __llvm_libc::scanf_core::Parser parser(str, v);
+  __llvm_libc::scanf_core::Parser<ArgList> parser(str, v);
 
   for (auto cur_section = parser.get_next_section();
        !cur_section.raw_string.empty();

>From 2ac72735eecc08bb3bcdeab0ba9480672c1d5e7e Mon Sep 17 00:00:00 2001
From: Joseph Huber <jhuber6 at vols.utk.edu>
Date: Thu, 14 Sep 2023 13:08:57 -0500
Subject: [PATCH 2/2] Update bazel

---
 utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 2 --
 1 file changed, 2 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 17e4913749d51c6..0de48b3b01b9f90 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -2688,7 +2688,6 @@ libc_support_library(
 
 libc_support_library(
     name = "printf_parser",
-    srcs = ["src/stdio/printf_core/parser.cpp"],
     hdrs = ["src/stdio/printf_core/parser.h"],
     defines = PRINTF_COPTS,
     deps = [
@@ -2710,7 +2709,6 @@ libc_support_library(
 # Only used for testing.
 libc_support_library(
     name = "printf_mock_parser",
-    srcs = ["src/stdio/printf_core/parser.cpp"],
     hdrs = ["src/stdio/printf_core/parser.h"],
     defines = PRINTF_COPTS + ["LIBC_COPT_MOCK_ARG_LIST"],
     deps = [



More information about the libc-commits mailing list