[libc-commits] [libc] 945fa67 - [libc][NFC] add index mode to printf parser

Michael Jones via libc-commits libc-commits at lists.llvm.org
Fri May 6 12:06:13 PDT 2022


Author: Michael Jones
Date: 2022-05-06T12:06:08-07:00
New Revision: 945fa672c60d265fa0dc13f5a5445768945feb0a

URL: https://github.com/llvm/llvm-project/commit/945fa672c60d265fa0dc13f5a5445768945feb0a
DIFF: https://github.com/llvm/llvm-project/commit/945fa672c60d265fa0dc13f5a5445768945feb0a.diff

LOG: [libc][NFC] add index mode to printf parser

This patch is a followup to the previous patch which implemented the
main printf parsing logic as well as sequential mode. This patch adds
index mode.

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D123424

Added: 
    

Modified: 
    libc/src/stdio/printf_core/CMakeLists.txt
    libc/src/stdio/printf_core/parser.cpp
    libc/src/stdio/printf_core/parser.h
    libc/test/src/stdio/printf_core/parser_test.cpp

Removed: 
    


################################################################################
diff  --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt
index d167a0191c840..965bbcd14da89 100644
--- a/libc/src/stdio/printf_core/CMakeLists.txt
+++ b/libc/src/stdio/printf_core/CMakeLists.txt
@@ -17,7 +17,7 @@ add_object_library(
     libc.src.__support.ctype_utils
     libc.src.__support.str_to_integer
     libc.src.__support.CPP.bit
-
+    libc.src.string.memory_utils.memset_implementation
 )
 
 add_header_library(

diff  --git a/libc/src/stdio/printf_core/parser.cpp b/libc/src/stdio/printf_core/parser.cpp
index 1541416a84c7c..a373f38536b5b 100644
--- a/libc/src/stdio/printf_core/parser.cpp
+++ b/libc/src/stdio/printf_core/parser.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+// #define LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE 1 // This will be a compile flag.
+
 #include "parser.h"
 
 #include "src/__support/arg_list.h"
@@ -17,8 +19,6 @@
 namespace __llvm_libc {
 namespace printf_core {
 
-#define LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE 1 // This will be a compile flag.
-
 #ifndef LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
 #define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index)
 #else
@@ -36,6 +36,10 @@ FormatSection Parser::get_next_section() {
     ++cur_pos;
     [[maybe_unused]] size_t conv_index = 0;
 
+#ifndef LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
+    conv_index = parse_index(&cur_pos);
+#endif // LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
+
     section.flags = parse_flags(&cur_pos);
 
     // handle width
@@ -192,7 +196,7 @@ LengthModifier Parser::parse_length_modifier(size_t *local_pos) {
       return LengthModifier::l;
     }
   case ('h'):
-    if (str[cur_pos + 1] == 'h') {
+    if (str[*local_pos + 1] == 'h') {
       *local_pos += 2;
       return LengthModifier::hh;
     } else {
@@ -216,5 +220,190 @@ LengthModifier Parser::parse_length_modifier(size_t *local_pos) {
   }
 }
 
+//----------------------------------------------------
+// INDEX MODE ONLY FUNCTIONS AFTER HERE:
+//----------------------------------------------------
+
+#ifndef LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
+
+size_t Parser::parse_index(size_t *local_pos) {
+  if (internal::isdigit(str[*local_pos])) {
+    char *int_end;
+    size_t index =
+        internal::strtointeger<size_t>(str + *local_pos, &int_end, 10);
+    if (int_end[0] != '$')
+      return 0;
+    *local_pos = 1 + int_end - str;
+    return index;
+  }
+  return 0;
+}
+
+Parser::TypeDesc Parser::get_type_desc(size_t index) {
+  // index mode is assumed, and the indicies start at 1, so an index
+  // of 0 is invalid.
+  size_t local_pos = 0;
+
+  while (str[local_pos]) {
+    if (str[local_pos] == '%') {
+      ++local_pos;
+
+      size_t conv_index = parse_index(&local_pos);
+
+      // the flags aren't relevant for this situation, but I need to skip past
+      // them so they're parsed but the result is discarded.
+      parse_flags(&local_pos);
+
+      // handle width
+      if (str[local_pos] == '*') {
+        ++local_pos;
+
+        size_t width_index = parse_index(&local_pos);
+        set_type_desc(width_index, TYPE_DESC<int>);
+        if (width_index == index)
+          return TYPE_DESC<int>;
+
+      } else if (internal::isdigit(str[local_pos])) {
+        while (internal::isdigit(str[local_pos]))
+          ++local_pos;
+      }
+
+      // handle precision
+      if (str[local_pos] == '.') {
+        ++local_pos;
+        if (str[local_pos] == '*') {
+          ++local_pos;
+
+          size_t precision_index = parse_index(&local_pos);
+          set_type_desc(precision_index, TYPE_DESC<int>);
+          if (precision_index == index)
+            return TYPE_DESC<int>;
+
+        } else if (internal::isdigit(str[local_pos])) {
+          while (internal::isdigit(str[local_pos]))
+            ++local_pos;
+        }
+      }
+
+      LengthModifier lm = parse_length_modifier(&local_pos);
+
+      // if we don't have an index for this conversion, then its position is
+      // unknown and all this information is irrelevant. The rest of this logic
+      // has been for skipping past this conversion properly to avoid
+      // weirdness with %%.
+      if (conv_index == 0) {
+        ++local_pos;
+        continue;
+      }
+
+      TypeDesc conv_size = TYPE_DESC<void>;
+      switch (str[local_pos]) {
+      case ('%'):
+        conv_size = TYPE_DESC<void>;
+        break;
+      case ('c'):
+        conv_size = TYPE_DESC<int>;
+        break;
+      case ('d'):
+      case ('i'):
+      case ('o'):
+      case ('x'):
+      case ('X'):
+      case ('u'):
+        switch (lm) {
+        case (LengthModifier::hh):
+        case (LengthModifier::h):
+        case (LengthModifier::none):
+          conv_size = TYPE_DESC<int>;
+          break;
+        case (LengthModifier::l):
+          conv_size = TYPE_DESC<long>;
+          break;
+        case (LengthModifier::ll):
+        case (LengthModifier::L): // This isn't in the standard, but is in other
+                                  // libc implementations.
+          conv_size = TYPE_DESC<long long>;
+          break;
+        case (LengthModifier::j):
+          conv_size = TYPE_DESC<intmax_t>;
+          break;
+        case (LengthModifier::z):
+          conv_size = TYPE_DESC<size_t>;
+          break;
+        case (LengthModifier::t):
+          conv_size = TYPE_DESC<ptr
diff _t>;
+          break;
+        }
+        break;
+      case ('f'):
+      case ('F'):
+      case ('e'):
+      case ('E'):
+      case ('a'):
+      case ('A'):
+      case ('g'):
+      case ('G'):
+        if (lm != LengthModifier::L)
+          conv_size = TYPE_DESC<double>;
+        else
+          conv_size = TYPE_DESC<long double>;
+        break;
+      case ('n'):
+      case ('p'):
+      case ('s'):
+        conv_size = TYPE_DESC<void *>;
+        break;
+      default:
+        conv_size = TYPE_DESC<int>;
+        break;
+      }
+
+      set_type_desc(conv_index, conv_size);
+      if (conv_index == index)
+        return conv_size;
+    }
+    ++local_pos;
+  }
+
+  // If there is no size for the requested index, then just guess that it's an
+  // int.
+  return TYPE_DESC<int>;
+}
+
+void Parser::args_to_index(size_t index) {
+  if (args_index > index) {
+    args_index = 1;
+    args_cur = args_start;
+  }
+
+  while (args_index < index) {
+    Parser::TypeDesc cur_type_desc = TYPE_DESC<void>;
+    if (args_index <= DESC_ARR_LEN)
+      cur_type_desc = desc_arr[args_index - 1];
+
+    if (cur_type_desc == TYPE_DESC<void>)
+      cur_type_desc = get_type_desc(args_index);
+
+    if (cur_type_desc == TYPE_DESC<uint32_t>)
+      args_cur.next_var<uint32_t>();
+    else if (cur_type_desc == TYPE_DESC<uint64_t>)
+      args_cur.next_var<uint64_t>();
+    // Floating point numbers are stored separately from the other arguments.
+    else if (cur_type_desc == TYPE_DESC<double>)
+      args_cur.next_var<double>();
+    else if (cur_type_desc == TYPE_DESC<long double>)
+      args_cur.next_var<long double>();
+    // pointers may be stored separately from normal values.
+    else if (cur_type_desc == TYPE_DESC<void *>)
+      args_cur.next_var<void *>();
+    else
+      args_cur.next_var<uint32_t>();
+
+    ++args_index;
+  }
+}
+
+#endif // LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
+
 } // namespace printf_core
 } // namespace __llvm_libc

diff  --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h
index aa33bbd9535a9..25dd0f65e8171 100644
--- a/libc/src/stdio/printf_core/parser.h
+++ b/libc/src/stdio/printf_core/parser.h
@@ -11,6 +11,7 @@
 
 #include "src/__support/arg_list.h"
 #include "src/stdio/printf_core/core_structs.h"
+#include "src/string/memory_utils/memset_implementations.h"
 
 #include <stddef.h>
 
@@ -21,16 +22,50 @@ class Parser {
   const char *__restrict str;
 
   size_t cur_pos = 0;
+  internal::ArgList args_cur;
 
+#ifndef LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
+  // args_start stores the start of the va_args, which is allows getting the
+  // value of arguments that have already been passed. args_index is tracked so
+  // that we know which argument args_cur is on.
   internal::ArgList args_start;
-  internal::ArgList args_cur;
   size_t args_index = 1;
 
+  enum PrimaryType : uint8_t { Integer = 0, Float = 1, Pointer = 2 };
+
+  // TypeDesc stores the information about a type that is relevant to printf in
+  // a relatively compact manner.
+  struct TypeDesc {
+    uint8_t size;
+    PrimaryType primary_type;
+    constexpr bool operator==(const TypeDesc &other) const {
+      return (size == other.size) && (primary_type == other.primary_type);
+    }
+  };
+  // TODO: Make this size configurable via a compile option.
+  static constexpr size_t DESC_ARR_LEN = 32;
+  // desc_arr stores the sizes of the variables in the ArgList. This is used in
+  // index mode to reduce repeated string parsing. The sizes are stored as
+  // TypeDesc objects, which store the size as well as minimal type information.
+  // This is necessary because some systems separate the floating point and
+  // integer values in va_args.
+  TypeDesc desc_arr[DESC_ARR_LEN];
+
   // TODO: Look into object stores for optimization.
 
+#endif // LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
+
 public:
+#ifndef LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
   Parser(const char *__restrict new_str, internal::ArgList &args)
-      : str(new_str), args_start(args), args_cur(args) {}
+      : str(new_str), args_cur(args), args_start(args) {
+    inline_memset(reinterpret_cast<char *>(desc_arr), 0,
+                  DESC_ARR_LEN * sizeof(TypeDesc));
+  }
+#else
+  Parser(const char *__restrict new_str, internal::ArgList &args)
+      : str(new_str), args_cur(args) {}
+#endif // LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
 
   // get_next_section will parse the format string until it has a fully
   // specified format section. This can either be a raw format section with no
@@ -53,9 +88,68 @@ class Parser {
 
   // get_next_arg_value gets the next value from the arg list as type T.
   template <class T> T inline get_next_arg_value() {
-    ++args_index;
     return args_cur.next_var<T>();
   }
+
+  //----------------------------------------------------
+  // INDEX MODE ONLY FUNCTIONS AFTER HERE:
+  //----------------------------------------------------
+
+#ifndef LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
+
+  // parse_index parses the index of a value inside a format string. It
+  // assumes that str[*local_pos] points to character after a '%' or '*', and
+  // returns 0 if there is no closing $, or if it finds no number. If it finds a
+  // number, it will move local_pos past the end of the $, else it will not move
+  // local_pos.
+  size_t parse_index(size_t *local_pos);
+
+  template <typename T>
+  static constexpr TypeDesc TYPE_DESC{sizeof(T), PrimaryType::Integer};
+  template <>
+  static constexpr TypeDesc TYPE_DESC<double>{sizeof(double),
+                                              PrimaryType::Float};
+  template <>
+  static constexpr TypeDesc TYPE_DESC<long double>{sizeof(long double),
+                                                   PrimaryType::Float};
+  template <>
+  static constexpr TypeDesc TYPE_DESC<void *>{sizeof(void *),
+                                              PrimaryType::Pointer};
+  template <>
+  static constexpr TypeDesc TYPE_DESC<void>{0, PrimaryType::Integer};
+
+  void inline set_type_desc(size_t index, TypeDesc value) {
+    if (index != 0 && index <= DESC_ARR_LEN)
+      desc_arr[index - 1] = value;
+  }
+
+  // get_arg_value gets the value from the arg list at index (starting at 1).
+  // This may require parsing the format string. An index of 0 is interpreted as
+  // the next value.
+  template <class T> T inline get_arg_value(size_t index) {
+    if (!(index == 0 || index == args_index))
+      args_to_index(index);
+
+    set_type_desc(index, TYPE_DESC<T>);
+
+    ++args_index;
+    return get_next_arg_value<T>();
+  }
+
+  // the ArgList can only return the next item in the list. This function is
+  // used in index mode when the item that needs to be read is not the next one.
+  // It moves cur_args to the index requested so the the appropriate value may
+  // be read. This may involve parsing the format string, and is in the worst
+  // case an O(n^2) operation.
+  void args_to_index(size_t index);
+
+  // get_type_desc assumes that this format string uses index mode. It iterates
+  // through the format string until it finds a format specifier that defines
+  // the type of index, and returns a TypeDesc describing that type. It does not
+  // modify cur_pos.
+  TypeDesc get_type_desc(size_t index);
+
+#endif // LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
 };
 
 } // namespace printf_core

diff  --git a/libc/test/src/stdio/printf_core/parser_test.cpp b/libc/test/src/stdio/printf_core/parser_test.cpp
index 73c40eab94a3d..e149acb6658d8 100644
--- a/libc/test/src/stdio/printf_core/parser_test.cpp
+++ b/libc/test/src/stdio/printf_core/parser_test.cpp
@@ -260,3 +260,199 @@ TEST(LlvmLibcPrintfParserTest, EvalThreeArgs) {
 
   ASSERT_FORMAT_EQ(expected2, format_arr[2]);
 }
+
+#ifndef LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
+
+TEST(LlvmLibcPrintfParserTest, IndexModeOneArg) {
+  __llvm_libc::printf_core::FormatSection format_arr[10];
+  const char *str = "%1$d";
+  int arg1 = 12345;
+  evaluate(format_arr, str, arg1);
+
+  __llvm_libc::printf_core::FormatSection expected;
+  expected.has_conv = true;
+  expected.raw_len = 4;
+  expected.raw_string = str;
+  expected.conv_val_raw = arg1;
+  expected.conv_name = 'd';
+
+  ASSERT_FORMAT_EQ(expected, format_arr[0]);
+}
+
+TEST(LlvmLibcPrintfParserTest, IndexModeThreeArgsSequential) {
+  __llvm_libc::printf_core::FormatSection format_arr[10];
+  const char *str = "%1$d%2$f%3$s";
+  int arg1 = 12345;
+  double arg2 = 123.45;
+  const char *arg3 = "12345";
+  evaluate(format_arr, str, arg1, arg2, arg3);
+
+  __llvm_libc::printf_core::FormatSection expected0, expected1, expected2;
+  expected0.has_conv = true;
+  expected0.raw_len = 4;
+  expected0.raw_string = str;
+  expected0.conv_val_raw = arg1;
+  expected0.conv_name = 'd';
+
+  ASSERT_FORMAT_EQ(expected0, format_arr[0]);
+
+  expected1.has_conv = true;
+  expected1.raw_len = 4;
+  expected1.raw_string = str + 4;
+  expected1.conv_val_raw = __llvm_libc::bit_cast<uint64_t>(arg2);
+  expected1.conv_name = 'f';
+
+  ASSERT_FORMAT_EQ(expected1, format_arr[1]);
+
+  expected2.has_conv = true;
+  expected2.raw_len = 4;
+  expected2.raw_string = str + 8;
+  expected2.conv_val_ptr = const_cast<char *>(arg3);
+  expected2.conv_name = 's';
+
+  ASSERT_FORMAT_EQ(expected2, format_arr[2]);
+}
+
+TEST(LlvmLibcPrintfParserTest, IndexModeThreeArgsReverse) {
+  __llvm_libc::printf_core::FormatSection format_arr[10];
+  const char *str = "%3$d%2$f%1$s";
+  int arg1 = 12345;
+  double arg2 = 123.45;
+  const char *arg3 = "12345";
+  evaluate(format_arr, str, arg3, arg2, arg1);
+
+  __llvm_libc::printf_core::FormatSection expected0, expected1, expected2;
+  expected0.has_conv = true;
+  expected0.raw_len = 4;
+  expected0.raw_string = str;
+  expected0.conv_val_raw = arg1;
+  expected0.conv_name = 'd';
+
+  ASSERT_FORMAT_EQ(expected0, format_arr[0]);
+
+  expected1.has_conv = true;
+  expected1.raw_len = 4;
+  expected1.raw_string = str + 4;
+  expected1.conv_val_raw = __llvm_libc::bit_cast<uint64_t>(arg2);
+  expected1.conv_name = 'f';
+
+  ASSERT_FORMAT_EQ(expected1, format_arr[1]);
+
+  expected2.has_conv = true;
+  expected2.raw_len = 4;
+  expected2.raw_string = str + 8;
+  expected2.conv_val_ptr = const_cast<char *>(arg3);
+  expected2.conv_name = 's';
+
+  ASSERT_FORMAT_EQ(expected2, format_arr[2]);
+}
+
+TEST(LlvmLibcPrintfParserTest, IndexModeTenArgsRandom) {
+  __llvm_libc::printf_core::FormatSection format_arr[10];
+  const char *str = "%6$d%3$d%7$d%2$d%8$d%1$d%4$d%9$d%5$d%10$d";
+  int args[10] = {6, 4, 2, 7, 9, 1, 3, 5, 8, 10};
+  evaluate(format_arr, str, args[0], args[1], args[2], args[3], args[4],
+           args[5], args[6], args[7], args[8], args[9]);
+
+  for (size_t i = 0; i < 10; ++i) {
+    __llvm_libc::printf_core::FormatSection expected;
+    expected.has_conv = true;
+    expected.raw_len = 4 + (i >= 9 ? 1 : 0);
+    expected.raw_string = str + (4 * i);
+    expected.conv_val_raw = i + 1;
+    expected.conv_name = 'd';
+    EXPECT_FORMAT_EQ(expected, format_arr[i]);
+  }
+}
+
+TEST(LlvmLibcPrintfParserTest, IndexModeComplexParsing) {
+  __llvm_libc::printf_core::FormatSection format_arr[10];
+  const char *str = "normal text %3$llu %% %2$ *4$f %2$ .*4$f %1$1.1c";
+  char arg1 = '1';
+  double arg2 = 123.45;
+  unsigned long long arg3 = 12345;
+  int arg4 = 10;
+  evaluate(format_arr, str, arg1, arg2, arg3, arg4);
+
+  __llvm_libc::printf_core::FormatSection expected0, expected1, expected2,
+      expected3, expected4, expected5, expected6, expected7, expected8,
+      expected9;
+
+  expected0.has_conv = false;
+  expected0.raw_len = 12;
+  expected0.raw_string = str;
+
+  EXPECT_FORMAT_EQ(expected0, format_arr[0]);
+
+  expected1.has_conv = true;
+  expected1.raw_len = 6;
+  expected1.raw_string = str + 12;
+  expected1.length_modifier = __llvm_libc::printf_core::LengthModifier::ll;
+  expected1.conv_val_raw = arg3;
+  expected1.conv_name = 'u';
+
+  EXPECT_FORMAT_EQ(expected1, format_arr[1]);
+
+  expected2.has_conv = false;
+  expected2.raw_len = 1;
+  expected2.raw_string = str + 18;
+
+  EXPECT_FORMAT_EQ(expected2, format_arr[2]);
+
+  expected3.has_conv = true;
+  expected3.raw_len = 2;
+  expected3.raw_string = str + 19;
+  expected3.conv_name = '%';
+
+  EXPECT_FORMAT_EQ(expected3, format_arr[3]);
+
+  expected4.has_conv = false;
+  expected4.raw_len = 1;
+  expected4.raw_string = str + 21;
+
+  EXPECT_FORMAT_EQ(expected4, format_arr[4]);
+
+  expected5.has_conv = true;
+  expected5.raw_len = 8;
+  expected5.raw_string = str + 22;
+  expected5.flags = __llvm_libc::printf_core::FormatFlags::SPACE_PREFIX;
+  expected5.min_width = arg4;
+  expected5.conv_val_raw = __llvm_libc::bit_cast<uint64_t>(arg2);
+  expected5.conv_name = 'f';
+
+  EXPECT_FORMAT_EQ(expected5, format_arr[5]);
+
+  expected6.has_conv = false;
+  expected6.raw_len = 1;
+  expected6.raw_string = str + 30;
+
+  EXPECT_FORMAT_EQ(expected6, format_arr[6]);
+
+  expected7.has_conv = true;
+  expected7.raw_len = 9;
+  expected7.raw_string = str + 31;
+  expected7.flags = __llvm_libc::printf_core::FormatFlags::SPACE_PREFIX;
+  expected7.precision = arg4;
+  expected7.conv_val_raw = __llvm_libc::bit_cast<uint64_t>(arg2);
+  expected7.conv_name = 'f';
+
+  EXPECT_FORMAT_EQ(expected7, format_arr[7]);
+
+  expected8.has_conv = false;
+  expected8.raw_len = 1;
+  expected8.raw_string = str + 40;
+
+  EXPECT_FORMAT_EQ(expected8, format_arr[8]);
+
+  expected9.has_conv = true;
+  expected9.raw_len = 7;
+  expected9.raw_string = str + 41;
+  expected9.min_width = 1;
+  expected9.precision = 1;
+  expected9.conv_val_raw = arg1;
+  expected9.conv_name = 'c';
+
+  EXPECT_FORMAT_EQ(expected9, format_arr[9]);
+}
+
+#endif // LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE


        


More information about the libc-commits mailing list