[libc-commits] [libc] 65f4cc6 - [libc] add scanf int conversions

Michael Jones via libc-commits libc-commits at lists.llvm.org
Tue Dec 13 12:51:36 PST 2022


Author: Michael Jones
Date: 2022-12-13T12:51:29-08:00
New Revision: 65f4cc63102aa77c49c63826ec55af0582c3d8ed

URL: https://github.com/llvm/llvm-project/commit/65f4cc63102aa77c49c63826ec55af0582c3d8ed
DIFF: https://github.com/llvm/llvm-project/commit/65f4cc63102aa77c49c63826ec55af0582c3d8ed.diff

LOG: [libc] add scanf int conversions

This patch adds the integer conversions %d/i/u/o/x/X to scanf as well as
unit tests to check their correctness.

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D138880

Added: 
    libc/src/stdio/scanf_core/int_converter.cpp
    libc/src/stdio/scanf_core/int_converter.h

Modified: 
    libc/src/stdio/scanf_core/CMakeLists.txt
    libc/src/stdio/scanf_core/converter.cpp
    libc/test/src/stdio/sscanf_test.cpp

Removed: 
    


################################################################################
diff  --git a/libc/src/stdio/scanf_core/CMakeLists.txt b/libc/src/stdio/scanf_core/CMakeLists.txt
index ef8ec0ab6f0fb..ab0530d6d3861 100644
--- a/libc/src/stdio/scanf_core/CMakeLists.txt
+++ b/libc/src/stdio/scanf_core/CMakeLists.txt
@@ -78,9 +78,10 @@ add_object_library(
   SRCS
     converter.cpp
     string_converter.cpp
+    int_converter.cpp
   HDRS
     converter.h
-    string_converter.h
+    int_converter.h
   DEPENDS
     .reader
     .core_structs

diff  --git a/libc/src/stdio/scanf_core/converter.cpp b/libc/src/stdio/scanf_core/converter.cpp
index 3cfa8758349ec..fbb2c1bcd5ee5 100644
--- a/libc/src/stdio/scanf_core/converter.cpp
+++ b/libc/src/stdio/scanf_core/converter.cpp
@@ -12,6 +12,7 @@
 #include "src/stdio/scanf_core/core_structs.h"
 #include "src/stdio/scanf_core/reader.h"
 
+#include "src/stdio/scanf_core/int_converter.h"
 #include "src/stdio/scanf_core/string_converter.h"
 
 #include <stddef.h>
@@ -32,16 +33,16 @@ int convert(Reader *reader, const FormatSection &to_conv) {
   case 'c':
   case '[':
     return convert_string(reader, to_conv);
-    //   case 'd':
-    //   case 'i':
-    //   case 'u':
-    //   case 'o':
-    //   case 'x':
-    //   case 'X':
-    //     ret_val = raw_match(reader, " ");
-    //     if (ret_val != READ_OK)
-    //       return ret_val;
-    //     return convert_int(reader, to_conv);
+  case 'd':
+  case 'i':
+  case 'u':
+  case 'o':
+  case 'x':
+  case 'X':
+    ret_val = raw_match(reader, " ");
+    if (ret_val != READ_OK)
+      return ret_val;
+    return convert_int(reader, to_conv);
     // #ifndef LLVM_LIBC_SCANF_DISABLE_FLOAT
     //   case 'f':
     //   case 'F':

diff  --git a/libc/src/stdio/scanf_core/int_converter.cpp b/libc/src/stdio/scanf_core/int_converter.cpp
new file mode 100644
index 0000000000000..1087166742b6c
--- /dev/null
+++ b/libc/src/stdio/scanf_core/int_converter.cpp
@@ -0,0 +1,266 @@
+//===-- Int type specifier converters for scanf -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/scanf_core/string_converter.h"
+
+#include "src/__support/CPP/limits.h"
+#include "src/__support/ctype_utils.h"
+#include "src/stdio/scanf_core/core_structs.h"
+#include "src/stdio/scanf_core/reader.h"
+
+#include <stddef.h>
+
+namespace __llvm_libc {
+namespace scanf_core {
+
+constexpr char inline to_lower(char a) { return a | 32; }
+
+constexpr inline int b36_char_to_int(char input) {
+  if (internal::isdigit(input))
+    return input - '0';
+  if (internal::isalpha(input))
+    return to_lower(input) + 10 - 'a';
+  return 0;
+}
+
+void write_with_length(uintmax_t output_val, const FormatSection &to_conv) {
+  if ((to_conv.flags & NO_WRITE) != 0) {
+    return;
+  }
+  LengthModifier lm = to_conv.length_modifier;
+  void *output_ptr = to_conv.output_ptr;
+  switch (lm) {
+  case (LengthModifier::hh):
+    *reinterpret_cast<unsigned char *>(output_ptr) =
+        static_cast<unsigned char>(output_val);
+    break;
+  case (LengthModifier::h):
+    *reinterpret_cast<unsigned short *>(output_ptr) =
+        static_cast<unsigned short>(output_val);
+    break;
+  case (LengthModifier::NONE):
+    *reinterpret_cast<unsigned int *>(output_ptr) =
+        static_cast<unsigned int>(output_val);
+    break;
+  case (LengthModifier::l):
+    *reinterpret_cast<unsigned long *>(output_ptr) =
+        static_cast<unsigned long>(output_val);
+    break;
+  case (LengthModifier::ll):
+  case (LengthModifier::L):
+    *reinterpret_cast<unsigned long long *>(output_ptr) =
+        static_cast<unsigned long long>(output_val);
+    break;
+  case (LengthModifier::j):
+    *reinterpret_cast<uintmax_t *>(output_ptr) =
+        static_cast<uintmax_t>(output_val);
+    break;
+  case (LengthModifier::z):
+    *reinterpret_cast<size_t *>(output_ptr) = static_cast<size_t>(output_val);
+    break;
+  case (LengthModifier::t):
+    *reinterpret_cast<ptr
diff _t *>(output_ptr) =
+        static_cast<ptr
diff _t>(output_val);
+    break;
+  }
+}
+
+// This code is very similar to the code in __support/str_to_integer.h but is
+// not quite the same. Here is the list of 
diff erences and why they exist:
+//  1) This takes a reader and a format section instead of a char* and the base.
+//      This should be fairly self explanatory. While the char* could be adapted
+//      to a reader and the base could be calculated ahead of time, the
+//      semantics are slightly 
diff erent, specifically a char* can be indexed
+//      freely (I can read str[2] and then str[0]) whereas a File (which the
+//      reader may contain) cannot.
+//  2) Because this uses a Reader, this function can only unget once.
+//      This is relevant because scanf specifies it reads the "longest sequence
+//      of input characters which does not exceed any specified field width and
+//      which is, or is a prefix of, a matching input sequence." Whereas the
+//      strtol function accepts "the longest initial subsequence of the input
+//      string (...) that is of the expected form." This is demonstrated by the
+//      
diff erences in how they deal with the string "0xZZZ" when parsing as
+//      hexadecimal. Scanf will read the "0x" as a valid prefix and return 0,
+//      since it reads the first 'Z', sees that it's not a valid hex digit, and
+//      reverses one character. The strtol function on the other hand only
+//      accepts the "0" since that's the longest valid hexadecimal sequence. It
+//      sees the 'Z' after the "0x" and determines that this is not the prefix
+//      to a valid hex string.
+//  3) This conversion may have a maximum width.
+//      If a maximum width is specified, this conversion is only allowed to
+//      accept a certain number of characters. Strtol doesn't have any such
+//      limitation.
+int convert_int(Reader *reader, const FormatSection &to_conv) {
+  // %d "Matches an optionally signed decimal integer [...] with the value 10
+  // for the base argument. The corresponding argument shall be a pointer to
+  // signed integer."
+
+  // %i "Matches an optionally signed integer [...] with the value 0 for the
+  // base argument. The corresponding argument shall be a pointer to signed
+  // integer."
+
+  // %u "Matches an optionally signed decimal integer [...] with the value 10
+  // for the base argument. The corresponding argument shall be a pointer to
+  // unsigned integer"
+
+  // %o "Matches an optionally signed octal integer [...] with the value 8 for
+  // the base argument. The corresponding argument shall be a pointer to
+  // unsigned integer"
+
+  // %x/X "Matches an optionally signed hexadecimal integer [...] with the value
+  // 16 for the base argument. The corresponding argument shall be a pointer to
+  // unsigned integer"
+
+  size_t max_width = cpp::numeric_limits<size_t>::max();
+  if (to_conv.max_width > 0) {
+    max_width = to_conv.max_width;
+  }
+
+  uintmax_t result = 0;
+  bool is_number = false;
+  bool is_signed = false;
+  int base = 0;
+  if (to_conv.conv_name == 'i') {
+    base = 0;
+    is_signed = true;
+  } else if (to_conv.conv_name == 'o') {
+    base = 8;
+  } else if (to_lower(to_conv.conv_name) == 'x') {
+    base = 16;
+  } else if (to_conv.conv_name == 'd') {
+    base = 10;
+    is_signed = true;
+  } else { // conv_name must be 'u'
+    base = 10;
+  }
+
+  char cur_char = reader->getc();
+
+  char result_sign = '+';
+  if (cur_char == '+' || cur_char == '-') {
+    result_sign = cur_char;
+    if (max_width > 1) {
+      --max_width;
+      cur_char = reader->getc();
+    } else {
+      // If the max width has been hit already, then the return value must be 0
+      // since no actual digits of the number have been parsed yet.
+      write_with_length(0, to_conv);
+      return MATCHING_FAILURE;
+    }
+  }
+  const bool is_negative = result_sign == '-';
+
+  // Base of 0 means automatically determine the base. Base of 16 may have a
+  // prefix of "0x"
+  if (base == 0 || base == 16) {
+    // If the first character is 0, then it could be octal or hex.
+    if (cur_char == '0') {
+      is_number = true;
+
+      // Read the next character to check.
+      if (max_width > 1) {
+        --max_width;
+        cur_char = reader->getc();
+      } else {
+        write_with_length(0, to_conv);
+        return READ_OK;
+      }
+
+      if (to_lower(cur_char) == 'x') {
+        // This is a valid hex prefix.
+        base = 16;
+        if (max_width > 1) {
+          --max_width;
+          cur_char = reader->getc();
+        } else {
+          write_with_length(0, to_conv);
+          return READ_OK;
+        }
+
+      } else {
+        if (base == 0) {
+          base = 8;
+        }
+      }
+    } else if (base == 0) {
+      if (internal::isdigit(cur_char)) {
+        // If the first character is a 
diff erent number, then it's 10.
+        base = 10;
+      } else {
+        // If the first character isn't a valid digit, then there are no valid
+        // digits at all. The number is 0.
+        reader->ungetc(cur_char);
+        write_with_length(0, to_conv);
+        return MATCHING_FAILURE;
+      }
+    }
+  }
+
+  constexpr uintmax_t UNSIGNED_MAX = cpp::numeric_limits<uintmax_t>::max();
+  constexpr uintmax_t SIGNED_MAX =
+      static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max());
+  constexpr uintmax_t NEGATIVE_SIGNED_MAX =
+      static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max()) + 1;
+
+  const uintmax_t MAX =
+      (is_signed ? (is_negative ? NEGATIVE_SIGNED_MAX : SIGNED_MAX)
+                 : UNSIGNED_MAX);
+
+  const uintmax_t max_div_by_base = MAX / base;
+
+  if (internal::isalnum(cur_char) && b36_char_to_int(cur_char) < base) {
+    is_number = true;
+  }
+
+  bool has_overflow = false;
+  size_t i = 0;
+  for (; i < max_width && internal::isalnum(cur_char) &&
+         b36_char_to_int(cur_char) < base;
+       ++i, cur_char = reader->getc()) {
+
+    uintmax_t cur_digit = b36_char_to_int(cur_char);
+
+    if (result == MAX) {
+      has_overflow = true;
+      continue;
+    } else if (result > max_div_by_base) {
+      result = MAX;
+      has_overflow = true;
+    } else {
+      result = result * base;
+    }
+
+    if (result > MAX - cur_digit) {
+      result = MAX;
+      has_overflow = true;
+    } else {
+      result = result + cur_digit;
+    }
+  }
+
+  // We always read one more character than will be used, so we have to put the
+  // last one back.
+  reader->ungetc(cur_char);
+
+  if (has_overflow) {
+    write_with_length(MAX, to_conv);
+  } else {
+    if (is_negative)
+      result = -result;
+
+    write_with_length(result, to_conv);
+  }
+
+  if (!is_number)
+    return MATCHING_FAILURE;
+  return READ_OK;
+}
+
+} // namespace scanf_core
+} // namespace __llvm_libc

diff  --git a/libc/src/stdio/scanf_core/int_converter.h b/libc/src/stdio/scanf_core/int_converter.h
new file mode 100644
index 0000000000000..af5ab3e060a7b
--- /dev/null
+++ b/libc/src/stdio/scanf_core/int_converter.h
@@ -0,0 +1,25 @@
+//===-- Int type specifier converter for scanf ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_INT_CONVERTER_H
+#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_INT_CONVERTER_H
+
+#include "src/stdio/scanf_core/core_structs.h"
+#include "src/stdio/scanf_core/reader.h"
+
+#include <stddef.h>
+
+namespace __llvm_libc {
+namespace scanf_core {
+
+int convert_int(Reader *reader, const FormatSection &to_conv);
+
+} // namespace scanf_core
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_INT_CONVERTER_H

diff  --git a/libc/test/src/stdio/sscanf_test.cpp b/libc/test/src/stdio/sscanf_test.cpp
index 70f652361b7d3..b3e146f1d64be 100644
--- a/libc/test/src/stdio/sscanf_test.cpp
+++ b/libc/test/src/stdio/sscanf_test.cpp
@@ -6,8 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/CPP/limits.h"
 #include "src/stdio/sscanf.h"
 
+#include <stdio.h> // For EOF
+
 #include "utils/UnitTest/Test.h"
 
 TEST(LlvmLibcSScanfTest, SimpleStringConv) {
@@ -28,3 +31,200 @@ TEST(LlvmLibcSScanfTest, SimpleStringConv) {
   ASSERT_STREQ(buffer, "abc");
   ASSERT_STREQ(buffer2, "123");
 }
+
+TEST(LlvmLibcSScanfTest, IntConvSimple) {
+  int ret_val;
+  int result = 0;
+  ret_val = __llvm_libc::sscanf("123", "%d", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 123);
+
+  ret_val = __llvm_libc::sscanf("456", "%i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 456);
+
+  ret_val = __llvm_libc::sscanf("789", "%x", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 0x789);
+
+  ret_val = __llvm_libc::sscanf("012", "%o", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 012);
+
+  ret_val = __llvm_libc::sscanf("345", "%u", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 345);
+
+  ret_val = __llvm_libc::sscanf("Not an integer", "%d", &result);
+  EXPECT_EQ(ret_val, 0);
+}
+
+TEST(LlvmLibcSScanfTest, IntConvLengthModifier) {
+  int ret_val;
+  uintmax_t max_result = 0;
+  int int_result = 0;
+  char char_result = 0;
+
+  ret_val = __llvm_libc::sscanf("123", "%ju", &max_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(max_result, uintmax_t(123));
+
+  // Check overflow handling
+  ret_val = __llvm_libc::sscanf("999999999999999999999999999999999999", "%ju",
+                                &max_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(max_result, __llvm_libc::cpp::numeric_limits<uintmax_t>::max());
+
+  // Because this is unsigned, any out of range value should return the maximum,
+  // even with a negative sign.
+  ret_val = __llvm_libc::sscanf("-999999999999999999999999999999999999", "%ju",
+                                &max_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(max_result, __llvm_libc::cpp::numeric_limits<uintmax_t>::max());
+
+  ret_val = __llvm_libc::sscanf("-18446744073709551616", "%ju", &max_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(max_result, __llvm_libc::cpp::numeric_limits<uintmax_t>::max());
+
+  // But any number below the maximum should have the - sign applied.
+  ret_val = __llvm_libc::sscanf("-1", "%ju", &max_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(max_result, uintmax_t(-1));
+
+  ret_val = __llvm_libc::sscanf("-1", "%u", &int_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(int_result, -1);
+
+  max_result = 0xff00ff00ff00ff00;
+  char_result = 0x6f;
+
+  // Overflows for sizes larger than the maximum are handled by casting.
+  ret_val = __llvm_libc::sscanf("8589967360", "%d", &int_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(int_result, int(8589967360)); // 2^33 + 2^15
+
+  // Check that the adjacent values weren't touched by the overflow.
+  ASSERT_EQ(max_result, uintmax_t(0xff00ff00ff00ff00));
+  ASSERT_EQ(char_result, char(0x6f));
+
+  ret_val = __llvm_libc::sscanf("-8589967360", "%d", &int_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(int_result, int(-8589967360));
+  ASSERT_EQ(max_result, uintmax_t(0xff00ff00ff00ff00));
+  ASSERT_EQ(char_result, char(0x6f));
+
+  ret_val = __llvm_libc::sscanf("25", "%hhd", &char_result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(char_result, char(25));
+}
+
+TEST(LlvmLibcSScanfTest, IntConvBaseSelection) {
+  int ret_val;
+  int result = 0;
+  ret_val = __llvm_libc::sscanf("0xabc123", "%i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 0xabc123);
+
+  ret_val = __llvm_libc::sscanf("0456", "%i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 0456);
+
+  ret_val = __llvm_libc::sscanf("0999", "%i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 0);
+
+  ret_val = __llvm_libc::sscanf("123abc456", "%i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 123);
+}
+
+TEST(LlvmLibcSScanfTest, IntConvMaxLengthTests) {
+  int ret_val;
+  int result = 0;
+
+  ret_val = __llvm_libc::sscanf("12", "%1d", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 1);
+
+  ret_val = __llvm_libc::sscanf("-1", "%1d", &result);
+  EXPECT_EQ(ret_val, 0);
+  EXPECT_EQ(result, 0);
+
+  ret_val = __llvm_libc::sscanf("+1", "%1d", &result);
+  EXPECT_EQ(ret_val, 0);
+  EXPECT_EQ(result, 0);
+
+  ret_val = __llvm_libc::sscanf("01", "%1d", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 0);
+
+  ret_val = __llvm_libc::sscanf("01", "%1i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 0);
+
+  ret_val = __llvm_libc::sscanf("0x1", "%2i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 0);
+
+  ret_val = __llvm_libc::sscanf("-0x1", "%3i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 0);
+
+  ret_val = __llvm_libc::sscanf("-0x123", "%4i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, -1);
+
+  ret_val = __llvm_libc::sscanf("123456789", "%5i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 12345);
+
+  ret_val = __llvm_libc::sscanf("123456789", "%10i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 123456789);
+}
+
+TEST(LlvmLibcSScanfTest, IntConvNoWriteTests) {
+  int ret_val;
+  // Result shouldn't be used by these tests, but it's safer to have it and
+  // check it.
+  int result = 0;
+  ret_val = __llvm_libc::sscanf("-1", "%*1d", &result);
+  EXPECT_EQ(ret_val, 0);
+  EXPECT_EQ(result, 0);
+
+  ret_val = __llvm_libc::sscanf("01", "%*1i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 0);
+
+  ret_val = __llvm_libc::sscanf("0x1", "%*2i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 0);
+
+  ret_val = __llvm_libc::sscanf("a", "%*i", &result);
+  EXPECT_EQ(ret_val, 0);
+  EXPECT_EQ(result, 0);
+
+  ret_val = __llvm_libc::sscanf("123", "%*i", &result);
+  EXPECT_EQ(ret_val, 1);
+  EXPECT_EQ(result, 0);
+}
+
+TEST(LlvmLibcSScanfTest, CombinedConv) {
+  int ret_val;
+  int result = 0;
+  char buffer[10];
+  ret_val = __llvm_libc::sscanf("123abc", "%i%s", &result, buffer);
+  EXPECT_EQ(ret_val, 2);
+  EXPECT_EQ(result, 123);
+  ASSERT_STREQ(buffer, "abc");
+
+  ret_val = __llvm_libc::sscanf("0xZZZ", "%i%s", &result, buffer);
+  EXPECT_EQ(ret_val, 2);
+  EXPECT_EQ(result, 0);
+  ASSERT_STREQ(buffer, "ZZZ");
+
+  ret_val = __llvm_libc::sscanf("0xZZZ", "%X%s", &result, buffer);
+  EXPECT_EQ(ret_val, 2);
+  EXPECT_EQ(result, 0);
+  ASSERT_STREQ(buffer, "ZZZ");
+}


        


More information about the libc-commits mailing list