[llvm] 2fcffcd - [ADT] Simplifying hex string parsing so it runs faster in debug modes.

Stella Laurenzo via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 3 20:44:09 PDT 2021


Author: Ben Vanik
Date: 2021-11-03T20:31:20-07:00
New Revision: 2fcffcd0e8e54f3c8c12fcfc58db4af47d5c4272

URL: https://github.com/llvm/llvm-project/commit/2fcffcd0e8e54f3c8c12fcfc58db4af47d5c4272
DIFF: https://github.com/llvm/llvm-project/commit/2fcffcd0e8e54f3c8c12fcfc58db4af47d5c4272.diff

LOG: [ADT] Simplifying hex string parsing so it runs faster in debug modes.

This expands the lookup table statically and avoids routing through methods that
contain asserts (like StringRef/std::string element accessors and drop_front)
so that performance is more predictable across compilation environments. The
change was primarily driven by slow debug-mode performance but has a large
benefit in release builds as well.

```
ssd_mobilenet_v2_face_float (42MB .mlir)
  Debug/MSVC (old):  5.22s
  Debug/MSVC (new):  0.16s
Release/MSVC (old):  0.81s
Release/MSVC (new):  0.02s

huggingface_minilm (536MB .mlir)
  Debug/MSVC (old): 65.31s
  Debug/MSVC (new):  2.03s
Release/MSVC (old):  9.93s
Release/MSVC (new):  0.27s
```

In debug builds the time is now split evenly between lexString, tryGetFromHex,
and element attrs hashing. The next step toward making it faster would be to
combine that work (incremental hashing during conversion, etc.), but this is at
least in the right order of magnitude and retains the original API surface.

I have not profiled a build with clang, but this change is strictly less code
and uses simpler data structures, so I'd expect improvements there as well.

This also fixes a bug where a 0xFF byte in the input would read out of bounds:
the old lookup table had only 255 entries, so index 255 was one past its end.

Reviewed By: dblaikie, stellaraccident

Differential Revision: https://reviews.llvm.org/D112105

Added: 
    

Modified: 
    llvm/include/llvm/ADT/StringExtras.h
    llvm/unittests/ADT/StringExtrasTest.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/ADT/StringExtras.h b/llvm/include/llvm/ADT/StringExtras.h
index 0c2868040a44a..2ca672e7855b0 100644
--- a/llvm/include/llvm/ADT/StringExtras.h
+++ b/llvm/include/llvm/ADT/StringExtras.h
@@ -67,22 +67,27 @@ inline ArrayRef<uint8_t> arrayRefFromStringRef(StringRef Input) {
 ///
 /// If \p C is not a valid hex digit, -1U is returned.
 inline unsigned hexDigitValue(char C) {
-  struct HexTable {
-    unsigned LUT[255] = {};
-    constexpr HexTable() {
-      // Default initialize everything to invalid.
-      for (int i = 0; i < 255; ++i)
-        LUT[i] = ~0U;
-      // Initialize `0`-`9`.
-      for (int i = 0; i < 10; ++i)
-        LUT['0' + i] = i;
-      // Initialize `A`-`F` and `a`-`f`.
-      for (int i = 0; i < 6; ++i)
-        LUT['A' + i] = LUT['a' + i] = 10 + i;
-    }
+  /* clang-format off */
+  static const int16_t LUT[256] = {
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,  // '0'..'9'
+    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 'A'..'F'
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 'a'..'f'
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
   };
-  constexpr HexTable Table;
-  return Table.LUT[static_cast<unsigned char>(C)];
+  /* clang-format on */
+  return LUT[static_cast<unsigned char>(C)];
 }
 
 /// Checks if character \p C is one of the 10 decimal digits.
@@ -210,24 +215,31 @@ inline bool tryGetFromHex(StringRef Input, std::string &Output) {
   if (Input.empty())
     return true;
 
-  Output.reserve((Input.size() + 1) / 2);
+  // If the input string is not properly aligned on 2 nibbles we pad out the
+  // front with a 0 prefix; e.g. `ABC` -> `0ABC`.
+  Output.resize((Input.size() + 1) / 2);
+  char *OutputPtr = const_cast<char *>(Output.data());
   if (Input.size() % 2 == 1) {
     uint8_t Hex = 0;
     if (!tryGetHexFromNibbles('0', Input.front(), Hex))
       return false;
-
-    Output.push_back(Hex);
+    *OutputPtr++ = Hex;
     Input = Input.drop_front();
   }
 
-  assert(Input.size() % 2 == 0);
-  while (!Input.empty()) {
+  // Convert the nibble pairs (e.g. `9C`) into bytes (0x9C).
+  // With the padding above we know the input is aligned and the output expects
+  // exactly half as many bytes as nibbles in the input.
+  size_t InputSize = Input.size();
+  assert(InputSize % 2 == 0);
+  const char *InputPtr = Input.data();
+  for (size_t OutputIndex = 0; OutputIndex < InputSize / 2; ++OutputIndex) {
     uint8_t Hex = 0;
-    if (!tryGetHexFromNibbles(Input[0], Input[1], Hex))
+    if (!tryGetHexFromNibbles(InputPtr[OutputIndex * 2 + 0], // MSB
+                              InputPtr[OutputIndex * 2 + 1], // LSB
+                              Hex))
       return false;
-
-    Output.push_back(Hex);
-    Input = Input.drop_front(2);
+    OutputPtr[OutputIndex] = Hex;
   }
   return true;
 }

diff  --git a/llvm/unittests/ADT/StringExtrasTest.cpp b/llvm/unittests/ADT/StringExtrasTest.cpp
index 20437f9fbbb39..49a9bcd79db99 100644
--- a/llvm/unittests/ADT/StringExtrasTest.cpp
+++ b/llvm/unittests/ADT/StringExtrasTest.cpp
@@ -91,7 +91,7 @@ TEST(StringExtrasTest, ToAndFromHex) {
   EXPECT_EQ(EvenData, fromHex(EvenStr));
   EXPECT_EQ(StringRef(EvenStr).lower(), toHex(EvenData, true));
 
-  std::string InvalidStr = "A5ZX";
+  std::string InvalidStr = "A50\xFF";
   std::string IgnoredOutput;
   EXPECT_FALSE(tryGetFromHex(InvalidStr, IgnoredOutput));
 }


        


More information about the llvm-commits mailing list