[llvm] Improve implementation of StringRef::find_last_of and friends (PR #71865)

via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 6 12:52:12 PST 2024


https://github.com/serge-sans-paille updated https://github.com/llvm/llvm-project/pull/71865

>From 75e87c937ef41d4c2e5e455e79b50b020e02de92 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton at mozilla.com>
Date: Thu, 16 Nov 2023 18:03:32 +0100
Subject: [PATCH] [llvm] Improve implementation of StringRef::find_last_of for
 the usual case of 2 chars

Almost all uses of StringRef::find_last_of in Clang/LLVM pass a needle
of 2 elements, a case which can be optimized using a generic vectorized
algorithm and a few bit hacks.
---
 llvm/lib/Support/StringRef.cpp       | 57 +++++++++++++++++++++++++++-
 llvm/unittests/ADT/StringRefTest.cpp | 24 ++++++++++++
 2 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp
index f1042131a89cb7..d2c740a27def69 100644
--- a/llvm/lib/Support/StringRef.cpp
+++ b/llvm/lib/Support/StringRef.cpp
@@ -268,17 +268,70 @@ StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
   return npos;
 }
 
+// See https://graphics.stanford.edu/~seander/bithacks.html#ValueInWord
+static inline uint64_t haszero(uint64_t v) {
+  return ~((((v & 0x7F7F7F7F7F7F7F7F) + 0x7F7F7F7F7F7F7F7F) | v) |
+           0x7F7F7F7F7F7F7F7F);
+}
+static inline uint64_t hasvalue(uint64_t x, char n) {
+  return haszero((x) ^ (~0UL / 255 * (n)));
+}
+
+/// This is a hot spot for some clangd operations, hot enough to warrant a
+/// pseudo-vectorized implementation.
+static StringRef::size_type
+vectorized_find_last_of_specialized(const char *Data, size_t Sz, char C0,
+                                    char C1) {
+  while (Sz >= 8) {
+    Sz -= 8;
+    uint64_t Buffer = 0;
+    std::memcpy((void *)&Buffer, (void *)(Data + Sz), sizeof(Buffer));
+    uint64_t Check = hasvalue(Buffer, C0) | hasvalue(Buffer, C1);
+    if (Check)
+      return Sz + 7 - llvm::countl_zero(Check) / 8;
+  }
+  if (Sz >= 4) {
+    Sz -= 4;
+    uint32_t Buffer = 0;
+    std::memcpy((void *)&Buffer, (void *)(Data + Sz), sizeof(Buffer));
+    uint64_t Check = hasvalue(Buffer, C0) | hasvalue(Buffer, C1);
+    if (Check)
+      return Sz + 7 - llvm::countl_zero(Check) / 8;
+  }
+  if (Sz >= 2) {
+    Sz -= 2;
+    uint16_t Buffer = 0;
+    std::memcpy((void *)&Buffer, (void *)(Data + Sz), sizeof(Buffer));
+    uint64_t Check = hasvalue(Buffer, C0) | hasvalue(Buffer, C1);
+    if (Check)
+      return Sz + 7 - llvm::countl_zero(Check) / 8;
+  }
+  if (Sz >= 1)
+    if (*Data == C0 || *Data == C1)
+      return 0;
+
+  return StringRef::npos;
+}
+
 /// find_last_of - Find the last character in the string that is in \arg C,
 /// or npos if not found.
 ///
-/// Note: O(size() + Chars.size())
+/// Note: O(size() + Chars.size()) for the generic case.
 StringRef::size_type StringRef::find_last_of(StringRef Chars,
                                              size_t From) const {
+  size_type Sz = std::min(From, Length);
+
+  if (Chars.size() == 2) {
+    auto res =
+        vectorized_find_last_of_specialized(Data, Sz, Chars[0], Chars[1]);
+    return res;
+  }
+
   std::bitset<1 << CHAR_BIT> CharBits;
   for (char C : Chars)
     CharBits.set((unsigned char)C);
 
-  for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
+  for (size_type i = Sz - 1, e = -1; i != e; --i)
     if (CharBits.test((unsigned char)Data[i]))
       return i;
   return npos;
diff --git a/llvm/unittests/ADT/StringRefTest.cpp b/llvm/unittests/ADT/StringRefTest.cpp
index 8df71e8ad03378..dc7985724d7292 100644
--- a/llvm/unittests/ADT/StringRefTest.cpp
+++ b/llvm/unittests/ADT/StringRefTest.cpp
@@ -548,6 +548,30 @@ TEST(StringRefTest, Find) {
   EXPECT_EQ(3U, Str.find_last_not_of('o'));
   EXPECT_EQ(1U, Str.find_last_not_of("lo"));
   EXPECT_EQ(StringRef::npos, Str.find_last_not_of("helo"));
+
+  Str = "01234567891";
+  EXPECT_EQ(0U, Str.find_last_of("0z"));
+  EXPECT_EQ(0U, Str.find_last_of("0"));
+  EXPECT_EQ(10U, Str.find_last_of("1z"));
+  EXPECT_EQ(10U, Str.find_last_of("1"));
+  EXPECT_EQ(2U, Str.find_last_of("2z"));
+  EXPECT_EQ(2U, Str.find_last_of("2"));
+  EXPECT_EQ(3U, Str.find_last_of("3z"));
+  EXPECT_EQ(3U, Str.find_last_of("3"));
+  EXPECT_EQ(4U, Str.find_last_of("4z"));
+  EXPECT_EQ(4U, Str.find_last_of("4"));
+  EXPECT_EQ(5U, Str.find_last_of("5z"));
+  EXPECT_EQ(5U, Str.find_last_of("5"));
+  EXPECT_EQ(6U, Str.find_last_of("6z"));
+  EXPECT_EQ(6U, Str.find_last_of("6"));
+  EXPECT_EQ(7U, Str.find_last_of("7z"));
+  EXPECT_EQ(7U, Str.find_last_of("7"));
+  EXPECT_EQ(8U, Str.find_last_of("8z"));
+  EXPECT_EQ(8U, Str.find_last_of("8"));
+  EXPECT_EQ(9U, Str.find_last_of("9z"));
+  EXPECT_EQ(9U, Str.find_last_of("9"));
+  EXPECT_EQ(StringRef::npos, Str.find_last_of("az"));
+  EXPECT_EQ(StringRef::npos, Str.find_last_of("a"));
 }
 
 TEST(StringRefTest, Count) {



More information about the llvm-commits mailing list