[libc-commits] [libc] [libc] add an SVE implementation of strlen (PR #167259)

Schrodinger ZHU Yifan via libc-commits libc-commits at lists.llvm.org
Sun Nov 9 17:14:16 PST 2025


SchrodingerZhu wrote:

```
// -*- C++ -*-
// Standalone SVE/NEON/libc strlen microbenchmark (always registers all)

#include <algorithm>
#include <array>
#include <cassert>
#include <chrono>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <tuple>
#include <vector>

#include <arm_neon.h>
#include <arm_sve.h>

#define LIBC_LIKELY(x)   __builtin_expect(!!(x), 1)
#define LIBC_UNLIKELY(x) __builtin_expect(!!(x), 0)

// -----------------------------------------------------------------------------
// NEON implementation
// -----------------------------------------------------------------------------
namespace neon {
// Computes the length of the NUL-terminated string at `src` by scanning it in
// 8-byte NEON vectors.
//
// The first load is taken from `src` rounded down to an 8-byte boundary, so it
// can include bytes that precede the string; those lanes are shifted out of
// the comparison mask before it is inspected. Every later load is the next
// 8-byte block.
//
// Returns the number of bytes before the first zero byte.
[[maybe_unused]] static inline size_t string_length(const char* src) {
  using Vector __attribute__((may_alias)) = uint8x8_t;

  const uintptr_t misalign_bytes =
      reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
  const Vector* block_ptr =
      reinterpret_cast<const Vector*>(src - misalign_bytes);

  // vceqz_u8 yields 0xFF in every lane whose byte is zero; viewed as one
  // 64-bit integer that is eight mask bits per input byte.
  uint64x1_t cmp_mask = vreinterpret_u64_u8(vceqz_u8(*block_ptr));
  uint64_t cmp = vget_lane_u64(cmp_mask, 0);
  // Discard the lanes that lie before `src` (8 mask bits per skipped byte).
  cmp >>= (misalign_bytes << 3);
  // Trailing-zero count is in bits; divide by 8 to get a byte index.
  if (cmp) return __builtin_ctzll(cmp) >> 3;

  while (true) {
    ++block_ptr;
    // Extract the lane from the uint64x1_t view of the comparison result:
    // vget_lane_u64 takes a uint64x1_t, not the raw uint8x8_t mask.
    cmp_mask = vreinterpret_u64_u8(vceqz_u8(*block_ptr));
    cmp = vget_lane_u64(cmp_mask, 0);
    if (cmp) {
      // Distance from the start of the string to the start of this block.
      const size_t base = reinterpret_cast<uintptr_t>(block_ptr) -
                          reinterpret_cast<uintptr_t>(src);
      return base + (__builtin_ctzll(cmp) >> 3);
    }
  }
}
} // namespace neon

// -----------------------------------------------------------------------------
// SVE implementation
// -----------------------------------------------------------------------------
namespace sve {
// Computes the length of the NUL-terminated string at `src` using SVE
// first-faulting loads, one vector (svcntb() bytes) per iteration.
//
// A first-faulting load records in the FFR which lanes were actually loaded,
// so the loop has two paths: the whole vector was loaded (common case), or
// only a leading part of it was.
//
// Returns the number of bytes before the first zero byte.
[[maybe_unused]] static inline size_t string_length(const char* src) {
  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(src);
  // Start with every FFR lane set so the first load reports its own result.
  svsetffr();
  const svbool_t all_true = svptrue_b8();
  svbool_t cmp_zero;
  size_t len = 0;

  for (;;) {
    svuint8_t data = svldff1_u8(all_true, ptr);
    // Lanes that were successfully loaded by the first-faulting load.
    svbool_t fault_mask = svrdffr_z(all_true);
    // If the last lane is valid, the whole vector is valid.
    bool no_fault = svptest_last(all_true, fault_mask);
    if (LIBC_LIKELY(no_fault)) {
      len += svcntb();  // speculative increment, undone below on a hit
      cmp_zero = svcmpeq_n_u8(all_true, data, 0);
      bool has_no_zero = !svptest_any(all_true, cmp_zero);
      if (LIBC_LIKELY(has_no_zero)) {
        ptr += svcntb();
        continue;
      }
      len -= svcntb();
      break;
    } else {
      // Only the lanes in fault_mask hold loaded data; compare just those.
      cmp_zero = svcmpeq_n_u8(fault_mask, data, 0);
      bool has_zero = svptest_any(fault_mask, cmp_zero);
      if (LIBC_LIKELY(has_zero)) break;
      // No terminator among the loaded lanes. Advance by exactly the number
      // of bytes that were loaded so `ptr` stays in step with `len` and the
      // next load starts at the first unread byte; stepping a full vector
      // here would skip bytes that were never examined.
      const uint64_t valid_bytes = svcntp_b8(all_true, fault_mask);
      len += valid_bytes;
      ptr += valid_bytes;
      // Re-arm the FFR before the next first-faulting load.
      svsetffr();
      continue;
    }
  }
  // Count the lanes that precede the first zero byte in the final vector.
  svbool_t before_zero = svbrkb_z(all_true, cmp_zero);
  len += svcntp_b8(all_true, before_zero);
  return len;
}
} // namespace sve

// -----------------------------------------------------------------------------
// libc fallback
// -----------------------------------------------------------------------------
namespace syslibc {
// Baseline: forwards to the C library's strlen and returns its result.
static inline size_t string_length(const char* s) {
  const size_t length = std::strlen(s);
  return length;
}
} // namespace syslibc

// -----------------------------------------------------------------------------
// Benchmark harness
// -----------------------------------------------------------------------------
// One benchmark candidate: a display label plus the function it refers to.
struct Impl {
  // Signature every candidate must have: NUL-terminated string in, length out.
  using Fn = size_t (*)(const char*);

  const char* name;  // Label used when printing results and failures.
  Fn fn;             // The strlen-style function to call.
};

static std::vector<Impl> get_impls() {
  return {
    {"libc", &syslibc::string_length},
    {"neon", &neon::string_length},
    {"sve",  &sve::string_length}
  };
}

// Timing summary for one implementation at one string length.
struct Result {
  // Name of the implementation that was measured.
  std::string name;
  // Elapsed wall time divided by the number of calls, in nanoseconds.
  double ns_per_call;
  // String bytes scanned per second, expressed in GiB (2^30 bytes).
  double gib_per_s;
};

// Reads std::chrono::steady_clock and returns the time since the clock's epoch
// as a whole number of nanoseconds.
static inline uint64_t now_ns() {
  const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
  const std::chrono::nanoseconds as_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch);
  return as_ns.count();
}

// correctness check
static bool run_correctness(const std::vector<Impl>& impls) {
  bool ok = true;
  std::vector<size_t> sizes = {0,1,3,7,8,9,15,16,31,32,63,64,127,128,255,256,511,512,1023,1024,4096};
  for (size_t n : sizes) {
    std::unique_ptr<char[]> s(new char[n + 2]);
    std::fill(s.get(), s.get() + n, 'A');
    s[n] = 0;
    size_t ref = syslibc::string_length(s.get());
    for (auto& impl : impls) {
      size_t got = impl.fn(s.get());
      if (got != ref) {
        std::cerr << "FAIL " << impl.name << " len=" << n
                  << " got=" << got << " ref=" << ref << "\n";
        ok = false;
      }
    }
  }
  return ok;
}

// Times `reps` back-to-back calls of `impl.fn` on a string of `size` 'X' bytes.
//
// Parameters:
//   impl - implementation to measure.
//   size - length of the test string in bytes (terminator not counted).
//   reps - number of calls to time.
//
// Returns the implementation name, the mean nanoseconds per call, and the
// throughput in GiB/s. When `reps` is zero nothing is measured and both
// figures are reported as 0. An elapsed reading of 0 ns is treated as 1 ns so
// the reported figures stay finite.
static Result bench(const Impl& impl, size_t size, size_t reps) {
  if (reps == 0) return {impl.name, 0.0, 0.0};

  std::vector<char> buf(size + 1, 'X');
  buf[size] = 0;
  const char* const text = buf.data();

  // Accumulating into a volatile keeps the calls from being optimized away.
  volatile size_t dummy = 0;
  const uint64_t t0 = now_ns();
  for (size_t i = 0; i < reps; ++i)
    dummy = dummy + impl.fn(text);
  const uint64_t t1 = now_ns();
  (void)dummy;

  const double elapsed_ns = double(std::max<uint64_t>(1, t1 - t0));
  const double ns_call = elapsed_ns / reps;
  const double gib_s =
      (double(size) * reps) / (elapsed_ns * 1e-9) / (1024.0 * 1024.0 * 1024.0);
  return {impl.name, ns_call, gib_s};
}

int main() {
  auto impls = get_impls();
  std::cout << "Implementations:";
  for (auto& i : impls) std::cout << " " << i.name;
  std::cout << "\n";

  if (!run_correctness(impls)) {
    std::cerr << "Correctness check failed!\n";
    return 1;
  }

  std::vector<size_t> sizes = {16, 64, 256, 1024, 4096, 1<<20};
  for (size_t s : sizes) {
    std::cout << "\n=== strlen(" << s << " bytes) ===\n";
    for (auto& impl : impls) {
      Result r = bench(impl, s, 1000000 / std::max<size_t>(1, s/16));
      std::cout << impl.name << ": " << r.ns_per_call << " ns/call, "
                << r.gib_per_s << " GiB/s\n";
    }
  }
  return 0;
}
```

https://github.com/llvm/llvm-project/pull/167259


More information about the libc-commits mailing list