[libc-commits] [libc] [libc] add an SVE implementation of strlen (PR #167259)
Schrodinger ZHU Yifan via libc-commits
libc-commits at lists.llvm.org
Sun Nov 9 17:14:16 PST 2025
SchrodingerZhu wrote:
```
// -*- C++ -*-
// Standalone SVE/NEON/libc strlen microbenchmark (always registers all)
#include <algorithm>
#include <array>
#include <cassert>
#include <chrono>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <tuple>
#include <vector>
#include <arm_neon.h>
#include <arm_sve.h>
// Branch hints built on the GCC/Clang `__builtin_expect` builtin.
// `!!(x)` collapses the condition to 0 or 1 before it is compared with the
// expected value. LIBC_LIKELY marks a condition expected to be true.
#define LIBC_LIKELY(x) __builtin_expect(!!(x), 1)
// LIBC_UNLIKELY marks a condition expected to be false.
#define LIBC_UNLIKELY(x) __builtin_expect(!!(x), 0)
// -----------------------------------------------------------------------------
// NEON implementation
// -----------------------------------------------------------------------------
namespace neon {
/// Returns the length of the NUL-terminated string at `src`, scanning it in
/// 8-byte NEON vectors.
///
/// The scan starts at the 8-byte-aligned block containing `src` and then
/// walks forward one aligned block at a time, so it reads bytes located
/// before `src` and after the terminator, but only inside aligned 8-byte
/// blocks that also hold bytes of the string.
///
/// NOTE: the byte index is derived with a count-trailing-zeros on the 64-bit
/// view of the comparison result, which relies on the lowest-addressed byte
/// sitting in the least-significant bits (little-endian lane layout).
[[maybe_unused]] static inline size_t string_length(const char* src) {
  using Vector __attribute__((may_alias)) = uint8x8_t;

  // Loads one block and returns a 64-bit mask holding 0xFF in every byte
  // position whose source byte is zero. The u8 compare result is
  // reinterpreted as a single u64 lane *before* the lane is extracted, so the
  // argument type matches what vget_lane_u64 expects.
  const auto zero_byte_mask = [](const Vector* block) -> uint64_t {
    const Vector is_zero = vceqz_u8(*block);
    return vget_lane_u64(vreinterpret_u64_u8(is_zero), 0);
  };

  const uintptr_t misalign_bytes =
      reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
  const Vector* block_ptr =
      reinterpret_cast<const Vector*>(src - misalign_bytes);

  // Head block: shift out the bytes that precede `src` so a zero byte in
  // front of the string is never mistaken for the terminator.
  uint64_t mask = zero_byte_mask(block_ptr) >> (misalign_bytes << 3);
  if (mask) return static_cast<size_t>(__builtin_ctzll(mask)) >> 3;

  // Remaining blocks are fully inside the string until a zero byte shows up.
  for (;;) {
    ++block_ptr;
    mask = zero_byte_mask(block_ptr);
    if (mask) {
      const size_t block_offset = reinterpret_cast<uintptr_t>(block_ptr) -
                                  reinterpret_cast<uintptr_t>(src);
      return block_offset + (static_cast<size_t>(__builtin_ctzll(mask)) >> 3);
    }
  }
}
}  // namespace neon
// -----------------------------------------------------------------------------
// SVE implementation
// -----------------------------------------------------------------------------
namespace sve {
/// Returns the length of the NUL-terminated string at `src`, scanning it one
/// SVE vector at a time with first-faulting loads.
///
/// Each iteration issues `svldff1_u8` with an all-true predicate and then
/// reads the first-fault register (FFR) to learn which lanes were actually
/// loaded. Only those lanes are inspected, and both the running length and
/// the read pointer advance by exactly the number of inspected bytes, so no
/// byte of the string is ever skipped.
[[maybe_unused]] static inline size_t string_length(const char* src) {
  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(src);
  const svbool_t all_true = svptrue_b8();
  size_t len = 0;

  svsetffr();
  for (;;) {
    const svuint8_t data = svldff1_u8(all_true, ptr);
    const svbool_t loaded = svrdffr_z(all_true);

    if (LIBC_LIKELY(svptest_last(all_true, loaded))) {
      // Every lane was loaded: test the whole vector for a zero byte.
      const svbool_t is_zero = svcmpeq_n_u8(all_true, data, 0);
      if (LIBC_LIKELY(!svptest_any(all_true, is_zero))) {
        len += svcntb();
        ptr += svcntb();
        continue;  // FFR is still all-true, no need to reset it.
      }
      // Count the lanes that precede the first zero byte.
      return len + svcntp_b8(all_true, svbrkb_z(all_true, is_zero));
    }

    // Only a leading run of lanes was loaded: compare just those lanes.
    const svbool_t is_zero = svcmpeq_n_u8(loaded, data, 0);
    if (LIBC_LIKELY(svptest_any(loaded, is_zero)))
      return len + svcntp_b8(all_true, svbrkb_z(all_true, is_zero));

    // No terminator among the loaded lanes. Step forward by the number of
    // lanes that were really examined (not by a full vector) so the next
    // load begins at the first byte that has not been checked yet.
    // NOTE(review): per the SVE first-faulting load description the first
    // active lane is either loaded or faults for real, so this count is
    // expected to be non-zero — confirm against the architecture reference.
    const uint64_t loaded_lanes = svcntp_b8(all_true, loaded);
    len += loaded_lanes;
    ptr += loaded_lanes;
    svsetffr();  // Re-arm the FFR before the next first-faulting load.
  }
}
}  // namespace sve
// -----------------------------------------------------------------------------
// libc fallback
// -----------------------------------------------------------------------------
namespace syslibc {
/// Reference implementation: forwards to the C library's `std::strlen`.
static inline size_t string_length(const char* s) {
  const size_t length = std::strlen(s);
  return length;
}
}  // namespace syslibc
// -----------------------------------------------------------------------------
// Benchmark harness
// -----------------------------------------------------------------------------
/// One strlen candidate: a printable label plus the function to call.
struct Impl {
  /// Signature shared by every implementation under test.
  using Fn = size_t (*)(const char*);

  const char* name;  ///< Label used in reports and failure messages.
  Fn fn;             ///< Entry point of the implementation.
};
static std::vector<Impl> get_impls() {
return {
{"libc", &syslibc::string_length},
{"neon", &neon::string_length},
{"sve", &sve::string_length}
};
}
/// Outcome of one benchmark run for one implementation.
struct Result {
  /// Label of the implementation that was measured.
  std::string name;
  /// Average wall-clock time of a single call, in nanoseconds.
  double ns_per_call;
  /// Scanned bytes per second, expressed in GiB/s.
  double gib_per_s;
};
/// Reads `std::chrono::steady_clock` and returns its time-since-epoch as a
/// nanosecond count.
static inline uint64_t now_ns() {
  const auto since_epoch =
      std::chrono::steady_clock::now().time_since_epoch();
  const auto as_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch);
  return as_ns.count();
}
// Correctness check.
//
// Runs every implementation in `impls` on strings of 'A' characters and
// compares each result with `syslibc::string_length`. Every length is tried
// at several start offsets inside the buffer: a string that always begins at
// the start of a fresh allocation is always aligned, which would leave the
// misaligned-head handling of the vector implementations untested. The bytes
// in front of the string are NUL, so an implementation that fails to ignore
// bytes preceding its argument reports a wrong length.
//
// Returns true when all implementations agree with the reference on every
// input; each mismatch is reported on std::cerr.
static bool run_correctness(const std::vector<Impl>& impls) {
  static const size_t kLengths[] = {0,   1,   3,   7,   8,   9,    15,
                                    16,  31,  32,  63,  64,  127,  128,
                                    255, 256, 511, 512, 1023, 1024, 4096};
  constexpr size_t kOffsets = 16;  // start offsets 0..15 inside the buffer

  bool ok = true;
  for (const size_t n : kLengths) {
    for (size_t off = 0; off < kOffsets; ++off) {
      // Layout: `off` NUL bytes, then `n` 'A' bytes, then the terminator.
      std::vector<char> buf(off + n + 1, '\0');
      char* const text = buf.data() + off;
      std::fill(text, text + n, 'A');

      const size_t ref = syslibc::string_length(text);
      for (const auto& impl : impls) {
        const size_t got = impl.fn(text);
        if (got != ref) {
          std::cerr << "FAIL " << impl.name << " len=" << n
                    << " off=" << off << " got=" << got << " ref=" << ref
                    << "\n";
          ok = false;
        }
      }
    }
  }
  return ok;
}
/// Times `reps` back-to-back calls of `impl.fn` on a string of `size` 'X'
/// characters and reports the average time per call and the throughput.
static Result bench(const Impl& impl, size_t size, size_t reps) {
  // Input: `size` non-zero bytes followed by the terminator.
  const std::unique_ptr<char[]> text(new char[size + 1]);
  std::fill_n(text.get(), size, 'X');
  text[size] = 0;

  // Results are accumulated into a volatile so the calls are not discarded.
  volatile size_t sink = 0;
  const uint64_t started = now_ns();
  for (size_t remaining = reps; remaining != 0; --remaining) {
    sink += impl.fn(text.get());
  }
  const uint64_t finished = now_ns();
  (void)sink;

  const uint64_t elapsed_ns = finished - started;
  const double ns_call = double(elapsed_ns) / reps;
  const double gib_s = (double(size) * reps) / (elapsed_ns * 1e-9) /
                       (1024.0 * 1024.0 * 1024.0);
  return Result{impl.name, ns_call, gib_s};
}
int main() {
auto impls = get_impls();
std::cout << "Implementations:";
for (auto& i : impls) std::cout << " " << i.name;
std::cout << "\n";
if (!run_correctness(impls)) {
std::cerr << "Correctness check failed!\n";
return 1;
}
std::vector<size_t> sizes = {16, 64, 256, 1024, 4096, 1<<20};
for (size_t s : sizes) {
std::cout << "\n=== strlen(" << s << " bytes) ===\n";
for (auto& impl : impls) {
Result r = bench(impl, s, 1000000 / std::max<size_t>(1, s/16));
std::cout << impl.name << ": " << r.ns_per_call << " ns/call, "
<< r.gib_per_s << " GiB/s\n";
}
}
return 0;
}
```
https://github.com/llvm/llvm-project/pull/167259
More information about the libc-commits mailing list