[libc-commits] [libc] 67fe3bd - [libc][mem*] Introduce Sized/Backends for new mem framework

Guillaume Chatelet via libc-commits libc-commits at lists.llvm.org
Wed Jun 22 04:21:19 PDT 2022


Author: Guillaume Chatelet
Date: 2022-06-22T11:21:06Z
New Revision: 67fe3bd33ce7698e35a98f165ec6ba6090aeff85

URL: https://github.com/llvm/llvm-project/commit/67fe3bd33ce7698e35a98f165ec6ba6090aeff85
DIFF: https://github.com/llvm/llvm-project/commit/67fe3bd33ce7698e35a98f165ec6ba6090aeff85.diff

LOG: [libc][mem*] Introduce Sized/Backends for new mem framework

This patch is a subpart of D125768, intended to make the review easier.

The `SizedOp` struct represents operations to be performed on a certain number of bytes.
It is responsible for breaking them down into platform types, which are then forwarded to the `Backend`.
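
For illustration only (this sketch is not part of the patch and omits the
Address/Temporality/Aligned machinery from D125768), the decomposition idea
can be pictured with a self-contained, simplified stand-in:

  #include <cstddef>
  #include <cstdint>
  #include <cstring>
  #include <type_traits>

  // Hypothetical stand-in for Backend::getNextType: the largest scalar type
  // that fits in Size bytes.
  template <std::size_t Size>
  using NextType =
      std::conditional_t<Size >= 8, std::uint64_t,
          std::conditional_t<Size >= 4, std::uint32_t,
              std::conditional_t<Size >= 2, std::uint16_t, std::uint8_t>>>;

  // Hypothetical, simplified SizedOp-like copy: peel off one native type,
  // then recurse on the remaining bytes.
  template <std::size_t Size> struct SimpleSizedCopy {
    using type = NextType<Size>;
    static constexpr std::size_t TYPE_SIZE = sizeof(type);
    static void copy(char *dst, const char *src) {
      type chunk;
      std::memcpy(&chunk, src, TYPE_SIZE); // native load
      std::memcpy(dst, &chunk, TYPE_SIZE); // native store
      if constexpr (Size - TYPE_SIZE > 0)
        SimpleSizedCopy<Size - TYPE_SIZE>::copy(dst + TYPE_SIZE,
                                                src + TYPE_SIZE);
    }
  };

E.g. SimpleSizedCopy<7>::copy issues a 4-byte, a 2-byte and a 1-byte
operation, which mirrors how SizedOp walks getNextType.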

The `Backend` struct represents a lower level abstraction that works only on types (`uint8_t`, `__m128i`, ...).
It is similar to instruction selection.
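
As a hedged illustration (the real contract is the NoBackend interface added
in backends.h below; this stand-in drops the Temporality and Aligned template
parameters), a backend boils down to a set of static, type-parameterized
primitives:

  #include <cstdint>

  // Hypothetical, stripped-down backend exposing two of the primitives.
  struct TinyScalarBackend {
    // A T with every byte equal to `value`,
    // e.g. splat<std::uint32_t>(0xAB) == 0xABABABAB.
    template <typename T> static T splat(std::uint8_t value) {
      return (T(~0ULL) / T(0xFF)) * T(value);
    }
    // Zero iff v1 == v2; callers only test the result against zero.
    template <typename T> static std::uint64_t notEquals(T v1, T v2) {
      return v1 ^ v2;
    }
  };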

Differential Revision: https://reviews.llvm.org/D126768

Added: 
    libc/src/string/memory_utils/backend_aarch64.h
    libc/src/string/memory_utils/backend_scalar.h
    libc/src/string/memory_utils/backend_x86.h
    libc/src/string/memory_utils/backends.h
    libc/src/string/memory_utils/sized_op.h
    libc/test/src/string/memory_utils/backend_test.cpp

Modified: 
    libc/test/src/string/memory_utils/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/libc/src/string/memory_utils/backend_aarch64.h b/libc/src/string/memory_utils/backend_aarch64.h
new file mode 100644
index 0000000000000..7dc9b33c17b47
--- /dev/null
+++ b/libc/src/string/memory_utils/backend_aarch64.h
@@ -0,0 +1,71 @@
+//===-- Elementary operations for aarch64 ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_AARCH64_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_AARCH64_H
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+#include "src/string/memory_utils/backend_scalar.h"
+
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
+
+namespace __llvm_libc {
+
+struct Aarch64Backend : public Scalar64BitBackend {
+  static constexpr bool IS_BACKEND_TYPE = true;
+
+  template <typename T, Temporality TS, Aligned AS,
+            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
+  static inline T load(const T *src) {
+    return Scalar64BitBackend::template load<T, TS, AS>(src);
+  }
+};
+
+// Implementation of the SizedOp abstraction for the set operation.
+struct Zva64 {
+  static constexpr size_t SIZE = 64;
+
+  template <typename DstAddrT>
+  static inline void set(DstAddrT dst, ubyte value) {
+#if __SIZEOF_POINTER__ == 4
+    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
+#else
+    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
+#endif
+  }
+};
+
+inline static bool hasZva() {
+  uint64_t zva_val;
+  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
+  // DC ZVA is permitted if DZP (bit [4]) is zero.
+  // BS (bits [3:0]) is the log2 of the block size in words.
+  // So the next line checks whether the instruction is permitted and the
+  // block size is 16 words (i.e. 64 bytes).
+  return (zva_val & 0b11111) == 0b00100;
+}
+
+namespace aarch64 {
+using _1 = SizedOp<Aarch64Backend, 1>;
+using _2 = SizedOp<Aarch64Backend, 2>;
+using _3 = SizedOp<Aarch64Backend, 3>;
+using _4 = SizedOp<Aarch64Backend, 4>;
+using _8 = SizedOp<Aarch64Backend, 8>;
+using _16 = SizedOp<Aarch64Backend, 16>;
+using _32 = SizedOp<Aarch64Backend, 32>;
+using _64 = SizedOp<Aarch64Backend, 64>;
+using _128 = SizedOp<Aarch64Backend, 128>;
+} // namespace aarch64
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_ARCH_AARCH64
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_AARCH64_H

diff  --git a/libc/src/string/memory_utils/backend_scalar.h b/libc/src/string/memory_utils/backend_scalar.h
new file mode 100644
index 0000000000000..00b8ed66d9cf1
--- /dev/null
+++ b/libc/src/string/memory_utils/backend_scalar.h
@@ -0,0 +1,104 @@
+//===-- Elementary operations for native scalar types ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_SCALAR_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_SCALAR_H
+
+#include "src/__support/CPP/TypeTraits.h" // ConditionalType, EnableIfType
+#include "src/__support/endian.h"
+
+namespace __llvm_libc {
+
+struct Scalar64BitBackend {
+  static constexpr bool IS_BACKEND_TYPE = true;
+
+  template <typename T>
+  static constexpr bool IsScalarType =
+      cpp::IsSameV<T, uint8_t> || cpp::IsSameV<T, uint16_t> ||
+      cpp::IsSameV<T, uint32_t> || cpp::IsSameV<T, uint64_t>;
+
+  template <typename T, Temporality TS, Aligned AS>
+  static inline T load(const T *src) {
+    static_assert(IsScalarType<T>);
+    static_assert(TS == Temporality::TEMPORAL,
+                  "Scalar load does not support non-temporal access");
+    return *src;
+  }
+
+  template <typename T, Temporality TS, Aligned AS>
+  static inline void store(T *dst, T value) {
+    static_assert(IsScalarType<T>);
+    static_assert(TS == Temporality::TEMPORAL,
+                  "Scalar store does not support non-temporal access");
+    *dst = value;
+  }
+
+  template <typename T> static inline T splat(ubyte value) {
+    static_assert(IsScalarType<T>);
+    return (T(~0ULL) / T(0xFF)) * T(value);
+  }
+
+  template <typename T> static inline uint64_t notEquals(T v1, T v2) {
+    static_assert(IsScalarType<T>);
+    return v1 ^ v2;
+  }
+
+  template <typename T> static inline int32_t threeWayCmp(T v1, T v2) {
+    DeferredStaticAssert("not implemented");
+  }
+
+  // Returns the type to use to consume Size bytes.
+  template <size_t Size>
+  using getNextType = cpp::ConditionalType<
+      Size >= 8, uint64_t,
+      cpp::ConditionalType<Size >= 4, uint32_t,
+                           cpp::ConditionalType<Size >= 2, uint16_t, uint8_t>>>;
+};
+
+template <>
+int32_t inline Scalar64BitBackend::threeWayCmp<uint8_t>(uint8_t a, uint8_t b) {
+  const int16_t la = Endian::to_big_endian(a);
+  const int16_t lb = Endian::to_big_endian(b);
+  return la - lb;
+}
+template <>
+int32_t inline Scalar64BitBackend::threeWayCmp<uint16_t>(uint16_t a,
+                                                         uint16_t b) {
+  const int32_t la = Endian::to_big_endian(a);
+  const int32_t lb = Endian::to_big_endian(b);
+  return la - lb;
+}
+template <>
+int32_t inline Scalar64BitBackend::threeWayCmp<uint32_t>(uint32_t a,
+                                                         uint32_t b) {
+  const uint32_t la = Endian::to_big_endian(a);
+  const uint32_t lb = Endian::to_big_endian(b);
+  return la > lb ? 1 : la < lb ? -1 : 0;
+}
+template <>
+int32_t inline Scalar64BitBackend::threeWayCmp<uint64_t>(uint64_t a,
+                                                         uint64_t b) {
+  const uint64_t la = Endian::to_big_endian(a);
+  const uint64_t lb = Endian::to_big_endian(b);
+  return la > lb ? 1 : la < lb ? -1 : 0;
+}
+
+namespace scalar {
+using _1 = SizedOp<Scalar64BitBackend, 1>;
+using _2 = SizedOp<Scalar64BitBackend, 2>;
+using _3 = SizedOp<Scalar64BitBackend, 3>;
+using _4 = SizedOp<Scalar64BitBackend, 4>;
+using _8 = SizedOp<Scalar64BitBackend, 8>;
+using _16 = SizedOp<Scalar64BitBackend, 16>;
+using _32 = SizedOp<Scalar64BitBackend, 32>;
+using _64 = SizedOp<Scalar64BitBackend, 64>;
+using _128 = SizedOp<Scalar64BitBackend, 128>;
+} // namespace scalar
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_SCALAR_H

diff  --git a/libc/src/string/memory_utils/backend_x86.h b/libc/src/string/memory_utils/backend_x86.h
new file mode 100644
index 0000000000000..aee1d2275e0ba
--- /dev/null
+++ b/libc/src/string/memory_utils/backend_x86.h
@@ -0,0 +1,221 @@
+//===-- Elementary operations for x86 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H
+
+#if defined(LLVM_LIBC_ARCH_X86)
+#include "src/__support/CPP/TypeTraits.h" // ConditionalType, EnableIfType
+#include "src/string/memory_utils/backend_scalar.h"
+
+#ifdef __SSE2__
+#include <immintrin.h>
+#endif //  __SSE2__
+
+#if defined(__SSE2__)
+#define HAS_M128 true
+#else
+#define HAS_M128 false
+#endif
+
+#if defined(__AVX2__)
+#define HAS_M256 true
+#else
+#define HAS_M256 false
+#endif
+
+#if defined(__AVX512F__) and defined(__AVX512BW__)
+#define HAS_M512 true
+#else
+#define HAS_M512 false
+#endif
+
+namespace __llvm_libc {
+struct X86Backend : public Scalar64BitBackend {
+  static constexpr bool IS_BACKEND_TYPE = true;
+
+  // Scalar types use base class implementations.
+  template <typename T, Temporality TS, Aligned AS,
+            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
+  static inline T load(const T *src) {
+    return Scalar64BitBackend::template load<T, TS, AS>(src);
+  }
+
+  // Scalar types use base class implementations.
+  template <typename T, Temporality TS, Aligned AS,
+            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
+  static inline void store(T *dst, T value) {
+    Scalar64BitBackend::template store<T, TS, AS>(dst, value);
+  }
+
+  // Scalar types use base class implementations.
+  template <typename T,
+            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
+  static inline uint64_t notEquals(T v1, T v2) {
+    return Scalar64BitBackend::template notEquals<T>(v1, v2);
+  }
+
+  // Scalar types use base class implementations.
+  template <typename T,
+            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
+  static inline T splat(ubyte value) {
+    return Scalar64BitBackend::template splat<T>(value);
+  }
+
+  // Scalar types use base class implementations.
+  template <typename T,
+            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
+  static inline int32_t threeWayCmp(T v1, T v2) {
+    return Scalar64BitBackend::template threeWayCmp<T>(v1, v2);
+  }
+
+  // X86 types are specialized below.
+  template <
+      typename T, Temporality TS, Aligned AS,
+      cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>, bool> = true>
+  static inline T load(const T *src);
+
+  // X86 types are specialized below.
+  template <
+      typename T, Temporality TS, Aligned AS,
+      cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>, bool> = true>
+  static inline void store(T *dst, T value);
+
+  // X86 types are specialized below.
+  template <typename T, cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>,
+                                          bool> = true>
+  static inline T splat(ubyte value);
+
+  // X86 types are specialized below.
+  template <typename T, cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>,
+                                          bool> = true>
+  static inline uint64_t notEquals(T v1, T v2);
+
+  template <typename T, cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>,
+                                          bool> = true>
+  static inline int32_t threeWayCmp(T v1, T v2) {
+    return char_diff(reinterpret_cast<char *>(&v1),
+                     reinterpret_cast<char *>(&v2), notEquals(v1, v2));
+  }
+
+  // Returns the type to use to consume Size bytes.
+  template <size_t Size>
+  using getNextType = cpp::ConditionalType<
+      (HAS_M512 && Size >= 64), __m512i,
+      cpp::ConditionalType<
+          (HAS_M256 && Size >= 32), __m256i,
+          cpp::ConditionalType<(HAS_M128 && Size >= 16), __m128i,
+                               Scalar64BitBackend::getNextType<Size>>>>;
+
+private:
+  static inline int32_t char_diff(const char *a, const char *b, uint64_t mask) {
+    const size_t diff_index = mask == 0 ? 0 : __builtin_ctzll(mask);
+    const int16_t ca = (unsigned char)a[diff_index];
+    const int16_t cb = (unsigned char)b[diff_index];
+    return ca - cb;
+  }
+};
+
+static inline void repmovsb(void *dst, const void *src, size_t runtime_size) {
+  asm volatile("rep movsb"
+               : "+D"(dst), "+S"(src), "+c"(runtime_size)
+               :
+               : "memory");
+}
+
+#define SPECIALIZE_LOAD(T, OS, AS, INTRINSIC)                                  \
+  template <> inline T X86Backend::load<T, OS, AS>(const T *src) {             \
+    return INTRINSIC(const_cast<T *>(src));                                    \
+  }
+#define SPECIALIZE_STORE(T, OS, AS, INTRINSIC)                                 \
+  template <> inline void X86Backend::store<T, OS, AS>(T * dst, T value) {     \
+    INTRINSIC(dst, value);                                                     \
+  }
+
+#if HAS_M128
+SPECIALIZE_LOAD(__m128i, Temporality::TEMPORAL, Aligned::YES, _mm_load_si128)
+SPECIALIZE_LOAD(__m128i, Temporality::TEMPORAL, Aligned::NO, _mm_loadu_si128)
+SPECIALIZE_LOAD(__m128i, Temporality::NON_TEMPORAL, Aligned::YES,
+                _mm_stream_load_si128)
+// X86 non-temporal load needs aligned access
+SPECIALIZE_STORE(__m128i, Temporality::TEMPORAL, Aligned::YES, _mm_store_si128)
+SPECIALIZE_STORE(__m128i, Temporality::TEMPORAL, Aligned::NO, _mm_storeu_si128)
+SPECIALIZE_STORE(__m128i, Temporality::NON_TEMPORAL, Aligned::YES,
+                 _mm_stream_si128)
+// X86 non-temporal store needs aligned access
+template <> inline __m128i X86Backend::splat<__m128i>(ubyte value) {
+  return _mm_set1_epi8(__builtin_bit_cast(char, value));
+}
+template <>
+inline uint64_t X86Backend::notEquals<__m128i>(__m128i a, __m128i b) {
+  using T = char __attribute__((__vector_size__(16)));
+  return _mm_movemask_epi8(T(a) != T(b));
+}
+#endif // HAS_M128
+
+#if HAS_M256
+SPECIALIZE_LOAD(__m256i, Temporality::TEMPORAL, Aligned::YES, _mm256_load_si256)
+SPECIALIZE_LOAD(__m256i, Temporality::TEMPORAL, Aligned::NO, _mm256_loadu_si256)
+SPECIALIZE_LOAD(__m256i, Temporality::NON_TEMPORAL, Aligned::YES,
+                _mm256_stream_load_si256)
+// X86 non-temporal load needs aligned access
+SPECIALIZE_STORE(__m256i, Temporality::TEMPORAL, Aligned::YES,
+                 _mm256_store_si256)
+SPECIALIZE_STORE(__m256i, Temporality::TEMPORAL, Aligned::NO,
+                 _mm256_storeu_si256)
+SPECIALIZE_STORE(__m256i, Temporality::NON_TEMPORAL, Aligned::YES,
+                 _mm256_stream_si256)
+// X86 non-temporal store needs aligned access
+template <> inline __m256i X86Backend::splat<__m256i>(ubyte value) {
+  return _mm256_set1_epi8(__builtin_bit_cast(char, value));
+}
+template <>
+inline uint64_t X86Backend::notEquals<__m256i>(__m256i a, __m256i b) {
+  using T = char __attribute__((__vector_size__(32)));
+  return _mm256_movemask_epi8(T(a) != T(b));
+}
+#endif // HAS_M256
+
+#if HAS_M512
+SPECIALIZE_LOAD(__m512i, Temporality::TEMPORAL, Aligned::YES, _mm512_load_si512)
+SPECIALIZE_LOAD(__m512i, Temporality::TEMPORAL, Aligned::NO, _mm512_loadu_si512)
+SPECIALIZE_LOAD(__m512i, Temporality::NON_TEMPORAL, Aligned::YES,
+                _mm512_stream_load_si512)
+// X86 non-temporal load needs aligned access
+SPECIALIZE_STORE(__m512i, Temporality::TEMPORAL, Aligned::YES,
+                 _mm512_store_si512)
+SPECIALIZE_STORE(__m512i, Temporality::TEMPORAL, Aligned::NO,
+                 _mm512_storeu_si512)
+SPECIALIZE_STORE(__m512i, Temporality::NON_TEMPORAL, Aligned::YES,
+                 _mm512_stream_si512)
+// X86 non-temporal store needs aligned access
+template <> inline __m512i X86Backend::splat<__m512i>(ubyte value) {
+  return _mm512_broadcastb_epi8(_mm_set1_epi8(__builtin_bit_cast(char, value)));
+}
+template <>
+inline uint64_t X86Backend::notEquals<__m512i>(__m512i a, __m512i b) {
+  return _mm512_cmpneq_epi8_mask(a, b);
+}
+#endif // HAS_M512
+
+namespace x86 {
+using _1 = SizedOp<X86Backend, 1>;
+using _2 = SizedOp<X86Backend, 2>;
+using _3 = SizedOp<X86Backend, 3>;
+using _4 = SizedOp<X86Backend, 4>;
+using _8 = SizedOp<X86Backend, 8>;
+using _16 = SizedOp<X86Backend, 16>;
+using _32 = SizedOp<X86Backend, 32>;
+using _64 = SizedOp<X86Backend, 64>;
+using _128 = SizedOp<X86Backend, 128>;
+} // namespace x86
+
+} // namespace __llvm_libc
+
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H

diff  --git a/libc/src/string/memory_utils/backends.h b/libc/src/string/memory_utils/backends.h
new file mode 100644
index 0000000000000..6d241fa5eb289
--- /dev/null
+++ b/libc/src/string/memory_utils/backends.h
@@ -0,0 +1,60 @@
+//===-- Elementary operations to compose memory primitives ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the concept of a Backend.
+// It constitutes the lowest level of the framework and is akin to instruction
+// selection. It defines how to implement aligned/unaligned,
+// temporal/non-temporal native loads and stores for a particular architecture
+// as well as efficient ways to fill and compare types.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKENDS_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKENDS_H
+
+#include "src/string/memory_utils/address.h" // Temporality, Aligned
+#include "src/string/memory_utils/sized_op.h" // SizedOp
+#include <stddef.h>                           // size_t
+#include <stdint.h>                           // uint##_t
+
+namespace __llvm_libc {
+
+// Backends must implement the following interface.
+struct NoBackend {
+  static constexpr bool IS_BACKEND_TYPE = true;
+
+  // Loads a T from `src` honoring Temporality and Alignment.
+  template <typename T, Temporality, Aligned> static T load(const T *src);
+
+  // Stores a T to `dst` honoring Temporality and Alignment.
+  template <typename T, Temporality, Aligned>
+  static void store(T *dst, T value);
+
+  // Returns a T filled with `value` bytes.
+  template <typename T> static T splat(ubyte value);
+
+  // Returns zero iff v1 == v2.
+  template <typename T> static uint64_t notEquals(T v1, T v2);
+
+  // Returns zero if v1 == v2, a negative number if v1 < v2, and a positive
+  // number if v1 > v2.
+  template <typename T> static int32_t threeWayCmp(T v1, T v2);
+
+  // Returns the type to use to consume Size bytes.
+  // If no type can handle Size bytes at once, a smaller type is returned and
+  // SizedOp consumes the remaining bytes recursively.
+  template <size_t Size> using getNextType = void;
+};
+
+} // namespace __llvm_libc
+
+// We inline all backend implementations here to simplify the build system.
+// Each file needs to be guarded with the appropriate LLVM_LIBC_ARCH_XXX ifdef.
+#include "src/string/memory_utils/backend_aarch64.h"
+#include "src/string/memory_utils/backend_scalar.h"
+#include "src/string/memory_utils/backend_x86.h"
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKENDS_H

diff  --git a/libc/src/string/memory_utils/sized_op.h b/libc/src/string/memory_utils/sized_op.h
new file mode 100644
index 0000000000000..12ace7cc6bdfc
--- /dev/null
+++ b/libc/src/string/memory_utils/sized_op.h
@@ -0,0 +1,177 @@
+//===-- Sized Operations --------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SizedOp struct that serves as the middle end of the
+// framework. It implements sized memory operations by breaking them down into
+// simpler types whose availability is described in the Backend. It also
+// provides a way to load and store sized chunks of memory (necessary for the
+// move operation). SizedOps are the building blocks of higher-order
+// algorithms like HeadTail, Align or Loop.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_SIZED_OP_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_SIZED_OP_H
+
+#include <stddef.h> // size_t
+
+#ifndef LLVM_LIBC_USE_BUILTIN_MEMCPY_INLINE
+#define LLVM_LIBC_USE_BUILTIN_MEMCPY_INLINE                                    \
+  __has_builtin(__builtin_memcpy_inline)
+#endif // LLVM_LIBC_USE_BUILTIN_MEMCPY_INLINE
+
+#ifndef LLVM_LIBC_USE_BUILTIN_MEMSET_INLINE
+#define LLVM_LIBC_USE_BUILTIN_MEMSET_INLINE                                    \
+  __has_builtin(__builtin_memset_inline)
+#endif // LLVM_LIBC_USE_BUILTIN_MEMSET_INLINE
+
+namespace __llvm_libc {
+
+template <typename Backend, size_t Size> struct SizedOp {
+  static constexpr size_t SIZE = Size;
+
+private:
+  static_assert(Backend::IS_BACKEND_TYPE);
+  static_assert(SIZE > 0);
+  using type = typename Backend::template getNextType<Size>;
+  static constexpr size_t TYPE_SIZE = sizeof(type);
+  static_assert(SIZE >= TYPE_SIZE);
+  static constexpr size_t NEXT_SIZE = Size - TYPE_SIZE;
+  using NextBlock = SizedOp<Backend, NEXT_SIZE>;
+
+  // Returns whether we can use aligned operations.
+  // This is possible because the address type carries known compile-time
+  // alignment information.
+  template <typename T, typename AddrT> static constexpr Aligned isAligned() {
+    static_assert(IsAddressType<AddrT>::Value);
+    return AddrT::ALIGNMENT > 1 && AddrT::ALIGNMENT >= sizeof(T) ? Aligned::YES
+                                                                 : Aligned::NO;
+  }
+
+  // Loads a value of the current `type` from `src`.
+  // This function is responsible for extracting Temporality and Alignment from
+  // the Address type.
+  template <typename SrcAddrT> static inline auto nativeLoad(SrcAddrT src) {
+    static_assert(IsAddressType<SrcAddrT>::Value && SrcAddrT::IS_READ);
+    constexpr auto AS = isAligned<type, SrcAddrT>();
+    constexpr auto TS = SrcAddrT::TEMPORALITY;
+    return Backend::template load<type, TS, AS>(as<const type>(src));
+  }
+
+  // Stores a value of the current `type` to `dst`.
+  // This function is responsible for extracting Temporality and Alignment from
+  // the Address type.
+  template <typename DstAddrT>
+  static inline void nativeStore(type value, DstAddrT dst) {
+    static_assert(IsAddressType<DstAddrT>::Value && DstAddrT::IS_WRITE);
+    constexpr auto AS = isAligned<type, DstAddrT>();
+    constexpr auto TS = DstAddrT::TEMPORALITY;
+    return Backend::template store<type, TS, AS>(as<type>(dst), value);
+  }
+
+  // A well aligned POD structure to store Size bytes.
+  // This is used to implement the move operations.
+  struct Value {
+    alignas(alignof(type)) ubyte payload[Size];
+  };
+
+public:
+  template <typename DstAddrT, typename SrcAddrT>
+  static inline void copy(DstAddrT dst, SrcAddrT src) {
+    static_assert(IsAddressType<DstAddrT>::Value && DstAddrT::IS_WRITE);
+    static_assert(IsAddressType<SrcAddrT>::Value && SrcAddrT::IS_READ);
+    if constexpr (LLVM_LIBC_USE_BUILTIN_MEMCPY_INLINE &&
+                  DstAddrT::TEMPORALITY == Temporality::TEMPORAL &&
+                  SrcAddrT::TEMPORALITY == Temporality::TEMPORAL) {
+      // delegate optimized copy to compiler.
+      __builtin_memcpy_inline(dst.ptr(), src.ptr(), Size);
+      return;
+    }
+    nativeStore(nativeLoad(src), dst);
+    if constexpr (NEXT_SIZE > 0)
+      NextBlock::copy(offsetAddr<TYPE_SIZE>(dst), offsetAddr<TYPE_SIZE>(src));
+  }
+
+  template <typename DstAddrT, typename SrcAddrT>
+  static inline void move(DstAddrT dst, SrcAddrT src) {
+    const auto payload = nativeLoad(src);
+    if constexpr (NEXT_SIZE > 0)
+      NextBlock::move(offsetAddr<TYPE_SIZE>(dst), offsetAddr<TYPE_SIZE>(src));
+    nativeStore(payload, dst);
+  }
+
+  template <typename DstAddrT>
+  static inline void set(DstAddrT dst, ubyte value) {
+    if constexpr (LLVM_LIBC_USE_BUILTIN_MEMSET_INLINE &&
+                  DstAddrT::TEMPORALITY == Temporality::TEMPORAL) {
+      // delegate optimized set to compiler.
+      __builtin_memset_inline(dst.ptr(), value, Size);
+      return;
+    }
+    nativeStore(Backend::template splat<type>(value), dst);
+    if constexpr (NEXT_SIZE > 0)
+      NextBlock::set(offsetAddr<TYPE_SIZE>(dst), value);
+  }
+
+  template <typename SrcAddrT1, typename SrcAddrT2>
+  static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2) {
+    const uint64_t current =
+        Backend::template notEquals<type>(nativeLoad(src1), nativeLoad(src2));
+    if constexpr (NEXT_SIZE > 0) {
+      // In the case where we cannot handle Size with a single operation (e.g.
+      // Size == 3) we can either return early if current is non-zero or
+      // aggregate all the operations through the bitwise or operator.
+      // We chose the latter to reduce branching.
+      return current | (NextBlock::isDifferent(offsetAddr<TYPE_SIZE>(src1),
+                                               offsetAddr<TYPE_SIZE>(src2)));
+    } else {
+      return current;
+    }
+  }
+
+  template <typename SrcAddrT1, typename SrcAddrT2>
+  static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2) {
+    const auto a = nativeLoad(src1);
+    const auto b = nativeLoad(src2);
+    // If we cannot handle Size as a single operation we have two choices:
+    // - Either use Backend's threeWayCmp directly and return it if it is
+    // non-zero.
+    //
+    //   if (int32_t res = Backend::template threeWayCmp<type>(a, b))
+    //     return res;
+    //
+    // - Or use Backend's notEquals first and use threeWayCmp only if they
+    // differ. The assumption here is that notEquals is faster than
+    // threeWayCmp and that we can save cycles when Size needs to be
+    // decomposed into several sizes (e.g. Size == 7 => 4 + 2 + 1).
+    //
+    //   if (Backend::template notEquals<type>(a, b))
+    //     return Backend::template threeWayCmp<type>(a, b);
+    //
+    // We chose the former to reduce code bloat and branching.
+    if (int32_t res = Backend::template threeWayCmp<type>(a, b))
+      return res;
+    if constexpr (NEXT_SIZE > 0)
+      return NextBlock::threeWayCmp(offsetAddr<TYPE_SIZE>(src1),
+                                    offsetAddr<TYPE_SIZE>(src2));
+    return 0;
+  }
+
+  template <typename SrcAddrT> static Value load(SrcAddrT src) {
+    Value output;
+    copy(DstAddr<alignof(type)>(output.payload), src);
+    return output;
+  }
+
+  template <typename DstAddrT> static void store(DstAddrT dst, Value value) {
+    copy(dst, SrcAddr<alignof(type)>(value.payload));
+  }
+};
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_SIZED_OP_H

diff  --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt
index d5d32107636d5..83926e48aff55 100644
--- a/libc/test/src/string/memory_utils/CMakeLists.txt
+++ b/libc/test/src/string/memory_utils/CMakeLists.txt
@@ -4,6 +4,7 @@ add_libc_unittest(
     libc_string_unittests
   SRCS
     address_test.cpp
+    backend_test.cpp
     elements_test.cpp
     memory_access_test.cpp
     utils_test.cpp

diff  --git a/libc/test/src/string/memory_utils/backend_test.cpp b/libc/test/src/string/memory_utils/backend_test.cpp
new file mode 100644
index 0000000000000..27418b7c9933e
--- /dev/null
+++ b/libc/test/src/string/memory_utils/backend_test.cpp
@@ -0,0 +1,197 @@
+//===-- Unittests for backends --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/Array.h"
+#include "src/__support/CPP/ArrayRef.h"
+#include "src/__support/CPP/Bit.h"
+#include "src/__support/architectures.h"
+#include "src/string/memory_utils/backends.h"
+#include "utils/UnitTest/Test.h"
+#include <string.h>
+
+namespace __llvm_libc {
+
+template <size_t Size> using Buffer = cpp::Array<char, Size>;
+
+static char GetRandomChar() {
+  // Implementation of C++ minstd_rand seeded with 123456789.
+  // https://en.cppreference.com/w/cpp/numeric/random
+  // "Minimum standard", recommended by Park, Miller, and Stockmeyer in 1993
+  static constexpr const uint64_t a = 48271;
+  static constexpr const uint64_t c = 0;
+  static constexpr const uint64_t m = 2147483647;
+  static uint64_t seed = 123456789;
+  seed = (a * seed + c) % m;
+  return seed;
+}
+
+static void Randomize(cpp::MutableArrayRef<char> buffer) {
+  for (auto &current : buffer)
+    current = GetRandomChar();
+}
+
+template <size_t Size> static Buffer<Size> GetRandomBuffer() {
+  Buffer<Size> buffer;
+  Randomize(buffer);
+  return buffer;
+}
+
+template <typename Backend, size_t Size> struct Conf {
+  static_assert(Backend::IS_BACKEND_TYPE);
+  using BufferT = Buffer<Size>;
+  using T = typename Backend::template getNextType<Size>;
+  static_assert(sizeof(T) == Size);
+  static constexpr size_t SIZE = Size;
+
+  static BufferT splat(ubyte value) {
+    return bit_cast<BufferT>(Backend::template splat<T>(value));
+  }
+
+  static uint64_t notEquals(const BufferT &v1, const BufferT &v2) {
+    return Backend::template notEquals<T>(bit_cast<T>(v1), bit_cast<T>(v2));
+  }
+
+  static int32_t threeWayCmp(const BufferT &v1, const BufferT &v2) {
+    return Backend::template threeWayCmp<T>(bit_cast<T>(v1), bit_cast<T>(v2));
+  }
+};
+
+using FunctionTypes = testing::TypeList< //
+#if defined(LLVM_LIBC_ARCH_X86)          //
+    Conf<X86Backend, 1>,                 //
+    Conf<X86Backend, 2>,                 //
+    Conf<X86Backend, 4>,                 //
+    Conf<X86Backend, 8>,                 //
+#if HAS_M128
+    Conf<X86Backend, 16>, //
+#endif
+#if HAS_M256
+    Conf<X86Backend, 32>, //
+#endif
+#if HAS_M512
+    Conf<X86Backend, 64>, //
+#endif
+#endif                           // defined(LLVM_LIBC_ARCH_X86)
+    Conf<Scalar64BitBackend, 1>, //
+    Conf<Scalar64BitBackend, 2>, //
+    Conf<Scalar64BitBackend, 4>, //
+    Conf<Scalar64BitBackend, 8>  //
+    >;
+
+TYPED_TEST(LlvmLibcMemoryBackend, splat, FunctionTypes) {
+  for (auto value : cpp::Array<uint8_t, 3>{0u, 1u, 255u}) {
+    alignas(64) const auto stored = ParamType::splat(bit_cast<ubyte>(value));
+    for (size_t i = 0; i < ParamType::SIZE; ++i)
+      EXPECT_EQ(bit_cast<uint8_t>(stored[i]), value);
+  }
+}
+
+TYPED_TEST(LlvmLibcMemoryBackend, notEquals, FunctionTypes) {
+  alignas(64) const auto a = GetRandomBuffer<ParamType::SIZE>();
+  EXPECT_EQ(ParamType::notEquals(a, a), 0UL);
+  for (size_t i = 0; i < a.size(); ++i) {
+    alignas(64) auto b = a;
+    ++b[i];
+    EXPECT_NE(ParamType::notEquals(a, b), 0UL);
+    EXPECT_NE(ParamType::notEquals(b, a), 0UL);
+  }
+}
+
+TYPED_TEST(LlvmLibcMemoryBackend, threeWayCmp, FunctionTypes) {
+  alignas(64) const auto a = GetRandomBuffer<ParamType::SIZE>();
+  EXPECT_EQ(ParamType::threeWayCmp(a, a), 0);
+  for (size_t i = 0; i < a.size(); ++i) {
+    alignas(64) auto b = a;
+    ++b[i];
+    const auto cmp = memcmp(&a, &b, sizeof(a));
+    ASSERT_NE(cmp, 0);
+    if (cmp > 0) {
+      EXPECT_GT(ParamType::threeWayCmp(a, b), 0);
+      EXPECT_LT(ParamType::threeWayCmp(b, a), 0);
+    } else {
+      EXPECT_LT(ParamType::threeWayCmp(a, b), 0);
+      EXPECT_GT(ParamType::threeWayCmp(b, a), 0);
+    }
+  }
+}
+
+template <typename Backend, size_t Size, Temporality TS, Aligned AS>
+struct LoadStoreConf {
+  static_assert(Backend::IS_BACKEND_TYPE);
+  using BufferT = Buffer<Size>;
+  using T = typename Backend::template getNextType<Size>;
+  static_assert(sizeof(T) == Size);
+  static constexpr size_t SIZE = Size;
+
+  static BufferT load(const BufferT &ref) {
+    const auto *ptr = bit_cast<const T *>(ref.data());
+    const T value = Backend::template load<T, TS, AS>(ptr);
+    return bit_cast<BufferT>(value);
+  }
+
+  static void store(BufferT &ref, const BufferT value) {
+    auto *ptr = bit_cast<T *>(ref.data());
+    Backend::template store<T, TS, AS>(ptr, bit_cast<T>(value));
+  }
+};
+
+using LoadStoreTypes = testing::TypeList<                              //
+#if defined(LLVM_LIBC_ARCH_X86)                                        //
+    LoadStoreConf<X86Backend, 1, Temporality::TEMPORAL, Aligned::NO>,  //
+    LoadStoreConf<X86Backend, 1, Temporality::TEMPORAL, Aligned::YES>, //
+    LoadStoreConf<X86Backend, 2, Temporality::TEMPORAL, Aligned::NO>,  //
+    LoadStoreConf<X86Backend, 2, Temporality::TEMPORAL, Aligned::YES>, //
+    LoadStoreConf<X86Backend, 4, Temporality::TEMPORAL, Aligned::NO>,  //
+    LoadStoreConf<X86Backend, 4, Temporality::TEMPORAL, Aligned::YES>, //
+    LoadStoreConf<X86Backend, 8, Temporality::TEMPORAL, Aligned::NO>,  //
+    LoadStoreConf<X86Backend, 8, Temporality::TEMPORAL, Aligned::YES>, //
+#if HAS_M128
+    LoadStoreConf<X86Backend, 16, Temporality::TEMPORAL, Aligned::NO>,      //
+    LoadStoreConf<X86Backend, 16, Temporality::TEMPORAL, Aligned::YES>,     //
+    LoadStoreConf<X86Backend, 16, Temporality::NON_TEMPORAL, Aligned::YES>, //
+#endif
+#if HAS_M256
+    LoadStoreConf<X86Backend, 32, Temporality::TEMPORAL, Aligned::NO>,      //
+    LoadStoreConf<X86Backend, 32, Temporality::TEMPORAL, Aligned::YES>,     //
+    LoadStoreConf<X86Backend, 32, Temporality::NON_TEMPORAL, Aligned::YES>, //
+#endif
+#if HAS_M512
+    LoadStoreConf<X86Backend, 64, Temporality::TEMPORAL, Aligned::NO>,      //
+    LoadStoreConf<X86Backend, 64, Temporality::TEMPORAL, Aligned::YES>,     //
+    LoadStoreConf<X86Backend, 64, Temporality::NON_TEMPORAL, Aligned::YES>, //
+#endif
+#endif // defined(LLVM_LIBC_ARCH_X86)
+    LoadStoreConf<Scalar64BitBackend, 1, Temporality::TEMPORAL, Aligned::NO>, //
+    LoadStoreConf<Scalar64BitBackend, 1, Temporality::TEMPORAL,
+                  Aligned::YES>,                                              //
+    LoadStoreConf<Scalar64BitBackend, 2, Temporality::TEMPORAL, Aligned::NO>, //
+    LoadStoreConf<Scalar64BitBackend, 2, Temporality::TEMPORAL,
+                  Aligned::YES>,                                              //
+    LoadStoreConf<Scalar64BitBackend, 4, Temporality::TEMPORAL, Aligned::NO>, //
+    LoadStoreConf<Scalar64BitBackend, 4, Temporality::TEMPORAL,
+                  Aligned::YES>,                                              //
+    LoadStoreConf<Scalar64BitBackend, 8, Temporality::TEMPORAL, Aligned::NO>, //
+    LoadStoreConf<Scalar64BitBackend, 8, Temporality::TEMPORAL, Aligned::YES> //
+    >;
+
+TYPED_TEST(LlvmLibcMemoryBackend, load, LoadStoreTypes) {
+  alignas(64) const auto expected = GetRandomBuffer<ParamType::SIZE>();
+  const auto loaded = ParamType::load(expected);
+  for (size_t i = 0; i < ParamType::SIZE; ++i)
+    EXPECT_EQ(loaded[i], expected[i]);
+}
+
+TYPED_TEST(LlvmLibcMemoryBackend, store, LoadStoreTypes) {
+  alignas(64) const auto expected = GetRandomBuffer<ParamType::SIZE>();
+  alignas(64) typename ParamType::BufferT stored;
+  ParamType::store(stored, expected);
+  for (size_t i = 0; i < ParamType::SIZE; ++i)
+    EXPECT_EQ(stored[i], expected[i]);
+}
+
+} // namespace __llvm_libc


        


More information about the libc-commits mailing list