[libc-commits] [libc] 1a4696d - [libc][NFC] Use new approach based on types to code memset
Guillaume Chatelet via libc-commits
libc-commits at lists.llvm.org
Tue Apr 11 04:32:19 PDT 2023
Author: Guillaume Chatelet
Date: 2023-04-11T11:32:09Z
New Revision: 1a4696d9ec26eb4f7e9768bc279235667232e92a
URL: https://github.com/llvm/llvm-project/commit/1a4696d9ec26eb4f7e9768bc279235667232e92a
DIFF: https://github.com/llvm/llvm-project/commit/1a4696d9ec26eb4f7e9768bc279235667232e92a.diff
LOG: [libc][NFC] Use new approach based on types to code memset
Added:
Modified:
libc/src/string/memory_utils/memset_implementations.h
libc/src/string/memory_utils/op_aarch64.h
libc/src/string/memory_utils/op_generic.h
libc/test/src/string/memory_utils/op_tests.cpp
Removed:
################################################################################
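The change replaces the size-based generic::Memset&lt;Size, MaxSize&gt; interface with one parameterized directly on store types: unsigned scalars, compiler vector types, or a cpp::array of either, with a trailing parameter pack composing adjacent stores for odd sizes. Below is a standalone sketch of that scheme; it is illustrative only, not the libc sources, and std::memset/std::memcpy stand in for the generic::splat and generic::store helpers defined in op_generic.h.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Minimal sketch of the type-keyed Memset this patch introduces.
template <typename T, typename... TS> struct Memset {
  static constexpr size_t SIZE = (sizeof(T) + ... + sizeof(TS));
  static void block(char *dst, uint8_t value) {
    T splatted;
    std::memset(&splatted, value, sizeof(T)); // broadcast the byte into T
    std::memcpy(dst, &splatted, sizeof(T));   // one store of sizeof(T) bytes
    if constexpr (sizeof...(TS) > 0)
      Memset<TS...>::block(dst + sizeof(T), value); // compose the pack
  }
};

// A 3-byte set is one 2-byte store followed by one 1-byte store:
//   Memset<uint16_t, uint8_t>::block(dst, 0x2a);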
diff --git a/libc/src/string/memory_utils/memset_implementations.h b/libc/src/string/memory_utils/memset_implementations.h
index 16c11470572b5..dfc7df87aceb2 100644
--- a/libc/src/string/memory_utils/memset_implementations.h
+++ b/libc/src/string/memory_utils/memset_implementations.h
@@ -26,86 +26,101 @@ namespace __llvm_libc {
inline_memset_embedded_tiny(Ptr dst, uint8_t value, size_t count) {
LIBC_LOOP_NOUNROLL
for (size_t offset = 0; offset < count; ++offset)
- generic::Memset<1, 1>::block(dst + offset, value);
+ generic::Memset<uint8_t>::block(dst + offset, value);
}
#if defined(LIBC_TARGET_ARCH_IS_X86)
-template <size_t MaxSize>
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
+#if defined(__AVX512F__)
+ using uint128_t = uint8x16_t;
+ using uint256_t = uint8x32_t;
+ using uint512_t = uint8x64_t;
+#elif defined(__AVX__)
+ using uint128_t = uint8x16_t;
+ using uint256_t = uint8x32_t;
+ using uint512_t = cpp::array<uint8x32_t, 2>;
+#elif defined(__SSE2__)
+ using uint128_t = uint8x16_t;
+ using uint256_t = cpp::array<uint8x16_t, 2>;
+ using uint512_t = cpp::array<uint8x16_t, 4>;
+#else
+ using uint128_t = cpp::array<uint64_t, 2>;
+ using uint256_t = cpp::array<uint64_t, 4>;
+ using uint512_t = cpp::array<uint64_t, 8>;
+#endif
+
if (count == 0)
return;
if (count == 1)
- return generic::Memset<1, MaxSize>::block(dst, value);
+ return generic::Memset<uint8_t>::block(dst, value);
if (count == 2)
- return generic::Memset<2, MaxSize>::block(dst, value);
+ return generic::Memset<uint16_t>::block(dst, value);
if (count == 3)
- return generic::Memset<3, MaxSize>::block(dst, value);
+ return generic::Memset<uint16_t, uint8_t>::block(dst, value);
if (count <= 8)
- return generic::Memset<4, MaxSize>::head_tail(dst, value, count);
+ return generic::Memset<uint32_t>::head_tail(dst, value, count);
if (count <= 16)
- return generic::Memset<8, MaxSize>::head_tail(dst, value, count);
+ return generic::Memset<uint64_t>::head_tail(dst, value, count);
if (count <= 32)
- return generic::Memset<16, MaxSize>::head_tail(dst, value, count);
+ return generic::Memset<uint128_t>::head_tail(dst, value, count);
if (count <= 64)
- return generic::Memset<32, MaxSize>::head_tail(dst, value, count);
+ return generic::Memset<uint256_t>::head_tail(dst, value, count);
if (count <= 128)
- return generic::Memset<64, MaxSize>::head_tail(dst, value, count);
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
// Aligned loop
- generic::Memset<32, MaxSize>::block(dst, value);
+ generic::Memset<uint256_t>::block(dst, value);
align_to_next_boundary<32>(dst, count);
- return generic::Memset<32, MaxSize>::loop_and_tail(dst, value, count);
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
}
#endif // defined(LIBC_TARGET_ARCH_IS_X86)
#if defined(LIBC_TARGET_ARCH_IS_AARCH64)
-template <size_t MaxSize>
[[maybe_unused]] LIBC_INLINE static void
inline_memset_aarch64(Ptr dst, uint8_t value, size_t count) {
+ static_assert(aarch64::kNeon, "aarch64 supports vector types");
+ using uint128_t = uint8x16_t;
+ using uint256_t = uint8x32_t;
+ using uint512_t = uint8x64_t;
if (count == 0)
return;
if (count <= 3) {
- generic::Memset<1, MaxSize>::block(dst, value);
+ generic::Memset<uint8_t>::block(dst, value);
if (count > 1)
- generic::Memset<2, MaxSize>::tail(dst, value, count);
+ generic::Memset<uint16_t>::tail(dst, value, count);
return;
}
if (count <= 8)
- return generic::Memset<4, MaxSize>::head_tail(dst, value, count);
+ return generic::Memset<uint32_t>::head_tail(dst, value, count);
if (count <= 16)
- return generic::Memset<8, MaxSize>::head_tail(dst, value, count);
+ return generic::Memset<uint64_t>::head_tail(dst, value, count);
if (count <= 32)
- return generic::Memset<16, MaxSize>::head_tail(dst, value, count);
+ return generic::Memset<uint128_t>::head_tail(dst, value, count);
if (count <= (32 + 64)) {
- generic::Memset<32, MaxSize>::block(dst, value);
+ generic::Memset<uint256_t>::block(dst, value);
if (count <= 64)
- return generic::Memset<32, MaxSize>::tail(dst, value, count);
- generic::Memset<32, MaxSize>::block(dst + 32, value);
- generic::Memset<32, MaxSize>::tail(dst, value, count);
+ return generic::Memset<uint256_t>::tail(dst, value, count);
+ generic::Memset<uint256_t>::block(dst + 32, value);
+ generic::Memset<uint256_t>::tail(dst, value, count);
return;
}
if (count >= 448 && value == 0 && aarch64::neon::hasZva()) {
- generic::Memset<64, MaxSize>::block(dst, 0);
+ generic::Memset<uint512_t>::block(dst, 0);
align_to_next_boundary<64>(dst, count);
- return aarch64::neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count);
+ return aarch64::neon::BzeroCacheLine::loop_and_tail(dst, 0, count);
} else {
- generic::Memset<16, MaxSize>::block(dst, value);
+ generic::Memset<uint128_t>::block(dst, value);
align_to_next_boundary<16>(dst, count);
- return generic::Memset<64, MaxSize>::loop_and_tail(dst, value, count);
+ return generic::Memset<uint512_t>::loop_and_tail(dst, value, count);
}
}
#endif // defined(LIBC_TARGET_ARCH_IS_AARCH64)
LIBC_INLINE static void inline_memset(Ptr dst, uint8_t value, size_t count) {
#if defined(LIBC_TARGET_ARCH_IS_X86)
- static constexpr size_t kMaxSize = x86::kAvx512F ? 64
- : x86::kAvx ? 32
- : x86::kSse2 ? 16
- : 8;
- return inline_memset_x86<kMaxSize>(dst, value, count);
+ return inline_memset_x86(dst, value, count);
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
- static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
- return inline_memset_aarch64<kMaxSize>(dst, value, count);
+ return inline_memset_aarch64(dst, value, count);
#else
return inline_memset_embedded_tiny(dst, value, count);
#endif
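The dispatch ladder above leans on head_tail for every bucket between two powers of two: when sizeof(T) <= count <= 2 * sizeof(T), one store at dst and one store ending exactly at dst + count cover the whole region, overlapping in the middle. A standalone sketch of that idiom, assuming the usual overlapping-stores scheme, with std::memcpy standing in for the libc store helper:

#include <cstddef>
#include <cstdint>
#include <cstring>

// head_tail idiom: valid when sizeof(T) <= count <= 2 * sizeof(T).
template <typename T>
void head_tail(char *dst, uint8_t value, size_t count) {
  T splatted;
  std::memset(&splatted, value, sizeof(T));
  std::memcpy(dst, &splatted, sizeof(T));                     // head store
  std::memcpy(dst + count - sizeof(T), &splatted, sizeof(T)); // tail store
}

// e.g. the 9..16 byte bucket above is generic::Memset<uint64_t>::head_tail.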
diff --git a/libc/src/string/memory_utils/op_aarch64.h b/libc/src/string/memory_utils/op_aarch64.h
index f9aabd0fbcade..e8c8b211e57b5 100644
--- a/libc/src/string/memory_utils/op_aarch64.h
+++ b/libc/src/string/memory_utils/op_aarch64.h
@@ -30,11 +30,10 @@ static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);
namespace neon {
-template <size_t Size> struct BzeroCacheLine {
- static constexpr size_t SIZE = Size;
+struct BzeroCacheLine {
+ static constexpr size_t SIZE = 64;
LIBC_INLINE static void block(Ptr dst, uint8_t) {
- static_assert(Size == 64);
#if __SIZEOF_POINTER__ == 4
asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
@@ -43,15 +42,13 @@ template <size_t Size> struct BzeroCacheLine {
}
LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
- static_assert(Size > 1, "a loop of size 1 does not need tail");
size_t offset = 0;
do {
block(dst + offset, value);
offset += SIZE;
} while (offset < count - SIZE);
// Unaligned store, we can't use 'dc zva' here.
- static constexpr size_t kMaxSize = kNeon ? 16 : 8;
- generic::Memset<Size, kMaxSize>::tail(dst, value, count);
+ generic::Memset<uint8x64_t>::tail(dst, value, count);
}
};
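For context on the hunk above: 'dc zva' zeroes one whole cache line (64 bytes on these targets) and requires a cache-line-aligned address, which is why the caller aligns dst first and why the final, possibly unaligned, 64 bytes fall back to a plain vector store. A rough standalone sketch of that loop shape, assuming a 64-bit aarch64 target, a 64-byte ZVA line (roughly what aarch64::neon::hasZva() checks), and count > 64; __builtin_memset stands in for Memset<uint8x64_t>::tail:

#include <cstddef>
#include <cstdint>

// Zero whole cache lines, then finish with one unaligned 64-byte store.
// Preconditions: dst is 64-byte aligned, count > 64.
void bzero_cache_lines(char *dst, size_t count) {
  size_t offset = 0;
  do {
    asm("dc zva, %[dst]" : : [dst] "r"(dst + offset) : "memory");
    offset += 64;
  } while (offset < count - 64);
  __builtin_memset(dst + count - 64, 0, 64); // unaligned tail, no 'dc zva'
}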
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index fd63ac67d005a..a7c5636c2d1ca 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -33,8 +33,7 @@
#include <stdint.h>
-namespace __llvm_libc::generic {
-
+namespace __llvm_libc {
// Compiler types using the vector attributes.
using uint8x1_t = uint8_t __attribute__((__vector_size__(1)));
using uint8x2_t = uint8_t __attribute__((__vector_size__(2)));
@@ -43,13 +42,14 @@ using uint8x8_t = uint8_t __attribute__((__vector_size__(8)));
using uint8x16_t = uint8_t __attribute__((__vector_size__(16)));
using uint8x32_t = uint8_t __attribute__((__vector_size__(32)));
using uint8x64_t = uint8_t __attribute__((__vector_size__(64)));
+} // namespace __llvm_libc
+namespace __llvm_libc::generic {
// We accept three types of values as elements for generic operations:
// - scalar : unsigned integral types
// - vector : compiler types using the vector attributes
// - array : a cpp::array<T, N> where T is itself either a scalar or a vector.
// The following traits help discriminate between these cases.
-
template <typename T>
constexpr bool is_scalar_v = cpp::is_integral_v<T> && cpp::is_unsigned_v<T>;
@@ -109,23 +109,11 @@ template <typename T> T splat(uint8_t value) {
T Out;
// This for loop is optimized out for vector types.
for (size_t i = 0; i < sizeof(T); ++i)
- Out[i] = static_cast<uint8_t>(value);
+ Out[i] = value;
return Out;
}
}
-template <typename T> void set(Ptr dst, uint8_t value) {
- static_assert(is_element_type_v<T>);
- if constexpr (is_scalar_v<T> || is_vector_v<T>) {
- store<T>(dst, splat<T>(value));
- } else if constexpr (is_array_v<T>) {
- using value_type = typename T::value_type;
- const value_type Splat = splat<value_type>(value);
- for (size_t I = 0; I < array_size_v<T>; ++I)
- store<value_type>(dst + (I * sizeof(value_type)), Splat);
- }
-}
-
static_assert((UINTPTR_MAX == 4294967295U) ||
(UINTPTR_MAX == 18446744073709551615UL),
"We currently only support 32- or 64-bit platforms");
@@ -149,9 +137,7 @@ constexpr bool is_decreasing_size() {
}
template <size_t Size, typename... Ts> struct Largest;
-template <size_t Size> struct Largest<Size> {
- using type = uint8_t;
-};
+template <size_t Size> struct Largest<Size> : cpp::type_identity<uint8_t> {};
template <size_t Size, typename T, typename... Ts>
struct Largest<Size, T, Ts...> {
using next = Largest<Size, Ts...>;
@@ -179,6 +165,11 @@ template <typename First, typename... Ts> struct SupportedTypes {
using TypeFor = typename details::Largest<Size, First, Ts...>::type;
};
+// Returns the sum of the sizes of all the TS types.
+template <typename... TS> static constexpr size_t sum_sizeof() {
+ return (... + sizeof(TS));
+}
+
// Map from sizes to structures offering static load, store and splat methods.
// Note: On platforms lacking vector support, we use the ArrayType below and
// decompose the operation in smaller pieces.
@@ -220,27 +211,23 @@ using getTypeFor = cpp::conditional_t<
///////////////////////////////////////////////////////////////////////////////
// Memset
-// The MaxSize template argument gives the maximum size handled natively by the
-// platform. For instance on x86 with AVX support this would be 32. If a size
-// greater than MaxSize is requested we break the operation down in smaller
-// pieces of size MaxSize.
///////////////////////////////////////////////////////////////////////////////
-template <size_t Size, size_t MaxSize> struct Memset {
- static_assert(is_power2(MaxSize));
- static constexpr size_t SIZE = Size;
+
+template <typename T, typename... TS> struct Memset {
+ static constexpr size_t SIZE = sum_sizeof<T, TS...>();
LIBC_INLINE static void block(Ptr dst, uint8_t value) {
- if constexpr (Size == 3) {
- Memset<1, MaxSize>::block(dst + 2, value);
- Memset<2, MaxSize>::block(dst, value);
- } else {
- using T = details::getTypeFor<Size, MaxSize>;
- if constexpr (details::is_void_v<T>) {
- deferred_static_assert("Unimplemented Size");
- } else {
- set<T>(dst, value);
- }
+ static_assert(is_element_type_v<T>);
+ if constexpr (is_scalar_v<T> || is_vector_v<T>) {
+ store<T>(dst, splat<T>(value));
+ } else if constexpr (is_array_v<T>) {
+ using value_type = typename T::value_type;
+ const auto Splat = splat<value_type>(value);
+ for (size_t I = 0; I < array_size_v<T>; ++I)
+ store<value_type>(dst + (I * sizeof(value_type)), Splat);
}
+ if constexpr (sizeof...(TS))
+ Memset<TS...>::block(dst + sizeof(T), value);
}
LIBC_INLINE static void tail(Ptr dst, uint8_t value, size_t count) {
@@ -253,7 +240,7 @@ template <size_t Size, size_t MaxSize> struct Memset {
}
LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
- static_assert(SIZE > 1);
+ static_assert(SIZE > 1, "a loop of size 1 does not need tail");
size_t offset = 0;
do {
block(dst + offset, value);
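One detail from op_generic.h worth spelling out: for compiler vector types, splat is written as a plain element loop, which the optimizer collapses into a single broadcast instruction (a dup on aarch64, a broadcast/shuffle on x86). A standalone sketch of just the vector case; the scalar case in the real header is handled separately:

#include <cstddef>
#include <cstdint>

using uint8x16_t = uint8_t __attribute__((__vector_size__(16)));

// Element-wise splat over a vector-attribute type; the loop is compiled
// down to a single broadcast rather than sixteen byte writes.
template <typename T> T splat(uint8_t value) {
  T out = {};
  for (size_t i = 0; i < sizeof(T); ++i)
    out[i] = value;
  return out;
}

// splat<uint8x16_t>(0x2a) yields sixteen 0x2a bytes in one vector register.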
diff --git a/libc/test/src/string/memory_utils/op_tests.cpp b/libc/test/src/string/memory_utils/op_tests.cpp
index 7f5d4d4ed460a..b63a629da3f05 100644
--- a/libc/test/src/string/memory_utils/op_tests.cpp
+++ b/libc/test/src/string/memory_utils/op_tests.cpp
@@ -119,24 +119,20 @@ using MemsetImplementations = testing::TypeList<
builtin::Memset<64>,
#endif
#ifdef LLVM_LIBC_HAS_UINT64
- generic::Memset<8, 8>, //
- generic::Memset<16, 8>, //
- generic::Memset<32, 8>, //
- generic::Memset<64, 8>, //
+ generic::Memset<uint64_t>, generic::Memset<cpp::array<uint64_t, 2>>,
#endif
#ifdef __AVX512F__
- generic::Memset<64, 64>, // prevents warning about avx512f
+ generic::Memset<uint8x64_t>, generic::Memset<cpp::array<uint8x64_t, 2>>,
#endif
- generic::Memset<1, 1>, //
- generic::Memset<2, 1>, //
- generic::Memset<2, 2>, //
- generic::Memset<4, 2>, //
- generic::Memset<4, 4>, //
- generic::Memset<16, 16>, //
- generic::Memset<32, 16>, //
- generic::Memset<64, 16>, //
- generic::Memset<32, 32>, //
- generic::Memset<64, 32> //
+#ifdef __AVX__
+ generic::Memset<uint8x32_t>, generic::Memset<cpp::array<uint8x32_t, 2>>,
+#endif
+#ifdef __SSE2__
+ generic::Memset<uint8x16_t>, generic::Memset<cpp::array<uint8x16_t, 2>>,
+#endif
+ generic::Memset<uint32_t>, generic::Memset<cpp::array<uint32_t, 2>>, //
+ generic::Memset<uint16_t>, generic::Memset<cpp::array<uint16_t, 2>>, //
+ generic::Memset<uint8_t>, generic::Memset<cpp::array<uint8_t, 2>> //
>;
// Adapt CheckMemset signature to op implementation signatures.
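The reworked list instantiates one plain and one cpp::array case per width, under the same ISA guards as the implementation. As a usage-level illustration only, here is a hypothetical byte-coverage check against the Memset interface shown in this patch; it is not the actual libc harness (CheckMemset above):

#include <cassert>
#include <cstddef>
#include <cstdint>

// Hypothetical smoke test: run one block() per instantiation and verify
// every byte of the SIZE-byte destination was written with `value`.
template <typename MemsetT> void check_block(uint8_t value) {
  alignas(64) char buf[MemsetT::SIZE] = {};
  MemsetT::block(buf, value);
  for (size_t i = 0; i < MemsetT::SIZE; ++i)
    assert(static_cast<uint8_t>(buf[i]) == value);
}

// e.g. check_block<generic::Memset<uint32_t>>(0x2a);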