[libc-commits] [libc] [llvm] [libc] Improve qsort (PR #120450)

Schrodinger ZHU Yifan via libc-commits libc-commits at lists.llvm.org
Wed Dec 18 18:03:05 PST 2024


================
@@ -11,96 +11,155 @@
 
 #include "src/__support/CPP/cstddef.h"
 #include "src/__support/macros/config.h"
+#include "src/string/memory_utils/inline_memcpy.h"
+#include "src/string/memory_utils/inline_memmove.h"
 
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-using Compare = int(const void *, const void *);
-using CompareWithState = int(const void *, const void *, void *);
-
-enum class CompType { COMPARE, COMPARE_WITH_STATE };
-
-struct Comparator {
-  union {
-    Compare *comp_func;
-    CompareWithState *comp_func_r;
-  };
-  const CompType comp_type;
-
-  void *arg;
-
-  Comparator(Compare *func)
-      : comp_func(func), comp_type(CompType::COMPARE), arg(nullptr) {}
-
-  Comparator(CompareWithState *func, void *arg_val)
-      : comp_func_r(func), comp_type(CompType::COMPARE_WITH_STATE),
-        arg(arg_val) {}
-
-#if defined(__clang__)
-  // Recent upstream changes to -fsanitize=function find more instances of
-  // function type mismatches. One case is with the comparator passed to this
-  // class. Libraries will tend to pass comparators that take pointers to
-  // varying types while this comparator expects to accept const void pointers.
-  // Ideally those tools would pass a function that strictly accepts const
-  // void*s to avoid UB, or would use qsort_r to pass their own comparator.
-  [[clang::no_sanitize("function")]]
-#endif
-  int comp_vals(const void *a, const void *b) const {
-    if (comp_type == CompType::COMPARE) {
-      return comp_func(a, b);
-    } else {
-      return comp_func_r(a, b, arg);
-    }
+// Returns the max amount of bytes deemed reasonable - based on the target
+// properties - for use in local stack arrays.
+constexpr size_t max_stack_array_size() {
+  // Uses target pointer size as heuristic how much memory is available and
+  // unlikely to run into stack overflow and perf problems.
+  constexpr size_t ptr_diff_size = sizeof(ptrdiff_t);
+
+  if constexpr (ptr_diff_size >= 8) {
+    return 4096;
   }
-};
 
-class Array {
-  uint8_t *array;
-  size_t array_size;
+  if constexpr (ptr_diff_size == 4) {
+    return 512;
+  }
+
+  // 8-bit platforms are just not gonna work well with libc, qsort
+  // won't be the problem.
+  // 16-bit platforms ought to be able to store 64 bytes on the stack.
+  return 64;
+}
+
+class ArrayGenericSize {
+  uint8_t *array_base;
+  size_t array_len;
   size_t elem_size;
-  Comparator compare;
+
+  uint8_t *get_internal(size_t i) const noexcept {
+    return array_base + (i * elem_size);
+  }
 
 public:
-  Array(uint8_t *a, size_t s, size_t e, Comparator c)
-      : array(a), array_size(s), elem_size(e), compare(c) {}
-
-  uint8_t *get(size_t i) const { return array + i * elem_size; }
-
-  void swap(size_t i, size_t j) const {
-    uint8_t *elem_i = get(i);
-    uint8_t *elem_j = get(j);
-    for (size_t b = 0; b < elem_size; ++b) {
-      uint8_t temp = elem_i[b];
-      elem_i[b] = elem_j[b];
-      elem_j[b] = temp;
-    }
+  ArrayGenericSize(uint8_t *a, size_t s, size_t e) noexcept
+      : array_base(a), array_len(s), elem_size(e) {}
+
+  static constexpr bool has_fixed_size() { return false; }
+
+  void *get(size_t i) const noexcept {
+    return reinterpret_cast<void *>(get_internal(i));
   }
 
-  int elem_compare(size_t i, const uint8_t *other) const {
-    // An element must compare equal to itself so we don't need to consult the
-    // user provided comparator.
-    if (get(i) == other)
-      return 0;
-    return compare.comp_vals(get(i), other);
+  void swap(size_t i, size_t j) const noexcept {
+    // It's possible to use 8 byte blocks with `uint64_t`, but that
+    // generates more machine code as the remainder loop gets
+    // unrolled, plus 4 byte operations are more likely to be
+    // efficient on a wider variety of hardware. On x86 LLVM tends
+    // to unroll the block loop again into 2 16 byte swaps per
+    // iteration which is another reason that 4 byte blocks yields
+    // good performance even for big types.
+    using block_t = uint32_t;
+    constexpr size_t BLOCK_SIZE = sizeof(block_t);
+
+    uint8_t *elem_i = get_internal(i);
+    uint8_t *elem_j = get_internal(j);
+
+    const size_t elem_size_rem = elem_size % BLOCK_SIZE;
+    const block_t *elem_i_block_end =
+        reinterpret_cast<block_t *>(elem_i + (elem_size - elem_size_rem));
+
+    block_t *elem_i_block = reinterpret_cast<block_t *>(elem_i);
----------------
SchrodingerZhu wrote:

```c++
struct Block {
  std::byte data[4];
  static Block load(const std::byte* src) {
     Block buf;
     #if __has_builtin(__builtin_memcpy_inline)
          __builtin_memcpy_inline(buf.data, src, 4);
     #else
          __builtin_memcpy(buf.data, src, 4);
     #endif
     return buf;
  }
  /* store similarly */
};
```
Then you can use `Block::load(...)/Block::store(...)` to do the operation.
The compiler is smart enough to optimize the code into what you are expecting.

https://github.com/llvm/llvm-project/pull/120450


More information about the libc-commits mailing list