[libc-commits] [libc] [llvm] [libc] Improve qsort (PR #120450)
Lukas Bergdoll via libc-commits
libc-commits at lists.llvm.org
Thu Dec 19 09:24:52 PST 2024
================
@@ -11,96 +11,155 @@
#include "src/__support/CPP/cstddef.h"
#include "src/__support/macros/config.h"
+#include "src/string/memory_utils/inline_memcpy.h"
+#include "src/string/memory_utils/inline_memmove.h"
#include <stdint.h>
namespace LIBC_NAMESPACE_DECL {
namespace internal {
-using Compare = int(const void *, const void *);
-using CompareWithState = int(const void *, const void *, void *);
-
-enum class CompType { COMPARE, COMPARE_WITH_STATE };
-
-struct Comparator {
- union {
- Compare *comp_func;
- CompareWithState *comp_func_r;
- };
- const CompType comp_type;
-
- void *arg;
-
- Comparator(Compare *func)
- : comp_func(func), comp_type(CompType::COMPARE), arg(nullptr) {}
-
- Comparator(CompareWithState *func, void *arg_val)
- : comp_func_r(func), comp_type(CompType::COMPARE_WITH_STATE),
- arg(arg_val) {}
-
-#if defined(__clang__)
- // Recent upstream changes to -fsanitize=function find more instances of
- // function type mismatches. One case is with the comparator passed to this
- // class. Libraries will tend to pass comparators that take pointers to
- // varying types while this comparator expects to accept const void pointers.
- // Ideally those tools would pass a function that strictly accepts const
- // void*s to avoid UB, or would use qsort_r to pass their own comparator.
- [[clang::no_sanitize("function")]]
-#endif
- int comp_vals(const void *a, const void *b) const {
- if (comp_type == CompType::COMPARE) {
- return comp_func(a, b);
- } else {
- return comp_func_r(a, b, arg);
- }
+// Returns the max amount of bytes deemed reasonable - based on the target
+// properties - for use in local stack arrays.
+constexpr size_t max_stack_array_size() {
+ // Uses target pointer size as heuristic how much memory is available and
+ // unlikely to run into stack overflow and perf problems.
+ constexpr size_t ptr_diff_size = sizeof(ptrdiff_t);
+
+ if constexpr (ptr_diff_size >= 8) {
+ return 4096;
}
-};
-class Array {
- uint8_t *array;
- size_t array_size;
+ if constexpr (ptr_diff_size == 4) {
+ return 512;
+ }
+
+ // 8-bit platforms are just not gonna work well with libc, qsort
+ // won't be the problem.
+ // 16-bit platforms ought to be able to store 64 bytes on the stack.
+ return 64;
+}
+
+class ArrayGenericSize {
+ uint8_t *array_base;
+ size_t array_len;
size_t elem_size;
- Comparator compare;
+
+ uint8_t *get_internal(size_t i) const noexcept {
+ return array_base + (i * elem_size);
+ }
public:
- Array(uint8_t *a, size_t s, size_t e, Comparator c)
- : array(a), array_size(s), elem_size(e), compare(c) {}
-
- uint8_t *get(size_t i) const { return array + i * elem_size; }
-
- void swap(size_t i, size_t j) const {
- uint8_t *elem_i = get(i);
- uint8_t *elem_j = get(j);
- for (size_t b = 0; b < elem_size; ++b) {
- uint8_t temp = elem_i[b];
- elem_i[b] = elem_j[b];
- elem_j[b] = temp;
- }
+ ArrayGenericSize(uint8_t *a, size_t s, size_t e) noexcept
+ : array_base(a), array_len(s), elem_size(e) {}
+
+ static constexpr bool has_fixed_size() { return false; }
+
+ void *get(size_t i) const noexcept {
+ return reinterpret_cast<void *>(get_internal(i));
}
- int elem_compare(size_t i, const uint8_t *other) const {
- // An element must compare equal to itself so we don't need to consult the
- // user provided comparator.
- if (get(i) == other)
- return 0;
- return compare.comp_vals(get(i), other);
+ void swap(size_t i, size_t j) const noexcept {
+ // It's possible to use 8 byte blocks with `uint64_t`, but that
+ // generates more machine code as the remainder loop gets
+ // unrolled, plus 4 byte operations are more likely to be
+ // efficient on a wider variety of hardware. On x86 LLVM tends
+ // to unroll the block loop again into 2 16 byte swaps per
+ // iteration which is another reason that 4 byte blocks yields
+ // good performance even for big types.
+ using block_t = uint32_t;
+ constexpr size_t BLOCK_SIZE = sizeof(block_t);
+
+ uint8_t *elem_i = get_internal(i);
+ uint8_t *elem_j = get_internal(j);
+
+ const size_t elem_size_rem = elem_size % BLOCK_SIZE;
+ const block_t *elem_i_block_end =
+ reinterpret_cast<block_t *>(elem_i + (elem_size - elem_size_rem));
+
+ block_t *elem_i_block = reinterpret_cast<block_t *>(elem_i);
----------------
Voultapher wrote:
Good idea, I'll apply it although I'd like to go with a less abstraction heavy approach that is hopefully easier to understand and maintain. What is the difference between using `__builtin_memcpy_inline` and `__builtin_memcpy` here, because in both cases the compiler ought to replace it with inline operations if those are possible on the platform since the size is a constant?
https://github.com/llvm/llvm-project/pull/120450
More information about the libc-commits
mailing list