[libc-commits] [libc] [llvm] [libc] Improve qsort (PR #120450)
Lukas Bergdoll via libc-commits
libc-commits at lists.llvm.org
Thu Dec 19 09:33:02 PST 2024
https://github.com/Voultapher updated https://github.com/llvm/llvm-project/pull/120450
>From 3d6ca050a8163e8dca2796892261b02f918beef7 Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll <lukas.bergdoll at gmail.com>
Date: Sat, 14 Dec 2024 14:33:27 +0100
Subject: [PATCH 01/10] Use templated is_less instead of runtime comparator
This changes serves both immediate purposes and lays the groundwork for further
changes. Previously the qsort implementation used a comparator construct that
dispatched the comparison function through a tagged union, this incurs an
additional branch on each comparison to check the tag `comp_type`. There are
also structural changes to the quick_sort implementation to lay the groundwork
for future changes to the partition and equal element handling logic. The
comparator also did an equal idx comparison for each comparison. While this may
sound like a nice optimization for expensive to compare types, a well
constructed implementation will never compare an element to itself. In effect
this change reduces the required comparison for integer sorting from 4 to 2 per
comparison operation in the implementation.
Performance impact, the tested types i32, u64, string and f128 improve by ~1.5x,
and 1k by ~2.4x for the random pattern. Equal element patterns see an
algorithmic degradation, but this will be addressed by a future change.
Nice to have, the `[[clang::no_sanitize("function")]]` is no longer needed.
---
libc/src/stdlib/heap_sort.h | 11 +-
libc/src/stdlib/qsort.cpp | 8 +-
libc/src/stdlib/qsort_data.h | 86 +++------
libc/src/stdlib/qsort_r.cpp | 9 +-
libc/src/stdlib/qsort_util.h | 6 +-
libc/src/stdlib/quick_sort.h | 98 ++++++-----
libc/test/src/stdlib/CMakeLists.txt | 14 +-
libc/test/src/stdlib/SortingTest.h | 163 +++++++-----------
libc/test/src/stdlib/heap_sort_test.cpp | 19 +-
libc/test/src/stdlib/qsort_r_test.cpp | 4 +-
libc/test/src/stdlib/qsort_test.cpp | 19 +-
libc/test/src/stdlib/quick_sort_test.cpp | 16 --
.../libc/test/src/stdlib/BUILD.bazel | 12 +-
13 files changed, 188 insertions(+), 277 deletions(-)
delete mode 100644 libc/test/src/stdlib/quick_sort_test.cpp
diff --git a/libc/src/stdlib/heap_sort.h b/libc/src/stdlib/heap_sort.h
index ccb9ec5f82149e..543c834c0197c7 100644
--- a/libc/src/stdlib/heap_sort.h
+++ b/libc/src/stdlib/heap_sort.h
@@ -18,11 +18,11 @@ namespace internal {
// A simple in-place heapsort implementation.
// Follow the implementation in https://en.wikipedia.org/wiki/Heapsort.
-LIBC_INLINE void heap_sort(const Array &array) {
- size_t end = array.size();
+template <typename F> void heap_sort(const Array &array, const F &is_less) {
+ size_t end = array.len();
size_t start = end / 2;
- auto left_child = [](size_t i) -> size_t { return 2 * i + 1; };
+ const auto left_child = [](size_t i) -> size_t { return 2 * i + 1; };
while (end > 1) {
if (start > 0) {
@@ -40,12 +40,11 @@ LIBC_INLINE void heap_sort(const Array &array) {
while (left_child(root) < end) {
size_t child = left_child(root);
// If there are two children, set child to the greater.
- if (child + 1 < end &&
- array.elem_compare(child, array.get(child + 1)) < 0)
+ if ((child + 1 < end) && is_less(array.get(child), array.get(child + 1)))
++child;
// If the root is less than the greater child
- if (array.elem_compare(root, array.get(child)) >= 0)
+ if (!is_less(array.get(root), array.get(child)))
break;
// Swap the root with the greater child and continue sifting down.
diff --git a/libc/src/stdlib/qsort.cpp b/libc/src/stdlib/qsort.cpp
index 65a63c239f5c0d..ff55b1bd4455eb 100644
--- a/libc/src/stdlib/qsort.cpp
+++ b/libc/src/stdlib/qsort.cpp
@@ -20,12 +20,14 @@ LLVM_LIBC_FUNCTION(void, qsort,
int (*compare)(const void *, const void *))) {
if (array == nullptr || array_size == 0 || elem_size == 0)
return;
- internal::Comparator c(compare);
auto arr = internal::Array(reinterpret_cast<uint8_t *>(array), array_size,
- elem_size, c);
+ elem_size);
- internal::sort(arr);
+ internal::unstable_sort(
+ arr, [compare](const void *a, const void *b) noexcept -> bool {
+ return compare(a, b) < 0;
+ });
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h
index c529d55ca46ffd..f296c26db4fd6b 100644
--- a/libc/src/stdlib/qsort_data.h
+++ b/libc/src/stdlib/qsort_data.h
@@ -17,60 +17,25 @@
namespace LIBC_NAMESPACE_DECL {
namespace internal {
-using Compare = int(const void *, const void *);
-using CompareWithState = int(const void *, const void *, void *);
-
-enum class CompType { COMPARE, COMPARE_WITH_STATE };
-
-struct Comparator {
- union {
- Compare *comp_func;
- CompareWithState *comp_func_r;
- };
- const CompType comp_type;
-
- void *arg;
-
- Comparator(Compare *func)
- : comp_func(func), comp_type(CompType::COMPARE), arg(nullptr) {}
-
- Comparator(CompareWithState *func, void *arg_val)
- : comp_func_r(func), comp_type(CompType::COMPARE_WITH_STATE),
- arg(arg_val) {}
-
-#if defined(__clang__)
- // Recent upstream changes to -fsanitize=function find more instances of
- // function type mismatches. One case is with the comparator passed to this
- // class. Libraries will tend to pass comparators that take pointers to
- // varying types while this comparator expects to accept const void pointers.
- // Ideally those tools would pass a function that strictly accepts const
- // void*s to avoid UB, or would use qsort_r to pass their own comparator.
- [[clang::no_sanitize("function")]]
-#endif
- int comp_vals(const void *a, const void *b) const {
- if (comp_type == CompType::COMPARE) {
- return comp_func(a, b);
- } else {
- return comp_func_r(a, b, arg);
- }
- }
-};
-
class Array {
- uint8_t *array;
- size_t array_size;
+ uint8_t *array_base;
+ size_t array_len;
size_t elem_size;
- Comparator compare;
+
+ uint8_t *get_internal(size_t i) const { return array_base + (i * elem_size); }
public:
- Array(uint8_t *a, size_t s, size_t e, Comparator c)
- : array(a), array_size(s), elem_size(e), compare(c) {}
+ Array(uint8_t *a, size_t s, size_t e)
+ : array_base(a), array_len(s), elem_size(e) {}
- uint8_t *get(size_t i) const { return array + i * elem_size; }
+ inline void *get(size_t i) const {
+ return reinterpret_cast<void *>(get_internal(i));
+ }
void swap(size_t i, size_t j) const {
- uint8_t *elem_i = get(i);
- uint8_t *elem_j = get(j);
+ uint8_t *elem_i = get_internal(i);
+ uint8_t *elem_j = get_internal(j);
+
for (size_t b = 0; b < elem_size; ++b) {
uint8_t temp = elem_i[b];
elem_i[b] = elem_j[b];
@@ -78,30 +43,21 @@ class Array {
}
}
- int elem_compare(size_t i, const uint8_t *other) const {
- // An element must compare equal to itself so we don't need to consult the
- // user provided comparator.
- if (get(i) == other)
- return 0;
- return compare.comp_vals(get(i), other);
- }
-
- size_t size() const { return array_size; }
+ size_t len() const { return array_len; }
- // Make an Array starting at index |i| and size |s|.
- LIBC_INLINE Array make_array(size_t i, size_t s) const {
- return Array(get(i), s, elem_size, compare);
+ // Make an Array starting at index |i| and length |s|.
+ inline Array make_array(size_t i, size_t s) const {
+ return Array(get_internal(i), s, elem_size);
}
- // Reset this Array to point at a different interval of the same items.
- LIBC_INLINE void reset_bounds(uint8_t *a, size_t s) {
- array = a;
- array_size = s;
+ // Reset this Array to point at a different interval of the same
+ // items starting at index |i|.
+ inline void reset_bounds(size_t i, size_t s) {
+ array_base = get_internal(i);
+ array_len = s;
}
};
-using SortingRoutine = void(const Array &);
-
} // namespace internal
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/qsort_r.cpp b/libc/src/stdlib/qsort_r.cpp
index bf61a40e847341..530e3f5f58e19a 100644
--- a/libc/src/stdlib/qsort_r.cpp
+++ b/libc/src/stdlib/qsort_r.cpp
@@ -21,11 +21,14 @@ LLVM_LIBC_FUNCTION(void, qsort_r,
void *arg)) {
if (array == nullptr || array_size == 0 || elem_size == 0)
return;
- internal::Comparator c(compare, arg);
+
auto arr = internal::Array(reinterpret_cast<uint8_t *>(array), array_size,
- elem_size, c);
+ elem_size);
- internal::sort(arr);
+ internal::unstable_sort(
+ arr, [compare, arg](const void *a, const void *b) noexcept -> bool {
+ return compare(a, b, arg) < 0;
+ });
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/qsort_util.h b/libc/src/stdlib/qsort_util.h
index d42adde06d9762..0faf6e1afdec00 100644
--- a/libc/src/stdlib/qsort_util.h
+++ b/libc/src/stdlib/qsort_util.h
@@ -27,11 +27,13 @@
namespace LIBC_NAMESPACE_DECL {
namespace internal {
+template <typename F> void unstable_sort(Array array, const F &is_less) {
#if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT
-constexpr auto sort = quick_sort;
+ quick_sort(array, is_less);
#elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT
-constexpr auto sort = heap_sort;
+ heap_sort(array, is_less);
#endif
+}
} // namespace internal
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h
index 82b90a7d511d99..6972c233dde257 100644
--- a/libc/src/stdlib/quick_sort.h
+++ b/libc/src/stdlib/quick_sort.h
@@ -19,70 +19,78 @@ namespace LIBC_NAMESPACE_DECL {
namespace internal {
// A simple quicksort implementation using the Hoare partition scheme.
-LIBC_INLINE size_t partition(const Array &array) {
- const size_t array_size = array.size();
- size_t pivot_index = array_size / 2;
- uint8_t *pivot = array.get(pivot_index);
- size_t i = 0;
- size_t j = array_size - 1;
+template <typename F>
+size_t partition_hoare(const Array &array, const void *pivot,
+ const F &is_less) {
+ const size_t array_len = array.len();
+
+ size_t left = 0;
+ size_t right = array_len;
while (true) {
- int compare_i, compare_j;
-
- while ((compare_i = array.elem_compare(i, pivot)) < 0)
- ++i;
- while ((compare_j = array.elem_compare(j, pivot)) > 0)
- --j;
-
- // At some point i will crossover j so we will definitely break out of
- // this while loop.
- if (i >= j)
- return j + 1;
-
- array.swap(i, j);
-
- // The pivot itself might have got swapped so we will update the pivot.
- if (i == pivot_index) {
- pivot = array.get(j);
- pivot_index = j;
- } else if (j == pivot_index) {
- pivot = array.get(i);
- pivot_index = i;
+ while (left < right && is_less(array.get(left), pivot))
+ ++left;
+
+ while (true) {
+ --right;
+ if (left >= right || is_less(array.get(right), pivot)) {
+ break;
+ }
}
- if (compare_i == 0 && compare_j == 0) {
- // If we do not move the pointers, we will end up with an
- // infinite loop as i and j will be stuck without advancing.
- ++i;
- --j;
- }
+ if (left >= right)
+ break;
+
+ array.swap(left, right);
+ ++left;
}
+
+ return left;
}
-LIBC_INLINE void quick_sort(Array array) {
+template <typename F>
+size_t partition(const Array &array, size_t pivot_index, const F &is_less) {
+ // Place the pivot at the beginning of the array.
+ array.swap(0, pivot_index);
+
+ const Array array_without_pivot = array.make_array(1, array.len() - 1);
+ const void *pivot = array.get(0);
+ const size_t num_lt = partition_hoare(array_without_pivot, pivot, is_less);
+
+ // Place the pivot between the two partitions.
+ array.swap(0, num_lt);
+
+ return num_lt;
+}
+
+template <typename F> void quick_sort(Array &array, const F &is_less) {
while (true) {
- const size_t array_size = array.size();
- if (array_size <= 1)
+ const size_t array_len = array.len();
+ if (array_len <= 1)
return;
- size_t split_index = partition(array);
- if (array_size == 2)
+
+ const size_t pivot_index = array_len / 2;
+ size_t split_index = partition(array, pivot_index, is_less);
+
+ if (array_len == 2)
// The partition operation sorts the two element array.
return;
- // Make Arrays describing the two sublists that still need sorting.
+ // Split the array into `left`, `pivot`, and `right`.
Array left = array.make_array(0, split_index);
- Array right = array.make_array(split_index, array.size() - split_index);
+ const size_t right_start = split_index + 1;
+ Array right = array.make_array(right_start, array.len() - right_start);
// Recurse to sort the smaller of the two, and then loop round within this
// function to sort the larger. This way, recursive call depth is bounded
// by log2 of the total array size, because every recursive call is sorting
// a list at most half the length of the one in its caller.
- if (left.size() < right.size()) {
- quick_sort(left);
- array.reset_bounds(right.get(0), right.size());
+ if (left.len() < right.len()) {
+ quick_sort(left, is_less);
+ array.reset_bounds(right_start, right.len());
} else {
- quick_sort(right);
- array.reset_bounds(left.get(0), left.size());
+ quick_sort(right, is_less);
+ array.reset_bounds(0, left.len());
}
}
}
diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
index ff6034e43e9f6a..0044afed1d12c1 100644
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -300,18 +300,6 @@ add_libc_test(
libc.src.stdlib.bsearch
)
-add_libc_test(
- quick_sort_test
- SUITE
- libc-stdlib-tests
- SRCS
- quick_sort_test.cpp
- HDRS
- SortingTest.h
- DEPENDS
- libc.src.stdlib.qsort_util
-)
-
add_libc_test(
heap_sort_test
SUITE
@@ -321,7 +309,7 @@ add_libc_test(
HDRS
SortingTest.h
DEPENDS
- libc.src.stdlib.qsort_util
+ libc.src.stdlib.qsort
)
add_libc_test(
diff --git a/libc/test/src/stdlib/SortingTest.h b/libc/test/src/stdlib/SortingTest.h
index d34584e5addf03..cba16566d89e79 100644
--- a/libc/test/src/stdlib/SortingTest.h
+++ b/libc/test/src/stdlib/SortingTest.h
@@ -7,19 +7,18 @@
//===----------------------------------------------------------------------===//
#include "src/__support/macros/config.h"
-#include "src/stdlib/qsort_data.h"
+#include "src/stdlib/qsort.h"
#include "test/UnitTest/Test.h"
class SortingTest : public LIBC_NAMESPACE::testing::Test {
- using Array = LIBC_NAMESPACE::internal::Array;
- using Comparator = LIBC_NAMESPACE::internal::Comparator;
- using SortingRoutine = LIBC_NAMESPACE::internal::SortingRoutine;
+ using SortFn = void (*)(void *array, size_t array_size, size_t elem_size,
+ int (*compare)(const void *, const void *));
-public:
static int int_compare(const void *l, const void *r) {
int li = *reinterpret_cast<const int *>(l);
int ri = *reinterpret_cast<const int *>(r);
+
if (li == ri)
return 0;
else if (li > ri)
@@ -28,16 +27,19 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
return -1;
}
- void test_sorted_array(SortingRoutine sort_func) {
+ static void int_sort(SortFn sort_fn, int *array, size_t array_len) {
+ sort_fn(reinterpret_cast<void *>(array), array_len, sizeof(int),
+ int_compare);
+ }
+
+public:
+ void test_sorted_array(SortFn sort_fn) {
int array[25] = {10, 23, 33, 35, 55, 70, 71, 100, 110,
123, 133, 135, 155, 170, 171, 1100, 1110, 1123,
1133, 1135, 1155, 1170, 1171, 11100, 12310};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- sort_func(arr);
+ int_sort(sort_fn, array, ARRAY_LEN);
ASSERT_LE(array[0], 10);
ASSERT_LE(array[1], 23);
@@ -66,45 +68,36 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
ASSERT_LE(array[24], 12310);
}
- void test_reversed_sorted_array(SortingRoutine sort_func) {
+ void test_reversed_sorted_array(SortFn sort_fn) {
int array[] = {25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13,
12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
+ int_sort(sort_fn, array, ARRAY_LEN);
- sort_func(arr);
-
- for (int i = 0; i < int(ARRAY_SIZE - 1); ++i)
+ for (int i = 0; i < int(ARRAY_LEN - 1); ++i)
ASSERT_EQ(array[i], i + 1);
}
- void test_all_equal_elements(SortingRoutine sort_func) {
+ void test_all_equal_elements(SortFn sort_fn) {
int array[] = {100, 100, 100, 100, 100, 100, 100, 100, 100,
100, 100, 100, 100, 100, 100, 100, 100, 100,
100, 100, 100, 100, 100, 100, 100};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- sort_func(arr);
+ int_sort(sort_fn, array, ARRAY_LEN);
- for (size_t i = 0; i < ARRAY_SIZE; ++i)
+ for (size_t i = 0; i < ARRAY_LEN; ++i)
ASSERT_EQ(array[i], 100);
}
- void test_unsorted_array_1(SortingRoutine sort_func) {
+ void test_unsorted_array_1(SortFn sort_fn) {
int array[25] = {10, 23, 8, 35, 55, 45, 40, 100, 110,
123, 90, 80, 70, 60, 171, 11, 1, -1,
-5, -10, 1155, 1170, 1171, 12, -100};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- sort_func(arr);
+ int_sort(sort_fn, array, ARRAY_LEN);
ASSERT_EQ(array[0], -100);
ASSERT_EQ(array[1], -10);
@@ -133,14 +126,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
ASSERT_EQ(array[24], 1171);
}
- void test_unsorted_array_2(SortingRoutine sort_func) {
+ void test_unsorted_array_2(SortFn sort_fn) {
int array[7] = {10, 40, 45, 55, 35, 23, 60};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
-
- sort_func(arr);
+ int_sort(sort_fn, array, ARRAY_LEN);
ASSERT_EQ(array[0], 10);
ASSERT_EQ(array[1], 23);
@@ -151,14 +141,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
ASSERT_EQ(array[6], 60);
}
- void test_unsorted_array_duplicated_1(SortingRoutine sort_func) {
+ void test_unsorted_array_duplicated_1(SortFn sort_fn) {
int array[6] = {10, 10, 20, 20, 5, 5};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- sort_func(arr);
+ int_sort(sort_fn, array, ARRAY_LEN);
ASSERT_EQ(array[0], 5);
ASSERT_EQ(array[1], 5);
@@ -168,14 +155,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
ASSERT_EQ(array[5], 20);
}
- void test_unsorted_array_duplicated_2(SortingRoutine sort_func) {
+ void test_unsorted_array_duplicated_2(SortFn sort_fn) {
int array[10] = {20, 10, 10, 10, 10, 20, 21, 21, 21, 21};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
-
- sort_func(arr);
+ int_sort(sort_fn, array, ARRAY_LEN);
ASSERT_EQ(array[0], 10);
ASSERT_EQ(array[1], 10);
@@ -189,14 +173,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
ASSERT_EQ(array[9], 21);
}
- void test_unsorted_array_duplicated_3(SortingRoutine sort_func) {
+ void test_unsorted_array_duplicated_3(SortFn sort_fn) {
int array[10] = {20, 30, 30, 30, 30, 20, 21, 21, 21, 21};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- sort_func(arr);
+ int_sort(sort_fn, array, ARRAY_LEN);
ASSERT_EQ(array[0], 20);
ASSERT_EQ(array[1], 20);
@@ -210,117 +191,93 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
ASSERT_EQ(array[9], 30);
}
- void test_unsorted_three_element_1(SortingRoutine sort_func) {
+ void test_unsorted_three_element_1(SortFn sort_fn) {
int array[3] = {14999024, 0, 3};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- sort_func(arr);
+ int_sort(sort_fn, array, ARRAY_LEN);
ASSERT_EQ(array[0], 0);
ASSERT_EQ(array[1], 3);
ASSERT_EQ(array[2], 14999024);
}
- void test_unsorted_three_element_2(SortingRoutine sort_func) {
+ void test_unsorted_three_element_2(SortFn sort_fn) {
int array[3] = {3, 14999024, 0};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
-
- sort_func(arr);
+ int_sort(sort_fn, array, ARRAY_LEN);
ASSERT_EQ(array[0], 0);
ASSERT_EQ(array[1], 3);
ASSERT_EQ(array[2], 14999024);
}
- void test_unsorted_three_element_3(SortingRoutine sort_func) {
+ void test_unsorted_three_element_3(SortFn sort_fn) {
int array[3] = {3, 0, 14999024};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- sort_func(arr);
+ int_sort(sort_fn, array, ARRAY_LEN);
ASSERT_EQ(array[0], 0);
ASSERT_EQ(array[1], 3);
ASSERT_EQ(array[2], 14999024);
}
- void test_same_three_element(SortingRoutine sort_func) {
+ void test_same_three_element(SortFn sort_fn) {
int array[3] = {12345, 12345, 12345};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
-
- sort_func(arr);
+ int_sort(sort_fn, array, ARRAY_LEN);
ASSERT_EQ(array[0], 12345);
ASSERT_EQ(array[1], 12345);
ASSERT_EQ(array[2], 12345);
}
- void test_unsorted_two_element_1(SortingRoutine sort_func) {
+ void test_unsorted_two_element_1(SortFn sort_fn) {
int array[] = {14999024, 0};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- sort_func(arr);
+ int_sort(sort_fn, array, ARRAY_LEN);
ASSERT_EQ(array[0], 0);
ASSERT_EQ(array[1], 14999024);
}
- void test_unsorted_two_element_2(SortingRoutine sort_func) {
+ void test_unsorted_two_element_2(SortFn sort_fn) {
int array[] = {0, 14999024};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- sort_func(arr);
+ int_sort(sort_fn, array, ARRAY_LEN);
ASSERT_EQ(array[0], 0);
ASSERT_EQ(array[1], 14999024);
}
- void test_same_two_element(SortingRoutine sort_func) {
+ void test_same_two_element(SortFn sort_fn) {
int array[] = {12345, 12345};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
-
- sort_func(arr);
+ int_sort(sort_fn, array, ARRAY_LEN);
ASSERT_EQ(array[0], 12345);
ASSERT_EQ(array[1], 12345);
}
- void test_single_element(SortingRoutine sort_func) {
+ void test_single_element(SortFn sort_fn) {
int array[] = {12345};
- constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
- auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
- sizeof(int), Comparator(int_compare));
+ constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- sort_func(arr);
+ int_sort(sort_fn, array, ARRAY_LEN);
ASSERT_EQ(array[0], 12345);
}
diff --git a/libc/test/src/stdlib/heap_sort_test.cpp b/libc/test/src/stdlib/heap_sort_test.cpp
index d70e3dc2272beb..7223d9b23f98a9 100644
--- a/libc/test/src/stdlib/heap_sort_test.cpp
+++ b/libc/test/src/stdlib/heap_sort_test.cpp
@@ -7,10 +7,21 @@
//===----------------------------------------------------------------------===//
#include "SortingTest.h"
-#include "src/stdlib/heap_sort.h"
+#include "src/stdlib/qsort_util.h"
-void sort(const LIBC_NAMESPACE::internal::Array &array) {
- LIBC_NAMESPACE::internal::heap_sort(array);
+void heap_sort(void *array, size_t array_size, size_t elem_size,
+ int (*compare)(const void *, const void *)) {
+
+ if (array == nullptr || array_size == 0 || elem_size == 0)
+ return;
+
+ auto arr = LIBC_NAMESPACE::internal::Array(reinterpret_cast<uint8_t *>(array),
+ array_size, elem_size);
+
+ LIBC_NAMESPACE::internal::heap_sort(
+ arr, [compare](const void *a, const void *b) noexcept -> bool {
+ return compare(a, b) < 0;
+ });
}
-LIST_SORTING_TESTS(HeapSort, sort);
+LIST_SORTING_TESTS(HeapSort, heap_sort);
diff --git a/libc/test/src/stdlib/qsort_r_test.cpp b/libc/test/src/stdlib/qsort_r_test.cpp
index 6893fdc7b74c82..f18923618ed5ee 100644
--- a/libc/test/src/stdlib/qsort_r_test.cpp
+++ b/libc/test/src/stdlib/qsort_r_test.cpp
@@ -62,9 +62,9 @@ TEST(LlvmLibcQsortRTest, SortedArray) {
ASSERT_LE(array[23], 11100);
ASSERT_LE(array[24], 12310);
- // This is a sorted list, but there still have to have been at least N
+ // This is a sorted list, but there still have to have been at least N - 1
// comparisons made.
- ASSERT_GE(count, ARRAY_SIZE);
+ ASSERT_GE(count, ARRAY_SIZE - 1);
}
TEST(LlvmLibcQsortRTest, ReverseSortedArray) {
diff --git a/libc/test/src/stdlib/qsort_test.cpp b/libc/test/src/stdlib/qsort_test.cpp
index 1e921a86fd1fd3..e89a092d37ccc9 100644
--- a/libc/test/src/stdlib/qsort_test.cpp
+++ b/libc/test/src/stdlib/qsort_test.cpp
@@ -7,11 +7,20 @@
//===----------------------------------------------------------------------===//
#include "SortingTest.h"
-#include "src/stdlib/qsort.h"
+#include "src/stdlib/qsort_util.h"
-void sort(const LIBC_NAMESPACE::internal::Array &array) {
- LIBC_NAMESPACE::qsort(reinterpret_cast<void *>(array.get(0)), array.size(),
- sizeof(int), SortingTest::int_compare);
+void quick_sort(void *array, size_t array_size, size_t elem_size,
+ int (*compare)(const void *, const void *)) {
+ if (array == nullptr || array_size == 0 || elem_size == 0)
+ return;
+
+ auto arr = LIBC_NAMESPACE::internal::Array(reinterpret_cast<uint8_t *>(array),
+ array_size, elem_size);
+
+ LIBC_NAMESPACE::internal::quick_sort(
+ arr, [compare](const void *a, const void *b) noexcept -> bool {
+ return compare(a, b) < 0;
+ });
}
-LIST_SORTING_TESTS(Qsort, sort);
+LIST_SORTING_TESTS(Qsort, quick_sort);
diff --git a/libc/test/src/stdlib/quick_sort_test.cpp b/libc/test/src/stdlib/quick_sort_test.cpp
deleted file mode 100644
index d6bf77ebfd40d7..00000000000000
--- a/libc/test/src/stdlib/quick_sort_test.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Unittests for quick sort ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SortingTest.h"
-#include "src/stdlib/quick_sort.h"
-
-void sort(const LIBC_NAMESPACE::internal::Array &array) {
- LIBC_NAMESPACE::internal::quick_sort(array);
-}
-
-LIST_SORTING_TESTS(QuickSort, sort);
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
index e4b4b075705e8a..a6dac1f82b9f9f 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
@@ -130,21 +130,13 @@ libc_test(
],
)
-libc_test(
- name = "quick_sort_test",
- srcs = ["quick_sort_test.cpp"],
- deps = [
- ":qsort_test_helper",
- "//libc:qsort_util",
- ],
-)
-
libc_test(
name = "heap_sort_test",
srcs = ["heap_sort_test.cpp"],
+ libc_function_deps = ["//libc:qsort"],
deps = [
":qsort_test_helper",
- "//libc:qsort_util",
+ "//libc:types_size_t",
],
)
>From 5dfe9bb0af16b960d6ee5e4990a6fa86d4ee4cf2 Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll <lukas.bergdoll at gmail.com>
Date: Sat, 14 Dec 2024 15:33:11 +0100
Subject: [PATCH 02/10] Use higher quality pivot selection
This commit ports the glidesort recursive median pivot selection. Without a
dedicated small-sort this pivot selection is worse for cheap to compare types,
but it is a precursor to optimally leverage pdqsort style equal element
filtering.
---
libc/src/stdlib/qsort_data.h | 17 ++++---
libc/src/stdlib/qsort_pivot.h | 93 +++++++++++++++++++++++++++++++++++
libc/src/stdlib/quick_sort.h | 6 +--
3 files changed, 105 insertions(+), 11 deletions(-)
create mode 100644 libc/src/stdlib/qsort_pivot.h
diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h
index f296c26db4fd6b..affff3da25cdaa 100644
--- a/libc/src/stdlib/qsort_data.h
+++ b/libc/src/stdlib/qsort_data.h
@@ -16,23 +16,24 @@
namespace LIBC_NAMESPACE_DECL {
namespace internal {
-
class Array {
uint8_t *array_base;
size_t array_len;
size_t elem_size;
- uint8_t *get_internal(size_t i) const { return array_base + (i * elem_size); }
+ uint8_t *get_internal(size_t i) const noexcept {
+ return array_base + (i * elem_size);
+ }
public:
- Array(uint8_t *a, size_t s, size_t e)
+ Array(uint8_t *a, size_t s, size_t e) noexcept
: array_base(a), array_len(s), elem_size(e) {}
- inline void *get(size_t i) const {
+ inline void *get(size_t i) const noexcept {
return reinterpret_cast<void *>(get_internal(i));
}
- void swap(size_t i, size_t j) const {
+ void swap(size_t i, size_t j) const noexcept {
uint8_t *elem_i = get_internal(i);
uint8_t *elem_j = get_internal(j);
@@ -43,16 +44,16 @@ class Array {
}
}
- size_t len() const { return array_len; }
+ size_t len() const noexcept { return array_len; }
// Make an Array starting at index |i| and length |s|.
- inline Array make_array(size_t i, size_t s) const {
+ inline Array make_array(size_t i, size_t s) const noexcept {
return Array(get_internal(i), s, elem_size);
}
// Reset this Array to point at a different interval of the same
// items starting at index |i|.
- inline void reset_bounds(size_t i, size_t s) {
+ inline void reset_bounds(size_t i, size_t s) noexcept {
array_base = get_internal(i);
array_len = s;
}
diff --git a/libc/src/stdlib/qsort_pivot.h b/libc/src/stdlib/qsort_pivot.h
new file mode 100644
index 00000000000000..8411a324f737ec
--- /dev/null
+++ b/libc/src/stdlib/qsort_pivot.h
@@ -0,0 +1,93 @@
+//===-- Implementation header for qsort utilities ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
+#define LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
+
+#include "qsort_data.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+// Recursively select a pseudomedian if above this threshold.
+constexpr size_t PSEUDO_MEDIAN_REC_THRESHOLD = 64;
+
+// Selects a pivot from `array`. Algorithm taken from glidesort by Orson Peters.
+//
+// This chooses a pivot by sampling an adaptive amount of points, approximating
+// the quality of a median of sqrt(n) elements.
+template <typename F>
+size_t choose_pivot(const Array &array, const F &is_less) {
+ const size_t len = array.len();
+
+ if (len < 8) {
+ return 0;
+ }
+
+ const size_t len_div_8 = len / 8;
+
+ const size_t a = 0; // [0, floor(n/8))
+ const size_t b = len_div_8 * 4; // [4*floor(n/8), 5*floor(n/8))
+ const size_t c = len_div_8 * 7; // [7*floor(n/8), 8*floor(n/8))
+
+ if (len < PSEUDO_MEDIAN_REC_THRESHOLD) {
+ return median3(array, a, b, c, is_less);
+ } else {
+ return median3_rec(array, a, b, c, len_div_8, is_less);
+ }
+}
+
+// Calculates an approximate median of 3 elements from sections a, b, c, or
+// recursively from an approximation of each, if they're large enough. By
+// dividing the size of each section by 8 when recursing we have logarithmic
+// recursion depth and overall sample from f(n) = 3*f(n/8) -> f(n) =
+// O(n^(log(3)/log(8))) ~= O(n^0.528) elements.
+template <typename F>
+size_t median3_rec(const Array &array, size_t a, size_t b, size_t c, size_t n,
+ const F &is_less) {
+ if (n * 8 >= PSEUDO_MEDIAN_REC_THRESHOLD) {
+ const size_t n8 = n / 8;
+ a = median3_rec(array, a, a + (n8 * 4), a + (n8 * 7), n8, is_less);
+ b = median3_rec(array, b, b + (n8 * 4), b + (n8 * 7), n8, is_less);
+ c = median3_rec(array, c, c + (n8 * 4), c + (n8 * 7), n8, is_less);
+ }
+ return median3(array, a, b, c, is_less);
+}
+
+/// Calculates the median of 3 elements.
+template <typename F>
+size_t median3(const Array &array, size_t a, size_t b, size_t c,
+ const F &is_less) {
+ const void *a_ptr = array.get(a);
+ const void *b_ptr = array.get(b);
+ const void *c_ptr = array.get(c);
+
+ const bool x = is_less(a_ptr, b_ptr);
+ const bool y = is_less(a_ptr, c_ptr);
+ if (x == y) {
+ // If x=y=0 then b, c <= a. In this case we want to return max(b, c).
+ // If x=y=1 then a < b, c. In this case we want to return min(b, c).
+ // By toggling the outcome of b < c using XOR x we get this behavior.
+ const bool z = is_less(b_ptr, c_ptr);
+ if (z ^ x) {
+ return c;
+ } else {
+ return b;
+ }
+ } else {
+ // Either c <= a < b or b <= a < c, thus a is our median.
+ return a;
+ }
+}
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h
index 6972c233dde257..a21826681ff2c1 100644
--- a/libc/src/stdlib/quick_sort.h
+++ b/libc/src/stdlib/quick_sort.h
@@ -9,9 +9,9 @@
#ifndef LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
#define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
-#include "src/__support/macros/attributes.h"
+#include "src/__support/CPP/cstddef.h"
#include "src/__support/macros/config.h"
-#include "src/stdlib/qsort_data.h"
+#include "src/stdlib/qsort_pivot.h"
#include <stdint.h>
@@ -69,7 +69,7 @@ template <typename F> void quick_sort(Array &array, const F &is_less) {
if (array_len <= 1)
return;
- const size_t pivot_index = array_len / 2;
+ const size_t pivot_index = choose_pivot(array, is_less);
size_t split_index = partition(array, pivot_index, is_less);
if (array_len == 2)
>From ddf07964b9770513e58ddf5f5f20adc46774fa50 Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll <lukas.bergdoll at gmail.com>
Date: Sat, 14 Dec 2024 15:52:46 +0100
Subject: [PATCH 03/10] Use pdqsort style equal element filtering
This commits the technique introduced by Orson Peters in pdqsort to efficiently
handle inputs with repeated elements for example as found in zipfian or low
cardinality distributions. The random_z1 pattern sees a ~3.5x improvement in
performance for input length 10k across the different types. This change implies
O(N x log(K)) for inputs with K distinct elements.
---
libc/src/stdlib/quick_sort.h | 36 +++++++++++++++++++++++++++++++++---
1 file changed, 33 insertions(+), 3 deletions(-)
diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h
index a21826681ff2c1..91a8ba8c524c1d 100644
--- a/libc/src/stdlib/quick_sort.h
+++ b/libc/src/stdlib/quick_sort.h
@@ -63,13 +63,36 @@ size_t partition(const Array &array, size_t pivot_index, const F &is_less) {
return num_lt;
}
-template <typename F> void quick_sort(Array &array, const F &is_less) {
+template <typename F>
+void quick_sort_impl(Array &array, const void *ancestor_pivot,
+ const F &is_less) {
while (true) {
const size_t array_len = array.len();
if (array_len <= 1)
return;
const size_t pivot_index = choose_pivot(array, is_less);
+
+ // If the chosen pivot is equal to the predecessor, then it's the smallest
+ // element in the slice. Partition the slice into elements equal to and
+ // elements greater than the pivot. This case is usually hit when the slice
+ // contains many duplicate elements.
+ if (ancestor_pivot) {
+ if (!is_less(ancestor_pivot, array.get(pivot_index))) {
+ const size_t num_lt =
+ partition(array, pivot_index,
+ [is_less](const void *a, const void *b) noexcept -> bool {
+ return !is_less(b, a);
+ });
+
+ // Continue sorting elements greater than the pivot. We know that
+ // `num_lt` cont
+ array.reset_bounds(num_lt + 1, array.len() - (num_lt + 1));
+ ancestor_pivot = nullptr;
+ continue;
+ }
+ }
+
size_t split_index = partition(array, pivot_index, is_less);
if (array_len == 2)
@@ -78,6 +101,7 @@ template <typename F> void quick_sort(Array &array, const F &is_less) {
// Split the array into `left`, `pivot`, and `right`.
Array left = array.make_array(0, split_index);
+ const void *pivot = array.get(split_index);
const size_t right_start = split_index + 1;
Array right = array.make_array(right_start, array.len() - right_start);
@@ -86,15 +110,21 @@ template <typename F> void quick_sort(Array &array, const F &is_less) {
// by log2 of the total array size, because every recursive call is sorting
// a list at most half the length of the one in its caller.
if (left.len() < right.len()) {
- quick_sort(left, is_less);
+ quick_sort_impl(left, ancestor_pivot, is_less);
array.reset_bounds(right_start, right.len());
+ ancestor_pivot = pivot;
} else {
- quick_sort(right, is_less);
+ quick_sort_impl(right, pivot, is_less);
array.reset_bounds(0, left.len());
}
}
}
+template <typename F> void quick_sort(Array &array, const F &is_less) {
+ const void *ancestor_pivot = nullptr;
+ quick_sort_impl(array, ancestor_pivot, is_less);
+}
+
} // namespace internal
} // namespace LIBC_NAMESPACE_DECL
>From 9e05a1f959570f6bf0c1f023f0a8ca157e4af8ee Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll <lukas.bergdoll at gmail.com>
Date: Mon, 16 Dec 2024 17:57:53 +0100
Subject: [PATCH 04/10] Use heapsort fallback to guarantee O(N x log(N)) worst
case performance
The combination of a high quality pivot selection function with equal element
filtering as added in the previous commit makes for quicksort that in practice
hardly ever hits quadratic run-time scaling, but malicious inputs and accidents
are possible. By limiting the recursion depth and switching to heapsort -
introsort the second i in idisort - we can guarantee a worst case expected time
to sort the data of O(N x log(N)).
---
libc/src/stdlib/quick_sort.h | 39 +++++++++++++++++++++++-------------
1 file changed, 25 insertions(+), 14 deletions(-)
diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h
index 91a8ba8c524c1d..3442fd7aa0f762 100644
--- a/libc/src/stdlib/quick_sort.h
+++ b/libc/src/stdlib/quick_sort.h
@@ -10,6 +10,7 @@
#define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
#include "src/__support/CPP/cstddef.h"
+#include "src/__support/big_int.h"
#include "src/__support/macros/config.h"
#include "src/stdlib/qsort_pivot.h"
@@ -64,13 +65,22 @@ size_t partition(const Array &array, size_t pivot_index, const F &is_less) {
}
template <typename F>
-void quick_sort_impl(Array &array, const void *ancestor_pivot,
+void quick_sort_impl(Array &array, const void *ancestor_pivot, size_t limit,
const F &is_less) {
while (true) {
const size_t array_len = array.len();
if (array_len <= 1)
return;
+ // If too many bad pivot choices were made, simply fall back to
+ // heapsort in order to guarantee `O(N x log(N))` worst-case.
+ if (limit == 0) {
+ heap_sort(array, is_less);
+ return;
+ }
+
+ limit -= 1;
+
const size_t pivot_index = choose_pivot(array, is_less);
// If the chosen pivot is equal to the predecessor, then it's the smallest
@@ -105,24 +115,25 @@ void quick_sort_impl(Array &array, const void *ancestor_pivot,
const size_t right_start = split_index + 1;
Array right = array.make_array(right_start, array.len() - right_start);
- // Recurse to sort the smaller of the two, and then loop round within this
- // function to sort the larger. This way, recursive call depth is bounded
- // by log2 of the total array size, because every recursive call is sorting
- // a list at most half the length of the one in its caller.
- if (left.len() < right.len()) {
- quick_sort_impl(left, ancestor_pivot, is_less);
- array.reset_bounds(right_start, right.len());
- ancestor_pivot = pivot;
- } else {
- quick_sort_impl(right, pivot, is_less);
- array.reset_bounds(0, left.len());
- }
+ // Recurse into the left side. We have a fixed recursion limit,
+ // testing shows no real benefit for recursing into the shorter
+ // side.
+ quick_sort_impl(left, ancestor_pivot, limit, is_less);
+
+ // Continue with the right side.
+ array = right;
+ ancestor_pivot = pivot;
}
}
+constexpr size_t ilog2(size_t n) { return cpp::bit_width(n) - 1; }
+
template <typename F> void quick_sort(Array &array, const F &is_less) {
const void *ancestor_pivot = nullptr;
- quick_sort_impl(array, ancestor_pivot, is_less);
+ // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
+ // The binary OR by one is used to eliminate the zero-check in the logarithm.
+ const size_t limit = 2 * ilog2((array.len() | 1));
+ quick_sort_impl(array, ancestor_pivot, limit, is_less);
}
} // namespace internal
>From 064678d7a6bec493671576c1e22a204bdbe6db9b Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll <lukas.bergdoll at gmail.com>
Date: Mon, 16 Dec 2024 20:51:18 +0100
Subject: [PATCH 05/10] Use fixed size array specialization
By itself this optimization only nets a ~1.2x improvement for types like i32,
u64 and f128, but it unlocks ~2x improvements for partitioning. The cost is a
modest increase in binary-size of libc, `unstable_sort` is only instantiated
twice, which raises the total from 2 to 8 instantiations in the library. But
since the fixed size instantiations have much cheaper swap routines they only
add ~1.5k each vs the ~4.5k generic instantiation. In effect binary size for
qsort and qsort_r goes from ~9k to ~18k.
---
libc/src/stdlib/heap_sort.h | 3 +-
libc/src/stdlib/qsort.cpp | 9 +--
libc/src/stdlib/qsort_data.h | 98 +++++++++++++++++++++++--
libc/src/stdlib/qsort_pivot.h | 15 ++--
libc/src/stdlib/qsort_r.cpp | 8 +-
libc/src/stdlib/qsort_util.h | 35 ++++++++-
libc/src/stdlib/quick_sort.h | 21 +++---
libc/test/src/stdlib/heap_sort_test.cpp | 4 +-
libc/test/src/stdlib/qsort_test.cpp | 11 +--
9 files changed, 152 insertions(+), 52 deletions(-)
diff --git a/libc/src/stdlib/heap_sort.h b/libc/src/stdlib/heap_sort.h
index 543c834c0197c7..de08984497782a 100644
--- a/libc/src/stdlib/heap_sort.h
+++ b/libc/src/stdlib/heap_sort.h
@@ -18,7 +18,8 @@ namespace internal {
// A simple in-place heapsort implementation.
// Follow the implementation in https://en.wikipedia.org/wiki/Heapsort.
-template <typename F> void heap_sort(const Array &array, const F &is_less) {
+template <typename A, typename F>
+void heap_sort(const A &array, const F &is_less) {
size_t end = array.len();
size_t start = end / 2;
diff --git a/libc/src/stdlib/qsort.cpp b/libc/src/stdlib/qsort.cpp
index ff55b1bd4455eb..1402f7963c389e 100644
--- a/libc/src/stdlib/qsort.cpp
+++ b/libc/src/stdlib/qsort.cpp
@@ -18,14 +18,9 @@ namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(void, qsort,
(void *array, size_t array_size, size_t elem_size,
int (*compare)(const void *, const void *))) {
- if (array == nullptr || array_size == 0 || elem_size == 0)
- return;
-
- auto arr = internal::Array(reinterpret_cast<uint8_t *>(array), array_size,
- elem_size);
-
internal::unstable_sort(
- arr, [compare](const void *a, const void *b) noexcept -> bool {
+ array, array_size, elem_size,
+ [compare](const void *a, const void *b) noexcept -> bool {
return compare(a, b) < 0;
});
}
diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h
index affff3da25cdaa..d06418050c1867 100644
--- a/libc/src/stdlib/qsort_data.h
+++ b/libc/src/stdlib/qsort_data.h
@@ -11,12 +11,33 @@
#include "src/__support/CPP/cstddef.h"
#include "src/__support/macros/config.h"
+#include "src/string/memory_utils/inline_memcpy.h"
+#include "src/string/memory_utils/inline_memmove.h"
#include <stdint.h>
namespace LIBC_NAMESPACE_DECL {
namespace internal {
-class Array {
+// Returns the max amount of bytes deemed reasonable - based on the target
+// settings - for use in local stack arrays.
+constexpr size_t max_stack_array_size() {
+ constexpr size_t ptr_diff_size = sizeof(ptrdiff_t);
+
+ if constexpr (ptr_diff_size >= 8) {
+ return 4096;
+ }
+
+ if constexpr (ptr_diff_size == 4) {
+ return 512;
+ }
+
+ // 8-bit platforms are just not gonna work well with libc, qsort
+ // won't be the problem.
+ // 16-bit platforms ought to be able to store 64 bytes on the stack.
+ return 64;
+}
+
+class ArrayGenericSize {
uint8_t *array_base;
size_t array_len;
size_t elem_size;
@@ -26,7 +47,7 @@ class Array {
}
public:
- Array(uint8_t *a, size_t s, size_t e) noexcept
+ ArrayGenericSize(uint8_t *a, size_t s, size_t e) noexcept
: array_base(a), array_len(s), elem_size(e) {}
inline void *get(size_t i) const noexcept {
@@ -34,21 +55,82 @@ class Array {
}
void swap(size_t i, size_t j) const noexcept {
+ // For sizes below this doing the extra function call is not
+ // worth it.
+ constexpr size_t MIN_MEMCPY_SIZE = 32;
+
+ constexpr size_t STACK_ARRAY_SIZE = max_stack_array_size();
+ alignas(32) uint8_t tmp[STACK_ARRAY_SIZE];
+
uint8_t *elem_i = get_internal(i);
uint8_t *elem_j = get_internal(j);
- for (size_t b = 0; b < elem_size; ++b) {
- uint8_t temp = elem_i[b];
- elem_i[b] = elem_j[b];
- elem_j[b] = temp;
+ if (elem_size >= MIN_MEMCPY_SIZE && elem_size <= STACK_ARRAY_SIZE) {
+ // Block copies are much more efficient, even if `elem_size`
+ // is unknown once `elem_size` passes a certain CPU specific
+ // threshold.
+ inline_memcpy(tmp, elem_i, elem_size);
+ inline_memmove(elem_i, elem_j, elem_size);
+ inline_memcpy(elem_j, tmp, elem_size);
+ } else {
+ for (size_t b = 0; b < elem_size; ++b) {
+ uint8_t temp = elem_i[b];
+ elem_i[b] = elem_j[b];
+ elem_j[b] = temp;
+ }
}
}
size_t len() const noexcept { return array_len; }
// Make an Array starting at index |i| and length |s|.
- inline Array make_array(size_t i, size_t s) const noexcept {
- return Array(get_internal(i), s, elem_size);
+ inline ArrayGenericSize make_array(size_t i, size_t s) const noexcept {
+ return ArrayGenericSize(get_internal(i), s, elem_size);
+ }
+
+ // Reset this Array to point at a different interval of the same
+ // items starting at index |i|.
+ inline void reset_bounds(size_t i, size_t s) noexcept {
+ array_base = get_internal(i);
+ array_len = s;
+ }
+};
+
+// Having a specialized Array type for sorting that knowns at
+// compile-time what the size of the element is, allows for much more
+// efficient swapping and for cheaper offset calculations.
+template <size_t ELEM_SIZE> class ArrayFixedSize {
+ uint8_t *array_base;
+ size_t array_len;
+
+ uint8_t *get_internal(size_t i) const noexcept {
+ return array_base + (i * ELEM_SIZE);
+ }
+
+public:
+ ArrayFixedSize(uint8_t *a, size_t s) noexcept : array_base(a), array_len(s) {}
+
+ inline void *get(size_t i) const noexcept {
+ return reinterpret_cast<void *>(get_internal(i));
+ }
+
+ void swap(size_t i, size_t j) const noexcept {
+ alignas(32) uint8_t tmp[ELEM_SIZE];
+
+ uint8_t *elem_i = get_internal(i);
+ uint8_t *elem_j = get_internal(j);
+
+ inline_memcpy(tmp, elem_i, ELEM_SIZE);
+ inline_memmove(elem_i, elem_j, ELEM_SIZE);
+ inline_memcpy(elem_j, tmp, ELEM_SIZE);
+ }
+
+ size_t len() const noexcept { return array_len; }
+
+ // Make an Array starting at index |i| and length |s|.
+ inline ArrayFixedSize<ELEM_SIZE> make_array(size_t i,
+ size_t s) const noexcept {
+ return ArrayFixedSize<ELEM_SIZE>(get_internal(i), s);
}
// Reset this Array to point at a different interval of the same
diff --git a/libc/src/stdlib/qsort_pivot.h b/libc/src/stdlib/qsort_pivot.h
index 8411a324f737ec..68716d01c47a0b 100644
--- a/libc/src/stdlib/qsort_pivot.h
+++ b/libc/src/stdlib/qsort_pivot.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
#define LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
-#include "qsort_data.h"
+#include "src/stdlib/qsort_pivot.h"
#include <stdint.h>
@@ -23,8 +23,8 @@ constexpr size_t PSEUDO_MEDIAN_REC_THRESHOLD = 64;
//
// This chooses a pivot by sampling an adaptive amount of points, approximating
// the quality of a median of sqrt(n) elements.
-template <typename F>
-size_t choose_pivot(const Array &array, const F &is_less) {
+template <typename A, typename F>
+size_t choose_pivot(const A &array, const F &is_less) {
const size_t len = array.len();
if (len < 8) {
@@ -49,8 +49,8 @@ size_t choose_pivot(const Array &array, const F &is_less) {
// dividing the size of each section by 8 when recursing we have logarithmic
// recursion depth and overall sample from f(n) = 3*f(n/8) -> f(n) =
// O(n^(log(3)/log(8))) ~= O(n^0.528) elements.
-template <typename F>
-size_t median3_rec(const Array &array, size_t a, size_t b, size_t c, size_t n,
+template <typename A, typename F>
+size_t median3_rec(const A &array, size_t a, size_t b, size_t c, size_t n,
const F &is_less) {
if (n * 8 >= PSEUDO_MEDIAN_REC_THRESHOLD) {
const size_t n8 = n / 8;
@@ -62,9 +62,8 @@ size_t median3_rec(const Array &array, size_t a, size_t b, size_t c, size_t n,
}
/// Calculates the median of 3 elements.
-template <typename F>
-size_t median3(const Array &array, size_t a, size_t b, size_t c,
- const F &is_less) {
+template <typename A, typename F>
+size_t median3(const A &array, size_t a, size_t b, size_t c, const F &is_less) {
const void *a_ptr = array.get(a);
const void *b_ptr = array.get(b);
const void *c_ptr = array.get(c);
diff --git a/libc/src/stdlib/qsort_r.cpp b/libc/src/stdlib/qsort_r.cpp
index 530e3f5f58e19a..1d340560bcbe12 100644
--- a/libc/src/stdlib/qsort_r.cpp
+++ b/libc/src/stdlib/qsort_r.cpp
@@ -19,14 +19,10 @@ LLVM_LIBC_FUNCTION(void, qsort_r,
(void *array, size_t array_size, size_t elem_size,
int (*compare)(const void *, const void *, void *),
void *arg)) {
- if (array == nullptr || array_size == 0 || elem_size == 0)
- return;
-
- auto arr = internal::Array(reinterpret_cast<uint8_t *>(array), array_size,
- elem_size);
internal::unstable_sort(
- arr, [compare, arg](const void *a, const void *b) noexcept -> bool {
+ array, array_size, elem_size,
+ [compare, arg](const void *a, const void *b) noexcept -> bool {
return compare(a, b, arg) < 0;
});
}
diff --git a/libc/src/stdlib/qsort_util.h b/libc/src/stdlib/qsort_util.h
index 0faf6e1afdec00..ee3f737926f530 100644
--- a/libc/src/stdlib/qsort_util.h
+++ b/libc/src/stdlib/qsort_util.h
@@ -27,7 +27,7 @@
namespace LIBC_NAMESPACE_DECL {
namespace internal {
-template <typename F> void unstable_sort(Array array, const F &is_less) {
+template <typename A, typename F> void sort_inst(A &array, const F &is_less) {
#if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT
quick_sort(array, is_less);
#elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT
@@ -35,6 +35,39 @@ template <typename F> void unstable_sort(Array array, const F &is_less) {
#endif
}
+template <typename F>
+void unstable_sort(void *array, size_t array_len, size_t elem_size,
+ const F &is_less) {
+ if (array == nullptr || array_len == 0 || elem_size == 0) {
+ return;
+ }
+
+ uint8_t *array_base = reinterpret_cast<uint8_t *>(array);
+
+ switch (elem_size) {
+ case 4: {
+ auto arr_fixed_size = internal::ArrayFixedSize<4>(array_base, array_len);
+ sort_inst(arr_fixed_size, is_less);
+ return;
+ }
+ case 8: {
+ auto arr_fixed_size = internal::ArrayFixedSize<8>(array_base, array_len);
+ sort_inst(arr_fixed_size, is_less);
+ return;
+ }
+ case 16: {
+ auto arr_fixed_size = internal::ArrayFixedSize<16>(array_base, array_len);
+ sort_inst(arr_fixed_size, is_less);
+ return;
+ }
+ default:
+ auto arr_generic_size =
+ internal::ArrayGenericSize(array_base, array_len, elem_size);
+ sort_inst(arr_generic_size, is_less);
+ return;
+ }
+}
+
} // namespace internal
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h
index 3442fd7aa0f762..ace450d6cf805d 100644
--- a/libc/src/stdlib/quick_sort.h
+++ b/libc/src/stdlib/quick_sort.h
@@ -20,9 +20,8 @@ namespace LIBC_NAMESPACE_DECL {
namespace internal {
// A simple quicksort implementation using the Hoare partition scheme.
-template <typename F>
-size_t partition_hoare(const Array &array, const void *pivot,
- const F &is_less) {
+template <typename A, typename F>
+size_t partition_hoare(const A &array, const void *pivot, const F &is_less) {
const size_t array_len = array.len();
size_t left = 0;
@@ -49,12 +48,12 @@ size_t partition_hoare(const Array &array, const void *pivot,
return left;
}
-template <typename F>
-size_t partition(const Array &array, size_t pivot_index, const F &is_less) {
+template <typename A, typename F>
+size_t partition(const A &array, size_t pivot_index, const F &is_less) {
// Place the pivot at the beginning of the array.
array.swap(0, pivot_index);
- const Array array_without_pivot = array.make_array(1, array.len() - 1);
+ const A array_without_pivot = array.make_array(1, array.len() - 1);
const void *pivot = array.get(0);
const size_t num_lt = partition_hoare(array_without_pivot, pivot, is_less);
@@ -64,8 +63,8 @@ size_t partition(const Array &array, size_t pivot_index, const F &is_less) {
return num_lt;
}
-template <typename F>
-void quick_sort_impl(Array &array, const void *ancestor_pivot, size_t limit,
+template <typename A, typename F>
+void quick_sort_impl(A &array, const void *ancestor_pivot, size_t limit,
const F &is_less) {
while (true) {
const size_t array_len = array.len();
@@ -110,10 +109,10 @@ void quick_sort_impl(Array &array, const void *ancestor_pivot, size_t limit,
return;
// Split the array into `left`, `pivot`, and `right`.
- Array left = array.make_array(0, split_index);
+ A left = array.make_array(0, split_index);
const void *pivot = array.get(split_index);
const size_t right_start = split_index + 1;
- Array right = array.make_array(right_start, array.len() - right_start);
+ A right = array.make_array(right_start, array.len() - right_start);
// Recurse into the left side. We have a fixed recursion limit,
// testing shows no real benefit for recursing into the shorter
@@ -128,7 +127,7 @@ void quick_sort_impl(Array &array, const void *ancestor_pivot, size_t limit,
constexpr size_t ilog2(size_t n) { return cpp::bit_width(n) - 1; }
-template <typename F> void quick_sort(Array &array, const F &is_less) {
+template <typename A, typename F> void quick_sort(A &array, const F &is_less) {
const void *ancestor_pivot = nullptr;
// Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
// The binary OR by one is used to eliminate the zero-check in the logarithm.
diff --git a/libc/test/src/stdlib/heap_sort_test.cpp b/libc/test/src/stdlib/heap_sort_test.cpp
index 7223d9b23f98a9..43c8c69739615a 100644
--- a/libc/test/src/stdlib/heap_sort_test.cpp
+++ b/libc/test/src/stdlib/heap_sort_test.cpp
@@ -15,8 +15,8 @@ void heap_sort(void *array, size_t array_size, size_t elem_size,
if (array == nullptr || array_size == 0 || elem_size == 0)
return;
- auto arr = LIBC_NAMESPACE::internal::Array(reinterpret_cast<uint8_t *>(array),
- array_size, elem_size);
+ auto arr = LIBC_NAMESPACE::internal::ArrayGenericSize(
+ reinterpret_cast<uint8_t *>(array), array_size, elem_size);
LIBC_NAMESPACE::internal::heap_sort(
arr, [compare](const void *a, const void *b) noexcept -> bool {
diff --git a/libc/test/src/stdlib/qsort_test.cpp b/libc/test/src/stdlib/qsort_test.cpp
index e89a092d37ccc9..63cbb205c4e174 100644
--- a/libc/test/src/stdlib/qsort_test.cpp
+++ b/libc/test/src/stdlib/qsort_test.cpp
@@ -11,14 +11,9 @@
void quick_sort(void *array, size_t array_size, size_t elem_size,
int (*compare)(const void *, const void *)) {
- if (array == nullptr || array_size == 0 || elem_size == 0)
- return;
-
- auto arr = LIBC_NAMESPACE::internal::Array(reinterpret_cast<uint8_t *>(array),
- array_size, elem_size);
-
- LIBC_NAMESPACE::internal::quick_sort(
- arr, [compare](const void *a, const void *b) noexcept -> bool {
+ LIBC_NAMESPACE::internal::unstable_sort(
+ array, array_size, elem_size,
+ [compare](const void *a, const void *b) noexcept -> bool {
return compare(a, b) < 0;
});
}
>From 702fa77112a620c45c795454fd1d756eb47aaf7e Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll <lukas.bergdoll at gmail.com>
Date: Mon, 16 Dec 2024 22:17:04 +0100
Subject: [PATCH 06/10] Use branchless lomuto partition when elem size is known
at compile-time
Uses a simplified version of the branchless partition scheme used in ipnsort.
This achieves a ~2.5x perf improvement for i32, u64 and f128 for random
patterns. This is mainly achieved by avoiding branch misprediction penalties.
---
libc/src/stdlib/qsort_data.h | 12 +++++++++-
libc/src/stdlib/quick_sort.h | 44 +++++++++++++++++++++++++++++++++---
2 files changed, 52 insertions(+), 4 deletions(-)
diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h
index d06418050c1867..d2487e36333a5a 100644
--- a/libc/src/stdlib/qsort_data.h
+++ b/libc/src/stdlib/qsort_data.h
@@ -18,9 +18,12 @@
namespace LIBC_NAMESPACE_DECL {
namespace internal {
+
// Returns the max amount of bytes deemed reasonable - based on the target
-// settings - for use in local stack arrays.
+// properties - for use in local stack arrays.
constexpr size_t max_stack_array_size() {
+ // Uses target pointer size as heuristic how much memory is available and
+ // unlikely to run into stack overflow and perf problems.
constexpr size_t ptr_diff_size = sizeof(ptrdiff_t);
if constexpr (ptr_diff_size >= 8) {
@@ -50,6 +53,8 @@ class ArrayGenericSize {
ArrayGenericSize(uint8_t *a, size_t s, size_t e) noexcept
: array_base(a), array_len(s), elem_size(e) {}
+ static constexpr bool has_fixed_size() { return false; }
+
inline void *get(size_t i) const noexcept {
return reinterpret_cast<void *>(get_internal(i));
}
@@ -110,6 +115,11 @@ template <size_t ELEM_SIZE> class ArrayFixedSize {
public:
ArrayFixedSize(uint8_t *a, size_t s) noexcept : array_base(a), array_len(s) {}
+ // Beware this function is used a heuristic for cheap to swap types, so
+ // instantiating `ArrayFixedSize` with `ELEM_SIZE > 100` is probably a bad
+ // idea perf wise.
+ static constexpr bool has_fixed_size() { return true; }
+
inline void *get(size_t i) const noexcept {
return reinterpret_cast<void *>(get_internal(i));
}
diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h
index ace450d6cf805d..fddcd1a18282ba 100644
--- a/libc/src/stdlib/quick_sort.h
+++ b/libc/src/stdlib/quick_sort.h
@@ -19,9 +19,38 @@
namespace LIBC_NAMESPACE_DECL {
namespace internal {
-// A simple quicksort implementation using the Hoare partition scheme.
+// Branchless Lomuto partition based on the implementation by Lukas
+// Bergdoll and Orson Peters
+// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md.
+// Simplified to avoid having to stack allocate.
template <typename A, typename F>
-size_t partition_hoare(const A &array, const void *pivot, const F &is_less) {
+size_t partition_lomuto_branchless(const A &array, const void *pivot,
+ const F &is_less) {
+ const size_t array_len = array.len();
+
+ size_t left = 0;
+ size_t right = 0;
+
+ while (right < array_len) {
+ const bool right_is_lt = is_less(array.get(right), pivot);
+ array.swap(left, right);
+ left += static_cast<size_t>(right_is_lt);
+ right += 1;
+ }
+
+ return left;
+}
+
+// Optimized for large types that are expensive to move. Not optimized
+// for integers. It's possible to use a cyclic permutation here for
+// large types as done in ipnsort but the advantages of this are limited
+// as `is_less` is a small wrapper around a call to a function pointer
+// and won't incur much binary-size overhead. The other reason to use
+// cyclic permutation is to have more efficient swapping, but we don't
+// know the element size so this isn't applicable here either.
+template <typename A, typename F>
+size_t partition_hoare_branchy(const A &array, const void *pivot,
+ const F &is_less) {
const size_t array_len = array.len();
size_t left = 0;
@@ -55,7 +84,16 @@ size_t partition(const A &array, size_t pivot_index, const F &is_less) {
const A array_without_pivot = array.make_array(1, array.len() - 1);
const void *pivot = array.get(0);
- const size_t num_lt = partition_hoare(array_without_pivot, pivot, is_less);
+
+ size_t num_lt;
+ if constexpr (A::has_fixed_size()) {
+ // Branchless Lomuto avoid branch misprediction penalties, but
+ // it also swaps more often which only is faster if the swap a
+ // constant operation.
+ num_lt = partition_lomuto_branchless(array_without_pivot, pivot, is_less);
+ } else {
+ num_lt = partition_hoare_branchy(array_without_pivot, pivot, is_less);
+ }
// Place the pivot between the two partitions.
array.swap(0, num_lt);
>From 2b1660affad3ada309b2e99499cf888cab5e65b8 Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll <lukas.bergdoll at gmail.com>
Date: Tue, 17 Dec 2024 11:13:23 +0100
Subject: [PATCH 07/10] Use more generic dynamic element size swap impl
The new impl is better for types < 100 bytes and worse for very large types like
1k, but still not bad, plus it has smaller binary-size footprint and avoids a
surprising performance cliff past `STACK_ARRAY_SIZE` as well as heuristics.
Also remove inline "hints" it's the wrong way to do it and the situation is
simple enough that the compiler should now best and it's the wrong way to hint
it anyway.
---
libc/src/stdlib/qsort_data.h | 62 +++++++++++++++++++++---------------
1 file changed, 36 insertions(+), 26 deletions(-)
diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h
index d2487e36333a5a..f7446ec2bb637d 100644
--- a/libc/src/stdlib/qsort_data.h
+++ b/libc/src/stdlib/qsort_data.h
@@ -55,47 +55,58 @@ class ArrayGenericSize {
static constexpr bool has_fixed_size() { return false; }
- inline void *get(size_t i) const noexcept {
+ void *get(size_t i) const noexcept {
return reinterpret_cast<void *>(get_internal(i));
}
void swap(size_t i, size_t j) const noexcept {
- // For sizes below this doing the extra function call is not
- // worth it.
- constexpr size_t MIN_MEMCPY_SIZE = 32;
-
- constexpr size_t STACK_ARRAY_SIZE = max_stack_array_size();
- alignas(32) uint8_t tmp[STACK_ARRAY_SIZE];
+ // It's possible to use 8 byte blocks with `uint64_t`, but that
+ // generates more machine code as the remainder loop gets
+ // unrolled, plus 4 byte operations are more likely to be
+ // efficient on a wider variety of hardware. On x86 LLVM tends
+ // to unroll the block loop again into 2 16 byte swaps per
+ // iteration which is another reason that 4 byte blocks yields
+ // good performance even for big types.
+ using block_t = uint32_t;
+ constexpr size_t BLOCK_SIZE = sizeof(block_t);
uint8_t *elem_i = get_internal(i);
uint8_t *elem_j = get_internal(j);
- if (elem_size >= MIN_MEMCPY_SIZE && elem_size <= STACK_ARRAY_SIZE) {
- // Block copies are much more efficient, even if `elem_size`
- // is unknown once `elem_size` passes a certain CPU specific
- // threshold.
- inline_memcpy(tmp, elem_i, elem_size);
- inline_memmove(elem_i, elem_j, elem_size);
- inline_memcpy(elem_j, tmp, elem_size);
- } else {
- for (size_t b = 0; b < elem_size; ++b) {
- uint8_t temp = elem_i[b];
- elem_i[b] = elem_j[b];
- elem_j[b] = temp;
- }
+ const size_t elem_size_rem = elem_size % BLOCK_SIZE;
+ const block_t *elem_i_block_end =
+ reinterpret_cast<block_t *>(elem_i + (elem_size - elem_size_rem));
+
+ block_t *elem_i_block = reinterpret_cast<block_t *>(elem_i);
+ block_t *elem_j_block = reinterpret_cast<block_t *>(elem_j);
+
+ while (elem_i_block != elem_i_block_end) {
+ block_t tmp = *elem_i_block;
+ *elem_i_block = *elem_j_block;
+ *elem_j_block = tmp;
+ elem_i_block += 1;
+ elem_j_block += 1;
+ }
+
+ elem_i = reinterpret_cast<uint8_t *>(elem_i_block);
+ elem_j = reinterpret_cast<uint8_t *>(elem_j_block);
+ for (size_t n = 0; n < elem_size_rem; ++n) {
+ uint8_t tmp = elem_i[n];
+ elem_i[n] = elem_j[n];
+ elem_j[n] = tmp;
}
}
size_t len() const noexcept { return array_len; }
// Make an Array starting at index |i| and length |s|.
- inline ArrayGenericSize make_array(size_t i, size_t s) const noexcept {
+ ArrayGenericSize make_array(size_t i, size_t s) const noexcept {
return ArrayGenericSize(get_internal(i), s, elem_size);
}
// Reset this Array to point at a different interval of the same
// items starting at index |i|.
- inline void reset_bounds(size_t i, size_t s) noexcept {
+ void reset_bounds(size_t i, size_t s) noexcept {
array_base = get_internal(i);
array_len = s;
}
@@ -120,7 +131,7 @@ template <size_t ELEM_SIZE> class ArrayFixedSize {
// idea perf wise.
static constexpr bool has_fixed_size() { return true; }
- inline void *get(size_t i) const noexcept {
+ void *get(size_t i) const noexcept {
return reinterpret_cast<void *>(get_internal(i));
}
@@ -138,14 +149,13 @@ template <size_t ELEM_SIZE> class ArrayFixedSize {
size_t len() const noexcept { return array_len; }
// Make an Array starting at index |i| and length |s|.
- inline ArrayFixedSize<ELEM_SIZE> make_array(size_t i,
- size_t s) const noexcept {
+ ArrayFixedSize<ELEM_SIZE> make_array(size_t i, size_t s) const noexcept {
return ArrayFixedSize<ELEM_SIZE>(get_internal(i), s);
}
// Reset this Array to point at a different interval of the same
// items starting at index |i|.
- inline void reset_bounds(size_t i, size_t s) noexcept {
+ void reset_bounds(size_t i, size_t s) noexcept {
array_base = get_internal(i);
array_len = s;
}
>From efb98579a7f56c721402d526086fa24a9cbc3110 Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll <lukas.bergdoll at gmail.com>
Date: Tue, 17 Dec 2024 12:04:17 +0100
Subject: [PATCH 08/10] Avoid unnecessary pivot swap for common recursion case
The pivot selection will pick 0 as the pivot index if the array length is < 8,
which is a common recursion leaf case. This isn't a big perf win more on the
order 1-2% but's it's trivial and more or less free.
---
libc/src/stdlib/quick_sort.h | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h
index fddcd1a18282ba..13cce7ee5d86cd 100644
--- a/libc/src/stdlib/quick_sort.h
+++ b/libc/src/stdlib/quick_sort.h
@@ -80,7 +80,9 @@ size_t partition_hoare_branchy(const A &array, const void *pivot,
template <typename A, typename F>
size_t partition(const A &array, size_t pivot_index, const F &is_less) {
// Place the pivot at the beginning of the array.
- array.swap(0, pivot_index);
+ if (pivot_index != 0) {
+ array.swap(0, pivot_index);
+ }
const A array_without_pivot = array.make_array(1, array.len() - 1);
const void *pivot = array.get(0);
>From aecca523598ef05189835ab4dddf8aa000870c3a Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll <lukas.bergdoll at gmail.com>
Date: Thu, 19 Dec 2024 17:35:20 +0100
Subject: [PATCH 09/10] Apply review comments
---
libc/src/stdlib/qsort.cpp | 9 +-
libc/src/stdlib/qsort_data.h | 79 ++++------
libc/src/stdlib/qsort_pivot.h | 7 +-
libc/src/stdlib/qsort_r.cpp | 9 +-
libc/src/stdlib/qsort_util.h | 77 +++++-----
libc/src/stdlib/quick_sort.h | 6 +-
libc/test/src/stdlib/CMakeLists.txt | 4 +-
libc/test/src/stdlib/SortingTest.h | 144 +++++++++++++-----
libc/test/src/stdlib/heap_sort_test.cpp | 11 +-
.../{qsort_test.cpp => quick_sort_test.cpp} | 4 +-
.../libc/test/src/stdlib/BUILD.bazel | 4 +-
11 files changed, 202 insertions(+), 152 deletions(-)
rename libc/test/src/stdlib/{qsort_test.cpp => quick_sort_test.cpp} (87%)
diff --git a/libc/src/stdlib/qsort.cpp b/libc/src/stdlib/qsort.cpp
index 1402f7963c389e..914297a0ddba9b 100644
--- a/libc/src/stdlib/qsort.cpp
+++ b/libc/src/stdlib/qsort.cpp
@@ -18,11 +18,10 @@ namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(void, qsort,
(void *array, size_t array_size, size_t elem_size,
int (*compare)(const void *, const void *))) {
- internal::unstable_sort(
- array, array_size, elem_size,
- [compare](const void *a, const void *b) noexcept -> bool {
- return compare(a, b) < 0;
- });
+ internal::unstable_sort(array, array_size, elem_size,
+ [compare](const void *a, const void *b) -> bool {
+ return compare(a, b) < 0;
+ });
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h
index f7446ec2bb637d..647239878add65 100644
--- a/libc/src/stdlib/qsort_data.h
+++ b/libc/src/stdlib/qsort_data.h
@@ -19,47 +19,25 @@
namespace LIBC_NAMESPACE_DECL {
namespace internal {
-// Returns the max amount of bytes deemed reasonable - based on the target
-// properties - for use in local stack arrays.
-constexpr size_t max_stack_array_size() {
- // Uses target pointer size as heuristic how much memory is available and
- // unlikely to run into stack overflow and perf problems.
- constexpr size_t ptr_diff_size = sizeof(ptrdiff_t);
-
- if constexpr (ptr_diff_size >= 8) {
- return 4096;
- }
-
- if constexpr (ptr_diff_size == 4) {
- return 512;
- }
-
- // 8-bit platforms are just not gonna work well with libc, qsort
- // won't be the problem.
- // 16-bit platforms ought to be able to store 64 bytes on the stack.
- return 64;
-}
-
class ArrayGenericSize {
- uint8_t *array_base;
+ cpp::byte *array_base;
size_t array_len;
size_t elem_size;
- uint8_t *get_internal(size_t i) const noexcept {
+ LIBC_INLINE cpp::byte *get_internal(size_t i) const {
return array_base + (i * elem_size);
}
public:
- ArrayGenericSize(uint8_t *a, size_t s, size_t e) noexcept
- : array_base(a), array_len(s), elem_size(e) {}
+ ArrayGenericSize(void *a, size_t s, size_t e)
+ : array_base(reinterpret_cast<cpp::byte *>(a)), array_len(s),
+ elem_size(e) {}
static constexpr bool has_fixed_size() { return false; }
- void *get(size_t i) const noexcept {
- return reinterpret_cast<void *>(get_internal(i));
- }
+ LIBC_INLINE void *get(size_t i) const { return get_internal(i); }
- void swap(size_t i, size_t j) const noexcept {
+ void swap(size_t i, size_t j) const {
// It's possible to use 8 byte blocks with `uint64_t`, but that
// generates more machine code as the remainder loop gets
// unrolled, plus 4 byte operations are more likely to be
@@ -70,8 +48,8 @@ class ArrayGenericSize {
using block_t = uint32_t;
constexpr size_t BLOCK_SIZE = sizeof(block_t);
- uint8_t *elem_i = get_internal(i);
- uint8_t *elem_j = get_internal(j);
+ cpp::byte *elem_i = get_internal(i);
+ cpp::byte *elem_j = get_internal(j);
const size_t elem_size_rem = elem_size % BLOCK_SIZE;
const block_t *elem_i_block_end =
@@ -88,74 +66,73 @@ class ArrayGenericSize {
elem_j_block += 1;
}
- elem_i = reinterpret_cast<uint8_t *>(elem_i_block);
- elem_j = reinterpret_cast<uint8_t *>(elem_j_block);
+ elem_i = reinterpret_cast<cpp::byte *>(elem_i_block);
+ elem_j = reinterpret_cast<cpp::byte *>(elem_j_block);
for (size_t n = 0; n < elem_size_rem; ++n) {
- uint8_t tmp = elem_i[n];
+ cpp::byte tmp = elem_i[n];
elem_i[n] = elem_j[n];
elem_j[n] = tmp;
}
}
- size_t len() const noexcept { return array_len; }
+ LIBC_INLINE size_t len() const { return array_len; }
// Make an Array starting at index |i| and length |s|.
- ArrayGenericSize make_array(size_t i, size_t s) const noexcept {
+ LIBC_INLINE ArrayGenericSize make_array(size_t i, size_t s) const {
return ArrayGenericSize(get_internal(i), s, elem_size);
}
// Reset this Array to point at a different interval of the same
// items starting at index |i|.
- void reset_bounds(size_t i, size_t s) noexcept {
+ LIBC_INLINE void reset_bounds(size_t i, size_t s) {
array_base = get_internal(i);
array_len = s;
}
};
-// Having a specialized Array type for sorting that knowns at
+// Having a specialized Array type for sorting that knows at
// compile-time what the size of the element is, allows for much more
// efficient swapping and for cheaper offset calculations.
template <size_t ELEM_SIZE> class ArrayFixedSize {
- uint8_t *array_base;
+ cpp::byte *array_base;
size_t array_len;
- uint8_t *get_internal(size_t i) const noexcept {
+ LIBC_INLINE cpp::byte *get_internal(size_t i) const {
return array_base + (i * ELEM_SIZE);
}
public:
- ArrayFixedSize(uint8_t *a, size_t s) noexcept : array_base(a), array_len(s) {}
+ ArrayFixedSize(void *a, size_t s)
+ : array_base(reinterpret_cast<cpp::byte *>(a)), array_len(s) {}
// Beware this function is used a heuristic for cheap to swap types, so
// instantiating `ArrayFixedSize` with `ELEM_SIZE > 100` is probably a bad
// idea perf wise.
static constexpr bool has_fixed_size() { return true; }
- void *get(size_t i) const noexcept {
- return reinterpret_cast<void *>(get_internal(i));
- }
+ LIBC_INLINE void *get(size_t i) const { return get_internal(i); }
- void swap(size_t i, size_t j) const noexcept {
- alignas(32) uint8_t tmp[ELEM_SIZE];
+ LIBC_INLINE void swap(size_t i, size_t j) const {
+ alignas(32) cpp::byte tmp[ELEM_SIZE];
- uint8_t *elem_i = get_internal(i);
- uint8_t *elem_j = get_internal(j);
+ cpp::byte *elem_i = get_internal(i);
+ cpp::byte *elem_j = get_internal(j);
inline_memcpy(tmp, elem_i, ELEM_SIZE);
inline_memmove(elem_i, elem_j, ELEM_SIZE);
inline_memcpy(elem_j, tmp, ELEM_SIZE);
}
- size_t len() const noexcept { return array_len; }
+ LIBC_INLINE size_t len() const { return array_len; }
// Make an Array starting at index |i| and length |s|.
- ArrayFixedSize<ELEM_SIZE> make_array(size_t i, size_t s) const noexcept {
+ LIBC_INLINE ArrayFixedSize<ELEM_SIZE> make_array(size_t i, size_t s) const {
return ArrayFixedSize<ELEM_SIZE>(get_internal(i), s);
}
// Reset this Array to point at a different interval of the same
// items starting at index |i|.
- void reset_bounds(size_t i, size_t s) noexcept {
+ LIBC_INLINE void reset_bounds(size_t i, size_t s) {
array_base = get_internal(i);
array_len = s;
}
diff --git a/libc/src/stdlib/qsort_pivot.h b/libc/src/stdlib/qsort_pivot.h
index 68716d01c47a0b..616f3ed09676e0 100644
--- a/libc/src/stdlib/qsort_pivot.h
+++ b/libc/src/stdlib/qsort_pivot.h
@@ -9,8 +9,6 @@
#ifndef LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
#define LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
-#include "src/stdlib/qsort_pivot.h"
-
#include <stdint.h>
namespace LIBC_NAMESPACE_DECL {
@@ -75,11 +73,10 @@ size_t median3(const A &array, size_t a, size_t b, size_t c, const F &is_less) {
// If x=y=1 then a < b, c. In this case we want to return min(b, c).
// By toggling the outcome of b < c using XOR x we get this behavior.
const bool z = is_less(b_ptr, c_ptr);
- if (z ^ x) {
+ if (z ^ x)
return c;
- } else {
+ else
return b;
- }
} else {
// Either c <= a < b or b <= a < c, thus a is our median.
return a;
diff --git a/libc/src/stdlib/qsort_r.cpp b/libc/src/stdlib/qsort_r.cpp
index 1d340560bcbe12..dfd2d28f483d6a 100644
--- a/libc/src/stdlib/qsort_r.cpp
+++ b/libc/src/stdlib/qsort_r.cpp
@@ -20,11 +20,10 @@ LLVM_LIBC_FUNCTION(void, qsort_r,
int (*compare)(const void *, const void *, void *),
void *arg)) {
- internal::unstable_sort(
- array, array_size, elem_size,
- [compare, arg](const void *a, const void *b) noexcept -> bool {
- return compare(a, b, arg) < 0;
- });
+ internal::unstable_sort(array, array_size, elem_size,
+ [compare, arg](const void *a, const void *b) -> bool {
+ return compare(a, b, arg) < 0;
+ });
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/qsort_util.h b/libc/src/stdlib/qsort_util.h
index ee3f737926f530..630a668bcf0e0b 100644
--- a/libc/src/stdlib/qsort_util.h
+++ b/libc/src/stdlib/qsort_util.h
@@ -15,9 +15,13 @@
#define LIBC_QSORT_QUICK_SORT 1
#define LIBC_QSORT_HEAP_SORT 2
+#ifdef LIBC_OPTIMIZE_FOR_SIZE
+#define LIBC_QSORT_IMPL LIBC_QSORT_HEAP_SORT
+#else
#ifndef LIBC_QSORT_IMPL
#define LIBC_QSORT_IMPL LIBC_QSORT_QUICK_SORT
#endif // LIBC_QSORT_IMPL
+#endif // LIBC_OPTIMIZE_FOR_SIZE
#if (LIBC_QSORT_IMPL != LIBC_QSORT_QUICK_SORT && \
LIBC_QSORT_IMPL != LIBC_QSORT_HEAP_SORT)
@@ -27,45 +31,50 @@
namespace LIBC_NAMESPACE_DECL {
namespace internal {
-template <typename A, typename F> void sort_inst(A &array, const F &is_less) {
-#if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT
- quick_sort(array, is_less);
-#elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT
- heap_sort(array, is_less);
-#endif
+template <bool USE_QUICKSORT, typename F>
+void unstable_sort_impl(void *array, size_t array_len, size_t elem_size,
+ const F &is_less) {
+ if (array == nullptr || array_len == 0 || elem_size == 0)
+ return;
+
+ if constexpr (USE_QUICKSORT) {
+ switch (elem_size) {
+ case 4: {
+ auto arr_fixed_size = internal::ArrayFixedSize<4>(array, array_len);
+ quick_sort(arr_fixed_size, is_less);
+ return;
+ }
+ case 8: {
+ auto arr_fixed_size = internal::ArrayFixedSize<8>(array, array_len);
+ quick_sort(arr_fixed_size, is_less);
+ return;
+ }
+ case 16: {
+ auto arr_fixed_size = internal::ArrayFixedSize<16>(array, array_len);
+ quick_sort(arr_fixed_size, is_less);
+ return;
+ }
+ default:
+ auto arr_generic_size =
+ internal::ArrayGenericSize(array, array_len, elem_size);
+ quick_sort(arr_generic_size, is_less);
+ return;
+ }
+ } else {
+ auto arr_generic_size =
+ internal::ArrayGenericSize(array, array_len, elem_size);
+ heap_sort(arr_generic_size, is_less);
+ }
}
template <typename F>
void unstable_sort(void *array, size_t array_len, size_t elem_size,
const F &is_less) {
- if (array == nullptr || array_len == 0 || elem_size == 0) {
- return;
- }
-
- uint8_t *array_base = reinterpret_cast<uint8_t *>(array);
-
- switch (elem_size) {
- case 4: {
- auto arr_fixed_size = internal::ArrayFixedSize<4>(array_base, array_len);
- sort_inst(arr_fixed_size, is_less);
- return;
- }
- case 8: {
- auto arr_fixed_size = internal::ArrayFixedSize<8>(array_base, array_len);
- sort_inst(arr_fixed_size, is_less);
- return;
- }
- case 16: {
- auto arr_fixed_size = internal::ArrayFixedSize<16>(array_base, array_len);
- sort_inst(arr_fixed_size, is_less);
- return;
- }
- default:
- auto arr_generic_size =
- internal::ArrayGenericSize(array_base, array_len, elem_size);
- sort_inst(arr_generic_size, is_less);
- return;
- }
+#if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT
+ unstable_sort_impl<true, F>(array, array_len, elem_size, is_less);
+#elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT
+ unstable_sort_impl<false, F>(array, array_len, elem_size, is_less);
+#endif
}
} // namespace internal
diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h
index 13cce7ee5d86cd..6eb8e00bd78915 100644
--- a/libc/src/stdlib/quick_sort.h
+++ b/libc/src/stdlib/quick_sort.h
@@ -9,8 +9,8 @@
#ifndef LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
#define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
+#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/cstddef.h"
-#include "src/__support/big_int.h"
#include "src/__support/macros/config.h"
#include "src/stdlib/qsort_pivot.h"
@@ -90,7 +90,7 @@ size_t partition(const A &array, size_t pivot_index, const F &is_less) {
size_t num_lt;
if constexpr (A::has_fixed_size()) {
// Branchless Lomuto avoid branch misprediction penalties, but
- // it also swaps more often which only is faster if the swap a
+ // it also swaps more often which is only faster if the swap is a fast
// constant operation.
num_lt = partition_lomuto_branchless(array_without_pivot, pivot, is_less);
} else {
@@ -130,7 +130,7 @@ void quick_sort_impl(A &array, const void *ancestor_pivot, size_t limit,
if (!is_less(ancestor_pivot, array.get(pivot_index))) {
const size_t num_lt =
partition(array, pivot_index,
- [is_less](const void *a, const void *b) noexcept -> bool {
+ [is_less](const void *a, const void *b) -> bool {
return !is_less(b, a);
});
diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
index 0044afed1d12c1..aa596178f21fd1 100644
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -313,11 +313,11 @@ add_libc_test(
)
add_libc_test(
- qsort_test
+ quick_sort_test
SUITE
libc-stdlib-tests
SRCS
- qsort_test.cpp
+ quick_sort_test.cpp
HDRS
SortingTest.h
DEPENDS
diff --git a/libc/test/src/stdlib/SortingTest.h b/libc/test/src/stdlib/SortingTest.h
index cba16566d89e79..ecdfdb76d0113e 100644
--- a/libc/test/src/stdlib/SortingTest.h
+++ b/libc/test/src/stdlib/SortingTest.h
@@ -10,10 +10,13 @@
#include "src/stdlib/qsort.h"
#include "test/UnitTest/Test.h"
+#include <iostream>
+
class SortingTest : public LIBC_NAMESPACE::testing::Test {
- using SortFn = void (*)(void *array, size_t array_size, size_t elem_size,
- int (*compare)(const void *, const void *));
+ using SortingRoutine = void (*)(void *array, size_t array_len,
+ size_t elem_size,
+ int (*compare)(const void *, const void *));
static int int_compare(const void *l, const void *r) {
int li = *reinterpret_cast<const int *>(l);
@@ -27,19 +30,19 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
return -1;
}
- static void int_sort(SortFn sort_fn, int *array, size_t array_len) {
- sort_fn(reinterpret_cast<void *>(array), array_len, sizeof(int),
- int_compare);
+ static void int_sort(SortingRoutine sort_func, int *array, size_t array_len) {
+ sort_func(reinterpret_cast<void *>(array), array_len, sizeof(int),
+ int_compare);
}
public:
- void test_sorted_array(SortFn sort_fn) {
+ void test_sorted_array(SortingRoutine sort_func) {
int array[25] = {10, 23, 33, 35, 55, 70, 71, 100, 110,
123, 133, 135, 155, 170, 171, 1100, 1110, 1123,
1133, 1135, 1155, 1170, 1171, 11100, 12310};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
ASSERT_LE(array[0], 10);
ASSERT_LE(array[1], 23);
@@ -68,36 +71,36 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
ASSERT_LE(array[24], 12310);
}
- void test_reversed_sorted_array(SortFn sort_fn) {
+ void test_reversed_sorted_array(SortingRoutine sort_func) {
int array[] = {25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13,
12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
for (int i = 0; i < int(ARRAY_LEN - 1); ++i)
ASSERT_EQ(array[i], i + 1);
}
- void test_all_equal_elements(SortFn sort_fn) {
+ void test_all_equal_elements(SortingRoutine sort_func) {
int array[] = {100, 100, 100, 100, 100, 100, 100, 100, 100,
100, 100, 100, 100, 100, 100, 100, 100, 100,
100, 100, 100, 100, 100, 100, 100};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
for (size_t i = 0; i < ARRAY_LEN; ++i)
ASSERT_EQ(array[i], 100);
}
- void test_unsorted_array_1(SortFn sort_fn) {
+ void test_unsorted_array_1(SortingRoutine sort_func) {
int array[25] = {10, 23, 8, 35, 55, 45, 40, 100, 110,
123, 90, 80, 70, 60, 171, 11, 1, -1,
-5, -10, 1155, 1170, 1171, 12, -100};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
ASSERT_EQ(array[0], -100);
ASSERT_EQ(array[1], -10);
@@ -126,11 +129,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
ASSERT_EQ(array[24], 1171);
}
- void test_unsorted_array_2(SortFn sort_fn) {
+ void test_unsorted_array_2(SortingRoutine sort_func) {
int array[7] = {10, 40, 45, 55, 35, 23, 60};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
ASSERT_EQ(array[0], 10);
ASSERT_EQ(array[1], 23);
@@ -141,11 +144,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
ASSERT_EQ(array[6], 60);
}
- void test_unsorted_array_duplicated_1(SortFn sort_fn) {
+ void test_unsorted_array_duplicated_1(SortingRoutine sort_func) {
int array[6] = {10, 10, 20, 20, 5, 5};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
ASSERT_EQ(array[0], 5);
ASSERT_EQ(array[1], 5);
@@ -155,11 +158,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
ASSERT_EQ(array[5], 20);
}
- void test_unsorted_array_duplicated_2(SortFn sort_fn) {
+ void test_unsorted_array_duplicated_2(SortingRoutine sort_func) {
int array[10] = {20, 10, 10, 10, 10, 20, 21, 21, 21, 21};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
ASSERT_EQ(array[0], 10);
ASSERT_EQ(array[1], 10);
@@ -173,11 +176,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
ASSERT_EQ(array[9], 21);
}
- void test_unsorted_array_duplicated_3(SortFn sort_fn) {
+ void test_unsorted_array_duplicated_3(SortingRoutine sort_func) {
int array[10] = {20, 30, 30, 30, 30, 20, 21, 21, 21, 21};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
ASSERT_EQ(array[0], 20);
ASSERT_EQ(array[1], 20);
@@ -191,96 +194,160 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
ASSERT_EQ(array[9], 30);
}
- void test_unsorted_three_element_1(SortFn sort_fn) {
+ void test_unsorted_three_element_1(SortingRoutine sort_func) {
int array[3] = {14999024, 0, 3};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
ASSERT_EQ(array[0], 0);
ASSERT_EQ(array[1], 3);
ASSERT_EQ(array[2], 14999024);
}
- void test_unsorted_three_element_2(SortFn sort_fn) {
+ void test_unsorted_three_element_2(SortingRoutine sort_func) {
int array[3] = {3, 14999024, 0};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
ASSERT_EQ(array[0], 0);
ASSERT_EQ(array[1], 3);
ASSERT_EQ(array[2], 14999024);
}
- void test_unsorted_three_element_3(SortFn sort_fn) {
+ void test_unsorted_three_element_3(SortingRoutine sort_func) {
int array[3] = {3, 0, 14999024};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
ASSERT_EQ(array[0], 0);
ASSERT_EQ(array[1], 3);
ASSERT_EQ(array[2], 14999024);
}
- void test_same_three_element(SortFn sort_fn) {
+ void test_same_three_element(SortingRoutine sort_func) {
int array[3] = {12345, 12345, 12345};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
ASSERT_EQ(array[0], 12345);
ASSERT_EQ(array[1], 12345);
ASSERT_EQ(array[2], 12345);
}
- void test_unsorted_two_element_1(SortFn sort_fn) {
+ void test_unsorted_two_element_1(SortingRoutine sort_func) {
int array[] = {14999024, 0};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
ASSERT_EQ(array[0], 0);
ASSERT_EQ(array[1], 14999024);
}
- void test_unsorted_two_element_2(SortFn sort_fn) {
+ void test_unsorted_two_element_2(SortingRoutine sort_func) {
int array[] = {0, 14999024};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
ASSERT_EQ(array[0], 0);
ASSERT_EQ(array[1], 14999024);
}
- void test_same_two_element(SortFn sort_fn) {
+ void test_same_two_element(SortingRoutine sort_func) {
int array[] = {12345, 12345};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
ASSERT_EQ(array[0], 12345);
ASSERT_EQ(array[1], 12345);
}
- void test_single_element(SortFn sort_fn) {
+ void test_single_element(SortingRoutine sort_func) {
int array[] = {12345};
constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
- int_sort(sort_fn, array, ARRAY_LEN);
+ int_sort(sort_func, array, ARRAY_LEN);
ASSERT_EQ(array[0], 12345);
}
+
+ void test_different_elem_size(SortingRoutine sort_func) {
+ // Random order of values [0,50) to avoid only testing pre-sorted handling.
+ // Long enough to reach interesting code.
+ constexpr uint8_t ARRAY_INITIAL_VALS[] = {
+ 42, 13, 8, 4, 17, 28, 20, 32, 22, 29, 7, 2, 46, 37, 26, 49, 24,
+ 38, 10, 18, 40, 36, 47, 15, 11, 48, 44, 33, 1, 5, 16, 35, 39, 41,
+ 14, 23, 3, 9, 6, 27, 21, 25, 31, 45, 12, 43, 34, 30, 19, 0};
+
+ constexpr size_t ARRAY_LEN = sizeof(ARRAY_INITIAL_VALS);
+ constexpr size_t MAX_ELEM_SIZE = 150;
+ constexpr size_t BUF_SIZE = ARRAY_LEN * MAX_ELEM_SIZE;
+
+ static_assert(ARRAY_LEN < 256); // so we can encode the values.
+
+ // Minimum alignment to test implementation for bugs related to assuming
+ // incorrect association between alignment and element size.
+ alignas(1) uint8_t buf[BUF_SIZE];
+
+ const auto fill_buf = [&buf](size_t elem_size) {
+ for (size_t i = 0; i < BUF_SIZE; ++i) {
+ buf[i] = 0;
+ }
+
+ for (size_t elem_i = 0, buf_i = 0; elem_i < ARRAY_LEN; ++elem_i) {
+ const uint8_t elem_val = ARRAY_INITIAL_VALS[elem_i];
+ for (size_t elem_byte_i = 0; elem_byte_i < elem_size; ++elem_byte_i) {
+ buf[buf_i] = elem_val;
+ buf_i += 1;
+ }
+ }
+ };
+
+ for (size_t elem_size = 0; elem_size <= MAX_ELEM_SIZE; ++elem_size) {
+ // Fill all bytes with data to ensure mistakes in elem swap are noticed.
+ fill_buf(elem_size);
+
+ sort_func(reinterpret_cast<void *>(buf), ARRAY_LEN, elem_size,
+ [](const void *a, const void *b) -> int {
+ const uint8_t a_val = *reinterpret_cast<const uint8_t *>(a);
+ const uint8_t b_val = *reinterpret_cast<const uint8_t *>(b);
+
+ if (a_val < b_val) {
+ return -1;
+ } else if (a_val > b_val) {
+ return 1;
+ } else {
+ return 0;
+ }
+ });
+
+ for (size_t elem_i = 0, buf_i = 0; elem_i < ARRAY_LEN; ++elem_i) {
+ const uint8_t expected_elem_val = static_cast<uint8_t>(elem_i);
+
+ for (size_t elem_byte_i = 0; elem_byte_i < elem_size; ++elem_byte_i) {
+ const uint8_t buf_val = buf[buf_i];
+ // Check that every byte in the element has the expected value.
+ ASSERT_EQ(buf_val, expected_elem_val)
+ << "elem_size: " << elem_size << " buf_i: " << buf_i << '\n';
+ buf_i += 1;
+ }
+ }
+ }
+ }
};
#define LIST_SORTING_TESTS(Name, Func) \
@@ -331,4 +398,7 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
TEST_F(LlvmLibc##Name##Test, SingleElementArray) { \
test_single_element(Func); \
} \
+ TEST_F(LlvmLibc##Name##Test, DifferentElemSizeArray) { \
+ test_different_elem_size(Func); \
+ } \
static_assert(true)
diff --git a/libc/test/src/stdlib/heap_sort_test.cpp b/libc/test/src/stdlib/heap_sort_test.cpp
index 43c8c69739615a..cd802ff2099e3b 100644
--- a/libc/test/src/stdlib/heap_sort_test.cpp
+++ b/libc/test/src/stdlib/heap_sort_test.cpp
@@ -12,14 +12,11 @@
void heap_sort(void *array, size_t array_size, size_t elem_size,
int (*compare)(const void *, const void *)) {
- if (array == nullptr || array_size == 0 || elem_size == 0)
- return;
+ constexpr bool USE_QUICKSORT = false;
- auto arr = LIBC_NAMESPACE::internal::ArrayGenericSize(
- reinterpret_cast<uint8_t *>(array), array_size, elem_size);
-
- LIBC_NAMESPACE::internal::heap_sort(
- arr, [compare](const void *a, const void *b) noexcept -> bool {
+ LIBC_NAMESPACE::internal::unstable_sort_impl<USE_QUICKSORT>(
+ array, array_size, elem_size,
+ [compare](const void *a, const void *b) noexcept -> bool {
return compare(a, b) < 0;
});
}
diff --git a/libc/test/src/stdlib/qsort_test.cpp b/libc/test/src/stdlib/quick_sort_test.cpp
similarity index 87%
rename from libc/test/src/stdlib/qsort_test.cpp
rename to libc/test/src/stdlib/quick_sort_test.cpp
index 63cbb205c4e174..0b3bcad94a20a4 100644
--- a/libc/test/src/stdlib/qsort_test.cpp
+++ b/libc/test/src/stdlib/quick_sort_test.cpp
@@ -11,7 +11,9 @@
void quick_sort(void *array, size_t array_size, size_t elem_size,
int (*compare)(const void *, const void *)) {
- LIBC_NAMESPACE::internal::unstable_sort(
+ constexpr bool USE_QUICKSORT = true;
+
+ LIBC_NAMESPACE::internal::unstable_sort_impl<USE_QUICKSORT>(
array, array_size, elem_size,
[compare](const void *a, const void *b) noexcept -> bool {
return compare(a, b) < 0;
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
index a6dac1f82b9f9f..c0f15469126628 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
@@ -121,8 +121,8 @@ libc_support_library(
)
libc_test(
- name = "qsort_test",
- srcs = ["qsort_test.cpp"],
+ name = "quick_sort_test",
+ srcs = ["quick_sort_test.cpp"],
libc_function_deps = ["//libc:qsort"],
deps = [
":qsort_test_helper",
>From 850ca5aa2176680b2caeef26d718e024a43b081d Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll <lukas.bergdoll at gmail.com>
Date: Thu, 19 Dec 2024 18:28:28 +0100
Subject: [PATCH 10/10] Fix strict aliasing violation in `swap` function
There might be platform where addressing the user provided array of unknown
alignment as `uint32_t` would be incorrect. Generally the previous code violates
strict aliasing and thus invokes UB. By using the "magic" properties of `memcpy`
we can sidestep the issue by letting it do the load and store for us. In
practice this means on platforms where addressing the memory as 4 byte regions
is safe and efficient to do - which is most widely used platforms - the compiler
can insert the appropriate `mov` instructions inline and if a platform does not
support this operation it will likely insert a call to the `memcpy` function.
---
libc/src/stdlib/qsort_data.h | 33 ++++++++++++++-------------------
1 file changed, 14 insertions(+), 19 deletions(-)
diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h
index 647239878add65..f85960e0854718 100644
--- a/libc/src/stdlib/qsort_data.h
+++ b/libc/src/stdlib/qsort_data.h
@@ -11,8 +11,6 @@
#include "src/__support/CPP/cstddef.h"
#include "src/__support/macros/config.h"
-#include "src/string/memory_utils/inline_memcpy.h"
-#include "src/string/memory_utils/inline_memmove.h"
#include <stdint.h>
@@ -48,26 +46,23 @@ class ArrayGenericSize {
using block_t = uint32_t;
constexpr size_t BLOCK_SIZE = sizeof(block_t);
+ alignas(block_t) cpp::byte tmp_block[BLOCK_SIZE];
+
cpp::byte *elem_i = get_internal(i);
cpp::byte *elem_j = get_internal(j);
const size_t elem_size_rem = elem_size % BLOCK_SIZE;
- const block_t *elem_i_block_end =
- reinterpret_cast<block_t *>(elem_i + (elem_size - elem_size_rem));
-
- block_t *elem_i_block = reinterpret_cast<block_t *>(elem_i);
- block_t *elem_j_block = reinterpret_cast<block_t *>(elem_j);
-
- while (elem_i_block != elem_i_block_end) {
- block_t tmp = *elem_i_block;
- *elem_i_block = *elem_j_block;
- *elem_j_block = tmp;
- elem_i_block += 1;
- elem_j_block += 1;
+ const cpp::byte *elem_i_block_end = elem_i + (elem_size - elem_size_rem);
+
+ while (elem_i != elem_i_block_end) {
+ __builtin_memcpy(tmp_block, elem_i, BLOCK_SIZE);
+ __builtin_memcpy(elem_i, elem_j, BLOCK_SIZE);
+ __builtin_memcpy(elem_j, tmp_block, BLOCK_SIZE);
+
+ elem_i += BLOCK_SIZE;
+ elem_j += BLOCK_SIZE;
}
- elem_i = reinterpret_cast<cpp::byte *>(elem_i_block);
- elem_j = reinterpret_cast<cpp::byte *>(elem_j_block);
for (size_t n = 0; n < elem_size_rem; ++n) {
cpp::byte tmp = elem_i[n];
elem_i[n] = elem_j[n];
@@ -118,9 +113,9 @@ template <size_t ELEM_SIZE> class ArrayFixedSize {
cpp::byte *elem_i = get_internal(i);
cpp::byte *elem_j = get_internal(j);
- inline_memcpy(tmp, elem_i, ELEM_SIZE);
- inline_memmove(elem_i, elem_j, ELEM_SIZE);
- inline_memcpy(elem_j, tmp, ELEM_SIZE);
+ __builtin_memcpy(tmp, elem_i, ELEM_SIZE);
+ __builtin_memmove(elem_i, elem_j, ELEM_SIZE);
+ __builtin_memcpy(elem_j, tmp, ELEM_SIZE);
}
LIBC_INLINE size_t len() const { return array_len; }
More information about the libc-commits
mailing list