[libcxx-commits] [libcxx] 27a062e - [libc++] Implement std::gcd using the binary version (#77747)

Wed May 8 07:21:36 PDT 2024

Author: serge-sans-paille
Date: 2024-05-08T14:21:31Z
New Revision: 27a062e9ca7c92e89ed4084c3c3affb9fa39aabb

URL: https://github.com/llvm/llvm-project/commit/27a062e9ca7c92e89ed4084c3c3affb9fa39aabb
DIFF: https://github.com/llvm/llvm-project/commit/27a062e9ca7c92e89ed4084c3c3affb9fa39aabb.diff

LOG: [libc++] Implement std::gcd using the binary version (#77747)

The binary version is four times faster than current implementation in
my setup, and generally considered a better implementation.

Code inspired by https://en.algorithmica.org/hpc/algorithms/gcd/ which
itself is inspired by
https://lemire.me/blog/2013/12/26/fastest-way-to-compute-the-greatest-common-divisor/

Fix #77648

Added: 
    libcxx/benchmarks/numeric/gcd.bench.cpp

Modified: 
    libcxx/benchmarks/CMakeLists.txt
    libcxx/include/__numeric/gcd_lcm.h
    libcxx/test/libcxx/transitive_includes/cxx03.csv
    libcxx/test/libcxx/transitive_includes/cxx11.csv
    libcxx/test/libcxx/transitive_includes/cxx14.csv
    libcxx/test/libcxx/transitive_includes/cxx17.csv
    libcxx/test/libcxx/transitive_includes/cxx20.csv
    libcxx/test/libcxx/transitive_includes/cxx26.csv
    libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp

Removed: 
    


################################################################################
diff  --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index 5dc3be0c367e5..93b549a316e38 100644

--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -122,7 +122,7 @@ endif()
 add_library(           cxx-benchmarks-flags-libcxx INTERFACE)
 target_link_libraries( cxx-benchmarks-flags-libcxx INTERFACE cxx-benchmarks-flags)
 target_compile_options(cxx-benchmarks-flags-libcxx INTERFACE ${SANITIZER_FLAGS} -Wno-user-defined-literals -Wno-suggest-override)
-target_link_options(   cxx-benchmarks-flags-libcxx INTERFACE -nostdlib++ "-L${BENCHMARK_LIBCXX_INSTALL}/lib" "-L${BENCHMARK_LIBCXX_INSTALL}/lib64" ${SANITIZER_FLAGS})
+target_link_options(   cxx-benchmarks-flags-libcxx INTERFACE -lm -nostdlib++ "-L${BENCHMARK_LIBCXX_INSTALL}/lib" "-L${BENCHMARK_LIBCXX_INSTALL}/lib64" ${SANITIZER_FLAGS})
 
 set(libcxx_benchmark_targets)
 
@@ -220,6 +220,7 @@ set(BENCHMARK_TESTS
     lexicographical_compare_three_way.bench.cpp
     map.bench.cpp
     monotonic_buffer.bench.cpp
+    numeric/gcd.bench.cpp
     ordered_set.bench.cpp
     shared_mutex_vs_mutex.bench.cpp
     stop_token.bench.cpp

diff  --git a/libcxx/benchmarks/numeric/gcd.bench.cpp b/libcxx/benchmarks/numeric/gcd.bench.cpp
new file mode 100644
index 0000000000000..f8b6a856cd0d5
--- /dev/null
+++ b/libcxx/benchmarks/numeric/gcd.bench.cpp
@@ -0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <array>
+#include <benchmark/benchmark.h>
+#include <cstring>
+#include <numeric>
+#include <random>
+
+template <class T>
+static std::array<T, 1000> generate(std::uniform_int_distribution<T> distribution = std::uniform_int_distribution<T>{
+                                        std::numeric_limits<T>::min() + 1, std::numeric_limits<T>::max()}) {
+  std::mt19937 generator;
+  std::array<T, 1000> result;
+  std::generate_n(result.begin(), result.size(), [&] { return distribution(generator); });
+  return result;
+}
+
+static void bm_gcd_random(benchmark::State& state) {
+  std::array data = generate<int>();
+  while (state.KeepRunningBatch(data.size()))
+    for (auto v0 : data)
+      for (auto v1 : data)
+        benchmark::DoNotOptimize(std::gcd(v0, v1));
+}
+BENCHMARK(bm_gcd_random);
+
+static void bm_gcd_trivial(benchmark::State& state) {
+  int lhs = ~static_cast<int>(0), rhs = 1;
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(lhs);
+    benchmark::DoNotOptimize(rhs);
+    benchmark::DoNotOptimize(std::gcd(lhs, rhs));
+  }
+}
+BENCHMARK(bm_gcd_trivial);
+
+static void bm_gcd_complex(benchmark::State& state) {
+  int lhs = 2971215073, rhs = 1836311903;
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(lhs);
+    benchmark::DoNotOptimize(rhs);
+    benchmark::DoNotOptimize(std::gcd(lhs, rhs));
+  }
+}
+BENCHMARK(bm_gcd_complex);
+
+BENCHMARK_MAIN();

diff  --git a/libcxx/include/__numeric/gcd_lcm.h b/libcxx/include/__numeric/gcd_lcm.h
index 48df2338051e2..5d735a51a47eb 100644
--- a/libcxx/include/__numeric/gcd_lcm.h
+++ b/libcxx/include/__numeric/gcd_lcm.h
@@ -10,7 +10,9 @@
 #ifndef _LIBCPP___NUMERIC_GCD_LCM_H
 #define _LIBCPP___NUMERIC_GCD_LCM_H
 
+#include <__algorithm/min.h>
 #include <__assert>
+#include <__bit/countr.h>
 #include <__config>
 #include <__type_traits/common_type.h>
 #include <__type_traits/is_integral.h>
@@ -50,9 +52,47 @@ struct __ct_abs<_Result, _Source, false> {
 };
 
 template <class _Tp>
-_LIBCPP_CONSTEXPR _LIBCPP_HIDDEN _Tp __gcd(_Tp __m, _Tp __n) {
+_LIBCPP_CONSTEXPR _LIBCPP_HIDDEN _Tp __gcd(_Tp __a, _Tp __b) {
   static_assert((!is_signed<_Tp>::value), "");
-  return __n == 0 ? __m : std::__gcd<_Tp>(__n, __m % __n);
+
+  // From: https://lemire.me/blog/2013/12/26/fastest-way-to-compute-the-greatest-common-divisor
+  //
+  // If power of two divides both numbers, we can push it out.
+  // - gcd( 2^x * a, 2^x * b) = 2^x * gcd(a, b)
+  //
+  // If and only if exactly one number is even, we can divide that number by that power.
+  // - if a, b are odd, then gcd(2^x * a, b) = gcd(a, b)
+  //
+  // And standard gcd algorithm where instead of modulo, minus is used.
+
+  if (__a < __b) {
+    _Tp __tmp = __b;
+    __b       = __a;
+    __a       = __tmp;
+  }
+  if (__b == 0)
+    return __a;
+  __a %= __b; // Make both argument of the same size, and early result in the easy case.
+  if (__a == 0)
+    return __b;
+
+  int __az    = std::__countr_zero(__a);
+  int __bz    = std::__countr_zero(__b);
+  int __shift = std::min(__az, __bz);
+  __a >>= __az;
+  __b >>= __bz;
+  do {
+    _Tp __
diff  = __a - __b;
+    if (__a > __b) {
+      __a = __b;
+      __b = __
diff ;
+    } else {
+      __b = __b - __a;
+    }
+    if (__
diff  != 0)
+      __b >>= std::__countr_zero(__
diff );
+  } while (__b != 0);
+  return __a << __shift;
 }
 
 template <class _Tp, class _Up>

diff  --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv
index cf0af3b8bb392..92601fab5b773 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx03.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv
@@ -570,6 +570,7 @@ numeric cstddef
 numeric cstdint
 numeric execution
 numeric functional
+numeric initializer_list
 numeric iterator
 numeric limits
 numeric new

diff  --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv
index f514ee3028f64..c05eb42deb9a1 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx11.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv
@@ -575,6 +575,7 @@ numeric cstddef
 numeric cstdint
 numeric execution
 numeric functional
+numeric initializer_list
 numeric iterator
 numeric limits
 numeric new

diff  --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv
index 43e3f996adba3..09252b7b7d2db 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx14.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv
@@ -578,6 +578,7 @@ numeric cstddef
 numeric cstdint
 numeric execution
 numeric functional
+numeric initializer_list
 numeric iterator
 numeric limits
 numeric new

diff  --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv
index 43e3f996adba3..09252b7b7d2db 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx17.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv
@@ -578,6 +578,7 @@ numeric cstddef
 numeric cstdint
 numeric execution
 numeric functional
+numeric initializer_list
 numeric iterator
 numeric limits
 numeric new

diff  --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv
index 8463f17db411a..ce4ccc3d11615 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx20.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv
@@ -589,6 +589,7 @@ numeric cstddef
 numeric cstdint
 numeric execution
 numeric functional
+numeric initializer_list
 numeric iterator
 numeric limits
 numeric new

diff  --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv
index 62d931c0eebad..f68249aeec78c 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx26.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv
@@ -176,6 +176,29 @@ experimental/simd limits
 experimental/type_traits initializer_list
 experimental/type_traits type_traits
 experimental/utility utility
+experimental/vector experimental/memory_resource
+experimental/vector vector
+ext/hash_map algorithm
+ext/hash_map cmath
+ext/hash_map cstddef
+ext/hash_map cstdint
+ext/hash_map cstring
+ext/hash_map functional
+ext/hash_map initializer_list
+ext/hash_map limits
+ext/hash_map new
+ext/hash_map stdexcept
+ext/hash_map string
+ext/hash_set algorithm
+ext/hash_set cmath
+ext/hash_set cstddef
+ext/hash_set cstdint
+ext/hash_set cstring
+ext/hash_set functional
+ext/hash_set initializer_list
+ext/hash_set limits
+ext/hash_set new
+ext/hash_set string
 filesystem compare
 filesystem cstddef
 filesystem cstdint

diff  --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
index 831c226f9c8ea..212804356a056 100644
--- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
+++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
@@ -17,6 +17,7 @@
 #include <cassert>
 #include <climits>
 #include <cstdint>
+#include <random>
 #include <type_traits>
 
 #include "test_macros.h"
@@ -48,6 +49,74 @@ constexpr bool test0(int in1, int in2, int out)
     return true;
 }
 
+template <typename T>
+T basic_gcd_(T m, T n) {
+  return n == 0 ? m : basic_gcd_<T>(n, m % n);
+}
+
+template <typename T>
+T basic_gcd(T m, T n) {
+  using Tp = std::make_unsigned_t<T>;
+  if (m < 0 && m != std::numeric_limits<T>::min())
+    m = -m;
+  if (n < 0 && n != std::numeric_limits<T>::min())
+    n = -n;
+  return basic_gcd_(static_cast<Tp>(m), static_cast<Tp>(n));
+}
+
+template <typename Input>
+void do_fuzzy_tests() {
+  std::mt19937 gen(1938);
+  std::uniform_int_distribution<Input> distrib;
+
+  constexpr int nb_rounds = 10000;
+  for (int i = 0; i < nb_rounds; ++i) {
+    Input n = distrib(gen);
+    Input m = distrib(gen);
+    assert(std::gcd(n, m) == basic_gcd(n, m));
+  }
+}
+
+template <typename Input>
+void do_limit_tests() {
+  Input inputs[] = {
+      // The behavior of std::gcd is undefined if the absolute value of one of its
+      // operand is not representable in the result type.
+      std::numeric_limits<Input>::min() + (std::is_signed<Input>::value ? 3 : 0),
+      std::numeric_limits<Input>::min() + 1,
+      std::numeric_limits<Input>::min() + 2,
+      std::numeric_limits<Input>::max(),
+      std::numeric_limits<Input>::max() - 1,
+      std::numeric_limits<Input>::max() - 2,
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      (Input)-1,
+      (Input)-2,
+      (Input)-3,
+      (Input)-4,
+      (Input)-5,
+      (Input)-6,
+      (Input)-7,
+      (Input)-8,
+      (Input)-9,
+      (Input)-10,
+  };
+
+  for (auto n : inputs) {
+    for (auto m : inputs) {
+      assert(std::gcd(n, m) == basic_gcd(n, m));
+    }
+  }
+}
 
 template <typename Input1, typename Input2 = Input1>
 constexpr bool do_test(int = 0)
@@ -143,5 +212,23 @@ int main(int argc, char**)
     assert(res == 2);
     }
 
-  return 0;
+    do_fuzzy_tests<std::int8_t>();
+    do_fuzzy_tests<std::int16_t>();
+    do_fuzzy_tests<std::int32_t>();
+    do_fuzzy_tests<std::int64_t>();
+    do_fuzzy_tests<std::uint8_t>();
+    do_fuzzy_tests<std::uint16_t>();
+    do_fuzzy_tests<std::uint32_t>();
+    do_fuzzy_tests<std::uint64_t>();
+
+    do_limit_tests<std::int8_t>();
+    do_limit_tests<std::int16_t>();
+    do_limit_tests<std::int32_t>();
+    do_limit_tests<std::int64_t>();
+    do_limit_tests<std::uint8_t>();
+    do_limit_tests<std::uint16_t>();
+    do_limit_tests<std::uint32_t>();
+    do_limit_tests<std::uint64_t>();
+
+    return 0;
 }