[libcxx-commits] [libcxx] [libc++] Optimizations for uniform_int_distribution (PR #140161)

Mon Jun 23 17:57:09 PDT 2025

https://github.com/LRFLEW updated https://github.com/llvm/llvm-project/pull/140161

>From 1d6f63b44126527f34a5583254115c0554942dc4 Mon Sep 17 00:00:00 2001
From: LRFLEW <LRFLEW at aol.com>
Date: Wed, 14 May 2025 00:18:01 -0500
Subject: [PATCH 1/2] Update Tests for uniform_int_distribution

---
 .../benchmarks/numeric/rand.uni.int.bench.cpp | 58 +++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 libcxx/test/benchmarks/numeric/rand.uni.int.bench.cpp

diff --git a/libcxx/test/benchmarks/numeric/rand.uni.int.bench.cpp b/libcxx/test/benchmarks/numeric/rand.uni.int.bench.cpp
new file mode 100644
index 0000000000000..eb9a76835853d
--- /dev/null
+++ b/libcxx/test/benchmarks/numeric/rand.uni.int.bench.cpp
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: c++03
+
+#include <cstdint>
+#include <random>
+
+#include <benchmark/benchmark.h>
+
+// Times one draw per iteration from uniform_int_distribution<uint64_t>(1, Max) fed by a default-constructed Eng.
+template <typename Eng, std::uint64_t Max>
+static void bm_uniform_int_distribution(benchmark::State& state) {
+  Eng engine;
+  std::uniform_int_distribution<std::uint64_t> distribution(1ull, Max);
+  for (auto _ : state)
+    benchmark::DoNotOptimize(distribution(engine));
+}
+
+// n = 1 (__independent_bits_engine's __n_): each candidate value is assembled from one engine draw
+// Best Case: the range [1, Max] holds 2^20 values, a power of two
+BENCHMARK(bm_uniform_int_distribution<std::minstd_rand0, 1ull << 20>);
+BENCHMARK(bm_uniform_int_distribution<std::ranlux24_base, 1ull << 20>);
+// Worst Case: the range holds 2^19 + 1 values, just above a power of two
+BENCHMARK(bm_uniform_int_distribution<std::minstd_rand0, (1ull << 19) + 1ull>);
+BENCHMARK(bm_uniform_int_distribution<std::ranlux24_base, (1ull << 19) + 1ull>);
+// Median Case: the range holds 2^19 + 2^18 values, midway between two powers of two
+BENCHMARK(bm_uniform_int_distribution<std::minstd_rand0, (1ull << 19) + (1ull << 18)>);
+BENCHMARK(bm_uniform_int_distribution<std::ranlux24_base, (1ull << 19) + (1ull << 18)>);
+
+// n = 2, n0 = 2 (__n_, __n0_): two engine draws per candidate, both contributing the same number of bits
+// Best Case: the range holds 2^40 values, a power of two
+BENCHMARK(bm_uniform_int_distribution<std::minstd_rand0, 1ull << 40>);
+BENCHMARK(bm_uniform_int_distribution<std::ranlux24_base, 1ull << 40>);
+// Worst Case: the range holds 2^39 + 1 values, just above a power of two
+BENCHMARK(bm_uniform_int_distribution<std::minstd_rand0, (1ull << 39) + 1ull>);
+BENCHMARK(bm_uniform_int_distribution<std::ranlux24_base, (1ull << 39) + 1ull>);
+// Median Case: the range holds 2^39 + 2^38 values, midway between two powers of two
+BENCHMARK(bm_uniform_int_distribution<std::minstd_rand0, (1ull << 39) + (1ull << 38)>);
+BENCHMARK(bm_uniform_int_distribution<std::ranlux24_base, (1ull << 39) + (1ull << 38)>);
+
+// n = 2, n0 = 1 (__n_, __n0_): two engine draws per candidate, the second contributing one extra bit
+// Best Case: the range holds 2^41 values, a power of two
+BENCHMARK(bm_uniform_int_distribution<std::minstd_rand0, 1ull << 41>);
+BENCHMARK(bm_uniform_int_distribution<std::ranlux24_base, 1ull << 41>);
+// Worst Case: the range holds 2^40 + 1 values, just above a power of two
+BENCHMARK(bm_uniform_int_distribution<std::minstd_rand0, (1ull << 40) + 1ull>);
+BENCHMARK(bm_uniform_int_distribution<std::ranlux24_base, (1ull << 40) + 1ull>);
+// Median Case: the range holds 2^40 + 2^39 values, midway between two powers of two
+BENCHMARK(bm_uniform_int_distribution<std::minstd_rand0, (1ull << 40) + (1ull << 39)>);
+BENCHMARK(bm_uniform_int_distribution<std::ranlux24_base, (1ull << 40) + (1ull << 39)>);
+
+BENCHMARK_MAIN();

>From 57e7fe48a2daa20efdbfade4c4a3c9e703126328 Mon Sep 17 00:00:00 2001
From: LRFLEW <LRFLEW at aol.com>
Date: Thu, 15 May 2025 20:57:47 -0500
Subject: [PATCH 2/2] Optimize uniform_int_distribution

---
 .../__random/uniform_int_distribution.h       | 89 +++++++++++--------
 1 file changed, 52 insertions(+), 37 deletions(-)

diff --git a/libcxx/include/__random/uniform_int_distribution.h b/libcxx/include/__random/uniform_int_distribution.h
index fa2c33755b739..2d06808cda5ab 100644
--- a/libcxx/include/__random/uniform_int_distribution.h
+++ b/libcxx/include/__random/uniform_int_distribution.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___RANDOM_UNIFORM_INT_DISTRIBUTION_H
 #define _LIBCPP___RANDOM_UNIFORM_INT_DISTRIBUTION_H
 
+#include <__assert>
 #include <__bit/countl.h>
 #include <__config>
 #include <__cstddef/size_t.h>
@@ -64,7 +65,7 @@ class __independent_bits_engine {
   _LIBCPP_HIDE_FROM_ABI __independent_bits_engine(_Engine& __e, size_t __w);
 
   // generating functions
-  _LIBCPP_HIDE_FROM_ABI result_type operator()() { return __eval(integral_constant<bool, _Rp != 0>()); }
+  _LIBCPP_HIDE_FROM_ABI result_type operator()() { return __eval(integral_constant<bool, (_Rp & (_Rp - 1)) != 0>()); }
 
 private:
   _LIBCPP_HIDE_FROM_ABI result_type __eval(false_type);
@@ -74,49 +75,66 @@ class __independent_bits_engine {
 template <class _Engine, class _UIntType>
 __independent_bits_engine<_Engine, _UIntType>::__independent_bits_engine(_Engine& __e, size_t __w)
     : __e_(__e), __w_(__w) {
-  __n_  = __w_ / __m + (__w_ % __m != 0);
-  __w0_ = __w_ / __n_;
-  if (_Rp == 0)
-    __y0_ = _Rp;
-  else if (__w0_ < _WDt)
-    __y0_ = (_Rp >> __w0_) << __w0_;
-  else
-    __y0_ = 0;
-  if (_Rp - __y0_ > __y0_ / __n_) {
-    ++__n_;
+  _LIBCPP_ASSERT_INTERNAL(
+      __w_ <= numeric_limits<result_type>::digits, "cannot sample more bits than result_type can hold");
+  _LIBCPP_ASSERT_INTERNAL(__w_ > 0, "must sample a positive number of bits");
+  if (__w_ <= __m) { // a single engine draw supplies all __w_ bits
+    __n_ = __n0_ = 1;
+    __w0_        = __w_;
+    __mask0_ = __mask1_ = _Engine_result_type(~0) >> (_EDt - __w0_); // cast before shifting: ~T(0) promotes to int
+    __y0_ = __y1_ = _Rp & ~__mask0_; // draws >= __y0_ are rejected by __eval(true_type)
+  } else { // several draws are concatenated; start from ceil(__w_ / __m) draws
+    __n_     = (__w_ + __m - 1) / __m;
     __w0_ = __w_ / __n_;
-    if (__w0_ < _WDt)
-      __y0_ = (_Rp >> __w0_) << __w0_;
-    else
-      __y0_ = 0;
+    __mask0_ = __mask1_ = _Engine_result_type(~0) >> (_EDt - __w0_);
+    __y0_ = __y1_ = _Rp & ~__mask0_;
+    if _LIBCPP_CONSTEXPR_SINCE_CXX17 ((_Rp & (_Rp - 1)) != 0) { // only a non-power-of-two range needs rejection
+      if (_Rp - __y0_ > __y0_ / __n_) {
+        ++__n_; // use one more draw; __w0_ can round down to 0 here when __m == 1
+        __w0_    = __w_ / __n_;
+        __mask0_ = __mask1_ = __w0_ > 0 ? _Engine_result_type(~0) >> (_EDt - __w0_) : _Engine_result_type(0);
+        __y0_ = __y1_ = _Rp & ~__mask0_;
+      }
+    }
+    size_t __n1 = __w_ % __n_; // number of draws that must contribute __w0_ + 1 bits
+    __n0_       = __n_ - __n1;
+    if (__n1 > 0) {
+      __mask1_ = _Engine_result_type(~0) >> (_EDt - (__w0_ + 1));
+      __y1_    = _Rp & ~__mask1_;
+    }
   }
-  __n0_ = __n_ - __w_ % __n_;
-  if (__w0_ < _WDt - 1)
-    __y1_ = (_Rp >> (__w0_ + 1)) << (__w0_ + 1);
-  else
-    __y1_ = 0;
-  __mask0_ = __w0_ > 0 ? _Engine_result_type(~0) >> (_EDt - __w0_) : _Engine_result_type(0);
-  __mask1_ = __w0_ < _EDt - 1 ? _Engine_result_type(~0) >> (_EDt - (__w0_ + 1)) : _Engine_result_type(~0);
 }
 
 template <class _Engine, class _UIntType>
 inline _UIntType __independent_bits_engine<_Engine, _UIntType>::__eval(false_type) {
-  return static_cast<result_type>(__e_() & __mask0_);
+  result_type __sp = (__e_() - _Engine::min()) & __mask0_; // first draw, rebased so the engine minimum maps to 0
+  for (size_t __k = 1; __k < __n0_; ++__k) { // remaining draws that each append __w0_ bits
+    __sp <<= __w0_;
+    __sp += (__e_() - _Engine::min()) & __mask0_;
+  }
+  for (size_t __k = __n0_; __k < __n_; ++__k) { // draws that each append __w0_ + 1 bits
+    __sp <<= __w0_ + 1;
+    __sp += (__e_() - _Engine::min()) & __mask1_;
+  }
+  return __sp; // no draw is ever rejected in this overload
 }
 
 template <class _Engine, class _UIntType>
 _UIntType __independent_bits_engine<_Engine, _UIntType>::__eval(true_type) {
-  const size_t __w_rt = numeric_limits<result_type>::digits;
-  result_type __sp    = 0;
-  for (size_t __k = 0; __k < __n0_; ++__k) {
+  result_type __sp;
+  {
+    _Engine_result_type __u;
+    do {
+      __u = __e_() - _Engine::min();
+    } while (__u >= __y0_);
+    __sp = __u & __mask0_;
+  }
+  for (size_t __k = 1; __k < __n0_; ++__k) {
     _Engine_result_type __u;
     do {
       __u = __e_() - _Engine::min();
     } while (__u >= __y0_);
-    if (__w0_ < __w_rt)
-      __sp <<= __w0_;
-    else
-      __sp = 0;
+    __sp <<= __w0_;
     __sp += __u & __mask0_;
   }
   for (size_t __k = __n0_; __k < __n_; ++__k) {
@@ -124,10 +142,7 @@ _UIntType __independent_bits_engine<_Engine, _UIntType>::__eval(true_type) {
     do {
       __u = __e_() - _Engine::min();
     } while (__u >= __y1_);
-    if (__w0_ < __w_rt - 1)
-      __sp <<= __w0_ + 1;
-    else
-      __sp = 0;
+    __sp <<= __w0_ + 1;
     __sp += __u & __mask1_;
   }
   return __sp;
@@ -218,9 +233,9 @@ typename uniform_int_distribution<_IntType>::result_type uniform_int_distributio
   typedef __independent_bits_engine<_URNG, _UIntType> _Eng;
   if (__rp == 0)
     return static_cast<result_type>(_Eng(__g, __dt)());
-  size_t __w = __dt - std::__countl_zero(__rp) - 1;
-  if ((__rp & (numeric_limits<_UIntType>::max() >> (__dt - __w))) != 0)
-    ++__w;
+  size_t __w = __dt - std::__countl_zero(__rp);
+  if ((__rp & (__rp - 1)) == 0)
+    return static_cast<result_type>(_Eng(__g, __w - 1)() + __p.a());
   _Eng __e(__g, __w);
   _UIntType __u;
   do {