[libcxx-commits] [libcxx] [llvm] Update Google Benchmark to v1.9.5 (PR #198964)

Sat May 23 20:22:15 PDT 2026

https://github.com/brad0 updated https://github.com/llvm/llvm-project/pull/198964

From da459b57a72281172e7cb4873854fc248dc895b9 Mon Sep 17 00:00:00 2001
From: Brad Smith <brad at comstyle.com>
Date: Wed, 20 May 2026 23:48:37 -0400
Subject: [PATCH] Update Google Benchmark to v1.9.5

---
 .../algorithms/min_max_element.bench.cpp      |  31 +-
 .../containers/deque_iterator.bench.cpp       |  29 +-
 libcxx/test/benchmarks/join_view.bench.cpp    |   4 +-
 third-party/benchmark/.bazelversion           |   1 +
 third-party/benchmark/.clang-tidy             |  40 +-
 third-party/benchmark/.clang-tidy.ignore      |   1 +
 third-party/benchmark/.pre-commit-config.yaml |  10 +-
 third-party/benchmark/.travis.yml             | 208 ------
 third-party/benchmark/.ycm_extra_conf.py      |   8 +-
 third-party/benchmark/AUTHORS                 |   4 +
 third-party/benchmark/CMakeLists.txt          |  50 +-
 third-party/benchmark/CONTRIBUTORS            |   5 +
 third-party/benchmark/MODULE.bazel            |  23 +-
 third-party/benchmark/README.md               |  14 +-
 .../benchmark/bazel/benchmark_deps.bzl        |  10 +-
 .../benchmark/bindings/python/build_defs.bzl  |  31 -
 .../python/google_benchmark/__init__.py       |  36 +-
 .../python/google_benchmark/benchmark.cc      |  77 ++-
 .../python/google_benchmark/example.py        |   9 +-
 .../python/google_benchmark/version.py        |   7 -
 .../benchmark/bindings/python/nanobind.BUILD  |  59 --
 .../bindings/python/python_headers.BUILD      |  10 -
 .../benchmark/cmake/CXXFeatureCheck.cmake     | 112 ++--
 third-party/benchmark/cmake/Config.cmake.in   |   5 +
 .../benchmark/cmake/GoogleTest.cmake.in       |   7 +-
 third-party/benchmark/cmake/benchmark.pc.in   |   4 +-
 .../benchmark/cmake/benchmark_main.pc.in      |   7 +
 third-party/benchmark/docs/dependencies.md    |   6 +
 .../platform_specific_build_instructions.md   |   8 +-
 .../benchmark/docs/reducing_variance.md       |  50 +-
 third-party/benchmark/docs/releasing.md       |  19 +-
 third-party/benchmark/docs/user_guide.md      | 165 ++++-
 .../benchmark/include/benchmark/benchmark.h   | 542 ++++++++-------
 third-party/benchmark/pyproject.toml          |  36 +-
 third-party/benchmark/setup.py                | 186 ++++--
 third-party/benchmark/src/CMakeLists.txt      |  31 +-
 third-party/benchmark/src/benchmark.cc        | 214 ++++--
 .../benchmark/src/benchmark_api_internal.cc   |  23 +-
 .../benchmark/src/benchmark_api_internal.h    |  19 +-
 third-party/benchmark/src/benchmark_main.cc   |   2 +-
 third-party/benchmark/src/benchmark_name.cc   |   4 +-
 .../benchmark/src/benchmark_register.cc       | 107 +--
 .../benchmark/src/benchmark_register.h        |   5 +-
 third-party/benchmark/src/benchmark_runner.cc | 235 ++++---
 third-party/benchmark/src/benchmark_runner.h  |  22 +-
 third-party/benchmark/src/check.cc            |   5 +-
 third-party/benchmark/src/check.h             |  16 +-
 third-party/benchmark/src/colorprint.cc       |  22 +-
 third-party/benchmark/src/colorprint.h        |   7 +-
 third-party/benchmark/src/commandlineflags.cc |  53 +-
 third-party/benchmark/src/commandlineflags.h  |   4 +
 third-party/benchmark/src/complexity.cc       |  25 +-
 third-party/benchmark/src/console_reporter.cc |  25 +-
 third-party/benchmark/src/counter.cc          |  18 +-
 third-party/benchmark/src/csv_reporter.cc     |  23 +-
 third-party/benchmark/src/cycleclock.h        |  29 +-
 third-party/benchmark/src/internal_macros.h   |  10 +
 third-party/benchmark/src/json_reporter.cc    |  50 +-
 third-party/benchmark/src/log.h               |  12 -
 third-party/benchmark/src/perf_counters.cc    |  12 +-
 third-party/benchmark/src/re.h                |  10 +-
 third-party/benchmark/src/reporter.cc         |  31 +-
 third-party/benchmark/src/statistics.cc       |  50 +-
 third-party/benchmark/src/string_util.cc      |  32 +-
 third-party/benchmark/src/string_util.h       |   1 -
 third-party/benchmark/src/sysinfo.cc          | 177 +++--
 third-party/benchmark/src/thread_manager.h    |  24 +-
 third-party/benchmark/src/timers.cc           |  40 +-
 third-party/benchmark/src/timers.h            |  29 +-
 third-party/benchmark/test/CMakeLists.txt     |  61 +-
 third-party/benchmark/test/basic_test.cc      |   8 +-
 third-party/benchmark/test/benchmark_gtest.cc |   2 +-
 .../benchmark_min_time_flag_iters_test.cc     |  26 +-
 .../test/benchmark_min_time_flag_time_test.cc |  33 +-
 .../benchmark_random_interleaving_gtest.cc    |   5 +-
 ...benchmark_setup_teardown_cb_types_gtest.cc | 126 ++++
 .../test/benchmark_setup_teardown_test.cc     |  22 +-
 third-party/benchmark/test/benchmark_test.cc  |  80 +--
 third-party/benchmark/test/complexity_test.cc |  83 +--
 third-party/benchmark/test/cxx03_test.cc      |  62 --
 third-party/benchmark/test/cxx11_test.cc      |  12 +
 .../benchmark/test/diagnostics_test.cc        |  27 +-
 .../test/display_aggregates_only_test.cc      |   4 +
 .../test/donotoptimize_assembly_test.cc       |   7 +-
 .../benchmark/test/donotoptimize_test.cc      |   8 +-
 third-party/benchmark/test/filter_test.cc     |  26 +-
 .../benchmark/test/internal_threading_test.cc |  15 +-
 third-party/benchmark/test/link_main_test.cc  |   5 +-
 .../test/locale_impermeability_test.cc        |  47 ++
 .../benchmark/test/manual_threading_test.cc   | 175 +++++
 third-party/benchmark/test/map_test.cc        |   7 +-
 .../benchmark/test/memory_manager_test.cc     |   7 +-
 .../benchmark/test/memory_results_gtest.cc    | 101 +++
 .../benchmark/test/multiple_ranges_test.cc    |   4 +-
 third-party/benchmark/test/options_test.cc    |   4 +-
 third-party/benchmark/test/output_test.h      |  14 +-
 .../benchmark/test/output_test_helper.cc      | 111 +++-
 third-party/benchmark/test/overload_test.cc   |  35 +
 .../benchmark/test/perf_counters_gtest.cc     |   8 +-
 .../benchmark/test/perf_counters_test.cc      |   9 +-
 .../benchmark/test/profiler_manager_gtest.cc  |  42 ++
 .../test/profiler_manager_iterations_test.cc  |  62 ++
 .../benchmark/test/profiler_manager_test.cc   |  54 ++
 .../benchmark/test/register_benchmark_test.cc |  23 +-
 .../benchmark/test/repetitions_test.cc        |  11 +-
 .../test/report_aggregates_only_test.cc       |   3 +
 .../benchmark/test/reporter_output_test.cc    |  23 +-
 .../benchmark/test/skip_with_error_test.cc    |  26 +-
 third-party/benchmark/test/spec_arg_test.cc   |  10 +-
 .../benchmark/test/spec_arg_verbosity_test.cc |   6 +-
 .../benchmark/test/state_assembly_test.cc     |   1 +
 .../benchmark/test/string_util_gtest.cc       | 184 +++---
 .../test/templated_fixture_method_test.cc     |  26 +
 third-party/benchmark/test/time_unit_gtest.cc |   4 +-
 .../test/user_counters_tabular_test.cc        |  15 +-
 .../benchmark/test/user_counters_test.cc      |  58 +-
 .../test/user_counters_thousands_test.cc      |  10 +-
 .../test/user_counters_threads_test.cc        | 622 ++++++++++++++++++
 third-party/benchmark/tools/compare.py        |  39 +-
 third-party/benchmark/tools/gbench/report.py  | 118 ++--
 third-party/benchmark/tools/gbench/util.py    |  52 +-
 .../benchmark/tools/libpfm.BUILD.bazel        | 238 ++++++-
 third-party/benchmark/tools/requirements.txt  |   4 +-
 third-party/benchmark/tools/strip_asm.py      |  16 +-
 124 files changed, 4075 insertions(+), 1887 deletions(-)
 create mode 100644 third-party/benchmark/.bazelversion
 create mode 100644 third-party/benchmark/.clang-tidy.ignore
 delete mode 100644 third-party/benchmark/.travis.yml
 delete mode 100644 third-party/benchmark/bindings/python/build_defs.bzl
 delete mode 100644 third-party/benchmark/bindings/python/google_benchmark/version.py
 delete mode 100644 third-party/benchmark/bindings/python/nanobind.BUILD
 delete mode 100644 third-party/benchmark/bindings/python/python_headers.BUILD
 create mode 100644 third-party/benchmark/cmake/benchmark_main.pc.in
 create mode 100644 third-party/benchmark/test/benchmark_setup_teardown_cb_types_gtest.cc
 delete mode 100644 third-party/benchmark/test/cxx03_test.cc
 create mode 100644 third-party/benchmark/test/cxx11_test.cc
 create mode 100644 third-party/benchmark/test/locale_impermeability_test.cc
 create mode 100644 third-party/benchmark/test/manual_threading_test.cc
 create mode 100644 third-party/benchmark/test/memory_results_gtest.cc
 create mode 100644 third-party/benchmark/test/overload_test.cc
 create mode 100644 third-party/benchmark/test/profiler_manager_gtest.cc
 create mode 100644 third-party/benchmark/test/profiler_manager_iterations_test.cc
 create mode 100644 third-party/benchmark/test/profiler_manager_test.cc
 create mode 100644 third-party/benchmark/test/templated_fixture_method_test.cc
 create mode 100644 third-party/benchmark/test/user_counters_threads_test.cc

diff --git a/libcxx/test/benchmarks/algorithms/min_max_element.bench.cpp b/libcxx/test/benchmarks/algorithms/min_max_element.bench.cpp
index dea48b826e307..616cd97c0e2d1 100644
--- a/libcxx/test/benchmarks/algorithms/min_max_element.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/min_max_element.bench.cpp
@@ -13,21 +13,22 @@
 
 #include <benchmark/benchmark.h>
 
-void run_sizes(auto benchmark) {
-  benchmark->Arg(1)
-      ->Arg(2)
-      ->Arg(3)
-      ->Arg(4)
-      ->Arg(64)
-      ->Arg(512)
-      ->Arg(1024)
-      ->Arg(4000)
-      ->Arg(4096)
-      ->Arg(5500)
-      ->Arg(64000)
-      ->Arg(65536)
-      ->Arg(70000);
-}
+auto run_sizes =
+    [](benchmark::Benchmark* benchmark) {
+      benchmark->Arg(1)
+          ->Arg(2)
+          ->Arg(3)
+          ->Arg(4)
+          ->Arg(64)
+          ->Arg(512)
+          ->Arg(1024)
+          ->Arg(4000)
+          ->Arg(4096)
+          ->Arg(5500)
+          ->Arg(64000)
+          ->Arg(65536)
+          ->Arg(70000);
+    };
 
 template <class T>
 void BM_std_minmax_element(benchmark::State& state) {
diff --git a/libcxx/test/benchmarks/containers/deque_iterator.bench.cpp b/libcxx/test/benchmarks/containers/deque_iterator.bench.cpp
index d1db8ed358c0b..8f2be22213a9b 100644
--- a/libcxx/test/benchmarks/containers/deque_iterator.bench.cpp
+++ b/libcxx/test/benchmarks/containers/deque_iterator.bench.cpp
@@ -14,20 +14,21 @@
 #include "benchmark/benchmark.h"
 
 namespace {
-void run_sizes(auto benchmark) {
-  benchmark->Arg(0)
-      ->Arg(1)
-      ->Arg(2)
-      ->Arg(64)
-      ->Arg(512)
-      ->Arg(1024)
-      ->Arg(4000)
-      ->Arg(4096)
-      ->Arg(5500)
-      ->Arg(64000)
-      ->Arg(65536)
-      ->Arg(70000);
-}
+auto run_sizes =
+    [](benchmark::Benchmark* benchmark) {
+      benchmark->Arg(0)
+          ->Arg(1)
+          ->Arg(2)
+          ->Arg(64)
+          ->Arg(512)
+          ->Arg(1024)
+          ->Arg(4000)
+          ->Arg(4096)
+          ->Arg(5500)
+          ->Arg(64000)
+          ->Arg(65536)
+          ->Arg(70000);
+    };
 
 template <class FromContainer, class ToContainer, class Func>
 void benchmark_containers(benchmark::State& state, FromContainer& d, ToContainer& v, Func&& func) {
diff --git a/libcxx/test/benchmarks/join_view.bench.cpp b/libcxx/test/benchmarks/join_view.bench.cpp
index 9f6db4a3766af..1bd81800b7e5f 100644
--- a/libcxx/test/benchmarks/join_view.bench.cpp
+++ b/libcxx/test/benchmarks/join_view.bench.cpp
@@ -15,7 +15,7 @@
 #include "benchmark/benchmark.h"
 
 namespace {
-void run_sizes(auto benchmark) {
+auto run_sizes = [](benchmark::Benchmark* benchmark) {
   benchmark->Arg(0)
       ->Arg(1)
       ->Arg(2)
@@ -28,7 +28,7 @@ void run_sizes(auto benchmark) {
       ->Arg(64000)
       ->Arg(65536)
       ->Arg(70000);
-}
+};
 
 void BM_join_view_in_vectors(benchmark::State& state) {
   auto size = state.range(0);
diff --git a/third-party/benchmark/.bazelversion b/third-party/benchmark/.bazelversion
new file mode 100644
index 0000000000000..2b0aa21219df8
--- /dev/null
+++ b/third-party/benchmark/.bazelversion
@@ -0,0 +1 @@
+8.2.1
diff --git a/third-party/benchmark/.clang-tidy b/third-party/benchmark/.clang-tidy
index 56938a598d1ee..d6cd768f6ddf9 100644
--- a/third-party/benchmark/.clang-tidy
+++ b/third-party/benchmark/.clang-tidy
@@ -1,7 +1,37 @@
 ---
-Checks:          'clang-analyzer-*,readability-redundant-*,performance-*'
-WarningsAsErrors: 'clang-analyzer-*,readability-redundant-*,performance-*'
-HeaderFilterRegex: '.*'
-AnalyzeTemporaryDtors: false
+Checks: >
+  abseil-*,
+  bugprone-*,
+  clang-analyzer-*,
+  cppcoreguidelines-*,
+  google-*,
+  misc-*,
+  performance-*,
+  readability-*,
+  -clang-analyzer-deadcode*,
+  -clang-analyzer-optin*,
+  -readability-identifier-length
+WarningsAsErrors: ''
+HeaderFilterRegex: ''
 FormatStyle:     none
-User:            user
+CheckOptions:
+  llvm-else-after-return.WarnOnConditionVariables: 'false'
+  modernize-loop-convert.MinConfidence: reasonable
+  modernize-replace-auto-ptr.IncludeStyle: llvm
+  cert-str34-c.DiagnoseSignedUnsignedCharComparisons: 'false'
+  google-readability-namespace-comments.ShortNamespaceLines: '10'
+  cert-err33-c.CheckedFunctions: '::aligned_alloc;::asctime_s;::at_quick_exit;::atexit;::bsearch;::bsearch_s;::btowc;::c16rtomb;::c32rtomb;::calloc;::clock;::cnd_broadcast;::cnd_init;::cnd_signal;::cnd_timedwait;::cnd_wait;::ctime_s;::fclose;::fflush;::fgetc;::fgetpos;::fgets;::fgetwc;::fopen;::fopen_s;::fprintf;::fprintf_s;::fputc;::fputs;::fputwc;::fputws;::fread;::freopen;::freopen_s;::fscanf;::fscanf_s;::fseek;::fsetpos;::ftell;::fwprintf;::fwprintf_s;::fwrite;::fwscanf;::fwscanf_s;::getc;::getchar;::getenv;::getenv_s;::gets_s;::getwc;::getwchar;::gmtime;::gmtime_s;::localtime;::localtime_s;::malloc;::mbrtoc16;::mbrtoc32;::mbsrtowcs;::mbsrtowcs_s;::mbstowcs;::mbstowcs_s;::memchr;::mktime;::mtx_init;::mtx_lock;::mtx_timedlock;::mtx_trylock;::mtx_unlock;::printf_s;::putc;::putwc;::raise;::realloc;::remove;::rename;::scanf;::scanf_s;::setlocale;::setvbuf;::signal;::snprintf;::snprintf_s;::sprintf;::sprintf_s;::sscanf;::sscanf_s;::strchr;::strerror_s;::strftime;::strpbrk;::strrchr;::strstr;::strtod;::strtof;::strtoimax;::strtok;::strtok_s;::strtol;::strtold;::strtoll;::strtoul;::strtoull;::strtoumax;::strxfrm;::swprintf;::swprintf_s;::swscanf;::swscanf_s;::thrd_create;::thrd_detach;::thrd_join;::thrd_sleep;::time;::timespec_get;::tmpfile;::tmpfile_s;::tmpnam;::tmpnam_s;::tss_create;::tss_get;::tss_set;::ungetc;::ungetwc;::vfprintf;::vfprintf_s;::vfscanf;::vfscanf_s;::vfwprintf;::vfwprintf_s;::vfwscanf;::vfwscanf_s;::vprintf_s;::vscanf;::vscanf_s;::vsnprintf;::vsnprintf_s;::vsprintf;::vsprintf_s;::vsscanf;::vsscanf_s;::vswprintf;::vswprintf_s;::vswscanf;::vswscanf_s;::vwprintf_s;::vwscanf;::vwscanf_s;::wcrtomb;::wcschr;::wcsftime;::wcspbrk;::wcsrchr;::wcsrtombs;::wcsrtombs_s;::wcsstr;::wcstod;::wcstof;::wcstoimax;::wcstok;::wcstok_s;::wcstol;::wcstold;::wcstoll;::wcstombs;::wcstombs_s;::wcstoul;::wcstoull;::wcstoumax;::wcsxfrm;::wctob;::wctrans;::wctype;::wmemchr;::wprintf_s;::wscanf;::wscanf_s;'
+  cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField: 'false'
+  cert-dcl16-c.NewSuffixes: 'L;LL;LU;LLU'
+  google-readability-braces-around-statements.ShortStatementLines: '1'
+  cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic: 'true'
+  google-readability-namespace-comments.SpacesBeforeComments: '2'
+  modernize-loop-convert.MaxCopySize: '16'
+  modernize-pass-by-value.IncludeStyle: llvm
+  modernize-use-nullptr.NullMacros: 'NULL'
+  llvm-qualified-auto.AddConstToQualified: 'false'
+  modernize-loop-convert.NamingStyle: CamelCase
+  llvm-else-after-return.WarnOnUnfixable: 'false'
+  google-readability-function-size.StatementThreshold: '800'
+...
+
diff --git a/third-party/benchmark/.clang-tidy.ignore b/third-party/benchmark/.clang-tidy.ignore
new file mode 100644
index 0000000000000..dba559d6cac05
--- /dev/null
+++ b/third-party/benchmark/.clang-tidy.ignore
@@ -0,0 +1 @@
+.*third_party/.*
diff --git a/third-party/benchmark/.pre-commit-config.yaml b/third-party/benchmark/.pre-commit-config.yaml
index 0247d1b062b72..57af012f7aefa 100644
--- a/third-party/benchmark/.pre-commit-config.yaml
+++ b/third-party/benchmark/.pre-commit-config.yaml
@@ -1,18 +1,18 @@
 repos:
   -   repo: https://github.com/keith/pre-commit-buildifier
-      rev: 6.4.0
+      rev: 8.2.1
       hooks:
       -   id: buildifier
       -   id: buildifier-lint
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.8.0
+    rev: v1.18.2
     hooks:
       - id: mypy
         types_or: [ python, pyi ]
         args: [ "--ignore-missing-imports", "--scripts-are-modules" ]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.13
+    rev: v0.14.0
     hooks:
-      - id: ruff
+      - id: ruff-check
         args: [ --fix, --exit-non-zero-on-fix ]
-      - id: ruff-format
\ No newline at end of file
+      - id: ruff-format
diff --git a/third-party/benchmark/.travis.yml b/third-party/benchmark/.travis.yml
deleted file mode 100644
index 8cfed3d10dab5..0000000000000
--- a/third-party/benchmark/.travis.yml
+++ /dev/null
@@ -1,208 +0,0 @@
-sudo: required
-dist: trusty
-language: cpp
-
-matrix:
-  include:
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - lcov
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - g++-multilib
-            - libc6:i386
-      env:
-        - COMPILER=g++
-        - C_COMPILER=gcc
-        - BUILD_TYPE=Debug
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-m32"
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - g++-multilib
-            - libc6:i386
-      env:
-        - COMPILER=g++
-        - C_COMPILER=gcc
-        - BUILD_TYPE=Release
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-m32"
-    - compiler: gcc
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=g++-6 C_COMPILER=gcc-6  BUILD_TYPE=Debug
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
-    # Clang w/ libc++
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
-        - LIBCXX_BUILD=1
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    # Clang w/ 32bit libc++
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            - clang-3.8
-            - g++-multilib
-            - libc6:i386
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-m32"
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    # Clang w/ 32bit libc++
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            - clang-3.8
-            - g++-multilib
-            - libc6:i386
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
-        - LIBCXX_BUILD=1
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-m32"
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    # Clang w/ libc++, ASAN, UBSAN
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address"
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all"
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-        - UBSAN_OPTIONS=print_stacktrace=1
-    # Clang w/ libc++ and MSAN
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins"
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    # Clang w/ libc++ and MSAN
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
-        - COMPILER=clang++
-        - BUILD_TYPE=Release
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-m32"
-
-before_script:
-  - if [ -n "${LIBCXX_BUILD}" ]; then
-      source .libcxx-setup.sh;
-    fi
-  - if [ -n "${ENABLE_SANITIZER}" ]; then
-      export EXTRA_OPTIONS="-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF";
-    else
-      export EXTRA_OPTIONS="";
-    fi
-  - mkdir -p build && cd build
-
-before_install:
-  - if [ -z "$BUILD_32_BITS" ]; then
-      export BUILD_32_BITS=OFF && echo disabling 32 bit build;
-    fi
-  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
-      sudo add-apt-repository -y "ppa:ubuntu-toolchain-r/test";
-      sudo apt-get update --option Acquire::Retries=100 --option Acquire::http::Timeout="60";
-    fi
-
-install:
-  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
-      travis_wait sudo -E apt-get -yq --no-install-suggests --no-install-recommends install g++-6;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "linux" -a "${BUILD_32_BITS}" == "OFF" ]; then
-      travis_wait sudo -E apt-get -y --no-install-suggests --no-install-recommends install llvm-3.9-tools;
-      sudo cp /usr/lib/llvm-3.9/bin/FileCheck /usr/local/bin/;
-    fi
-  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
-      PATH=~/.local/bin:${PATH};
-      pip install --user --upgrade pip;
-      travis_wait pip install --user cpp-coveralls;
-    fi
-  - if [ "${C_COMPILER}" == "gcc-7" -a "${TRAVIS_OS_NAME}" == "osx" ]; then
-      rm -f /usr/local/include/c++;
-      brew update;
-      travis_wait brew install gcc@7;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "linux" ]; then
-      sudo apt-get update -qq;
-      sudo apt-get install -qq unzip cmake3;
-      wget https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-linux-x86_64.sh --output-document bazel-installer.sh;
-      travis_wait sudo bash bazel-installer.sh;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
-      curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-darwin-x86_64.sh;
-      travis_wait sudo bash bazel-installer.sh;
-    fi
-
-script:
-  - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_C_FLAGS="${EXTRA_FLAGS}" -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS} ${EXTRA_CXX_FLAGS}" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ${EXTRA_OPTIONS} ..
-  - make
-  - ctest -C ${BUILD_TYPE} --output-on-failure
-  - bazel test -c dbg --define google_benchmark.have_regex=posix --announce_rc --verbose_failures --test_output=errors --keep_going //test/...
-
-after_success:
-  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
-      coveralls --include src --include include --gcov-options '\-lp' --root .. --build-root .;
-    fi
diff --git a/third-party/benchmark/.ycm_extra_conf.py b/third-party/benchmark/.ycm_extra_conf.py
index caf257f0540e1..ffef1b4daf9d9 100644
--- a/third-party/benchmark/.ycm_extra_conf.py
+++ b/third-party/benchmark/.ycm_extra_conf.py
@@ -83,10 +83,10 @@ def IsHeaderFile(filename):
 
 
 def GetCompilationInfoForFile(filename):
-    # The compilation_commands.json file generated by CMake does not have entries
-    # for header files. So we do our best by asking the db for flags for a
-    # corresponding source file, if any. If one exists, the flags for that file
-    # should be good enough.
+    # The compilation_commands.json file generated by CMake does not have
+    # entries for header files. So we do our best by asking the db for flags for
+    # a corresponding source file, if any. If one exists, the flags for that
+    # file should be good enough.
     if IsHeaderFile(filename):
         basename = os.path.splitext(filename)[0]
         for extension in SOURCE_EXTENSIONS:
diff --git a/third-party/benchmark/AUTHORS b/third-party/benchmark/AUTHORS
index 2170e46fd4a05..11d28f7229ed5 100644
--- a/third-party/benchmark/AUTHORS
+++ b/third-party/benchmark/AUTHORS
@@ -44,6 +44,7 @@ Jordan Williams <jwillikers at protonmail.com>
 Jussi Knuuttila <jussi.knuuttila at gmail.com>
 Kaito Udagawa <umireon at gmail.com>
 Kishan Kumar <kumar.kishan at outlook.com>
+Kostiantyn Lazukin <konstantin.lazukin at gmail.com>
 Lei Xu <eddyxu at gmail.com>
 Marcel Jacobse <mjacobse at uni-bremen.de>
 Matt Clarkson <mattyclarkson at gmail.com>
@@ -54,14 +55,17 @@ MongoDB Inc.
 Nick Hutchinson <nshutchinson at gmail.com>
 Norman Heino <norman.heino at gmail.com>
 Oleksandr Sochka <sasha.sochka at gmail.com>
+Olga Fadeeva <olga.kiselik at gmail.com>
 Ori Livneh <ori.livneh at gmail.com>
 Paul Redmond <paul.redmond at gmail.com>
+Prithvi Rao <ee22b024 at smail.iitm.ac.in>
 Radoslav Yovchev <radoslav.tm at gmail.com>
 Raghu Raja <raghu at enfabrica.net>
 Rainer Orth <ro at cebitec.uni-bielefeld.de>
 Roman Lebedev <lebedev.ri at gmail.com>
 Sayan Bhattacharjee <aero.sayan at gmail.com>
 Shapr3D <google-contributors at shapr3d.com>
+Shashank Thakur <shashankt2004 at gmail.com>
 Shuo Chen <chenshuo at chenshuo.com>
 Staffan Tjernstrom <staffantj at gmail.com>
 Steinar H. Gunderson <sgunderson at bigfoot.com>
diff --git a/third-party/benchmark/CMakeLists.txt b/third-party/benchmark/CMakeLists.txt
index d9bcc6a4939be..ada04a61c45b4 100644
--- a/third-party/benchmark/CMakeLists.txt
+++ b/third-party/benchmark/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Require CMake 3.10. If available, use the policies up to CMake 3.22.
-cmake_minimum_required (VERSION 3.10...3.22)
+cmake_minimum_required (VERSION 3.13...3.22)
 
-project (benchmark VERSION 1.8.3 LANGUAGES CXX)
+project (benchmark VERSION 1.9.5 LANGUAGES CXX)
 
 option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
 option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
@@ -29,6 +29,7 @@ endif()
 option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON)
 option(BENCHMARK_ENABLE_DOXYGEN "Build documentation with Doxygen." OFF)
 option(BENCHMARK_INSTALL_DOCS "Enable installation of documentation." ON)
+option(BENCHMARK_INSTALL_TOOLS "Enable installation of tools." ON)
 
 # Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which
 # may require downloading the source code.
@@ -104,7 +105,7 @@ get_git_version(GIT_VERSION)
 
 # If no git version can be determined, use the version
 # from the project() command
-if ("${GIT_VERSION}" STREQUAL "0.0.0")
+if ("${GIT_VERSION}" STREQUAL "v0.0.0")
   set(VERSION "v${benchmark_VERSION}")
 else()
   set(VERSION "${GIT_VERSION}")
@@ -138,11 +139,7 @@ if (BENCHMARK_BUILD_32_BITS)
   add_required_cxx_compiler_flag(-m32)
 endif()
 
-if (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC")
-  set(BENCHMARK_CXX_STANDARD 14)
-else()
-  set(BENCHMARK_CXX_STANDARD 11)
-endif()
+set(BENCHMARK_CXX_STANDARD 17)
 
 set(CMAKE_CXX_STANDARD ${BENCHMARK_CXX_STANDARD})
 set(CMAKE_CXX_STANDARD_REQUIRED YES)
@@ -152,8 +149,17 @@ if (MSVC)
   # Turn compiler warnings up to 11
   string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
+
+  # MP flag only applies to cl, not cl frontends to other compilers (e.g. clang-cl, icx-cl etc)
+  if(CMAKE_CXX_COMPILER_ID MATCHES MSVC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+  endif()
   add_definitions(-D_CRT_SECURE_NO_WARNINGS)
 
+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-WX)
+  endif()
+
   if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
     add_cxx_compiler_flag(-EHs-)
     add_cxx_compiler_flag(-EHa-)
@@ -190,6 +196,8 @@ else()
   add_cxx_compiler_flag(-Wshadow)
   add_cxx_compiler_flag(-Wfloat-equal)
   add_cxx_compiler_flag(-Wold-style-cast)
+  add_cxx_compiler_flag(-Wconversion)
+  add_cxx_compiler_flag(-Wformat=2)
   if(BENCHMARK_ENABLE_WERROR)
       add_cxx_compiler_flag(-Werror)
   endif()
@@ -210,6 +218,9 @@ else()
     # See #631 for rationale.
     add_cxx_compiler_flag(-wd1786)
     add_cxx_compiler_flag(-fno-finite-math-only)
+    # ICC17u2: overloaded virtual function "benchmark::Fixture::SetUp" is only partially
+    # overridden (because of deprecated overload)
+    add_cxx_compiler_flag(-wd654)
   endif()
   # Disable deprecation warnings for release builds (when -Werror is enabled).
   if(BENCHMARK_ENABLE_WERROR)
@@ -224,9 +235,7 @@ else()
       add_cxx_compiler_flag(-Wstrict-aliasing)
     endif()
   endif()
-  # ICC17u2: overloaded virtual function "benchmark::Fixture::SetUp" is only partially overridden
-  # (because of deprecated overload)
-  add_cxx_compiler_flag(-wd654)
+
   add_cxx_compiler_flag(-Wthread-safety)
   if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
     cxx_feature_check(THREAD_SAFETY_ATTRIBUTES "-DINCLUDE_DIRECTORIES=${PROJECT_SOURCE_DIR}/include")
@@ -300,17 +309,18 @@ if (BENCHMARK_USE_LIBCXX)
   endif()
 endif(BENCHMARK_USE_LIBCXX)
 
-set(EXTRA_CXX_FLAGS "")
-if (WIN32 AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-  # Clang on Windows fails to compile the regex feature check under C++11
-  set(EXTRA_CXX_FLAGS "-DCMAKE_CXX_STANDARD=14")
+# C++ feature checks
+# Determine the correct regular expression engine to use. First compatible engine found is used.
+cxx_feature_check(STD_REGEX)
+
+if(NOT HAVE_STD_REGEX)
+  cxx_feature_check(GNU_POSIX_REGEX)
+endif()
+
+if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX)
+  cxx_feature_check(POSIX_REGEX)
 endif()
 
-# C++ feature checks
-# Determine the correct regular expression engine to use
-cxx_feature_check(STD_REGEX ${EXTRA_CXX_FLAGS})
-cxx_feature_check(GNU_POSIX_REGEX ${EXTRA_CXX_FLAGS})
-cxx_feature_check(POSIX_REGEX ${EXTRA_CXX_FLAGS})
 if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
   message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
 endif()
diff --git a/third-party/benchmark/CONTRIBUTORS b/third-party/benchmark/CONTRIBUTORS
index 9ca2caa3ee784..52e49cce46e68 100644
--- a/third-party/benchmark/CONTRIBUTORS
+++ b/third-party/benchmark/CONTRIBUTORS
@@ -42,6 +42,7 @@ Dominic Hamon <dma at stripysock.com> <dominic at google.com>
 Dominik Czarnota <dominik.b.czarnota at gmail.com>
 Dominik Korman <kormandominik at gmail.com>
 Donald Aingworth <donalds_junk_mail at yahoo.com>
+Doug Evans <xdje42 at gmail.com>
 Eric Backus <eric_backus at alum.mit.edu>
 Eric Fiselier <eric at efcs.ca>
 Eugene Zhuk <eugene.zhuk at gmail.com>
@@ -66,6 +67,7 @@ Jussi Knuuttila <jussi.knuuttila at gmail.com>
 Kaito Udagawa <umireon at gmail.com>
 Kai Wolf <kai.wolf at gmail.com>
 Kishan Kumar <kumar.kishan at outlook.com>
+Kostiantyn Lazukin <konstantin.lazukin at gmail.com>
 Lei Xu <eddyxu at gmail.com>
 Marcel Jacobse <mjacobse at uni-bremen.de>
 Matt Clarkson <mattyclarkson at gmail.com>
@@ -75,10 +77,12 @@ Min-Yih Hsu <yihshyng223 at gmail.com>
 Nick Hutchinson <nshutchinson at gmail.com>
 Norman Heino <norman.heino at gmail.com>
 Oleksandr Sochka <sasha.sochka at gmail.com>
+Olga Fadeeva <olga.kiselik at gmail.com>
 Ori Livneh <ori.livneh at gmail.com>
 Pascal Leroy <phl at google.com>
 Paul Redmond <paul.redmond at gmail.com>
 Pierre Phaneuf <pphaneuf at google.com>
+Prithvi Rao <ee22b024 at smail.iitm.ac.in>
 Radoslav Yovchev <radoslav.tm at gmail.com>
 Raghu Raja <raghu at enfabrica.net>
 Rainer Orth <ro at cebitec.uni-bielefeld.de>
@@ -87,6 +91,7 @@ Ray Glover <ray.glover at uk.ibm.com>
 Robert Guo <robert.guo at mongodb.com>
 Roman Lebedev <lebedev.ri at gmail.com>
 Sayan Bhattacharjee <aero.sayan at gmail.com>
+Shashank Thakur <shashankt2004 at gmail.com>
 Shuo Chen <chenshuo at chenshuo.com>
 Steven Wan <wan.yu at ibm.com>
 Tobias Schmidt <tobias.schmidt at in.tum.de>
diff --git a/third-party/benchmark/MODULE.bazel b/third-party/benchmark/MODULE.bazel
index 7e0e0161235b4..c162d05313b14 100644
--- a/third-party/benchmark/MODULE.bazel
+++ b/third-party/benchmark/MODULE.bazel
@@ -1,17 +1,16 @@
 module(
     name = "google_benchmark",
-    version = "1.8.3",
+    version = "1.9.5",
 )
 
-bazel_dep(name = "bazel_skylib", version = "1.5.0")
-bazel_dep(name = "platforms", version = "0.0.7")
-bazel_dep(name = "rules_foreign_cc", version = "0.10.1")
+bazel_dep(name = "bazel_skylib", version = "1.7.1")
+bazel_dep(name = "platforms", version = "0.0.10")
 bazel_dep(name = "rules_cc", version = "0.0.9")
 
-bazel_dep(name = "rules_python", version = "0.27.1", dev_dependency = True)
-bazel_dep(name = "googletest", version = "1.12.1", dev_dependency = True, repo_name = "com_google_googletest")
+bazel_dep(name = "rules_python", version = "1.0.0", dev_dependency = True)
+bazel_dep(name = "googletest", version = "1.14.0", dev_dependency = True, repo_name = "com_google_googletest")
 
-bazel_dep(name = "libpfm", version = "4.11.0")
+bazel_dep(name = "libpfm", version = "4.11.0.bcr.1")
 
 # Register a toolchain for Python 3.9 to be able to build numpy. Python
 # versions >=3.10 are problematic.
@@ -19,7 +18,15 @@ bazel_dep(name = "libpfm", version = "4.11.0")
 # of relying on the changing default version from rules_python.
 
 python = use_extension("@rules_python//python/extensions:python.bzl", "python", dev_dependency = True)
+python.toolchain(python_version = "3.8")
 python.toolchain(python_version = "3.9")
+python.toolchain(python_version = "3.10")
+python.toolchain(python_version = "3.11")
+python.toolchain(
+    is_default = True,
+    python_version = "3.12",
+)
+python.toolchain(python_version = "3.13")
 
 pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip", dev_dependency = True)
 pip.parse(
@@ -30,3 +37,5 @@ pip.parse(
 use_repo(pip, "tools_pip_deps")
 
 # -- bazel_dep definitions -- #
+
+bazel_dep(name = "nanobind_bazel", version = "2.9.2", dev_dependency = True)
diff --git a/third-party/benchmark/README.md b/third-party/benchmark/README.md
index a5e5d392d8262..1d4470e8ed7b4 100644
--- a/third-party/benchmark/README.md
+++ b/third-party/benchmark/README.md
@@ -2,9 +2,9 @@
 
 [![build-and-test](https://github.com/google/benchmark/workflows/build-and-test/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Abuild-and-test)
 [![bazel](https://github.com/google/benchmark/actions/workflows/bazel.yml/badge.svg)](https://github.com/google/benchmark/actions/workflows/bazel.yml)
-[![pylint](https://github.com/google/benchmark/workflows/pylint/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Apylint)
 [![test-bindings](https://github.com/google/benchmark/workflows/test-bindings/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Atest-bindings)
 [![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark)
+[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/google/benchmark/badge)](https://securityscorecards.dev/viewer/?uri=github.com/google/benchmark)
 
 [![Discord](https://discordapp.com/api/guilds/1125694995928719494/widget.png?style=shield)](https://discord.gg/cz7UX7wKC2)
 
@@ -50,15 +50,13 @@ IRC channels:
 
 ## Requirements
 
-The library can be used with C++03. However, it requires C++11 to build,
+The library can be used with C++11. However, it requires C++17 to build,
 including compiler and standard library support.
 
-The following minimum versions are required to build the library:
+_See [dependencies.md](docs/dependencies.md) for more details regarding supported
+compilers and standards._
 
-* GCC 4.8
-* Clang 3.4
-* Visual Studio 14 2015
-* Intel 2015 Update 1
+If you have need for a particular compiler to be supported, patches are very welcome.
 
 See [Platform-Specific Build Instructions](docs/platform_specific_build_instructions.md).
 
@@ -80,7 +78,7 @@ $ cmake -E make_directory "build"
 # Generate build system files with cmake, and download any dependencies.
 $ cmake -E chdir "build" cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=on -DCMAKE_BUILD_TYPE=Release ../
 # or, starting with CMake 3.13, use a simpler form:
-# cmake -DCMAKE_BUILD_TYPE=Release -S . -B "build"
+# cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=on -DCMAKE_BUILD_TYPE=Release -S . -B "build"
 # Build the library.
 $ cmake --build "build" --config Release
 ```
diff --git a/third-party/benchmark/bazel/benchmark_deps.bzl b/third-party/benchmark/bazel/benchmark_deps.bzl
index 4fb45a538d4a7..a6be60241315f 100644
--- a/third-party/benchmark/bazel/benchmark_deps.bzl
+++ b/third-party/benchmark/bazel/benchmark_deps.bzl
@@ -18,14 +18,6 @@ def benchmark_deps():
             ],
         )
 
-    if "rules_foreign_cc" not in native.existing_rules():
-        http_archive(
-            name = "rules_foreign_cc",
-            sha256 = "476303bd0f1b04cc311fc258f1708a5f6ef82d3091e53fd1977fa20383425a6a",
-            strip_prefix = "rules_foreign_cc-0.10.1",
-            url = "https://github.com/bazelbuild/rules_foreign_cc/releases/download/0.10.1/rules_foreign_cc-0.10.1.tar.gz",
-        )
-
     if "rules_python" not in native.existing_rules():
         http_archive(
             name = "rules_python",
@@ -45,7 +37,7 @@ def benchmark_deps():
         new_git_repository(
             name = "nanobind",
             remote = "https://github.com/wjakob/nanobind.git",
-            tag = "v1.8.0",
+            tag = "v1.9.2",
             build_file = "@//bindings/python:nanobind.BUILD",
             recursive_init_submodules = True,
         )
diff --git a/third-party/benchmark/bindings/python/build_defs.bzl b/third-party/benchmark/bindings/python/build_defs.bzl
deleted file mode 100644
index d520eda616393..0000000000000
--- a/third-party/benchmark/bindings/python/build_defs.bzl
+++ /dev/null
@@ -1,31 +0,0 @@
-"""
-This file contains some build definitions for C++ extensions used in the Google Benchmark Python bindings.
-"""
-
-load("//third_party/bazel_rules/rules_cc/cc:cc_binary.bzl", "cc_binary")
-
-_SHARED_LIB_SUFFIX = {
-    "//conditions:default": ".so",
-    "//:windows": ".dll",
-}
-
-def py_extension(name, srcs, hdrs = [], copts = [], features = [], deps = []):
-    for shared_lib_suffix in _SHARED_LIB_SUFFIX.values():
-        shared_lib_name = name + shared_lib_suffix
-        cc_binary(
-            name = shared_lib_name,
-            linkshared = True,
-            linkstatic = True,
-            srcs = srcs + hdrs,
-            copts = copts,
-            features = features,
-            deps = deps,
-        )
-
-    return native.py_library(
-        name = name,
-        data = select({
-            platform: [name + shared_lib_suffix]
-            for platform, shared_lib_suffix in _SHARED_LIB_SUFFIX.items()
-        }),
-    )
diff --git a/third-party/benchmark/bindings/python/google_benchmark/__init__.py b/third-party/benchmark/bindings/python/google_benchmark/__init__.py
index e14769f451fe7..331a88e9b5d79 100644
--- a/third-party/benchmark/bindings/python/google_benchmark/__init__.py
+++ b/third-party/benchmark/bindings/python/google_benchmark/__init__.py
@@ -26,9 +26,8 @@ def my_benchmark(state):
   if __name__ == '__main__':
     benchmark.main()
 """
-import atexit
 
-from absl import app
+import atexit
 
 from google_benchmark import _benchmark
 from google_benchmark._benchmark import (
@@ -48,7 +47,8 @@ def my_benchmark(state):
     oNone as oNone,
     oNSquared as oNSquared,
 )
-from google_benchmark.version import __version__ as __version__
+
+__version__ = "1.9.5"
 
 
 class __OptionMaker:
@@ -58,7 +58,8 @@ class __OptionMaker:
     """
 
     class Options:
-        """Pure data class to store options calls, along with the benchmarked function."""
+        """Pure data class to store options calls, along with the benchmarked
+        function."""
 
         def __init__(self, func):
             self.func = func
@@ -81,8 +82,8 @@ def __builder_method(*args, **kwargs):
             def __decorator(func_or_options):
                 options = self.make(func_or_options)
                 options.builder_calls.append((builder_name, args, kwargs))
-                # The decorator returns Options so it is not technically a decorator
-                # and needs a final call to @register
+                # The decorator returns Options so it is not technically a
+                # decorator and needs a final call to @register
                 return options
 
             return __decorator
@@ -91,8 +92,8 @@ def __decorator(func_or_options):
 
 
 # Alias for nicer API.
-# We have to instantiate an object, even if stateless, to be able to use __getattr__
-# on option.range
+# We have to instantiate an object, even if stateless, to be able to use
+# __getattr__ on option.range
 option = __OptionMaker()
 
 
@@ -102,8 +103,8 @@ def register(undefined=None, *, name=None):
         # Decorator is called without parenthesis so we return a decorator
         return lambda f: register(f, name=name)
 
-    # We have either the function to benchmark (simple case) or an instance of Options
-    # (@option._ case).
+    # We have either the function to benchmark (simple case) or an instance of
+    # Options (@option._ case).
     options = __OptionMaker.make(undefined)
 
     if name is None:
@@ -119,22 +120,17 @@ def register(undefined=None, *, name=None):
     return options.func
 
 
-def _flags_parser(argv):
-    argv = _benchmark.Initialize(argv)
-    return app.parse_flags_with_usage(argv)
+def main(argv: list[str] | None = None) -> None:
+    import sys
 
-
-def _run_benchmarks(argv):
-    if len(argv) > 1:
-        raise app.UsageError("Too many command-line arguments.")
+    _benchmark.Initialize(argv or sys.argv)
     return _benchmark.RunSpecifiedBenchmarks()
 
 
-def main(argv=None):
-    return app.run(_run_benchmarks, argv=argv, flags_parser=_flags_parser)
-
+# FIXME: can we rerun with disabled ASLR?
 
 # Methods for use with custom main function.
 initialize = _benchmark.Initialize
 run_benchmarks = _benchmark.RunSpecifiedBenchmarks
+add_custom_context = _benchmark.AddCustomContext
 atexit.register(_benchmark.ClearRegisteredBenchmarks)
diff --git a/third-party/benchmark/bindings/python/google_benchmark/benchmark.cc b/third-party/benchmark/bindings/python/google_benchmark/benchmark.cc
index f44476901cae7..ccd7eb5a503d7 100644
--- a/third-party/benchmark/bindings/python/google_benchmark/benchmark.cc
+++ b/third-party/benchmark/bindings/python/google_benchmark/benchmark.cc
@@ -14,16 +14,18 @@ namespace {
 namespace nb = nanobind;
 
 std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
-  // The `argv` pointers here become invalid when this function returns, but
-  // benchmark holds the pointer to `argv[0]`. We create a static copy of it
-  // so it persists, and replace the pointer below.
-  static std::string executable_name(argv[0]);
   std::vector<char*> ptrs;
   ptrs.reserve(argv.size());
   for (auto& arg : argv) {
     ptrs.push_back(const_cast<char*>(arg.c_str()));
   }
-  ptrs[0] = const_cast<char*>(executable_name.c_str());
+  if (!ptrs.empty()) {
+    // The `argv` pointers here become invalid when this function returns, but
+    // benchmark holds the pointer to `argv[0]`. We create a static copy of it
+    // so it persists, and replace the pointer below.
+    static std::string executable_name(argv[0]);
+    ptrs[0] = const_cast<char*>(executable_name.c_str());
+  }
   int argc = static_cast<int>(argv.size());
   benchmark::Initialize(&argc, ptrs.data());
   std::vector<std::string> remaining_argv;
@@ -34,14 +36,13 @@ std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
   return remaining_argv;
 }
 
-benchmark::internal::Benchmark* RegisterBenchmark(const std::string& name,
-                                                  nb::callable f) {
+benchmark::Benchmark* RegisterBenchmark(const std::string& name,
+                                        nb::callable f) {
   return benchmark::RegisterBenchmark(
       name, [f](benchmark::State& state) { f(&state); });
 }
 
 NB_MODULE(_benchmark, m) {
-
   using benchmark::TimeUnit;
   nb::enum_<TimeUnit>(m, "TimeUnit")
       .value("kNanosecond", TimeUnit::kNanosecond)
@@ -63,7 +64,7 @@ NB_MODULE(_benchmark, m) {
       .value("oLambda", BigO::oLambda)
       .export_values();
 
-  using benchmark::internal::Benchmark;
+  using benchmark::Benchmark;
   nb::class_<Benchmark>(m, "Benchmark")
       // For methods returning a pointer to the current object, reference
       // return policy is used to ask nanobind not to take ownership of the
@@ -78,47 +79,40 @@ NB_MODULE(_benchmark, m) {
       .def("args", &Benchmark::Args, nb::rv_policy::reference)
       .def("range", &Benchmark::Range, nb::rv_policy::reference,
            nb::arg("start"), nb::arg("limit"))
-      .def("dense_range", &Benchmark::DenseRange,
-           nb::rv_policy::reference, nb::arg("start"),
-           nb::arg("limit"), nb::arg("step") = 1)
+      .def("dense_range", &Benchmark::DenseRange, nb::rv_policy::reference,
+           nb::arg("start"), nb::arg("limit"), nb::arg("step") = 1)
       .def("ranges", &Benchmark::Ranges, nb::rv_policy::reference)
-      .def("args_product", &Benchmark::ArgsProduct,
-           nb::rv_policy::reference)
+      .def("args_product", &Benchmark::ArgsProduct, nb::rv_policy::reference)
       .def("arg_name", &Benchmark::ArgName, nb::rv_policy::reference)
-      .def("arg_names", &Benchmark::ArgNames,
-           nb::rv_policy::reference)
-      .def("range_pair", &Benchmark::RangePair,
-           nb::rv_policy::reference, nb::arg("lo1"), nb::arg("hi1"),
-           nb::arg("lo2"), nb::arg("hi2"))
+      .def("arg_names", &Benchmark::ArgNames, nb::rv_policy::reference)
+      .def("range_pair", &Benchmark::RangePair, nb::rv_policy::reference,
+           nb::arg("lo1"), nb::arg("hi1"), nb::arg("lo2"), nb::arg("hi2"))
       .def("range_multiplier", &Benchmark::RangeMultiplier,
            nb::rv_policy::reference)
       .def("min_time", &Benchmark::MinTime, nb::rv_policy::reference)
       .def("min_warmup_time", &Benchmark::MinWarmUpTime,
            nb::rv_policy::reference)
-      .def("iterations", &Benchmark::Iterations,
-           nb::rv_policy::reference)
-      .def("repetitions", &Benchmark::Repetitions,
-           nb::rv_policy::reference)
+      .def("iterations", &Benchmark::Iterations, nb::rv_policy::reference)
+      .def("repetitions", &Benchmark::Repetitions, nb::rv_policy::reference)
       .def("report_aggregates_only", &Benchmark::ReportAggregatesOnly,
            nb::rv_policy::reference, nb::arg("value") = true)
       .def("display_aggregates_only", &Benchmark::DisplayAggregatesOnly,
            nb::rv_policy::reference, nb::arg("value") = true)
       .def("measure_process_cpu_time", &Benchmark::MeasureProcessCPUTime,
            nb::rv_policy::reference)
-      .def("use_real_time", &Benchmark::UseRealTime,
-           nb::rv_policy::reference)
+      .def("use_real_time", &Benchmark::UseRealTime, nb::rv_policy::reference)
       .def("use_manual_time", &Benchmark::UseManualTime,
            nb::rv_policy::reference)
       .def(
           "complexity",
           (Benchmark * (Benchmark::*)(benchmark::BigO)) & Benchmark::Complexity,
-          nb::rv_policy::reference,
-          nb::arg("complexity") = benchmark::oAuto);
+          nb::rv_policy::reference, nb::arg("complexity") = benchmark::oAuto);
 
   using benchmark::Counter;
   nb::class_<Counter> py_counter(m, "Counter");
 
-  nb::enum_<Counter::Flags>(py_counter, "Flags")
+  nb::enum_<Counter::Flags>(py_counter, "Flags", nb::is_arithmetic(),
+                            nb::is_flag())
       .value("kDefaults", Counter::Flags::kDefaults)
       .value("kIsRate", Counter::Flags::kIsRate)
       .value("kAvgThreads", Counter::Flags::kAvgThreads)
@@ -129,8 +123,7 @@ NB_MODULE(_benchmark, m) {
       .value("kAvgIterations", Counter::Flags::kAvgIterations)
       .value("kAvgIterationsRate", Counter::Flags::kAvgIterationsRate)
       .value("kInvert", Counter::Flags::kInvert)
-      .export_values()
-      .def(nb::self | nb::self);
+      .export_values();
 
   nb::enum_<Counter::OneK>(py_counter, "OneK")
       .value("kIs1000", Counter::OneK::kIs1000)
@@ -141,7 +134,8 @@ NB_MODULE(_benchmark, m) {
       .def(nb::init<double, Counter::Flags, Counter::OneK>(),
            nb::arg("value") = 0., nb::arg("flags") = Counter::kDefaults,
            nb::arg("k") = Counter::kIs1000)
-      .def("__init__", ([](Counter *c, double value) { new (c) Counter(value); }))
+      .def("__init__",
+           ([](Counter* c, double value) { new (c) Counter(value); }))
       .def_rw("value", &Counter::value)
       .def_rw("flags", &Counter::flags)
       .def_rw("oneK", &Counter::oneK)
@@ -161,13 +155,21 @@ NB_MODULE(_benchmark, m) {
       .def_prop_ro("error_occurred", &State::error_occurred)
       .def("set_iteration_time", &State::SetIterationTime)
       .def_prop_rw("bytes_processed", &State::bytes_processed,
-                    &State::SetBytesProcessed)
+                   &State::SetBytesProcessed)
       .def_prop_rw("complexity_n", &State::complexity_length_n,
-                    &State::SetComplexityN)
+                   &State::SetComplexityN)
       .def_prop_rw("items_processed", &State::items_processed,
                    &State::SetItemsProcessed)
       .def("set_label", &State::SetLabel)
-      .def("range", &State::range, nb::arg("pos") = 0)
+      .def(
+          "range",
+          [](const State& state, std::size_t pos = 0) -> int64_t {
+            if (pos < state.range_size()) {
+              return state.range(pos);
+            }
+            throw nb::index_error("pos is out of range");
+          },
+          nb::arg("pos") = 0)
       .def_prop_ro("iterations", &State::iterations)
       .def_prop_ro("name", &State::name)
       .def_rw("counters", &State::counters)
@@ -175,10 +177,13 @@ NB_MODULE(_benchmark, m) {
       .def_prop_ro("threads", &State::threads);
 
   m.def("Initialize", Initialize);
-  m.def("RegisterBenchmark", RegisterBenchmark,
-        nb::rv_policy::reference);
+  m.def("RegisterBenchmark", RegisterBenchmark, nb::rv_policy::reference);
   m.def("RunSpecifiedBenchmarks",
         []() { benchmark::RunSpecifiedBenchmarks(); });
   m.def("ClearRegisteredBenchmarks", benchmark::ClearRegisteredBenchmarks);
+  m.def("AddCustomContext", benchmark::AddCustomContext, nb::arg("key"),
+        nb::arg("value"),
+        "Add a key-value pair to output as part of the context stanza in the "
+        "report.");
 };
 }  // namespace
diff --git a/third-party/benchmark/bindings/python/google_benchmark/example.py b/third-party/benchmark/bindings/python/google_benchmark/example.py
index b5b2f88ff3069..8217b409e0186 100644
--- a/third-party/benchmark/bindings/python/google_benchmark/example.py
+++ b/third-party/benchmark/bindings/python/google_benchmark/example.py
@@ -13,7 +13,8 @@
 # limitations under the License.
 """Example of Python using C++ benchmark framework.
 
-To run this example, you must first install the `google_benchmark` Python package.
+To run this example, you must first install the `google_benchmark` Python
+package.
 
 To install using `setup.py`, download and extract the `google_benchmark` source.
 In the extracted directory, execute:
@@ -21,6 +22,7 @@
 """
 
 import random
+import sys
 import time
 
 import google_benchmark as benchmark
@@ -57,10 +59,11 @@ def skipped(state):
         state.skip_with_error("some error")
         return  # NOTE: You must explicitly return, or benchmark will continue.
 
-    ...  # Benchmark code would be here.
+    # Benchmark code would be here.
 
 
 @benchmark.register
+@benchmark.option.use_manual_time()
 def manual_timing(state):
     while state:
         # Manually count Python CPU time
@@ -77,7 +80,6 @@ def custom_counters(state):
     num_foo = 0.0
     while state:
         # Benchmark some code here
-        pass
         # Collect some custom metric named foo
         num_foo += 0.13
 
@@ -136,4 +138,5 @@ def computing_complexity(state):
 
 
 if __name__ == "__main__":
+    benchmark.add_custom_context("python", sys.version)
     benchmark.main()
diff --git a/third-party/benchmark/bindings/python/google_benchmark/version.py b/third-party/benchmark/bindings/python/google_benchmark/version.py
deleted file mode 100644
index a324693e2d7b5..0000000000000
--- a/third-party/benchmark/bindings/python/google_benchmark/version.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from importlib.metadata import PackageNotFoundError, version
-
-try:
-    __version__ = version("google-benchmark")
-except PackageNotFoundError:
-    # package is not installed
-    pass
diff --git a/third-party/benchmark/bindings/python/nanobind.BUILD b/third-party/benchmark/bindings/python/nanobind.BUILD
deleted file mode 100644
index 9874b80d1f5ac..0000000000000
--- a/third-party/benchmark/bindings/python/nanobind.BUILD
+++ /dev/null
@@ -1,59 +0,0 @@
-load("@bazel_skylib//lib:selects.bzl", "selects")
-
-licenses(["notice"])
-
-package(default_visibility = ["//visibility:public"])
-
-config_setting(
-    name = "msvc_compiler",
-    flag_values = {"@bazel_tools//tools/cpp:compiler": "msvc-cl"},
-)
-
-selects.config_setting_group(
-    name = "winplusmsvc",
-    match_all = [
-        "@platforms//os:windows",
-        ":msvc_compiler",
-    ],
-)
-
-cc_library(
-    name = "nanobind",
-    srcs = glob([
-        "src/*.cpp",
-    ]),
-    additional_linker_inputs = select({
-        "@platforms//os:macos": [":cmake/darwin-ld-cpython.sym"],
-        "//conditions:default": [],
-    }),
-    copts = select({
-        ":msvc_compiler": [
-            "/EHsc",  # exceptions
-            "/Os",  # size optimizations
-            "/GL",  # LTO / whole program optimization
-        ],
-        # these should work on both clang and gcc.
-        "//conditions:default": [
-            "-fexceptions",
-            "-flto",
-            "-Os",
-        ],
-    }),
-    includes = [
-        "ext/robin_map/include",
-        "include",
-    ],
-    linkopts = select({
-        ":winplusmsvc": ["/LTGC"],  # Windows + MSVC.
-        "@platforms//os:macos": ["-Wl,@$(location :cmake/darwin-ld-cpython.sym)"],  # Apple.
-        "//conditions:default": [],
-    }),
-    textual_hdrs = glob(
-        [
-            "include/**/*.h",
-            "src/*.h",
-            "ext/robin_map/include/tsl/*.h",
-        ],
-    ),
-    deps = ["@python_headers"],
-)
diff --git a/third-party/benchmark/bindings/python/python_headers.BUILD b/third-party/benchmark/bindings/python/python_headers.BUILD
deleted file mode 100644
index 8f139f8621e01..0000000000000
--- a/third-party/benchmark/bindings/python/python_headers.BUILD
+++ /dev/null
@@ -1,10 +0,0 @@
-licenses(["notice"])
-
-package(default_visibility = ["//visibility:public"])
-
-cc_library(
-    name = "python_headers",
-    hdrs = glob(["**/*.h"]),
-    includes = ["."],
-    visibility = ["//visibility:public"],
-)
diff --git a/third-party/benchmark/cmake/CXXFeatureCheck.cmake b/third-party/benchmark/cmake/CXXFeatureCheck.cmake
index e51482659b0f1..a163a6e09466d 100644
--- a/third-party/benchmark/cmake/CXXFeatureCheck.cmake
+++ b/third-party/benchmark/cmake/CXXFeatureCheck.cmake
@@ -10,22 +10,35 @@
 #
 # include(CXXFeatureCheck)
 # cxx_feature_check(STD_REGEX)
-# Requires CMake 2.8.12+
+# Requires CMake 3.13+
 
 if(__cxx_feature_check)
   return()
 endif()
 set(__cxx_feature_check INCLUDED)
 
-option(CXXFEATURECHECK_DEBUG OFF)
+option(CXXFEATURECHECK_DEBUG OFF "Enable debug messages for CXX feature checks")
 
-function(cxx_feature_check FILE)
-  string(TOLOWER ${FILE} FILE)
-  string(TOUPPER ${FILE} VAR)
-  string(TOUPPER "HAVE_${VAR}" FEATURE)
-  if (DEFINED HAVE_${VAR})
-    set(HAVE_${VAR} 1 PARENT_SCOPE)
-    add_definitions(-DHAVE_${VAR})
+function(cxx_feature_check_print log)
+  if(CXXFEATURECHECK_DEBUG)
+    message(STATUS "${log}")
+  endif()
+endfunction()
+
+function(cxx_feature_check FEATURE)
+  string(TOLOWER ${FEATURE} FILE)
+  string(TOUPPER HAVE_${FEATURE} VAR)
+
+  # Check if the variable is already defined to a true or false for a quick return.
+  # This allows users to predefine the variable to skip the check.
+  # Or, if the variable is already defined by a previous check, we skip the costly check.
+  if (DEFINED ${VAR})
+    if (${VAR})
+      cxx_feature_check_print("Feature ${FEATURE} already enabled.")
+      add_compile_definitions(${VAR})
+    else()
+      cxx_feature_check_print("Feature ${FEATURE} already disabled.")
+    endif()
     return()
   endif()
 
@@ -35,48 +48,53 @@ function(cxx_feature_check FILE)
     list(APPEND FEATURE_CHECK_CMAKE_FLAGS ${ARGV1})
   endif()
 
-  if (NOT DEFINED COMPILE_${FEATURE})
-    if(CMAKE_CROSSCOMPILING)
-      message(STATUS "Cross-compiling to test ${FEATURE}")
-      try_compile(COMPILE_${FEATURE}
-              ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-              CXX_STANDARD 11
-              CXX_STANDARD_REQUIRED ON
-              CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
-              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
-              OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
-      if(COMPILE_${FEATURE})
-        message(WARNING
-              "If you see build failures due to cross compilation, try setting HAVE_${VAR} to 0")
-        set(RUN_${FEATURE} 0 CACHE INTERNAL "")
-      else()
-        set(RUN_${FEATURE} 1 CACHE INTERNAL "")
-      endif()
-    else()
-      message(STATUS "Compiling and running to test ${FEATURE}")
-      try_run(RUN_${FEATURE} COMPILE_${FEATURE}
-              ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-              CXX_STANDARD 11
-              CXX_STANDARD_REQUIRED ON
-              CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
-              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
-              COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
+  if(CMAKE_CROSSCOMPILING)
+    cxx_feature_check_print("Cross-compiling to test ${FEATURE}")
+    try_compile(
+      COMPILE_STATUS
+      ${CMAKE_BINARY_DIR} 
+      ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
+      CXX_STANDARD 17
+      CXX_STANDARD_REQUIRED ON
+      CMAKE_FLAGS "${FEATURE_CHECK_CMAKE_FLAGS}"
+      LINK_LIBRARIES "${BENCHMARK_CXX_LIBRARIES}"
+      OUTPUT_VARIABLE COMPILE_OUTPUT_VAR
+    )
+    if(COMPILE_STATUS)
+      set(RUN_STATUS 0)
+      message(WARNING
+              "If you see build failures due to cross compilation, try setting ${VAR} to 0")
     endif()
+  else()
+    cxx_feature_check_print("Compiling and running to test ${FEATURE}")
+    try_run(
+      RUN_STATUS 
+      COMPILE_STATUS
+      ${CMAKE_BINARY_DIR} 
+      ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
+      CXX_STANDARD 17
+      CXX_STANDARD_REQUIRED ON
+      CMAKE_FLAGS "${FEATURE_CHECK_CMAKE_FLAGS}"
+      LINK_LIBRARIES "${BENCHMARK_CXX_LIBRARIES}"
+      COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT
+      RUN_OUTPUT_VARIABLE RUN_OUTPUT
+    )
   endif()
 
-  if(RUN_${FEATURE} EQUAL 0)
+  if(COMPILE_STATUS AND RUN_STATUS EQUAL 0)
     message(STATUS "Performing Test ${FEATURE} -- success")
-    set(HAVE_${VAR} 1 PARENT_SCOPE)
-    add_definitions(-DHAVE_${VAR})
+    set(${VAR} TRUE CACHE BOOL "" FORCE)
+    add_compile_definitions(${VAR})
+    return()
+  endif()
+
+  set(${VAR} FALSE CACHE BOOL "" FORCE)
+  message(STATUS "Performing Test ${FEATURE} -- failed")
+
+  if(NOT COMPILE_STATUS)
+    cxx_feature_check_print("Compile Output: ${COMPILE_OUTPUT}")
   else()
-    if(NOT COMPILE_${FEATURE})
-      if(CXXFEATURECHECK_DEBUG)
-        message(STATUS "Performing Test ${FEATURE} -- failed to compile: ${COMPILE_OUTPUT_VAR}")
-      else()
-        message(STATUS "Performing Test ${FEATURE} -- failed to compile")
-      endif()
-    else()
-      message(STATUS "Performing Test ${FEATURE} -- compiled but failed to run")
-    endif()
+    cxx_feature_check_print("Run Output: ${RUN_OUTPUT}")
   endif()
+
 endfunction()
diff --git a/third-party/benchmark/cmake/Config.cmake.in b/third-party/benchmark/cmake/Config.cmake.in
index 2e15f0cf82dca..c65cdb54e3a99 100644
--- a/third-party/benchmark/cmake/Config.cmake.in
+++ b/third-party/benchmark/cmake/Config.cmake.in
@@ -4,4 +4,9 @@ include (CMakeFindDependencyMacro)
 
 find_dependency (Threads)
 
+if (@BENCHMARK_ENABLE_LIBPFM@)
+    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
+    find_dependency (PFM)
+endif()
+
 include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
diff --git a/third-party/benchmark/cmake/GoogleTest.cmake.in b/third-party/benchmark/cmake/GoogleTest.cmake.in
index ce653ac375aca..6473892489710 100644
--- a/third-party/benchmark/cmake/GoogleTest.cmake.in
+++ b/third-party/benchmark/cmake/GoogleTest.cmake.in
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.12)
+cmake_minimum_required (VERSION 3.13...3.22)
 
 project(googletest-download NONE)
 
@@ -34,11 +34,12 @@ else()
     message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_USE_BUNDLED_GTEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
     return()
   else()
-    message(WARNING "Did not find Google Test sources! Fetching from web...")
+    message(STATUS "Did not find Google Test sources! Fetching from web...")
     ExternalProject_Add(
       googletest
       GIT_REPOSITORY    https://github.com/google/googletest.git
-      GIT_TAG           "release-1.11.0"
+      GIT_TAG           "v1.15.2"
+      GIT_SHALLOW       "ON"
       PREFIX            "${CMAKE_BINARY_DIR}"
       STAMP_DIR         "${CMAKE_BINARY_DIR}/stamp"
       DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
diff --git a/third-party/benchmark/cmake/benchmark.pc.in b/third-party/benchmark/cmake/benchmark.pc.in
index 9dae881c79f94..bbed29d1eb0b1 100644
--- a/third-party/benchmark/cmake/benchmark.pc.in
+++ b/third-party/benchmark/cmake/benchmark.pc.in
@@ -5,8 +5,8 @@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
 
 Name: @PROJECT_NAME@
 Description: Google microbenchmark framework
-Version: @VERSION@
+Version: @NORMALIZED_VERSION@
 
 Libs: -L${libdir} -lbenchmark
-Libs.private: -lpthread
+Libs.private: -lpthread @BENCHMARK_PRIVATE_LINK_LIBRARIES@
 Cflags: -I${includedir}
diff --git a/third-party/benchmark/cmake/benchmark_main.pc.in b/third-party/benchmark/cmake/benchmark_main.pc.in
new file mode 100644
index 0000000000000..e9d81a05eec2c
--- /dev/null
+++ b/third-party/benchmark/cmake/benchmark_main.pc.in
@@ -0,0 +1,7 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+
+Name: @PROJECT_NAME@
+Description: Google microbenchmark framework (with main() function)
+Version: @NORMALIZED_VERSION@
+Requires: benchmark
+Libs: -L${libdir} -lbenchmark_main
diff --git a/third-party/benchmark/docs/dependencies.md b/third-party/benchmark/docs/dependencies.md
index 07760e10e3709..98ce9963910bb 100644
--- a/third-party/benchmark/docs/dependencies.md
+++ b/third-party/benchmark/docs/dependencies.md
@@ -11,3 +11,9 @@ distributions include newer versions, for example:
 * Ubuntu 20.04 provides CMake 3.16.3
 * Debian 11.4 provides CMake 3.18.4
 * Ubuntu 22.04 provides CMake 3.22.1
+
+## Python
+
+The Python bindings require Python 3.10+ as of v1.9.0 (2024-08-16) for installation from PyPI.
+Building from source for older versions probably still works, though. See the [user guide](python_bindings.md) for details on how to build from source.
+The minimum theoretically supported version is Python 3.8, since the used bindings generator (nanobind) only supports Python 3.8+.
diff --git a/third-party/benchmark/docs/platform_specific_build_instructions.md b/third-party/benchmark/docs/platform_specific_build_instructions.md
index 2d5d6c47eead8..5c1439d0a9e01 100644
--- a/third-party/benchmark/docs/platform_specific_build_instructions.md
+++ b/third-party/benchmark/docs/platform_specific_build_instructions.md
@@ -15,22 +15,26 @@ On QNX, the pthread library is part of libc and usually included automatically
 [`pthread_create()`](https://www.qnx.com/developers/docs/7.1/index.html#com.qnx.doc.neutrino.lib_ref/topic/p/pthread_create.html)).
 There's no separate pthread library to link.
 
-## Building with Visual Studio 2015 or 2017
+## Building with Visual Studio 2015, 2017 or 2022
 
 The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following:
 
 ```
 // Alternatively, can add libraries using linker options.
+
+// First, add the path to the generated library files (directory containing the `benchmark.lib`) in `[Configuration Properties > Linker > General > Additional Library Directories]`. Then do the following:
 #ifdef _WIN32
 #pragma comment ( lib, "Shlwapi.lib" )
 #ifdef _DEBUG
-#pragma comment ( lib, "benchmarkd.lib" )
+#pragma comment ( lib, "benchmark.lib" )
 #else
 #pragma comment ( lib, "benchmark.lib" )
 #endif
 #endif
 ```
 
+When using the static library, make sure to add `BENCHMARK_STATIC_DEFINE` under `[Configuration Properties > C/C++ > Preprocessor > Preprocessor Definitions]`
+
 Can also use the graphical version of CMake:
 * Open `CMake GUI`.
 * Under `Where to build the binaries`, same path as source plus `build`.
diff --git a/third-party/benchmark/docs/reducing_variance.md b/third-party/benchmark/docs/reducing_variance.md
index 105f96e769144..364f4af15bc34 100644
--- a/third-party/benchmark/docs/reducing_variance.md
+++ b/third-party/benchmark/docs/reducing_variance.md
@@ -39,6 +39,41 @@ The benchmarks you subsequently run will have less variance.
 
 <a name="reducing-variance" />
 
+## Disabling ASLR
+
+If you see this warning:
+
+```
+***WARNING*** ASLR is enabled, the results may have unreproducible noise in them.
+```
+
+you might want to disable the ASLR security hardening feature while running the
+benchmark.
+
+The simplest way is to add
+```
+benchmark::MaybeReenterWithoutASLR(argc, argv);
+```
+as the first line of your `main()` function. It will try to disable ASLR
+for the current process, and, if successful, re-execute the binary.
+Note that `personality(2)` may be forbidden by e.g. seccomp (which happens
+by default if you are running in a Docker container).
+
+Note that if you link to `benchmark_main`, it already does that for you.
+
+To globally disable ASLR on Linux, run
+```
+echo 0 > /proc/sys/kernel/randomize_va_space
+```
+
+To run a single benchmark with ASLR disabled on Linux, do:
+```
+setarch `uname -m` -R ./a_benchmark
+```
+
+For information on how to disable ASLR on other operating systems,
+please refer to their documentation.
+
 ## Reducing Variance in Benchmarks
 
 The Linux CPU frequency governor [discussed
@@ -70,23 +105,28 @@ Linux workstation are:
 
 1. Use the performance governor as [discussed
 above](user_guide#disabling-cpu-frequency-scaling).
-1. Disable processor boosting by:
+2. Disable processor boosting by:
    ```sh
    echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost
    ```
    See the Linux kernel's
    [boost.txt](https://www.kernel.org/doc/Documentation/cpu-freq/boost.txt)
    for more information.
-2. Set the benchmark program's task affinity to a fixed cpu.  For example:
+3. Set the benchmark program's task affinity to a fixed cpu.  For example:
    ```sh
    taskset -c 0 ./mybenchmark
    ```
-3. Disabling Hyperthreading/SMT.  This can be done in the Bios or using the
+4. Increase the program's scheduling priority to minimize context switches using `nice` or `chrt`:
+   ```sh
+   sudo nice -n -20 ./mybenchmark
+   sudo chrt -f 80 ./mybenchmark
+   ```
+5. Disable Hyperthreading/SMT.  This can be done in the BIOS or using the
    `/sys` file system (see the LLVM project's [Benchmarking
    tips](https://llvm.org/docs/Benchmarking.html)).
-4. Close other programs that do non-trivial things based on timers, such as
+6. Close other programs that do non-trivial things based on timers, such as
    your web browser, desktop environment, etc.
-5. Reduce the working set of your benchmark to fit within the L1 cache, but
+7. Reduce the working set of your benchmark to fit within the L1 cache, but
    do be aware that this may lead you to optimize for an unrealistic
    situation.
 
diff --git a/third-party/benchmark/docs/releasing.md b/third-party/benchmark/docs/releasing.md
index 09bf93764d009..ab664a8640a4b 100644
--- a/third-party/benchmark/docs/releasing.md
+++ b/third-party/benchmark/docs/releasing.md
@@ -8,16 +8,24 @@
     * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
       commits between the last annotated tag and HEAD
     * Pick the most interesting.
-* Create one last commit that updates the version saved in `CMakeLists.txt` and `MODULE.bazel`
-  to the release version you're creating. (This version will be used if benchmark is installed
-  from the archive you'll be creating in the next step.)
+* Create one last commit that updates the version saved in `CMakeLists.txt`, `MODULE.bazel`,
+  and `bindings/python/google_benchmark/__init__.py` to the release version you're creating.
+  (This version will be used if benchmark is installed from the archive you'll be creating
+  in the next step.)
 
 ```
-project (benchmark VERSION 1.8.0 LANGUAGES CXX)
+# CMakeLists.txt
+project (benchmark VERSION 1.9.0 LANGUAGES CXX)
 ```
 
 ```
-module(name = "com_github_google_benchmark", version="1.8.0")
+# MODULE.bazel
+module(name = "com_github_google_benchmark", version="1.9.0")
+```
+
+```
+# google_benchmark/__init__.py
+__version__ = "1.9.0"
 ```
 
 * Create a release through github's interface
@@ -28,4 +36,3 @@ module(name = "com_github_google_benchmark", version="1.8.0")
       * `git push --force --tags origin`
 * Confirm that the "Build and upload Python wheels" action runs to completion
     * Run it manually if it hasn't run.
-    * IMPORTANT: When re-running manually, make sure to select the newly created `<tag>` as the workflow version in the "Run workflow" tab on the GitHub Actions page. 
diff --git a/third-party/benchmark/docs/user_guide.md b/third-party/benchmark/docs/user_guide.md
index d22a9069091f6..997737f63af91 100644
--- a/third-party/benchmark/docs/user_guide.md
+++ b/third-party/benchmark/docs/user_guide.md
@@ -82,9 +82,9 @@ tabular data on stdout. Example tabular output looks like:
 ```
 Benchmark                               Time(ns)    CPU(ns) Iterations
 ----------------------------------------------------------------------
-BM_SetInsert/1024/1                        28928      29349      23853  133.097kB/s   33.2742k items/s
-BM_SetInsert/1024/8                        32065      32913      21375  949.487kB/s   237.372k items/s
-BM_SetInsert/1024/10                       33157      33648      21431  1.13369MB/s   290.225k items/s
+BM_SetInsert/1024/1                        28928      29349      23853  133.097kiB/s   33.2742k items/s
+BM_SetInsert/1024/8                        32065      32913      21375  949.487kiB/s   237.372k items/s
+BM_SetInsert/1024/10                       33157      33648      21431  1.13369MiB/s   290.225k items/s
 ```
 
 The JSON format outputs human readable json split into two top level attributes.
@@ -167,6 +167,13 @@ line interface or by setting environment variables before execution. For every
  prevails). A complete list of CLI options is available running benchmarks
  with the `--help` switch.
 
+### Dry runs
+
+To confirm that benchmarks can run successfully without needing to wait for
+multiple repetitions and iterations, the `--benchmark_dry_run` flag can be
+used.  This will run the benchmarks as normal, but for 1 iteration and 1
+repetition only.
+
 <a name="running-a-subset-of-benchmarks" />
 
 ## Running a Subset of Benchmarks
@@ -445,7 +452,7 @@ benchmark. The following example enumerates a dense range on one parameter,
 and a sparse range on the second.
 
 ```c++
-static void CustomArguments(benchmark::internal::Benchmark* b) {
+static void CustomArguments(benchmark::Benchmark* b) {
   for (int i = 0; i <= 10; ++i)
     for (int j = 32; j <= 1024*1024; j *= 8)
       b->Args({i, j});
@@ -455,7 +462,7 @@ BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
 
 ### Passing Arbitrary Arguments to a Benchmark
 
-In C++11 it is possible to define a benchmark that takes an arbitrary number
+It is possible to define a benchmark that takes an arbitrary number
 of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)`
 macro creates a benchmark that invokes `func`  with the `benchmark::State` as
 the first argument followed by the specified `args...`.
@@ -556,22 +563,19 @@ template <class Q> void BM_Sequential(benchmark::State& state) {
   state.SetBytesProcessed(
       static_cast<int64_t>(state.iterations())*state.range(0));
 }
-// C++03
-BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
 
-// C++11 or newer, you can use the BENCHMARK macro with template parameters:
+// You can use the BENCHMARK macro with template parameters:
 BENCHMARK(BM_Sequential<WaitQueue<int>>)->Range(1<<0, 1<<10);
 
+// Old, legacy verbose C++03 syntax:
+BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
+
 ```
 
 Three macros are provided for adding benchmark templates.
 
 ```c++
-#ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK(func<...>) // Takes any number of parameters.
-#else // C++ < C++11
-#define BENCHMARK_TEMPLATE(func, arg1)
-#endif
 #define BENCHMARK_TEMPLATE1(func, arg1)
 #define BENCHMARK_TEMPLATE2(func, arg1, arg2)
 ```
@@ -624,20 +628,22 @@ public:
   }
 };
 
+// Defines and registers `FooTest` using the class `MyFixture`.
 BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
    for (auto _ : st) {
      ...
   }
 }
 
+// Only defines `BarTest` using the class `MyFixture`.
 BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) {
    for (auto _ : st) {
      ...
   }
 }
-/* BarTest is NOT registered */
+// `BarTest` is NOT registered.
 BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2);
-/* BarTest is now registered */
+// `BarTest` is now registered.
 ```
 
 ### Templated Fixtures
@@ -653,19 +659,70 @@ For example:
 template<typename T>
 class MyFixture : public benchmark::Fixture {};
 
+// Defines and registers `IntTest` using the class template `MyFixture<int>`.
 BENCHMARK_TEMPLATE_F(MyFixture, IntTest, int)(benchmark::State& st) {
    for (auto _ : st) {
      ...
   }
 }
 
+// Only defines `DoubleTest` using the class template `MyFixture<double>`.
 BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, DoubleTest, double)(benchmark::State& st) {
    for (auto _ : st) {
      ...
   }
 }
-
+// `DoubleTest` is NOT registered.
 BENCHMARK_REGISTER_F(MyFixture, DoubleTest)->Threads(2);
+// `DoubleTest` is now registered.
+```
+
+If you want to use a method template for your fixtures,
+which you instantiate afterward, use the following macros:
+
+* `BENCHMARK_TEMPLATE_METHOD_F(ClassName, Method)`
+* `BENCHMARK_TEMPLATE_INSTANTIATE_F(ClassName, Method, ...)`
+
+With these macros you can define one method for several instantiations.
+Example (using `MyFixture` from above):
+
+```c++
+// Defines `Test` using the class template `MyFixture`.
+BENCHMARK_TEMPLATE_METHOD_F(MyFixture, Test)(benchmark::State& st) {
+   for (auto _ : st) {
+     ...
+  }
+}
+
+// Instantiates and registers the benchmark `MyFixture<int>::Test`.
+BENCHMARK_TEMPLATE_INSTANTIATE_F(MyFixture, Test, int)->Threads(2);
+// Instantiates and registers the benchmark `MyFixture<double>::Test`.
+BENCHMARK_TEMPLATE_INSTANTIATE_F(MyFixture, Test, double)->Threads(4);
+```
+
+Inside the method definition of `BENCHMARK_TEMPLATE_METHOD_F` the type `Base` refers
+to the type of the instantiated fixture.
+Accesses to members of the fixture must be prefixed by `this->`.
+
+`BENCHMARK_TEMPLATE_METHOD_F` and `BENCHMARK_TEMPLATE_INSTANTIATE_F` can only be used
+if the fixture does not use non-type template parameters.
+If you want to pass values as template parameters, use e.g. `std::integral_constant`.
+For example:
+
+```c++
+template<typename Sz>
+class SizedFixture : public benchmark::Fixture {
+  static constexpr auto Size = Sz::value;
+  int myValue;
+};
+
+BENCHMARK_TEMPLATE_METHOD_F(SizedFixture, Test)(benchmark::State& st) {
+   for (auto _ : st) {
+     this->myValue = Base::Size;
+  }
+}
+
+BENCHMARK_TEMPLATE_INSTANTIATE_F(SizedFixture, Test, std::integral_constant<int, 5>)->Threads(2);
 ```
 
 <a name="custom-counters" />
@@ -692,10 +749,6 @@ and `Counter` values. The latter is a `double`-like class, via an implicit
 conversion to `double&`. Thus you can use all of the standard arithmetic
 assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter.
 
-In multithreaded benchmarks, each counter is set on the calling thread only.
-When the benchmark finishes, the counters from each thread will be summed;
-the resulting sum is the value which will be shown for the benchmark.
-
 The `Counter` constructor accepts three parameters: the value as a `double`
 ; a bit flag which allows you to show counters as rates, and/or as per-thread
 iteration, and/or as per-thread averages, and/or iteration invariants,
@@ -728,12 +781,10 @@ is 1k a 1000 (default, `benchmark::Counter::OneK::kIs1000`), or 1024
   state.counters["BytesProcessed"] = Counter(state.range(0), benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024);
 ```
 
-When you're compiling in C++11 mode or later you can use `insert()` with
-`std::initializer_list`:
+You can use `insert()` with `std::initializer_list`:
 
 <!-- {% raw %} -->
 ```c++
-  // With C++11, this can be done:
   state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
   // ... instead of:
   state.counters["Foo"] = numFoos;
@@ -742,6 +793,10 @@ When you're compiling in C++11 mode or later you can use `insert()` with
 ```
 <!-- {% endraw %} -->
 
+In multithreaded benchmarks, each counter is set on the calling thread only.
+When the benchmark finishes, the counters from each thread will be summed.
+Counters that are configured with `kIsRate` will report the average rate across all threads, while `kAvgThreadsRate` counters will report the average rate per thread.
+
 ### Counter Reporting
 
 When using the console reporter, by default, user counters are printed at
@@ -856,6 +911,46 @@ BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();
 
 Without `UseRealTime`, CPU time is used by default.
 
+### Manual Multithreaded Benchmarks
+
+By default, Google Benchmark uses `std::thread` as its multithreading environment.
+If you want to use another multithreading environment (e.g. OpenMP), you can provide
+a factory function to your benchmark using the `ThreadRunner` function.
+The factory function takes the number of threads as argument and creates a custom class
+derived from `benchmark::ThreadRunnerBase`.
+This custom class must override the function
+`void RunThreads(const std::function<void(int)>& fn)`.
+`RunThreads` is called by the main thread and spawns the requested number of threads.
+Each spawned thread must call `fn(thread_index)`, where `thread_index` is its own
+thread index. Before `RunThreads` returns, all spawned threads must be joined.
+```c++
+class OpenMPThreadRunner : public benchmark::ThreadRunnerBase
+{
+  OpenMPThreadRunner(int num_threads)
+  : num_threads_(num_threads)
+  {}
+
+  void RunThreads(const std::function<void(int)>& fn) final
+  {
+#pragma omp parallel num_threads(num_threads_)
+    fn(omp_get_thread_num());
+  }
+
+private:
+  int num_threads_;
+};
+
+BENCHMARK(BM_MultiThreaded)
+  ->ThreadRunner([](int num_threads) {
+    return std::make_unique<OpenMPThreadRunner>(num_threads);
+  })
+  ->Threads(1)->Threads(2)->Threads(4);
+```
+The above example creates a parallel OpenMP region before it enters `BM_MultiThreaded`.
+The actual benchmark code can remain the same and is therefore not tied to a specific
+thread runner. The measurement does not include the time for creating and joining the
+threads.
+
 <a name="cpu-timers" />
 
 ## CPU Timers
@@ -1012,11 +1107,11 @@ in any way. `<expr>` may even be removed entirely when the result is already
 known. For example:
 
 ```c++
-  /* Example 1: `<expr>` is removed entirely. */
+  // Example 1: `<expr>` is removed entirely.
   int foo(int x) { return x + 42; }
   while (...) DoNotOptimize(foo(0)); // Optimized to DoNotOptimize(42);
 
-  /*  Example 2: Result of '<expr>' is only reused */
+  // Example 2: Result of '<expr>' is only reused.
   int bar(int) __attribute__((const));
   while (...) DoNotOptimize(bar(0)); // Optimized to:
   // int __result__ = bar(0);
@@ -1094,6 +1189,7 @@ void BM_spin_empty(benchmark::State& state) {
 }
 
 BENCHMARK(BM_spin_empty)
+  ->Repetitions(3) // or add option --benchmark_repetitions=3
   ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
     return *(std::max_element(std::begin(v), std::end(v)));
   })
@@ -1113,8 +1209,9 @@ void BM_spin_empty(benchmark::State& state) {
 }
 
 BENCHMARK(BM_spin_empty)
+  ->Repetitions(3) // or add option --benchmark_repetitions=3
   ->ComputeStatistics("ratio", [](const std::vector<double>& v) -> double {
-    return std::begin(v) / std::end(v);
+    return v.front() / v.back();
   }, benchmark::StatisticUnit::kPercentage)
   ->Arg(512);
 ```
@@ -1134,6 +1231,21 @@ a report on the number of allocations, bytes used, etc.
 This data will then be reported alongside other performance data, currently
 only when using JSON output.
 
+<a name="profiling" />
+
+## Profiling
+
+It's often useful to also profile benchmarks in particular ways, in addition to
+CPU performance. For this reason, benchmark offers the `RegisterProfilerManager`
+method that allows a custom `ProfilerManager` to be injected.
+
+If set, the `ProfilerManager::AfterSetupStart` and
+`ProfilerManager::BeforeTeardownStop` methods will be called at the start and
+end of a separate benchmark run to allow user code to collect and report
+user-provided profile metrics.
+
+Output collected from this profiling run must be reported separately.
+
 <a name="using-register-benchmark" />
 
 ## Using RegisterBenchmark(name, fn, args...)
@@ -1156,6 +1268,7 @@ For Example:
 auto BM_test = [](benchmark::State& st, auto Inputs) { /* ... */ };
 
 int main(int argc, char** argv) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
   for (auto& test_input : { /* ... */ })
       benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input);
   benchmark::Initialize(&argc, argv);
@@ -1220,7 +1333,7 @@ static void BM_test_ranged_fo(benchmark::State & state) {
 
 ## A Faster KeepRunning Loop
 
-In C++11 mode, a ranged-based for loop should be used in preference to
+A range-based for loop should be used in preference to
 the `KeepRunning` loop for running the benchmarks. For example:
 
 ```c++
diff --git a/third-party/benchmark/include/benchmark/benchmark.h b/third-party/benchmark/include/benchmark/benchmark.h
index 71399d030a846..f7d1341c8f950 100644
--- a/third-party/benchmark/include/benchmark/benchmark.h
+++ b/third-party/benchmark/include/benchmark/benchmark.h
@@ -40,6 +40,7 @@ BENCHMARK(BM_StringCopy);
 //       my_unittest --benchmark_filter=String
 //       my_unittest --benchmark_filter='Copy|Creation'
 int main(int argc, char** argv) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
   benchmark::Shutdown();
@@ -102,7 +103,7 @@ BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}});
 // arbitrary set of arguments to run the microbenchmark on.
 // The following example enumerates a dense range on
 // one parameter, and a sparse range on the second.
-static void CustomArguments(benchmark::internal::Benchmark* b) {
+static void CustomArguments(benchmark::Benchmark* b) {
   for (int i = 0; i <= 10; ++i)
     for (int j = 32; j <= 1024*1024; j *= 8)
       b->Args({i, j});
@@ -163,52 +164,32 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 #ifndef BENCHMARK_BENCHMARK_H_
 #define BENCHMARK_BENCHMARK_H_
 
-// The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer.
-#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
-#define BENCHMARK_HAS_CXX11
-#endif
-
-// This _MSC_VER check should detect VS 2017 v15.3 and newer.
-#if __cplusplus >= 201703L || \
-    (defined(_MSC_VER) && _MSC_VER >= 1911 && _MSVC_LANG >= 201703L)
-#define BENCHMARK_HAS_CXX17
-#endif
-
 #include <stdint.h>
 
 #include <algorithm>
+#include <atomic>
 #include <cassert>
 #include <cstddef>
+#include <functional>
 #include <iosfwd>
 #include <limits>
 #include <map>
+#include <memory>
 #include <set>
 #include <string>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
 #include "benchmark/export.h"
 
-#if defined(BENCHMARK_HAS_CXX11)
-#include <atomic>
-#include <initializer_list>
-#include <type_traits>
-#include <utility>
-#endif
-
 #if defined(_MSC_VER)
 #include <intrin.h>  // for _ReadWriteBarrier
 #endif
 
-#ifndef BENCHMARK_HAS_CXX11
-#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
-  TypeName(const TypeName&);                         \
-  TypeName& operator=(const TypeName&)
-#else
 #define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
   TypeName(const TypeName&) = delete;                \
   TypeName& operator=(const TypeName&) = delete
-#endif
 
 #ifdef BENCHMARK_HAS_CXX17
 #define BENCHMARK_UNUSED [[maybe_unused]]
@@ -257,6 +238,16 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
   _Pragma("diagnostic push") \
   _Pragma("diag_suppress deprecated_entity_with_custom_message")
 #define BENCHMARK_RESTORE_DEPRECATED_WARNING _Pragma("diagnostic pop")
+#elif defined(_MSC_VER)
+#define BENCHMARK_BUILTIN_EXPECT(x, y) x
+#define BENCHMARK_DEPRECATED_MSG(msg) __declspec(deprecated(msg))
+#define BENCHMARK_WARNING_MSG(msg)                           \
+  __pragma(message(__FILE__ "(" BENCHMARK_INTERNAL_TOSTRING( \
+      __LINE__) ") : warning note: " msg))
+#define BENCHMARK_DISABLE_DEPRECATED_WARNING \
+  __pragma(warning(push)) \
+  __pragma(warning(disable : 4996))
+#define BENCHMARK_RESTORE_DEPRECATED_WARNING __pragma(warning(pop))
 #else
 #define BENCHMARK_BUILTIN_EXPECT(x, y) x
 #define BENCHMARK_DEPRECATED_MSG(msg)
@@ -284,31 +275,84 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 #define BENCHMARK_UNREACHABLE() ((void)0)
 #endif
 
-#ifdef BENCHMARK_HAS_CXX11
-#define BENCHMARK_OVERRIDE override
+#if defined(__GNUC__)
+// Determine the cacheline size based on architecture
+#if defined(__i386__) || defined(__x86_64__)
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64
+#elif defined(__powerpc64__)
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 128
+#elif defined(__aarch64__)
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64
+#elif defined(__arm__)
+// Cache line sizes for ARM: These values are not strictly correct since
+// cache line sizes depend on implementations, not architectures.  There
+// are even implementations with cache line sizes configurable at boot
+// time.
+#if defined(__ARM_ARCH_5T__)
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 32
+#elif defined(__ARM_ARCH_7A__)
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64
+#endif  // ARM_ARCH
+#endif  // arches
+#endif  // __GNUC__
+
+#ifndef BENCHMARK_INTERNAL_CACHELINE_SIZE
+// A reasonable default guess.  Note that overestimates tend to waste more
+// space, while underestimates tend to waste more time.
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64
+#endif
+
+#if defined(__GNUC__)
+// Indicates that the declared object be cache aligned using
+// `BENCHMARK_INTERNAL_CACHELINE_SIZE` (see above).
+#define BENCHMARK_INTERNAL_CACHELINE_ALIGNED \
+  __attribute__((aligned(BENCHMARK_INTERNAL_CACHELINE_SIZE)))
+#elif defined(_MSC_VER)
+#define BENCHMARK_INTERNAL_CACHELINE_ALIGNED \
+  __declspec(align(BENCHMARK_INTERNAL_CACHELINE_SIZE))
 #else
-#define BENCHMARK_OVERRIDE
+#define BENCHMARK_INTERNAL_CACHELINE_ALIGNED
 #endif
 
 #if defined(_MSC_VER)
 #pragma warning(push)
 // C4251: <symbol> needs to have dll-interface to be used by clients of class
 #pragma warning(disable : 4251)
-#endif
+#endif  // _MSC_VER
 
 namespace benchmark {
+
+namespace internal {
+#if (__cplusplus < 201402L || (defined(_MSC_VER) && _MSVC_LANG < 201402L))
+template <typename T, typename... Args>
+std::unique_ptr<T> make_unique(Args&&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+#else
+using ::std::make_unique;
+#endif
+}  // namespace internal
+
 class BenchmarkReporter;
+class State;
+
+using IterationCount = int64_t;
+
+// Define alias of Setup/Teardown callback function type
+using callback_function = std::function<void(const benchmark::State&)>;
 
 // Default number of minimum benchmark running time in seconds.
 const char kDefaultMinTimeStr[] = "0.5s";
 
+BENCHMARK_EXPORT void MaybeReenterWithoutASLR(int, char**);
+
 // Returns the version of the library.
 BENCHMARK_EXPORT std::string GetBenchmarkVersion();
 
 BENCHMARK_EXPORT void PrintDefaultHelp();
 
 BENCHMARK_EXPORT void Initialize(int* argc, char** argv,
-                                 void (*HelperPrinterf)() = PrintDefaultHelp);
+                                 void (*HelperPrintf)() = PrintDefaultHelp);
 BENCHMARK_EXPORT void Shutdown();
 
 // Report to stdout all arguments in 'argv' as unrecognized except the first.
@@ -377,14 +421,15 @@ BENCHMARK_EXPORT void SetDefaultTimeUnit(TimeUnit unit);
 // benchmark.
 class MemoryManager {
  public:
-  static const int64_t TombstoneValue;
+  static constexpr int64_t TombstoneValue = std::numeric_limits<int64_t>::max();
 
   struct Result {
     Result()
         : num_allocs(0),
           max_bytes_used(0),
           total_allocated_bytes(TombstoneValue),
-          net_heap_growth(TombstoneValue) {}
+          net_heap_growth(TombstoneValue),
+          memory_iterations(0) {}
 
     // The number of allocations made in total between Start and Stop.
     int64_t num_allocs;
@@ -400,6 +445,8 @@ class MemoryManager {
     // ie., total_allocated_bytes - total_deallocated_bytes.
     // Init'ed to TombstoneValue if metric not available.
     int64_t net_heap_growth;
+
+    IterationCount memory_iterations;
   };
 
   virtual ~MemoryManager() {}
@@ -416,12 +463,33 @@ class MemoryManager {
 BENCHMARK_EXPORT
 void RegisterMemoryManager(MemoryManager* memory_manager);
 
+// If a ProfilerManager is registered (via RegisterProfilerManager()), the
+// benchmark will be run an additional time under the profiler to collect and
+// report profile metrics for the run of the benchmark.
+class ProfilerManager {
+ public:
+  virtual ~ProfilerManager() {}
+
+  // This is called after `Setup()` code and right before the benchmark is run.
+  virtual void AfterSetupStart() = 0;
+
+  // This is called before `Teardown()` code and right after the benchmark
+  // completes.
+  virtual void BeforeTeardownStop() = 0;
+};
+
+// Register a ProfilerManager instance that will be used to collect and report
+// profile measurements for benchmark runs.
+BENCHMARK_EXPORT
+void RegisterProfilerManager(ProfilerManager* profiler_manager);
+
 // Add a key-value pair to output as part of the context stanza in the report.
 BENCHMARK_EXPORT
-void AddCustomContext(const std::string& key, const std::string& value);
+void AddCustomContext(std::string key, std::string value);
 
-namespace internal {
 class Benchmark;
+
+namespace internal {
 class BenchmarkImp;
 class BenchmarkFamilies;
 
@@ -432,7 +500,8 @@ void UseCharPointer(char const volatile*);
 
 // Take ownership of the pointer and register the benchmark. Return the
 // registered benchmark.
-BENCHMARK_EXPORT Benchmark* RegisterBenchmarkInternal(Benchmark*);
+BENCHMARK_EXPORT Benchmark* RegisterBenchmarkInternal(
+    std::unique_ptr<Benchmark>);
 
 // Ensure that the standard streams are properly initialized in every TU.
 BENCHMARK_EXPORT int InitializeStreams();
@@ -447,11 +516,9 @@ BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
 
 // Force the compiler to flush pending writes to global memory. Acts as an
 // effective read/write barrier
-#ifdef BENCHMARK_HAS_CXX11
 inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
   std::atomic_signal_fence(std::memory_order_acq_rel);
 }
-#endif
 
 // The DoNotOptimize(...) function can be used to prevent a value or
 // expression from being optimized away by the compiler. This function is
@@ -476,7 +543,6 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
 #endif
 }
 
-#ifdef BENCHMARK_HAS_CXX11
 template <class Tp>
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) {
 #if defined(__clang__)
@@ -485,8 +551,8 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) {
   asm volatile("" : "+m,r"(value) : : "memory");
 #endif
 }
-#endif
-#elif defined(BENCHMARK_HAS_CXX11) && (__GNUC__ >= 5)
+// !defined(__GNUC__) || defined(__llvm__) || defined(__INTEL_COMPILER)
+#elif (__GNUC__ >= 5)
 // Workaround for a bug with full argument copy overhead with GCC.
 // See: #1340 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105519
 template <class Tp>
@@ -542,70 +608,35 @@ inline BENCHMARK_ALWAYS_INLINE
     DoNotOptimize(Tp&& value) {
   asm volatile("" : "+m"(value) : : "memory");
 }
+// !defined(__GNUC__) || defined(__llvm__) || defined(__INTEL_COMPILER)
+#endif
 
-#else
-// Fallback for GCC < 5. Can add some overhead because the compiler is forced
-// to use memory operations instead of operations with registers.
-// TODO: Remove if GCC < 5 will be unsupported.
+#elif defined(_MSC_VER)
 template <class Tp>
 BENCHMARK_DEPRECATED_MSG(
     "The const-ref version of this method can permit "
     "undesired compiler optimizations in benchmarks")
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-  asm volatile("" : : "m"(value) : "memory");
+  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
+  _ReadWriteBarrier();
 }
 
 template <class Tp>
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
-  asm volatile("" : "+m"(value) : : "memory");
-}
-
-#ifdef BENCHMARK_HAS_CXX11
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) {
-  asm volatile("" : "+m"(value) : : "memory");
-}
-#endif
-#endif
-
-#ifndef BENCHMARK_HAS_CXX11
-inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
-  asm volatile("" : : : "memory");
-}
-#endif
-#elif defined(_MSC_VER)
-template <class Tp>
-BENCHMARK_DEPRECATED_MSG(
-    "The const-ref version of this method can permit "
-    "undesired compiler optimizations in benchmarks")
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
   internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
   _ReadWriteBarrier();
 }
 
-#ifndef BENCHMARK_HAS_CXX11
-inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { _ReadWriteBarrier(); }
-#endif
-#else
-#ifdef BENCHMARK_HAS_CXX11
 template <class Tp>
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) {
   internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
+  _ReadWriteBarrier();
 }
 #else
 template <class Tp>
-BENCHMARK_DEPRECATED_MSG(
-    "The const-ref version of this method can permit "
-    "undesired compiler optimizations in benchmarks")
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
-}
-
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) {
   internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
 }
-#endif
 // FIXME Add ClobberMemory() for non-gnu and non-msvc compilers, before C++11.
 #endif
 
@@ -654,7 +685,7 @@ class Counter {
   Counter(double v = 0., Flags f = kDefaults, OneK k = kIs1000)
       : value(v), flags(f), oneK(k) {}
 
-  BENCHMARK_ALWAYS_INLINE operator double const &() const { return value; }
+  BENCHMARK_ALWAYS_INLINE operator double const&() const { return value; }
   BENCHMARK_ALWAYS_INLINE operator double&() { return value; }
 };
 
@@ -677,8 +708,6 @@ enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda };
 
 typedef int64_t ComplexityN;
 
-typedef int64_t IterationCount;
-
 enum StatisticUnit { kTime, kPercentage };
 
 // BigOFunc is passed to a benchmark in order to specify the asymptotic
@@ -705,12 +734,7 @@ class ThreadTimer;
 class ThreadManager;
 class PerfCountersMeasurement;
 
-enum AggregationReportMode
-#if defined(BENCHMARK_HAS_CXX11)
-    : unsigned
-#else
-#endif
-{
+enum AggregationReportMode : unsigned {
   // The mode has not been manually specified
   ARM_Unspecified = 0,
   // The mode is user-specified.
@@ -725,11 +749,7 @@ enum AggregationReportMode
       ARM_FileReportAggregatesOnly | ARM_DisplayReportAggregatesOnly
 };
 
-enum Skipped
-#if defined(BENCHMARK_HAS_CXX11)
-    : unsigned
-#endif
-{
+enum Skipped : unsigned {
   NotSkipped = 0,
   SkippedWithMessage,
   SkippedWithError
@@ -737,9 +757,14 @@ enum Skipped
 
 }  // namespace internal
 
+#if defined(_MSC_VER)
+#pragma warning(push)
+// C4324: 'benchmark::State': structure was padded due to alignment specifier
+#pragma warning(disable : 4324)
+#endif  // _MSC_VER
 // State is passed to a running Benchmark and contains state for the
 // benchmark to use.
-class BENCHMARK_EXPORT State {
+class BENCHMARK_EXPORT BENCHMARK_INTERNAL_CACHELINE_ALIGNED State {
  public:
   struct StateIterator;
   friend struct StateIterator;
@@ -952,6 +977,8 @@ class BENCHMARK_EXPORT State {
   BENCHMARK_ALWAYS_INLINE
   std::string name() const { return name_; }
 
+  size_t range_size() const { return range_.size(); }
+
  private:
   // items we expect on the first cache line (ie 64 bytes of the struct)
   // When total_iterations_ is 0, KeepRunning() and friends will return false.
@@ -984,7 +1011,8 @@ class BENCHMARK_EXPORT State {
   State(std::string name, IterationCount max_iters,
         const std::vector<int64_t>& ranges, int thread_i, int n_threads,
         internal::ThreadTimer* timer, internal::ThreadManager* manager,
-        internal::PerfCountersMeasurement* perf_counters_measurement);
+        internal::PerfCountersMeasurement* perf_counters_measurement,
+        ProfilerManager* profiler_manager);
 
   void StartKeepRunning();
   // Implementation of KeepRunning() and KeepRunningBatch().
@@ -999,9 +1027,13 @@ class BENCHMARK_EXPORT State {
   internal::ThreadTimer* const timer_;
   internal::ThreadManager* const manager_;
   internal::PerfCountersMeasurement* const perf_counters_measurement_;
+  ProfilerManager* const profiler_manager_;
 
   friend class internal::BenchmarkInstance;
 };
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif  // _MSC_VER
 
 inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() {
   return KeepRunningInternal(1, /*is_batch=*/false);
@@ -1087,16 +1119,21 @@ inline BENCHMARK_ALWAYS_INLINE State::StateIterator State::end() {
   return StateIterator();
 }
 
-namespace internal {
+// Base class for user-defined multi-threading
+struct ThreadRunnerBase {
+  virtual ~ThreadRunnerBase() {}
+  virtual void RunThreads(const std::function<void(int)>& fn) = 0;
+};
 
-typedef void(Function)(State&);
+// Define alias of ThreadRunner factory function type
+using threadrunner_factory =
+    std::function<std::unique_ptr<ThreadRunnerBase>(int)>;
 
 // ------------------------------------------------------
-// Benchmark registration object.  The BENCHMARK() macro expands
-// into an internal::Benchmark* object.  Various methods can
-// be called on this object to change the properties of the benchmark.
-// Each method returns "this" so that multiple method calls can
-// chained into one expression.
+// Benchmark registration object.  The BENCHMARK() macro expands into a
+// Benchmark* object.  Various methods can be called on this object to
+// change the properties of the benchmark.  Each method returns "this" so
+// that multiple method calls can be chained into one expression.
 class BENCHMARK_EXPORT Benchmark {
  public:
   virtual ~Benchmark();
@@ -1143,12 +1180,12 @@ class BENCHMARK_EXPORT Benchmark {
   // Run this benchmark once for a number of values picked from the
   // ranges [start..limit].  (starts and limits are always picked.)
   // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
-  Benchmark* Ranges(const std::vector<std::pair<int64_t, int64_t> >& ranges);
+  Benchmark* Ranges(const std::vector<std::pair<int64_t, int64_t>>& ranges);
 
   // Run this benchmark once for each combination of values in the (cartesian)
   // product of the supplied argument lists.
   // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
-  Benchmark* ArgsProduct(const std::vector<std::vector<int64_t> >& arglists);
+  Benchmark* ArgsProduct(const std::vector<std::vector<int64_t>>& arglists);
 
   // Equivalent to ArgNames({name})
   Benchmark* ArgName(const std::string& name);
@@ -1161,7 +1198,7 @@ class BENCHMARK_EXPORT Benchmark {
   // NOTE: This is a legacy C++03 interface provided for compatibility only.
   //   New code should use 'Ranges'.
   Benchmark* RangePair(int64_t lo1, int64_t hi1, int64_t lo2, int64_t hi2) {
-    std::vector<std::pair<int64_t, int64_t> > ranges;
+    std::vector<std::pair<int64_t, int64_t>> ranges;
     ranges.push_back(std::make_pair(lo1, hi1));
     ranges.push_back(std::make_pair(lo2, hi2));
     return Ranges(ranges);
@@ -1179,15 +1216,15 @@ class BENCHMARK_EXPORT Benchmark {
   //
   // The callback will be passed a State object, which includes the number
   // of threads, thread-index, benchmark arguments, etc.
-  //
-  // The callback must not be NULL or self-deleting.
-  Benchmark* Setup(void (*setup)(const benchmark::State&));
-  Benchmark* Teardown(void (*teardown)(const benchmark::State&));
+  Benchmark* Setup(callback_function&&);
+  Benchmark* Setup(const callback_function&);
+  Benchmark* Teardown(callback_function&&);
+  Benchmark* Teardown(const callback_function&);
 
   // Pass this benchmark object to *func, which can customize
   // the benchmark by calling various methods like Arg, Args,
   // Threads, etc.
-  Benchmark* Apply(void (*func)(Benchmark* benchmark));
+  Benchmark* Apply(const std::function<void(Benchmark* benchmark)>&);
 
   // Set the range multiplier for non-dense range. If not called, the range
   // multiplier kRangeMultiplier will be used.
@@ -1293,6 +1330,9 @@ class BENCHMARK_EXPORT Benchmark {
   // Equivalent to ThreadRange(NumCPUs(), NumCPUs())
   Benchmark* ThreadPerCpu();
 
+  // Sets a user-defined threadrunner (see ThreadRunnerBase)
+  Benchmark* ThreadRunner(threadrunner_factory&& factory);
+
   virtual void Run(State& state) = 0;
 
   TimeUnit GetTimeUnit() const;
@@ -1307,13 +1347,13 @@ class BENCHMARK_EXPORT Benchmark {
   const char* GetArgName(int arg) const;
 
  private:
-  friend class BenchmarkFamilies;
-  friend class BenchmarkInstance;
+  friend class internal::BenchmarkFamilies;
+  friend class internal::BenchmarkInstance;
 
   std::string name_;
-  AggregationReportMode aggregation_report_mode_;
-  std::vector<std::string> arg_names_;       // Args for all benchmark runs
-  std::vector<std::vector<int64_t> > args_;  // Args for all benchmark runs
+  internal::AggregationReportMode aggregation_report_mode_;
+  std::vector<std::string> arg_names_;      // Args for all benchmark runs
+  std::vector<std::vector<int64_t>> args_;  // Args for all benchmark runs
 
   TimeUnit time_unit_;
   bool use_default_time_unit_;
@@ -1328,39 +1368,39 @@ class BENCHMARK_EXPORT Benchmark {
   bool use_manual_time_;
   BigO complexity_;
   BigOFunc* complexity_lambda_;
-  std::vector<Statistics> statistics_;
+  std::vector<internal::Statistics> statistics_;
   std::vector<int> thread_counts_;
 
-  typedef void (*callback_function)(const benchmark::State&);
   callback_function setup_;
   callback_function teardown_;
 
-  Benchmark(Benchmark const&)
-#if defined(BENCHMARK_HAS_CXX11)
-      = delete
-#endif
-      ;
+  threadrunner_factory threadrunner_;
 
-  Benchmark& operator=(Benchmark const&)
-#if defined(BENCHMARK_HAS_CXX11)
-      = delete
-#endif
-      ;
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(Benchmark);
 };
 
+namespace internal {
+
+// clang-format off
+typedef BENCHMARK_DEPRECATED_MSG("Use ::benchmark::Benchmark instead")
+    ::benchmark::Benchmark Benchmark;
+typedef BENCHMARK_DEPRECATED_MSG(
+    "Use ::benchmark::threadrunner_factory instead")
+    ::benchmark::threadrunner_factory threadrunner_factory;
+// clang-format on
+
+typedef void(Function)(State&);
+
 }  // namespace internal
 
 // Create and register a benchmark with the specified 'name' that invokes
 // the specified functor 'fn'.
 //
 // RETURNS: A pointer to the registered benchmark.
-internal::Benchmark* RegisterBenchmark(const std::string& name,
-                                       internal::Function* fn);
+Benchmark* RegisterBenchmark(const std::string& name, internal::Function* fn);
 
-#if defined(BENCHMARK_HAS_CXX11)
 template <class Lambda>
-internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn);
-#endif
+Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn);
 
 // Remove all registered benchmarks. All pointers to previously registered
 // benchmarks are invalidated.
@@ -1369,76 +1409,61 @@ BENCHMARK_EXPORT void ClearRegisteredBenchmarks();
 namespace internal {
 // The class used to hold all Benchmarks created from static function.
 // (ie those created using the BENCHMARK(...) macros.
-class BENCHMARK_EXPORT FunctionBenchmark : public Benchmark {
+class BENCHMARK_EXPORT FunctionBenchmark : public benchmark::Benchmark {
  public:
   FunctionBenchmark(const std::string& name, Function* func)
       : Benchmark(name), func_(func) {}
 
-  void Run(State& st) BENCHMARK_OVERRIDE;
+  void Run(State& st) override;
 
  private:
   Function* func_;
 };
 
-#ifdef BENCHMARK_HAS_CXX11
 template <class Lambda>
-class LambdaBenchmark : public Benchmark {
+class LambdaBenchmark : public benchmark::Benchmark {
  public:
-  void Run(State& st) BENCHMARK_OVERRIDE { lambda_(st); }
+  void Run(State& st) override { lambda_(st); }
 
- private:
   template <class OLambda>
   LambdaBenchmark(const std::string& name, OLambda&& lam)
       : Benchmark(name), lambda_(std::forward<OLambda>(lam)) {}
 
+ private:
   LambdaBenchmark(LambdaBenchmark const&) = delete;
-
-  template <class Lam>  // NOLINTNEXTLINE(readability-redundant-declaration)
-  friend Benchmark* ::benchmark::RegisterBenchmark(const std::string&, Lam&&);
-
   Lambda lambda_;
 };
-#endif
 }  // namespace internal
 
-inline internal::Benchmark* RegisterBenchmark(const std::string& name,
-                                              internal::Function* fn) {
-  // FIXME: this should be a `std::make_unique<>()` but we don't have C++14.
-  // codechecker_intentional [cplusplus.NewDeleteLeaks]
+inline Benchmark* RegisterBenchmark(const std::string& name,
+                                    internal::Function* fn) {
   return internal::RegisterBenchmarkInternal(
-      ::new internal::FunctionBenchmark(name, fn));
+      ::benchmark::internal::make_unique<internal::FunctionBenchmark>(name,
+                                                                      fn));
 }
 
-#ifdef BENCHMARK_HAS_CXX11
 template <class Lambda>
-internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn) {
+Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn) {
   using BenchType =
       internal::LambdaBenchmark<typename std::decay<Lambda>::type>;
-  // FIXME: this should be a `std::make_unique<>()` but we don't have C++14.
-  // codechecker_intentional [cplusplus.NewDeleteLeaks]
   return internal::RegisterBenchmarkInternal(
-      ::new BenchType(name, std::forward<Lambda>(fn)));
+      ::benchmark::internal::make_unique<BenchType>(name,
+                                                    std::forward<Lambda>(fn)));
 }
-#endif
 
-#if defined(BENCHMARK_HAS_CXX11) && \
-    (!defined(BENCHMARK_GCC_VERSION) || BENCHMARK_GCC_VERSION >= 409)
 template <class Lambda, class... Args>
-internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn,
-                                       Args&&... args) {
+Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn,
+                             Args&&... args) {
   return benchmark::RegisterBenchmark(
       name, [=](benchmark::State& st) { fn(st, args...); });
 }
-#else
-#define BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
-#endif
 
 // The base class for all fixture tests.
-class Fixture : public internal::Benchmark {
+class Fixture : public Benchmark {
  public:
-  Fixture() : internal::Benchmark("") {}
+  Fixture() : Benchmark("") {}
 
-  void Run(State& st) BENCHMARK_OVERRIDE {
+  void Run(State& st) override {
     this->SetUp(st);
     this->BenchmarkCase(st);
     this->TearDown(st);
@@ -1484,14 +1509,9 @@ BENCHMARK_DISABLE_COUNTER_WARNING
 BENCHMARK_RESTORE_COUNTER_WARNING
 
 // Helpers for generating unique variable names
-#ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_PRIVATE_NAME(...)                                      \
   BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, \
                            __VA_ARGS__)
-#else
-#define BENCHMARK_PRIVATE_NAME(n) \
-  BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
-#endif  // BENCHMARK_HAS_CXX11
 
 #define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
 #define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
@@ -1499,23 +1519,19 @@ BENCHMARK_RESTORE_COUNTER_WARNING
 #define BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method) \
   BaseClass##_##Method##_Benchmark
 
-#define BENCHMARK_PRIVATE_DECLARE(n)                                 \
-  BENCHMARK_DISABLE_COUNTER_WARNING                                  \
-  static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \
+#define BENCHMARK_PRIVATE_DECLARE(n)                                   \
+  BENCHMARK_DISABLE_COUNTER_WARNING                                    \
+  /* NOLINTNEXTLINE(misc-use-anonymous-namespace) */                   \
+  static ::benchmark::Benchmark const* const BENCHMARK_PRIVATE_NAME(n) \
       BENCHMARK_RESTORE_COUNTER_WARNING BENCHMARK_UNUSED
 
-#ifdef BENCHMARK_HAS_CXX11
-#define BENCHMARK(...)                                               \
-  BENCHMARK_PRIVATE_DECLARE(_benchmark_) =                           \
-      (::benchmark::internal::RegisterBenchmarkInternal(             \
-          new ::benchmark::internal::FunctionBenchmark(#__VA_ARGS__, \
-                                                       __VA_ARGS__)))
-#else
-#define BENCHMARK(n)                                     \
-  BENCHMARK_PRIVATE_DECLARE(n) =                         \
+#define BENCHMARK(...)                                   \
+  BENCHMARK_PRIVATE_DECLARE(_benchmark_) =               \
       (::benchmark::internal::RegisterBenchmarkInternal( \
-          new ::benchmark::internal::FunctionBenchmark(#n, n)))
-#endif  // BENCHMARK_HAS_CXX11
+          ::benchmark::internal::make_unique<            \
+              ::benchmark::internal::FunctionBenchmark>( \
+              #__VA_ARGS__,                              \
+              static_cast<::benchmark::internal::Function*>(__VA_ARGS__))))
 
 // Old-style macros
 #define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
@@ -1525,8 +1541,6 @@ BENCHMARK_RESTORE_COUNTER_WARNING
 #define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
   BENCHMARK(n)->RangePair({{(l1), (h1)}, {(l2), (h2)}})
 
-#ifdef BENCHMARK_HAS_CXX11
-
 // Register a benchmark which invokes the function specified by `func`
 // with the additional arguments specified by `...`.
 //
@@ -1541,12 +1555,11 @@ BENCHMARK_RESTORE_COUNTER_WARNING
 #define BENCHMARK_CAPTURE(func, test_case_name, ...)     \
   BENCHMARK_PRIVATE_DECLARE(_benchmark_) =               \
       (::benchmark::internal::RegisterBenchmarkInternal( \
-          new ::benchmark::internal::FunctionBenchmark(  \
+          ::benchmark::internal::make_unique<            \
+              ::benchmark::internal::FunctionBenchmark>( \
               #func "/" #test_case_name,                 \
               [](::benchmark::State& st) { func(st, __VA_ARGS__); })))
 
-#endif  // BENCHMARK_HAS_CXX11
-
 // This will register a benchmark for a templatized function.  For example:
 //
 // template<int arg>
@@ -1558,25 +1571,27 @@ BENCHMARK_RESTORE_COUNTER_WARNING
 #define BENCHMARK_TEMPLATE1(n, a)                        \
   BENCHMARK_PRIVATE_DECLARE(n) =                         \
       (::benchmark::internal::RegisterBenchmarkInternal( \
-          new ::benchmark::internal::FunctionBenchmark(#n "<" #a ">", n<a>)))
+          ::benchmark::internal::make_unique<            \
+              ::benchmark::internal::FunctionBenchmark>( \
+              #n "<" #a ">",                             \
+              static_cast<::benchmark::internal::Function*>(n<a>))))
 
-#define BENCHMARK_TEMPLATE2(n, a, b)                                         \
-  BENCHMARK_PRIVATE_DECLARE(n) =                                             \
-      (::benchmark::internal::RegisterBenchmarkInternal(                     \
-          new ::benchmark::internal::FunctionBenchmark(#n "<" #a "," #b ">", \
-                                                       n<a, b>)))
+#define BENCHMARK_TEMPLATE2(n, a, b)                     \
+  BENCHMARK_PRIVATE_DECLARE(n) =                         \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          ::benchmark::internal::make_unique<            \
+              ::benchmark::internal::FunctionBenchmark>( \
+              #n "<" #a "," #b ">",                      \
+              static_cast<::benchmark::internal::Function*>(n<a, b>))))
 
-#ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE(n, ...)                       \
   BENCHMARK_PRIVATE_DECLARE(n) =                         \
       (::benchmark::internal::RegisterBenchmarkInternal( \
-          new ::benchmark::internal::FunctionBenchmark(  \
-              #n "<" #__VA_ARGS__ ">", n<__VA_ARGS__>)))
-#else
-#define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
-#endif
+          ::benchmark::internal::make_unique<            \
+              ::benchmark::internal::FunctionBenchmark>( \
+              #n "<" #__VA_ARGS__ ">",                   \
+              static_cast<::benchmark::internal::Function*>(n<__VA_ARGS__>))))
 
-#ifdef BENCHMARK_HAS_CXX11
 // This will register a benchmark for a templatized function,
 // with the additional arguments specified by `...`.
 //
@@ -1595,21 +1610,21 @@ BENCHMARK_RESTORE_COUNTER_WARNING
 #define BENCHMARK_TEMPLATE2_CAPTURE(func, a, b, test_case_name, ...) \
   BENCHMARK_PRIVATE_DECLARE(func) =                                  \
       (::benchmark::internal::RegisterBenchmarkInternal(             \
-          new ::benchmark::internal::FunctionBenchmark(              \
+          ::benchmark::internal::make_unique<                        \
+              ::benchmark::internal::FunctionBenchmark>(             \
               #func "<" #a "," #b ">"                                \
                     "/" #test_case_name,                             \
               [](::benchmark::State& st) { func<a, b>(st, __VA_ARGS__); })))
-#endif  // BENCHMARK_HAS_CXX11
-
-#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)          \
-  class BaseClass##_##Method##_Benchmark : public BaseClass {   \
-   public:                                                      \
-    BaseClass##_##Method##_Benchmark() {                        \
-      this->SetName(#BaseClass "/" #Method);                    \
-    }                                                           \
-                                                                \
-   protected:                                                   \
-    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
+
+#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)        \
+  class BaseClass##_##Method##_Benchmark : public BaseClass { \
+   public:                                                    \
+    BaseClass##_##Method##_Benchmark() {                      \
+      this->SetName(#BaseClass "/" #Method);                  \
+    }                                                         \
+                                                              \
+   protected:                                                 \
+    void BenchmarkCase(::benchmark::State&) override;         \
   };
 
 #define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
@@ -1620,7 +1635,7 @@ BENCHMARK_RESTORE_COUNTER_WARNING
     }                                                               \
                                                                     \
    protected:                                                       \
-    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE;     \
+    void BenchmarkCase(::benchmark::State&) override;               \
   };
 
 #define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
@@ -1631,10 +1646,9 @@ BENCHMARK_RESTORE_COUNTER_WARNING
     }                                                                  \
                                                                        \
    protected:                                                          \
-    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE;        \
+    void BenchmarkCase(::benchmark::State&) override;                  \
   };
 
-#ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, ...)       \
   class BaseClass##_##Method##_Benchmark : public BaseClass<__VA_ARGS__> { \
    public:                                                                 \
@@ -1643,12 +1657,8 @@ BENCHMARK_RESTORE_COUNTER_WARNING
     }                                                                      \
                                                                            \
    protected:                                                              \
-    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE;            \
+    void BenchmarkCase(::benchmark::State&) override;                      \
   };
-#else
-#define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(n, a) \
-  BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(n, a)
-#endif
 
 #define BENCHMARK_DEFINE_F(BaseClass, Method)    \
   BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
@@ -1662,21 +1672,50 @@ BENCHMARK_RESTORE_COUNTER_WARNING
   BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
   void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 
-#ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, ...)            \
   BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
   void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
-#else
-#define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, a) \
-  BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a)
-#endif
 
 #define BENCHMARK_REGISTER_F(BaseClass, Method) \
   BENCHMARK_PRIVATE_REGISTER_F(BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method))
 
-#define BENCHMARK_PRIVATE_REGISTER_F(TestName) \
-  BENCHMARK_PRIVATE_DECLARE(TestName) =        \
-      (::benchmark::internal::RegisterBenchmarkInternal(new TestName()))
+#define BENCHMARK_PRIVATE_REGISTER_F(TestName)           \
+  BENCHMARK_PRIVATE_DECLARE(TestName) =                  \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          ::benchmark::internal::make_unique<TestName>()))
+
+#define BENCHMARK_TEMPLATE_PRIVATE_CONCAT_NAME_F(BaseClass, Method) \
+  BaseClass##_##Method##_BenchmarkTemplate
+
+#define BENCHMARK_TEMPLATE_METHOD_F(BaseClass, Method)              \
+  template <class... Args>                                          \
+  class BENCHMARK_TEMPLATE_PRIVATE_CONCAT_NAME_F(BaseClass, Method) \
+      : public BaseClass<Args...> {                                 \
+   protected:                                                       \
+    using Base = BaseClass<Args...>;                                \
+    void BenchmarkCase(::benchmark::State&) override;               \
+  };                                                                \
+  template <class... Args>                                          \
+  void BENCHMARK_TEMPLATE_PRIVATE_CONCAT_NAME_F(                    \
+      BaseClass, Method)<Args...>::BenchmarkCase
+
+#define BENCHMARK_TEMPLATE_PRIVATE_INSTANTIATE_F(BaseClass, Method,           \
+                                                 UniqueName, ...)             \
+  class UniqueName : public BENCHMARK_TEMPLATE_PRIVATE_CONCAT_NAME_F(         \
+                         BaseClass, Method)<__VA_ARGS__> {                    \
+   public:                                                                    \
+    UniqueName() { this->SetName(#BaseClass "<" #__VA_ARGS__ ">/" #Method); } \
+  };                                                                          \
+  BENCHMARK_PRIVATE_DECLARE(BaseClass##_##Method##_Benchmark) =               \
+      (::benchmark::internal::RegisterBenchmarkInternal(                      \
+          ::benchmark::internal::make_unique<UniqueName>()))
+
+#define BENCHMARK_TEMPLATE_INSTANTIATE_F(BaseClass, Method, ...)    \
+  BENCHMARK_DISABLE_COUNTER_WARNING                                 \
+  BENCHMARK_TEMPLATE_PRIVATE_INSTANTIATE_F(                         \
+      BaseClass, Method, BENCHMARK_PRIVATE_NAME(BaseClass##Method), \
+      __VA_ARGS__)                                                  \
+  BENCHMARK_RESTORE_COUNTER_WARNING
 
 // This macro will define and register a benchmark within a fixture class.
 #define BENCHMARK_F(BaseClass, Method)           \
@@ -1694,22 +1733,18 @@ BENCHMARK_RESTORE_COUNTER_WARNING
   BENCHMARK_REGISTER_F(BaseClass, Method);                       \
   void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 
-#ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE_F(BaseClass, Method, ...)                   \
   BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
   BENCHMARK_REGISTER_F(BaseClass, Method);                             \
   void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
-#else
-#define BENCHMARK_TEMPLATE_F(BaseClass, Method, a) \
-  BENCHMARK_TEMPLATE1_F(BaseClass, Method, a)
-#endif
 
 // Helper macro to create a main routine in a test that runs the benchmarks
 // Note the workaround for Hexagon simulator passing argc != 0, argv = NULL.
 #define BENCHMARK_MAIN()                                                \
   int main(int argc, char** argv) {                                     \
+    benchmark::MaybeReenterWithoutASLR(argc, argv);                     \
     char arg0_default[] = "benchmark";                                  \
-    char* args_default = arg0_default;                                  \
+    char* args_default = reinterpret_cast<char*>(arg0_default);         \
     if (!argv) {                                                        \
       argc = 1;                                                         \
       argv = &args_default;                                             \
@@ -1752,7 +1787,10 @@ struct BENCHMARK_EXPORT CPUInfo {
 
 // Adding Struct for System Information
 struct BENCHMARK_EXPORT SystemInfo {
+  enum class ASLR { UNKNOWN, ENABLED, DISABLED };
+
   std::string name;
+  ASLR ASLRStatus;
   static const SystemInfo& Get();
 
  private:
@@ -1789,7 +1827,7 @@ class BENCHMARK_EXPORT BenchmarkReporter {
     CPUInfo const& cpu_info;
     SystemInfo const& sys_info;
     // The number of chars in the longest benchmark name.
-    size_t name_field_width;
+    size_t name_field_width = 0;
     static const char* executable_name;
     Context();
   };
@@ -1812,9 +1850,9 @@ class BENCHMARK_EXPORT BenchmarkReporter {
           complexity(oNone),
           complexity_lambda(),
           complexity_n(0),
+          statistics(),
           report_big_o(false),
           report_rms(false),
-          memory_result(NULL),
           allocs_per_iter(0.0) {}
 
     std::string benchmark_name() const;
@@ -1870,7 +1908,7 @@ class BENCHMARK_EXPORT BenchmarkReporter {
     UserCounters counters;
 
     // Memory metrics.
-    const MemoryManager::Result* memory_result;
+    MemoryManager::Result memory_result;
     double allocs_per_iter;
   };
 
@@ -1962,12 +2000,12 @@ class BENCHMARK_EXPORT ConsoleReporter : public BenchmarkReporter {
   explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults)
       : output_options_(opts_), name_field_width_(0), printed_header_(false) {}
 
-  bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
-  void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
+  bool ReportContext(const Context& context) override;
+  void ReportRuns(const std::vector<Run>& reports) override;
 
  protected:
-  virtual void PrintRunData(const Run& report);
-  virtual void PrintHeader(const Run& report);
+  virtual void PrintRunData(const Run& result);
+  virtual void PrintHeader(const Run& run);
 
   OutputOptions output_options_;
   size_t name_field_width_;
@@ -1978,12 +2016,12 @@ class BENCHMARK_EXPORT ConsoleReporter : public BenchmarkReporter {
 class BENCHMARK_EXPORT JSONReporter : public BenchmarkReporter {
  public:
   JSONReporter() : first_report_(true) {}
-  bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
-  void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
-  void Finalize() BENCHMARK_OVERRIDE;
+  bool ReportContext(const Context& context) override;
+  void ReportRuns(const std::vector<Run>& reports) override;
+  void Finalize() override;
 
  private:
-  void PrintRunData(const Run& report);
+  void PrintRunData(const Run& run);
 
   bool first_report_;
 };
@@ -1993,11 +2031,11 @@ class BENCHMARK_EXPORT BENCHMARK_DEPRECATED_MSG(
     : public BenchmarkReporter {
  public:
   CSVReporter() : printed_header_(false) {}
-  bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
-  void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
+  bool ReportContext(const Context& context) override;
+  void ReportRuns(const std::vector<Run>& reports) override;
 
  private:
-  void PrintRunData(const Run& report);
+  void PrintRunData(const Run& run);
 
   bool printed_header_;
   std::set<std::string> user_counter_names_;
diff --git a/third-party/benchmark/pyproject.toml b/third-party/benchmark/pyproject.toml
index aa24ae8c3f573..f55daf26068b0 100644
--- a/third-party/benchmark/pyproject.toml
+++ b/third-party/benchmark/pyproject.toml
@@ -1,42 +1,33 @@
 [build-system]
-requires = ["setuptools", "setuptools-scm[toml]", "wheel"]
+requires = ["setuptools"]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "google_benchmark"
 description = "A library to benchmark code snippets."
-requires-python = ">=3.8"
-license = {file = "LICENSE"}
+requires-python = ">=3.10"
+license = { file = "LICENSE" }
 keywords = ["benchmark"]
 
-authors = [
-    {name = "Google", email = "benchmark-discuss at googlegroups.com"},
-]
+authors = [{ name = "Google", email = "benchmark-discuss at googlegroups.com" }]
 
 classifiers = [
     "Development Status :: 4 - Beta",
     "Intended Audience :: Developers",
     "Intended Audience :: Science/Research",
     "License :: OSI Approved :: Apache Software License",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
     "Topic :: Software Development :: Testing",
     "Topic :: System :: Benchmark",
 ]
 
 dynamic = ["readme", "version"]
 
-dependencies = [
-    "absl-py>=0.7.1",
-]
-
-[project.optional-dependencies]
-dev = [
-    "pre-commit>=3.3.3",
-]
+[dependency-groups]
+dev = ["pre-commit>=3.3.3"]
 
 [project.urls]
 Homepage = "https://github.com/google/benchmark"
@@ -45,7 +36,7 @@ Repository = "https://github.com/google/benchmark.git"
 Discord = "https://discord.gg/cz7UX7wKC2"
 
 [tool.setuptools]
-package-dir = {"" = "bindings/python"}
+package-dir = { "" = "bindings/python" }
 zip-safe = false
 
 [tool.setuptools.packages.find]
@@ -53,8 +44,7 @@ where = ["bindings/python"]
 
 [tool.setuptools.dynamic]
 readme = { file = "README.md", content-type = "text/markdown" }
-
-[tool.setuptools_scm]
+version = { attr = "google_benchmark.__version__" }
 
 [tool.mypy]
 check_untyped_defs = true
@@ -75,11 +65,13 @@ src = ["bindings/python"]
 line-length = 80
 target-version = "py311"
 
+[tool.ruff.lint]
 # Enable pycodestyle (`E`, `W`), Pyflakes (`F`), and isort (`I`) codes by default.
-select = ["E", "F", "I", "W"]
+select = ["ASYNC", "B", "C4", "C90", "E", "F", "I", "PERF", "PIE", "PT018", "RUF", "SIM", "UP", "W"]
 ignore = [
-    "E501", # line too long
+    "PLW2901",  # redefined-loop-name
+    "UP031",    # printf-string-formatting
 ]
 
-[tool.ruff.isort]
+[tool.ruff.lint.isort]
 combine-as-imports = true
diff --git a/third-party/benchmark/setup.py b/third-party/benchmark/setup.py
index cb20042da5123..d7807b4994396 100644
--- a/third-party/benchmark/setup.py
+++ b/third-party/benchmark/setup.py
@@ -1,46 +1,71 @@
 import contextlib
 import os
 import platform
+import re
 import shutil
-import sysconfig
+import sys
+from collections.abc import Generator
 from pathlib import Path
-from typing import Generator
+from typing import Any
 
 import setuptools
 from setuptools.command import build_ext
 
-PYTHON_INCLUDE_PATH_PLACEHOLDER = "<PYTHON_INCLUDE_PATH>"
-
 IS_WINDOWS = platform.system() == "Windows"
 IS_MAC = platform.system() == "Darwin"
+IS_LINUX = platform.system() == "Linux"
+
+# hardcoded SABI-related options. Requires that each Python interpreter
+# (hermetic or not) participating is of the same major-minor version.
+py_limited_api = sys.version_info >= (3, 12)
+options = {"bdist_wheel": {"py_limited_api": "cp312"}} if py_limited_api else {}
+
+
+def is_cibuildwheel() -> bool:
+    return os.getenv("CIBUILDWHEEL") is not None
 
 
 @contextlib.contextmanager
-def temp_fill_include_path(fp: str) -> Generator[None, None, None]:
-    """Temporarily set the Python include path in a file."""
-    with open(fp, "r+") as f:
-        try:
-            content = f.read()
-            replaced = content.replace(
-                PYTHON_INCLUDE_PATH_PLACEHOLDER,
-                Path(sysconfig.get_paths()["include"]).as_posix(),
+def _maybe_patch_toolchains() -> Generator[None, None, None]:
+    """
+    Patch rules_python toolchains to ignore root user error
+    when run in a Docker container on Linux in cibuildwheel.
+    """
+
+    def fmt_toolchain_args(matchobj):
+        suffix = "ignore_root_user_error = True"
+        callargs = matchobj.group(1)
+        # toolchain def is broken over multiple lines
+        if callargs.endswith("\n"):
+            callargs = callargs + "    " + suffix + ",\n"
+        # toolchain def is on one line.
+        else:
+            callargs = callargs + ", " + suffix
+        return "python.toolchain(" + callargs + ")"
+
+    CIBW_LINUX = is_cibuildwheel() and IS_LINUX
+    module_bazel = Path("MODULE.bazel")
+    content: str = module_bazel.read_text()
+    try:
+        if CIBW_LINUX:
+            module_bazel.write_text(
+                re.sub(
+                    r"python.toolchain\(([\w\"\s,.=]*)\)",
+                    fmt_toolchain_args,
+                    content,
+                )
             )
-            f.seek(0)
-            f.write(replaced)
-            f.truncate()
-            yield
-        finally:
-            # revert to the original content after exit
-            f.seek(0)
-            f.write(content)
-            f.truncate()
+        yield
+    finally:
+        if CIBW_LINUX:
+            module_bazel.write_text(content)
 
 
 class BazelExtension(setuptools.Extension):
     """A C/C++ extension that is defined as a Bazel BUILD target."""
 
-    def __init__(self, name: str, bazel_target: str):
-        super().__init__(name=name, sources=[])
+    def __init__(self, name: str, bazel_target: str, **kwargs: Any):
+        super().__init__(name=name, sources=[], **kwargs)
 
         self.bazel_target = bazel_target
         stripped_target = bazel_target.split("//")[-1]
@@ -53,7 +78,6 @@ class BuildBazelExtension(build_ext.build_ext):
     def run(self):
         for ext in self.extensions:
             self.bazel_build(ext)
-        super().run()
         # explicitly call `bazel shutdown` for graceful exit
         self.spawn(["bazel", "shutdown"])
 
@@ -63,61 +87,85 @@ def copy_extensions_to_source(self):
         This is done in the ``bazel_build`` method, so it's not necessary to
         do again in the `build_ext` base class.
         """
-        pass
 
-    def bazel_build(self, ext: BazelExtension) -> None:
+    def bazel_build(self, ext: BazelExtension) -> None:  # noqa: C901
         """Runs the bazel build to create the package."""
-        with temp_fill_include_path("WORKSPACE"):
-            temp_path = Path(self.build_temp)
-
-            bazel_argv = [
-                "bazel",
-                "build",
-                ext.bazel_target,
-                "--enable_bzlmod=false",
-                f"--symlink_prefix={temp_path / 'bazel-'}",
-                f"--compilation_mode={'dbg' if self.debug else 'opt'}",
-                # C++17 is required by nanobind
-                f"--cxxopt={'/std:c++17' if IS_WINDOWS else '-std=c++17'}",
-            ]
-
-            if IS_WINDOWS:
-                # Link with python*.lib.
-                for library_dir in self.library_dirs:
-                    bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
-            elif IS_MAC:
-                if platform.machine() == "x86_64":
-                    # C++17 needs macOS 10.14 at minimum
-                    bazel_argv.append("--macos_minimum_os=10.14")
-
-                    # cross-compilation for Mac ARM64 on GitHub Mac x86 runners.
-                    # ARCHFLAGS is set by cibuildwheel before macOS wheel builds.
-                    archflags = os.getenv("ARCHFLAGS", "")
-                    if "arm64" in archflags:
-                        bazel_argv.append("--cpu=darwin_arm64")
-                        bazel_argv.append("--macos_cpus=arm64")
-
-                elif platform.machine() == "arm64":
-                    bazel_argv.append("--macos_minimum_os=11.0")
-
+        temp_path = Path(self.build_temp)
+
+        # We round to the minor version, which makes rules_python
+        # look up the latest available patch version internally.
+        python_version = "{}.{}".format(*sys.version_info[:2])
+
+        bazel_argv = [
+            "bazel",
+            "run",
+            ext.bazel_target,
+            f"--symlink_prefix={temp_path / 'bazel-'}",
+            f"--compilation_mode={'dbg' if self.debug else 'opt'}",
+            # C++17 is required by nanobind
+            f"--cxxopt={'/std:c++17' if IS_WINDOWS else '-std=c++17'}",
+            f"--@rules_python//python/config_settings:python_version={python_version}",
+        ]
+
+        if ext.py_limited_api:
+            bazel_argv += ["--@nanobind_bazel//:py-limited-api=cp312"]
+
+        if IS_WINDOWS:
+            # Link with python*.lib.
+            for library_dir in self.library_dirs:
+                bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
+        elif IS_MAC:
+            # C++17 needs macOS 10.14 at minimum
+            bazel_argv.append("--macos_minimum_os=10.14")
+
+        with _maybe_patch_toolchains():
             self.spawn(bazel_argv)
 
-            shared_lib_suffix = ".dll" if IS_WINDOWS else ".so"
-            ext_name = ext.target_name + shared_lib_suffix
-            ext_bazel_bin_path = (
-                temp_path / "bazel-bin" / ext.relpath / ext_name
-            )
-
-            ext_dest_path = Path(self.get_ext_fullpath(ext.name))
-            shutil.copyfile(ext_bazel_bin_path, ext_dest_path)
+        if IS_WINDOWS:
+            suffix = ".pyd"
+        else:
+            suffix = ".abi3.so" if ext.py_limited_api else ".so"
+
+        # copy the Bazel build artifacts into setuptools' libdir,
+        # from where the wheel is built.
+        pkgname = "google_benchmark"
+        pythonroot = Path("bindings") / "python" / "google_benchmark"
+        srcdir = temp_path / "bazel-bin" / pythonroot
+        if not self.inplace:
+            libdir = Path(self.build_lib) / pkgname
+        else:
+            build_py = self.get_finalized_command("build_py")
+            libdir = Path(build_py.get_package_dir(pkgname))
+
+        for root, dirs, files in os.walk(srcdir, topdown=True):
+            # exclude runfiles directories and children.
+            dirs[:] = [d for d in dirs if "runfiles" not in d]
+
+            for f in files:
+                fp = Path(f)
+                should_copy = False
+                # we do not want the bare .so file included
+                # when building for ABI3, so we require a
+                # full and exact match on the file extension.
+                if "".join(fp.suffixes) == suffix or fp.suffix == ".pyi":
+                    should_copy = True
+                elif Path(root) == srcdir and f == "py.typed":
+                    # copy py.typed, but only at the package root.
+                    should_copy = True
+
+                if should_copy:
+                    shutil.copyfile(root / fp, libdir / fp)
 
 
 setuptools.setup(
-    cmdclass=dict(build_ext=BuildBazelExtension),
+    cmdclass={"build_ext": BuildBazelExtension},
+    package_data={"google_benchmark": ["py.typed", "*.pyi"]},
     ext_modules=[
         BazelExtension(
             name="google_benchmark._benchmark",
-            bazel_target="//bindings/python/google_benchmark:_benchmark",
+            bazel_target="//bindings/python/google_benchmark:benchmark_stubgen",
+            py_limited_api=py_limited_api,
         )
     ],
+    options=options,
 )
diff --git a/third-party/benchmark/src/CMakeLists.txt b/third-party/benchmark/src/CMakeLists.txt
index 0357dcce3f831..6804c3c463d7e 100644
--- a/third-party/benchmark/src/CMakeLists.txt
+++ b/third-party/benchmark/src/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Allow the source files to find headers in src/
+#Allow the source files to find headers in src /
 include(GNUInstallDirs)
 include_directories(${PROJECT_SOURCE_DIR}/src)
 
@@ -39,6 +39,9 @@ set_property(
 if (PFM_FOUND)
   target_link_libraries(benchmark PRIVATE PFM::libpfm)
   target_compile_definitions(benchmark PRIVATE -DHAVE_LIBPFM)
+  install(
+      FILES "${PROJECT_SOURCE_DIR}/cmake/Modules/FindPFM.cmake"
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
 endif()
 
 # pthread affinity, if available
@@ -86,6 +89,7 @@ set(generated_dir "${PROJECT_BINARY_DIR}")
 set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
 set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
 set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc")
+set(pkg_config_main "${generated_dir}/${PROJECT_NAME}_main.pc")
 set(targets_to_export benchmark benchmark_main)
 set(targets_export_name "${PROJECT_NAME}Targets")
 
@@ -104,7 +108,22 @@ write_basic_package_version_file(
   "${version_config}" VERSION ${GENERIC_LIB_VERSION} COMPATIBILITY SameMajorVersion
 )
 
+# Derive private link libraries from target
+if(NOT BUILD_SHARED_LIBS)
+  get_target_property(LINK_LIBS benchmark LINK_LIBRARIES)
+  if(LINK_LIBS)
+    set(BENCHMARK_PRIVATE_LINK_LIBRARIES "")
+    foreach(LIB IN LISTS LINK_LIBS)
+      if(NOT TARGET "${LIB}" AND LIB MATCHES "^[a-zA-Z0-9_.-]+$")
+        list(APPEND BENCHMARK_PRIVATE_LINK_LIBRARIES "-l${LIB}")
+      endif()
+    endforeach()
+    string(JOIN " " BENCHMARK_PRIVATE_LINK_LIBRARIES ${BENCHMARK_PRIVATE_LINK_LIBRARIES})
+  endif()
+endif()
+
 configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY)
+configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark_main.pc.in" "${pkg_config_main}" @ONLY)
 
 export (
   TARGETS ${targets_to_export}
@@ -133,7 +152,7 @@ if (BENCHMARK_ENABLE_INSTALL)
       DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
 
   install(
-      FILES "${pkg_config}"
+      FILES "${pkg_config}" "${pkg_config_main}"
       DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
 
   install(
@@ -175,3 +194,11 @@ else()
       DESTINATION ${CMAKE_INSTALL_DOCDIR})
   endif()
 endif()
+
+set(CMAKE_INSTALL_PYTOOLSDIR "${CMAKE_INSTALL_DATADIR}/googlebenchmark/tools" CACHE PATH "")
+
+if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_TOOLS)
+  install(
+    DIRECTORY "${PROJECT_SOURCE_DIR}/tools/"
+    DESTINATION ${CMAKE_INSTALL_PYTOOLSDIR})
+endif()
diff --git a/third-party/benchmark/src/benchmark.cc b/third-party/benchmark/src/benchmark.cc
index 495944db29ff7..fc36fedb19cfa 100644
--- a/third-party/benchmark/src/benchmark.cc
+++ b/third-party/benchmark/src/benchmark.cc
@@ -26,6 +26,10 @@
 #include <unistd.h>
 #endif
 
+#ifdef BENCHMARK_OS_LINUX
+#include <sys/personality.h>
+#endif
+
 #include <algorithm>
 #include <atomic>
 #include <condition_variable>
@@ -46,7 +50,6 @@
 #include "commandlineflags.h"
 #include "complexity.h"
 #include "counter.h"
-#include "internal_macros.h"
 #include "log.h"
 #include "mutex.h"
 #include "perf_counters.h"
@@ -92,6 +95,11 @@ BM_DEFINE_double(benchmark_min_warmup_time, 0.0);
 // standard deviation of the runs will be reported.
 BM_DEFINE_int32(benchmark_repetitions, 1);
 
+// If enabled, forces each benchmark to execute exactly one iteration and one
+// repetition, bypassing any configured
+// MinTime()/MinWarmUpTime()/Iterations()/Repetitions()
+BM_DEFINE_bool(benchmark_dry_run, false);
+
 // If set, enable random interleaving of repetitions of all benchmarks.
 // See http://github.com/google/benchmark/issues/1051 for details.
 BM_DEFINE_bool(benchmark_enable_random_interleaving, false);
@@ -146,21 +154,34 @@ BM_DEFINE_int32(v, 0);
 
 namespace internal {
 
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
 std::map<std::string, std::string>* global_context = nullptr;
 
 BENCHMARK_EXPORT std::map<std::string, std::string>*& GetGlobalContext() {
   return global_context;
 }
 
-// FIXME: wouldn't LTO mess this up?
-void UseCharPointer(char const volatile*) {}
+namespace {
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+void const volatile* volatile global_force_escape_pointer;
+}  // namespace
+
+// FIXME: Verify if LTO still messes this up?
+void UseCharPointer(char const volatile* const v) {
+  // We want to escape the pointer `v` so that the compiler cannot eliminate
+  // the computations that produced it. To do that, we store the pointer
+  // into a volatile variable, since a volatile store is generally not
+  // something the compiler is allowed to elide.
+  global_force_escape_pointer = reinterpret_cast<void const volatile*>(v);
+}
 
 }  // namespace internal
 
 State::State(std::string name, IterationCount max_iters,
              const std::vector<int64_t>& ranges, int thread_i, int n_threads,
              internal::ThreadTimer* timer, internal::ThreadManager* manager,
-             internal::PerfCountersMeasurement* perf_counters_measurement)
+             internal::PerfCountersMeasurement* perf_counters_measurement,
+             ProfilerManager* profiler_manager)
     : total_iterations_(0),
       batch_leftover_(0),
       max_iterations(max_iters),
@@ -174,7 +195,8 @@ State::State(std::string name, IterationCount max_iters,
       threads_(n_threads),
       timer_(timer),
       manager_(manager),
-      perf_counters_measurement_(perf_counters_measurement) {
+      perf_counters_measurement_(perf_counters_measurement),
+      profiler_manager_(profiler_manager) {
   BM_CHECK(max_iterations != 0) << "At least one iteration must be run";
   BM_CHECK_LT(thread_index_, threads_)
       << "thread_index must be less than threads";
@@ -183,7 +205,7 @@ State::State(std::string name, IterationCount max_iters,
   // `PauseTiming`, a new `Counter` will be inserted the first time, which
   // won't have the flag.  Inserting them now also reduces the allocations
   // during the benchmark.
-  if (perf_counters_measurement_) {
+  if (perf_counters_measurement_ != nullptr) {
     for (const std::string& counter_name :
          perf_counters_measurement_->names()) {
       counters[counter_name] = Counter(0.0, Counter::kAvgIterations);
@@ -199,12 +221,9 @@ State::State(std::string name, IterationCount max_iters,
 #if defined(__INTEL_COMPILER)
 #pragma warning push
 #pragma warning(disable : 1875)
-#elif defined(__GNUC__)
+#elif defined(__GNUC__) || defined(__clang__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Winvalid-offsetof"
-#elif defined(__clang__)
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Winvalid-offsetof"
 #endif
 #if defined(__NVCC__)
 #pragma nv_diagnostic push
@@ -220,10 +239,8 @@ State::State(std::string name, IterationCount max_iters,
       offsetof(State, skipped_) <= (cache_line_size - sizeof(skipped_)), "");
 #if defined(__INTEL_COMPILER)
 #pragma warning pop
-#elif defined(__GNUC__)
+#elif defined(__GNUC__) || defined(__clang__)
 #pragma GCC diagnostic pop
-#elif defined(__clang__)
-#pragma clang diagnostic pop
 #endif
 #if defined(__NVCC__)
 #pragma nv_diagnostic pop
@@ -237,7 +254,7 @@ void State::PauseTiming() {
   // Add in time accumulated so far
   BM_CHECK(started_ && !finished_ && !skipped());
   timer_->StopTimer();
-  if (perf_counters_measurement_) {
+  if (perf_counters_measurement_ != nullptr) {
     std::vector<std::pair<std::string, double>> measurements;
     if (!perf_counters_measurement_->Stop(measurements)) {
       BM_CHECK(false) << "Perf counters read the value failed.";
@@ -255,7 +272,7 @@ void State::PauseTiming() {
 void State::ResumeTiming() {
   BM_CHECK(started_ && !finished_ && !skipped());
   timer_->StartTimer();
-  if (perf_counters_measurement_) {
+  if (perf_counters_measurement_ != nullptr) {
     perf_counters_measurement_->Start();
   }
 }
@@ -270,7 +287,9 @@ void State::SkipWithMessage(const std::string& msg) {
     }
   }
   total_iterations_ = 0;
-  if (timer_->running()) timer_->StopTimer();
+  if (timer_->running()) {
+    timer_->StopTimer();
+  }
 }
 
 void State::SkipWithError(const std::string& msg) {
@@ -283,7 +302,9 @@ void State::SkipWithError(const std::string& msg) {
     }
   }
   total_iterations_ = 0;
-  if (timer_->running()) timer_->StopTimer();
+  if (timer_->running()) {
+    timer_->StopTimer();
+  }
 }
 
 void State::SetIterationTime(double seconds) {
@@ -299,8 +320,13 @@ void State::StartKeepRunning() {
   BM_CHECK(!started_ && !finished_);
   started_ = true;
   total_iterations_ = skipped() ? 0 : max_iterations;
+  if (BENCHMARK_BUILTIN_EXPECT(profiler_manager_ != nullptr, false)) {
+    profiler_manager_->AfterSetupStart();
+  }
   manager_->StartStopBarrier();
-  if (!skipped()) ResumeTiming();
+  if (!skipped()) {
+    ResumeTiming();
+  }
 }
 
 void State::FinishKeepRunning() {
@@ -312,6 +338,9 @@ void State::FinishKeepRunning() {
   total_iterations_ = 0;
   finished_ = true;
   manager_->StartStopBarrier();
+  if (BENCHMARK_BUILTIN_EXPECT(profiler_manager_ != nullptr, false)) {
+    profiler_manager_->BeforeTeardownStop();
+  }
 }
 
 namespace internal {
@@ -320,7 +349,9 @@ namespace {
 // Flushes streams after invoking reporter methods that write to them. This
 // ensures users get timely updates even when streams are not line-buffered.
 void FlushStreams(BenchmarkReporter* reporter) {
-  if (!reporter) return;
+  if (reporter == nullptr) {
+    return;
+  }
   std::flush(reporter->GetOutputStream());
   std::flush(reporter->GetErrorStream());
 }
@@ -333,16 +364,20 @@ void Report(BenchmarkReporter* display_reporter,
     assert(reporter);
     // If there are no aggregates, do output non-aggregates.
     aggregates_only &= !results.aggregates_only.empty();
-    if (!aggregates_only) reporter->ReportRuns(results.non_aggregates);
-    if (!results.aggregates_only.empty())
+    if (!aggregates_only) {
+      reporter->ReportRuns(results.non_aggregates);
+    }
+    if (!results.aggregates_only.empty()) {
       reporter->ReportRuns(results.aggregates_only);
+    }
   };
 
   report_one(display_reporter, run_results.display_report_aggregates_only,
              run_results);
-  if (file_reporter)
+  if (file_reporter != nullptr) {
     report_one(file_reporter, run_results.file_report_aggregates_only,
                run_results);
+  }
 
   FlushStreams(display_reporter);
   FlushStreams(file_reporter);
@@ -363,10 +398,13 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
         std::max<size_t>(name_field_width, benchmark.name().str().size());
     might_have_aggregates |= benchmark.repetitions() > 1;
 
-    for (const auto& Stat : benchmark.statistics())
+    for (const auto& Stat : benchmark.statistics()) {
       stat_field_width = std::max<size_t>(stat_field_width, Stat.name_.size());
+    }
+  }
+  if (might_have_aggregates) {
+    name_field_width += 1 + stat_field_width;
   }
-  if (might_have_aggregates) name_field_width += 1 + stat_field_width;
 
   // Print header here
   BenchmarkReporter::Context context;
@@ -377,7 +415,7 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
       per_family_reports;
 
   if (display_reporter->ReportContext(context) &&
-      (!file_reporter || file_reporter->ReportContext(context))) {
+      ((file_reporter == nullptr) || file_reporter->ReportContext(context))) {
     FlushStreams(display_reporter);
     FlushStreams(file_reporter);
 
@@ -399,14 +437,17 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
     // Loop through all benchmarks
     for (const BenchmarkInstance& benchmark : benchmarks) {
       BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
-      if (benchmark.complexity() != oNone)
+      if (benchmark.complexity() != oNone) {
         reports_for_family = &per_family_reports[benchmark.family_index()];
-      benchmarks_with_threads += (benchmark.threads() > 1);
+      }
+      benchmarks_with_threads += static_cast<int>(benchmark.threads() > 1);
       runners.emplace_back(benchmark, &perfcounters, reports_for_family);
       int num_repeats_of_this_instance = runners.back().GetNumRepeats();
-      num_repetitions_total += num_repeats_of_this_instance;
-      if (reports_for_family)
+      num_repetitions_total +=
+          static_cast<size_t>(num_repeats_of_this_instance);
+      if (reports_for_family != nullptr) {
         reports_for_family->num_runs_total += num_repeats_of_this_instance;
+      }
     }
     assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
 
@@ -441,14 +482,17 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
     for (size_t repetition_index : repetition_indices) {
       internal::BenchmarkRunner& runner = runners[repetition_index];
       runner.DoOneRepetition();
-      if (runner.HasRepeatsRemaining()) continue;
+      if (runner.HasRepeatsRemaining()) {
+        continue;
+      }
       // FIXME: report each repetition separately, not all of them in bulk.
 
       display_reporter->ReportRunsConfig(
           runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters());
-      if (file_reporter)
+      if (file_reporter != nullptr) {
         file_reporter->ReportRunsConfig(
             runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters());
+      }
 
       RunResults run_results = runner.GetResults();
 
@@ -469,7 +513,9 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
     }
   }
   display_reporter->Finalize();
-  if (file_reporter) file_reporter->Finalize();
+  if (file_reporter != nullptr) {
+    file_reporter->Finalize();
+  }
   FlushStreams(display_reporter);
   FlushStreams(file_reporter);
 }
@@ -491,6 +537,7 @@ std::unique_ptr<BenchmarkReporter> CreateReporter(
     return PtrType(new CSVReporter());
   }
   std::cerr << "Unexpected format: '" << name << "'\n";
+  std::flush(std::cerr);
   std::exit(1);
 }
 
@@ -529,7 +576,7 @@ ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) {
 }  // end namespace internal
 
 BenchmarkReporter* CreateDefaultDisplayReporter() {
-  static auto default_display_reporter =
+  static auto* default_display_reporter =
       internal::CreateReporter(FLAGS_benchmark_format,
                                internal::GetOutputOptions())
           .release();
@@ -563,14 +610,15 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
 size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
                               BenchmarkReporter* file_reporter,
                               std::string spec) {
-  if (spec.empty() || spec == "all")
+  if (spec.empty() || spec == "all") {
     spec = ".";  // Regexp that matches all benchmarks
+  }
 
   // Setup the reporters
   std::ofstream output_file;
   std::unique_ptr<BenchmarkReporter> default_display_reporter;
   std::unique_ptr<BenchmarkReporter> default_file_reporter;
-  if (!display_reporter) {
+  if (display_reporter == nullptr) {
     default_display_reporter.reset(CreateDefaultDisplayReporter());
     display_reporter = default_display_reporter.get();
   }
@@ -578,10 +626,9 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
   auto& Err = display_reporter->GetErrorStream();
 
   std::string const& fname = FLAGS_benchmark_out;
-  if (fname.empty() && file_reporter) {
+  if (fname.empty() && (file_reporter != nullptr)) {
     Err << "A custom file reporter was provided but "
-           "--benchmark_out=<file> was not specified."
-        << std::endl;
+           "--benchmark_out=<file> was not specified.\n";
     Out.flush();
     Err.flush();
     std::exit(1);
@@ -589,12 +636,12 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
   if (!fname.empty()) {
     output_file.open(fname);
     if (!output_file.is_open()) {
-      Err << "invalid file name: '" << fname << "'" << std::endl;
+      Err << "invalid file name: '" << fname << "'\n";
       Out.flush();
       Err.flush();
       std::exit(1);
     }
-    if (!file_reporter) {
+    if (file_reporter == nullptr) {
       default_file_reporter = internal::CreateReporter(
           FLAGS_benchmark_out_format, FLAGS_benchmark_counters_tabular
                                           ? ConsoleReporter::OO_Tabular
@@ -620,8 +667,9 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
   }
 
   if (FLAGS_benchmark_list_tests) {
-    for (auto const& benchmark : benchmarks)
+    for (auto const& benchmark : benchmarks) {
       Out << benchmark.name().str() << "\n";
+    }
   } else {
     internal::RunBenchmarks(benchmarks, display_reporter, file_reporter);
   }
@@ -652,11 +700,20 @@ void RegisterMemoryManager(MemoryManager* manager) {
   internal::memory_manager = manager;
 }
 
-void AddCustomContext(const std::string& key, const std::string& value) {
+void RegisterProfilerManager(ProfilerManager* manager) {
+  // Don't allow overwriting an existing manager.
+  if (manager != nullptr) {
+    BM_CHECK_EQ(internal::profiler_manager, nullptr);
+  }
+  internal::profiler_manager = manager;
+}
+
+void AddCustomContext(std::string key, std::string value) {
   if (internal::global_context == nullptr) {
     internal::global_context = new std::map<std::string, std::string>();
   }
-  if (!internal::global_context->emplace(key, value).second) {
+  if (!internal::global_context->emplace(std::move(key), std::move(value))
+           .second) {
     std::cerr << "Failed to add custom context \"" << key << "\" as it already "
               << "exists with value \"" << value << "\"\n";
   }
@@ -666,9 +723,12 @@ namespace internal {
 
 void (*HelperPrintf)();
 
+namespace {
 void PrintUsageAndExit() {
   HelperPrintf();
-  exit(0);
+  std::flush(std::cout);
+  std::flush(std::cerr);
+  std::exit(0);
 }
 
 void SetDefaultTimeUnitFromFlag(const std::string& time_unit_flag) {
@@ -692,8 +752,8 @@ void SetDefaultTimeUnitFromFlag(const std::string& time_unit_flag) {
 void ParseCommandLineFlags(int* argc, char** argv) {
   using namespace benchmark;
   BenchmarkReporter::Context::executable_name =
-      (argc && *argc > 0) ? argv[0] : "unknown";
-  for (int i = 1; argc && i < *argc; ++i) {
+      ((argc != nullptr) && *argc > 0) ? argv[0] : "unknown";
+  for (int i = 1; (argc != nullptr) && i < *argc; ++i) {
     if (ParseBoolFlag(argv[i], "benchmark_list_tests",
                       &FLAGS_benchmark_list_tests) ||
         ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
@@ -703,6 +763,7 @@ void ParseCommandLineFlags(int* argc, char** argv) {
                         &FLAGS_benchmark_min_warmup_time) ||
         ParseInt32Flag(argv[i], "benchmark_repetitions",
                        &FLAGS_benchmark_repetitions) ||
+        ParseBoolFlag(argv[i], "benchmark_dry_run", &FLAGS_benchmark_dry_run) ||
         ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving",
                       &FLAGS_benchmark_enable_random_interleaving) ||
         ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
@@ -723,7 +784,9 @@ void ParseCommandLineFlags(int* argc, char** argv) {
         ParseStringFlag(argv[i], "benchmark_time_unit",
                         &FLAGS_benchmark_time_unit) ||
         ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
-      for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1];
+      for (int j = i; j != *argc - 1; ++j) {
+        argv[j] = argv[j + 1];
+      }
 
       --(*argc);
       --i;
@@ -741,19 +804,73 @@ void ParseCommandLineFlags(int* argc, char** argv) {
   if (FLAGS_benchmark_color.empty()) {
     PrintUsageAndExit();
   }
+  if (FLAGS_benchmark_dry_run) {
+    AddCustomContext("dry_run", "true");
+  }
   for (const auto& kv : FLAGS_benchmark_context) {
     AddCustomContext(kv.first, kv.second);
   }
 }
 
+}  // end namespace
+
 int InitializeStreams() {
   static std::ios_base::Init init;
   return 0;
 }
 
+template <typename T>
+std::make_unsigned_t<T> get_as_unsigned(T v) {
+  using UnsignedT = std::make_unsigned_t<T>;
+  return static_cast<UnsignedT>(v);
+}
+
 }  // end namespace internal
 
-std::string GetBenchmarkVersion() { return {BENCHMARK_VERSION}; }
+void MaybeReenterWithoutASLR(int /*argc*/, char** argv) {
+  // On e.g. Hexagon simulator, argv may be NULL.
+  if (!argv) return;
+
+#ifdef BENCHMARK_OS_LINUX
+  const auto curr_personality = personality(0xffffffff);
+
+  // We should never fail to read-only query the current personality,
+  // but let's be cautious.
+  if (curr_personality == -1) return;
+
+  // If ASLR is already disabled, we have nothing more to do.
+  if (internal::get_as_unsigned(curr_personality) & ADDR_NO_RANDOMIZE) return;
+
+  // Try to change the personality to disable ASLR.
+  const auto proposed_personality =
+      internal::get_as_unsigned(curr_personality) | ADDR_NO_RANDOMIZE;
+  const auto prev_personality = personality(proposed_personality);
+
+  // Have we failed to change the personality? That may happen.
+  if (prev_personality == -1) return;
+
+  // Make sure the personality has been updated with the no-ASLR flag,
+  // otherwise we will try to reenter infinitely.
+  // This seems impossible, but can happen in some docker configurations.
+  const auto new_personality = personality(0xffffffff);
+  if ((internal::get_as_unsigned(new_personality) & ADDR_NO_RANDOMIZE) == 0)
+    return;
+
+  execv(argv[0], argv);
+  // The exec() functions return only if an error has occurred,
+  // in which case we want to just continue as-is.
+#else
+  return;
+#endif
+}
+
+std::string GetBenchmarkVersion() {
+#ifdef BENCHMARK_VERSION
+  return {BENCHMARK_VERSION};
+#else
+  return {""};
+#endif
+}
 
 void PrintDefaultHelp() {
   fprintf(stdout,
@@ -763,6 +880,7 @@ void PrintDefaultHelp() {
           "          [--benchmark_min_time=`<integer>x` OR `<float>s` ]\n"
           "          [--benchmark_min_warmup_time=<min_warmup_time>]\n"
           "          [--benchmark_repetitions=<num_repetitions>]\n"
+          "          [--benchmark_dry_run={true|false}]\n"
           "          [--benchmark_enable_random_interleaving={true|false}]\n"
           "          [--benchmark_report_aggregates_only={true|false}]\n"
           "          [--benchmark_display_aggregates_only={true|false}]\n"
diff --git a/third-party/benchmark/src/benchmark_api_internal.cc b/third-party/benchmark/src/benchmark_api_internal.cc
index 286f986530859..f9c4990ddfcc9 100644
--- a/third-party/benchmark/src/benchmark_api_internal.cc
+++ b/third-party/benchmark/src/benchmark_api_internal.cc
@@ -7,7 +7,8 @@
 namespace benchmark {
 namespace internal {
 
-BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
+BenchmarkInstance::BenchmarkInstance(benchmark::Benchmark* benchmark,
+                                     int family_idx,
                                      int per_family_instance_idx,
                                      const std::vector<int64_t>& args,
                                      int thread_count)
@@ -27,7 +28,9 @@ BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
       min_time_(benchmark_.min_time_),
       min_warmup_time_(benchmark_.min_warmup_time_),
       iterations_(benchmark_.iterations_),
-      threads_(thread_count) {
+      threads_(thread_count),
+      setup_(benchmark_.setup_),
+      teardown_(benchmark_.teardown_) {
   name_.function_name = benchmark_.name_;
 
   size_t arg_i = 0;
@@ -84,33 +87,31 @@ BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
   if (!benchmark_.thread_counts_.empty()) {
     name_.threads = StrFormat("threads:%d", threads_);
   }
-
-  setup_ = benchmark_.setup_;
-  teardown_ = benchmark_.teardown_;
 }
 
 State BenchmarkInstance::Run(
     IterationCount iters, int thread_id, internal::ThreadTimer* timer,
     internal::ThreadManager* manager,
-    internal::PerfCountersMeasurement* perf_counters_measurement) const {
+    internal::PerfCountersMeasurement* perf_counters_measurement,
+    ProfilerManager* profiler_manager) const {
   State st(name_.function_name, iters, args_, thread_id, threads_, timer,
-           manager, perf_counters_measurement);
+           manager, perf_counters_measurement, profiler_manager);
   benchmark_.Run(st);
   return st;
 }
 
 void BenchmarkInstance::Setup() const {
-  if (setup_) {
+  if (setup_ != nullptr) {
     State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
-             nullptr, nullptr, nullptr);
+             nullptr, nullptr, nullptr, nullptr);
     setup_(st);
   }
 }
 
 void BenchmarkInstance::Teardown() const {
-  if (teardown_) {
+  if (teardown_ != nullptr) {
     State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
-             nullptr, nullptr, nullptr);
+             nullptr, nullptr, nullptr, nullptr);
     teardown_(st);
   }
 }
diff --git a/third-party/benchmark/src/benchmark_api_internal.h b/third-party/benchmark/src/benchmark_api_internal.h
index 94f516531bc4f..5b48ea2fdf8bd 100644
--- a/third-party/benchmark/src/benchmark_api_internal.h
+++ b/third-party/benchmark/src/benchmark_api_internal.h
@@ -17,9 +17,9 @@ namespace internal {
 // Information kept per benchmark we may want to run
 class BenchmarkInstance {
  public:
-  BenchmarkInstance(Benchmark* benchmark, int family_index,
-                    int per_family_instance_index,
-                    const std::vector<int64_t>& args, int threads);
+  BenchmarkInstance(benchmark::Benchmark* benchmark, int family_idx,
+                    int per_family_instance_idx,
+                    const std::vector<int64_t>& args, int thread_count);
 
   const BenchmarkName& name() const { return name_; }
   int family_index() const { return family_index_; }
@@ -41,14 +41,18 @@ class BenchmarkInstance {
   int threads() const { return threads_; }
   void Setup() const;
   void Teardown() const;
+  const auto& GetUserThreadRunnerFactory() const {
+    return benchmark_.threadrunner_;
+  }
 
   State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
             internal::ThreadManager* manager,
-            internal::PerfCountersMeasurement* perf_counters_measurement) const;
+            internal::PerfCountersMeasurement* perf_counters_measurement,
+            ProfilerManager* profiler_manager) const;
 
  private:
   BenchmarkName name_;
-  Benchmark& benchmark_;
+  benchmark::Benchmark& benchmark_;
   const int family_index_;
   const int per_family_instance_index_;
   AggregationReportMode aggregation_report_mode_;
@@ -67,9 +71,8 @@ class BenchmarkInstance {
   IterationCount iterations_;
   int threads_;  // Number of concurrent threads to us
 
-  typedef void (*callback_function)(const benchmark::State&);
-  callback_function setup_ = nullptr;
-  callback_function teardown_ = nullptr;
+  callback_function setup_;
+  callback_function teardown_;
 };
 
 bool FindBenchmarksInternal(const std::string& re,
diff --git a/third-party/benchmark/src/benchmark_main.cc b/third-party/benchmark/src/benchmark_main.cc
index cd61cd2ad5069..15c76eacebd9c 100644
--- a/third-party/benchmark/src/benchmark_main.cc
+++ b/third-party/benchmark/src/benchmark_main.cc
@@ -14,5 +14,5 @@
 
 #include "benchmark/benchmark.h"
 
-BENCHMARK_EXPORT int main(int, char**);
+BENCHMARK_EXPORT int main(int /*argc*/, char** /*argv*/);
 BENCHMARK_MAIN();
diff --git a/third-party/benchmark/src/benchmark_name.cc b/third-party/benchmark/src/benchmark_name.cc
index 01676bbc84df4..804cfbd3b7cd4 100644
--- a/third-party/benchmark/src/benchmark_name.cc
+++ b/third-party/benchmark/src/benchmark_name.cc
@@ -27,8 +27,8 @@ size_t size_impl(const Head& head, const Tail&... tail) {
 }
 
 // Join a pack of std::strings using a delimiter
-// TODO: use absl::StrJoin
-void join_impl(std::string&, char) {}
+// TODO(dominic): use absl::StrJoin
+void join_impl(std::string& /*unused*/, char /*unused*/) {}
 
 template <typename Head, typename... Tail>
 void join_impl(std::string& s, const char delimiter, const Head& head,
diff --git a/third-party/benchmark/src/benchmark_register.cc b/third-party/benchmark/src/benchmark_register.cc
index e447c9a2d39ba..65e1afced399c 100644
--- a/third-party/benchmark/src/benchmark_register.cc
+++ b/third-party/benchmark/src/benchmark_register.cc
@@ -53,13 +53,13 @@ namespace benchmark {
 
 namespace {
 // For non-dense Range, intermediate values are powers of kRangeMultiplier.
-static constexpr int kRangeMultiplier = 8;
+constexpr int kRangeMultiplier = 8;
 
 // The size of a benchmark family determines is the number of inputs to repeat
 // the benchmark on. If this is "large" then warn the user during configuration.
-static constexpr size_t kMaxFamilySize = 100;
+constexpr size_t kMaxFamilySize = 100;
 
-static constexpr char kDisabledPrefix[] = "DISABLED_";
+constexpr char kDisabledPrefix[] = "DISABLED_";
 }  // end namespace
 
 namespace internal {
@@ -75,21 +75,21 @@ class BenchmarkFamilies {
   static BenchmarkFamilies* GetInstance();
 
   // Registers a benchmark family and returns the index assigned to it.
-  size_t AddBenchmark(std::unique_ptr<Benchmark> family);
+  size_t AddBenchmark(std::unique_ptr<benchmark::Benchmark> family);
 
   // Clear all registered benchmark families.
   void ClearBenchmarks();
 
   // Extract the list of benchmark instances that match the specified
   // regular expression.
-  bool FindBenchmarks(std::string re,
+  bool FindBenchmarks(std::string spec,
                       std::vector<BenchmarkInstance>* benchmarks,
                       std::ostream* Err);
 
  private:
   BenchmarkFamilies() {}
 
-  std::vector<std::unique_ptr<Benchmark>> families_;
+  std::vector<std::unique_ptr<benchmark::Benchmark>> families_;
   Mutex mutex_;
 };
 
@@ -98,7 +98,8 @@ BenchmarkFamilies* BenchmarkFamilies::GetInstance() {
   return &instance;
 }
 
-size_t BenchmarkFamilies::AddBenchmark(std::unique_ptr<Benchmark> family) {
+size_t BenchmarkFamilies::AddBenchmark(
+    std::unique_ptr<benchmark::Benchmark> family) {
   MutexLock l(mutex_);
   size_t index = families_.size();
   families_.push_back(std::move(family));
@@ -125,7 +126,7 @@ bool BenchmarkFamilies::FindBenchmarks(
     is_negative_filter = true;
   }
   if (!re.Init(spec, &error_msg)) {
-    Err << "Could not compile benchmark re: " << error_msg << std::endl;
+    Err << "Could not compile benchmark re: " << error_msg << '\n';
     return false;
   }
 
@@ -135,12 +136,14 @@ bool BenchmarkFamilies::FindBenchmarks(
   int next_family_index = 0;
 
   MutexLock l(mutex_);
-  for (std::unique_ptr<Benchmark>& family : families_) {
+  for (std::unique_ptr<benchmark::Benchmark>& family : families_) {
     int family_index = next_family_index;
     int per_family_instance_index = 0;
 
     // Family was deleted or benchmark doesn't match
-    if (!family) continue;
+    if (!family) {
+      continue;
+    }
 
     if (family->ArgsCnt() == -1) {
       family->Args({});
@@ -159,7 +162,9 @@ bool BenchmarkFamilies::FindBenchmarks(
     // reserve in the special case the regex ".", since we know the final
     // family size.  this doesn't take into account any disabled benchmarks
     // so worst case we reserve more than we need.
-    if (spec == ".") benchmarks->reserve(benchmarks->size() + family_size);
+    if (spec == ".") {
+      benchmarks->reserve(benchmarks->size() + family_size);
+    }
 
     for (auto const& args : family->args_) {
       for (int num_threads : *thread_counts) {
@@ -175,9 +180,11 @@ bool BenchmarkFamilies::FindBenchmarks(
 
           ++per_family_instance_index;
 
-          // Only bump the next family index once we've estabilished that
+          // Only bump the next family index once we've established that
           // at least one instance of this family will be run.
-          if (next_family_index == family_index) ++next_family_index;
+          if (next_family_index == family_index) {
+            ++next_family_index;
+          }
         }
       }
     }
@@ -185,11 +192,12 @@ bool BenchmarkFamilies::FindBenchmarks(
   return true;
 }
 
-Benchmark* RegisterBenchmarkInternal(Benchmark* bench) {
-  std::unique_ptr<Benchmark> bench_ptr(bench);
+benchmark::Benchmark* RegisterBenchmarkInternal(
+    std::unique_ptr<benchmark::Benchmark> bench) {
+  benchmark::Benchmark* bench_ptr = bench.get();
   BenchmarkFamilies* families = BenchmarkFamilies::GetInstance();
-  families->AddBenchmark(std::move(bench_ptr));
-  return bench;
+  families->AddBenchmark(std::move(bench));
+  return bench_ptr;
 }
 
 // FIXME: This function is a hack so that benchmark.cc can access
@@ -200,13 +208,15 @@ bool FindBenchmarksInternal(const std::string& re,
   return BenchmarkFamilies::GetInstance()->FindBenchmarks(re, benchmarks, Err);
 }
 
+}  // end namespace internal
+
 //=============================================================================//
 //                               Benchmark
 //=============================================================================//
 
 Benchmark::Benchmark(const std::string& name)
     : name_(name),
-      aggregation_report_mode_(ARM_Unspecified),
+      aggregation_report_mode_(internal::ARM_Unspecified),
       time_unit_(GetDefaultTimeUnit()),
       use_default_time_unit_(true),
       range_multiplier_(kRangeMultiplier),
@@ -218,9 +228,7 @@ Benchmark::Benchmark(const std::string& name)
       use_real_time_(false),
       use_manual_time_(false),
       complexity_(oNone),
-      complexity_lambda_(nullptr),
-      setup_(nullptr),
-      teardown_(nullptr) {
+      complexity_lambda_(nullptr) {
   ComputeStatistics("mean", StatisticsMean);
   ComputeStatistics("median", StatisticsMedian);
   ComputeStatistics("stddev", StatisticsStdDev);
@@ -249,7 +257,7 @@ Benchmark* Benchmark::Unit(TimeUnit unit) {
 Benchmark* Benchmark::Range(int64_t start, int64_t limit) {
   BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
   std::vector<int64_t> arglist;
-  AddRange(&arglist, start, limit, range_multiplier_);
+  internal::AddRange(&arglist, start, limit, range_multiplier_);
 
   for (int64_t i : arglist) {
     args_.push_back({i});
@@ -262,8 +270,8 @@ Benchmark* Benchmark::Ranges(
   BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
   std::vector<std::vector<int64_t>> arglists(ranges.size());
   for (std::size_t i = 0; i < ranges.size(); i++) {
-    AddRange(&arglists[i], ranges[i].first, ranges[i].second,
-             range_multiplier_);
+    internal::AddRange(&arglists[i], ranges[i].first, ranges[i].second,
+                       range_multiplier_);
   }
 
   ArgsProduct(arglists);
@@ -326,18 +334,31 @@ Benchmark* Benchmark::Args(const std::vector<int64_t>& args) {
   return this;
 }
 
-Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
+Benchmark* Benchmark::Apply(
+    const std::function<void(Benchmark* benchmark)>& custom_arguments) {
   custom_arguments(this);
   return this;
 }
 
-Benchmark* Benchmark::Setup(void (*setup)(const benchmark::State&)) {
+Benchmark* Benchmark::Setup(callback_function&& setup) {
+  BM_CHECK(setup != nullptr);
+  setup_ = std::forward<callback_function>(setup);
+  return this;
+}
+
+Benchmark* Benchmark::Setup(const callback_function& setup) {
   BM_CHECK(setup != nullptr);
   setup_ = setup;
   return this;
 }
 
-Benchmark* Benchmark::Teardown(void (*teardown)(const benchmark::State&)) {
+Benchmark* Benchmark::Teardown(callback_function&& teardown) {
+  BM_CHECK(teardown != nullptr);
+  teardown_ = std::forward<callback_function>(teardown);
+  return this;
+}
+
+Benchmark* Benchmark::Teardown(const callback_function& teardown) {
   BM_CHECK(teardown != nullptr);
   teardown_ = teardown;
   return this;
@@ -365,8 +386,8 @@ Benchmark* Benchmark::MinWarmUpTime(double t) {
 
 Benchmark* Benchmark::Iterations(IterationCount n) {
   BM_CHECK(n > 0);
-  BM_CHECK(IsZero(min_time_));
-  BM_CHECK(IsZero(min_warmup_time_));
+  BM_CHECK(internal::IsZero(min_time_));
+  BM_CHECK(internal::IsZero(min_warmup_time_));
   iterations_ = n;
   return this;
 }
@@ -378,21 +399,23 @@ Benchmark* Benchmark::Repetitions(int n) {
 }
 
 Benchmark* Benchmark::ReportAggregatesOnly(bool value) {
-  aggregation_report_mode_ = value ? ARM_ReportAggregatesOnly : ARM_Default;
+  aggregation_report_mode_ =
+      value ? internal::ARM_ReportAggregatesOnly : internal::ARM_Default;
   return this;
 }
 
 Benchmark* Benchmark::DisplayAggregatesOnly(bool value) {
   // If we were called, the report mode is no longer 'unspecified', in any case.
+  using internal::AggregationReportMode;
   aggregation_report_mode_ = static_cast<AggregationReportMode>(
-      aggregation_report_mode_ | ARM_Default);
+      aggregation_report_mode_ | internal::ARM_Default);
 
   if (value) {
     aggregation_report_mode_ = static_cast<AggregationReportMode>(
-        aggregation_report_mode_ | ARM_DisplayReportAggregatesOnly);
+        aggregation_report_mode_ | internal::ARM_DisplayReportAggregatesOnly);
   } else {
     aggregation_report_mode_ = static_cast<AggregationReportMode>(
-        aggregation_report_mode_ & ~ARM_DisplayReportAggregatesOnly);
+        aggregation_report_mode_ & ~internal::ARM_DisplayReportAggregatesOnly);
   }
 
   return this;
@@ -446,7 +469,7 @@ Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
   BM_CHECK_GT(min_threads, 0);
   BM_CHECK_GE(max_threads, min_threads);
 
-  AddRange(&thread_counts_, min_threads, max_threads, 2);
+  internal::AddRange(&thread_counts_, min_threads, max_threads, 2);
   return this;
 }
 
@@ -468,13 +491,20 @@ Benchmark* Benchmark::ThreadPerCpu() {
   return this;
 }
 
+Benchmark* Benchmark::ThreadRunner(threadrunner_factory&& factory) {
+  threadrunner_ = std::move(factory);
+  return this;
+}
+
 void Benchmark::SetName(const std::string& name) { name_ = name; }
 
 const char* Benchmark::GetName() const { return name_.c_str(); }
 
 int Benchmark::ArgsCnt() const {
   if (args_.empty()) {
-    if (arg_names_.empty()) return -1;
+    if (arg_names_.empty()) {
+      return -1;
+    }
     return static_cast<int>(arg_names_.size());
   }
   return static_cast<int>(args_.front().size());
@@ -482,14 +512,17 @@ int Benchmark::ArgsCnt() const {
 
 const char* Benchmark::GetArgName(int arg) const {
   BM_CHECK_GE(arg, 0);
-  BM_CHECK_LT(arg, static_cast<int>(arg_names_.size()));
-  return arg_names_[arg].c_str();
+  size_t uarg = static_cast<size_t>(arg);
+  BM_CHECK_LT(uarg, arg_names_.size());
+  return arg_names_[uarg].c_str();
 }
 
 TimeUnit Benchmark::GetTimeUnit() const {
   return use_default_time_unit_ ? GetDefaultTimeUnit() : time_unit_;
 }
 
+namespace internal {
+
 //=============================================================================//
 //                            FunctionBenchmark
 //=============================================================================//
diff --git a/third-party/benchmark/src/benchmark_register.h b/third-party/benchmark/src/benchmark_register.h
index 53367c707cf41..e0ace51ef00da 100644
--- a/third-party/benchmark/src/benchmark_register.h
+++ b/third-party/benchmark/src/benchmark_register.h
@@ -3,6 +3,7 @@
 
 #include <algorithm>
 #include <limits>
+#include <type_traits>
 #include <vector>
 
 #include "check.h"
@@ -24,7 +25,7 @@ typename std::vector<T>::iterator AddPowers(std::vector<T>* dst, T lo, T hi,
   static const T kmax = std::numeric_limits<T>::max();
 
   // Space out the values in multiples of "mult"
-  for (T i = static_cast<T>(1); i <= hi; i *= static_cast<T>(mult)) {
+  for (T i = static_cast<T>(1); i <= hi; i = static_cast<T>(i * mult)) {
     if (i >= lo) {
       dst->push_back(i);
     }
@@ -52,7 +53,7 @@ void AddNegatedPowers(std::vector<T>* dst, T lo, T hi, int mult) {
 
   const auto it = AddPowers(dst, hi_complement, lo_complement, mult);
 
-  std::for_each(it, dst->end(), [](T& t) { t *= -1; });
+  std::for_each(it, dst->end(), [](T& t) { t = static_cast<T>(t * -1); });
   std::reverse(it, dst->end());
 }
 
diff --git a/third-party/benchmark/src/benchmark_runner.cc b/third-party/benchmark/src/benchmark_runner.cc
index dcddb437e37d7..fb688672a40ca 100644
--- a/third-party/benchmark/src/benchmark_runner.cc
+++ b/third-party/benchmark/src/benchmark_runner.cc
@@ -34,6 +34,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <fstream>
+#include <functional>
 #include <iostream>
 #include <limits>
 #include <memory>
@@ -46,7 +47,6 @@
 #include "commandlineflags.h"
 #include "complexity.h"
 #include "counter.h"
-#include "internal_macros.h"
 #include "log.h"
 #include "mutex.h"
 #include "perf_counters.h"
@@ -58,13 +58,23 @@
 
 namespace benchmark {
 
+BM_DECLARE_bool(benchmark_dry_run);
+BM_DECLARE_string(benchmark_min_time);
+BM_DECLARE_double(benchmark_min_warmup_time);
+BM_DECLARE_int32(benchmark_repetitions);
+BM_DECLARE_bool(benchmark_report_aggregates_only);
+BM_DECLARE_bool(benchmark_display_aggregates_only);
+BM_DECLARE_string(benchmark_perf_counters);
+
 namespace internal {
 
 MemoryManager* memory_manager = nullptr;
 
+ProfilerManager* profiler_manager = nullptr;
+
 namespace {
 
-static constexpr IterationCount kMaxIterations = 1000000000000;
+constexpr IterationCount kMaxIterations = 1000000000000;
 const double kDefaultMinTime =
     std::strtod(::benchmark::kDefaultMinTimeStr, /*p_end*/ nullptr);
 
@@ -72,7 +82,7 @@ BenchmarkReporter::Run CreateRunReport(
     const benchmark::internal::BenchmarkInstance& b,
     const internal::ThreadManager::Result& results,
     IterationCount memory_iterations,
-    const MemoryManager::Result* memory_result, double seconds,
+    const MemoryManager::Result& memory_result, double seconds,
     int64_t repetition_index, int64_t repeats) {
   // Create report about this benchmark run.
   BenchmarkReporter::Run report;
@@ -90,7 +100,7 @@ BenchmarkReporter::Run CreateRunReport(
   report.repetition_index = repetition_index;
   report.repetitions = repeats;
 
-  if (!report.skipped) {
+  if (report.skipped == 0u) {
     if (b.use_manual_time()) {
       report.real_accumulated_time = results.manual_time_used;
     } else {
@@ -105,15 +115,20 @@ BenchmarkReporter::Run CreateRunReport(
     report.counters = results.counters;
 
     if (memory_iterations > 0) {
-      assert(memory_result != nullptr);
       report.memory_result = memory_result;
       report.allocs_per_iter =
-          memory_iterations ? static_cast<double>(memory_result->num_allocs) /
-                                  static_cast<double>(memory_iterations)
-                            : 0;
+          memory_iterations != 0
+              ? static_cast<double>(memory_result.num_allocs) /
+                    static_cast<double>(memory_iterations)
+              : 0;
     }
 
-    internal::Finish(&report.counters, results.iterations, seconds,
+    // The CPU time is the total time taken by all threads. If we used that as
+    // the denominator, we'd be calculating the rate per thread here. This is
+    // why we have to divide the total cpu_time by the number of threads for
+    // global counters to get a global rate.
+    const double thread_seconds = seconds / b.threads();
+    internal::Finish(&report.counters, results.iterations, thread_seconds,
                      b.threads());
   }
   return report;
@@ -123,16 +138,20 @@ BenchmarkReporter::Run CreateRunReport(
 // Adds the stats collected for the thread into manager->results.
 void RunInThread(const BenchmarkInstance* b, IterationCount iters,
                  int thread_id, ThreadManager* manager,
-                 PerfCountersMeasurement* perf_counters_measurement) {
+                 PerfCountersMeasurement* perf_counters_measurement,
+                 ProfilerManager* profiler_manager_) {
   internal::ThreadTimer timer(
       b->measure_process_cpu_time()
           ? internal::ThreadTimer::CreateProcessCpuTime()
           : internal::ThreadTimer::Create());
 
-  State st =
-      b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
-  BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations)
-      << "Benchmark returned before State::KeepRunning() returned false!";
+  State st = b->Run(iters, thread_id, &timer, manager,
+                    perf_counters_measurement, profiler_manager_);
+  if (!(st.skipped() || st.iterations() >= st.max_iterations)) {
+    st.SkipWithError(
+        "The benchmark didn't run, nor was it explicitly skipped. Please call "
+        "'SkipWithXXX` in your benchmark as appropriate.");
+  }
   {
     MutexLock l(manager->GetBenchmarkMutex());
     internal::ThreadManager::Result& results = manager->results;
@@ -148,17 +167,23 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,
 
 double ComputeMinTime(const benchmark::internal::BenchmarkInstance& b,
                       const BenchTimeType& iters_or_time) {
-  if (!IsZero(b.min_time())) return b.min_time();
+  if (!IsZero(b.min_time())) {
+    return b.min_time();
+  }
   // If the flag was used to specify number of iters, then return the default
   // min_time.
-  if (iters_or_time.tag == BenchTimeType::ITERS) return kDefaultMinTime;
+  if (iters_or_time.tag == BenchTimeType::ITERS) {
+    return kDefaultMinTime;
+  }
 
   return iters_or_time.time;
 }
 
 IterationCount ComputeIters(const benchmark::internal::BenchmarkInstance& b,
                             const BenchTimeType& iters_or_time) {
-  if (b.iterations() != 0) return b.iterations();
+  if (b.iterations() != 0) {
+    return b.iterations();
+  }
 
   // We've already concluded that this flag is currently used to pass
   // iters but do a check here again anyway.
@@ -166,10 +191,43 @@ IterationCount ComputeIters(const benchmark::internal::BenchmarkInstance& b,
   return iters_or_time.iters;
 }
 
+class ThreadRunnerDefault : public ThreadRunnerBase {
+ public:
+  explicit ThreadRunnerDefault(int num_threads)
+      : pool(static_cast<size_t>(num_threads - 1)) {}
+
+  void RunThreads(const std::function<void(int)>& fn) override final {
+    // Run all but one thread in separate threads
+    for (std::size_t ti = 0; ti < pool.size(); ++ti) {
+      pool[ti] = std::thread(fn, static_cast<int>(ti + 1));
+    }
+    // And run one thread here directly.
+    // (If we were asked to run just one thread, we don't create new threads.)
+    // Yes, we need to do this here *after* we start the separate threads.
+    fn(0);
+
+    // The main thread has finished. Now let's wait for the other threads.
+    for (std::thread& thread : pool) {
+      thread.join();
+    }
+  }
+
+ private:
+  std::vector<std::thread> pool;
+};
+
+std::unique_ptr<ThreadRunnerBase> GetThreadRunner(
+    const benchmark::threadrunner_factory& userThreadRunnerFactory,
+    int num_threads) {
+  return userThreadRunnerFactory
+             ? userThreadRunnerFactory(num_threads)
+             : std::make_unique<ThreadRunnerDefault>(num_threads);
+}
+
 }  // end namespace
 
 BenchTimeType ParseBenchMinTime(const std::string& value) {
-  BenchTimeType ret;
+  BenchTimeType ret = {};
 
   if (value.empty()) {
     ret.tag = BenchTimeType::TIME;
@@ -178,7 +236,7 @@ BenchTimeType ParseBenchMinTime(const std::string& value) {
   }
 
   if (value.back() == 'x') {
-    char* p_end;
+    char* p_end = nullptr;
     // Reset errno before it's changed by strtol.
     errno = 0;
     IterationCount num_iters = std::strtol(value.c_str(), &p_end, 10);
@@ -200,7 +258,7 @@ BenchTimeType ParseBenchMinTime(const std::string& value) {
                   "Eg., `30s` for 30-seconds.";
   }
 
-  char* p_end;
+  char* p_end = nullptr;
   // Reset errno before it's changed by strtod.
   errno = 0;
   double min_time = std::strtod(value.c_str(), &p_end);
@@ -225,20 +283,30 @@ BenchmarkRunner::BenchmarkRunner(
     : b(b_),
       reports_for_family(reports_for_family_),
       parsed_benchtime_flag(ParseBenchMinTime(FLAGS_benchmark_min_time)),
-      min_time(ComputeMinTime(b_, parsed_benchtime_flag)),
-      min_warmup_time((!IsZero(b.min_time()) && b.min_warmup_time() > 0.0)
-                          ? b.min_warmup_time()
-                          : FLAGS_benchmark_min_warmup_time),
-      warmup_done(!(min_warmup_time > 0.0)),
-      repeats(b.repetitions() != 0 ? b.repetitions()
-                                   : FLAGS_benchmark_repetitions),
+      min_time(FLAGS_benchmark_dry_run
+                   ? 0
+                   : ComputeMinTime(b_, parsed_benchtime_flag)),
+      min_warmup_time(
+          FLAGS_benchmark_dry_run
+              ? 0
+              : ((!IsZero(b.min_time()) && b.min_warmup_time() > 0.0)
+                     ? b.min_warmup_time()
+                     : FLAGS_benchmark_min_warmup_time)),
+      warmup_done(FLAGS_benchmark_dry_run ? true : !(min_warmup_time > 0.0)),
+      repeats(FLAGS_benchmark_dry_run
+                  ? 1
+                  : (b.repetitions() != 0 ? b.repetitions()
+                                          : FLAGS_benchmark_repetitions)),
       has_explicit_iteration_count(b.iterations() != 0 ||
                                    parsed_benchtime_flag.tag ==
                                        BenchTimeType::ITERS),
-      pool(b.threads() - 1),
-      iters(has_explicit_iteration_count
-                ? ComputeIters(b_, parsed_benchtime_flag)
-                : 1),
+      thread_runner(
+          GetThreadRunner(b.GetUserThreadRunnerFactory(), b.threads())),
+      iters(FLAGS_benchmark_dry_run
+                ? 1
+                : (has_explicit_iteration_count
+                       ? ComputeIters(b_, parsed_benchtime_flag)
+                       : 1)),
       perf_counters_measurement_ptr(pcm_) {
   run_results.display_report_aggregates_only =
       (FLAGS_benchmark_report_aggregates_only ||
@@ -247,10 +315,11 @@ BenchmarkRunner::BenchmarkRunner(
       FLAGS_benchmark_report_aggregates_only;
   if (b.aggregation_report_mode() != internal::ARM_Unspecified) {
     run_results.display_report_aggregates_only =
-        (b.aggregation_report_mode() &
-         internal::ARM_DisplayReportAggregatesOnly);
+        ((b.aggregation_report_mode() &
+          internal::ARM_DisplayReportAggregatesOnly) != 0u);
     run_results.file_report_aggregates_only =
-        (b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
+        ((b.aggregation_report_mode() &
+          internal::ARM_FileReportAggregatesOnly) != 0u);
     BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
              (perf_counters_measurement_ptr->num_counters() == 0))
         << "Perf counters were requested but could not be set up.";
@@ -263,19 +332,10 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
   std::unique_ptr<internal::ThreadManager> manager;
   manager.reset(new internal::ThreadManager(b.threads()));
 
-  // Run all but one thread in separate threads
-  for (std::size_t ti = 0; ti < pool.size(); ++ti) {
-    pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
-                           manager.get(), perf_counters_measurement_ptr);
-  }
-  // And run one thread here directly.
-  // (If we were asked to run just one thread, we don't create new threads.)
-  // Yes, we need to do this here *after* we start the separate threads.
-  RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr);
-
-  // The main thread has finished. Now let's wait for the other threads.
-  manager->WaitForAllThreads();
-  for (std::thread& thread : pool) thread.join();
+  thread_runner->RunThreads([&](int thread_idx) {
+    RunInThread(&b, iters, thread_idx, manager.get(),
+                perf_counters_measurement_ptr, /*profiler_manager=*/nullptr);
+  });
 
   IterationResults i;
   // Acquire the measurements/counters from the manager, UNDER THE LOCK!
@@ -287,12 +347,6 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
   // And get rid of the manager.
   manager.reset();
 
-  // Adjust real/manual time stats since they were reported per thread.
-  i.results.real_time_used /= b.threads();
-  i.results.manual_time_used /= b.threads();
-  // If we were measuring whole-process CPU usage, adjust the CPU time too.
-  if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();
-
   BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
              << i.results.real_time_used << "\n";
 
@@ -340,7 +394,7 @@ bool BenchmarkRunner::ShouldReportIterationResults(
   // Determine if this run should be reported;
   // Either it has run for a sufficient amount of time
   // or because an error was reported.
-  return i.results.skipped_ ||
+  return (i.results.skipped_ != 0u) || FLAGS_benchmark_dry_run ||
          i.iters >= kMaxIterations ||  // Too many iterations already.
          i.seconds >=
              GetMinTimeToApply() ||  // The elapsed time is large enough.
@@ -352,7 +406,7 @@ bool BenchmarkRunner::ShouldReportIterationResults(
 }
 
 double BenchmarkRunner::GetMinTimeToApply() const {
-  // In order to re-use functionality to run and measure benchmarks for running
+  // In order to reuse functionality to run and measure benchmarks for running
   // a warmup phase of the benchmark, we need a way of telling whether to apply
   // min_time or min_warmup_time. This function will figure out if we are in the
   // warmup phase and therefore need to apply min_warmup_time or if we already
@@ -401,6 +455,34 @@ void BenchmarkRunner::RunWarmUp() {
   }
 }
 
+MemoryManager::Result BenchmarkRunner::RunMemoryManager(
+    IterationCount memory_iterations) {
+  memory_manager->Start();
+  std::unique_ptr<internal::ThreadManager> manager;
+  manager.reset(new internal::ThreadManager(1));
+  b.Setup();
+  RunInThread(&b, memory_iterations, 0, manager.get(),
+              perf_counters_measurement_ptr,
+              /*profiler_manager=*/nullptr);
+  manager.reset();
+  b.Teardown();
+  MemoryManager::Result memory_result;
+  memory_manager->Stop(memory_result);
+  memory_result.memory_iterations = memory_iterations;
+  return memory_result;
+}
+
+void BenchmarkRunner::RunProfilerManager(IterationCount profile_iterations) {
+  std::unique_ptr<internal::ThreadManager> manager;
+  manager.reset(new internal::ThreadManager(1));
+  b.Setup();
+  RunInThread(&b, profile_iterations, 0, manager.get(),
+              /*perf_counters_measurement_ptr=*/nullptr,
+              /*profiler_manager=*/profiler_manager);
+  manager.reset();
+  b.Teardown();
+}
+
 void BenchmarkRunner::DoOneRepetition() {
   assert(HasRepeatsRemaining() && "Already done all repetitions?");
 
@@ -411,7 +493,9 @@ void BenchmarkRunner::DoOneRepetition() {
   // this warmup never happened except the fact that warmup_done is set. Every
   // other manipulation of the BenchmarkRunner instance would be a bug! Please
   // fix it.
-  if (!warmup_done) RunWarmUp();
+  if (!warmup_done) {
+    RunWarmUp();
+  }
 
   IterationResults i;
   // We *may* be gradually increasing the length (iteration count)
@@ -433,8 +517,10 @@ void BenchmarkRunner::DoOneRepetition() {
     const bool results_are_significant = !is_the_first_repetition ||
                                          has_explicit_iteration_count ||
                                          ShouldReportIterationResults(i);
-
-    if (results_are_significant) break;  // Good, let's report them!
+    // Good, let's report them!
+    if (results_are_significant) {
+      break;
+    }
 
     // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
     // iteration count, and run the benchmark again...
@@ -445,28 +531,21 @@ void BenchmarkRunner::DoOneRepetition() {
            "then we should have accepted the current iteration run.");
   }
 
-  // Oh, one last thing, we need to also produce the 'memory measurements'..
-  MemoryManager::Result* memory_result = nullptr;
+  // Produce memory measurements if requested.
+  MemoryManager::Result memory_result;
   IterationCount memory_iterations = 0;
   if (memory_manager != nullptr) {
-    // TODO(vyng): Consider making BenchmarkReporter::Run::memory_result an
-    // optional so we don't have to own the Result here.
-    // Can't do it now due to cxx03.
-    memory_results.push_back(MemoryManager::Result());
-    memory_result = &memory_results.back();
     // Only run a few iterations to reduce the impact of one-time
     // allocations in benchmarks that are not properly managed.
     memory_iterations = std::min<IterationCount>(16, iters);
-    memory_manager->Start();
-    std::unique_ptr<internal::ThreadManager> manager;
-    manager.reset(new internal::ThreadManager(1));
-    b.Setup();
-    RunInThread(&b, memory_iterations, 0, manager.get(),
-                perf_counters_measurement_ptr);
-    manager->WaitForAllThreads();
-    manager.reset();
-    b.Teardown();
-    memory_manager->Stop(*memory_result);
+    memory_result = RunMemoryManager(memory_iterations);
+  }
+
+  if (profiler_manager != nullptr) {
+    // We want to externally profile the benchmark for the same number of
+    // iterations because, for example, if we're tracing the benchmark then we
+    // want trace data to reasonably match PMU data.
+    RunProfilerManager(iters);
   }
 
   // Ok, now actually report.
@@ -474,9 +553,11 @@ void BenchmarkRunner::DoOneRepetition() {
       CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
                       num_repetitions_done, repeats);
 
-  if (reports_for_family) {
+  if (reports_for_family != nullptr) {
     ++reports_for_family->num_runs_done;
-    if (!report.skipped) reports_for_family->Runs.push_back(report);
+    if (report.skipped == 0u) {
+      reports_for_family->Runs.push_back(report);
+    }
   }
 
   run_results.non_aggregates.push_back(report);
diff --git a/third-party/benchmark/src/benchmark_runner.h b/third-party/benchmark/src/benchmark_runner.h
index db2fa04396c50..9a2231a2a4bfe 100644
--- a/third-party/benchmark/src/benchmark_runner.h
+++ b/third-party/benchmark/src/benchmark_runner.h
@@ -15,26 +15,20 @@
 #ifndef BENCHMARK_RUNNER_H_
 #define BENCHMARK_RUNNER_H_
 
+#include <memory>
 #include <thread>
 #include <vector>
 
 #include "benchmark_api_internal.h"
-#include "internal_macros.h"
 #include "perf_counters.h"
 #include "thread_manager.h"
 
 namespace benchmark {
 
-BM_DECLARE_string(benchmark_min_time);
-BM_DECLARE_double(benchmark_min_warmup_time);
-BM_DECLARE_int32(benchmark_repetitions);
-BM_DECLARE_bool(benchmark_report_aggregates_only);
-BM_DECLARE_bool(benchmark_display_aggregates_only);
-BM_DECLARE_string(benchmark_perf_counters);
-
 namespace internal {
 
 extern MemoryManager* memory_manager;
+extern ProfilerManager* profiler_manager;
 
 struct RunResults {
   std::vector<BenchmarkReporter::Run> non_aggregates;
@@ -45,7 +39,7 @@ struct RunResults {
 };
 
 struct BENCHMARK_EXPORT BenchTimeType {
-  enum { ITERS, TIME } tag;
+  enum { UNSPECIFIED, ITERS, TIME } tag;
   union {
     IterationCount iters;
     double time;
@@ -58,7 +52,7 @@ BenchTimeType ParseBenchMinTime(const std::string& value);
 class BenchmarkRunner {
  public:
   BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
-                  benchmark::internal::PerfCountersMeasurement* pmc_,
+                  benchmark::internal::PerfCountersMeasurement* pcm_,
                   BenchmarkReporter::PerFamilyRunReports* reports_for_family);
 
   int GetNumRepeats() const { return repeats; }
@@ -96,9 +90,7 @@ class BenchmarkRunner {
 
   int num_repetitions_done = 0;
 
-  std::vector<std::thread> pool;
-
-  std::vector<MemoryManager::Result> memory_results;
+  std::unique_ptr<ThreadRunnerBase> thread_runner;
 
   IterationCount iters;  // preserved between repetitions!
   // So only the first repetition has to find/calculate it,
@@ -113,6 +105,10 @@ class BenchmarkRunner {
   };
   IterationResults DoNIterations();
 
+  MemoryManager::Result RunMemoryManager(IterationCount memory_iterations);
+
+  void RunProfilerManager(IterationCount profile_iterations);
+
   IterationCount PredictNumItersNeeded(const IterationResults& i) const;
 
   bool ShouldReportIterationResults(const IterationResults& i) const;
diff --git a/third-party/benchmark/src/check.cc b/third-party/benchmark/src/check.cc
index 5f7526e08d6ef..3e2a40b4bde4d 100644
--- a/third-party/benchmark/src/check.cc
+++ b/third-party/benchmark/src/check.cc
@@ -3,7 +3,10 @@
 namespace benchmark {
 namespace internal {
 
-static AbortHandlerT* handler = &std::abort;
+namespace {
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+AbortHandlerT* handler = &std::abort;
+}  // namespace
 
 BENCHMARK_EXPORT AbortHandlerT*& GetAbortHandler() { return handler; }
 
diff --git a/third-party/benchmark/src/check.h b/third-party/benchmark/src/check.h
index c1cd5e85e44cf..aa8c78c92fe3b 100644
--- a/third-party/benchmark/src/check.h
+++ b/third-party/benchmark/src/check.h
@@ -4,6 +4,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <ostream>
+#include <string_view>
 
 #include "benchmark/export.h"
 #include "internal_macros.h"
@@ -36,6 +37,8 @@ AbortHandlerT*& GetAbortHandler();
 
 BENCHMARK_NORETURN inline void CallAbortHandler() {
   GetAbortHandler()();
+  std::flush(std::cout);
+  std::flush(std::cerr);
   std::abort();  // fallback to enforce noreturn
 }
 
@@ -44,7 +47,8 @@ BENCHMARK_NORETURN inline void CallAbortHandler() {
 // destructed.
 class CheckHandler {
  public:
-  CheckHandler(const char* check, const char* file, const char* func, int line)
+  CheckHandler(std::string_view check, std::string_view file,
+               std::string_view func, int line)
       : log_(GetErrorLogInstance()) {
     log_ << file << ":" << line << ": " << func << ": Check `" << check
          << "' failed. ";
@@ -57,7 +61,7 @@ class CheckHandler {
 #pragma warning(disable : 4722)
 #endif
   BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
-    log_ << std::endl;
+    log_ << '\n';
     CallAbortHandler();
   }
 #if defined(COMPILER_MSVC)
@@ -78,9 +82,11 @@ class CheckHandler {
 // The BM_CHECK macro returns a std::ostream object that can have extra
 // information written to it.
 #ifndef NDEBUG
-#define BM_CHECK(b)                                                          \
-  (b ? ::benchmark::internal::GetNullLogInstance()                           \
-     : ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \
+#define BM_CHECK(b)                                          \
+  (b ? ::benchmark::internal::GetNullLogInstance()           \
+     : ::benchmark::internal::CheckHandler(                  \
+           std::string_view(#b), std::string_view(__FILE__), \
+           std::string_view(__func__), __LINE__)             \
            .GetLog())
 #else
 #define BM_CHECK(b) ::benchmark::internal::GetNullLogInstance()
diff --git a/third-party/benchmark/src/colorprint.cc b/third-party/benchmark/src/colorprint.cc
index abc71492f77aa..c90232f20ff7b 100644
--- a/third-party/benchmark/src/colorprint.cc
+++ b/third-party/benchmark/src/colorprint.cc
@@ -135,22 +135,30 @@ void ColorPrintf(std::ostream& out, LogColor color, const char* fmt,
   // Gets the current text color.
   CONSOLE_SCREEN_BUFFER_INFO buffer_info;
   GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
-  const WORD old_color_attrs = buffer_info.wAttributes;
+  const WORD original_color_attrs = buffer_info.wAttributes;
 
   // We need to flush the stream buffers into the console before each
   // SetConsoleTextAttribute call lest it affect the text that is already
   // printed but has not yet reached the console.
   out.flush();
-  SetConsoleTextAttribute(stdout_handle,
-                          GetPlatformColorCode(color) | FOREGROUND_INTENSITY);
+
+  const WORD original_background_attrs =
+      original_color_attrs & (BACKGROUND_RED | BACKGROUND_GREEN |
+                              BACKGROUND_BLUE | BACKGROUND_INTENSITY);
+
+  SetConsoleTextAttribute(stdout_handle, GetPlatformColorCode(color) |
+                                             FOREGROUND_INTENSITY |
+                                             original_background_attrs);
   out << FormatString(fmt, args);
 
   out.flush();
-  // Restores the text color.
-  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+  // Restores the text and background color.
+  SetConsoleTextAttribute(stdout_handle, original_color_attrs);
 #else
   const char* color_code = GetPlatformColorCode(color);
-  if (color_code) out << FormatString("\033[0;3%sm", color_code);
+  if (color_code != nullptr) {
+    out << FormatString("\033[0;3%sm", color_code);
+  }
   out << FormatString(fmt, args) << "\033[m";
 #endif
 }
@@ -187,7 +195,7 @@ bool IsColorTerminal() {
 
   bool term_supports_color = false;
   for (const char* candidate : SUPPORTED_TERM_VALUES) {
-    if (term && 0 == strcmp(term, candidate)) {
+    if ((term != nullptr) && 0 == strcmp(term, candidate)) {
       term_supports_color = true;
       break;
     }
diff --git a/third-party/benchmark/src/colorprint.h b/third-party/benchmark/src/colorprint.h
index 9f6fab9b34226..469045c5f57f5 100644
--- a/third-party/benchmark/src/colorprint.h
+++ b/third-party/benchmark/src/colorprint.h
@@ -5,6 +5,8 @@
 #include <iostream>
 #include <string>
 
+#include "internal_macros.h"
+
 namespace benchmark {
 enum LogColor {
   COLOR_DEFAULT,
@@ -17,11 +19,14 @@ enum LogColor {
   COLOR_WHITE
 };
 
+PRINTF_FORMAT_STRING_FUNC(1, 0)
 std::string FormatString(const char* msg, va_list args);
-std::string FormatString(const char* msg, ...);
+PRINTF_FORMAT_STRING_FUNC(1, 2) std::string FormatString(const char* msg, ...);
 
+PRINTF_FORMAT_STRING_FUNC(3, 0)
 void ColorPrintf(std::ostream& out, LogColor color, const char* fmt,
                  va_list args);
+PRINTF_FORMAT_STRING_FUNC(3, 4)
 void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...);
 
 // Returns true if stdout appears to be a terminal that supports colored
diff --git a/third-party/benchmark/src/commandlineflags.cc b/third-party/benchmark/src/commandlineflags.cc
index dcb414959df4e..99a240c122412 100644
--- a/third-party/benchmark/src/commandlineflags.cc
+++ b/third-party/benchmark/src/commandlineflags.cc
@@ -109,12 +109,13 @@ bool ParseKvPairs(const std::string& src_text, const char* str,
 // Returns the name of the environment variable corresponding to the
 // given flag.  For example, FlagToEnvVar("foo") will return
 // "BENCHMARK_FOO" in the open-source version.
-static std::string FlagToEnvVar(const char* flag) {
+std::string FlagToEnvVar(const char* flag) {
   const std::string flag_str(flag);
 
   std::string env_var;
-  for (size_t i = 0; i != flag_str.length(); ++i)
+  for (size_t i = 0; i != flag_str.length(); ++i) {
     env_var += static_cast<char>(::toupper(flag_str.c_str()[i]));
+  }
 
   return env_var;
 }
@@ -167,7 +168,9 @@ std::map<std::string, std::string> KvPairsFromEnv(
   const std::string env_var = FlagToEnvVar(flag);
   const char* const value_str = getenv(env_var.c_str());
 
-  if (value_str == nullptr) return default_val;
+  if (value_str == nullptr) {
+    return default_val;
+  }
 
   std::map<std::string, std::string> value;
   if (!ParseKvPairs("Environment variable " + env_var, value_str, &value)) {
@@ -176,6 +179,8 @@ std::map<std::string, std::string> KvPairsFromEnv(
   return value;
 }
 
+namespace {
+
 // Parses a string as a command line flag.  The string should have
 // the format "--flag=value".  When def_optional is true, the "=value"
 // part can be omitted.
@@ -184,35 +189,47 @@ std::map<std::string, std::string> KvPairsFromEnv(
 const char* ParseFlagValue(const char* str, const char* flag,
                            bool def_optional) {
   // str and flag must not be nullptr.
-  if (str == nullptr || flag == nullptr) return nullptr;
+  if (str == nullptr || flag == nullptr) {
+    return nullptr;
+  }
 
   // The flag must start with "--".
   const std::string flag_str = std::string("--") + std::string(flag);
   const size_t flag_len = flag_str.length();
-  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
+  if (strncmp(str, flag_str.c_str(), flag_len) != 0) {
+    return nullptr;
+  }
 
   // Skips the flag name.
   const char* flag_end = str + flag_len;
 
   // When def_optional is true, it's OK to not have a "=value" part.
-  if (def_optional && (flag_end[0] == '\0')) return flag_end;
+  if (def_optional && (flag_end[0] == '\0')) {
+    return flag_end;
+  }
 
   // If def_optional is true and there are more characters after the
   // flag name, or if def_optional is false, there must be a '=' after
   // the flag name.
-  if (flag_end[0] != '=') return nullptr;
+  if (flag_end[0] != '=') {
+    return nullptr;
+  }
 
   // Returns the string after "=".
   return flag_end + 1;
 }
 
+}  // end namespace
+
 BENCHMARK_EXPORT
 bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
   // Gets the value of the flag as a string.
   const char* const value_str = ParseFlagValue(str, flag, true);
 
   // Aborts if the parsing failed.
-  if (value_str == nullptr) return false;
+  if (value_str == nullptr) {
+    return false;
+  }
 
   // Converts the string value to a bool.
   *value = IsTruthyFlagValue(value_str);
@@ -225,7 +242,9 @@ bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
   const char* const value_str = ParseFlagValue(str, flag, false);
 
   // Aborts if the parsing failed.
-  if (value_str == nullptr) return false;
+  if (value_str == nullptr) {
+    return false;
+  }
 
   // Sets *value to the value of the flag.
   return ParseInt32(std::string("The value of flag --") + flag, value_str,
@@ -238,7 +257,9 @@ bool ParseDoubleFlag(const char* str, const char* flag, double* value) {
   const char* const value_str = ParseFlagValue(str, flag, false);
 
   // Aborts if the parsing failed.
-  if (value_str == nullptr) return false;
+  if (value_str == nullptr) {
+    return false;
+  }
 
   // Sets *value to the value of the flag.
   return ParseDouble(std::string("The value of flag --") + flag, value_str,
@@ -251,7 +272,9 @@ bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
   const char* const value_str = ParseFlagValue(str, flag, false);
 
   // Aborts if the parsing failed.
-  if (value_str == nullptr) return false;
+  if (value_str == nullptr) {
+    return false;
+  }
 
   *value = value_str;
   return true;
@@ -262,11 +285,15 @@ bool ParseKeyValueFlag(const char* str, const char* flag,
                        std::map<std::string, std::string>* value) {
   const char* const value_str = ParseFlagValue(str, flag, false);
 
-  if (value_str == nullptr) return false;
+  if (value_str == nullptr) {
+    return false;
+  }
 
   for (const auto& kvpair : StrSplit(value_str, ',')) {
     const auto kv = StrSplit(kvpair, '=');
-    if (kv.size() != 2) return false;
+    if (kv.size() != 2) {
+      return false;
+    }
     value->emplace(kv[0], kv[1]);
   }
 
diff --git a/third-party/benchmark/src/commandlineflags.h b/third-party/benchmark/src/commandlineflags.h
index 7882628975eac..5f9ebf1d56e83 100644
--- a/third-party/benchmark/src/commandlineflags.h
+++ b/third-party/benchmark/src/commandlineflags.h
@@ -11,14 +11,17 @@
 #define FLAG(name) FLAGS_##name
 
 // Macros for declaring flags.
+// NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables)
 #define BM_DECLARE_bool(name) BENCHMARK_EXPORT extern bool FLAG(name)
 #define BM_DECLARE_int32(name) BENCHMARK_EXPORT extern int32_t FLAG(name)
 #define BM_DECLARE_double(name) BENCHMARK_EXPORT extern double FLAG(name)
 #define BM_DECLARE_string(name) BENCHMARK_EXPORT extern std::string FLAG(name)
 #define BM_DECLARE_kvpairs(name) \
   BENCHMARK_EXPORT extern std::map<std::string, std::string> FLAG(name)
+// NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables)
 
 // Macros for defining flags.
+// NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables)
 #define BM_DEFINE_bool(name, default_val) \
   BENCHMARK_EXPORT bool FLAG(name) = benchmark::BoolFromEnv(#name, default_val)
 #define BM_DEFINE_int32(name, default_val) \
@@ -33,6 +36,7 @@
 #define BM_DEFINE_kvpairs(name, default_val)                       \
   BENCHMARK_EXPORT std::map<std::string, std::string> FLAG(name) = \
       benchmark::KvPairsFromEnv(#name, default_val)
+// NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables)
 
 namespace benchmark {
 
diff --git a/third-party/benchmark/src/complexity.cc b/third-party/benchmark/src/complexity.cc
index eee3122646f95..4c9ef6d0c7180 100644
--- a/third-party/benchmark/src/complexity.cc
+++ b/third-party/benchmark/src/complexity.cc
@@ -17,7 +17,6 @@
 
 #include "complexity.h"
 
-#include <algorithm>
 #include <cmath>
 
 #include "benchmark/benchmark.h"
@@ -25,9 +24,10 @@
 
 namespace benchmark {
 
+namespace {
+
 // Internal function to calculate the different scalability forms
 BigOFunc* FittingCurve(BigO complexity) {
-  static const double kLog2E = 1.44269504088896340736;
   switch (complexity) {
     case oN:
       return [](IterationCount n) -> double { return static_cast<double>(n); };
@@ -36,15 +36,12 @@ BigOFunc* FittingCurve(BigO complexity) {
     case oNCubed:
       return [](IterationCount n) -> double { return std::pow(n, 3); };
     case oLogN:
-      /* Note: can't use log2 because Android's GNU STL lacks it */
-      return [](IterationCount n) {
-        return kLog2E * std::log(static_cast<double>(n));
+      return [](IterationCount n) -> double {
+        return std::log2(static_cast<double>(n));
       };
     case oNLogN:
-      /* Note: can't use log2 because Android's GNU STL lacks it */
-      return [](IterationCount n) {
-        return kLog2E * static_cast<double>(n) *
-               std::log(static_cast<double>(n));
+      return [](IterationCount n) -> double {
+        return static_cast<double>(n) * std::log2(static_cast<double>(n));
       };
     case o1:
     default:
@@ -52,6 +49,8 @@ BigOFunc* FittingCurve(BigO complexity) {
   }
 }
 
+}  // end namespace
+
 // Function to return an string for the calculated complexity
 std::string GetBigOString(BigO complexity) {
   switch (complexity) {
@@ -72,6 +71,8 @@ std::string GetBigOString(BigO complexity) {
   }
 }
 
+namespace {
+
 // Find the coefficient for the high-order term in the running time, by
 // minimizing the sum of squares of relative error, for the fitting curve
 // given by the lambda expression.
@@ -156,12 +157,16 @@ LeastSq MinimalLeastSq(const std::vector<ComplexityN>& n,
   return best_fit;
 }
 
+}  // end namespace
+
 std::vector<BenchmarkReporter::Run> ComputeBigO(
     const std::vector<BenchmarkReporter::Run>& reports) {
   typedef BenchmarkReporter::Run Run;
   std::vector<Run> results;
 
-  if (reports.size() < 2) return results;
+  if (reports.size() < 2) {
+    return results;
+  }
 
   // Accumulators.
   std::vector<ComplexityN> n;
diff --git a/third-party/benchmark/src/console_reporter.cc b/third-party/benchmark/src/console_reporter.cc
index 35c3de2a4dbae..6db6788f9443d 100644
--- a/third-party/benchmark/src/console_reporter.cc
+++ b/third-party/benchmark/src/console_reporter.cc
@@ -63,7 +63,7 @@ void ConsoleReporter::PrintHeader(const Run& run) {
       FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
                    "Benchmark", "Time", "CPU", "Iterations");
   if (!run.counters.empty()) {
-    if (output_options_ & OO_Tabular) {
+    if ((output_options_ & OO_Tabular) != 0) {
       for (auto const& c : run.counters) {
         str += FormatString(" %10s", c.first.c_str());
       }
@@ -83,7 +83,7 @@ void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
     bool print_header = !printed_header_;
     // --- or if the format is tabular and this run
     //     has different fields from the prev header
-    print_header |= (output_options_ & OO_Tabular) &&
+    print_header |= ((output_options_ & OO_Tabular) != 0) &&
                     (!internal::SameNames(run.counters, prev_counters_));
     if (print_header) {
       printed_header_ = true;
@@ -97,8 +97,9 @@ void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
   }
 }
 
-static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt,
-                             ...) {
+PRINTF_FORMAT_STRING_FUNC(3, 4)
+static void IgnoreColorPrint(std::ostream& out, LogColor /*unused*/,
+                             const char* fmt, ...) {
   va_list args;
   va_start(args, fmt);
   out << FormatString(fmt, args);
@@ -131,7 +132,7 @@ BENCHMARK_EXPORT
 void ConsoleReporter::PrintRunData(const Run& result) {
   typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...);
   auto& Out = GetOutputStream();
-  PrinterFn* printer = (output_options_ & OO_Color)
+  PrinterFn* printer = (output_options_ & OO_Color) != 0
                            ? static_cast<PrinterFn*>(ColorPrintf)
                            : IgnoreColorPrint;
   auto name_color =
@@ -144,7 +145,8 @@ void ConsoleReporter::PrintRunData(const Run& result) {
             result.skip_message.c_str());
     printer(Out, COLOR_DEFAULT, "\n");
     return;
-  } else if (internal::SkippedWithMessage == result.skipped) {
+  }
+  if (internal::SkippedWithMessage == result.skipped) {
     printer(Out, COLOR_WHITE, "SKIPPED: \'%s\'", result.skip_message.c_str());
     printer(Out, COLOR_DEFAULT, "\n");
     return;
@@ -178,9 +180,9 @@ void ConsoleReporter::PrintRunData(const Run& result) {
     printer(Out, COLOR_CYAN, "%10lld", result.iterations);
   }
 
-  for (auto& c : result.counters) {
+  for (const auto& c : result.counters) {
     const std::size_t cNameLen =
-        std::max(std::string::size_type(10), c.first.length());
+        std::max(static_cast<std::size_t>(10), c.first.length());
     std::string s;
     const char* unit = "";
     if (result.run_type == Run::RT_Aggregate &&
@@ -189,10 +191,11 @@ void ConsoleReporter::PrintRunData(const Run& result) {
       unit = "%";
     } else {
       s = HumanReadableNumber(c.second.value, c.second.oneK);
-      if (c.second.flags & Counter::kIsRate)
-        unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
+      if ((c.second.flags & Counter::kIsRate) != 0) {
+        unit = (c.second.flags & Counter::kInvert) != 0 ? "s" : "/s";
+      }
     }
-    if (output_options_ & OO_Tabular) {
+    if ((output_options_ & OO_Tabular) != 0) {
       printer(Out, COLOR_DEFAULT, " %*s%s", cNameLen - strlen(unit), s.c_str(),
               unit);
     } else {
diff --git a/third-party/benchmark/src/counter.cc b/third-party/benchmark/src/counter.cc
index aa14cd8092f94..4bdd5e9b59234 100644
--- a/third-party/benchmark/src/counter.cc
+++ b/third-party/benchmark/src/counter.cc
@@ -17,28 +17,32 @@
 namespace benchmark {
 namespace internal {
 
+namespace {
+
 double Finish(Counter const& c, IterationCount iterations, double cpu_time,
               double num_threads) {
   double v = c.value;
-  if (c.flags & Counter::kIsRate) {
+  if ((c.flags & Counter::kIsRate) != 0) {
     v /= cpu_time;
   }
-  if (c.flags & Counter::kAvgThreads) {
+  if ((c.flags & Counter::kAvgThreads) != 0) {
     v /= num_threads;
   }
-  if (c.flags & Counter::kIsIterationInvariant) {
+  if ((c.flags & Counter::kIsIterationInvariant) != 0) {
     v *= static_cast<double>(iterations);
   }
-  if (c.flags & Counter::kAvgIterations) {
+  if ((c.flags & Counter::kAvgIterations) != 0) {
     v /= static_cast<double>(iterations);
   }
 
-  if (c.flags & Counter::kInvert) {  // Invert is *always* last.
+  if ((c.flags & Counter::kInvert) != 0) {  // Invert is *always* last.
     v = 1.0 / v;
   }
   return v;
 }
 
+}  // namespace
+
 void Finish(UserCounters* l, IterationCount iterations, double cpu_time,
             double num_threads) {
   for (auto& c : *l) {
@@ -64,7 +68,9 @@ void Increment(UserCounters* l, UserCounters const& r) {
 }
 
 bool SameNames(UserCounters const& l, UserCounters const& r) {
-  if (&l == &r) return true;
+  if (&l == &r) {
+    return true;
+  }
   if (l.size() != r.size()) {
     return false;
   }
diff --git a/third-party/benchmark/src/csv_reporter.cc b/third-party/benchmark/src/csv_reporter.cc
index 4b39e2c52fb91..0f998045bd189 100644
--- a/third-party/benchmark/src/csv_reporter.cc
+++ b/third-party/benchmark/src/csv_reporter.cc
@@ -12,29 +12,23 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <algorithm>
-#include <cstdint>
 #include <iostream>
 #include <string>
-#include <tuple>
 #include <vector>
 
 #include "benchmark/benchmark.h"
 #include "check.h"
 #include "complexity.h"
-#include "string_util.h"
-#include "timers.h"
 
 // File format reference: http://edoceo.com/utilitas/csv-file-format.
 
 namespace benchmark {
 
 namespace {
-std::vector<std::string> elements = {
+const std::vector<const char*> elements = {
     "name",           "iterations",       "real_time",        "cpu_time",
     "time_unit",      "bytes_per_second", "items_per_second", "label",
     "error_occurred", "error_message"};
-}  // namespace
 
 std::string CsvEscape(const std::string& s) {
   std::string tmp;
@@ -51,6 +45,7 @@ std::string CsvEscape(const std::string& s) {
   }
   return '"' + tmp + '"';
 }
+}  // namespace
 
 BENCHMARK_EXPORT
 bool CSVReporter::ReportContext(const Context& context) {
@@ -66,8 +61,10 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
     // save the names of all the user counters
     for (const auto& run : reports) {
       for (const auto& cnt : run.counters) {
-        if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
+        if (cnt.first == "bytes_per_second" ||
+            cnt.first == "items_per_second") {
           continue;
+        }
         user_counter_names_.insert(cnt.first);
       }
     }
@@ -75,7 +72,9 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
     // print the header
     for (auto B = elements.begin(); B != elements.end();) {
       Out << *B++;
-      if (B != elements.end()) Out << ",";
+      if (B != elements.end()) {
+        Out << ",";
+      }
     }
     for (auto B = user_counter_names_.begin();
          B != user_counter_names_.end();) {
@@ -88,8 +87,10 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
     // check that all the current counters are saved in the name set
     for (const auto& run : reports) {
       for (const auto& cnt : run.counters) {
-        if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
+        if (cnt.first == "bytes_per_second" ||
+            cnt.first == "items_per_second") {
           continue;
+        }
         BM_CHECK(user_counter_names_.find(cnt.first) !=
                  user_counter_names_.end())
             << "All counters must be present in each run. "
@@ -109,7 +110,7 @@ BENCHMARK_EXPORT
 void CSVReporter::PrintRunData(const Run& run) {
   std::ostream& Out = GetOutputStream();
   Out << CsvEscape(run.benchmark_name()) << ",";
-  if (run.skipped) {
+  if (run.skipped != 0u) {
     Out << std::string(elements.size() - 3, ',');
     Out << std::boolalpha << (internal::SkippedWithError == run.skipped) << ",";
     Out << CsvEscape(run.skip_message) << "\n";
diff --git a/third-party/benchmark/src/cycleclock.h b/third-party/benchmark/src/cycleclock.h
index c0dffcf4b35f6..0671a425f0d4a 100644
--- a/third-party/benchmark/src/cycleclock.h
+++ b/third-party/benchmark/src/cycleclock.h
@@ -36,6 +36,9 @@
 // declarations of some other intrinsics, breaking compilation.
 // Therefore, we simply declare __rdtsc ourselves. See also
 // http://connect.microsoft.com/VisualStudio/feedback/details/262047
+//
+// Note that MSVC defines the x64 preprocessor macros when building
+// for Arm64EC, despite it using Arm64 assembly instructions.
 #if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64) && \
     !defined(_M_ARM64EC)
 extern "C" uint64_t __rdtsc();
@@ -70,7 +73,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   // frequency scaling).  Also note that when the Mac sleeps, this
   // counter pauses; it does not continue counting, nor does it
   // reset to zero.
-  return mach_absolute_time();
+  return static_cast<int64_t>(mach_absolute_time());
 #elif defined(BENCHMARK_OS_EMSCRIPTEN)
   // this goes above x86-specific code because old versions of Emscripten
   // define __x86_64__, although they have nothing to do with it.
@@ -79,10 +82,13 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   int64_t ret;
   __asm__ volatile("rdtsc" : "=A"(ret));
   return ret;
+
+// Note that Clang, like MSVC, defines the x64 preprocessor macros when building
+// for Arm64EC, despite it using Arm64 assembly instructions.
 #elif (defined(__x86_64__) || defined(__amd64__)) && !defined(__arm64ec__)
   uint64_t low, high;
   __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
-  return (high << 32) | low;
+  return static_cast<int64_t>((high << 32) | low);
 #elif defined(__powerpc__) || defined(__ppc__)
   // This returns a time-base, which is not always precisely a cycle-count.
 #if defined(__powerpc64__) || defined(__ppc64__)
@@ -205,11 +211,12 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
       "sub %0, zero, %0\n"
       "and %1, %1, %0\n"
       : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1));
-  return (static_cast<uint64_t>(cycles_hi1) << 32) | cycles_lo;
+  return static_cast<int64_t>((static_cast<uint64_t>(cycles_hi1) << 32) |
+                              cycles_lo);
 #else
   uint64_t cycles;
   asm volatile("rdtime %0" : "=r"(cycles));
-  return cycles;
+  return static_cast<int64_t>(cycles);
 #endif
 #elif defined(__e2k__) || defined(__elbrus__)
   struct timeval tv;
@@ -218,7 +225,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
 #elif defined(__hexagon__)
   uint64_t pcycle;
   asm volatile("%0 = C15:14" : "=r"(pcycle));
-  return static_cast<double>(pcycle);
+  return static_cast<int64_t>(pcycle);
 #elif defined(__alpha__)
   // Alpha has a cycle counter, the PCC register, but it is an unsigned 32-bit
   // integer and thus wraps every ~4s, making using it for tick counts
@@ -228,6 +235,18 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__hppa__) || defined(__linux__)
+  // Fallback for all other architectures with a recent Linux kernel, e.g.:
+  // HP PA-RISC provides a user-readable clock counter (cr16), but
+  // it's not synchronized across CPUs and only 32-bit wide when programs
+  // are built as 32-bit binaries.
+  // Same for SH-4 and possibly others.
+  // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
+  // because it provides nanosecond resolution.
+  // Initialize to always return 0 if clock_gettime fails.
+  struct timespec ts = {0, 0};
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return static_cast<int64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
 #else
   // The soft failover to a generic implementation is automatic only for ARM.
   // For other platforms the developer is expected to make an attempt to create
diff --git a/third-party/benchmark/src/internal_macros.h b/third-party/benchmark/src/internal_macros.h
index f4894ba8e65d6..22e3e21753ee0 100644
--- a/third-party/benchmark/src/internal_macros.h
+++ b/third-party/benchmark/src/internal_macros.h
@@ -106,6 +106,16 @@
   #define BENCHMARK_MAYBE_UNUSED
 #endif
 
+#if defined(__GNUC__) || defined(__clang__)
+#define PRINTF_FORMAT_STRING_FUNC(format_arg, first_idx) \
+  __attribute__((format(printf, format_arg, first_idx)))
+#elif defined(__MINGW32__)
+#define PRINTF_FORMAT_STRING_FUNC(format_arg, first_idx) \
+  __attribute__((format(__MINGW_PRINTF_FORMAT, format_arg, first_idx)))
+#else
+#define PRINTF_FORMAT_STRING_FUNC(format_arg, first_idx)
+#endif
+
 // clang-format on
 
 #endif  // BENCHMARK_INTERNAL_MACROS_H_
diff --git a/third-party/benchmark/src/json_reporter.cc b/third-party/benchmark/src/json_reporter.cc
index b8c8c94c08a0f..2b84cd14a5202 100644
--- a/third-party/benchmark/src/json_reporter.cc
+++ b/third-party/benchmark/src/json_reporter.cc
@@ -81,19 +81,25 @@ std::string FormatKV(std::string const& key, bool value) {
 
 std::string FormatKV(std::string const& key, int64_t value) {
   std::stringstream ss;
-  ss << '"' << StrEscape(key) << "\": " << value;
+  // We really want to just dump the integer as-is,
+  // without the system locale interfering.
+  ss << '"' << StrEscape(key) << "\": " << std::to_string(value);
   return ss.str();
 }
 
+std::string FormatKV(std::string const& key, int value) {
+  return FormatKV(key, static_cast<int64_t>(value));
+}
+
 std::string FormatKV(std::string const& key, double value) {
   std::stringstream ss;
   ss << '"' << StrEscape(key) << "\": ";
 
-  if (std::isnan(value))
+  if (std::isnan(value)) {
     ss << (value < 0 ? "-" : "") << "NaN";
-  else if (std::isinf(value))
+  } else if (std::isinf(value)) {
     ss << (value < 0 ? "-" : "") << "Infinity";
-  else {
+  } else {
     const auto max_digits10 =
         std::numeric_limits<decltype(value)>::max_digits10;
     const auto max_fractional_digits10 = max_digits10 - 1;
@@ -122,7 +128,7 @@ bool JSONReporter::ReportContext(const Context& context) {
 
   out << indent << FormatKV("host_name", context.sys_info.name) << ",\n";
 
-  if (Context::executable_name) {
+  if (Context::executable_name != nullptr) {
     out << indent << FormatKV("executable", Context::executable_name) << ",\n";
   }
 
@@ -136,7 +142,15 @@ bool JSONReporter::ReportContext(const Context& context) {
   if (CPUInfo::Scaling::UNKNOWN != info.scaling) {
     out << indent
         << FormatKV("cpu_scaling_enabled",
-                    info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
+                    info.scaling == CPUInfo::Scaling::ENABLED)
+        << ",\n";
+  }
+
+  const SystemInfo& sysinfo = context.sys_info;
+  if (SystemInfo::ASLR::UNKNOWN != sysinfo.ASLRStatus) {
+    out << indent
+        << FormatKV("aslr_enabled",
+                    sysinfo.ASLRStatus == SystemInfo::ASLR::ENABLED)
         << ",\n";
   }
 
@@ -144,7 +158,7 @@ bool JSONReporter::ReportContext(const Context& context) {
   indent = std::string(6, ' ');
   std::string cache_indent(8, ' ');
   for (size_t i = 0; i < info.caches.size(); ++i) {
-    auto& CI = info.caches[i];
+    const auto& CI = info.caches[i];
     out << indent << "{\n";
     out << cache_indent << FormatKV("type", CI.type) << ",\n";
     out << cache_indent << FormatKV("level", static_cast<int64_t>(CI.level))
@@ -155,7 +169,9 @@ bool JSONReporter::ReportContext(const Context& context) {
         << FormatKV("num_sharing", static_cast<int64_t>(CI.num_sharing))
         << "\n";
     out << indent << "}";
-    if (i != info.caches.size() - 1) out << ",";
+    if (i != info.caches.size() - 1) {
+      out << ",";
+    }
     out << "\n";
   }
   indent = std::string(4, ' ');
@@ -163,7 +179,9 @@ bool JSONReporter::ReportContext(const Context& context) {
   out << indent << "\"load_avg\": [";
   for (auto it = info.load_avg.begin(); it != info.load_avg.end();) {
     out << *it++;
-    if (it != info.load_avg.end()) out << ",";
+    if (it != info.load_avg.end()) {
+      out << ",";
+    }
   }
   out << "],\n";
 
@@ -179,7 +197,7 @@ bool JSONReporter::ReportContext(const Context& context) {
   out << ",\n";
 
   // NOTE: our json schema is not strictly tied to the library version!
-  out << indent << FormatKV("json_schema_version", int64_t(1));
+  out << indent << FormatKV("json_schema_version", 1);
 
   std::map<std::string, std::string>* global_context =
       internal::GetGlobalContext();
@@ -294,20 +312,21 @@ void JSONReporter::PrintRunData(Run const& run) {
     out << indent << FormatKV("rms", run.GetAdjustedCPUTime());
   }
 
-  for (auto& c : run.counters) {
+  for (const auto& c : run.counters) {
     out << ",\n" << indent << FormatKV(c.first, c.second);
   }
 
-  if (run.memory_result) {
-    const MemoryManager::Result memory_result = *run.memory_result;
+  if (run.memory_result.memory_iterations > 0) {
+    const auto& memory_result = run.memory_result;
     out << ",\n" << indent << FormatKV("allocs_per_iter", run.allocs_per_iter);
     out << ",\n"
         << indent << FormatKV("max_bytes_used", memory_result.max_bytes_used);
 
     auto report_if_present = [&out, &indent](const std::string& label,
                                              int64_t val) {
-      if (val != MemoryManager::TombstoneValue)
+      if (val != MemoryManager::TombstoneValue) {
         out << ",\n" << indent << FormatKV(label, val);
+      }
     };
 
     report_if_present("total_allocated_bytes",
@@ -321,7 +340,4 @@ void JSONReporter::PrintRunData(Run const& run) {
   out << '\n';
 }
 
-const int64_t MemoryManager::TombstoneValue =
-    std::numeric_limits<int64_t>::max();
-
 }  // end namespace benchmark
diff --git a/third-party/benchmark/src/log.h b/third-party/benchmark/src/log.h
index 9a21400b096d5..57b7bdfc45b14 100644
--- a/third-party/benchmark/src/log.h
+++ b/third-party/benchmark/src/log.h
@@ -4,13 +4,6 @@
 #include <iostream>
 #include <ostream>
 
-// NOTE: this is also defined in benchmark.h but we're trying to avoid a
-// dependency.
-// The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer.
-#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
-#define BENCHMARK_HAS_CXX11
-#endif
-
 namespace benchmark {
 namespace internal {
 
@@ -31,13 +24,8 @@ class LogType {
 
   // NOTE: we could use BENCHMARK_DISALLOW_COPY_AND_ASSIGN but we shouldn't have
   // a dependency on benchmark.h from here.
-#ifndef BENCHMARK_HAS_CXX11
-  LogType(const LogType&);
-  LogType& operator=(const LogType&);
-#else
   LogType(const LogType&) = delete;
   LogType& operator=(const LogType&) = delete;
-#endif
 };
 
 template <class Tp>
diff --git a/third-party/benchmark/src/perf_counters.cc b/third-party/benchmark/src/perf_counters.cc
index d466e27e86f94..f47aa7b42d040 100644
--- a/third-party/benchmark/src/perf_counters.cc
+++ b/third-party/benchmark/src/perf_counters.cc
@@ -26,8 +26,6 @@
 namespace benchmark {
 namespace internal {
 
-constexpr size_t PerfCounterValues::kMaxCounters;
-
 #if defined HAVE_LIBPFM
 
 size_t PerfCounterValues::Read(const std::vector<int>& leaders) {
@@ -39,7 +37,8 @@ size_t PerfCounterValues::Read(const std::vector<int>& leaders) {
     auto read_bytes = ::read(lead, ptr, size);
     if (read_bytes >= ssize_t(sizeof(uint64_t))) {
       // Actual data bytes are all bytes minus initial padding
-      std::size_t data_bytes = read_bytes - sizeof(uint64_t);
+      std::size_t data_bytes =
+          static_cast<std::size_t>(read_bytes) - sizeof(uint64_t);
       // This should be very cheap since it's in hot cache
       std::memmove(ptr, ptr + sizeof(uint64_t), data_bytes);
       // Increment our counters
@@ -156,7 +155,8 @@ PerfCounters PerfCounters::Create(
     attr.exclude_hv = true;
 
     // Read all counters in a group in one read.
-    attr.read_format = PERF_FORMAT_GROUP;
+    attr.read_format = PERF_FORMAT_GROUP;  //| PERF_FORMAT_TOTAL_TIME_ENABLED |
+                                           // PERF_FORMAT_TOTAL_TIME_RUNNING;
 
     int id = -1;
     while (id < 0) {
@@ -214,9 +214,9 @@ PerfCounters PerfCounters::Create(
       // This should never happen but if it does, we give up on the
       // entire batch as recovery would be a mess.
       GetErrorLogInstance() << "***WARNING*** Failed to start counters. "
-                               "Claring out all counters.\n";
+                               "Clearing out all counters.\n";
 
-      // Close all peformance counters
+      // Close all performance counters
       for (int id : counter_ids) {
         ::close(id);
       }
diff --git a/third-party/benchmark/src/re.h b/third-party/benchmark/src/re.h
index 9afb869bea27e..1486dd8778a6b 100644
--- a/third-party/benchmark/src/re.h
+++ b/third-party/benchmark/src/re.h
@@ -15,6 +15,8 @@
 #ifndef BENCHMARK_RE_H_
 #define BENCHMARK_RE_H_
 
+#include <vector>
+
 #include "internal_macros.h"
 
 // clang-format off
@@ -121,15 +123,13 @@ inline bool Regex::Init(const std::string& spec, std::string* error) {
   if (ec != 0) {
     if (error) {
       size_t needed = regerror(ec, &re_, nullptr, 0);
-      char* errbuf = new char[needed];
-      regerror(ec, &re_, errbuf, needed);
+      std::vector<char> errbuf(needed);
+      regerror(ec, &re_, errbuf.data(), needed);
 
       // regerror returns the number of bytes necessary to null terminate
       // the string, so we move that when assigning to error.
       BM_CHECK_NE(needed, 0);
-      error->assign(errbuf, needed - 1);
-
-      delete[] errbuf;
+      error->assign(errbuf.data(), needed - 1);
     }
 
     return false;
diff --git a/third-party/benchmark/src/reporter.cc b/third-party/benchmark/src/reporter.cc
index 076bc31a2eccc..71926b15e97b5 100644
--- a/third-party/benchmark/src/reporter.cc
+++ b/third-party/benchmark/src/reporter.cc
@@ -42,20 +42,23 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
   Out << LocalDateTimeString() << "\n";
 #endif
 
-  if (context.executable_name)
-    Out << "Running " << context.executable_name << "\n";
+  if (benchmark::BenchmarkReporter::Context::executable_name != nullptr) {
+    Out << "Running " << benchmark::BenchmarkReporter::Context::executable_name
+        << "\n";
+  }
 
   const CPUInfo &info = context.cpu_info;
   Out << "Run on (" << info.num_cpus << " X "
       << (info.cycles_per_second / 1000000.0) << " MHz CPU "
       << ((info.num_cpus > 1) ? "s" : "") << ")\n";
-  if (info.caches.size() != 0) {
+  if (!info.caches.empty()) {
     Out << "CPU Caches:\n";
-    for (auto &CInfo : info.caches) {
+    for (const auto &CInfo : info.caches) {
       Out << "  L" << CInfo.level << " " << CInfo.type << " "
           << (CInfo.size / 1024) << " KiB";
-      if (CInfo.num_sharing != 0)
+      if (CInfo.num_sharing != 0) {
         Out << " (x" << (info.num_cpus / CInfo.num_sharing) << ")";
+      }
       Out << "\n";
     }
   }
@@ -63,7 +66,9 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
     Out << "Load Average: ";
     for (auto It = info.load_avg.begin(); It != info.load_avg.end();) {
       Out << StrFormat("%.2f", *It++);
-      if (It != info.load_avg.end()) Out << ", ";
+      if (It != info.load_avg.end()) {
+        Out << ", ";
+      }
     }
     Out << "\n";
   }
@@ -83,6 +88,12 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
            "overhead.\n";
   }
 
+  const SystemInfo &sysinfo = context.sys_info;
+  if (SystemInfo::ASLR::ENABLED == sysinfo.ASLRStatus) {
+    Out << "***WARNING*** ASLR is enabled, the results may have unreproducible "
+           "noise in them.\n";
+  }
+
 #ifndef NDEBUG
   Out << "***WARNING*** Library was built as DEBUG. Timings may be "
          "affected.\n";
@@ -105,13 +116,17 @@ std::string BenchmarkReporter::Run::benchmark_name() const {
 
 double BenchmarkReporter::Run::GetAdjustedRealTime() const {
   double new_time = real_accumulated_time * GetTimeUnitMultiplier(time_unit);
-  if (iterations != 0) new_time /= static_cast<double>(iterations);
+  if (iterations != 0) {
+    new_time /= static_cast<double>(iterations);
+  }
   return new_time;
 }
 
 double BenchmarkReporter::Run::GetAdjustedCPUTime() const {
   double new_time = cpu_accumulated_time * GetTimeUnitMultiplier(time_unit);
-  if (iterations != 0) new_time /= static_cast<double>(iterations);
+  if (iterations != 0) {
+    new_time /= static_cast<double>(iterations);
+  }
   return new_time;
 }
 
diff --git a/third-party/benchmark/src/statistics.cc b/third-party/benchmark/src/statistics.cc
index 261dcb299a677..fc7450ef91897 100644
--- a/third-party/benchmark/src/statistics.cc
+++ b/third-party/benchmark/src/statistics.cc
@@ -26,17 +26,21 @@
 
 namespace benchmark {
 
-auto StatisticsSum = [](const std::vector<double>& v) {
+const auto StatisticsSum = [](const std::vector<double>& v) {
   return std::accumulate(v.begin(), v.end(), 0.0);
 };
 
 double StatisticsMean(const std::vector<double>& v) {
-  if (v.empty()) return 0.0;
+  if (v.empty()) {
+    return 0.0;
+  }
   return StatisticsSum(v) * (1.0 / static_cast<double>(v.size()));
 }
 
 double StatisticsMedian(const std::vector<double>& v) {
-  if (v.size() < 3) return StatisticsMean(v);
+  if (v.size() < 3) {
+    return StatisticsMean(v);
+  }
   std::vector<double> copy(v);
 
   auto center = copy.begin() + v.size() / 2;
@@ -47,29 +51,37 @@ double StatisticsMedian(const std::vector<double>& v) {
   // before.  Instead of resorting, we just look for the max value before it,
   // which is not necessarily the element immediately preceding `center` Since
   // `copy` is only partially sorted by `nth_element`.
-  if (v.size() % 2 == 1) return *center;
+  if (v.size() % 2 == 1) {
+    return *center;
+  }
   auto center2 = std::max_element(copy.begin(), center);
   return (*center + *center2) / 2.0;
 }
 
 // Return the sum of the squares of this sample set
-auto SumSquares = [](const std::vector<double>& v) {
+const auto SumSquares = [](const std::vector<double>& v) {
   return std::inner_product(v.begin(), v.end(), v.begin(), 0.0);
 };
 
-auto Sqr = [](const double dat) { return dat * dat; };
-auto Sqrt = [](const double dat) {
+const auto Sqr = [](const double dat) { return dat * dat; };
+const auto Sqrt = [](const double dat) {
   // Avoid NaN due to imprecision in the calculations
-  if (dat < 0.0) return 0.0;
+  if (dat < 0.0) {
+    return 0.0;
+  }
   return std::sqrt(dat);
 };
 
 double StatisticsStdDev(const std::vector<double>& v) {
   const auto mean = StatisticsMean(v);
-  if (v.empty()) return mean;
+  if (v.empty()) {
+    return mean;
+  }
 
   // Sample standard deviation is undefined for n = 1
-  if (v.size() == 1) return 0.0;
+  if (v.size() == 1) {
+    return 0.0;
+  }
 
   const double avg_squares =
       SumSquares(v) * (1.0 / static_cast<double>(v.size()));
@@ -79,12 +91,16 @@ double StatisticsStdDev(const std::vector<double>& v) {
 }
 
 double StatisticsCV(const std::vector<double>& v) {
-  if (v.size() < 2) return 0.0;
+  if (v.size() < 2) {
+    return 0.0;
+  }
 
   const auto stddev = StatisticsStdDev(v);
   const auto mean = StatisticsMean(v);
 
-  if (std::fpclassify(mean) == FP_ZERO) return 0.0;
+  if (std::fpclassify(mean) == FP_ZERO) {
+    return 0.0;
+  }
 
   return stddev / mean;
 }
@@ -97,7 +113,7 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
   auto error_count = std::count_if(reports.begin(), reports.end(),
                                    [](Run const& run) { return run.skipped; });
 
-  if (reports.size() - error_count < 2) {
+  if (reports.size() - static_cast<size_t>(error_count) < 2) {
     // We don't report aggregated data if there was a single run.
     return results;
   }
@@ -137,7 +153,9 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
   for (Run const& run : reports) {
     BM_CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
     BM_CHECK_EQ(run_iterations, run.iterations);
-    if (run.skipped) continue;
+    if (run.skipped != 0u) {
+      continue;
+    }
     real_accumulated_time_stat.emplace_back(run.real_accumulated_time);
     cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time);
     // user counters
@@ -158,7 +176,7 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
   }
 
   const double iteration_rescale_factor =
-      double(reports.size()) / double(run_iterations);
+      static_cast<double>(reports.size()) / static_cast<double>(run_iterations);
 
   for (const auto& Stat : *reports[0].statistics) {
     // Get the data from the accumulator to BenchmarkReporter::Run's.
@@ -179,7 +197,7 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
     // Similarly, if there are N repetitions with 1 iterations each,
     // an aggregate will be computed over N measurements, not 1.
     // Thus it is best to simply use the count of separate reports.
-    data.iterations = reports.size();
+    data.iterations = static_cast<IterationCount>(reports.size());
 
     data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat);
     data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat);
diff --git a/third-party/benchmark/src/string_util.cc b/third-party/benchmark/src/string_util.cc
index c69e40a8133cc..9c5df3ba25c92 100644
--- a/third-party/benchmark/src/string_util.cc
+++ b/third-party/benchmark/src/string_util.cc
@@ -29,7 +29,7 @@ static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits),
 static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits),
               "Small SI and Big SI unit arrays must be the same size");
 
-static const int64_t kUnitsSize = arraysize(kBigSIUnits);
+const int64_t kUnitsSize = arraysize(kBigSIUnits);
 
 void ToExponentAndMantissa(double val, int precision, double one_k,
                            std::string* mantissa, int64_t* exponent) {
@@ -56,7 +56,7 @@ void ToExponentAndMantissa(double val, int precision, double one_k,
       scaled /= one_k;
       if (scaled <= big_threshold) {
         mantissa_stream << scaled;
-        *exponent = i + 1;
+        *exponent = static_cast<int64_t>(i + 1);
         *mantissa = mantissa_stream.str();
         return;
       }
@@ -87,10 +87,14 @@ void ToExponentAndMantissa(double val, int precision, double one_k,
 }
 
 std::string ExponentToPrefix(int64_t exponent, bool iec) {
-  if (exponent == 0) return "";
+  if (exponent == 0) {
+    return {};
+  }
 
   const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1);
-  if (index >= kUnitsSize) return "";
+  if (index >= kUnitsSize) {
+    return {};
+  }
 
   const char* const* array =
       (exponent > 0 ? (iec ? kBigIECUnits : kBigSIUnits) : kSmallSIUnits);
@@ -101,21 +105,22 @@ std::string ExponentToPrefix(int64_t exponent, bool iec) {
 std::string ToBinaryStringFullySpecified(double value, int precision,
                                          Counter::OneK one_k) {
   std::string mantissa;
-  int64_t exponent;
+  int64_t exponent = 0;
   ToExponentAndMantissa(value, precision,
                         one_k == Counter::kIs1024 ? 1024.0 : 1000.0, &mantissa,
                         &exponent);
   return mantissa + ExponentToPrefix(exponent, one_k == Counter::kIs1024);
 }
 
+PRINTF_FORMAT_STRING_FUNC(1, 0)
 std::string StrFormatImp(const char* msg, va_list args) {
   // we might need a second shot at this, so pre-emptivly make a copy
   va_list args_cp;
   va_copy(args_cp, args);
 
-  // TODO(ericwf): use std::array for first attempt to avoid one memory
-  // allocation guess what the size might be
-  std::array<char, 256> local_buff;
+  // Use std::array for the first attempt to avoid one memory allocation;
+  // its size is a guess at how long the output might be.
+  std::array<char, 256> local_buff = {};
 
   // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation
   // in the android-ndk
@@ -124,9 +129,12 @@ std::string StrFormatImp(const char* msg, va_list args) {
   va_end(args_cp);
 
   // handle empty expansion
-  if (ret == 0) return std::string{};
-  if (static_cast<std::size_t>(ret) < local_buff.size())
+  if (ret == 0) {
+    return {};
+  }
+  if (static_cast<std::size_t>(ret) < local_buff.size()) {
     return std::string(local_buff.data());
+  }
 
   // we did not provide a long enough buffer on our first attempt.
   // add 1 to size to account for null-byte in size cast to prevent overflow
@@ -153,7 +161,9 @@ std::string StrFormat(const char* format, ...) {
 }
 
 std::vector<std::string> StrSplit(const std::string& str, char delim) {
-  if (str.empty()) return {};
+  if (str.empty()) {
+    return {};
+  }
   std::vector<std::string> ret;
   size_t first = 0;
   size_t next = str.find(delim);
diff --git a/third-party/benchmark/src/string_util.h b/third-party/benchmark/src/string_util.h
index 731aa2c04c3e0..f1e50be4f44d5 100644
--- a/third-party/benchmark/src/string_util.h
+++ b/third-party/benchmark/src/string_util.h
@@ -9,7 +9,6 @@
 #include "benchmark/benchmark.h"
 #include "benchmark/export.h"
 #include "check.h"
-#include "internal_macros.h"
 
 namespace benchmark {
 
diff --git a/third-party/benchmark/src/sysinfo.cc b/third-party/benchmark/src/sysinfo.cc
index d1ae6cc82b943..d25888548fe8b 100644
--- a/third-party/benchmark/src/sysinfo.cc
+++ b/third-party/benchmark/src/sysinfo.cc
@@ -12,13 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#if defined(_MSC_VER)
-// FIXME: This must be defined before any other includes to disable deprecation
-// warnings for use of codecvt from C++17. We should remove our reliance on
-// the deprecated functionality instead.
-#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
-#endif
-
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
@@ -61,6 +54,10 @@
 #include <pthread.h>
 #endif
 
+#if defined(BENCHMARK_OS_LINUX)
+#include <sys/personality.h>
+#endif
+
 #include <algorithm>
 #include <array>
 #include <bitset>
@@ -83,7 +80,6 @@
 #include "benchmark/benchmark.h"
 #include "check.h"
 #include "cycleclock.h"
-#include "internal_macros.h"
 #include "log.h"
 #include "string_util.h"
 #include "timers.h"
@@ -91,7 +87,7 @@
 namespace benchmark {
 namespace {
 
-void PrintImp(std::ostream& out) { out << std::endl; }
+void PrintImp(std::ostream& out) { out << '\n'; }
 
 template <class First, class... Rest>
 void PrintImp(std::ostream& out, First&& f, Rest&&... rest) {
@@ -102,6 +98,7 @@ void PrintImp(std::ostream& out, First&& f, Rest&&... rest) {
 template <class... Args>
 BENCHMARK_NORETURN void PrintErrorAndDie(Args&&... args) {
   PrintImp(std::cerr, std::forward<Args>(args)...);
+  std::cerr << std::flush;
   std::exit(EXIT_FAILURE);
 }
 
@@ -127,7 +124,7 @@ struct ValueUnion {
 
   explicit ValueUnion(std::size_t buff_size)
       : size(sizeof(DataT) + buff_size),
-        buff(::new (std::malloc(size)) DataT(), &std::free) {}
+        buff(::new(std::malloc(size)) DataT(), &std::free) {}
 
   ValueUnion(ValueUnion&& other) = default;
 
@@ -219,14 +216,18 @@ template <class ArgT>
 bool ReadFromFile(std::string const& fname, ArgT* arg) {
   *arg = ArgT();
   std::ifstream f(fname.c_str());
-  if (!f.is_open()) return false;
+  if (!f.is_open()) {
+    return false;
+  }
   f >> *arg;
   return f.good();
 }
 
 CPUInfo::Scaling CpuScaling(int num_cpus) {
   // We don't have a valid CPU count, so don't even bother.
-  if (num_cpus <= 0) return CPUInfo::Scaling::UNKNOWN;
+  if (num_cpus <= 0) {
+    return CPUInfo::Scaling::UNKNOWN;
+  }
 #if defined(BENCHMARK_OS_QNX)
   return CPUInfo::Scaling::UNKNOWN;
 #elif !defined(BENCHMARK_OS_WINDOWS)
@@ -237,8 +238,9 @@ CPUInfo::Scaling CpuScaling(int num_cpus) {
   for (int cpu = 0; cpu < num_cpus; ++cpu) {
     std::string governor_file =
         StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor");
-    if (ReadFromFile(governor_file, &res) && res != "performance")
+    if (ReadFromFile(governor_file, &res) && res != "performance") {
       return CPUInfo::Scaling::ENABLED;
+    }
   }
   return CPUInfo::Scaling::DISABLED;
 #else
@@ -253,7 +255,7 @@ int CountSetBitsInCPUMap(std::string val) {
     CPUMask mask(benchmark::stoul(part, nullptr, 16));
     return static_cast<int>(mask.count());
   };
-  std::size_t pos;
+  std::size_t pos = 0;
   int total = 0;
   while ((pos = val.find(',')) != std::string::npos) {
     total += CountBits(val.substr(0, pos));
@@ -274,28 +276,35 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesFromKVFS() {
     CPUInfo::CacheInfo info;
     std::string fpath = StrCat(dir, "index", idx++, "/");
     std::ifstream f(StrCat(fpath, "size").c_str());
-    if (!f.is_open()) break;
+    if (!f.is_open()) {
+      break;
+    }
     std::string suffix;
     f >> info.size;
-    if (f.fail())
+    if (f.fail()) {
       PrintErrorAndDie("Failed while reading file '", fpath, "size'");
+    }
     if (f.good()) {
       f >> suffix;
-      if (f.bad())
+      if (f.bad()) {
         PrintErrorAndDie(
             "Invalid cache size format: failed to read size suffix");
-      else if (f && suffix != "K")
+      } else if (f && suffix != "K") {
         PrintErrorAndDie("Invalid cache size format: Expected bytes ", suffix);
-      else if (suffix == "K")
+      } else if (suffix == "K") {
         info.size *= 1024;
+      }
     }
-    if (!ReadFromFile(StrCat(fpath, "type"), &info.type))
+    if (!ReadFromFile(StrCat(fpath, "type"), &info.type)) {
       PrintErrorAndDie("Failed to read from file ", fpath, "type");
-    if (!ReadFromFile(StrCat(fpath, "level"), &info.level))
+    }
+    if (!ReadFromFile(StrCat(fpath, "level"), &info.level)) {
       PrintErrorAndDie("Failed to read from file ", fpath, "level");
+    }
     std::string map_str;
-    if (!ReadFromFile(StrCat(fpath, "shared_cpu_map"), &map_str))
+    if (!ReadFromFile(StrCat(fpath, "shared_cpu_map"), &map_str)) {
       PrintErrorAndDie("Failed to read from file ", fpath, "shared_cpu_map");
+    }
     info.num_sharing = CountSetBitsInCPUMap(map_str);
     res.push_back(info);
   }
@@ -340,15 +349,18 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
   using UPtr = std::unique_ptr<PInfo, decltype(&std::free)>;
   GetLogicalProcessorInformation(nullptr, &buffer_size);
   UPtr buff(static_cast<PInfo*>(std::malloc(buffer_size)), &std::free);
-  if (!GetLogicalProcessorInformation(buff.get(), &buffer_size))
+  if (!GetLogicalProcessorInformation(buff.get(), &buffer_size)) {
     PrintErrorAndDie("Failed during call to GetLogicalProcessorInformation: ",
                      GetLastError());
+  }
 
   PInfo* it = buff.get();
   PInfo* end = buff.get() + (buffer_size / sizeof(PInfo));
 
   for (; it != end; ++it) {
-    if (it->Relationship != RelationCache) continue;
+    if (it->Relationship != RelationCache) {
+      continue;
+    }
     using BitSet = std::bitset<sizeof(ULONG_PTR) * CHAR_BIT>;
     BitSet b(it->ProcessorMask);
     // To prevent duplicates, only consider caches where CPU 0 is specified
@@ -357,8 +369,13 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
     CPUInfo::CacheInfo C;
     C.num_sharing = static_cast<int>(b.count());
     C.level = cache.Level;
-    C.size = cache.Size;
+    C.size = static_cast<int>(cache.Size);
     switch (cache.Type) {
+// Windows SDK version >= 10.0.26100.0
+#ifdef NTDDI_WIN11_GE
+      case CacheUnknown:
+        break;
+#endif
       case CacheUnified:
         C.type = "Unified";
         break;
@@ -424,7 +441,7 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizes() {
   return GetCacheSizesWindows();
 #elif defined(BENCHMARK_OS_QNX)
   return GetCacheSizesQNX();
-#elif defined(BENCHMARK_OS_QURT)
+#elif defined(BENCHMARK_OS_QURT) || defined(__EMSCRIPTEN__)
   return std::vector<CPUInfo::CacheInfo>();
 #else
   return GetCacheSizesFromKVFS();
@@ -446,7 +463,7 @@ std::string GetSystemName() {
                                 DWCOUNT, NULL, 0, NULL, NULL);
   str.resize(len);
   WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, hostname, DWCOUNT, &str[0],
-                      str.size(), NULL, NULL);
+                      static_cast<int>(str.size()), NULL, NULL);
 #endif
   return str;
 #elif defined(BENCHMARK_OS_QURT)
@@ -478,11 +495,21 @@ std::string GetSystemName() {
 #endif  // def HOST_NAME_MAX
   char hostname[HOST_NAME_MAX];
   int retVal = gethostname(hostname, HOST_NAME_MAX);
-  if (retVal != 0) return std::string("");
-  return std::string(hostname);
+  return retVal != 0 ? std::string() : std::string(hostname);
 #endif  // Catch-all POSIX block.
 }
 
+SystemInfo::ASLR GetASLR() {
+#ifdef BENCHMARK_OS_LINUX
+  const auto curr_personality = personality(0xffffffff);
+  return (curr_personality & ADDR_NO_RANDOMIZE) ? SystemInfo::ASLR::DISABLED
+                                                : SystemInfo::ASLR::ENABLED;
+#else
+  // FIXME: support detecting ASLR on other OS.
+  return SystemInfo::ASLR::UNKNOWN;
+#endif
+}
+
 int GetNumCPUsImpl() {
 #ifdef BENCHMARK_OS_WINDOWS
   SYSTEM_INFO sysinfo;
@@ -499,8 +526,9 @@ int GetNumCPUsImpl() {
   if (qurt_sysenv_get_max_hw_threads(&hardware_threads) != QURT_EOK) {
     hardware_threads.max_hthreads = 1;
   }
-  return hardware_threads.max_hthreads;
+  return static_cast<int>(hardware_threads.max_hthreads);
 #elif defined(BENCHMARK_HAS_SYSCTL)
+  // *BSD, macOS
   int num_cpu = -1;
   constexpr auto* hwncpu =
 #if defined BENCHMARK_OS_MACOSX
@@ -513,6 +541,7 @@ int GetNumCPUsImpl() {
   if (GetSysctl(hwncpu, &num_cpu)) return num_cpu;
   PrintErrorAndDie("Err: ", strerror(errno));
 #elif defined(_SC_NPROCESSORS_ONLN)
+  // Linux, Solaris, AIX, Haiku, WASM, etc.
   // Returns -1 in case of a failure.
   int num_cpu = static_cast<int>(sysconf(_SC_NPROCESSORS_ONLN));
   if (num_cpu < 0) {
@@ -520,6 +549,9 @@ int GetNumCPUsImpl() {
                      strerror(errno));
   }
   return num_cpu;
+#else
+  // Fallback, no other API exists.
+  return -1;
 #endif
   BENCHMARK_UNREACHABLE();
 }
@@ -528,7 +560,7 @@ int GetNumCPUs() {
   int num_cpus = GetNumCPUsImpl();
   if (num_cpus < 1) {
     std::cerr << "Unable to extract number of CPUs.\n";
-    /* There is at least one CPU which we run on. */
+    // There must be at least one CPU on which we're running.
     num_cpus = 1;
   }
   return num_cpus;
@@ -537,22 +569,28 @@ int GetNumCPUs() {
 class ThreadAffinityGuard final {
  public:
   ThreadAffinityGuard() : reset_affinity(SetAffinity()) {
-    if (!reset_affinity)
+    if (!reset_affinity) {
       std::cerr << "***WARNING*** Failed to set thread affinity. Estimated CPU "
-                   "frequency may be incorrect."
-                << std::endl;
+                   "frequency may be incorrect.\n";
+    }
   }
 
   ~ThreadAffinityGuard() {
-    if (!reset_affinity) return;
+    if (!reset_affinity) {
+      return;
+    }
 
 #if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
     int ret = pthread_setaffinity_np(self, sizeof(previous_affinity),
                                      &previous_affinity);
-    if (ret == 0) return;
+    if (ret == 0) {
+      return;
+    }
 #elif defined(BENCHMARK_OS_WINDOWS_WIN32)
     DWORD_PTR ret = SetThreadAffinityMask(self, previous_affinity);
-    if (ret != 0) return;
+    if (ret != 0) {
+      return;
+    }
 #endif  // def BENCHMARK_HAS_PTHREAD_AFFINITY
     PrintErrorAndDie("Failed to reset thread affinity");
   }
@@ -565,26 +603,32 @@ class ThreadAffinityGuard final {
  private:
   bool SetAffinity() {
 #if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
-    int ret;
+    int ret = 0;
     self = pthread_self();
     ret = pthread_getaffinity_np(self, sizeof(previous_affinity),
                                  &previous_affinity);
-    if (ret != 0) return false;
+    if (ret != 0) {
+      return false;
+    }
 
     cpu_set_t affinity;
     memcpy(&affinity, &previous_affinity, sizeof(affinity));
 
     bool is_first_cpu = true;
 
-    for (int i = 0; i < CPU_SETSIZE; ++i)
+    for (int i = 0; i < CPU_SETSIZE; ++i) {
       if (CPU_ISSET(i, &affinity)) {
-        if (is_first_cpu)
+        if (is_first_cpu) {
           is_first_cpu = false;
-        else
+        } else {
           CPU_CLR(i, &affinity);
+        }
       }
+    }
 
-    if (is_first_cpu) return false;
+    if (is_first_cpu) {
+      return false;
+    }
 
     ret = pthread_setaffinity_np(self, sizeof(affinity), &affinity);
     return ret == 0;
@@ -599,8 +643,8 @@ class ThreadAffinityGuard final {
   }
 
 #if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
-  pthread_t self;
-  cpu_set_t previous_affinity;
+  pthread_t self{};
+  cpu_set_t previous_affinity{};
 #elif defined(BENCHMARK_OS_WINDOWS_WIN32)
   HANDLE self;
   DWORD_PTR previous_affinity;
@@ -614,7 +658,7 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
   (void)scaling;
 
 #if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
-  long freq;
+  long freq = 0;
 
   // If the kernel is exporting the tsc frequency use that. There are issues
   // where cpuinfo_max_freq cannot be relied on because the BIOS may be
@@ -649,7 +693,9 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
   }
 
   auto StartsWithKey = [](std::string const& Value, std::string const& Key) {
-    if (Key.size() > Value.size()) return false;
+    if (Key.size() > Value.size()) {
+      return false;
+    }
     auto Cmp = [&](char X, char Y) {
       return std::tolower(X) == std::tolower(Y);
     };
@@ -658,22 +704,30 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
 
   std::string ln;
   while (std::getline(f, ln)) {
-    if (ln.empty()) continue;
+    if (ln.empty()) {
+      continue;
+    }
     std::size_t split_idx = ln.find(':');
     std::string value;
-    if (split_idx != std::string::npos) value = ln.substr(split_idx + 1);
+    if (split_idx != std::string::npos) {
+      value = ln.substr(split_idx + 1);
+    }
     // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only
     // accept positive values. Some environments (virtual machines) report zero,
     // which would cause infinite looping in WallTime_Init.
     if (StartsWithKey(ln, "cpu MHz")) {
       if (!value.empty()) {
         double cycles_per_second = benchmark::stod(value) * 1000000.0;
-        if (cycles_per_second > 0) return cycles_per_second;
+        if (cycles_per_second > 0) {
+          return cycles_per_second;
+        }
       }
     } else if (StartsWithKey(ln, "bogomips")) {
       if (!value.empty()) {
         bogo_clock = benchmark::stod(value) * 1000000.0;
-        if (bogo_clock < 0.0) bogo_clock = error_value;
+        if (bogo_clock < 0.0) {
+          bogo_clock = error_value;
+        }
       }
     }
   }
@@ -689,7 +743,9 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
   // If we found the bogomips clock, but nothing better, we'll use it (but
   // we're not happy about it); otherwise, fallback to the rough estimation
   // below.
-  if (bogo_clock >= 0.0) return bogo_clock;
+  if (bogo_clock >= 0.0) {
+    return bogo_clock;
+  }
 
 #elif defined BENCHMARK_HAS_SYSCTL
   constexpr auto* freqStr =
@@ -704,9 +760,13 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
 #endif
   unsigned long long hz = 0;
 #if defined BENCHMARK_OS_OPENBSD
-  if (GetSysctl(freqStr, &hz)) return static_cast<double>(hz * 1000000);
+  if (GetSysctl(freqStr, &hz)) {
+    return static_cast<double>(hz * 1000000);
+  }
 #else
-  if (GetSysctl(freqStr, &hz)) return hz;
+  if (GetSysctl(freqStr, &hz)) {
+    return static_cast<double>(hz);
+  }
 #endif
   fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n",
           freqStr, strerror(errno));
@@ -722,9 +782,10 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
       SUCCEEDED(
           SHGetValueA(HKEY_LOCAL_MACHINE,
                       "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
-                      "~MHz", nullptr, &data, &data_size)))
+                      "~MHz", nullptr, &data, &data_size))) {
     return static_cast<double>(static_cast<int64_t>(data) *
                                static_cast<int64_t>(1000 * 1000));  // was mhz
+  }
 #elif defined(BENCHMARK_OS_SOLARIS)
   kstat_ctl_t* kc = kstat_open();
   if (!kc) {
@@ -806,11 +867,11 @@ std::vector<double> GetLoadAvg() {
     !(defined(__ANDROID__) && __ANDROID_API__ < 29)
   static constexpr int kMaxSamples = 3;
   std::vector<double> res(kMaxSamples, 0.0);
-  const int nelem = getloadavg(res.data(), kMaxSamples);
+  const auto nelem = getloadavg(res.data(), kMaxSamples);
   if (nelem < 1) {
     res.clear();
   } else {
-    res.resize(nelem);
+    res.resize(static_cast<size_t>(nelem));
   }
   return res;
 #else
@@ -837,5 +898,5 @@ const SystemInfo& SystemInfo::Get() {
   return *info;
 }
 
-SystemInfo::SystemInfo() : name(GetSystemName()) {}
+SystemInfo::SystemInfo() : name(GetSystemName()), ASLRStatus(GetASLR()) {}
 }  // end namespace benchmark
diff --git a/third-party/benchmark/src/thread_manager.h b/third-party/benchmark/src/thread_manager.h
index 819b3c44db662..a0ac37a8b200c 100644
--- a/third-party/benchmark/src/thread_manager.h
+++ b/third-party/benchmark/src/thread_manager.h
@@ -11,30 +11,15 @@ namespace internal {
 
 class ThreadManager {
  public:
-  explicit ThreadManager(int num_threads)
-      : alive_threads_(num_threads), start_stop_barrier_(num_threads) {}
+  explicit ThreadManager(int num_threads) : start_stop_barrier_(num_threads) {}
 
   Mutex& GetBenchmarkMutex() const RETURN_CAPABILITY(benchmark_mutex_) {
     return benchmark_mutex_;
   }
 
-  bool StartStopBarrier() EXCLUDES(end_cond_mutex_) {
-    return start_stop_barrier_.wait();
-  }
-
-  void NotifyThreadComplete() EXCLUDES(end_cond_mutex_) {
-    start_stop_barrier_.removeThread();
-    if (--alive_threads_ == 0) {
-      MutexLock lock(end_cond_mutex_);
-      end_condition_.notify_all();
-    }
-  }
+  bool StartStopBarrier() { return start_stop_barrier_.wait(); }
 
-  void WaitForAllThreads() EXCLUDES(end_cond_mutex_) {
-    MutexLock lock(end_cond_mutex_);
-    end_condition_.wait(lock.native_handle(),
-                        [this]() { return alive_threads_ == 0; });
-  }
+  void NotifyThreadComplete() { start_stop_barrier_.removeThread(); }
 
   struct Result {
     IterationCount iterations = 0;
@@ -51,10 +36,7 @@ class ThreadManager {
 
  private:
   mutable Mutex benchmark_mutex_;
-  std::atomic<int> alive_threads_;
   Barrier start_stop_barrier_;
-  Mutex end_cond_mutex_;
-  Condition end_condition_;
 };
 
 }  // namespace internal
diff --git a/third-party/benchmark/src/timers.cc b/third-party/benchmark/src/timers.cc
index 667e7b2eef3c3..f8d9560ed1c64 100644
--- a/third-party/benchmark/src/timers.cc
+++ b/third-party/benchmark/src/timers.cc
@@ -107,8 +107,9 @@ double MakeTime(struct timespec const& ts) {
 }
 #endif
 
-BENCHMARK_NORETURN static void DiagnoseAndExit(const char* msg) {
-  std::cerr << "ERROR: " << msg << std::endl;
+BENCHMARK_NORETURN void DiagnoseAndExit(const char* msg) {
+  std::cerr << "ERROR: " << msg << '\n';
+  std::flush(std::cerr);
   std::exit(EXIT_FAILURE);
 }
 
@@ -126,8 +127,12 @@ double ProcessCPUUsage() {
     return MakeTime(kernel_time, user_time);
   DiagnoseAndExit("GetProccessTimes() failed");
 #elif defined(BENCHMARK_OS_QURT)
+  // Note that qurt_timer_get_ticks() is no longer documented as of SDK 5.3.0,
+  // and doesn't appear to work on at least some devices (eg Samsung S22),
+  // so let's use the actually-documented and apparently-equivalent
+  // qurt_sysclock_get_hw_ticks() call instead.
   return static_cast<double>(
-             qurt_timer_timetick_to_us(qurt_timer_get_ticks())) *
+             qurt_timer_timetick_to_us(qurt_sysclock_get_hw_ticks())) *
          1.0e-6;
 #elif defined(BENCHMARK_OS_EMSCRIPTEN)
   // clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) returns 0 on Emscripten.
@@ -138,9 +143,10 @@ double ProcessCPUUsage() {
 #elif defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
   // FIXME We want to use clock_gettime, but its not available in MacOS 10.11.
   // See https://github.com/google/benchmark/pull/292
-  struct timespec spec;
-  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
+  struct timespec spec {};
+  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0) {
     return MakeTime(spec);
+  }
   DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
 #else
   struct rusage ru;
@@ -160,8 +166,12 @@ double ThreadCPUUsage() {
                  &user_time);
   return MakeTime(kernel_time, user_time);
 #elif defined(BENCHMARK_OS_QURT)
+  // Note that qurt_timer_get_ticks() is no longer documented as of SDK 5.3.0,
+  // and doesn't appear to work on at least some devices (eg Samsung S22),
+  // so let's use the actually-documented and apparently-equivalent
+  // qurt_sysclock_get_hw_ticks() call instead.
   return static_cast<double>(
-             qurt_timer_timetick_to_us(qurt_timer_get_ticks())) *
+             qurt_timer_timetick_to_us(qurt_sysclock_get_hw_ticks())) *
          1.0e-6;
 #elif defined(BENCHMARK_OS_MACOSX)
   // FIXME We want to use clock_gettime, but its not available in MacOS 10.11.
@@ -190,8 +200,10 @@ double ThreadCPUUsage() {
   if (getrusage(RUSAGE_LWP, &ru) == 0) return MakeTime(ru);
   DiagnoseAndExit("getrusage(RUSAGE_LWP, ...) failed");
 #elif defined(CLOCK_THREAD_CPUTIME_ID)
-  struct timespec ts;
-  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0) return MakeTime(ts);
+  struct timespec ts {};
+  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0) {
+    return MakeTime(ts);
+  }
   DiagnoseAndExit("clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) failed");
 #else
 #error Per-thread timing is not available on your system.
@@ -205,9 +217,9 @@ std::string LocalDateTimeString() {
   const std::size_t kTzOffsetLen = 6;
   const std::size_t kTimestampLen = 19;
 
-  std::size_t tz_len;
-  std::size_t timestamp_len;
-  long int offset_minutes;
+  std::size_t tz_len = 0;
+  std::size_t timestamp_len = 0;
+  long int offset_minutes = 0;
   char tz_offset_sign = '+';
   // tz_offset is set in one of three ways:
   // * strftime with %z - This either returns empty or the ISO 8601 time.  The
@@ -227,7 +239,7 @@ std::string LocalDateTimeString() {
 #if defined(BENCHMARK_OS_WINDOWS)
   std::tm* timeinfo_p = ::localtime(&now);
 #else
-  std::tm timeinfo;
+  std::tm timeinfo{};
   std::tm* timeinfo_p = &timeinfo;
   ::localtime_r(&now, &timeinfo);
 #endif
@@ -245,9 +257,9 @@ std::string LocalDateTimeString() {
       tz_offset_sign = '-';
     }
 
-    tz_len =
+    tz_len = static_cast<size_t>(
         ::snprintf(tz_offset, sizeof(tz_offset), "%c%02li:%02li",
-                   tz_offset_sign, offset_minutes / 100, offset_minutes % 100);
+                   tz_offset_sign, offset_minutes / 100, offset_minutes % 100));
     BM_CHECK(tz_len == kTzOffsetLen);
     ((void)tz_len);  // Prevent unused variable warning in optimized build.
   } else {
diff --git a/third-party/benchmark/src/timers.h b/third-party/benchmark/src/timers.h
index 65606ccd93d14..690086b36ca68 100644
--- a/third-party/benchmark/src/timers.h
+++ b/third-party/benchmark/src/timers.h
@@ -15,6 +15,29 @@ double ChildrenCPUUsage();
 // Return the CPU usage of the current thread
 double ThreadCPUUsage();
 
+#if defined(BENCHMARK_OS_QURT)
+
+// std::chrono::now() can return 0 on some Hexagon devices;
+// this reads the value of a 56-bit, 19.2MHz hardware counter
+// and converts it to seconds. Unlike std::chrono, this doesn't
+// return an absolute time, but since ChronoClockNow() is only used
+// to compute elapsed time, this shouldn't matter.
+struct QuRTClock {
+  typedef uint64_t rep;
+  typedef std::ratio<1, 19200000> period;
+  typedef std::chrono::duration<rep, period> duration;
+  typedef std::chrono::time_point<QuRTClock> time_point;
+  static const bool is_steady = false;
+
+  static time_point now() {
+    unsigned long long count;
+    asm volatile(" %0 = c31:30 " : "=r"(count));
+    return time_point(static_cast<duration>(count));
+  }
+};
+
+#else
+
 #if defined(HAVE_STEADY_CLOCK)
 template <bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
 struct ChooseSteadyClock {
@@ -25,10 +48,14 @@ template <>
 struct ChooseSteadyClock<false> {
   typedef std::chrono::steady_clock type;
 };
+#endif  // HAVE_STEADY_CLOCK
+
 #endif
 
 struct ChooseClockType {
-#if defined(HAVE_STEADY_CLOCK)
+#if defined(BENCHMARK_OS_QURT)
+  typedef QuRTClock type;
+#elif defined(HAVE_STEADY_CLOCK)
   typedef ChooseSteadyClock<>::type type;
 #else
   typedef std::chrono::high_resolution_clock type;
diff --git a/third-party/benchmark/test/CMakeLists.txt b/third-party/benchmark/test/CMakeLists.txt
index 1de175f98d342..8a1a1a968fa95 100644
--- a/third-party/benchmark/test/CMakeLists.txt
+++ b/third-party/benchmark/test/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Enable the tests
+#Enable the tests
 
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 
@@ -73,6 +73,18 @@ macro(benchmark_add_test)
 endmacro(benchmark_add_test)
 
 # Demonstration executable
+
+compile_benchmark_test_with_main(cxx11_test)
+if(DEFINED MSVC)
+  # MSVC does not really support C++11.
+  set_property(TARGET cxx11_test PROPERTY CXX_STANDARD 14)
+else()
+  set_property(TARGET cxx11_test PROPERTY CXX_STANDARD 11)
+endif()
+set_property(TARGET cxx11_test PROPERTY CXX_STANDARD_REQUIRED ON)
+set_property(TARGET cxx11_test PROPERTY CXX_EXTENSIONS OFF)
+benchmark_add_test(NAME cxx11_test COMMAND cxx11_test --benchmark_min_time=0.01s)
+
 compile_benchmark_test(benchmark_test)
 benchmark_add_test(NAME benchmark COMMAND benchmark_test --benchmark_min_time=0.01s)
 
@@ -168,15 +180,24 @@ benchmark_add_test(NAME reporter_output_test COMMAND reporter_output_test --benc
 compile_output_test(templated_fixture_test)
 benchmark_add_test(NAME templated_fixture_test COMMAND templated_fixture_test --benchmark_min_time=0.01s)
 
+compile_output_test(templated_fixture_method_test)
+benchmark_add_test(NAME templated_fixture_method_test COMMAND templated_fixture_method_test --benchmark_min_time=0.01s)
+
 compile_output_test(user_counters_test)
 benchmark_add_test(NAME user_counters_test COMMAND user_counters_test --benchmark_min_time=0.01s)
 
+compile_output_test(user_counters_threads_test)
+benchmark_add_test(NAME user_counters_threads_test COMMAND user_counters_threads_test --benchmark_min_time=0.01s)
+
 compile_output_test(perf_counters_test)
 benchmark_add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time=0.01s --benchmark_perf_counters=CYCLES,INSTRUCTIONS)
 
 compile_output_test(internal_threading_test)
 benchmark_add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01s)
 
+compile_output_test(manual_threading_test)
+benchmark_add_test(NAME manual_threading_test COMMAND manual_threading_test --benchmark_min_time=0.01s)
+
 compile_output_test(report_aggregates_only_test)
 benchmark_add_test(NAME report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01s)
 
@@ -192,35 +213,18 @@ benchmark_add_test(NAME user_counters_thousands_test COMMAND user_counters_thous
 compile_output_test(memory_manager_test)
 benchmark_add_test(NAME memory_manager_test COMMAND memory_manager_test --benchmark_min_time=0.01s)
 
-# MSVC does not allow to set the language standard to C++98/03.
-if(NOT (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC"))
-  compile_benchmark_test(cxx03_test)
-  set_target_properties(cxx03_test
-      PROPERTIES
-      CXX_STANDARD 98
-      CXX_STANDARD_REQUIRED YES)
-  # libstdc++ provides different definitions within <map> between dialects. When
-  # LTO is enabled and -Werror is specified GCC diagnoses this ODR violation
-  # causing the test to fail to compile. To prevent this we explicitly disable
-  # the warning.
-  check_cxx_compiler_flag(-Wno-odr BENCHMARK_HAS_WNO_ODR)
-  check_cxx_compiler_flag(-Wno-lto-type-mismatch BENCHMARK_HAS_WNO_LTO_TYPE_MISMATCH)
-  # Cannot set_target_properties multiple times here because the warnings will
-  # be overwritten on each call
-  set (DISABLE_LTO_WARNINGS "")
-  if (BENCHMARK_HAS_WNO_ODR)
-    set(DISABLE_LTO_WARNINGS "${DISABLE_LTO_WARNINGS} -Wno-odr")
-  endif()
-  if (BENCHMARK_HAS_WNO_LTO_TYPE_MISMATCH)
-    set(DISABLE_LTO_WARNINGS "${DISABLE_LTO_WARNINGS} -Wno-lto-type-mismatch")
-  endif()
-  set_target_properties(cxx03_test PROPERTIES LINK_FLAGS "${DISABLE_LTO_WARNINGS}")
-  benchmark_add_test(NAME cxx03 COMMAND cxx03_test --benchmark_min_time=0.01s)
-endif()
+compile_output_test(profiler_manager_test)
+benchmark_add_test(NAME profiler_manager_test COMMAND profiler_manager_test --benchmark_min_time=0.01s)
+
+compile_benchmark_test(profiler_manager_iterations_test)
+benchmark_add_test(NAME profiler_manager_iterations COMMAND profiler_manager_iterations_test)
 
 compile_output_test(complexity_test)
 benchmark_add_test(NAME complexity_benchmark COMMAND complexity_test --benchmark_min_time=1000000x)
 
+compile_output_test(locale_impermeability_test)
+benchmark_add_test(NAME locale_impermeability_test COMMAND locale_impermeability_test)
+
 ###############################################################################
 # GoogleTest Unit Tests
 ###############################################################################
@@ -251,6 +255,9 @@ if (BENCHMARK_ENABLE_GTEST_TESTS)
   add_gtest(perf_counters_gtest)
   add_gtest(time_unit_gtest)
   add_gtest(min_time_parse_gtest)
+  add_gtest(profiler_manager_gtest)
+  add_gtest(benchmark_setup_teardown_cb_types_gtest)
+  add_gtest(memory_results_gtest)
 endif(BENCHMARK_ENABLE_GTEST_TESTS)
 
 ###############################################################################
@@ -292,7 +299,7 @@ if (${CMAKE_BUILD_TYPE_LOWER} MATCHES "coverage")
       COMMAND ${LCOV} -q -a before.lcov -a after.lcov --output-file final.lcov
       COMMAND ${LCOV} -q -r final.lcov "'${CMAKE_SOURCE_DIR}/test/*'" -o final.lcov
       COMMAND ${GENHTML} final.lcov -o lcov --demangle-cpp --sort -p "${CMAKE_BINARY_DIR}" -t benchmark
-      DEPENDS filter_test benchmark_test options_test basic_test fixture_test cxx03_test complexity_test
+      DEPENDS filter_test benchmark_test options_test basic_test fixture_test complexity_test
       WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
       COMMENT "Running LCOV"
     )
diff --git a/third-party/benchmark/test/basic_test.cc b/third-party/benchmark/test/basic_test.cc
index c25bec7ddd58c..068cd98476c6f 100644
--- a/third-party/benchmark/test/basic_test.cc
+++ b/third-party/benchmark/test/basic_test.cc
@@ -3,9 +3,11 @@
 
 #define BASIC_BENCHMARK_TEST(x) BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)
 
+namespace {
 void BM_empty(benchmark::State& state) {
   for (auto _ : state) {
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
 }
@@ -142,7 +144,6 @@ void BM_RangedFor(benchmark::State& state) {
 }
 BENCHMARK(BM_RangedFor);
 
-#ifdef BENCHMARK_HAS_CXX11
 template <typename T>
 void BM_OneTemplateFunc(benchmark::State& state) {
   auto arg = state.range(0);
@@ -167,8 +168,6 @@ void BM_TwoTemplateFunc(benchmark::State& state) {
 BENCHMARK(BM_TwoTemplateFunc<int, double>)->Arg(1);
 BENCHMARK(BM_TwoTemplateFunc<double, int>)->Arg(1);
 
-#endif  // BENCHMARK_HAS_CXX11
-
 // Ensure that StateIterator provides all the necessary typedefs required to
 // instantiate std::iterator_traits.
 static_assert(
@@ -176,5 +175,6 @@ static_assert(
                      benchmark::State::StateIterator>::value_type,
                  typename benchmark::State::StateIterator::value_type>::value,
     "");
+}  // end namespace
 
 BENCHMARK_MAIN();
diff --git a/third-party/benchmark/test/benchmark_gtest.cc b/third-party/benchmark/test/benchmark_gtest.cc
index 2c9e555d92dcd..0aa2552c1e4e2 100644
--- a/third-party/benchmark/test/benchmark_gtest.cc
+++ b/third-party/benchmark/test/benchmark_gtest.cc
@@ -38,7 +38,7 @@ TEST(AddRangeTest, Advanced64) {
 
 TEST(AddRangeTest, FullRange8) {
   std::vector<int8_t> dst;
-  AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), int8_t{8});
+  AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), 8);
   EXPECT_THAT(
       dst, testing::ElementsAre(int8_t{1}, int8_t{8}, int8_t{64}, int8_t{127}));
 }
diff --git a/third-party/benchmark/test/benchmark_min_time_flag_iters_test.cc b/third-party/benchmark/test/benchmark_min_time_flag_iters_test.cc
index 3de93a75057b4..dedcbe6fa32ab 100644
--- a/third-party/benchmark/test/benchmark_min_time_flag_iters_test.cc
+++ b/third-party/benchmark/test/benchmark_min_time_flag_iters_test.cc
@@ -1,7 +1,6 @@
 #include <cassert>
 #include <cstdlib>
 #include <cstring>
-#include <iostream>
 #include <string>
 #include <vector>
 
@@ -13,11 +12,11 @@ namespace {
 
 class TestReporter : public benchmark::ConsoleReporter {
  public:
-  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
+  bool ReportContext(const Context& context) override {
     return ConsoleReporter::ReportContext(context);
   };
 
-  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
+  void ReportRuns(const std::vector<Run>& report) override {
     assert(report.size() == 1);
     iter_nums_.push_back(report[0].iterations);
     ConsoleReporter::ReportRuns(report);
@@ -25,7 +24,7 @@ class TestReporter : public benchmark::ConsoleReporter {
 
   TestReporter() {}
 
-  virtual ~TestReporter() {}
+  ~TestReporter() override {}
 
   const std::vector<benchmark::IterationCount>& GetIters() const {
     return iter_nums_;
@@ -35,22 +34,26 @@ class TestReporter : public benchmark::ConsoleReporter {
   std::vector<benchmark::IterationCount> iter_nums_;
 };
 
-}  // end namespace
-
-static void BM_MyBench(benchmark::State& state) {
+void BM_MyBench(benchmark::State& state) {
   for (auto s : state) {
   }
 }
+}  // end namespace
+
 BENCHMARK(BM_MyBench);
 
 int main(int argc, char** argv) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+
   // Make a fake argv and append the new --benchmark_min_time=<foo> to it.
   int fake_argc = argc + 1;
-  const char** fake_argv = new const char*[static_cast<size_t>(fake_argc)];
-  for (int i = 0; i < argc; ++i) fake_argv[i] = argv[i];
-  fake_argv[argc] = "--benchmark_min_time=4x";
+  std::vector<const char*> fake_argv(static_cast<size_t>(fake_argc));
+  for (size_t i = 0; i < static_cast<size_t>(argc); ++i) {
+    fake_argv[i] = argv[i];
+  }
+  fake_argv[static_cast<size_t>(argc)] = "--benchmark_min_time=4x";
 
-  benchmark::Initialize(&fake_argc, const_cast<char**>(fake_argv));
+  benchmark::Initialize(&fake_argc, const_cast<char**>(fake_argv.data()));
 
   TestReporter test_reporter;
   const size_t returned_count =
@@ -61,6 +64,5 @@ int main(int argc, char** argv) {
   const std::vector<benchmark::IterationCount> iters = test_reporter.GetIters();
   assert(!iters.empty() && iters[0] == 4);
 
-  delete[] fake_argv;
   return 0;
 }
diff --git a/third-party/benchmark/test/benchmark_min_time_flag_time_test.cc b/third-party/benchmark/test/benchmark_min_time_flag_time_test.cc
index 04a82eb95bf9d..bbc2cc35d8c05 100644
--- a/third-party/benchmark/test/benchmark_min_time_flag_time_test.cc
+++ b/third-party/benchmark/test/benchmark_min_time_flag_time_test.cc
@@ -19,23 +19,23 @@ typedef int64_t IterationCount;
 
 class TestReporter : public benchmark::ConsoleReporter {
  public:
-  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
+  bool ReportContext(const Context& context) override {
     return ConsoleReporter::ReportContext(context);
   };
 
-  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
+  void ReportRuns(const std::vector<Run>& report) override {
     assert(report.size() == 1);
     ConsoleReporter::ReportRuns(report);
   };
 
-  virtual void ReportRunsConfig(double min_time, bool /* has_explicit_iters */,
-                                IterationCount /* iters */) BENCHMARK_OVERRIDE {
+  void ReportRunsConfig(double min_time, bool /* has_explicit_iters */,
+                        IterationCount /* iters */) override {
     min_times_.push_back(min_time);
   }
 
   TestReporter() {}
 
-  virtual ~TestReporter() {}
+  ~TestReporter() override {}
 
   const std::vector<double>& GetMinTimes() const { return min_times_; }
 
@@ -60,31 +60,34 @@ void DoTestHelper(int* argc, const char** argv, double expected) {
   assert(!min_times.empty() && AlmostEqual(min_times[0], expected));
 }
 
-}  // end namespace
-
-static void BM_MyBench(benchmark::State& state) {
+void BM_MyBench(benchmark::State& state) {
   for (auto s : state) {
   }
 }
 BENCHMARK(BM_MyBench);
 
+}  // end namespace
+
 int main(int argc, char** argv) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+
   // Make a fake argv and append the new --benchmark_min_time=<foo> to it.
   int fake_argc = argc + 1;
-  const char** fake_argv = new const char*[static_cast<size_t>(fake_argc)];
+  std::vector<const char*> fake_argv(static_cast<size_t>(fake_argc));
 
-  for (int i = 0; i < argc; ++i) fake_argv[i] = argv[i];
+  for (size_t i = 0; i < static_cast<size_t>(argc); ++i) {
+    fake_argv[i] = argv[i];
+  }
 
   const char* no_suffix = "--benchmark_min_time=4";
   const char* with_suffix = "--benchmark_min_time=4.0s";
   double expected = 4.0;
 
-  fake_argv[argc] = no_suffix;
-  DoTestHelper(&fake_argc, fake_argv, expected);
+  fake_argv[static_cast<size_t>(argc)] = no_suffix;
+  DoTestHelper(&fake_argc, fake_argv.data(), expected);
 
-  fake_argv[argc] = with_suffix;
-  DoTestHelper(&fake_argc, fake_argv, expected);
+  fake_argv[static_cast<size_t>(argc)] = with_suffix;
+  DoTestHelper(&fake_argc, fake_argv.data(), expected);
 
-  delete[] fake_argv;
   return 0;
 }
diff --git a/third-party/benchmark/test/benchmark_random_interleaving_gtest.cc b/third-party/benchmark/test/benchmark_random_interleaving_gtest.cc
index 7f2086750d534..5f3a554743d10 100644
--- a/third-party/benchmark/test/benchmark_random_interleaving_gtest.cc
+++ b/third-party/benchmark/test/benchmark_random_interleaving_gtest.cc
@@ -34,7 +34,8 @@ class EventQueue : public std::queue<std::string> {
   }
 };
 
-EventQueue* queue = new EventQueue();
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+EventQueue* const queue = new EventQueue();
 
 class NullReporter : public BenchmarkReporter {
  public:
@@ -48,7 +49,7 @@ class BenchmarkTest : public testing::Test {
 
   static void TeardownHook(int /* num_threads */) { queue->push("Teardown"); }
 
-  void Execute(const std::string& pattern) {
+  static void Execute(const std::string& pattern) {
     queue->Clear();
 
     std::unique_ptr<BenchmarkReporter> reporter(new NullReporter());
diff --git a/third-party/benchmark/test/benchmark_setup_teardown_cb_types_gtest.cc b/third-party/benchmark/test/benchmark_setup_teardown_cb_types_gtest.cc
new file mode 100644
index 0000000000000..2ed255dcd33d0
--- /dev/null
+++ b/third-party/benchmark/test/benchmark_setup_teardown_cb_types_gtest.cc
@@ -0,0 +1,126 @@
+#include "benchmark/benchmark.h"
+#include "gtest/gtest.h"
+
+using benchmark::Benchmark;
+using benchmark::BenchmarkReporter;
+using benchmark::callback_function;
+using benchmark::ClearRegisteredBenchmarks;
+using benchmark::RegisterBenchmark;
+using benchmark::RunSpecifiedBenchmarks;
+using benchmark::State;
+
+static int functor_called = 0;
+struct Functor {
+  void operator()(const benchmark::State& /*unused*/) { functor_called++; }
+};
+
+class NullReporter : public BenchmarkReporter {
+ public:
+  bool ReportContext(const Context& /*context*/) override { return true; }
+  void ReportRuns(const std::vector<Run>& /* report */) override {}
+};
+
+class BenchmarkTest : public testing::Test {
+ public:
+  Benchmark* bm;
+  NullReporter null_reporter;
+
+  int setup_calls;
+  int teardown_calls;
+
+  void SetUp() override {
+    setup_calls = 0;
+    teardown_calls = 0;
+    functor_called = 0;
+
+    bm = RegisterBenchmark("BM", [](State& st) {
+      for (auto _ : st) {
+      }
+    });
+    bm->Iterations(1);
+  }
+
+  void TearDown() override { ClearRegisteredBenchmarks(); }
+};
+
+// Test that Setup/Teardown can correctly take lambda expressions
+TEST_F(BenchmarkTest, LambdaTestCopy) {
+  auto setup_lambda = [this](const State&) { setup_calls++; };
+  auto teardown_lambda = [this](const State&) { teardown_calls++; };
+  bm->Setup(setup_lambda);
+  bm->Teardown(teardown_lambda);
+  RunSpecifiedBenchmarks(&null_reporter);
+  EXPECT_EQ(setup_calls, 1);
+  EXPECT_EQ(teardown_calls, 1);
+}
+
+// Test that Setup/Teardown can correctly take lambda expressions
+TEST_F(BenchmarkTest, LambdaTestMove) {
+  auto setup_lambda = [this](const State&) { setup_calls++; };
+  auto teardown_lambda = [this](const State&) { teardown_calls++; };
+  bm->Setup(std::move(setup_lambda));
+  bm->Teardown(std::move(teardown_lambda));
+  RunSpecifiedBenchmarks(&null_reporter);
+  EXPECT_EQ(setup_calls, 1);
+  EXPECT_EQ(teardown_calls, 1);
+}
+
+// Test that Setup/Teardown can correctly take std::function
+TEST_F(BenchmarkTest, CallbackFunctionCopy) {
+  callback_function setup_lambda = [this](const State&) { setup_calls++; };
+  callback_function teardown_lambda = [this](const State&) {
+    teardown_calls++;
+  };
+  bm->Setup(setup_lambda);
+  bm->Teardown(teardown_lambda);
+  RunSpecifiedBenchmarks(&null_reporter);
+  EXPECT_EQ(setup_calls, 1);
+  EXPECT_EQ(teardown_calls, 1);
+}
+
+// Test that Setup/Teardown can correctly take std::function
+TEST_F(BenchmarkTest, CallbackFunctionMove) {
+  callback_function setup_lambda = [this](const State&) { setup_calls++; };
+  callback_function teardown_lambda = [this](const State&) {
+    teardown_calls++;
+  };
+  bm->Setup(std::move(setup_lambda));
+  bm->Teardown(std::move(teardown_lambda));
+  RunSpecifiedBenchmarks(&null_reporter);
+  EXPECT_EQ(setup_calls, 1);
+  EXPECT_EQ(teardown_calls, 1);
+}
+
+// Test that Setup/Teardown can correctly take functors
+TEST_F(BenchmarkTest, FunctorCopy) {
+  Functor func;
+  bm->Setup(func);
+  bm->Teardown(func);
+  RunSpecifiedBenchmarks(&null_reporter);
+  EXPECT_EQ(functor_called, 2);
+}
+
+// Test that Setup/Teardown can correctly take functors
+TEST_F(BenchmarkTest, FunctorMove) {
+  Functor func1;
+  Functor func2;
+  bm->Setup(std::move(func1));
+  bm->Teardown(std::move(func2));
+  RunSpecifiedBenchmarks(&null_reporter);
+  EXPECT_EQ(functor_called, 2);
+}
+
+// Test that Setup/Teardown can not take nullptr
+TEST_F(BenchmarkTest, NullptrTest) {
+#if GTEST_HAS_DEATH_TEST
+  // Tests only runnable in debug mode (when BM_CHECK is enabled).
+#ifndef NDEBUG
+#ifndef TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS
+  EXPECT_DEATH(bm->Setup(nullptr), "setup != nullptr");
+  EXPECT_DEATH(bm->Teardown(nullptr), "teardown != nullptr");
+#else
+  GTEST_SKIP() << "Test skipped because BM_CHECK is disabled";
+#endif
+#endif
+#endif
+}
diff --git a/third-party/benchmark/test/benchmark_setup_teardown_test.cc b/third-party/benchmark/test/benchmark_setup_teardown_test.cc
index 6c3cc2e58fbde..eb45a73e92bd9 100644
--- a/third-party/benchmark/test/benchmark_setup_teardown_test.cc
+++ b/third-party/benchmark/test/benchmark_setup_teardown_test.cc
@@ -2,18 +2,18 @@
 #include <cassert>
 #include <cstdlib>
 #include <cstring>
-#include <iostream>
-#include <limits>
 #include <string>
 
 #include "benchmark/benchmark.h"
 
 // Test that Setup() and Teardown() are called exactly once
 // for each benchmark run (single-threaded).
+namespace {
 namespace singlethreaded {
 static int setup_call = 0;
 static int teardown_call = 0;
 }  // namespace singlethreaded
+}  // namespace
 static void DoSetup1(const benchmark::State& state) {
   ++singlethreaded::setup_call;
 
@@ -40,23 +40,24 @@ BENCHMARK(BM_with_setup)
     ->Teardown(DoTeardown1);
 
 // Test that Setup() and Teardown() are called once for each group of threads.
+namespace {
 namespace concurrent {
 static std::atomic<int> setup_call(0);
 static std::atomic<int> teardown_call(0);
 static std::atomic<int> func_call(0);
 }  // namespace concurrent
 
-static void DoSetup2(const benchmark::State& state) {
+void DoSetup2(const benchmark::State& state) {
   concurrent::setup_call.fetch_add(1, std::memory_order_acquire);
   assert(state.thread_index() == 0);
 }
 
-static void DoTeardown2(const benchmark::State& state) {
+void DoTeardown2(const benchmark::State& state) {
   concurrent::teardown_call.fetch_add(1, std::memory_order_acquire);
   assert(state.thread_index() == 0);
 }
 
-static void BM_concurrent(benchmark::State& state) {
+void BM_concurrent(benchmark::State& state) {
   for (auto s : state) {
   }
   concurrent::func_call.fetch_add(1, std::memory_order_acquire);
@@ -80,7 +81,7 @@ int fixture_setup = 0;
 
 class FIXTURE_BECHMARK_NAME : public ::benchmark::Fixture {
  public:
-  void SetUp(const ::benchmark::State&) override {
+  void SetUp(const ::benchmark::State& /*unused*/) override {
     fixture_interaction::fixture_setup++;
   }
 
@@ -92,7 +93,7 @@ BENCHMARK_F(FIXTURE_BECHMARK_NAME, BM_WithFixture)(benchmark::State& st) {
   }
 }
 
-static void DoSetupWithFixture(const benchmark::State&) {
+void DoSetupWithFixture(const benchmark::State& /*unused*/) {
   fixture_interaction::setup++;
 }
 
@@ -110,10 +111,10 @@ namespace repetitions {
 int setup = 0;
 }
 
-static void DoSetupWithRepetitions(const benchmark::State&) {
+void DoSetupWithRepetitions(const benchmark::State& /*unused*/) {
   repetitions::setup++;
 }
-static void BM_WithRep(benchmark::State& state) {
+void BM_WithRep(benchmark::State& state) {
   for (auto _ : state) {
   }
 }
@@ -126,8 +127,11 @@ BENCHMARK(BM_WithRep)
     ->Setup(DoSetupWithRepetitions)
     ->Iterations(100)
     ->Repetitions(4);
+}  // namespace
 
 int main(int argc, char** argv) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+
   benchmark::Initialize(&argc, argv);
 
   size_t ret = benchmark::RunSpecifiedBenchmarks(".");
diff --git a/third-party/benchmark/test/benchmark_test.cc b/third-party/benchmark/test/benchmark_test.cc
index 8b14017d03a58..49cbfba6f348b 100644
--- a/third-party/benchmark/test/benchmark_test.cc
+++ b/third-party/benchmark/test/benchmark_test.cc
@@ -8,10 +8,9 @@
 #include <complex>
 #include <cstdlib>
 #include <iostream>
-#include <limits>
 #include <list>
-#include <map>
 #include <mutex>
+#include <optional>
 #include <set>
 #include <sstream>
 #include <string>
@@ -44,18 +43,22 @@ double CalculatePi(int depth) {
 
 std::set<int64_t> ConstructRandomSet(int64_t size) {
   std::set<int64_t> s;
-  for (int i = 0; i < size; ++i) s.insert(s.end(), i);
+  for (int i = 0; i < size; ++i) {
+    s.insert(s.end(), i);
+  }
   return s;
 }
 
+// NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables)
 std::mutex test_vector_mu;
-std::vector<int>* test_vector = nullptr;
-
-}  // end namespace
+std::optional<std::vector<int>> test_vector;
+// NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables)
 
-static void BM_Factorial(benchmark::State& state) {
+void BM_Factorial(benchmark::State& state) {
   int fac_42 = 0;
-  for (auto _ : state) fac_42 = Factorial(8);
+  for (auto _ : state) {
+    fac_42 = Factorial(8);
+  }
   // Prevent compiler optimizations
   std::stringstream ss;
   ss << fac_42;
@@ -64,16 +67,18 @@ static void BM_Factorial(benchmark::State& state) {
 BENCHMARK(BM_Factorial);
 BENCHMARK(BM_Factorial)->UseRealTime();
 
-static void BM_CalculatePiRange(benchmark::State& state) {
+void BM_CalculatePiRange(benchmark::State& state) {
   double pi = 0.0;
-  for (auto _ : state) pi = CalculatePi(static_cast<int>(state.range(0)));
+  for (auto _ : state) {
+    pi = CalculatePi(static_cast<int>(state.range(0)));
+  }
   std::stringstream ss;
   ss << pi;
   state.SetLabel(ss.str());
 }
 BENCHMARK_RANGE(BM_CalculatePiRange, 1, 1024 * 1024);
 
-static void BM_CalculatePi(benchmark::State& state) {
+void BM_CalculatePi(benchmark::State& state) {
   static const int depth = 1024;
   for (auto _ : state) {
     double pi = CalculatePi(static_cast<int>(depth));
@@ -84,13 +89,15 @@ BENCHMARK(BM_CalculatePi)->Threads(8);
 BENCHMARK(BM_CalculatePi)->ThreadRange(1, 32);
 BENCHMARK(BM_CalculatePi)->ThreadPerCpu();
 
-static void BM_SetInsert(benchmark::State& state) {
+void BM_SetInsert(benchmark::State& state) {
   std::set<int64_t> data;
   for (auto _ : state) {
     state.PauseTiming();
     data = ConstructRandomSet(state.range(0));
     state.ResumeTiming();
-    for (int j = 0; j < state.range(1); ++j) data.insert(rand());
+    for (int j = 0; j < state.range(1); ++j) {
+      data.insert(rand());
+    }
   }
   state.SetItemsProcessed(state.iterations() * state.range(1));
   state.SetBytesProcessed(state.iterations() * state.range(1) *
@@ -104,11 +111,13 @@ BENCHMARK(BM_SetInsert)->Ranges({{1 << 10, 8 << 10}, {128, 512}});
 
 template <typename Container,
           typename ValueType = typename Container::value_type>
-static void BM_Sequential(benchmark::State& state) {
+void BM_Sequential(benchmark::State& state) {
   ValueType v = 42;
   for (auto _ : state) {
     Container c;
-    for (int64_t i = state.range(0); --i;) c.push_back(v);
+    for (int64_t i = state.range(0); --i;) {
+      c.push_back(v);
+    }
   }
   const int64_t items_processed = state.iterations() * state.range(0);
   state.SetItemsProcessed(items_processed);
@@ -118,11 +127,9 @@ BENCHMARK_TEMPLATE2(BM_Sequential, std::vector<int>, int)
     ->Range(1 << 0, 1 << 10);
 BENCHMARK_TEMPLATE(BM_Sequential, std::list<int>)->Range(1 << 0, 1 << 10);
 // Test the variadic version of BENCHMARK_TEMPLATE in C++11 and beyond.
-#ifdef BENCHMARK_HAS_CXX11
 BENCHMARK_TEMPLATE(BM_Sequential, std::vector<int>, int)->Arg(512);
-#endif
 
-static void BM_StringCompare(benchmark::State& state) {
+void BM_StringCompare(benchmark::State& state) {
   size_t len = static_cast<size_t>(state.range(0));
   std::string s1(len, '-');
   std::string s2(len, '-');
@@ -133,43 +140,45 @@ static void BM_StringCompare(benchmark::State& state) {
 }
 BENCHMARK(BM_StringCompare)->Range(1, 1 << 20);
 
-static void BM_SetupTeardown(benchmark::State& state) {
+void BM_SetupTeardown(benchmark::State& state) {
   if (state.thread_index() == 0) {
     // No need to lock test_vector_mu here as this is running single-threaded.
-    test_vector = new std::vector<int>();
+    test_vector = std::vector<int>();
   }
   int i = 0;
   for (auto _ : state) {
     std::lock_guard<std::mutex> l(test_vector_mu);
-    if (i % 2 == 0)
+    if (i % 2 == 0) {
       test_vector->push_back(i);
-    else
+    } else {
       test_vector->pop_back();
+    }
     ++i;
   }
   if (state.thread_index() == 0) {
-    delete test_vector;
+    test_vector.reset();
   }
 }
 BENCHMARK(BM_SetupTeardown)->ThreadPerCpu();
 
-static void BM_LongTest(benchmark::State& state) {
+void BM_LongTest(benchmark::State& state) {
   double tracker = 0.0;
   for (auto _ : state) {
-    for (int i = 0; i < state.range(0); ++i)
+    for (int i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(tracker += i);
+    }
   }
 }
 BENCHMARK(BM_LongTest)->Range(1 << 16, 1 << 28);
 
-static void BM_ParallelMemset(benchmark::State& state) {
+void BM_ParallelMemset(benchmark::State& state) {
   int64_t size = state.range(0) / static_cast<int64_t>(sizeof(int));
   int thread_size = static_cast<int>(size) / state.threads();
   int from = thread_size * state.thread_index();
   int to = from + thread_size;
 
   if (state.thread_index() == 0) {
-    test_vector = new std::vector<int>(static_cast<size_t>(size));
+    test_vector = std::vector<int>(static_cast<size_t>(size));
   }
 
   for (auto _ : state) {
@@ -181,12 +190,12 @@ static void BM_ParallelMemset(benchmark::State& state) {
   }
 
   if (state.thread_index() == 0) {
-    delete test_vector;
+    test_vector.reset();
   }
 }
 BENCHMARK(BM_ParallelMemset)->Arg(10 << 20)->ThreadRange(1, 4);
 
-static void BM_ManualTiming(benchmark::State& state) {
+void BM_ManualTiming(benchmark::State& state) {
   int64_t slept_for = 0;
   int64_t microseconds = state.range(0);
   std::chrono::duration<double, std::micro> sleep_duration{
@@ -210,8 +219,6 @@ static void BM_ManualTiming(benchmark::State& state) {
 BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseRealTime();
 BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseManualTime();
 
-#ifdef BENCHMARK_HAS_CXX11
-
 template <class... Args>
 void BM_with_args(benchmark::State& state, Args&&...) {
   for (auto _ : state) {
@@ -252,9 +259,7 @@ void BM_template1_capture(benchmark::State& state, ExtraArgs&&... extra_args) {
 BENCHMARK_TEMPLATE1_CAPTURE(BM_template1_capture, void, foo, 24UL);
 BENCHMARK_CAPTURE(BM_template1_capture<void>, foo, 24UL);
 
-#endif  // BENCHMARK_HAS_CXX11
-
-static void BM_DenseThreadRanges(benchmark::State& st) {
+void BM_DenseThreadRanges(benchmark::State& st) {
   switch (st.range(0)) {
     case 1:
       assert(st.threads() == 1 || st.threads() == 2 || st.threads() == 3);
@@ -276,7 +281,7 @@ BENCHMARK(BM_DenseThreadRanges)->Arg(1)->DenseThreadRange(1, 3);
 BENCHMARK(BM_DenseThreadRanges)->Arg(2)->DenseThreadRange(1, 4, 2);
 BENCHMARK(BM_DenseThreadRanges)->Arg(3)->DenseThreadRange(5, 14, 3);
 
-static void BM_BenchmarkName(benchmark::State& state) {
+void BM_BenchmarkName(benchmark::State& state) {
   for (auto _ : state) {
   }
 
@@ -287,14 +292,15 @@ BENCHMARK(BM_BenchmarkName);
 
 // regression test for #1446
 template <typename type>
-static void BM_templated_test(benchmark::State& state) {
+void BM_templated_test(benchmark::State& state) {
   for (auto _ : state) {
     type created_string;
     benchmark::DoNotOptimize(created_string);
   }
 }
 
-static auto BM_templated_test_double = BM_templated_test<std::complex<double>>;
+const auto BM_templated_test_double = BM_templated_test<std::complex<double>>;
 BENCHMARK(BM_templated_test_double);
+}  // end namespace
 
 BENCHMARK_MAIN();
diff --git a/third-party/benchmark/test/complexity_test.cc b/third-party/benchmark/test/complexity_test.cc
index fb4ad1ad53a98..8cf17f41d39ea 100644
--- a/third-party/benchmark/test/complexity_test.cc
+++ b/third-party/benchmark/test/complexity_test.cc
@@ -1,5 +1,4 @@
 #undef NDEBUG
-#include <algorithm>
 #include <cassert>
 #include <cmath>
 #include <cstdlib>
@@ -11,12 +10,12 @@
 namespace {
 
 #define ADD_COMPLEXITY_CASES(...) \
-  int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
+  const int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
 
-int AddComplexityTest(const std::string &test_name,
-                      const std::string &big_o_test_name,
-                      const std::string &rms_test_name,
-                      const std::string &big_o, int family_index) {
+int AddComplexityTest(const std::string& test_name,
+                      const std::string& big_o_test_name,
+                      const std::string& rms_test_name,
+                      const std::string& big_o, int family_index) {
   SetSubstitutions({{"%name", test_name},
                     {"%bigo_name", big_o_test_name},
                     {"%rms_name", rms_test_name},
@@ -61,21 +60,19 @@ int AddComplexityTest(const std::string &test_name,
   return 0;
 }
 
-}  // end namespace
-
 // ========================================================================= //
 // --------------------------- Testing BigO O(1) --------------------------- //
 // ========================================================================= //
 
-void BM_Complexity_O1(benchmark::State &state) {
+void BM_Complexity_O1(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
     benchmark::DoNotOptimize(state.iterations());
-    long tmp = state.iterations();
+    double tmp = static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(tmp);
     for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
       benchmark::DoNotOptimize(state.iterations());
-      tmp *= state.iterations();
+      tmp *= static_cast<double>(state.iterations());
       benchmark::DoNotOptimize(tmp);
     }
 
@@ -94,11 +91,11 @@ BENCHMARK(BM_Complexity_O1)
     ->UseManualTime()
     ->Complexity([](benchmark::IterationCount) { return 1.0; });
 
-const char *one_test_name = "BM_Complexity_O1/manual_time";
-const char *big_o_1_test_name = "BM_Complexity_O1/manual_time_BigO";
-const char *rms_o_1_test_name = "BM_Complexity_O1/manual_time_RMS";
-const char *enum_auto_big_o_1 = "\\([0-9]+\\)";
-const char *lambda_big_o_1 = "f\\(N\\)";
+constexpr char one_test_name[] = "BM_Complexity_O1/manual_time";
+constexpr char big_o_1_test_name[] = "BM_Complexity_O1/manual_time_BigO";
+constexpr char rms_o_1_test_name[] = "BM_Complexity_O1/manual_time_RMS";
+constexpr char enum_auto_big_o_1[] = "\\([0-9]+\\)";
+constexpr char lambda_big_o_1[] = "f\\(N\\)";
 
 // Add enum tests
 ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
@@ -116,20 +113,20 @@ ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
 // --------------------------- Testing BigO O(N) --------------------------- //
 // ========================================================================= //
 
-void BM_Complexity_O_N(benchmark::State &state) {
+void BM_Complexity_O_N(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
     benchmark::DoNotOptimize(state.iterations());
-    long tmp = state.iterations();
+    double tmp = static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(tmp);
     for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
       benchmark::DoNotOptimize(state.iterations());
-      tmp *= state.iterations();
+      tmp *= static_cast<double>(state.iterations());
       benchmark::DoNotOptimize(tmp);
     }
 
     // 1ns per iteration per entry
-    state.SetIterationTime(static_cast<double>(state.range(0)) * 42.0 * 1e-9);
+    state.SetIterationTime(static_cast<double>(state.range(0)) * 42 * 1e-9);
   }
   state.SetComplexityN(state.range(0));
 }
@@ -151,11 +148,11 @@ BENCHMARK(BM_Complexity_O_N)
       return static_cast<double>(n);
     });
 
-const char *n_test_name = "BM_Complexity_O_N/manual_time";
-const char *big_o_n_test_name = "BM_Complexity_O_N/manual_time_BigO";
-const char *rms_o_n_test_name = "BM_Complexity_O_N/manual_time_RMS";
-const char *enum_auto_big_o_n = "N";
-const char *lambda_big_o_n = "f\\(N\\)";
+constexpr char n_test_name[] = "BM_Complexity_O_N/manual_time";
+constexpr char big_o_n_test_name[] = "BM_Complexity_O_N/manual_time_BigO";
+constexpr char rms_o_n_test_name[] = "BM_Complexity_O_N/manual_time_RMS";
+constexpr char enum_auto_big_o_n[] = "N";
+constexpr char lambda_big_o_n[] = "f\\(N\\)";
 
 // Add enum tests
 ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
@@ -173,21 +170,21 @@ ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
 // ------------------------- Testing BigO O(NlgN) ------------------------- //
 // ========================================================================= //
 
-static const double kLog2E = 1.44269504088896340736;
-static void BM_Complexity_O_N_log_N(benchmark::State &state) {
+const double kLog2E = 1.44269504088896340736;
+void BM_Complexity_O_N_log_N(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
     benchmark::DoNotOptimize(state.iterations());
-    long tmp = state.iterations();
+    double tmp = static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(tmp);
     for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
       benchmark::DoNotOptimize(state.iterations());
-      tmp *= state.iterations();
+      tmp *= static_cast<double>(state.iterations());
       benchmark::DoNotOptimize(tmp);
     }
 
     state.SetIterationTime(static_cast<double>(state.range(0)) * kLog2E *
-                           std::log(state.range(0)) * 42.0 * 1e-9);
+                           std::log(state.range(0)) * 42 * 1e-9);
   }
   state.SetComplexityN(state.range(0));
 }
@@ -209,11 +206,13 @@ BENCHMARK(BM_Complexity_O_N_log_N)
       return kLog2E * static_cast<double>(n) * std::log(static_cast<double>(n));
     });
 
-const char *n_lg_n_test_name = "BM_Complexity_O_N_log_N/manual_time";
-const char *big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N/manual_time_BigO";
-const char *rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N/manual_time_RMS";
-const char *enum_auto_big_o_n_lg_n = "NlgN";
-const char *lambda_big_o_n_lg_n = "f\\(N\\)";
+constexpr char n_lg_n_test_name[] = "BM_Complexity_O_N_log_N/manual_time";
+constexpr char big_o_n_lg_n_test_name[] =
+    "BM_Complexity_O_N_log_N/manual_time_BigO";
+constexpr char rms_o_n_lg_n_test_name[] =
+    "BM_Complexity_O_N_log_N/manual_time_RMS";
+constexpr char enum_auto_big_o_n_lg_n[] = "NlgN";
+constexpr char lambda_big_o_n_lg_n[] = "f\\(N\\)";
 
 // Add enum tests
 ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
@@ -234,19 +233,19 @@ ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
 // -------- Testing formatting of Complexity with captured args ------------ //
 // ========================================================================= //
 
-void BM_ComplexityCaptureArgs(benchmark::State &state, int n) {
+void BM_ComplexityCaptureArgs(benchmark::State& state, int n) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
     benchmark::DoNotOptimize(state.iterations());
-    long tmp = state.iterations();
+    double tmp = static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(tmp);
     for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
       benchmark::DoNotOptimize(state.iterations());
-      tmp *= state.iterations();
+      tmp *= static_cast<double>(state.iterations());
       benchmark::DoNotOptimize(tmp);
     }
 
-    state.SetIterationTime(static_cast<double>(state.range(0)) * 42.0 * 1e-9);
+    state.SetIterationTime(static_cast<double>(state.range(0)) * 42 * 1e-9);
   }
   state.SetComplexityN(n);
 }
@@ -262,9 +261,13 @@ const std::string complexity_capture_name =
 ADD_COMPLEXITY_CASES(complexity_capture_name, complexity_capture_name + "_BigO",
                      complexity_capture_name + "_RMS", "N",
                      /*family_index=*/9);
+}  // end namespace
 
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
 // ========================================================================= //
 
-int main(int argc, char *argv[]) { RunOutputTests(argc, argv); }
+int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+  RunOutputTests(argc, argv);
+}
diff --git a/third-party/benchmark/test/cxx03_test.cc b/third-party/benchmark/test/cxx03_test.cc
deleted file mode 100644
index 9711c1bd4a9b3..0000000000000
--- a/third-party/benchmark/test/cxx03_test.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-#undef NDEBUG
-#include <cassert>
-#include <cstddef>
-
-#include "benchmark/benchmark.h"
-
-#if __cplusplus >= 201103L
-#error C++11 or greater detected. Should be C++03.
-#endif
-
-#ifdef BENCHMARK_HAS_CXX11
-#error C++11 or greater detected by the library. BENCHMARK_HAS_CXX11 is defined.
-#endif
-
-void BM_empty(benchmark::State& state) {
-  while (state.KeepRunning()) {
-    volatile benchmark::IterationCount x = state.iterations();
-    ((void)x);
-  }
-}
-BENCHMARK(BM_empty);
-
-// The new C++11 interface for args/ranges requires initializer list support.
-// Therefore we provide the old interface to support C++03.
-void BM_old_arg_range_interface(benchmark::State& state) {
-  assert((state.range(0) == 1 && state.range(1) == 2) ||
-         (state.range(0) == 5 && state.range(1) == 6));
-  while (state.KeepRunning()) {
-  }
-}
-BENCHMARK(BM_old_arg_range_interface)->ArgPair(1, 2)->RangePair(5, 5, 6, 6);
-
-template <class T, class U>
-void BM_template2(benchmark::State& state) {
-  BM_empty(state);
-}
-BENCHMARK_TEMPLATE2(BM_template2, int, long);
-
-template <class T>
-void BM_template1(benchmark::State& state) {
-  BM_empty(state);
-}
-BENCHMARK_TEMPLATE(BM_template1, long);
-BENCHMARK_TEMPLATE1(BM_template1, int);
-
-template <class T>
-struct BM_Fixture : public ::benchmark::Fixture {};
-
-BENCHMARK_TEMPLATE_F(BM_Fixture, BM_template1, long)(benchmark::State& state) {
-  BM_empty(state);
-}
-BENCHMARK_TEMPLATE1_F(BM_Fixture, BM_template2, int)(benchmark::State& state) {
-  BM_empty(state);
-}
-
-void BM_counters(benchmark::State& state) {
-  BM_empty(state);
-  state.counters["Foo"] = 2;
-}
-BENCHMARK(BM_counters);
-
-BENCHMARK_MAIN();
diff --git a/third-party/benchmark/test/cxx11_test.cc b/third-party/benchmark/test/cxx11_test.cc
new file mode 100644
index 0000000000000..db1a993343617
--- /dev/null
+++ b/third-party/benchmark/test/cxx11_test.cc
@@ -0,0 +1,12 @@
+#include "benchmark/benchmark.h"
+
+#if defined(_MSC_VER)
+#if _MSVC_LANG != 201402L
+// MSVC, even in C++11 mode, does not claim to be in C++11 mode.
+#error "Trying to compile C++11 test with wrong C++ standard"
+#endif  //  _MSVC_LANG
+#else   // Non-MSVC
+#if __cplusplus != 201103L
+#error "Trying to compile C++11 test with wrong C++ standard"
+#endif  // Non-MSVC
+#endif
diff --git a/third-party/benchmark/test/diagnostics_test.cc b/third-party/benchmark/test/diagnostics_test.cc
index 7c68a98929d85..e8d7d9119a338 100644
--- a/third-party/benchmark/test/diagnostics_test.cc
+++ b/third-party/benchmark/test/diagnostics_test.cc
@@ -17,6 +17,7 @@
 #define TEST_HAS_NO_EXCEPTIONS
 #endif
 
+namespace {
 void TestHandler() {
 #ifndef TEST_HAS_NO_EXCEPTIONS
   throw std::logic_error("");
@@ -46,14 +47,19 @@ void try_invalid_pause_resume(benchmark::State& state) {
 void BM_diagnostic_test(benchmark::State& state) {
   static bool called_once = false;
 
-  if (called_once == false) try_invalid_pause_resume(state);
+  if (!called_once) {
+    try_invalid_pause_resume(state);
+  }
 
   for (auto _ : state) {
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
 
-  if (called_once == false) try_invalid_pause_resume(state);
+  if (!called_once) {
+    try_invalid_pause_resume(state);
+  }
 
   called_once = true;
 }
@@ -62,28 +68,35 @@ BENCHMARK(BM_diagnostic_test);
 void BM_diagnostic_test_keep_running(benchmark::State& state) {
   static bool called_once = false;
 
-  if (called_once == false) try_invalid_pause_resume(state);
+  if (!called_once) {
+    try_invalid_pause_resume(state);
+  }
 
   while (state.KeepRunning()) {
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
 
-  if (called_once == false) try_invalid_pause_resume(state);
+  if (!called_once) {
+    try_invalid_pause_resume(state);
+  }
 
   called_once = true;
 }
 BENCHMARK(BM_diagnostic_test_keep_running);
+}  // end namespace
 
 int main(int argc, char* argv[]) {
 #ifdef NDEBUG
   // This test is exercising functionality for debug builds, which are not
   // available in release builds. Skip the test if we are in that environment
   // to avoid a test failure.
-  std::cout << "Diagnostic test disabled in release build" << std::endl;
+  std::cout << "Diagnostic test disabled in release build\n";
   (void)argc;
   (void)argv;
 #else
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
   benchmark::internal::GetAbortHandler() = &TestHandler;
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
diff --git a/third-party/benchmark/test/display_aggregates_only_test.cc b/third-party/benchmark/test/display_aggregates_only_test.cc
index 6ad65e7f516a6..bae97593acaa8 100644
--- a/third-party/benchmark/test/display_aggregates_only_test.cc
+++ b/third-party/benchmark/test/display_aggregates_only_test.cc
@@ -10,13 +10,17 @@
 // reporter in the presence of DisplayAggregatesOnly().
 // We do not care about console output, the normal tests check that already.
 
+namespace {
 void BM_SummaryRepeat(benchmark::State& state) {
   for (auto _ : state) {
   }
 }
 BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->DisplayAggregatesOnly();
+}  // end namespace
 
 int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+
   const std::string output = GetFileReporterOutput(argc, argv);
 
   if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 7 ||
diff --git a/third-party/benchmark/test/donotoptimize_assembly_test.cc b/third-party/benchmark/test/donotoptimize_assembly_test.cc
index dc286f53e20f6..1f817e02bb1dc 100644
--- a/third-party/benchmark/test/donotoptimize_assembly_test.cc
+++ b/third-party/benchmark/test/donotoptimize_assembly_test.cc
@@ -2,6 +2,7 @@
 
 #ifdef __clang__
 #pragma clang diagnostic ignored "-Wreturn-type"
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
 #endif
 BENCHMARK_DISABLE_DEPRECATED_WARNING
 
@@ -19,7 +20,7 @@ inline int Add42(int x) { return x + 42; }
 struct NotTriviallyCopyable {
   NotTriviallyCopyable();
   explicit NotTriviallyCopyable(int x) : value(x) {}
-  NotTriviallyCopyable(NotTriviallyCopyable const &);
+  NotTriviallyCopyable(NotTriviallyCopyable const&);
   int value;
 };
 
@@ -185,7 +186,7 @@ extern "C" void test_pointer_const_lvalue() {
   // CHECK-CLANG: movq %rax, -{{[0-9]+}}(%[[REG:[a-z]+]])
   // CHECK: ret
   int x = 42;
-  int *const xp = &x;
+  int* const xp = &x;
   benchmark::DoNotOptimize(xp);
 }
 
@@ -196,6 +197,6 @@ extern "C" void test_pointer_lvalue() {
   // CHECK-CLANG: movq %rax, -{{[0-9]+}}(%[[REG:[a-z+]+]])
   // CHECK: ret
   int x = 42;
-  int *xp = &x;
+  int* xp = &x;
   benchmark::DoNotOptimize(xp);
 }
diff --git a/third-party/benchmark/test/donotoptimize_test.cc b/third-party/benchmark/test/donotoptimize_test.cc
index 04ec9386a3b40..7571cf445e0e6 100644
--- a/third-party/benchmark/test/donotoptimize_test.cc
+++ b/third-party/benchmark/test/donotoptimize_test.cc
@@ -4,7 +4,7 @@
 
 namespace {
 #if defined(__GNUC__)
-std::int64_t double_up(const std::int64_t x) __attribute__((const));
+std::int64_t double_up(std::int64_t x) __attribute__((const));
 #endif
 std::int64_t double_up(const std::int64_t x) { return x * 2; }
 }  // namespace
@@ -26,7 +26,9 @@ struct BitRef {
   BitRef(int i, unsigned char& b) : index(i), byte(b) {}
 };
 
-int main(int, char*[]) {
+int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+
   // this test verifies compilation of DoNotOptimize() for some types
 
   char buffer1[1] = "";
@@ -62,8 +64,6 @@ int main(int, char*[]) {
   BitRef lval = BitRef::Make();
   benchmark::DoNotOptimize(lval);
 
-#ifdef BENCHMARK_HAS_CXX11
   // Check that accept rvalue.
   benchmark::DoNotOptimize(BitRef::Make());
-#endif
 }
diff --git a/third-party/benchmark/test/filter_test.cc b/third-party/benchmark/test/filter_test.cc
index 4c8b8ea488ad0..8c150eb2de97f 100644
--- a/third-party/benchmark/test/filter_test.cc
+++ b/third-party/benchmark/test/filter_test.cc
@@ -37,43 +37,45 @@ class TestReporter : public benchmark::ConsoleReporter {
   mutable int64_t max_family_index_;
 };
 
-}  // end namespace
-
-static void NoPrefix(benchmark::State& state) {
+void NoPrefix(benchmark::State& state) {
   for (auto _ : state) {
   }
 }
 BENCHMARK(NoPrefix);
 
-static void BM_Foo(benchmark::State& state) {
+void BM_Foo(benchmark::State& state) {
   for (auto _ : state) {
   }
 }
 BENCHMARK(BM_Foo);
 
-static void BM_Bar(benchmark::State& state) {
+void BM_Bar(benchmark::State& state) {
   for (auto _ : state) {
   }
 }
 BENCHMARK(BM_Bar);
 
-static void BM_FooBar(benchmark::State& state) {
+void BM_FooBar(benchmark::State& state) {
   for (auto _ : state) {
   }
 }
 BENCHMARK(BM_FooBar);
 
-static void BM_FooBa(benchmark::State& state) {
+void BM_FooBa(benchmark::State& state) {
   for (auto _ : state) {
   }
 }
 BENCHMARK(BM_FooBa);
+}  // end namespace
 
 int main(int argc, char** argv) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+
   bool list_only = false;
-  for (int i = 0; i < argc; ++i)
+  for (int i = 0; i < argc; ++i) {
     list_only |= std::string(argv[i]).find("--benchmark_list_tests") !=
                  std::string::npos;
+  }
 
   benchmark::Initialize(&argc, argv);
 
@@ -84,13 +86,13 @@ int main(int argc, char** argv) {
   if (argc == 2) {
     // Make sure we ran all of the tests
     std::stringstream ss(argv[1]);
-    int64_t expected_return;
+    int64_t expected_return = 0;
     ss >> expected_return;
 
     if (returned_count != expected_return) {
       std::cerr << "ERROR: Expected " << expected_return
                 << " tests to match the filter but returned_count = "
-                << returned_count << std::endl;
+                << returned_count << '\n';
       return -1;
     }
 
@@ -99,7 +101,7 @@ int main(int argc, char** argv) {
     if (reports_count != expected_reports) {
       std::cerr << "ERROR: Expected " << expected_reports
                 << " tests to be run but reported_count = " << reports_count
-                << std::endl;
+                << '\n';
       return -1;
     }
 
@@ -108,7 +110,7 @@ int main(int argc, char** argv) {
     if (num_families != expected_reports) {
       std::cerr << "ERROR: Expected " << expected_reports
                 << " test families to be run but num_families = "
-                << num_families << std::endl;
+                << num_families << '\n';
       return -1;
     }
   }
diff --git a/third-party/benchmark/test/internal_threading_test.cc b/third-party/benchmark/test/internal_threading_test.cc
index 62b5b955a9f5d..c57bf44b0c343 100644
--- a/third-party/benchmark/test/internal_threading_test.cc
+++ b/third-party/benchmark/test/internal_threading_test.cc
@@ -8,8 +8,9 @@
 #include "benchmark/benchmark.h"
 #include "output_test.h"
 
-static const std::chrono::duration<double, std::milli> time_frame(50);
-static const double time_frame_in_sec(
+namespace {
+const std::chrono::duration<double, std::milli> time_frame(50);
+const double time_frame_in_sec(
     std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(
         time_frame)
         .count());
@@ -22,8 +23,9 @@ void MyBusySpinwait() {
     const auto elapsed = now - start;
 
     if (std::chrono::duration<double, std::chrono::seconds::period>(elapsed) >=
-        time_frame)
+        time_frame) {
       return;
+    }
   }
 }
 
@@ -177,9 +179,14 @@ BENCHMARK(BM_MainThreadAndWorkerThread)
     ->Threads(2)
     ->MeasureProcessCPUTime()
     ->UseManualTime();
+}  // end namespace
 
 // ========================================================================= //
 // ---------------------------- TEST CASES END ----------------------------- //
 // ========================================================================= //
 
-int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
+int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+
+  RunOutputTests(argc, argv);
+}
diff --git a/third-party/benchmark/test/link_main_test.cc b/third-party/benchmark/test/link_main_test.cc
index 131937eebca9d..41dbac9ab0faf 100644
--- a/third-party/benchmark/test/link_main_test.cc
+++ b/third-party/benchmark/test/link_main_test.cc
@@ -1,9 +1,12 @@
 #include "benchmark/benchmark.h"
 
+namespace {
 void BM_empty(benchmark::State& state) {
   for (auto _ : state) {
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
 }
 BENCHMARK(BM_empty);
+}  // end namespace
diff --git a/third-party/benchmark/test/locale_impermeability_test.cc b/third-party/benchmark/test/locale_impermeability_test.cc
new file mode 100644
index 0000000000000..e2dd6cfd9d750
--- /dev/null
+++ b/third-party/benchmark/test/locale_impermeability_test.cc
@@ -0,0 +1,53 @@
+#undef NDEBUG
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <locale>
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+namespace {
+// Installs "en_US.UTF-8" as the global C++ locale and then reports a fixed
+// manual time per iteration; the ADD_CASES expectations below pin the
+// reporter output expected for this benchmark.
+void BM_ostream(benchmark::State& state) {
+#if !defined(__MINGW64__) || defined(__clang__)
+  // GCC-based versions of MINGW64 do not support locale manipulations,
+  // don't run the test under them.
+  // NOTE(review): std::locale's name constructor throws std::runtime_error
+  // when the named locale is not installed on the host.
+  std::locale::global(std::locale("en_US.UTF-8"));
+#endif
+  while (state.KeepRunning()) {
+    state.SetIterationTime(1e-6);
+  }
+}
+BENCHMARK(BM_ostream)->UseManualTime()->Iterations(1000000);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_ostream/iterations:1000000/manual_time"
+                           " %console_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ostream/iterations:1000000/manual_time\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": "
+            "\"BM_ostream/iterations:1000000/manual_time\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": 1000000,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ostream/iterations:1000000/"
+                       "manual_time\",1000000,%float,%float,ns,,,,,$"}});
+}  // end namespace
+
+int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+  RunOutputTests(argc, argv);
+}
diff --git a/third-party/benchmark/test/manual_threading_test.cc b/third-party/benchmark/test/manual_threading_test.cc
new file mode 100644
index 0000000000000..b3252ec16ea83
--- /dev/null
+++ b/third-party/benchmark/test/manual_threading_test.cc
@@ -0,0 +1,185 @@
+#undef NDEBUG
+
+#include <cassert>
+#include <chrono>
+#include <cstddef>
+#include <functional>
+#include <memory>
+#include <thread>
+#include <vector>
+
+#include "../src/timers.h"
+#include "benchmark/benchmark.h"
+
+namespace {
+
+// Length of one busy-wait, kept both as a duration and as plain seconds.
+const std::chrono::duration<double, std::milli> time_frame(50);
+const double time_frame_in_sec(
+    std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(
+        time_frame)
+        .count());
+
+// Spins until at least `time_frame` has elapsed since the call started.
+void MyBusySpinwait() {
+  const auto start = benchmark::ChronoClockNow();
+
+  while (true) {
+    const auto now = benchmark::ChronoClockNow();
+    const auto elapsed = now - start;
+
+    if (std::chrono::duration<double, std::chrono::seconds::period>(elapsed) >=
+        time_frame) {
+      return;
+    }
+  }
+}
+
+// Incremented once per completed ManualThreadRunner::RunThreads() call;
+// main() asserts that it ended up non-zero.
+int numRunThreadsCalled_ = 0;
+
+// Thread runner that spawns `num_threads - 1` std::threads for indices
+// 1..num_threads-1, runs index 0 on the calling thread, then joins them all.
+class ManualThreadRunner : public benchmark::ThreadRunnerBase {
+ public:
+  explicit ManualThreadRunner(int num_threads)
+      : pool(static_cast<size_t>(num_threads - 1)) {}
+
+  void RunThreads(const std::function<void(int)>& fn) final {
+    for (std::size_t ti = 0; ti < pool.size(); ++ti) {
+      pool[ti] = std::thread(fn, static_cast<int>(ti + 1));
+    }
+
+    fn(0);
+
+    for (std::thread& thread : pool) {
+      thread.join();
+    }
+
+    ++numRunThreadsCalled_;
+  }
+
+ private:
+  std::vector<std::thread> pool;
+};
+
+// ========================================================================= //
+// --------------------------- TEST CASES BEGIN ---------------------------- //
+// ========================================================================= //
+
+// ========================================================================= //
+// BM_ManualThreading
+// Creation of threads is done before the start of the measurement,
+// joining after the finish of the measurement.
+void BM_ManualThreading(benchmark::State& state) {
+  for (auto _ : state) {
+    MyBusySpinwait();
+    state.SetIterationTime(time_frame_in_sec);
+  }
+  state.counters["invtime"] =
+      benchmark::Counter{1, benchmark::Counter::kIsRate};
+}
+
+}  // end namespace
+
+BENCHMARK(BM_ManualThreading)
+    ->Iterations(1)
+    ->ThreadRunner([](int num_threads) {
+      return std::make_unique<ManualThreadRunner>(num_threads);
+    })
+    ->Threads(1);
+BENCHMARK(BM_ManualThreading)
+    ->Iterations(1)
+    ->ThreadRunner([](int num_threads) {
+      return std::make_unique<ManualThreadRunner>(num_threads);
+    })
+    ->Threads(1)
+    ->UseRealTime();
+BENCHMARK(BM_ManualThreading)
+    ->Iterations(1)
+    ->ThreadRunner([](int num_threads) {
+      return std::make_unique<ManualThreadRunner>(num_threads);
+    })
+    ->Threads(1)
+    ->UseManualTime();
+BENCHMARK(BM_ManualThreading)
+    ->Iterations(1)
+    ->ThreadRunner([](int num_threads) {
+      return std::make_unique<ManualThreadRunner>(num_threads);
+    })
+    ->Threads(1)
+    ->MeasureProcessCPUTime();
+BENCHMARK(BM_ManualThreading)
+    ->Iterations(1)
+    ->ThreadRunner([](int num_threads) {
+      return std::make_unique<ManualThreadRunner>(num_threads);
+    })
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_ManualThreading)
+    ->Iterations(1)
+    ->ThreadRunner([](int num_threads) {
+      return std::make_unique<ManualThreadRunner>(num_threads);
+    })
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+BENCHMARK(BM_ManualThreading)
+    ->Iterations(1)
+    ->ThreadRunner([](int num_threads) {
+      return std::make_unique<ManualThreadRunner>(num_threads);
+    })
+    ->Threads(2);
+BENCHMARK(BM_ManualThreading)
+    ->Iterations(1)
+    ->ThreadRunner([](int num_threads) {
+      return std::make_unique<ManualThreadRunner>(num_threads);
+    })
+    ->Threads(2)
+    ->UseRealTime();
+BENCHMARK(BM_ManualThreading)
+    ->Iterations(1)
+    ->ThreadRunner([](int num_threads) {
+      return std::make_unique<ManualThreadRunner>(num_threads);
+    })
+    ->Threads(2)
+    ->UseManualTime();
+BENCHMARK(BM_ManualThreading)
+    ->Iterations(1)
+    ->ThreadRunner([](int num_threads) {
+      return std::make_unique<ManualThreadRunner>(num_threads);
+    })
+    ->Threads(2)
+    ->MeasureProcessCPUTime();
+BENCHMARK(BM_ManualThreading)
+    ->Iterations(1)
+    ->ThreadRunner([](int num_threads) {
+      return std::make_unique<ManualThreadRunner>(num_threads);
+    })
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_ManualThreading)
+    ->Iterations(1)
+    ->ThreadRunner([](int num_threads) {
+      return std::make_unique<ManualThreadRunner>(num_threads);
+    })
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+// ========================================================================= //
+// ---------------------------- TEST CASES END ----------------------------- //
+// ========================================================================= //
+
+// Runs the benchmarks, then checks that ManualThreadRunner was used.
+int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+  assert(numRunThreadsCalled_ > 0);
+}
diff --git a/third-party/benchmark/test/map_test.cc b/third-party/benchmark/test/map_test.cc
index 0fdba7c87c4fe..018e12a75e887 100644
--- a/third-party/benchmark/test/map_test.cc
+++ b/third-party/benchmark/test/map_test.cc
@@ -13,10 +13,8 @@ std::map<int, int> ConstructRandomMap(int size) {
   return m;
 }
 
-}  // namespace
-
 // Basic version.
-static void BM_MapLookup(benchmark::State& state) {
+void BM_MapLookup(benchmark::State& state) {
   const int size = static_cast<int>(state.range(0));
   std::map<int, int> m;
   for (auto _ : state) {
@@ -31,6 +29,7 @@ static void BM_MapLookup(benchmark::State& state) {
   state.SetItemsProcessed(state.iterations() * size);
 }
 BENCHMARK(BM_MapLookup)->Range(1 << 3, 1 << 12);
+}  // namespace
 
 // Using fixtures.
 class MapFixture : public ::benchmark::Fixture {
@@ -39,7 +38,7 @@ class MapFixture : public ::benchmark::Fixture {
     m = ConstructRandomMap(static_cast<int>(st.range(0)));
   }
 
-  void TearDown(const ::benchmark::State&) override { m.clear(); }
+  void TearDown(const ::benchmark::State& /*unused*/) override { m.clear(); }
 
   std::map<int, int> m;
 };
diff --git a/third-party/benchmark/test/memory_manager_test.cc b/third-party/benchmark/test/memory_manager_test.cc
index 4df674d586ed7..39b32169d572e 100644
--- a/third-party/benchmark/test/memory_manager_test.cc
+++ b/third-party/benchmark/test/memory_manager_test.cc
@@ -1,9 +1,9 @@
 #include <memory>
 
-#include "../src/check.h"
 #include "benchmark/benchmark.h"
 #include "output_test.h"
 
+namespace {
 class TestMemoryManager : public benchmark::MemoryManager {
   void Start() override {}
   void Stop(Result& result) override {
@@ -14,11 +14,13 @@ class TestMemoryManager : public benchmark::MemoryManager {
 
 void BM_empty(benchmark::State& state) {
   for (auto _ : state) {
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
 }
 BENCHMARK(BM_empty);
+}  // end namespace
 
 ADD_CASES(TC_ConsoleOut, {{"^BM_empty %console_report$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_empty\",$"},
@@ -39,6 +41,7 @@ ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_empty\",$"},
 ADD_CASES(TC_CSVOut, {{"^\"BM_empty\",%csv_report$"}});
 
 int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
   std::unique_ptr<benchmark::MemoryManager> mm(new TestMemoryManager());
 
   benchmark::RegisterMemoryManager(mm.get());
diff --git a/third-party/benchmark/test/memory_results_gtest.cc b/third-party/benchmark/test/memory_results_gtest.cc
new file mode 100644
index 0000000000000..70a5a5a985c76
--- /dev/null
+++ b/third-party/benchmark/test/memory_results_gtest.cc
@@ -0,0 +1,123 @@
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+#include "gtest/gtest.h"
+
+namespace {
+
+using benchmark::Benchmark;
+using benchmark::ClearRegisteredBenchmarks;
+using benchmark::ConsoleReporter;
+using benchmark::MemoryManager;
+using benchmark::RegisterBenchmark;
+using benchmark::RegisterMemoryManager;
+using benchmark::RunSpecifiedBenchmarks;
+using benchmark::State;
+
+constexpr int N_REPETITIONS = 100;
+constexpr int N_ITERATIONS = 1;
+
+// Fake allocation statistics handed out by TestMemoryManager::Stop().
+int num_allocs = 0;
+int max_bytes_used = 0;
+int total_allocated_bytes = 0;
+int net_heap_growth = 0;
+
+// Zeroes the fake statistics so every test starts from the same state.
+void reset() {
+  num_allocs = 0;
+  max_bytes_used = 0;
+  total_allocated_bytes = 0;
+  net_heap_growth = 0;
+}
+
+// Memory manager that reports the current fake statistics and then bumps
+// each by a distinct step (1, 2, 4, 10), so consecutive Stop() calls yield
+// distinguishable results.
+class TestMemoryManager : public MemoryManager {
+  void Start() override {}
+  void Stop(Result& result) override {
+    result.num_allocs = num_allocs;
+    result.net_heap_growth = net_heap_growth;
+    result.max_bytes_used = max_bytes_used;
+    result.total_allocated_bytes = total_allocated_bytes;
+
+    num_allocs += 1;
+    max_bytes_used += 2;
+    net_heap_growth += 4;
+    total_allocated_bytes += 10;
+  }
+};
+
+// Reporter whose overrides print nothing; PrintRunData() records the memory
+// result of each run it accepts in `store`.
+class TestReporter : public ConsoleReporter {
+ public:
+  TestReporter() = default;
+  ~TestReporter() override = default;
+
+  bool ReportContext(const Context& /*unused*/) override { return true; }
+
+  void PrintHeader(const Run& /*unused*/) override {}
+  void PrintRunData(const Run& run) override {
+    // Skip runs with repetition_index == -1 and runs without memory data.
+    if (run.repetition_index == -1) return;
+    if (!run.memory_result.memory_iterations) return;
+
+    store.push_back(run.memory_result);
+  }
+
+  std::vector<MemoryManager::Result> store;
+};
+
+// Fixture: registers an empty benchmark "BM" with fixed repetition and
+// iteration counts, and clears the benchmark registry afterwards.
+class MemoryResultsTest : public testing::Test {
+ public:
+  Benchmark* bm = nullptr;
+  TestReporter reporter;
+
+  void SetUp() override {
+    bm = RegisterBenchmark("BM", [](State& st) {
+      for (auto _ : st) {
+      }
+    });
+    bm->Repetitions(N_REPETITIONS);
+    bm->Iterations(N_ITERATIONS);
+    reset();
+  }
+  void TearDown() override { ClearRegisteredBenchmarks(); }
+};
+
+// Without a registered memory manager no run carries a memory result.
+TEST_F(MemoryResultsTest, NoMMTest) {
+  RunSpecifiedBenchmarks(&reporter);
+  EXPECT_EQ(reporter.store.size(), 0U);
+}
+
+// With the fake manager registered, the i-th stored result must hold the
+// statistics as they were after i earlier Stop() calls.
+TEST_F(MemoryResultsTest, ResultsTest) {
+  auto mm = std::make_unique<TestMemoryManager>();
+  RegisterMemoryManager(mm.get());
+
+  RunSpecifiedBenchmarks(&reporter);
+  // Unregister before `mm` is destroyed so the library is not left holding a
+  // dangling manager pointer for whichever test runs next.
+  RegisterMemoryManager(nullptr);
+
+  EXPECT_EQ(reporter.store.size(), static_cast<size_t>(N_REPETITIONS));
+
+  for (size_t i = 0; i < reporter.store.size(); i++) {
+    EXPECT_EQ(reporter.store[i].num_allocs, static_cast<int64_t>(i));
+    EXPECT_EQ(reporter.store[i].max_bytes_used, static_cast<int64_t>(i) * 2);
+    EXPECT_EQ(reporter.store[i].net_heap_growth, static_cast<int64_t>(i) * 4);
+    EXPECT_EQ(reporter.store[i].total_allocated_bytes,
+              static_cast<int64_t>(i) * 10);
+  }
+}
+
+}  // namespace
diff --git a/third-party/benchmark/test/multiple_ranges_test.cc b/third-party/benchmark/test/multiple_ranges_test.cc
index 5300a96036c1b..987b69c82f500 100644
--- a/third-party/benchmark/test/multiple_ranges_test.cc
+++ b/third-party/benchmark/test/multiple_ranges_test.cc
@@ -5,6 +5,7 @@
 
 #include "benchmark/benchmark.h"
 
+namespace {
 class MultipleRangesFixture : public ::benchmark::Fixture {
  public:
   MultipleRangesFixture()
@@ -87,10 +88,11 @@ void BM_CheckDefaultArgument(benchmark::State& state) {
 }
 BENCHMARK(BM_CheckDefaultArgument)->Ranges({{1, 5}, {6, 10}});
 
-static void BM_MultipleRanges(benchmark::State& st) {
+void BM_MultipleRanges(benchmark::State& st) {
   for (auto _ : st) {
   }
 }
 BENCHMARK(BM_MultipleRanges)->Ranges({{5, 5}, {6, 6}});
+}  // end namespace
 
 BENCHMARK_MAIN();
diff --git a/third-party/benchmark/test/options_test.cc b/third-party/benchmark/test/options_test.cc
index a1b209f3eb334..70e3e18e2fdb1 100644
--- a/third-party/benchmark/test/options_test.cc
+++ b/third-party/benchmark/test/options_test.cc
@@ -8,6 +8,7 @@
 #endif
 #include <cassert>
 
+namespace {
 void BM_basic(benchmark::State& state) {
   for (auto _ : state) {
   }
@@ -50,7 +51,7 @@ BENCHMARK(BM_basic)->RangeMultiplier(4)->Range(-8, 8);
 BENCHMARK(BM_basic)->DenseRange(-2, 2, 1);
 BENCHMARK(BM_basic)->Ranges({{-64, 1}, {-8, -1}});
 
-void CustomArgs(benchmark::internal::Benchmark* b) {
+void CustomArgs(benchmark::Benchmark* b) {
   for (int i = 0; i < 10; ++i) {
     b->Arg(i);
   }
@@ -73,5 +74,6 @@ void BM_explicit_iteration_count(benchmark::State& state) {
   assert(state.iterations() == 42);
 }
 BENCHMARK(BM_explicit_iteration_count)->Iterations(42);
+}  // end namespace
 
 BENCHMARK_MAIN();
diff --git a/third-party/benchmark/test/output_test.h b/third-party/benchmark/test/output_test.h
index c08fe1d87e6c8..0fd557d90b6f8 100644
--- a/third-party/benchmark/test/output_test.h
+++ b/third-party/benchmark/test/output_test.h
@@ -16,12 +16,13 @@
 #define CONCAT2(x, y) x##y
 #define CONCAT(x, y) CONCAT2(x, y)
 
-#define ADD_CASES(...) int CONCAT(dummy, __LINE__) = ::AddCases(__VA_ARGS__)
+#define ADD_CASES(...) \
+  const int CONCAT(dummy, __LINE__) = ::AddCases(__VA_ARGS__)
 
 #define SET_SUBSTITUTIONS(...) \
-  int CONCAT(dummy, __LINE__) = ::SetSubstitutions(__VA_ARGS__)
+  const int CONCAT(dummy, __LINE__) = ::SetSubstitutions(__VA_ARGS__)
 
-enum MatchRules {
+enum MatchRules : uint8_t {
   MR_Default,  // Skip non-matching lines until a match is found.
   MR_Next,     // Match must occur on the next line.
   MR_Not  // No line between the current position and the next match matches
@@ -37,7 +38,7 @@ struct TestCase {
   std::shared_ptr<benchmark::Regex> regex;
 };
 
-enum TestCaseID {
+enum TestCaseID : uint8_t {
   TC_ConsoleOut,
   TC_ConsoleErr,
   TC_JSONOut,
@@ -80,7 +81,8 @@ std::string GetFileReporterOutput(int argc, char* argv[]);
 //                  will be the subject of a call to checker_function
 // checker_function: should be of type ResultsCheckFn (see below)
 #define CHECK_BENCHMARK_RESULTS(bm_name_pattern, checker_function) \
-  size_t CONCAT(dummy, __LINE__) = AddChecker(bm_name_pattern, checker_function)
+  const size_t CONCAT(dummy, __LINE__) =                           \
+      AddChecker(bm_name_pattern, checker_function)
 
 struct Results;
 typedef std::function<void(Results const&)> ResultsCheckFn;
@@ -101,7 +103,7 @@ struct Results {
 
   double NumIterations() const;
 
-  typedef enum { kCpuTime, kRealTime } BenchmarkTime;
+  typedef enum : uint8_t { kCpuTime, kRealTime } BenchmarkTime;
 
   // get cpu_time or real_time in seconds
   double GetTime(BenchmarkTime which) const;
diff --git a/third-party/benchmark/test/output_test_helper.cc b/third-party/benchmark/test/output_test_helper.cc
index 265f28aae7c7c..43a1bfde87304 100644
--- a/third-party/benchmark/test/output_test_helper.cc
+++ b/third-party/benchmark/test/output_test_helper.cc
@@ -83,7 +83,7 @@ std::string PerformSubstitutions(std::string source) {
   SubMap const& subs = GetSubstitutions();
   using SizeT = std::string::size_type;
   for (auto const& KV : subs) {
-    SizeT pos;
+    SizeT pos = 0;
     SizeT next_start = 0;
     while ((pos = source.find(KV.first, next_start)) != std::string::npos) {
       next_start = pos + KV.second.size();
@@ -98,7 +98,7 @@ void CheckCase(std::stringstream& remaining_output, TestCase const& TC,
   std::string first_line;
   bool on_first = true;
   std::string line;
-  while (remaining_output.eof() == false) {
+  while (!remaining_output.eof()) {
     BM_CHECK(remaining_output.good());
     std::getline(remaining_output, line);
     if (on_first) {
@@ -112,7 +112,9 @@ void CheckCase(std::stringstream& remaining_output, TestCase const& TC,
           << "\n    actual regex string \"" << TC.substituted_regex << "\""
           << "\n    started matching near: " << first_line;
     }
-    if (TC.regex->Match(line)) return;
+    if (TC.regex->Match(line)) {
+      return;
+    }
     BM_CHECK(TC.match_rule != MR_Next)
         << "Expected line \"" << line << "\" to match regex \"" << TC.regex_str
         << "\""
@@ -147,7 +149,7 @@ class TestReporter : public benchmark::BenchmarkReporter {
   bool ReportContext(const Context& context) override {
     bool last_ret = false;
     bool first = true;
-    for (auto rep : reporters_) {
+    for (auto* rep : reporters_) {
       bool new_ret = rep->ReportContext(context);
       BM_CHECK(first || new_ret == last_ret)
           << "Reports return different values for ReportContext";
@@ -159,10 +161,14 @@ class TestReporter : public benchmark::BenchmarkReporter {
   }
 
   void ReportRuns(const std::vector<Run>& report) override {
-    for (auto rep : reporters_) rep->ReportRuns(report);
+    for (auto* rep : reporters_) {
+      rep->ReportRuns(report);
+    }
   }
   void Finalize() override {
-    for (auto rep : reporters_) rep->Finalize();
+    for (auto* rep : reporters_) {
+      rep->Finalize();
+    }
   }
 
  private:
@@ -200,15 +206,17 @@ class ResultsChecker {
   void SetHeader_(const std::string& csv_header);
   void SetValues_(const std::string& entry_csv_line);
 
-  std::vector<std::string> SplitCsv_(const std::string& line);
+  std::vector<std::string> SplitCsv_(const std::string& line) const;
 };
 
+namespace {
 // store the static ResultsChecker in a function to prevent initialization
 // order problems
 ResultsChecker& GetResultsChecker() {
   static ResultsChecker rc;
   return rc;
 }
+}  // end namespace
 
 // add a results checker for a benchmark
 void ResultsChecker::Add(const std::string& entry_pattern,
@@ -224,14 +232,16 @@ void ResultsChecker::CheckResults(std::stringstream& output) {
     // clear before calling tellg()
     output.clear();
     // seek to zero only when needed
-    if (output.tellg() > start) output.seekg(start);
+    if (output.tellg() > start) {
+      output.seekg(start);
+    }
     // and just in case
     output.clear();
   }
   // now go over every line and publish it to the ResultsChecker
   std::string line;
   bool on_first = true;
-  while (output.eof() == false) {
+  while (!output.eof()) {
     BM_CHECK(output.good());
     std::getline(output, line);
     if (on_first) {
@@ -265,7 +275,9 @@ void ResultsChecker::SetHeader_(const std::string& csv_header) {
 
 // set the values for a benchmark
 void ResultsChecker::SetValues_(const std::string& entry_csv_line) {
-  if (entry_csv_line.empty()) return;  // some lines are empty
+  if (entry_csv_line.empty()) {
+    return;
+  }  // some lines are empty
   BM_CHECK(!field_names.empty());
   auto vals = SplitCsv_(entry_csv_line);
   BM_CHECK_EQ(vals.size(), field_names.size());
@@ -277,23 +289,38 @@ void ResultsChecker::SetValues_(const std::string& entry_csv_line) {
 }
 
 // a quick'n'dirty csv splitter (eliminating quotes)
-std::vector<std::string> ResultsChecker::SplitCsv_(const std::string& line) {
+std::vector<std::string> ResultsChecker::SplitCsv_(
+    const std::string& line) const {
   std::vector<std::string> out;
-  if (line.empty()) return out;
-  if (!field_names.empty()) out.reserve(field_names.size());
-  size_t prev = 0, pos = line.find_first_of(','), curr = pos;
-  while (pos != line.npos) {
+  if (line.empty()) {
+    return out;
+  }
+  if (!field_names.empty()) {
+    out.reserve(field_names.size());
+  }
+  size_t prev = 0;
+  size_t pos = line.find_first_of(',');
+  size_t curr = pos;
+  while (pos != std::string::npos) {
     BM_CHECK(curr > 0);
-    if (line[prev] == '"') ++prev;
-    if (line[curr - 1] == '"') --curr;
+    if (line[prev] == '"') {
+      ++prev;
+    }
+    if (line[curr - 1] == '"') {
+      --curr;
+    }
     out.push_back(line.substr(prev, curr - prev));
     prev = pos + 1;
     pos = line.find_first_of(',', pos + 1);
     curr = pos;
   }
   curr = line.size();
-  if (line[prev] == '"') ++prev;
-  if (line[curr - 1] == '"') --curr;
+  if (line[prev] == '"') {
+    ++prev;
+  }
+  if (line[curr - 1] == '"') {
+    --curr;
+  }
   out.push_back(line.substr(prev, curr - prev));
   return out;
 }
@@ -308,7 +335,9 @@ size_t AddChecker(const std::string& bm_name, const ResultsCheckFn& fn) {
 
 int Results::NumThreads() const {
   auto pos = name.find("/threads:");
-  if (pos == name.npos) return 1;
+  if (pos == std::string::npos) {
+    return 1;
+  }
   auto end = name.find('/', pos + 9);
   std::stringstream ss;
   ss << name.substr(pos + 9, end);
@@ -324,7 +353,7 @@ double Results::GetTime(BenchmarkTime which) const {
   BM_CHECK(which == kCpuTime || which == kRealTime);
   const char* which_str = which == kCpuTime ? "cpu_time" : "real_time";
   double val = GetAs<double>(which_str);
-  auto unit = Get("time_unit");
+  const auto* unit = Get("time_unit");
   BM_CHECK(unit);
   if (*unit == "ns") {
     return val * 1.e-9;
@@ -378,7 +407,9 @@ int SetSubstitutions(
         break;
       }
     }
-    if (!exists) subs.push_back(std::move(KV));
+    if (!exists) {
+      subs.push_back(std::move(KV));
+    }
   }
   return 0;
 }
@@ -449,50 +480,60 @@ void RunOutputTests(int argc, char* argv[]) {
 BENCHMARK_RESTORE_DEPRECATED_WARNING
 
 int SubstrCnt(const std::string& haystack, const std::string& pat) {
-  if (pat.length() == 0) return 0;
+  if (pat.length() == 0) {
+    return 0;
+  }
   int count = 0;
   for (size_t offset = haystack.find(pat); offset != std::string::npos;
-       offset = haystack.find(pat, offset + pat.length()))
+       offset = haystack.find(pat, offset + pat.length())) {
     ++count;
+  }
   return count;
 }
 
-static char ToHex(int ch) {
+namespace {
+char ToHex(int ch) {
   return ch < 10 ? static_cast<char>('0' + ch)
                  : static_cast<char>('a' + (ch - 10));
 }
 
-static char RandomHexChar() {
+char RandomHexChar() {
   static std::mt19937 rd{std::random_device{}()};
   static std::uniform_int_distribution<int> mrand{0, 15};
   return ToHex(mrand(rd));
 }
 
-static std::string GetRandomFileName() {
+std::string GetRandomFileName() {
   std::string model = "test.%%%%%%";
   for (auto& ch : model) {
-    if (ch == '%') ch = RandomHexChar();
+    if (ch == '%') {
+      ch = RandomHexChar();
+    }
   }
   return model;
 }
 
-static bool FileExists(std::string const& name) {
+bool FileExists(std::string const& name) {
   std::ifstream in(name.c_str());
   return in.good();
 }
 
-static std::string GetTempFileName() {
+std::string GetTempFileName() {
   // This function attempts to avoid race conditions where two tests
   // create the same file at the same time. However, it still introduces races
   // similar to tmpnam.
   int retries = 3;
-  while (--retries) {
+  while (--retries != 0) {
     std::string name = GetRandomFileName();
-    if (!FileExists(name)) return name;
+    if (!FileExists(name)) {
+      return name;
+    }
   }
-  std::cerr << "Failed to create unique temporary file name" << std::endl;
-  std::abort();
+  std::cerr << "Failed to create unique temporary file name\n";
+  std::flush(std::cerr);
+  std::exit(1);
 }
+}  // end namespace
 
 std::string GetFileReporterOutput(int argc, char* argv[]) {
   std::vector<char*> new_argv(argv, argv + argc);
@@ -505,7 +546,7 @@ std::string GetFileReporterOutput(int argc, char* argv[]) {
   tmp += tmp_file_name;
   new_argv.emplace_back(const_cast<char*>(tmp.c_str()));
 
-  argc = int(new_argv.size());
+  argc = static_cast<int>(new_argv.size());
 
   benchmark::Initialize(&argc, new_argv.data());
   benchmark::RunSpecifiedBenchmarks();
diff --git a/third-party/benchmark/test/overload_test.cc b/third-party/benchmark/test/overload_test.cc
new file mode 100644
index 0000000000000..d1fee9a783a8a
--- /dev/null
+++ b/third-party/benchmark/test/overload_test.cc
@@ -0,0 +1,35 @@
+#include "benchmark/benchmark.h"
+
+namespace {
+// Simulate an overloaded function name.
+// This version does nothing and is just here to create ambiguity for
+// MyOverloadedBenchmark.
+BENCHMARK_UNUSED void MyOverloadedBenchmark() {}
+
+// This is the actual benchmark function we want to register.
+// It has the signature void(benchmark::State&) required by the library.
+void MyOverloadedBenchmark(benchmark::State& state) {
+  for (auto _ : state) {  // empty timed loop: only registration matters here
+  }
+}
+
+// This macro invocation should compile correctly if benchmark.h
+// contains the fix (using static_cast), but would fail to compile
+// if the benchmark name were ambiguous (e.g., when using + or no cast
+// with an overloaded function).
+BENCHMARK(MyOverloadedBenchmark);
+
+// Also test BENCHMARK_TEMPLATE with an overloaded name.
+template <int N>
+void MyTemplatedOverloadedBenchmark() {}  // nullary decoy overload
+
+template <int N>
+void MyTemplatedOverloadedBenchmark(benchmark::State& state) {
+  for (auto _ : state) {  // empty timed loop, as above
+  }
+}
+
+BENCHMARK_TEMPLATE(MyTemplatedOverloadedBenchmark, 1);
+}  // end namespace
+
+BENCHMARK_MAIN();
diff --git a/third-party/benchmark/test/perf_counters_gtest.cc b/third-party/benchmark/test/perf_counters_gtest.cc
index 2e63049285d75..5de262fa2b44c 100644
--- a/third-party/benchmark/test/perf_counters_gtest.cc
+++ b/third-party/benchmark/test/perf_counters_gtest.cc
@@ -226,9 +226,13 @@ void measure(size_t threadcount, PerfCounterValues* before,
   // threadpool.
   auto counters =
       PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
-  for (auto& t : threads) t = std::thread(work);
+  for (auto& t : threads) {
+    t = std::thread(work);
+  }
   counters.Snapshot(before);
-  for (auto& t : threads) t.join();
+  for (auto& t : threads) {
+    t.join();
+  }
   counters.Snapshot(after);
 }
 
diff --git a/third-party/benchmark/test/perf_counters_test.cc b/third-party/benchmark/test/perf_counters_test.cc
index 3cc593e629d80..a830b5ef10e81 100644
--- a/third-party/benchmark/test/perf_counters_test.cc
+++ b/third-party/benchmark/test/perf_counters_test.cc
@@ -11,8 +11,9 @@ namespace benchmark {
 BM_DECLARE_string(benchmark_perf_counters);
 
 }  // namespace benchmark
+namespace {
 
-static void BM_Simple(benchmark::State& state) {
+void BM_Simple(benchmark::State& state) {
   for (auto _ : state) {
     auto iterations = double(state.iterations()) * double(state.iterations());
     benchmark::DoNotOptimize(iterations);
@@ -66,19 +67,21 @@ static void CheckSimple(Results const& e) {
 double withoutPauseResumeInstrCount = 0.0;
 double withPauseResumeInstrCount = 0.0;
 
-static void SaveInstrCountWithoutResume(Results const& e) {
+void SaveInstrCountWithoutResume(Results const& e) {
   withoutPauseResumeInstrCount = e.GetAs<double>("INSTRUCTIONS");
 }
 
-static void SaveInstrCountWithResume(Results const& e) {
+void SaveInstrCountWithResume(Results const& e) {
   withPauseResumeInstrCount = e.GetAs<double>("INSTRUCTIONS");
 }
 
 CHECK_BENCHMARK_RESULTS("BM_Simple", &CheckSimple);
 CHECK_BENCHMARK_RESULTS("BM_WithoutPauseResume", &SaveInstrCountWithoutResume);
 CHECK_BENCHMARK_RESULTS("BM_WithPauseResume", &SaveInstrCountWithResume);
+}  // end namespace
 
 int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
   if (!benchmark::internal::PerfCounters::kSupported) {
     return 0;
   }
diff --git a/third-party/benchmark/test/profiler_manager_gtest.cc b/third-party/benchmark/test/profiler_manager_gtest.cc
new file mode 100644
index 0000000000000..434e4ecadf7c4
--- /dev/null
+++ b/third-party/benchmark/test/profiler_manager_gtest.cc
@@ -0,0 +1,42 @@
+#include <memory>
+
+#include "benchmark/benchmark.h"
+#include "gtest/gtest.h"
+
+namespace {
+
+class TestProfilerManager : public benchmark::ProfilerManager {
+ public:
+  void AfterSetupStart() override { ++start_called; }
+  void BeforeTeardownStop() override { ++stop_called; }
+
+  int start_called = 0;  // number of AfterSetupStart() calls
+  int stop_called = 0;   // number of BeforeTeardownStop() calls
+};
+
+void BM_empty(benchmark::State& state) {
+  for (auto _ : state) {
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
+  }
+}
+BENCHMARK(BM_empty);
+
+TEST(ProfilerManager, ReregisterManager) {
+#if GTEST_HAS_DEATH_TEST
+  // Tests only runnable in debug mode (when BM_CHECK is enabled).
+#ifndef NDEBUG
+#ifndef TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS
+  ASSERT_DEATH_IF_SUPPORTED(
+      {
+        std::unique_ptr<TestProfilerManager> pm(new TestProfilerManager());
+        benchmark::RegisterProfilerManager(pm.get());
+        benchmark::RegisterProfilerManager(pm.get());  // second call must die
+      },
+      "RegisterProfilerManager");
+#endif  // TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS
+#endif  // NDEBUG
+#endif  // GTEST_HAS_DEATH_TEST
+}
+
+}  // namespace
diff --git a/third-party/benchmark/test/profiler_manager_iterations_test.cc b/third-party/benchmark/test/profiler_manager_iterations_test.cc
new file mode 100644
index 0000000000000..c4983eb34858e
--- /dev/null
+++ b/third-party/benchmark/test/profiler_manager_iterations_test.cc
@@ -0,0 +1,62 @@
+#include <cassert>
+#include <cstdlib>
+#include <memory>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+// Tests that we can specify the number of profiler iterations with
+// --benchmark_min_time=<NUM>x.
+namespace {
+
+int iteration_count = 0;
+int end_profiler_iteration_count = 0;
+
+class TestProfilerManager : public benchmark::ProfilerManager {
+  void AfterSetupStart() override { iteration_count = 0; }
+  void BeforeTeardownStop() override {
+    end_profiler_iteration_count = iteration_count;
+  }
+};
+
+class NullReporter : public benchmark::BenchmarkReporter {
+ public:
+  bool ReportContext(const Context& /*context*/) override { return true; }
+  void ReportRuns(const std::vector<Run>& /* report */) override {}
+};
+
+void BM_MyBench(benchmark::State& state) {
+  for (auto s : state) {
+    ++iteration_count;
+  }
+}
+BENCHMARK(BM_MyBench);
+}  // end namespace
+
+int main(int argc, char** argv) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+  // Make a fake argv and append the new --benchmark_min_time=<NUM>x flag
+  // to it.
+  int fake_argc = argc + 1;
+  std::vector<const char*> fake_argv(static_cast<size_t>(fake_argc));
+  for (size_t i = 0; i < static_cast<size_t>(argc); ++i) {
+    fake_argv[i] = argv[i];
+  }
+  fake_argv[static_cast<size_t>(argc)] = "--benchmark_min_time=4x";
+
+  std::unique_ptr<benchmark::ProfilerManager> pm(new TestProfilerManager());
+  benchmark::RegisterProfilerManager(pm.get());
+
+  benchmark::Initialize(&fake_argc, const_cast<char**>(fake_argv.data()));
+
+  NullReporter null_reporter;
+  const size_t returned_count =
+      benchmark::RunSpecifiedBenchmarks(&null_reporter, "BM_MyBench");
+  assert(returned_count == 1);
+
+  // Check the executed iters.
+  assert(end_profiler_iteration_count == 4);
+
+  benchmark::RegisterProfilerManager(nullptr);
+  return 0;
+}
diff --git a/third-party/benchmark/test/profiler_manager_test.cc b/third-party/benchmark/test/profiler_manager_test.cc
new file mode 100644
index 0000000000000..5c4b14daa3f8a
--- /dev/null
+++ b/third-party/benchmark/test/profiler_manager_test.cc
@@ -0,0 +1,54 @@
+// FIXME: WIP
+
+#include <cassert>
+#include <memory>
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+namespace {
+class TestProfilerManager : public benchmark::ProfilerManager {
+ public:
+  void AfterSetupStart() override { ++start_called; }
+  void BeforeTeardownStop() override { ++stop_called; }
+
+  int start_called = 0;
+  int stop_called = 0;
+};
+
+void BM_empty(benchmark::State& state) {
+  for (auto _ : state) {
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
+  }
+}
+BENCHMARK(BM_empty);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_empty %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_empty\",$"},
+                       {"\"family_index\": 0,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_empty\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_empty\",%csv_report$"}});
+}  // end namespace
+
+int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+  std::unique_ptr<TestProfilerManager> pm(new TestProfilerManager());
+
+  benchmark::RegisterProfilerManager(pm.get());
+  RunOutputTests(argc, argv);
+  benchmark::RegisterProfilerManager(nullptr);
+
+  assert(pm->start_called == 1);
+  assert(pm->stop_called == 1);
+}
diff --git a/third-party/benchmark/test/register_benchmark_test.cc b/third-party/benchmark/test/register_benchmark_test.cc
index d69d144a4e171..3e39437a2779f 100644
--- a/third-party/benchmark/test/register_benchmark_test.cc
+++ b/third-party/benchmark/test/register_benchmark_test.cc
@@ -53,11 +53,10 @@ int AddCases(std::initializer_list<TestCase> const& v) {
 
 #define CONCAT(x, y) CONCAT2(x, y)
 #define CONCAT2(x, y) x##y
-#define ADD_CASES(...) int CONCAT(dummy, __LINE__) = AddCases({__VA_ARGS__})
+#define ADD_CASES(...) \
+  const int CONCAT(dummy, __LINE__) = AddCases({__VA_ARGS__})
 
-}  // end namespace
-
-typedef benchmark::internal::Benchmark* ReturnVal;
+using ReturnVal = benchmark::Benchmark const* const;
 
 //----------------------------------------------------------------------------//
 // Test RegisterBenchmark with no additional arguments
@@ -76,7 +75,6 @@ ADD_CASES({"BM_function"}, {"BM_function_manual_registration"});
 // Note: GCC <= 4.8 do not support this form of RegisterBenchmark because they
 //       reject the variadic pack expansion of lambda captures.
 //----------------------------------------------------------------------------//
-#ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
 
 void BM_extra_args(benchmark::State& st, const char* label) {
   for (auto _ : st) {
@@ -86,15 +84,14 @@ void BM_extra_args(benchmark::State& st, const char* label) {
 int RegisterFromFunction() {
   std::pair<const char*, const char*> cases[] = {
       {"test1", "One"}, {"test2", "Two"}, {"test3", "Three"}};
-  for (auto const& c : cases)
+  for (auto const& c : cases) {
     benchmark::RegisterBenchmark(c.first, &BM_extra_args, c.second);
+  }
   return 0;
 }
-int dummy2 = RegisterFromFunction();
+const int dummy2 = RegisterFromFunction();
 ADD_CASES({"test1", "One"}, {"test2", "Two"}, {"test3", "Three"});
 
-#endif  // BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
-
 //----------------------------------------------------------------------------//
 // Test RegisterBenchmark with DISABLED_ benchmark
 //----------------------------------------------------------------------------//
@@ -119,14 +116,11 @@ struct CustomFixture {
 };
 
 void TestRegistrationAtRuntime() {
-#ifdef BENCHMARK_HAS_CXX11
   {
     CustomFixture fx;
     benchmark::RegisterBenchmark("custom_fixture", fx);
     AddCases({std::string("custom_fixture")});
   }
-#endif
-#ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
   {
     const char* x = "42";
     auto capturing_lam = [=](benchmark::State& st) {
@@ -137,7 +131,6 @@ void TestRegistrationAtRuntime() {
     benchmark::RegisterBenchmark("lambda_benchmark", capturing_lam);
     AddCases({{"lambda_benchmark", x}});
   }
-#endif
 }
 
 // Test that all benchmarks, registered at either during static init or runtime,
@@ -163,7 +156,7 @@ void RunTestOne() {
 // benchmarks.
 // Also test that new benchmarks can be registered and ran afterwards.
 void RunTestTwo() {
-  assert(ExpectedResults.size() != 0 &&
+  assert(!ExpectedResults.empty() &&
          "must have at least one registered benchmark");
   ExpectedResults.clear();
   benchmark::ClearRegisteredBenchmarks();
@@ -187,8 +180,10 @@ void RunTestTwo() {
   }
   assert(EB == ExpectedResults.end());
 }
+}  // end namespace
 
 int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
   benchmark::Initialize(&argc, argv);
 
   RunTestOne();
diff --git a/third-party/benchmark/test/repetitions_test.cc b/third-party/benchmark/test/repetitions_test.cc
index 569777d5f933e..9116fa65be4cd 100644
--- a/third-party/benchmark/test/repetitions_test.cc
+++ b/third-party/benchmark/test/repetitions_test.cc
@@ -2,11 +2,12 @@
 #include "benchmark/benchmark.h"
 #include "output_test.h"
 
+namespace {
 // ========================================================================= //
 // ------------------------ Testing Basic Output --------------------------- //
 // ========================================================================= //
 
-static void BM_ExplicitRepetitions(benchmark::State& state) {
+void BM_ExplicitRepetitions(benchmark::State& state) {
   for (auto _ : state) {
   }
 }
@@ -108,7 +109,7 @@ ADD_CASES(TC_CSVOut,
 // ------------------------ Testing Basic Output --------------------------- //
 // ========================================================================= //
 
-static void BM_ImplicitRepetitions(benchmark::State& state) {
+void BM_ImplicitRepetitions(benchmark::State& state) {
   for (auto _ : state) {
   }
 }
@@ -206,9 +207,13 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions\",%csv_report$"}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions_mean\",%csv_report$"}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions_median\",%csv_report$"}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions_stddev\",%csv_report$"}});
+}  // end namespace
 
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
 // ========================================================================= //
 
-int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
+int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+  RunOutputTests(argc, argv);
+}
diff --git a/third-party/benchmark/test/report_aggregates_only_test.cc b/third-party/benchmark/test/report_aggregates_only_test.cc
index 47da50358885b..707d92383a3e1 100644
--- a/third-party/benchmark/test/report_aggregates_only_test.cc
+++ b/third-party/benchmark/test/report_aggregates_only_test.cc
@@ -6,6 +6,7 @@
 #include "benchmark/benchmark.h"
 #include "output_test.h"
 
+namespace {
 // Ok this test is super ugly. We want to check what happens with the file
 // reporter in the presence of ReportAggregatesOnly().
 // We do not care about console output, the normal tests check that already.
@@ -15,8 +16,10 @@ void BM_SummaryRepeat(benchmark::State& state) {
   }
 }
 BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly();
+}  // end namespace
 
 int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
   const std::string output = GetFileReporterOutput(argc, argv);
 
   if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 4 ||
diff --git a/third-party/benchmark/test/reporter_output_test.cc b/third-party/benchmark/test/reporter_output_test.cc
index 7867165d1f3df..9940ab75de5dd 100644
--- a/third-party/benchmark/test/reporter_output_test.cc
+++ b/third-party/benchmark/test/reporter_output_test.cc
@@ -1,11 +1,9 @@
-
 #undef NDEBUG
-#include <numeric>
-#include <utility>
 
 #include "benchmark/benchmark.h"
 #include "output_test.h"
 
+namespace {
 // ========================================================================= //
 // ---------------------- Testing Prologue Output -------------------------- //
 // ========================================================================= //
@@ -13,7 +11,7 @@
 ADD_CASES(TC_ConsoleOut, {{"^[-]+$", MR_Next},
                           {"^Benchmark %s Time %s CPU %s Iterations$", MR_Next},
                           {"^[-]+$", MR_Next}});
-static int AddContextCases() {
+int AddContextCases() {
   AddCases(TC_ConsoleErr,
            {
                {"^%int-%int-%intT%int:%int:%int[-+]%int:%int$", MR_Default},
@@ -60,7 +58,7 @@ static int AddContextCases() {
   AddCases(TC_JSONOut, {{"\"json_schema_version\": 1$", MR_Next}});
   return 0;
 }
-int dummy_register = AddContextCases();
+const int dummy_register = AddContextCases();
 ADD_CASES(TC_CSVOut, {{"%csv_header"}});
 
 // ========================================================================= //
@@ -96,7 +94,8 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_basic\",%csv_report$"}});
 void BM_bytes_per_second(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
   state.SetBytesProcessed(1);
@@ -128,7 +127,8 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_bytes_per_second\",%csv_bytes_report$"}});
 void BM_items_per_second(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
   state.SetItemsProcessed(1);
@@ -409,7 +409,8 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_BigArgs/1073741824 %console_report$"},
 void BM_Complexity_O1(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
   state.SetComplexityN(state.range(0));
@@ -1125,9 +1126,13 @@ void BM_CSV_Format(benchmark::State& state) {
 }
 BENCHMARK(BM_CSV_Format);
 ADD_CASES(TC_CSVOut, {{"^\"BM_CSV_Format\",,,,,,,,true,\"\"\"freedom\"\"\"$"}});
+}  // end namespace
 
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
 // ========================================================================= //
 
-int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
+int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+  RunOutputTests(argc, argv);
+}
diff --git a/third-party/benchmark/test/skip_with_error_test.cc b/third-party/benchmark/test/skip_with_error_test.cc
index 2139a19e25071..425895988c247 100644
--- a/third-party/benchmark/test/skip_with_error_test.cc
+++ b/third-party/benchmark/test/skip_with_error_test.cc
@@ -46,6 +46,7 @@ struct TestCase {
   }
 };
 
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
 std::vector<TestCase> ExpectedResults;
 
 int AddCases(const std::string& base_name,
@@ -59,9 +60,7 @@ int AddCases(const std::string& base_name,
 
 #define CONCAT(x, y) CONCAT2(x, y)
 #define CONCAT2(x, y) x##y
-#define ADD_CASES(...) int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
-
-}  // end namespace
+#define ADD_CASES(...) const int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
 
 void BM_error_no_running(benchmark::State& state) {
   state.SkipWithError("error message");
@@ -97,11 +96,11 @@ BENCHMARK(BM_error_before_running_range_for);
 ADD_CASES("BM_error_before_running_range_for", {{"", true, "error message"}});
 
 void BM_error_during_running(benchmark::State& state) {
-  int first_iter = true;
+  int first_iter = 1;
   while (state.KeepRunning()) {
     if (state.range(0) == 1 && state.thread_index() <= (state.threads() / 2)) {
       assert(first_iter);
-      first_iter = false;
+      first_iter = 0;
       state.SkipWithError("error message");
     } else {
       state.PauseTiming();
@@ -143,11 +142,13 @@ ADD_CASES("BM_error_during_running_ranged_for",
 
 void BM_error_after_running(benchmark::State& state) {
   for (auto _ : state) {
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
-  if (state.thread_index() <= (state.threads() / 2))
+  if (state.thread_index() <= (state.threads() / 2)) {
     state.SkipWithError("error message");
+  }
 }
 BENCHMARK(BM_error_after_running)->ThreadRange(1, 8);
 ADD_CASES("BM_error_after_running", {{"/threads:1", true, "error message"},
@@ -179,7 +180,18 @@ ADD_CASES("BM_error_while_paused", {{"/1/threads:1", true, "error message"},
                                     {"/2/threads:4", false, ""},
                                     {"/2/threads:8", false, ""}});
 
+void BM_malformed(benchmark::State& /*unused*/) {
+  // NOTE: empty body wanted. Nothing else.
+}
+BENCHMARK(BM_malformed);
+ADD_CASES("BM_malformed",
+          {{"", true,
+            "The benchmark didn't run, nor was it explicitly skipped. Please "
+            "call 'SkipWithXXX` in your benchmark as appropriate."}});
+}  // end namespace
+
 int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
   benchmark::Initialize(&argc, argv);
 
   TestReporter test_reporter;
diff --git a/third-party/benchmark/test/spec_arg_test.cc b/third-party/benchmark/test/spec_arg_test.cc
index 06aafbeb9b5ee..21275ef0d88b7 100644
--- a/third-party/benchmark/test/spec_arg_test.cc
+++ b/third-party/benchmark/test/spec_arg_test.cc
@@ -39,22 +39,24 @@ class TestReporter : public benchmark::ConsoleReporter {
   std::vector<std::string> matched_functions;
 };
 
-}  // end namespace
-
-static void BM_NotChosen(benchmark::State& state) {
+void BM_NotChosen(benchmark::State& state) {
   assert(false && "SHOULD NOT BE CALLED");
   for (auto _ : state) {
   }
 }
 BENCHMARK(BM_NotChosen);
 
-static void BM_Chosen(benchmark::State& state) {
+void BM_Chosen(benchmark::State& state) {
   for (auto _ : state) {
   }
 }
 BENCHMARK(BM_Chosen);
 
+}  // end namespace
+
 int main(int argc, char** argv) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+
   const std::string flag = "BM_NotChosen";
 
   // Verify that argv specify --benchmark_filter=BM_NotChosen.
diff --git a/third-party/benchmark/test/spec_arg_verbosity_test.cc b/third-party/benchmark/test/spec_arg_verbosity_test.cc
index 8f8eb6d37c590..318784cfff73c 100644
--- a/third-party/benchmark/test/spec_arg_verbosity_test.cc
+++ b/third-party/benchmark/test/spec_arg_verbosity_test.cc
@@ -4,14 +4,18 @@
 
 #include "benchmark/benchmark.h"
 
+namespace {
 // Tests that the user specified verbosity level can be get.
-static void BM_Verbosity(benchmark::State& state) {
+void BM_Verbosity(benchmark::State& state) {
   for (auto _ : state) {
   }
 }
 BENCHMARK(BM_Verbosity);
+}  // end namespace
 
 int main(int argc, char** argv) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+
   const int32_t flagv = 42;
 
   // Verify that argv specify --v=42.
diff --git a/third-party/benchmark/test/state_assembly_test.cc b/third-party/benchmark/test/state_assembly_test.cc
index 7ddbb3b2a92c8..e9ecfebf16ad5 100644
--- a/third-party/benchmark/test/state_assembly_test.cc
+++ b/third-party/benchmark/test/state_assembly_test.cc
@@ -2,6 +2,7 @@
 
 #ifdef __clang__
 #pragma clang diagnostic ignored "-Wreturn-type"
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
 #endif
 
 // clang-format off
diff --git a/third-party/benchmark/test/string_util_gtest.cc b/third-party/benchmark/test/string_util_gtest.cc
index 67b4bc0c24f26..5a9a09e19bc45 100644
--- a/third-party/benchmark/test/string_util_gtest.cc
+++ b/third-party/benchmark/test/string_util_gtest.cc
@@ -13,18 +13,18 @@ namespace {
 TEST(StringUtilTest, stoul) {
   {
     size_t pos = 0;
-    EXPECT_EQ(0ul, benchmark::stoul("0", &pos));
-    EXPECT_EQ(1ul, pos);
+    EXPECT_EQ(0UL, benchmark::stoul("0", &pos));
+    EXPECT_EQ(1UL, pos);
   }
   {
     size_t pos = 0;
-    EXPECT_EQ(7ul, benchmark::stoul("7", &pos));
-    EXPECT_EQ(1ul, pos);
+    EXPECT_EQ(7UL, benchmark::stoul("7", &pos));
+    EXPECT_EQ(1UL, pos);
   }
   {
     size_t pos = 0;
-    EXPECT_EQ(135ul, benchmark::stoul("135", &pos));
-    EXPECT_EQ(3ul, pos);
+    EXPECT_EQ(135UL, benchmark::stoul("135", &pos));
+    EXPECT_EQ(3UL, pos);
   }
 #if ULONG_MAX == 0xFFFFFFFFul
   {
@@ -35,35 +35,35 @@ TEST(StringUtilTest, stoul) {
 #elif ULONG_MAX == 0xFFFFFFFFFFFFFFFFul
   {
     size_t pos = 0;
-    EXPECT_EQ(0xFFFFFFFFFFFFFFFFul,
+    EXPECT_EQ(0xFFFFFFFFFFFFFFFFUL,
               benchmark::stoul("18446744073709551615", &pos));
-    EXPECT_EQ(20ul, pos);
+    EXPECT_EQ(20UL, pos);
   }
 #endif
   {
     size_t pos = 0;
-    EXPECT_EQ(10ul, benchmark::stoul("1010", &pos, 2));
-    EXPECT_EQ(4ul, pos);
+    EXPECT_EQ(10UL, benchmark::stoul("1010", &pos, 2));
+    EXPECT_EQ(4UL, pos);
   }
   {
     size_t pos = 0;
-    EXPECT_EQ(520ul, benchmark::stoul("1010", &pos, 8));
-    EXPECT_EQ(4ul, pos);
+    EXPECT_EQ(520UL, benchmark::stoul("1010", &pos, 8));
+    EXPECT_EQ(4UL, pos);
   }
   {
     size_t pos = 0;
-    EXPECT_EQ(1010ul, benchmark::stoul("1010", &pos, 10));
-    EXPECT_EQ(4ul, pos);
+    EXPECT_EQ(1010UL, benchmark::stoul("1010", &pos, 10));
+    EXPECT_EQ(4UL, pos);
   }
   {
     size_t pos = 0;
-    EXPECT_EQ(4112ul, benchmark::stoul("1010", &pos, 16));
-    EXPECT_EQ(4ul, pos);
+    EXPECT_EQ(4112UL, benchmark::stoul("1010", &pos, 16));
+    EXPECT_EQ(4UL, pos);
   }
   {
     size_t pos = 0;
-    EXPECT_EQ(0xBEEFul, benchmark::stoul("BEEF", &pos, 16));
-    EXPECT_EQ(4ul, pos);
+    EXPECT_EQ(0xBEEFUL, benchmark::stoul("BEEF", &pos, 16));
+    EXPECT_EQ(4UL, pos);
   }
 #ifndef BENCHMARK_HAS_NO_EXCEPTIONS
   {
@@ -73,83 +73,87 @@ TEST(StringUtilTest, stoul) {
 #endif
 }
 
-TEST(StringUtilTest, stoi){{size_t pos = 0;
-EXPECT_EQ(0, benchmark::stoi("0", &pos));
-EXPECT_EQ(1ul, pos);
-}  // namespace
-{
-  size_t pos = 0;
-  EXPECT_EQ(-17, benchmark::stoi("-17", &pos));
-  EXPECT_EQ(3ul, pos);
-}
-{
-  size_t pos = 0;
-  EXPECT_EQ(1357, benchmark::stoi("1357", &pos));
-  EXPECT_EQ(4ul, pos);
-}
-{
-  size_t pos = 0;
-  EXPECT_EQ(10, benchmark::stoi("1010", &pos, 2));
-  EXPECT_EQ(4ul, pos);
-}
-{
-  size_t pos = 0;
-  EXPECT_EQ(520, benchmark::stoi("1010", &pos, 8));
-  EXPECT_EQ(4ul, pos);
-}
-{
-  size_t pos = 0;
-  EXPECT_EQ(1010, benchmark::stoi("1010", &pos, 10));
-  EXPECT_EQ(4ul, pos);
-}
-{
-  size_t pos = 0;
-  EXPECT_EQ(4112, benchmark::stoi("1010", &pos, 16));
-  EXPECT_EQ(4ul, pos);
-}
-{
-  size_t pos = 0;
-  EXPECT_EQ(0xBEEF, benchmark::stoi("BEEF", &pos, 16));
-  EXPECT_EQ(4ul, pos);
-}
+TEST(StringUtilTest, stoi) {
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0, benchmark::stoi("0", &pos));
+    EXPECT_EQ(1UL, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(-17, benchmark::stoi("-17", &pos));
+    EXPECT_EQ(3UL, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1357, benchmark::stoi("1357", &pos));
+    EXPECT_EQ(4UL, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(10, benchmark::stoi("1010", &pos, 2));
+    EXPECT_EQ(4UL, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(520, benchmark::stoi("1010", &pos, 8));
+    EXPECT_EQ(4UL, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1010, benchmark::stoi("1010", &pos, 10));
+    EXPECT_EQ(4UL, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(4112, benchmark::stoi("1010", &pos, 16));
+    EXPECT_EQ(4UL, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0xBEEF, benchmark::stoi("BEEF", &pos, 16));
+    EXPECT_EQ(4UL, pos);
+  }
 #ifndef BENCHMARK_HAS_NO_EXCEPTIONS
-{
-  ASSERT_THROW(std::ignore = benchmark::stoi("this is a test"),
-               std::invalid_argument);
-}
+  {
+    ASSERT_THROW(std::ignore = benchmark::stoi("this is a test"),
+                 std::invalid_argument);
+  }
 #endif
 }
 
-TEST(StringUtilTest, stod){{size_t pos = 0;
-EXPECT_EQ(0.0, benchmark::stod("0", &pos));
-EXPECT_EQ(1ul, pos);
-}
-{
-  size_t pos = 0;
-  EXPECT_EQ(-84.0, benchmark::stod("-84", &pos));
-  EXPECT_EQ(3ul, pos);
-}
-{
-  size_t pos = 0;
-  EXPECT_EQ(1234.0, benchmark::stod("1234", &pos));
-  EXPECT_EQ(4ul, pos);
-}
-{
-  size_t pos = 0;
-  EXPECT_EQ(1.5, benchmark::stod("1.5", &pos));
-  EXPECT_EQ(3ul, pos);
-}
-{
-  size_t pos = 0;
-  /* Note: exactly representable as double */
-  EXPECT_EQ(-1.25e+9, benchmark::stod("-1.25e+9", &pos));
-  EXPECT_EQ(8ul, pos);
-}
+TEST(StringUtilTest, stod) {
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0.0, benchmark::stod("0", &pos));
+    EXPECT_EQ(1UL, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(-84.0, benchmark::stod("-84", &pos));
+    EXPECT_EQ(3UL, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1234.0, benchmark::stod("1234", &pos));
+    EXPECT_EQ(4UL, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1.5, benchmark::stod("1.5", &pos));
+    EXPECT_EQ(3UL, pos);
+  }
+  {
+    size_t pos = 0;
+    /* Note: exactly representable as double */
+    EXPECT_EQ(-1.25e+9, benchmark::stod("-1.25e+9", &pos));
+    EXPECT_EQ(8UL, pos);
+  }
 #ifndef BENCHMARK_HAS_NO_EXCEPTIONS
-{
-  ASSERT_THROW(std::ignore = benchmark::stod("this is a test"),
-               std::invalid_argument);
-}
+  {
+    ASSERT_THROW(std::ignore = benchmark::stod("this is a test"),
+                 std::invalid_argument);
+  }
 #endif
 }
 
diff --git a/third-party/benchmark/test/templated_fixture_method_test.cc b/third-party/benchmark/test/templated_fixture_method_test.cc
new file mode 100644
index 0000000000000..06fc7d83e701b
--- /dev/null
+++ b/third-party/benchmark/test/templated_fixture_method_test.cc
@@ -0,0 +1,26 @@
+
+#include <cassert>
+#include <memory>
+
+#include "benchmark/benchmark.h"
+
+template <typename T>
+class MyFixture : public ::benchmark::Fixture {
+ public:
+  MyFixture() : data(0) {}
+
+  T data;
+
+  using type = T;
+};
+
+BENCHMARK_TEMPLATE_METHOD_F(MyFixture, Foo)(benchmark::State& st) {
+  for (auto _ : st) {
+    this->data += typename Base::type(1);
+  }
+}
+
+BENCHMARK_TEMPLATE_INSTANTIATE_F(MyFixture, Foo, int);
+BENCHMARK_TEMPLATE_INSTANTIATE_F(MyFixture, Foo, double);
+
+BENCHMARK_MAIN();
diff --git a/third-party/benchmark/test/time_unit_gtest.cc b/third-party/benchmark/test/time_unit_gtest.cc
index 484ecbcfb411f..0da11092b7ae8 100644
--- a/third-party/benchmark/test/time_unit_gtest.cc
+++ b/third-party/benchmark/test/time_unit_gtest.cc
@@ -6,10 +6,10 @@ namespace internal {
 
 namespace {
 
-class DummyBenchmark : public Benchmark {
+class DummyBenchmark : public benchmark::Benchmark {
  public:
   DummyBenchmark() : Benchmark("dummy") {}
-  void Run(State&) override {}
+  void Run(State& /*state*/) override {}
 };
 
 TEST(DefaultTimeUnitTest, TimeUnitIsNotSet) {
diff --git a/third-party/benchmark/test/user_counters_tabular_test.cc b/third-party/benchmark/test/user_counters_tabular_test.cc
index cfc1ab069c78a..7db0e20822979 100644
--- a/third-party/benchmark/test/user_counters_tabular_test.cc
+++ b/third-party/benchmark/test/user_counters_tabular_test.cc
@@ -4,6 +4,7 @@
 #include "benchmark/benchmark.h"
 #include "output_test.h"
 
+namespace {
 // @todo: <jpmag> this checks the full output at once; the rule for
 // CounterSet1 was failing because it was not matching "^[-]+$".
 // @todo: <jpmag> check that the counters are vertically aligned.
@@ -64,7 +65,8 @@ ADD_CASES(TC_CSVOut, {{"%csv_header,"
 void BM_Counters_Tabular(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
@@ -375,7 +377,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/repeats:2/threads:2$",
 void BM_CounterRates_Tabular(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
@@ -415,7 +418,7 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_CounterRates_Tabular/threads:%int\",%csv_report,"
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckTabularRate(Results const& e) {
-  double t = e.DurationCPUTime();
+  double t = e.DurationCPUTime() / e.NumThreads();
   CHECK_FLOAT_COUNTER_VALUE(e, "Foo", EQ, 1. / t, 0.001);
   CHECK_FLOAT_COUNTER_VALUE(e, "Bar", EQ, 2. / t, 0.001);
   CHECK_FLOAT_COUNTER_VALUE(e, "Baz", EQ, 4. / t, 0.001);
@@ -553,9 +556,13 @@ void CheckSet2(Results const& e) {
   CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 40);
 }
 CHECK_BENCHMARK_RESULTS("BM_CounterSet2_Tabular", &CheckSet2);
+}  // end namespace
 
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
 // ========================================================================= //
 
-int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
+int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+  RunOutputTests(argc, argv);
+}
diff --git a/third-party/benchmark/test/user_counters_test.cc b/third-party/benchmark/test/user_counters_test.cc
index 22252acbf6a22..a8af0877cc19e 100644
--- a/third-party/benchmark/test/user_counters_test.cc
+++ b/third-party/benchmark/test/user_counters_test.cc
@@ -21,7 +21,7 @@ ADD_CASES(TC_CSVOut, {{"%csv_header,\"bar\",\"foo\""}});
 // ========================================================================= //
 // ------------------------- Simple Counters Output ------------------------ //
 // ========================================================================= //
-
+namespace {
 void BM_Counters_Simple(benchmark::State& state) {
   for (auto _ : state) {
   }
@@ -56,6 +56,7 @@ void CheckSimple(Results const& e) {
   CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. * its, 0.001);
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_Simple", &CheckSimple);
+}  // end namespace
 
 // ========================================================================= //
 // --------------------- Counters+Items+Bytes/s Output --------------------- //
@@ -63,11 +64,11 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_Simple", &CheckSimple);
 
 namespace {
 int num_calls1 = 0;
-}
 void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
   state.counters["foo"] = 1;
@@ -111,15 +112,17 @@ void CheckBytesAndItemsPSec(Results const& e) {
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_WithBytesAndItemsPSec",
                         &CheckBytesAndItemsPSec);
+}  // end namespace
 
 // ========================================================================= //
 // ------------------------- Rate Counters Output -------------------------- //
 // ========================================================================= //
-
+namespace {
 void BM_Counters_Rate(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
@@ -155,15 +158,18 @@ void CheckRate(Results const& e) {
   CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / t, 0.001);
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_Rate", &CheckRate);
+}  // end namespace
 
 // ========================================================================= //
 // ----------------------- Inverted Counters Output ------------------------ //
 // ========================================================================= //
 
+namespace {
 void BM_Invert(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
@@ -196,15 +202,18 @@ void CheckInvert(Results const& e) {
   CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 0.0001, 0.0001);
 }
 CHECK_BENCHMARK_RESULTS("BM_Invert", &CheckInvert);
+}  // end namespace
 
 // ========================================================================= //
 // --------------------- InvertedRate Counters Output ---------------------- //
 // ========================================================================= //
 
+namespace {
 void BM_Counters_InvertedRate(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
@@ -243,11 +252,13 @@ void CheckInvertedRate(Results const& e) {
   CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, t / 8192.0, 0.001);
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_InvertedRate", &CheckInvertedRate);
+}  // end namespace
 
 // ========================================================================= //
 // ------------------------- Thread Counters Output ------------------------ //
 // ========================================================================= //
 
+namespace {
 void BM_Counters_Threads(benchmark::State& state) {
   for (auto _ : state) {
   }
@@ -283,11 +294,13 @@ void CheckThreads(Results const& e) {
   CHECK_COUNTER_VALUE(e, int, "bar", EQ, 2 * e.NumThreads());
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_Threads/threads:%int", &CheckThreads);
+}  // end namespace
 
 // ========================================================================= //
 // ---------------------- ThreadAvg Counters Output ------------------------ //
 // ========================================================================= //
 
+namespace {
 void BM_Counters_AvgThreads(benchmark::State& state) {
   for (auto _ : state) {
   }
@@ -325,15 +338,18 @@ void CheckAvgThreads(Results const& e) {
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreads/threads:%int",
                         &CheckAvgThreads);
+}  // end namespace
 
 // ========================================================================= //
 // ---------------------- ThreadAvg Counters Output ------------------------ //
 // ========================================================================= //
 
+namespace {
 void BM_Counters_AvgThreadsRate(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
@@ -365,16 +381,20 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreadsRate/"
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckAvgThreadsRate(Results const& e) {
-  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / e.DurationCPUTime(), 0.001);
-  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / e.DurationCPUTime(), 0.001);
+  // this (and not real time) is the time used
+  double t = e.DurationCPUTime() / e.NumThreads();
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / t, 0.001);
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreadsRate/threads:%int",
                         &CheckAvgThreadsRate);
+}  // end namespace
 
 // ========================================================================= //
 // ------------------- IterationInvariant Counters Output ------------------ //
 // ========================================================================= //
 
+namespace {
 void BM_Counters_IterationInvariant(benchmark::State& state) {
   for (auto _ : state) {
   }
@@ -413,15 +433,18 @@ void CheckIterationInvariant(Results const& e) {
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_IterationInvariant",
                         &CheckIterationInvariant);
+}  // end namespace
 
 // ========================================================================= //
 // ----------------- IterationInvariantRate Counters Output ---------------- //
 // ========================================================================= //
 
+namespace {
 void BM_Counters_kIsIterationInvariantRate(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
@@ -463,11 +486,13 @@ void CheckIsIterationInvariantRate(Results const& e) {
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_kIsIterationInvariantRate",
                         &CheckIsIterationInvariantRate);
+}  // end namespace
 
 // ========================================================================= //
 // --------------------- AvgIterations Counters Output --------------------- //
 // ========================================================================= //
 
+namespace {
 void BM_Counters_AvgIterations(benchmark::State& state) {
   for (auto _ : state) {
   }
@@ -505,15 +530,18 @@ void CheckAvgIterations(Results const& e) {
   CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / its, 0.001);
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_AvgIterations", &CheckAvgIterations);
+}  // end namespace
 
 // ========================================================================= //
 // ------------------- AvgIterationsRate Counters Output ------------------- //
 // ========================================================================= //
 
+namespace {
 void BM_Counters_kAvgIterationsRate(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    auto iterations = double(state.iterations()) * double(state.iterations());
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
     benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
@@ -553,9 +581,13 @@ void CheckAvgIterationsRate(Results const& e) {
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_kAvgIterationsRate",
                         &CheckAvgIterationsRate);
+}  // end namespace
 
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
 // ========================================================================= //
 
-int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
+int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+  RunOutputTests(argc, argv);
+}
diff --git a/third-party/benchmark/test/user_counters_thousands_test.cc b/third-party/benchmark/test/user_counters_thousands_test.cc
index fc153835f8006..0ef78d37875ce 100644
--- a/third-party/benchmark/test/user_counters_thousands_test.cc
+++ b/third-party/benchmark/test/user_counters_thousands_test.cc
@@ -4,6 +4,7 @@
 #include "benchmark/benchmark.h"
 #include "output_test.h"
 
+namespace {
 // ========================================================================= //
 // ------------------------ Thousands Customisation ------------------------ //
 // ========================================================================= //
@@ -166,8 +167,9 @@ ADD_CASES(
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckThousands(Results const& e) {
-  if (e.name != "BM_Counters_Thousands/repeats:2")
+  if (e.name != "BM_Counters_Thousands/repeats:2") {
     return;  // Do not check the aggregates!
+  }
 
   // check that the values are within 0.01% of the expected values
   CHECK_FLOAT_COUNTER_VALUE(e, "t0_1000000DefaultBase", EQ, 1000 * 1000,
@@ -178,9 +180,13 @@ void CheckThousands(Results const& e) {
   CHECK_FLOAT_COUNTER_VALUE(e, "t4_1048576Base1024", EQ, 1024 * 1024, 0.0001);
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_Thousands", &CheckThousands);
+}  // end namespace
 
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
 // ========================================================================= //
 
-int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
+int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+  RunOutputTests(argc, argv);
+}
diff --git a/third-party/benchmark/test/user_counters_threads_test.cc b/third-party/benchmark/test/user_counters_threads_test.cc
new file mode 100644
index 0000000000000..e2e5ade460a7f
--- /dev/null
+++ b/third-party/benchmark/test/user_counters_threads_test.cc
@@ -0,0 +1,622 @@
+
+#undef NDEBUG
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// ========================================================================= //
+// ---------------------- Testing Prologue Output -------------------------- //
+// ========================================================================= //
+
+// clang-format off
+
+ADD_CASES(TC_ConsoleOut,  // console header: dash rule, column titles, dash rule
+          {{"^[-]+$", MR_Next},
+           {"^Benchmark %s Time %s CPU %s Iterations UserCounters...$", MR_Next},
+           {"^[-]+$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"%csv_header,\"bar\",\"foo\""}});  // CSV header + bar, foo
+
+// clang-format on
+
+// ========================================================================= //
+// ------------------------- Simple Counters Output ------------------------ //
+// ========================================================================= //
+
+namespace {  // counters assigned as plain numbers (no Counter flags)
+void BM_Counters_Simple(benchmark::State& state) {
+  for (auto _ : state) {  // empty timed body
+  }
+  state.counters["foo"] = 1;
+  state.counters["bar"] = 2 * static_cast<double>(state.iterations());
+}
+BENCHMARK(BM_Counters_Simple)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Simple/threads:%int %console_report "
+                           "bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Simple/threads:%int\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Simple/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": %int,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(
+    TC_CSVOut,
+    {{"^\"BM_Counters_Simple/threads:%int\",%csv_report,%float,%float$"}});
+// Asserts foo == 1 * NumThreads and bar within 0.1% of 2 * NumIterations.
+// A named function because VS2013 does not allow a lambda argument here.
+void CheckSimple(Results const& e) {
+  double its = e.NumIterations();
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1 * e.NumThreads());
+  // check that the value of bar is within 0.1% of the expected value
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. * its, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Simple/threads:%int", &CheckSimple);
+}  // end namespace
+
+// ========================================================================= //
+// --------------------- Counters+Items+Bytes/s Output --------------------- //
+// ========================================================================= //
+
+namespace {  // plain counter plus SetBytesProcessed/SetItemsProcessed
+void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
+    benchmark::DoNotOptimize(iterations);
+  }
+  state.counters["foo"] = 1;
+  state.SetBytesProcessed(364);
+  state.SetItemsProcessed(150);
+}
+BENCHMARK(BM_Counters_WithBytesAndItemsPSec)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Counters_WithBytesAndItemsPSec/threads:%int %console_report "
+            "bytes_per_second=%hrfloat/s "
+            "foo=%hrfloat items_per_second=%hrfloat/s$"}});
+ADD_CASES(
+    TC_JSONOut,
+    {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec/threads:%int\",$"},
+     {"\"family_index\": 1,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": \"BM_Counters_WithBytesAndItemsPSec/threads:%int\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 1,$", MR_Next},
+     {"\"repetition_index\": 0,$", MR_Next},
+     {"\"threads\": %int,$", MR_Next},
+     {"\"iterations\": %int,$", MR_Next},
+     {"\"real_time\": %float,$", MR_Next},
+     {"\"cpu_time\": %float,$", MR_Next},
+     {"\"time_unit\": \"ns\",$", MR_Next},
+     {"\"bytes_per_second\": %float,$", MR_Next},
+     {"\"foo\": %float,$", MR_Next},
+     {"\"items_per_second\": %float$", MR_Next},
+     {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_WithBytesAndItemsPSec/threads:%int\","
+                       "%csv_bytes_items_report,,%float$"}});
+// Asserts foo == NumThreads and both rates ~= (value * NumThreads) / t.
+// A named function because VS2013 does not allow a lambda argument here.
+void CheckBytesAndItemsPSec(Results const& e) {
+  // CPU time divided by NumThreads (and not real time) is the time used
+  double t = e.DurationCPUTime() / e.NumThreads();
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1 * e.NumThreads());
+  // check that the values are within 0.1% of the expected values
+  CHECK_FLOAT_RESULT_VALUE(e, "bytes_per_second", EQ,
+                           (364. * e.NumThreads()) / t, 0.001);
+  CHECK_FLOAT_RESULT_VALUE(e, "items_per_second", EQ,
+                           (150. * e.NumThreads()) / t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_WithBytesAndItemsPSec/threads:%int",
+                        &CheckBytesAndItemsPSec);
+}  // end namespace
+
+// ========================================================================= //
+// ------------------------- Rate Counters Output -------------------------- //
+// ========================================================================= //
+namespace {  // counters flagged kIsRate
+void BM_Counters_Rate(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
+    benchmark::DoNotOptimize(iterations);
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kIsRate};
+  state.counters["bar"] = bm::Counter{2, bm::Counter::kIsRate};
+}
+BENCHMARK(BM_Counters_Rate)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Rate/threads:%int %console_report "
+                           "bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Rate/threads:%int\",$"},
+           {"\"family_index\": 2,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Rate/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": %int,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Rate/threads:%int\",%csv_report,%float,%float$"}});
+// Asserts foo ~= 1 * NumThreads / t and bar ~= 2 * NumThreads / t.
+// A named function because VS2013 does not allow a lambda argument here.
+void CheckRate(Results const& e) {
+  // CPU time divided by NumThreads (and not real time) is the time used
+  double t = e.DurationCPUTime() / e.NumThreads();
+  // check that the values are within 0.1% of the expected values
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, (1. * e.NumThreads()) / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, (2. * e.NumThreads()) / t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Rate/threads:%int", &CheckRate);
+}  // end namespace
+
+// ========================================================================= //
+// ----------------------- Inverted Counters Output ------------------------ //
+// ========================================================================= //
+
+namespace {  // counters flagged kInvert
+void BM_Invert(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
+    benchmark::DoNotOptimize(iterations);
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{0.0001, bm::Counter::kInvert};
+  state.counters["bar"] = bm::Counter{10000, bm::Counter::kInvert};
+}
+BENCHMARK(BM_Invert)->ThreadRange(1, 8);
+ADD_CASES(
+    TC_ConsoleOut,
+    {{"^BM_Invert/threads:%int %console_report bar=%hrfloatu foo=%hrfloatk$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Invert/threads:%int\",$"},
+                       {"\"family_index\": 3,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Invert/threads:%int\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": %int,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Invert/threads:%int\",%csv_report,%float,%float$"}});
+// Asserts each counter ~= 1 / (assigned value * NumThreads), tolerance 0.0001.
+// A named function because VS2013 does not allow a lambda argument here.
+void CheckInvert(Results const& e) {
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / (0.0001 * e.NumThreads()),
+                            0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 1. / (10000 * e.NumThreads()),
+                            0.0001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Invert/threads:%int", &CheckInvert);
+}  // end namespace
+
+// ========================================================================= //
+// --------------------- InvertedRate Counters Output ---------------------- //
+// ========================================================================= //
+
+namespace {  // counters flagged kIsRate | kInvert
+void BM_Counters_InvertedRate(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
+    benchmark::DoNotOptimize(iterations);
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] =
+      bm::Counter{1, bm::Counter::kIsRate | bm::Counter::kInvert};
+  state.counters["bar"] =
+      bm::Counter{8192, bm::Counter::kIsRate | bm::Counter::kInvert};
+}
+BENCHMARK(BM_Counters_InvertedRate)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Counters_InvertedRate/threads:%int %console_report "
+            "bar=%hrfloats foo=%hrfloats$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_InvertedRate/threads:%int\",$"},
+           {"\"family_index\": 4,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_InvertedRate/threads:%int\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": %int,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_InvertedRate/"
+                       "threads:%int\",%csv_report,%float,%float$"}});
+// Asserts foo ~= t / NumThreads and bar ~= t / (8192 * NumThreads).
+// A named function because VS2013 does not allow a lambda argument here.
+void CheckInvertedRate(Results const& e) {
+  // CPU time divided by NumThreads (and not real time) is the time used
+  double t = e.DurationCPUTime() / e.NumThreads();
+  // check that the values are within 0.1% of the expected values
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, t / (e.NumThreads()), 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, t / (8192.0 * e.NumThreads()), 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_InvertedRate/threads:%int",
+                        &CheckInvertedRate);
+}  // end namespace
+
+// ========================================================================= //
+// ------------------------- Thread Counters Output ------------------------ //
+// ========================================================================= //
+
+namespace {  // plain counters, checked against the thread count
+void BM_Counters_Threads(benchmark::State& state) {
+  for (auto _ : state) {  // empty timed body
+  }
+  state.counters["foo"] = 1;
+  state.counters["bar"] = 2;
+}
+BENCHMARK(BM_Counters_Threads)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Threads/threads:%int %console_report "
+                           "bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"},
+           {"\"family_index\": 5,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Threads/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": %int,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(
+    TC_CSVOut,
+    {{"^\"BM_Counters_Threads/threads:%int\",%csv_report,%float,%float$"}});
+// Asserts foo == NumThreads and bar == 2 * NumThreads (exact int compare).
+// A named function because VS2013 does not allow a lambda argument here.
+void CheckThreads(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, e.NumThreads());
+  CHECK_COUNTER_VALUE(e, int, "bar", EQ, 2 * e.NumThreads());
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Threads/threads:%int", &CheckThreads);
+}  // end namespace
+
+// ========================================================================= //
+// ---------------------- ThreadAvg Counters Output ------------------------ //
+// ========================================================================= //
+
+namespace {  // counters flagged kAvgThreads
+void BM_Counters_AvgThreads(benchmark::State& state) {
+  for (auto _ : state) {  // empty timed body
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreads};
+  state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreads};
+}
+BENCHMARK(BM_Counters_AvgThreads)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreads/threads:%int "
+                           "%console_report bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"},
+           {"\"family_index\": 6,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_AvgThreads/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": %int,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(
+    TC_CSVOut,
+    {{"^\"BM_Counters_AvgThreads/threads:%int\",%csv_report,%float,%float$"}});
+// Asserts foo == 1 and bar == 2 regardless of the thread count.
+// A named function because VS2013 does not allow a lambda argument here.
+void CheckAvgThreads(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1);
+  CHECK_COUNTER_VALUE(e, int, "bar", EQ, 2);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreads/threads:%int",
+                        &CheckAvgThreads);
+}  // end namespace
+
+// ========================================================================= //
+// --------------------- ThreadAvgRate Counters Output --------------------- //
+// ========================================================================= //
+
+namespace {  // counters flagged kAvgThreadsRate
+void BM_Counters_AvgThreadsRate(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
+    benchmark::DoNotOptimize(iterations);
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreadsRate};
+  state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreadsRate};
+}
+BENCHMARK(BM_Counters_AvgThreadsRate)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreadsRate/threads:%int "
+                           "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"},
+           {"\"family_index\": 7,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": %int,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreadsRate/"
+                       "threads:%int\",%csv_report,%float,%float$"}});
+// Asserts foo ~= 1 / t and bar ~= 2 / t, with a tolerance argument of 0.001.
+// A named function because VS2013 does not allow a lambda argument here.
+void CheckAvgThreadsRate(Results const& e) {
+  // CPU time divided by NumThreads (and not real time) is the time used
+  double t = e.DurationCPUTime() / e.NumThreads();
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreadsRate/threads:%int",
+                        &CheckAvgThreadsRate);
+}  // end namespace
+
+// ========================================================================= //
+// ------------------- IterationInvariant Counters Output ------------------ //
+// ========================================================================= //
+
+namespace {  // counters flagged kIsIterationInvariant
+void BM_Counters_IterationInvariant(benchmark::State& state) {
+  for (auto _ : state) {  // empty timed body
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kIsIterationInvariant};
+  state.counters["bar"] = bm::Counter{2, bm::Counter::kIsIterationInvariant};
+}
+BENCHMARK(BM_Counters_IterationInvariant)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Counters_IterationInvariant/threads:%int %console_report "
+            "bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_IterationInvariant/threads:%int\",$"},
+           {"\"family_index\": 8,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_IterationInvariant/threads:%int\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": %int,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_IterationInvariant/"
+                       "threads:%int\",%csv_report,%float,%float$"}});
+// Asserts foo ~= its * NumThreads and bar ~= 2 * its * NumThreads.
+// A named function because VS2013 does not allow a lambda argument here.
+void CheckIterationInvariant(Results const& e) {
+  double its = e.NumIterations();
+  // check that the values are within 0.1% of the expected values
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, its * e.NumThreads(), 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. * its * e.NumThreads(), 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_IterationInvariant/threads:%int",
+                        &CheckIterationInvariant);
+}  // end namespace
+
+// ========================================================================= //
+// ----------------- IterationInvariantRate Counters Output ---------------- //
+// ========================================================================= //
+
+namespace {
+void BM_Counters_kIsIterationInvariantRate(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
+    benchmark::DoNotOptimize(iterations);
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] =
+      bm::Counter{1, bm::Counter::kIsIterationInvariantRate};
+  state.counters["bar"] =
+      bm::Counter{2, bm::Counter::kIsRate | bm::Counter::kIsIterationInvariant};
+}
+BENCHMARK(BM_Counters_kIsIterationInvariantRate)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Counters_kIsIterationInvariantRate/threads:%int "
+            "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(
+    TC_JSONOut,
+    {{"\"name\": \"BM_Counters_kIsIterationInvariantRate/threads:%int\",$"},
+     {"\"family_index\": 9,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": \"BM_Counters_kIsIterationInvariantRate/threads:%int\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 1,$", MR_Next},
+     {"\"repetition_index\": 0,$", MR_Next},
+     {"\"threads\": %int,$", MR_Next},
+     {"\"iterations\": %int,$", MR_Next},
+     {"\"real_time\": %float,$", MR_Next},
+     {"\"cpu_time\": %float,$", MR_Next},
+     {"\"time_unit\": \"ns\",$", MR_Next},
+     {"\"bar\": %float,$", MR_Next},
+     {"\"foo\": %float$", MR_Next},
+     {"}", MR_Next}});
+ADD_CASES(
+    TC_CSVOut,
+    {{"^\"BM_Counters_kIsIterationInvariantRate/threads:%int\",%csv_report,"
+      "%float,%float$"}});
+// Verifies the computed counter rates. Kept as a named function because VS2013
+// cannot take a lambda as the CHECK_BENCHMARK_RESULTS() callback argument.
+void CheckIsIterationInvariantRate(Results const& e) {
+  double its = e.NumIterations();
+  // CPU time per thread; this (and not real time) is the rate denominator
+  double t = e.DurationCPUTime() / e.NumThreads();
+  // expected: value * iterations * threads / t, within 0.1% (0.001)
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, its * 1. * e.NumThreads() / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, its * 2. * e.NumThreads() / t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_kIsIterationInvariantRate/threads:%int",
+                        &CheckIsIterationInvariantRate);
+}  // end namespace
+
+// ========================================================================= //
+// --------------------- AvgIterations Counters Output --------------------- //
+// ========================================================================= //
+
+namespace {
+void BM_Counters_AvgIterations(benchmark::State& state) {
+  for (auto _ : state) {  // empty: expected values do not depend on time
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgIterations};
+  state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgIterations};
+}
+BENCHMARK(BM_Counters_AvgIterations)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Counters_AvgIterations/threads:%int %console_report "
+            "bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_AvgIterations/threads:%int\",$"},
+           {"\"family_index\": 10,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_AvgIterations/threads:%int\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": %int,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgIterations/"
+                       "threads:%int\",%csv_report,%float,%float$"}});
+// Verifies the averaged counters. Kept as a named function because VS2013
+// cannot take a lambda as the CHECK_BENCHMARK_RESULTS() callback argument.
+void CheckAvgIterations(Results const& e) {
+  double its = e.NumIterations();
+  // expected: value * threads / iterations, within 0.1% (0.001)
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. * e.NumThreads() / its, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. * e.NumThreads() / its, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_AvgIterations/threads:%int",
+                        &CheckAvgIterations);
+}  // end namespace
+
+// ========================================================================= //
+// ------------------- AvgIterationsRate Counters Output ------------------- //
+// ========================================================================= //
+
+namespace {
+void BM_Counters_kAvgIterationsRate(benchmark::State& state) {
+  for (auto _ : state) {
+    // Burn some CPU: the rate counters divide by CPU time, which must be > 0
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
+    benchmark::DoNotOptimize(iterations);
+  }
+  namespace bm = benchmark;  // foo: combined flag, bar: OR-ed pair
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgIterationsRate};
+  state.counters["bar"] =
+      bm::Counter{2, bm::Counter::kIsRate | bm::Counter::kAvgIterations};
+}
+BENCHMARK(BM_Counters_kAvgIterationsRate)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_kAvgIterationsRate/threads:%int "
+                           "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_kAvgIterationsRate/threads:%int\",$"},
+           {"\"family_index\": 11,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_kAvgIterationsRate/threads:%int\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": %int,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_kAvgIterationsRate/threads:%int\",%csv_report,"
+            "%float,%float$"}});
+// Verifies the averaged counter rates. Kept as a named function because VS2013
+// cannot take a lambda as the CHECK_BENCHMARK_RESULTS() callback argument.
+void CheckAvgIterationsRate(Results const& e) {
+  double its = e.NumIterations();
+  // CPU time per thread; this (and not real time) is the rate denominator
+  double t = e.DurationCPUTime() / e.NumThreads();
+  // expected: value * threads / iterations / t, within 0.1% (0.001)
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. * e.NumThreads() / its / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. * e.NumThreads() / its / t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_kAvgIterationsRate/threads:%int",
+                        &CheckAvgIterationsRate);
+}  // end namespace
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+  RunOutputTests(argc, argv);  // NOTE(review): presumably checks ADD_CASES
+}
diff --git a/third-party/benchmark/tools/compare.py b/third-party/benchmark/tools/compare.py
index 7572520cc0ca1..1a656345c21e4 100755
--- a/third-party/benchmark/tools/compare.py
+++ b/third-party/benchmark/tools/compare.py
@@ -21,8 +21,8 @@ def check_inputs(in1, in2, flags):
     """
     Perform checking on the user provided inputs and diagnose any abnormalities
     """
-    in1_kind, in1_err = util.classify_input_file(in1)
-    in2_kind, in2_err = util.classify_input_file(in2)
+    in1_kind, _ = util.classify_input_file(in1)
+    in2_kind, _ = util.classify_input_file(in2)
     output_file = util.find_benchmark_flag("--benchmark_out=", flags)
     output_type = util.find_benchmark_flag("--benchmark_out_format=", flags)
     if (
@@ -85,7 +85,10 @@ def create_parser():
         "-d",
         "--dump_to_json",
         dest="dump_to_json",
-        help="Additionally, dump benchmark comparison output to this file in JSON format.",
+        help=(
+            "Additionally, dump benchmark comparison output to this file in"
+            " JSON format."
+        ),
     )
 
     utest = parser.add_argument_group()
@@ -94,8 +97,15 @@ def create_parser():
         dest="utest",
         default=True,
         action="store_false",
-        help="The tool can do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample.\nWARNING: requires **LARGE** (no less than {}) number of repetitions to be meaningful!\nThe test is being done by default, if at least {} repetitions were done.\nThis option can disable the U Test.".format(
-            report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS
+        help=(
+            "The tool can do a two-tailed Mann-Whitney U test with the null"
+            " hypothesis that it is equally likely that a randomly selected"
+            " value from one sample will be less than or greater than a"
+            " randomly selected value from a second sample.\nWARNING: requires"
+            f" **LARGE** (no less than {report.UTEST_OPTIMAL_REPETITIONS})"
+            " number of repetitions to be meaningful!\nThe test is being done"
+            f" by default, if at least {report.UTEST_MIN_REPETITIONS}"
+            " repetitions were done.\nThis option can disable the U Test."
         ),
     )
     alpha_default = 0.05
@@ -105,7 +115,9 @@ def create_parser():
         default=alpha_default,
         type=float,
         help=(
-            "significance level alpha. if the calculated p-value is below this value, then the result is said to be statistically significant and the null hypothesis is rejected.\n(default: %0.4f)"
+            "significance level alpha. if the calculated p-value is below this"
+            " value, then the result is said to be statistically significant"
+            " and the null hypothesis is rejected.\n(default: %0.4f)"
         )
         % alpha_default,
     )
@@ -116,7 +128,10 @@ def create_parser():
 
     parser_a = subparsers.add_parser(
         "benchmarks",
-        help="The most simple use-case, compare all the output of these two benchmarks",
+        help=(
+            "The most simple use-case, compare all the output of these two"
+            " benchmarks"
+        ),
     )
     baseline = parser_a.add_argument_group("baseline", "The benchmark baseline")
     baseline.add_argument(
@@ -180,7 +195,10 @@ def create_parser():
 
     parser_c = subparsers.add_parser(
         "benchmarksfiltered",
-        help="Compare filter one of first benchmark with filter two of the second benchmark",
+        help=(
+            "Compare filter one of first benchmark with filter two of the"
+            " second benchmark"
+        ),
     )
     baseline = parser_c.add_argument_group("baseline", "The benchmark baseline")
     baseline.add_argument(
@@ -205,7 +223,10 @@ def create_parser():
         metavar="test_contender",
         type=argparse.FileType("r"),
         nargs=1,
-        help="The second benchmark executable or JSON output file, that will be compared against the baseline",
+        help=(
+            "The second benchmark executable or JSON output file, that will be"
+            " compared against the baseline"
+        ),
     )
     contender.add_argument(
         "filter_contender",
diff --git a/third-party/benchmark/tools/gbench/report.py b/third-party/benchmark/tools/gbench/report.py
index 7158fd1654cb1..e143e45a71792 100644
--- a/third-party/benchmark/tools/gbench/report.py
+++ b/third-party/benchmark/tools/gbench/report.py
@@ -14,7 +14,7 @@
 from scipy.stats import gmean, mannwhitneyu
 
 
-class BenchmarkColor(object):
+class BenchmarkColor:
     def __init__(self, name, code):
         self.name = name
         self.code = code
@@ -249,8 +249,9 @@ def get_utest_color(pval):
     # We still got some results to show but issue a warning about it.
     if not utest["have_optimal_repetitions"]:
         dsc_color = BC_WARNING
-        dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
-            UTEST_OPTIMAL_REPETITIONS
+        dsc += (
+            f". WARNING: Results unreliable! {UTEST_OPTIMAL_REPETITIONS}+"
+            " repetitions recommended."
         )
 
     special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{}      {}"
@@ -260,7 +261,7 @@ def get_utest_color(pval):
             use_color,
             special_str,
             BC_HEADER,
-            "{}{}".format(bc_name, UTEST_COL_NAME),
+            f"{bc_name}{UTEST_COL_NAME}",
             first_col_width,
             get_utest_color(utest["time_pvalue"]),
             utest["time_pvalue"],
@@ -285,7 +286,7 @@ def get_difference_report(json1, json2, utest=False):
     partitions = partition_benchmarks(json1, json2)
     for partition in partitions:
         benchmark_name = partition[0][0]["name"]
-        label = partition[0][0]["label"] if "label" in partition[0][0] else ""
+        label = partition[0][0].get("label", "")
         time_unit = partition[0][0]["time_unit"]
         measurements = []
         utest_results = {}
@@ -329,11 +330,7 @@ def get_difference_report(json1, json2, utest=False):
         # time units which are not compatible with other time units in the
         # benchmark suite.
         if measurements:
-            run_type = (
-                partition[0][0]["run_type"]
-                if "run_type" in partition[0][0]
-                else ""
-            )
+            run_type = partition[0][0].get("run_type", "")
             aggregate_name = (
                 partition[0][0]["aggregate_name"]
                 if run_type == "aggregate"
@@ -403,12 +400,17 @@ def get_color(res):
     first_col_width = find_longest_name(json_diff_report)
     first_col_width = max(first_col_width, len("Benchmark"))
     first_col_width += len(UTEST_COL_NAME)
-    first_line = "{:<{}s}Time             CPU      Time Old      Time New       CPU Old       CPU New".format(
-        "Benchmark", 12 + first_col_width
+    fmt_str = (
+        "{:<{}s}Time             CPU      Time Old      Time New       CPU Old"
+        "       CPU New"
     )
+    first_line = fmt_str.format("Benchmark", 12 + first_col_width)
     output_strs = [first_line, "-" * len(first_line)]
 
-    fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
+    fmt_str = (
+        "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}"
+        "{endc}{:14.0f}{:14.0f}"
+    )
     for benchmark in json_diff_report:
         # *If* we were asked to only include aggregates,
         # and if it is non-aggregate, then don't print it.
@@ -464,7 +466,7 @@ def load_results(self):
             os.path.dirname(os.path.realpath(__file__)), "Inputs"
         )
         testOutput = os.path.join(testInputs, "test3_run0.json")
-        with open(testOutput, "r") as f:
+        with open(testOutput) as f:
             json = json.load(f)
         return json
 
@@ -480,8 +482,8 @@ def test_basic(self):
         print("\n")
         print("\n".join(output_lines))
         self.assertEqual(len(output_lines), len(expect_lines))
-        for i in range(0, len(output_lines)):
-            self.assertEqual(expect_lines[i], output_lines[i])
+        for i, output_line in enumerate(output_lines):
+            self.assertEqual(expect_lines[i], output_line)
 
 
 class TestReportDifference(unittest.TestCase):
@@ -495,9 +497,9 @@ def load_results():
             )
             testOutput1 = os.path.join(testInputs, "test1_run1.json")
             testOutput2 = os.path.join(testInputs, "test1_run2.json")
-            with open(testOutput1, "r") as f:
+            with open(testOutput1) as f:
                 json1 = json.load(f)
-            with open(testOutput2, "r") as f:
+            with open(testOutput2) as f:
                 json2 = json.load(f)
             return json1, json2
 
@@ -584,8 +586,8 @@ def test_json_diff_report_pretty_printing(self):
         print("\n")
         print("\n".join(output_lines_with_header))
         self.assertEqual(len(output_lines), len(expect_lines))
-        for i in range(0, len(output_lines)):
-            parts = [x for x in output_lines[i].split(" ") if x]
+        for i, output_line in enumerate(output_lines):
+            parts = [x for x in output_line.split(" ") if x]
             self.assertEqual(len(parts), 7)
             self.assertEqual(expect_lines[i], parts)
 
@@ -819,7 +821,9 @@ def test_json_diff_report_output(self):
             },
         ]
         self.assertEqual(len(self.json_diff_report), len(expected_output))
-        for out, expected in zip(self.json_diff_report, expected_output):
+        for out, expected in zip(
+            self.json_diff_report, expected_output, strict=True
+        ):
             self.assertEqual(out["name"], expected["name"])
             self.assertEqual(out["label"], expected["label"])
             self.assertEqual(out["time_unit"], expected["time_unit"])
@@ -837,7 +841,7 @@ def load_result():
                 os.path.dirname(os.path.realpath(__file__)), "Inputs"
             )
             testOutput = os.path.join(testInputs, "test2_run.json")
-            with open(testOutput, "r") as f:
+            with open(testOutput) as f:
                 json = json.load(f)
             return json
 
@@ -861,8 +865,8 @@ def test_json_diff_report_pretty_printing(self):
         print("\n")
         print("\n".join(output_lines_with_header))
         self.assertEqual(len(output_lines), len(expect_lines))
-        for i in range(0, len(output_lines)):
-            parts = [x for x in output_lines[i].split(" ") if x]
+        for i, output_line in enumerate(output_lines):
+            parts = [x for x in output_line.split(" ") if x]
             self.assertEqual(len(parts), 7)
             self.assertEqual(expect_lines[i], parts)
 
@@ -947,7 +951,9 @@ def test_json_diff_report(self):
             },
         ]
         self.assertEqual(len(self.json_diff_report), len(expected_output))
-        for out, expected in zip(self.json_diff_report, expected_output):
+        for out, expected in zip(
+            self.json_diff_report, expected_output, strict=True
+        ):
             self.assertEqual(out["name"], expected["name"])
             self.assertEqual(out["time_unit"], expected["time_unit"])
             assert_utest(self, out, expected)
@@ -965,9 +971,9 @@ def load_results():
             )
             testOutput1 = os.path.join(testInputs, "test3_run0.json")
             testOutput2 = os.path.join(testInputs, "test3_run1.json")
-            with open(testOutput1, "r") as f:
+            with open(testOutput1) as f:
                 json1 = json.load(f)
-            with open(testOutput2, "r") as f:
+            with open(testOutput2) as f:
                 json2 = json.load(f)
             return json1, json2
 
@@ -1025,8 +1031,8 @@ def test_json_diff_report_pretty_printing(self):
         print("\n")
         print("\n".join(output_lines_with_header))
         self.assertEqual(len(output_lines), len(expect_lines))
-        for i in range(0, len(output_lines)):
-            parts = [x for x in output_lines[i].split(" ") if x]
+        for i, output_line in enumerate(output_lines):
+            parts = [x for x in output_line.split(" ") if x]
             self.assertEqual(expect_lines[i], parts)
 
     def test_json_diff_report_pretty_printing_aggregates_only(self):
@@ -1081,8 +1087,8 @@ def test_json_diff_report_pretty_printing_aggregates_only(self):
         print("\n")
         print("\n".join(output_lines_with_header))
         self.assertEqual(len(output_lines), len(expect_lines))
-        for i in range(0, len(output_lines)):
-            parts = [x for x in output_lines[i].split(" ") if x]
+        for i, output_line in enumerate(output_lines):
+            parts = [x for x in output_line.split(" ") if x]
             self.assertEqual(expect_lines[i], parts)
 
     def test_json_diff_report(self):
@@ -1190,7 +1196,9 @@ def test_json_diff_report(self):
             },
         ]
         self.assertEqual(len(self.json_diff_report), len(expected_output))
-        for out, expected in zip(self.json_diff_report, expected_output):
+        for out, expected in zip(
+            self.json_diff_report, expected_output, strict=True
+        ):
             self.assertEqual(out["name"], expected["name"])
             self.assertEqual(out["time_unit"], expected["time_unit"])
             assert_utest(self, out, expected)
@@ -1210,9 +1218,9 @@ def load_results():
             )
             testOutput1 = os.path.join(testInputs, "test3_run0.json")
             testOutput2 = os.path.join(testInputs, "test3_run1.json")
-            with open(testOutput1, "r") as f:
+            with open(testOutput1) as f:
                 json1 = json.load(f)
-            with open(testOutput2, "r") as f:
+            with open(testOutput2) as f:
                 json2 = json.load(f)
             return json1, json2
 
@@ -1270,8 +1278,8 @@ def test_json_diff_report_pretty_printing(self):
         print("\n")
         print("\n".join(output_lines_with_header))
         self.assertEqual(len(output_lines), len(expect_lines))
-        for i in range(0, len(output_lines)):
-            parts = [x for x in output_lines[i].split(" ") if x]
+        for i, output_line in enumerate(output_lines):
+            parts = [x for x in output_line.split(" ") if x]
             self.assertEqual(expect_lines[i], parts)
 
     def test_json_diff_report(self):
@@ -1380,7 +1388,9 @@ def test_json_diff_report(self):
             },
         ]
         self.assertEqual(len(self.json_diff_report), len(expected_output))
-        for out, expected in zip(self.json_diff_report, expected_output):
+        for out, expected in zip(
+            self.json_diff_report, expected_output, strict=True
+        ):
             self.assertEqual(out["name"], expected["name"])
             self.assertEqual(out["time_unit"], expected["time_unit"])
             assert_utest(self, out, expected)
@@ -1398,9 +1408,9 @@ def load_results():
             )
             testOutput1 = os.path.join(testInputs, "test4_run0.json")
             testOutput2 = os.path.join(testInputs, "test4_run1.json")
-            with open(testOutput1, "r") as f:
+            with open(testOutput1) as f:
                 json1 = json.load(f)
-            with open(testOutput2, "r") as f:
+            with open(testOutput2) as f:
                 json2 = json.load(f)
             return json1, json2
 
@@ -1416,8 +1426,8 @@ def test_json_diff_report_pretty_printing(self):
         print("\n")
         print("\n".join(output_lines_with_header))
         self.assertEqual(len(output_lines), len(expect_lines))
-        for i in range(0, len(output_lines)):
-            parts = [x for x in output_lines[i].split(" ") if x]
+        for i, output_line in enumerate(output_lines):
+            parts = [x for x in output_line.split(" ") if x]
             self.assertEqual(expect_lines[i], parts)
 
     def test_json_diff_report(self):
@@ -1439,7 +1449,9 @@ def test_json_diff_report(self):
             }
         ]
         self.assertEqual(len(self.json_diff_report), len(expected_output))
-        for out, expected in zip(self.json_diff_report, expected_output):
+        for out, expected in zip(
+            self.json_diff_report, expected_output, strict=True
+        ):
             self.assertEqual(out["name"], expected["name"])
             self.assertEqual(out["time_unit"], expected["time_unit"])
             assert_utest(self, out, expected)
@@ -1456,7 +1468,7 @@ def load_result():
                 os.path.dirname(os.path.realpath(__file__)), "Inputs"
             )
             testOutput = os.path.join(testInputs, "test4_run.json")
-            with open(testOutput, "r") as f:
+            with open(testOutput) as f:
                 json = json.load(f)
             return json
 
@@ -1480,13 +1492,15 @@ def test_json_diff_report_pretty_printing(self):
             "88 family 1 instance 1 aggregate",
         ]
 
-        for n in range(len(self.json["benchmarks"]) ** 2):
+        for _n in range(len(self.json["benchmarks"]) ** 2):
             random.shuffle(self.json["benchmarks"])
             sorted_benchmarks = util.sort_benchmark_results(self.json)[
                 "benchmarks"
             ]
             self.assertEqual(len(expected_names), len(sorted_benchmarks))
-            for out, expected in zip(sorted_benchmarks, expected_names):
+            for out, expected in zip(
+                sorted_benchmarks, expected_names, strict=True
+            ):
                 self.assertEqual(out["name"], expected)
 
 
@@ -1503,12 +1517,12 @@ def load_results():
             )
             testOutput1 = os.path.join(testInputs, "test5_run0.json")
             testOutput2 = os.path.join(testInputs, "test5_run1.json")
-            with open(testOutput1, "r") as f:
+            with open(testOutput1) as f:
                 json1 = json.load(f)
                 json1["benchmarks"] = [
                     json1["benchmarks"][0] for i in range(1000)
                 ]
-            with open(testOutput2, "r") as f:
+            with open(testOutput2) as f:
                 json2 = json.load(f)
                 json2["benchmarks"] = [
                     json2["benchmarks"][0] for i in range(1000)
@@ -1535,8 +1549,8 @@ def test_json_diff_report_pretty_printing(self):
         )
         output_lines = output_lines_with_header[2:]
         found = False
-        for i in range(0, len(output_lines)):
-            parts = [x for x in output_lines[i].split(" ") if x]
+        for output_line in output_lines:
+            parts = [x for x in output_line.split(" ") if x]
             found = expect_line == parts
             if found:
                 break
@@ -1578,7 +1592,9 @@ def test_json_diff_report(self):
             },
         ]
         self.assertEqual(len(self.json_diff_report), len(expected_output))
-        for out, expected in zip(self.json_diff_report, expected_output):
+        for out, expected in zip(
+            self.json_diff_report, expected_output, strict=True
+        ):
             self.assertEqual(out["name"], expected["name"])
             self.assertEqual(out["time_unit"], expected["time_unit"])
             assert_utest(self, out, expected)
@@ -1602,7 +1618,7 @@ def assert_utest(unittest_instance, lhs, rhs):
 
 
 def assert_measurements(unittest_instance, lhs, rhs):
-    for m1, m2 in zip(lhs["measurements"], rhs["measurements"]):
+    for m1, m2 in zip(lhs["measurements"], rhs["measurements"], strict=False):
         unittest_instance.assertEqual(m1["real_time"], m2["real_time"])
         unittest_instance.assertEqual(m1["cpu_time"], m2["cpu_time"])
         # m1['time'] and m1['cpu'] hold values which are being calculated,
diff --git a/third-party/benchmark/tools/gbench/util.py b/third-party/benchmark/tools/gbench/util.py
index 4d061a3a1e344..2e91006be416c 100644
--- a/third-party/benchmark/tools/gbench/util.py
+++ b/third-party/benchmark/tools/gbench/util.py
@@ -1,5 +1,7 @@
-"""util.py - General utilities for running, loading, and processing benchmarks
+"""util.py - General utilities for running, loading, and processing
+benchmarks
 """
+
 import json
 import os
 import re
@@ -37,7 +39,7 @@ def is_executable_file(filename):
     elif sys.platform.startswith("win"):
         return magic_bytes == b"MZ"
     else:
-        return magic_bytes == b"\x7FELF"
+        return magic_bytes == b"\x7fELF"
 
 
 def is_json_file(filename):
@@ -46,7 +48,7 @@ def is_json_file(filename):
     'False' otherwise.
     """
     try:
-        with open(filename, "r") as f:
+        with open(filename) as f:
             json.load(f)
         return True
     except BaseException:
@@ -97,7 +99,8 @@ def find_benchmark_flag(prefix, benchmark_flags):
     if it is found return the arg it specifies. If specified more than once the
     last value is returned. If the flag is not found None is returned.
     """
-    assert prefix.startswith("--") and prefix.endswith("=")
+    assert prefix.startswith("--")
+    assert prefix.endswith("=")
     result = None
     for f in benchmark_flags:
         if f.startswith(prefix):
@@ -110,7 +113,8 @@ def remove_benchmark_flags(prefix, benchmark_flags):
     Return a new list containing the specified benchmark_flags except those
     with the specified prefix.
     """
-    assert prefix.startswith("--") and prefix.endswith("=")
+    assert prefix.startswith("--")
+    assert prefix.endswith("=")
     return [f for f in benchmark_flags if not f.startswith(prefix)]
 
 
@@ -133,17 +137,16 @@ def benchmark_wanted(benchmark):
         name = benchmark.get("run_name", None) or benchmark["name"]
         return re.search(benchmark_filter, name) is not None
 
-    with open(fname, "r") as f:
+    with open(fname) as f:
         results = json.load(f)
-        if "context" in results:
-            if "json_schema_version" in results["context"]:
-                json_schema_version = results["context"]["json_schema_version"]
-                if json_schema_version != 1:
-                    print(
-                        "In %s, got unnsupported JSON schema version: %i, expected 1"
-                        % (fname, json_schema_version)
-                    )
-                    sys.exit(1)
+        if "json_schema_version" in results.get("context", {}):
+            json_schema_version = results["context"]["json_schema_version"]
+            if json_schema_version != 1:
+                print(
+                    f"In {fname}, got unnsupported JSON schema version:"
+                    f" {json_schema_version}, expected 1"
+                )
+                sys.exit(1)
         if "benchmarks" in results:
             results["benchmarks"] = list(
                 filter(benchmark_wanted, results["benchmarks"])
@@ -157,9 +160,7 @@ def sort_benchmark_results(result):
     # From inner key to the outer key!
     benchmarks = sorted(
         benchmarks,
-        key=lambda benchmark: benchmark["repetition_index"]
-        if "repetition_index" in benchmark
-        else -1,
+        key=lambda benchmark: benchmark.get("repetition_index", -1),
     )
     benchmarks = sorted(
         benchmarks,
@@ -169,15 +170,11 @@ def sort_benchmark_results(result):
     )
     benchmarks = sorted(
         benchmarks,
-        key=lambda benchmark: benchmark["per_family_instance_index"]
-        if "per_family_instance_index" in benchmark
-        else -1,
+        key=lambda benchmark: benchmark.get("per_family_instance_index", -1),
     )
     benchmarks = sorted(
         benchmarks,
-        key=lambda benchmark: benchmark["family_index"]
-        if "family_index" in benchmark
-        else -1,
+        key=lambda benchmark: benchmark.get("family_index", -1),
     )
 
     result["benchmarks"] = benchmarks
@@ -197,11 +194,12 @@ def run_benchmark(exe_name, benchmark_flags):
         is_temp_output = True
         thandle, output_name = tempfile.mkstemp()
         os.close(thandle)
-        benchmark_flags = list(benchmark_flags) + [
-            "--benchmark_out=%s" % output_name
+        benchmark_flags = [
+            *list(benchmark_flags),
+            "--benchmark_out=%s" % output_name,
         ]
 
-    cmd = [exe_name] + benchmark_flags
+    cmd = [exe_name, *benchmark_flags]
     print("RUNNING: %s" % " ".join(cmd))
     exitCode = subprocess.call(cmd)
     if exitCode != 0:
diff --git a/third-party/benchmark/tools/libpfm.BUILD.bazel b/third-party/benchmark/tools/libpfm.BUILD.bazel
index 62695342aa81e..30b585452dd64 100644
--- a/third-party/benchmark/tools/libpfm.BUILD.bazel
+++ b/third-party/benchmark/tools/libpfm.BUILD.bazel
@@ -1,21 +1,241 @@
-# Build rule for libpfm, which is required to collect performance counters for
-# BENCHMARK_ENABLE_LIBPFM builds.
+"""Build rule for libpfm, which is required to collect performance counters for BENCHMARK_ENABLE_LIBPFM builds."""
 
-load("@rules_foreign_cc//foreign_cc:defs.bzl", "make")
+load("@rules_cc//cc:defs.bzl", "cc_library")
+
+AARCH32_SRCS_COMMON = [
+    "lib/pfmlib_arm.c",
+    "lib/pfmlib_arm_armv7_pmuv1.c",
+    "lib/pfmlib_arm_armv6.c",
+    "lib/pfmlib_arm_armv8.c",
+    "lib/pfmlib_tx2_unc_perf_event.c",
+]
+
+AARCH32_SRCS_LINUX = [
+    "lib/pfmlib_arm_perf_event.c",
+]
+
+AARCH64_SRCS_COMMON = [
+    "lib/pfmlib_arm.c",
+    "lib/pfmlib_arm_armv8.c",
+    "lib/pfmlib_tx2_unc_perf_event.c",
+]
+
+AARCH64_SRCS_LINUX = [
+    "lib/pfmlib_arm_perf_event.c",
+]
+
+MIPS_SRCS_COMMON = [
+    "lib/pfmlib_mips.c",
+    "lib/pfmlib_mips_74k.c",
+]
+
+MIPS_SRCS_LINUX = [
+    "lib/pfmlib_mips_perf_event.c",
+]
+
+POWERPC_SRCS_COMMON = [
+    "lib/pfmlib_powerpc.c",
+    "lib/pfmlib_power4.c",
+    "lib/pfmlib_ppc970.c",
+    "lib/pfmlib_power5.c",
+    "lib/pfmlib_power6.c",
+    "lib/pfmlib_power7.c",
+    "lib/pfmlib_torrent.c",
+    "lib/pfmlib_power8.c",
+    "lib/pfmlib_power9.c",
+    "lib/pfmlib_powerpc_nest.c",
+]
+
+POWERPC_SRCS_LINUX = [
+    "lib/pfmlib_powerpc_perf_event.c",
+]
+
+S390X_SRCS_COMMON = [
+    "lib/pfmlib_s390x_cpumf.c",
+]
+
+S390X_SRCS_LINUX = [
+    "lib/pfmlib_s390x_perf_event.c",
+]
+
+X86_64_SRCS_COMMON = [
+    "lib/pfmlib_amd64.c",
+    "lib/pfmlib_intel_core.c",
+    "lib/pfmlib_intel_x86.c",
+    "lib/pfmlib_intel_x86_arch.c",
+    "lib/pfmlib_intel_atom.c",
+    "lib/pfmlib_intel_nhm_unc.c",
+    "lib/pfmlib_intel_nhm.c",
+    "lib/pfmlib_intel_wsm.c",
+    "lib/pfmlib_intel_snb.c",
+    "lib/pfmlib_intel_snb_unc.c",
+    "lib/pfmlib_intel_ivb.c",
+    "lib/pfmlib_intel_ivb_unc.c",
+    "lib/pfmlib_intel_hsw.c",
+    "lib/pfmlib_intel_bdw.c",
+    "lib/pfmlib_intel_skl.c",
+    "lib/pfmlib_intel_icl.c",
+    "lib/pfmlib_intel_rapl.c",
+    "lib/pfmlib_intel_snbep_unc.c",
+    "lib/pfmlib_intel_snbep_unc_cbo.c",
+    "lib/pfmlib_intel_snbep_unc_ha.c",
+    "lib/pfmlib_intel_snbep_unc_imc.c",
+    "lib/pfmlib_intel_snbep_unc_pcu.c",
+    "lib/pfmlib_intel_snbep_unc_qpi.c",
+    "lib/pfmlib_intel_snbep_unc_ubo.c",
+    "lib/pfmlib_intel_snbep_unc_r2pcie.c",
+    "lib/pfmlib_intel_snbep_unc_r3qpi.c",
+    "lib/pfmlib_intel_ivbep_unc_cbo.c",
+    "lib/pfmlib_intel_ivbep_unc_ha.c",
+    "lib/pfmlib_intel_ivbep_unc_imc.c",
+    "lib/pfmlib_intel_ivbep_unc_pcu.c",
+    "lib/pfmlib_intel_ivbep_unc_qpi.c",
+    "lib/pfmlib_intel_ivbep_unc_ubo.c",
+    "lib/pfmlib_intel_ivbep_unc_r2pcie.c",
+    "lib/pfmlib_intel_ivbep_unc_r3qpi.c",
+    "lib/pfmlib_intel_ivbep_unc_irp.c",
+    "lib/pfmlib_intel_hswep_unc_cbo.c",
+    "lib/pfmlib_intel_hswep_unc_ha.c",
+    "lib/pfmlib_intel_hswep_unc_imc.c",
+    "lib/pfmlib_intel_hswep_unc_pcu.c",
+    "lib/pfmlib_intel_hswep_unc_qpi.c",
+    "lib/pfmlib_intel_hswep_unc_ubo.c",
+    "lib/pfmlib_intel_hswep_unc_r2pcie.c",
+    "lib/pfmlib_intel_hswep_unc_r3qpi.c",
+    "lib/pfmlib_intel_hswep_unc_irp.c",
+    "lib/pfmlib_intel_hswep_unc_sbo.c",
+    "lib/pfmlib_intel_bdx_unc_cbo.c",
+    "lib/pfmlib_intel_bdx_unc_ubo.c",
+    "lib/pfmlib_intel_bdx_unc_sbo.c",
+    "lib/pfmlib_intel_bdx_unc_ha.c",
+    "lib/pfmlib_intel_bdx_unc_imc.c",
+    "lib/pfmlib_intel_bdx_unc_irp.c",
+    "lib/pfmlib_intel_bdx_unc_pcu.c",
+    "lib/pfmlib_intel_bdx_unc_qpi.c",
+    "lib/pfmlib_intel_bdx_unc_r2pcie.c",
+    "lib/pfmlib_intel_bdx_unc_r3qpi.c",
+    "lib/pfmlib_intel_skx_unc_cha.c",
+    "lib/pfmlib_intel_skx_unc_iio.c",
+    "lib/pfmlib_intel_skx_unc_imc.c",
+    "lib/pfmlib_intel_skx_unc_irp.c",
+    "lib/pfmlib_intel_skx_unc_m2m.c",
+    "lib/pfmlib_intel_skx_unc_m3upi.c",
+    "lib/pfmlib_intel_skx_unc_pcu.c",
+    "lib/pfmlib_intel_skx_unc_ubo.c",
+    "lib/pfmlib_intel_skx_unc_upi.c",
+    "lib/pfmlib_intel_knc.c",
+    "lib/pfmlib_intel_slm.c",
+    "lib/pfmlib_intel_tmt.c",
+    "lib/pfmlib_intel_knl.c",
+    "lib/pfmlib_intel_knl_unc_imc.c",
+    "lib/pfmlib_intel_knl_unc_edc.c",
+    "lib/pfmlib_intel_knl_unc_cha.c",
+    "lib/pfmlib_intel_knl_unc_m2pcie.c",
+    "lib/pfmlib_intel_glm.c",
+    "lib/pfmlib_intel_netburst.c",
+    "lib/pfmlib_amd64_k7.c",
+    "lib/pfmlib_amd64_k8.c",
+    "lib/pfmlib_amd64_fam10h.c",
+    "lib/pfmlib_amd64_fam11h.c",
+    "lib/pfmlib_amd64_fam12h.c",
+    "lib/pfmlib_amd64_fam14h.c",
+    "lib/pfmlib_amd64_fam15h.c",
+    "lib/pfmlib_amd64_fam17h.c",
+    "lib/pfmlib_amd64_fam16h.c",
+]
+
+X86_SRCS_COMMON = X86_64_SRCS_COMMON + [
+    "lib/pfmlib_intel_coreduo.c",
+    "lib/pfmlib_intel_p6.c",
+]
 
 filegroup(
-    name = "pfm_srcs",
-    srcs = glob(["**"]),
+    name = "cpu_srcs",
+    srcs = select({
+        "@platforms//cpu:x86_32": X86_SRCS_COMMON,
+        "@platforms//cpu:x86_64": X86_64_SRCS_COMMON,
+        "@platforms//cpu:aarch32": AARCH32_SRCS_COMMON,
+        "@platforms//cpu:aarch64": AARCH64_SRCS_COMMON,
+        "@platforms//cpu:mips64": MIPS_SRCS_COMMON,
+        "@platforms//cpu:ppc32": POWERPC_SRCS_COMMON,
+        "@platforms//cpu:ppc64le": POWERPC_SRCS_COMMON,
+        "@platforms//cpu:ppc": POWERPC_SRCS_COMMON,
+        "@platforms//cpu:s390x": S390X_SRCS_COMMON,
+        "//conditions:default": [],
+    }),
 )
 
-make(
-    name = "libpfm",
-    lib_source = ":pfm_srcs",
-    lib_name = "libpfm",
+filegroup(
+    name = "linux_srcs",
+    srcs = select({
+        "@platforms//cpu:aarch32": AARCH32_SRCS_LINUX,
+        "@platforms//cpu:aarch64": AARCH64_SRCS_LINUX,
+        "@platforms//cpu:mips64": MIPS_SRCS_LINUX,
+        "@platforms//cpu:ppc32": POWERPC_SRCS_LINUX,
+        "@platforms//cpu:ppc64le": POWERPC_SRCS_LINUX,
+        "@platforms//cpu:ppc": POWERPC_SRCS_LINUX,
+        "@platforms//cpu:s390x": S390X_SRCS_LINUX,
+        "//conditions:default": [],
+    }),
+)
+
+filegroup(
+    name = "srcs",
+    srcs = [
+        "lib/pfmlib_common.c",
+        "lib/pfmlib_perf_event.c",
+        "lib/pfmlib_perf_event_pmu.c",
+        "lib/pfmlib_perf_event_priv.h",
+        "lib/pfmlib_perf_event_raw.c",
+        "lib/pfmlib_torrent.c",
+        "lib/pfmlib_tx2_unc_perf_event.c",
+        ":cpu_srcs",
+    ] + select({
+        "@platforms//os:linux": [":linux_srcs"],
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "pfm",
+    srcs = [
+        ":srcs",
+    ],
+    hdrs = glob([
+        "include/perfmon/*.h",
+    ]),
     copts = [
         "-Wno-format-truncation",
         "-Wno-use-after-free",
+        "-fPIC",
+        "-D_REENTRANT",
+        "-fvisibility=hidden",
+    ] + select({
+        "@platforms//cpu:aarch32": ["-DCONFIG_PFMLIB_ARCH_ARM"],
+        "@platforms//cpu:aarch64": ["-DCONFIG_PFMLIB_ARCH_ARM64"],
+        "@platforms//cpu:mips64": ["-DCONFIG_PFMLIB_ARCH_MIPS"],
+        "@platforms//cpu:ppc32": ["-DCONFIG_PFMLIB_ARCH_POWERPC"],
+        "@platforms//cpu:ppc64le": ["-DCONFIG_PFMLIB_ARCH_POWERPC"],
+        "@platforms//cpu:ppc": ["-DCONFIG_PFMLIB_ARCH_POWERPC"],
+        "@platforms//cpu:s390x": ["-DCONFIG_PFMLIB_ARCH_S390X"],
+        "//conditions:default": [],
+    }),
+    includes = [
+        "include",
+        "lib",
     ],
+    strip_include_prefix = "include",
+    textual_hdrs = glob([
+        "lib/**/*.h",
+    ]),
+    visibility = [
+        "//visibility:public",
+    ],
+)
+
+alias(
+    name = "libpfm",
+    actual = ":pfm",
     visibility = [
         "//visibility:public",
     ],
diff --git a/third-party/benchmark/tools/requirements.txt b/third-party/benchmark/tools/requirements.txt
index f32f35b8fbfda..12d5d9ce99603 100644
--- a/third-party/benchmark/tools/requirements.txt
+++ b/third-party/benchmark/tools/requirements.txt
@@ -1,2 +1,2 @@
-numpy == 1.25
-scipy == 1.10.0
+numpy == 2.4.1
+scipy == 1.17.0
diff --git a/third-party/benchmark/tools/strip_asm.py b/third-party/benchmark/tools/strip_asm.py
index bc3a774a79320..f49a8c85ac641 100755
--- a/third-party/benchmark/tools/strip_asm.py
+++ b/third-party/benchmark/tools/strip_asm.py
@@ -73,16 +73,16 @@ def process_identifiers(line):
     parts = re.split(r"([a-zA-Z0-9_]+)", line)
     new_line = ""
     for tk in parts:
-        if is_identifier(tk):
-            if tk.startswith("__Z"):
-                tk = tk[1:]
-            elif (
+        if is_identifier(tk) and (
+            tk.startswith("__Z")
+            or (
                 tk.startswith("_")
                 and len(tk) > 1
                 and tk[1].isalpha()
                 and tk[1] != "Z"
-            ):
-                tk = tk[1:]
+            )
+        ):
+            tk = tk[1:]
         new_line += tk
     return new_line
 
@@ -141,14 +141,14 @@ def main():
     parser.add_argument(
         "out", metavar="output", type=str, nargs=1, help="The output file"
     )
-    args, unknown_args = parser.parse_known_args()
+    args, _ = parser.parse_known_args()
     input = args.input[0]
     output = args.out[0]
     if not os.path.isfile(input):
         print("ERROR: input file '%s' does not exist" % input)
         sys.exit(1)
 
-    with open(input, "r") as f:
+    with open(input) as f:
         contents = f.read()
     new_contents = process_asm(contents)
     with open(output, "w") as f: