[llvm] Update Benchmark (PR #83488)

Mircea Trofin via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 4 06:48:46 PST 2024


https://github.com/mtrofin updated https://github.com/llvm/llvm-project/pull/83488

From adcb75acb48a28c7515209c12171d189ef5dd2b7 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Thu, 29 Feb 2024 12:59:15 -0800
Subject: [PATCH 1/2] update benchmark at
 1576991177ba97a4b2ff6c45950f1fa6e9aa678c

---
 third-party/benchmark/.ycm_extra_conf.py      |   8 +-
 third-party/benchmark/AUTHORS                 |  12 +-
 third-party/benchmark/CMakeLists.txt          | 100 ++--
 third-party/benchmark/CONTRIBUTORS            |  14 +-
 third-party/benchmark/README.md               |  13 +-
 third-party/benchmark/WORKSPACE               |  54 +-
 .../benchmark/bindings/python/build_defs.bzl  |   8 +-
 .../python/google_benchmark/__init__.py       |  58 +-
 .../python/google_benchmark/benchmark.cc      | 149 ++---
 .../python/google_benchmark/example.py        |   6 +-
 .../benchmark/bindings/python/pybind11.BUILD  |  20 -
 .../bindings/python/python_headers.BUILD      |   4 +
 .../bindings/python/requirements.txt          |   2 -
 .../benchmark/cmake/CXXFeatureCheck.cmake     |  29 +-
 .../benchmark/cmake/GetGitVersion.cmake       |  30 +-
 third-party/benchmark/cmake/GoogleTest.cmake  |  16 +-
 .../benchmark/cmake/Modules/FindPFM.cmake     |  38 +-
 third-party/benchmark/cmake/benchmark.pc.in   |   4 +-
 third-party/benchmark/docs/AssemblyTests.md   |   2 +
 third-party/benchmark/docs/_config.yml        |   4 +-
 third-party/benchmark/docs/dependencies.md    |  22 +-
 third-party/benchmark/docs/index.md           |   4 +-
 third-party/benchmark/docs/perf_counters.md   |  13 +-
 third-party/benchmark/docs/releasing.md       |  24 +-
 third-party/benchmark/docs/tools.md           | 140 +++++
 third-party/benchmark/docs/user_guide.md      | 158 ++++-
 .../benchmark/include/benchmark/benchmark.h   | 563 +++++++++++++-----
 third-party/benchmark/requirements.txt        |   3 -
 third-party/benchmark/setup.py                | 193 +++---
 third-party/benchmark/src/CMakeLists.txt      |  28 +-
 third-party/benchmark/src/benchmark.cc        | 294 +++++++--
 .../benchmark/src/benchmark_api_internal.cc   |  20 +-
 .../benchmark/src/benchmark_api_internal.h    |   3 +
 third-party/benchmark/src/benchmark_main.cc   |   1 +
 third-party/benchmark/src/benchmark_name.cc   |   5 +-
 .../benchmark/src/benchmark_register.cc       |  52 +-
 .../benchmark/src/benchmark_register.h        |   5 +-
 third-party/benchmark/src/benchmark_runner.cc | 201 ++++++-
 third-party/benchmark/src/benchmark_runner.h  |  33 +-
 third-party/benchmark/src/check.h             |  31 +-
 third-party/benchmark/src/colorprint.cc       |  46 +-
 third-party/benchmark/src/commandlineflags.cc |  19 +-
 third-party/benchmark/src/commandlineflags.h  |  39 +-
 third-party/benchmark/src/complexity.cc       |  43 +-
 third-party/benchmark/src/complexity.h        |   2 +-
 third-party/benchmark/src/console_reporter.cc |  34 +-
 third-party/benchmark/src/counter.cc          |   4 +-
 third-party/benchmark/src/csv_reporter.cc     |  23 +-
 third-party/benchmark/src/cycleclock.h        |  40 +-
 third-party/benchmark/src/internal_macros.h   |  17 +-
 third-party/benchmark/src/json_reporter.cc    |  36 +-
 third-party/benchmark/src/log.h               |  26 +-
 third-party/benchmark/src/perf_counters.cc    | 242 ++++++--
 third-party/benchmark/src/perf_counters.h     | 108 ++--
 third-party/benchmark/src/re.h                |   2 +-
 third-party/benchmark/src/reporter.cc         |  14 +-
 third-party/benchmark/src/sleep.cc            |  66 --
 third-party/benchmark/src/sleep.h             |  15 -
 third-party/benchmark/src/statistics.cc       |  38 +-
 third-party/benchmark/src/statistics.h        |  12 +-
 third-party/benchmark/src/string_util.cc      |  65 +-
 third-party/benchmark/src/string_util.h       |  11 +-
 third-party/benchmark/src/sysinfo.cc          | 450 +++++++++-----
 third-party/benchmark/src/thread_manager.h    |   4 +-
 third-party/benchmark/src/timers.cc           |  24 +-
 .../benchmark/test/AssemblyTests.cmake        |  21 +
 third-party/benchmark/test/CMakeLists.txt     | 127 ++--
 .../benchmark/test/args_product_test.cc       |   4 +-
 third-party/benchmark/test/basic_test.cc      |   9 +-
 third-party/benchmark/test/benchmark_gtest.cc |  14 +-
 .../benchmark/test/benchmark_name_gtest.cc    |   8 +
 .../benchmark_random_interleaving_gtest.cc    |   5 +-
 .../test/benchmark_setup_teardown_test.cc     |  18 +-
 third-party/benchmark/test/benchmark_test.cc  |  65 +-
 .../test/clobber_memory_assembly_test.cc      |   1 +
 third-party/benchmark/test/complexity_test.cc | 158 +++--
 .../benchmark/test/diagnostics_test.cc        |  15 +-
 .../test/donotoptimize_assembly_test.cc       |  40 ++
 .../benchmark/test/donotoptimize_test.cc      |  28 +-
 third-party/benchmark/test/filter_test.cc     |  31 +-
 third-party/benchmark/test/fixture_test.cc    |   6 +-
 third-party/benchmark/test/link_main_test.cc  |   3 +-
 third-party/benchmark/test/map_test.cc        |  10 +-
 .../benchmark/test/memory_manager_test.cc     |  11 +-
 .../benchmark/test/multiple_ranges_test.cc    |   4 +-
 third-party/benchmark/test/options_test.cc    |   6 +-
 third-party/benchmark/test/output_test.h      |  10 +-
 .../benchmark/test/output_test_helper.cc      |  47 +-
 .../benchmark/test/perf_counters_gtest.cc     | 266 +++++++--
 .../benchmark/test/perf_counters_test.cc      |  71 ++-
 .../benchmark/test/register_benchmark_test.cc |  26 +-
 .../benchmark/test/reporter_output_test.cc    |  18 +-
 .../benchmark/test/skip_with_error_test.cc    |  17 +-
 third-party/benchmark/test/spec_arg_test.cc   |  16 +-
 .../benchmark/test/statistics_gtest.cc        |   4 +-
 .../benchmark/test/string_util_gtest.cc       |  55 +-
 .../test/user_counters_tabular_test.cc        |  10 +-
 .../benchmark/test/user_counters_test.cc      |  28 +-
 .../test/user_counters_thousands_test.cc      |  32 +-
 third-party/benchmark/tools/compare.py        |  62 +-
 .../tools/gbench/Inputs/test1_run1.json       |   8 +
 .../tools/gbench/Inputs/test1_run2.json       |   8 +
 .../benchmark/tools/gbench/__init__.py        |   2 +-
 third-party/benchmark/tools/gbench/report.py  | 271 ++++++++-
 third-party/benchmark/tools/gbench/util.py    |  52 +-
 third-party/benchmark/tools/requirements.txt  |   3 +-
 third-party/benchmark/tools/strip_asm.py      |  73 ++-
 107 files changed, 3773 insertions(+), 1634 deletions(-)
 delete mode 100644 third-party/benchmark/bindings/python/pybind11.BUILD
 delete mode 100644 third-party/benchmark/bindings/python/requirements.txt
 delete mode 100644 third-party/benchmark/requirements.txt
 delete mode 100644 third-party/benchmark/src/sleep.cc
 delete mode 100644 third-party/benchmark/src/sleep.h

diff --git a/third-party/benchmark/.ycm_extra_conf.py b/third-party/benchmark/.ycm_extra_conf.py
index 1482c7b00202ea..caf257f0540e19 100644
--- a/third-party/benchmark/.ycm_extra_conf.py
+++ b/third-party/benchmark/.ycm_extra_conf.py
@@ -1,4 +1,5 @@
 import os
+
 import ycm_core
 
 # These are the compilation flags that will be used in case there's no
@@ -91,7 +92,9 @@ def GetCompilationInfoForFile(filename):
         for extension in SOURCE_EXTENSIONS:
             replacement_file = basename + extension
             if os.path.exists(replacement_file):
-                compilation_info = database.GetCompilationInfoForFile(replacement_file)
+                compilation_info = database.GetCompilationInfoForFile(
+                    replacement_file
+                )
                 if compilation_info.compiler_flags_:
                     return compilation_info
         return None
@@ -107,7 +110,8 @@ def FlagsForFile(filename, **kwargs):
             return None
 
         final_flags = MakeRelativePathsInFlagsAbsolute(
-            compilation_info.compiler_flags_, compilation_info.compiler_working_dir_
+            compilation_info.compiler_flags_,
+            compilation_info.compiler_working_dir_,
         )
     else:
         relative_to = DirectoryOfThisScript()
diff --git a/third-party/benchmark/AUTHORS b/third-party/benchmark/AUTHORS
index 54770f35499ce3..2170e46fd4a051 100644
--- a/third-party/benchmark/AUTHORS
+++ b/third-party/benchmark/AUTHORS
@@ -13,6 +13,7 @@ Alex Steele <steeleal123 at gmail.com>
 Andriy Berestovskyy <berestovskyy at gmail.com>
 Arne Beer <arne at twobeer.de>
 Carto
+Cezary Skrzyński <czars1988 at gmail.com>
 Christian Wassermann <christian_wassermann at web.de>
 Christopher Seymour <chris.j.seymour at hotmail.com>
 Colin Braley <braley.colin at gmail.com>
@@ -27,10 +28,13 @@ Eric Backus <eric_backus at alum.mit.edu>
 Eric Fiselier <eric at efcs.ca>
 Eugene Zhuk <eugene.zhuk at gmail.com>
 Evgeny Safronov <division494 at gmail.com>
+Fabien Pichot <pichot.fabien at gmail.com>
 Federico Ficarelli <federico.ficarelli at gmail.com>
 Felix Homann <linuxaudio at showlabor.de>
+Gergely Meszaros <maetveis at gmail.com>
 Gergő Szitár <szitar.gergo at gmail.com>
 Google Inc.
+Henrique Bucher <hbucher at gmail.com>
 International Business Machines Corporation
 Ismael Jimenez Martinez <ismael.jimenez.martinez at gmail.com>
 Jern-Kuan Leong <jernkuan at gmail.com>
@@ -41,8 +45,11 @@ Jussi Knuuttila <jussi.knuuttila at gmail.com>
 Kaito Udagawa <umireon at gmail.com>
 Kishan Kumar <kumar.kishan at outlook.com>
 Lei Xu <eddyxu at gmail.com>
+Marcel Jacobse <mjacobse at uni-bremen.de>
 Matt Clarkson <mattyclarkson at gmail.com>
 Maxim Vafin <maxvafin at gmail.com>
+Mike Apodaca <gatorfax at gmail.com>
+Min-Yih Hsu <yihshyng223 at gmail.com>
 MongoDB Inc.
 Nick Hutchinson <nshutchinson at gmail.com>
 Norman Heino <norman.heino at gmail.com>
@@ -50,13 +57,16 @@ Oleksandr Sochka <sasha.sochka at gmail.com>
 Ori Livneh <ori.livneh at gmail.com>
 Paul Redmond <paul.redmond at gmail.com>
 Radoslav Yovchev <radoslav.tm at gmail.com>
+Raghu Raja <raghu at enfabrica.net>
+Rainer Orth <ro at cebitec.uni-bielefeld.de>
 Roman Lebedev <lebedev.ri at gmail.com>
 Sayan Bhattacharjee <aero.sayan at gmail.com>
+Shapr3D <google-contributors at shapr3d.com>
 Shuo Chen <chenshuo at chenshuo.com>
+Staffan Tjernstrom <staffantj at gmail.com>
 Steinar H. Gunderson <sgunderson at bigfoot.com>
 Stripe, Inc.
 Tobias Schmidt <tobias.schmidt at in.tum.de>
 Yixuan Qiu <yixuanq at gmail.com>
 Yusuke Suzuki <utatane.tea at gmail.com>
 Zbigniew Skowron <zbychs at gmail.com>
-Min-Yih Hsu <yihshyng223 at gmail.com>
diff --git a/third-party/benchmark/CMakeLists.txt b/third-party/benchmark/CMakeLists.txt
index 8af49406d052f3..d9bcc6a4939bee 100644
--- a/third-party/benchmark/CMakeLists.txt
+++ b/third-party/benchmark/CMakeLists.txt
@@ -1,19 +1,7 @@
-cmake_minimum_required (VERSION 3.5.1)
-
-foreach(p
-    CMP0048 # OK to clear PROJECT_VERSION on project()
-    CMP0054 # CMake 3.1
-    CMP0056 # export EXE_LINKER_FLAGS to try_run
-    CMP0057 # Support no if() IN_LIST operator
-    CMP0063 # Honor visibility properties for all targets
-    CMP0077 # Allow option() overrides in importing projects
-    )
-  if(POLICY ${p})
-    cmake_policy(SET ${p} NEW)
-  endif()
-endforeach()
+# Require CMake 3.10. If available, use the policies up to CMake 3.22.
+cmake_minimum_required (VERSION 3.10...3.22)
 
-project (benchmark VERSION 1.6.0 LANGUAGES CXX)
+project (benchmark VERSION 1.8.3 LANGUAGES CXX)
 
 option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
 option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
@@ -26,11 +14,14 @@ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")
   # PGC++ maybe reporting false positives.
   set(BENCHMARK_ENABLE_WERROR OFF)
 endif()
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "NVHPC")
+  set(BENCHMARK_ENABLE_WERROR OFF)
+endif()
 if(BENCHMARK_FORCE_WERROR)
   set(BENCHMARK_ENABLE_WERROR ON)
 endif(BENCHMARK_FORCE_WERROR)
 
-if(NOT MSVC)
+if(NOT (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC"))
   option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
 else()
   set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE)
@@ -50,8 +41,11 @@ option(BENCHMARK_USE_BUNDLED_GTEST "Use bundled GoogleTest. If disabled, the fin
 
 option(BENCHMARK_ENABLE_LIBPFM "Enable performance counters provided by libpfm" OFF)
 
-set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-if(MSVC)
+# Export only public symbols
+set(CMAKE_CXX_VISIBILITY_PRESET hidden)
+set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
+
+if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
     # As of CMake 3.18, CMAKE_SYSTEM_PROCESSOR is not set properly for MSVC and
     # cross-compilation (e.g. Host=x86_64, target=aarch64) requires using the
     # undocumented, but working variable.
@@ -72,7 +66,7 @@ function(should_enable_assembly_tests)
       return()
     endif()
   endif()
-  if (MSVC)
+  if (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC")
     return()
   elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
     return()
@@ -111,22 +105,32 @@ get_git_version(GIT_VERSION)
 # If no git version can be determined, use the version
 # from the project() command
 if ("${GIT_VERSION}" STREQUAL "0.0.0")
-  set(VERSION "${benchmark_VERSION}")
+  set(VERSION "v${benchmark_VERSION}")
 else()
   set(VERSION "${GIT_VERSION}")
 endif()
+
+# Normalize version: drop "v" prefix, replace first "-" with ".",
+# drop everything after second "-" (including said "-").
+string(STRIP ${VERSION} VERSION)
+if(VERSION MATCHES v[^-]*-)
+   string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2"  NORMALIZED_VERSION ${VERSION})
+else()
+   string(REGEX REPLACE "v(.*)" "\\1" NORMALIZED_VERSION ${VERSION})
+endif()
+
 # Tell the user what versions we are using
-message(STATUS "Version: ${VERSION}")
+message(STATUS "Google Benchmark version: ${VERSION}, normalized to ${NORMALIZED_VERSION}")
 
 # The version of the libraries
-set(GENERIC_LIB_VERSION ${VERSION})
-string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION)
+set(GENERIC_LIB_VERSION ${NORMALIZED_VERSION})
+string(SUBSTRING ${NORMALIZED_VERSION} 0 1 GENERIC_LIB_SOVERSION)
 
 # Import our CMake modules
-include(CheckCXXCompilerFlag)
 include(AddCXXCompilerFlag)
-include(CXXFeatureCheck)
+include(CheckCXXCompilerFlag)
 include(CheckLibraryExists)
+include(CXXFeatureCheck)
 
 check_library_exists(rt shm_open "" HAVE_LIB_RT)
 
@@ -134,6 +138,16 @@ if (BENCHMARK_BUILD_32_BITS)
   add_required_cxx_compiler_flag(-m32)
 endif()
 
+if (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC")
+  set(BENCHMARK_CXX_STANDARD 14)
+else()
+  set(BENCHMARK_CXX_STANDARD 11)
+endif()
+
+set(CMAKE_CXX_STANDARD ${BENCHMARK_CXX_STANDARD})
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
 if (MSVC)
   # Turn compiler warnings up to 11
   string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
@@ -166,21 +180,18 @@ if (MSVC)
     set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG")
   endif()
 else()
-  # Try and enable C++11. Don't use C++14 because it doesn't work in some
-  # configurations.
-  add_cxx_compiler_flag(-std=c++11)
-  if (NOT HAVE_CXX_FLAG_STD_CXX11)
-    add_cxx_compiler_flag(-std=c++0x)
-  endif()
-
+  # Turn on Large-file Support
+  add_definitions(-D_FILE_OFFSET_BITS=64)
+  add_definitions(-D_LARGEFILE64_SOURCE)
+  add_definitions(-D_LARGEFILE_SOURCE)
   # Turn compiler warnings up to 11
   add_cxx_compiler_flag(-Wall)
   add_cxx_compiler_flag(-Wextra)
   add_cxx_compiler_flag(-Wshadow)
+  add_cxx_compiler_flag(-Wfloat-equal)
+  add_cxx_compiler_flag(-Wold-style-cast)
   if(BENCHMARK_ENABLE_WERROR)
-      add_cxx_compiler_flag(-Werror RELEASE)
-      add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
-      add_cxx_compiler_flag(-Werror MINSIZEREL)
+      add_cxx_compiler_flag(-Werror)
   endif()
   if (NOT BENCHMARK_ENABLE_TESTING)
     # Disable warning when compiling tests as gtest does not use 'override'.
@@ -193,24 +204,23 @@ else()
   # Disable warnings regarding deprecated parts of the library while building
   # and testing those parts of the library.
   add_cxx_compiler_flag(-Wno-deprecated-declarations)
-  if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
     # Intel silently ignores '-Wno-deprecated-declarations',
     # warning no. 1786 must be explicitly disabled.
     # See #631 for rationale.
     add_cxx_compiler_flag(-wd1786)
+    add_cxx_compiler_flag(-fno-finite-math-only)
   endif()
   # Disable deprecation warnings for release builds (when -Werror is enabled).
   if(BENCHMARK_ENABLE_WERROR)
-      add_cxx_compiler_flag(-Wno-deprecated RELEASE)
-      add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO)
-      add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL)
+      add_cxx_compiler_flag(-Wno-deprecated)
   endif()
   if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
     add_cxx_compiler_flag(-fno-exceptions)
   endif()
 
   if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
-    if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing
+    if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") #ICC17u2: Many false positives for Wstrict-aliasing
       add_cxx_compiler_flag(-Wstrict-aliasing)
     endif()
   endif()
@@ -219,12 +229,12 @@ else()
   add_cxx_compiler_flag(-wd654)
   add_cxx_compiler_flag(-Wthread-safety)
   if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
-    cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
+    cxx_feature_check(THREAD_SAFETY_ATTRIBUTES "-DINCLUDE_DIRECTORIES=${PROJECT_SOURCE_DIR}/include")
   endif()
 
   # On most UNIX like platforms g++ and clang++ define _GNU_SOURCE as a
   # predefined macro, which turns on all of the wonderful libc extensions.
-  # However g++ doesn't do this in Cygwin so we have to define it ourselfs
+  # However g++ doesn't do this in Cygwin so we have to define it ourselves
   # since we depend on GNU/POSIX/BSD extensions.
   if (CYGWIN)
     add_definitions(-D_GNU_SOURCE=1)
@@ -275,7 +285,8 @@ if (BENCHMARK_USE_LIBCXX)
   if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
     add_cxx_compiler_flag(-stdlib=libc++)
   elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR
-          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
+          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel" OR
+          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM")
     add_cxx_compiler_flag(-nostdinc++)
     message(WARNING "libc++ header path must be manually specified using CMAKE_CXX_FLAGS")
     # Adding -nodefaultlibs directly to CMAKE_<TYPE>_LINKER_FLAGS will break
@@ -312,9 +323,10 @@ cxx_feature_check(STEADY_CLOCK)
 # Ensure we have pthreads
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
+cxx_feature_check(PTHREAD_AFFINITY)
 
 if (BENCHMARK_ENABLE_LIBPFM)
-  find_package(PFM)
+  find_package(PFM REQUIRED)
 endif()
 
 # Set up directories
diff --git a/third-party/benchmark/CONTRIBUTORS b/third-party/benchmark/CONTRIBUTORS
index 651fbeafe66ab7..9ca2caa3ee784a 100644
--- a/third-party/benchmark/CONTRIBUTORS
+++ b/third-party/benchmark/CONTRIBUTORS
@@ -27,7 +27,9 @@ Albert Pretorius <pretoalb at gmail.com>
 Alex Steele <steelal123 at gmail.com>
 Andriy Berestovskyy <berestovskyy at gmail.com>
 Arne Beer <arne at twobeer.de>
+Bátor Tallér <bator.taller at shapr3d.com>
 Billy Robert O'Neal III <billy.oneal at gmail.com> <bion at microsoft.com>
+Cezary Skrzyński <czars1988 at gmail.com>
 Chris Kennelly <ckennelly at google.com> <ckennelly at ckennelly.com>
 Christian Wassermann <christian_wassermann at web.de>
 Christopher Seymour <chris.j.seymour at hotmail.com>
@@ -44,25 +46,32 @@ Eric Backus <eric_backus at alum.mit.edu>
 Eric Fiselier <eric at efcs.ca>
 Eugene Zhuk <eugene.zhuk at gmail.com>
 Evgeny Safronov <division494 at gmail.com>
+Fabien Pichot <pichot.fabien at gmail.com>
 Fanbo Meng <fanbo.meng at ibm.com>
 Federico Ficarelli <federico.ficarelli at gmail.com>
 Felix Homann <linuxaudio at showlabor.de>
 Geoffrey Martin-Noble <gcmn at google.com> <gmngeoffrey at gmail.com>
+Gergely Meszaros <maetveis at gmail.com>
 Gergő Szitár <szitar.gergo at gmail.com>
 Hannes Hauswedell <h2 at fsfe.org>
+Henrique Bucher <hbucher at gmail.com>
 Ismael Jimenez Martinez <ismael.jimenez.martinez at gmail.com>
+Iakov Sergeev <yahontu at gmail.com>
 Jern-Kuan Leong <jernkuan at gmail.com>
 JianXiong Zhou <zhoujianxiong2 at gmail.com>
 Joao Paulo Magalhaes <joaoppmagalhaes at gmail.com>
 John Millikin <jmillikin at stripe.com>
 Jordan Williams <jwillikers at protonmail.com>
 Jussi Knuuttila <jussi.knuuttila at gmail.com>
-Kai Wolf <kai.wolf at gmail.com>
 Kaito Udagawa <umireon at gmail.com>
+Kai Wolf <kai.wolf at gmail.com>
 Kishan Kumar <kumar.kishan at outlook.com>
 Lei Xu <eddyxu at gmail.com>
+Marcel Jacobse <mjacobse at uni-bremen.de>
 Matt Clarkson <mattyclarkson at gmail.com>
 Maxim Vafin <maxvafin at gmail.com>
+Mike Apodaca <gatorfax at gmail.com>
+Min-Yih Hsu <yihshyng223 at gmail.com>
 Nick Hutchinson <nshutchinson at gmail.com>
 Norman Heino <norman.heino at gmail.com>
 Oleksandr Sochka <sasha.sochka at gmail.com>
@@ -71,6 +80,8 @@ Pascal Leroy <phl at google.com>
 Paul Redmond <paul.redmond at gmail.com>
 Pierre Phaneuf <pphaneuf at google.com>
 Radoslav Yovchev <radoslav.tm at gmail.com>
+Raghu Raja <raghu at enfabrica.net>
+Rainer Orth <ro at cebitec.uni-bielefeld.de>
 Raul Marin <rmrodriguez at cartodb.com>
 Ray Glover <ray.glover at uk.ibm.com>
 Robert Guo <robert.guo at mongodb.com>
@@ -84,4 +95,3 @@ Tom Madams <tom.ej.madams at gmail.com> <tmadams at google.com>
 Yixuan Qiu <yixuanq at gmail.com>
 Yusuke Suzuki <utatane.tea at gmail.com>
 Zbigniew Skowron <zbychs at gmail.com>
-Min-Yih Hsu <yihshyng223 at gmail.com>
diff --git a/third-party/benchmark/README.md b/third-party/benchmark/README.md
index 7b81d960fc1d72..a5e5d392d8262d 100644
--- a/third-party/benchmark/README.md
+++ b/third-party/benchmark/README.md
@@ -4,10 +4,9 @@
 [![bazel](https://github.com/google/benchmark/actions/workflows/bazel.yml/badge.svg)](https://github.com/google/benchmark/actions/workflows/bazel.yml)
 [![pylint](https://github.com/google/benchmark/workflows/pylint/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Apylint)
 [![test-bindings](https://github.com/google/benchmark/workflows/test-bindings/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Atest-bindings)
-
-[![Build Status](https://travis-ci.org/google/benchmark.svg?branch=master)](https://travis-ci.org/google/benchmark)
 [![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark)
 
+[![Discord](https://discordapp.com/api/guilds/1125694995928719494/widget.png?style=shield)](https://discord.gg/cz7UX7wKC2)
 
 A library to benchmark code snippets, similar to unit tests. Example:
 
@@ -33,7 +32,7 @@ To get started, see [Requirements](#requirements) and
 [Installation](#installation). See [Usage](#usage) for a full example and the
 [User Guide](docs/user_guide.md) for a more comprehensive feature overview.
 
-It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/master/docs/primer.md)
+It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/main/docs/primer.md)
 as some of the structural aspects of the APIs are similar.
 
 ## Resources
@@ -47,6 +46,8 @@ IRC channels:
 
 [Assembly Testing Documentation](docs/AssemblyTests.md)
 
+[Building and installing Python bindings](docs/python_bindings.md)
+
 ## Requirements
 
 The library can be used with C++03. However, it requires C++11 to build,
@@ -137,6 +138,12 @@ cache variables, if autodetection fails.
 If you are using clang, you may need to set `LLVMAR_EXECUTABLE`,
 `LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables.
 
+To enable sanitizer checks (eg., `asan` and `tsan`), add:
+```
+ -DCMAKE_C_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=address -fsanitize=thread -fno-sanitize-recover=all"
+ -DCMAKE_CXX_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=address -fsanitize=thread -fno-sanitize-recover=all "  
+```
+
 ### Stable and Experimental Library Versions
 
 The main branch contains the latest stable version of the benchmarking library;
diff --git a/third-party/benchmark/WORKSPACE b/third-party/benchmark/WORKSPACE
index 949eb98bc5d9de..256207022597f5 100644
--- a/third-party/benchmark/WORKSPACE
+++ b/third-party/benchmark/WORKSPACE
@@ -1,44 +1,30 @@
 workspace(name = "com_github_google_benchmark")
 
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
-load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
-
-http_archive(
-    name = "com_google_absl",
-    sha256 = "f41868f7a938605c92936230081175d1eae87f6ea2c248f41077c8f88316f111",
-    strip_prefix = "abseil-cpp-20200225.2",
-    urls = ["https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz"],
-)
+load("//:bazel/benchmark_deps.bzl", "benchmark_deps")
 
-git_repository(
-    name = "com_google_googletest",
-    remote = "https://github.com/google/googletest.git",
-    tag = "release-1.11.0",
-)
+benchmark_deps()
 
-http_archive(
-    name = "pybind11",
-    build_file = "@//bindings/python:pybind11.BUILD",
-    sha256 = "1eed57bc6863190e35637290f97a20c81cfe4d9090ac0a24f3bbf08f265eb71d",
-    strip_prefix = "pybind11-2.4.3",
-    urls = ["https://github.com/pybind/pybind11/archive/v2.4.3.tar.gz"],
-)
+load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies")
 
-new_local_repository(
-    name = "python_headers",
-    build_file = "@//bindings/python:python_headers.BUILD",
-    path = "/usr/include/python3.6",  # May be overwritten by setup.py.
-)
+rules_foreign_cc_dependencies()
 
-http_archive(
-    name = "rules_python",
-    url = "https://github.com/bazelbuild/rules_python/releases/download/0.1.0/rules_python-0.1.0.tar.gz",
-    sha256 = "b6d46438523a3ec0f3cead544190ee13223a52f6a6765a29eae7b7cc24cc83a0",
+load("@rules_python//python:repositories.bzl", "py_repositories")
+
+py_repositories()
+
+load("@rules_python//python:pip.bzl", "pip_parse")
+
+pip_parse(
+    name = "tools_pip_deps",
+    requirements_lock = "//tools:requirements.txt",
 )
 
-load("@rules_python//python:pip.bzl", pip3_install="pip_install")
+load("@tools_pip_deps//:requirements.bzl", "install_deps")
+
+install_deps()
 
-pip3_install(
-   name = "py_deps",
-   requirements = "//:requirements.txt",
+new_local_repository(
+    name = "python_headers",
+    build_file = "@//bindings/python:python_headers.BUILD",
+    path = "<PYTHON_INCLUDE_PATH>",  # May be overwritten by setup.py.
 )
diff --git a/third-party/benchmark/bindings/python/build_defs.bzl b/third-party/benchmark/bindings/python/build_defs.bzl
index 45907aaa5e2d89..b0c1b0f5807e3a 100644
--- a/third-party/benchmark/bindings/python/build_defs.bzl
+++ b/third-party/benchmark/bindings/python/build_defs.bzl
@@ -1,3 +1,7 @@
+"""
+This file contains some build definitions for C++ extensions used in the Google Benchmark Python bindings.
+"""
+
 _SHARED_LIB_SUFFIX = {
     "//conditions:default": ".so",
     "//:windows": ".dll",
@@ -8,8 +12,8 @@ def py_extension(name, srcs, hdrs = [], copts = [], features = [], deps = []):
         shared_lib_name = name + shared_lib_suffix
         native.cc_binary(
             name = shared_lib_name,
-            linkshared = 1,
-            linkstatic = 1,
+            linkshared = True,
+            linkstatic = True,
             srcs = srcs + hdrs,
             copts = copts,
             features = features,
diff --git a/third-party/benchmark/bindings/python/google_benchmark/__init__.py b/third-party/benchmark/bindings/python/google_benchmark/__init__.py
index 1055bf2418569f..e14769f451fe70 100644
--- a/third-party/benchmark/bindings/python/google_benchmark/__init__.py
+++ b/third-party/benchmark/bindings/python/google_benchmark/__init__.py
@@ -26,47 +26,29 @@ def my_benchmark(state):
   if __name__ == '__main__':
     benchmark.main()
 """
+import atexit
 
 from absl import app
+
 from google_benchmark import _benchmark
 from google_benchmark._benchmark import (
-    Counter,
-    kNanosecond,
-    kMicrosecond,
-    kMillisecond,
-    kSecond,
-    oNone,
-    o1,
-    oN,
-    oNSquared,
-    oNCubed,
-    oLogN,
-    oNLogN,
-    oAuto,
-    oLambda,
+    Counter as Counter,
+    State as State,
+    kMicrosecond as kMicrosecond,
+    kMillisecond as kMillisecond,
+    kNanosecond as kNanosecond,
+    kSecond as kSecond,
+    o1 as o1,
+    oAuto as oAuto,
+    oLambda as oLambda,
+    oLogN as oLogN,
+    oN as oN,
+    oNCubed as oNCubed,
+    oNLogN as oNLogN,
+    oNone as oNone,
+    oNSquared as oNSquared,
 )
-
-
-__all__ = [
-    "register",
-    "main",
-    "Counter",
-    "kNanosecond",
-    "kMicrosecond",
-    "kMillisecond",
-    "kSecond",
-    "oNone",
-    "o1",
-    "oN",
-    "oNSquared",
-    "oNCubed",
-    "oLogN",
-    "oNLogN",
-    "oAuto",
-    "oLambda",
-]
-
-__version__ = "0.2.0"
+from google_benchmark.version import __version__ as __version__
 
 
 class __OptionMaker:
@@ -94,14 +76,13 @@ def __getattr__(self, builder_name):
 
         # The function that get returned on @option.range(start=0, limit=1<<5).
         def __builder_method(*args, **kwargs):
-
             # The decorator that get called, either with the benchmared function
             # or the previous Options
             def __decorator(func_or_options):
                 options = self.make(func_or_options)
                 options.builder_calls.append((builder_name, args, kwargs))
                 # The decorator returns Options so it is not technically a decorator
-                # and needs a final call to @regiser
+                # and needs a final call to @register
                 return options
 
             return __decorator
@@ -156,3 +137,4 @@ def main(argv=None):
 # Methods for use with custom main function.
 initialize = _benchmark.Initialize
 run_benchmarks = _benchmark.RunSpecifiedBenchmarks
+atexit.register(_benchmark.ClearRegisteredBenchmarks)
diff --git a/third-party/benchmark/bindings/python/google_benchmark/benchmark.cc b/third-party/benchmark/bindings/python/google_benchmark/benchmark.cc
index 02b6ed7ed59009..f44476901cae77 100644
--- a/third-party/benchmark/bindings/python/google_benchmark/benchmark.cc
+++ b/third-party/benchmark/bindings/python/google_benchmark/benchmark.cc
@@ -1,20 +1,17 @@
 // Benchmark for Python.
 
-#include <map>
-#include <string>
-#include <vector>
-
-#include "pybind11/operators.h"
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-#include "pybind11/stl_bind.h"
-
 #include "benchmark/benchmark.h"
 
-PYBIND11_MAKE_OPAQUE(benchmark::UserCounters);
+#include "nanobind/nanobind.h"
+#include "nanobind/operators.h"
+#include "nanobind/stl/bind_map.h"
+#include "nanobind/stl/string.h"
+#include "nanobind/stl/vector.h"
+
+NB_MAKE_OPAQUE(benchmark::UserCounters);
 
 namespace {
-namespace py = ::pybind11;
+namespace nb = nanobind;
 
 std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
   // The `argv` pointers here become invalid when this function returns, but
@@ -37,15 +34,16 @@ std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
   return remaining_argv;
 }
 
-benchmark::internal::Benchmark* RegisterBenchmark(const char* name,
-                                                  py::function f) {
+benchmark::internal::Benchmark* RegisterBenchmark(const std::string& name,
+                                                  nb::callable f) {
   return benchmark::RegisterBenchmark(
       name, [f](benchmark::State& state) { f(&state); });
 }
 
-PYBIND11_MODULE(_benchmark, m) {
+NB_MODULE(_benchmark, m) {
+
   using benchmark::TimeUnit;
-  py::enum_<TimeUnit>(m, "TimeUnit")
+  nb::enum_<TimeUnit>(m, "TimeUnit")
       .value("kNanosecond", TimeUnit::kNanosecond)
       .value("kMicrosecond", TimeUnit::kMicrosecond)
       .value("kMillisecond", TimeUnit::kMillisecond)
@@ -53,72 +51,74 @@ PYBIND11_MODULE(_benchmark, m) {
       .export_values();
 
   using benchmark::BigO;
-  py::enum_<BigO>(m, "BigO")
+  nb::enum_<BigO>(m, "BigO")
       .value("oNone", BigO::oNone)
       .value("o1", BigO::o1)
       .value("oN", BigO::oN)
       .value("oNSquared", BigO::oNSquared)
       .value("oNCubed", BigO::oNCubed)
       .value("oLogN", BigO::oLogN)
-      .value("oNLogN", BigO::oLogN)
+      .value("oNLogN", BigO::oNLogN)
       .value("oAuto", BigO::oAuto)
       .value("oLambda", BigO::oLambda)
       .export_values();
 
   using benchmark::internal::Benchmark;
-  py::class_<Benchmark>(m, "Benchmark")
-      // For methods returning a pointer tor the current object, reference
-      // return policy is used to ask pybind not to take ownership oof the
+  nb::class_<Benchmark>(m, "Benchmark")
+      // For methods returning a pointer to the current object, reference
+      // return policy is used to ask nanobind not to take ownership of the
       // returned object and avoid calling delete on it.
       // https://pybind11.readthedocs.io/en/stable/advanced/functions.html#return-value-policies
       //
       // For methods taking a const std::vector<...>&, a copy is created
       // because a it is bound to a Python list.
       // https://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html
-      .def("unit", &Benchmark::Unit, py::return_value_policy::reference)
-      .def("arg", &Benchmark::Arg, py::return_value_policy::reference)
-      .def("args", &Benchmark::Args, py::return_value_policy::reference)
-      .def("range", &Benchmark::Range, py::return_value_policy::reference,
-           py::arg("start"), py::arg("limit"))
+      .def("unit", &Benchmark::Unit, nb::rv_policy::reference)
+      .def("arg", &Benchmark::Arg, nb::rv_policy::reference)
+      .def("args", &Benchmark::Args, nb::rv_policy::reference)
+      .def("range", &Benchmark::Range, nb::rv_policy::reference,
+           nb::arg("start"), nb::arg("limit"))
       .def("dense_range", &Benchmark::DenseRange,
-           py::return_value_policy::reference, py::arg("start"),
-           py::arg("limit"), py::arg("step") = 1)
-      .def("ranges", &Benchmark::Ranges, py::return_value_policy::reference)
+           nb::rv_policy::reference, nb::arg("start"),
+           nb::arg("limit"), nb::arg("step") = 1)
+      .def("ranges", &Benchmark::Ranges, nb::rv_policy::reference)
       .def("args_product", &Benchmark::ArgsProduct,
-           py::return_value_policy::reference)
-      .def("arg_name", &Benchmark::ArgName, py::return_value_policy::reference)
+           nb::rv_policy::reference)
+      .def("arg_name", &Benchmark::ArgName, nb::rv_policy::reference)
       .def("arg_names", &Benchmark::ArgNames,
-           py::return_value_policy::reference)
+           nb::rv_policy::reference)
       .def("range_pair", &Benchmark::RangePair,
-           py::return_value_policy::reference, py::arg("lo1"), py::arg("hi1"),
-           py::arg("lo2"), py::arg("hi2"))
+           nb::rv_policy::reference, nb::arg("lo1"), nb::arg("hi1"),
+           nb::arg("lo2"), nb::arg("hi2"))
       .def("range_multiplier", &Benchmark::RangeMultiplier,
-           py::return_value_policy::reference)
-      .def("min_time", &Benchmark::MinTime, py::return_value_policy::reference)
+           nb::rv_policy::reference)
+      .def("min_time", &Benchmark::MinTime, nb::rv_policy::reference)
+      .def("min_warmup_time", &Benchmark::MinWarmUpTime,
+           nb::rv_policy::reference)
       .def("iterations", &Benchmark::Iterations,
-           py::return_value_policy::reference)
+           nb::rv_policy::reference)
       .def("repetitions", &Benchmark::Repetitions,
-           py::return_value_policy::reference)
+           nb::rv_policy::reference)
       .def("report_aggregates_only", &Benchmark::ReportAggregatesOnly,
-           py::return_value_policy::reference, py::arg("value") = true)
+           nb::rv_policy::reference, nb::arg("value") = true)
       .def("display_aggregates_only", &Benchmark::DisplayAggregatesOnly,
-           py::return_value_policy::reference, py::arg("value") = true)
+           nb::rv_policy::reference, nb::arg("value") = true)
       .def("measure_process_cpu_time", &Benchmark::MeasureProcessCPUTime,
-           py::return_value_policy::reference)
+           nb::rv_policy::reference)
       .def("use_real_time", &Benchmark::UseRealTime,
-           py::return_value_policy::reference)
+           nb::rv_policy::reference)
       .def("use_manual_time", &Benchmark::UseManualTime,
-           py::return_value_policy::reference)
+           nb::rv_policy::reference)
       .def(
           "complexity",
           (Benchmark * (Benchmark::*)(benchmark::BigO)) & Benchmark::Complexity,
-          py::return_value_policy::reference,
-          py::arg("complexity") = benchmark::oAuto);
+          nb::rv_policy::reference,
+          nb::arg("complexity") = benchmark::oAuto);
 
   using benchmark::Counter;
-  py::class_<Counter> py_counter(m, "Counter");
+  nb::class_<Counter> py_counter(m, "Counter");
 
-  py::enum_<Counter::Flags>(py_counter, "Flags")
+  nb::enum_<Counter::Flags>(py_counter, "Flags")
       .value("kDefaults", Counter::Flags::kDefaults)
       .value("kIsRate", Counter::Flags::kIsRate)
       .value("kAvgThreads", Counter::Flags::kAvgThreads)
@@ -130,52 +130,55 @@ PYBIND11_MODULE(_benchmark, m) {
       .value("kAvgIterationsRate", Counter::Flags::kAvgIterationsRate)
       .value("kInvert", Counter::Flags::kInvert)
       .export_values()
-      .def(py::self | py::self);
+      .def(nb::self | nb::self);
 
-  py::enum_<Counter::OneK>(py_counter, "OneK")
+  nb::enum_<Counter::OneK>(py_counter, "OneK")
       .value("kIs1000", Counter::OneK::kIs1000)
       .value("kIs1024", Counter::OneK::kIs1024)
       .export_values();
 
   py_counter
-      .def(py::init<double, Counter::Flags, Counter::OneK>(),
-           py::arg("value") = 0., py::arg("flags") = Counter::kDefaults,
-           py::arg("k") = Counter::kIs1000)
-      .def(py::init([](double value) { return Counter(value); }))
-      .def_readwrite("value", &Counter::value)
-      .def_readwrite("flags", &Counter::flags)
-      .def_readwrite("oneK", &Counter::oneK);
-  py::implicitly_convertible<py::float_, Counter>();
-  py::implicitly_convertible<py::int_, Counter>();
-
-  py::bind_map<benchmark::UserCounters>(m, "UserCounters");
+      .def(nb::init<double, Counter::Flags, Counter::OneK>(),
+           nb::arg("value") = 0., nb::arg("flags") = Counter::kDefaults,
+           nb::arg("k") = Counter::kIs1000)
+      .def("__init__", ([](Counter *c, double value) { new (c) Counter(value); }))
+      .def_rw("value", &Counter::value)
+      .def_rw("flags", &Counter::flags)
+      .def_rw("oneK", &Counter::oneK)
+      .def(nb::init_implicit<double>());
+
+  nb::implicitly_convertible<nb::int_, Counter>();
+
+  nb::bind_map<benchmark::UserCounters>(m, "UserCounters");
 
   using benchmark::State;
-  py::class_<State>(m, "State")
+  nb::class_<State>(m, "State")
       .def("__bool__", &State::KeepRunning)
-      .def_property_readonly("keep_running", &State::KeepRunning)
+      .def_prop_ro("keep_running", &State::KeepRunning)
       .def("pause_timing", &State::PauseTiming)
       .def("resume_timing", &State::ResumeTiming)
       .def("skip_with_error", &State::SkipWithError)
-      .def_property_readonly("error_occurred", &State::error_occurred)
+      .def_prop_ro("error_occurred", &State::error_occurred)
       .def("set_iteration_time", &State::SetIterationTime)
-      .def_property("bytes_processed", &State::bytes_processed,
+      .def_prop_rw("bytes_processed", &State::bytes_processed,
                     &State::SetBytesProcessed)
-      .def_property("complexity_n", &State::complexity_length_n,
+      .def_prop_rw("complexity_n", &State::complexity_length_n,
                     &State::SetComplexityN)
-      .def_property("items_processed", &State::items_processed,
-                    &State::SetItemsProcessed)
-      .def("set_label", (void(State::*)(const char*)) & State::SetLabel)
-      .def("range", &State::range, py::arg("pos") = 0)
-      .def_property_readonly("iterations", &State::iterations)
-      .def_readwrite("counters", &State::counters)
-      .def_property_readonly("thread_index", &State::thread_index)
-      .def_property_readonly("threads", &State::threads);
+      .def_prop_rw("items_processed", &State::items_processed,
+                   &State::SetItemsProcessed)
+      .def("set_label", &State::SetLabel)
+      .def("range", &State::range, nb::arg("pos") = 0)
+      .def_prop_ro("iterations", &State::iterations)
+      .def_prop_ro("name", &State::name)
+      .def_rw("counters", &State::counters)
+      .def_prop_ro("thread_index", &State::thread_index)
+      .def_prop_ro("threads", &State::threads);
 
   m.def("Initialize", Initialize);
   m.def("RegisterBenchmark", RegisterBenchmark,
-        py::return_value_policy::reference);
+        nb::rv_policy::reference);
   m.def("RunSpecifiedBenchmarks",
         []() { benchmark::RunSpecifiedBenchmarks(); });
+  m.def("ClearRegisteredBenchmarks", benchmark::ClearRegisteredBenchmarks);
 };
 }  // namespace
diff --git a/third-party/benchmark/bindings/python/google_benchmark/example.py b/third-party/benchmark/bindings/python/google_benchmark/example.py
index fb0234b8fd7e31..b5b2f88ff30695 100644
--- a/third-party/benchmark/bindings/python/google_benchmark/example.py
+++ b/third-party/benchmark/bindings/python/google_benchmark/example.py
@@ -73,7 +73,7 @@ def manual_timing(state):
 
 @benchmark.register
 def custom_counters(state):
-    """Collect cutom metric using benchmark.Counter."""
+    """Collect custom metric using benchmark.Counter."""
     num_foo = 0.0
     while state:
         # Benchmark some code here
@@ -86,7 +86,9 @@ def custom_counters(state):
     # Set a counter as a rate.
     state.counters["foo_rate"] = Counter(num_foo, Counter.kIsRate)
     #  Set a counter as an inverse of rate.
-    state.counters["foo_inv_rate"] = Counter(num_foo, Counter.kIsRate | Counter.kInvert)
+    state.counters["foo_inv_rate"] = Counter(
+        num_foo, Counter.kIsRate | Counter.kInvert
+    )
     # Set a counter as a thread-average quantity.
     state.counters["foo_avg"] = Counter(num_foo, Counter.kAvgThreads)
     # There's also a combined flag:
diff --git a/third-party/benchmark/bindings/python/pybind11.BUILD b/third-party/benchmark/bindings/python/pybind11.BUILD
deleted file mode 100644
index bc833500383a2c..00000000000000
--- a/third-party/benchmark/bindings/python/pybind11.BUILD
+++ /dev/null
@@ -1,20 +0,0 @@
-cc_library(
-    name = "pybind11",
-    hdrs = glob(
-        include = [
-            "include/pybind11/*.h",
-            "include/pybind11/detail/*.h",
-        ],
-        exclude = [
-            "include/pybind11/common.h",
-            "include/pybind11/eigen.h",
-        ],
-    ),
-    copts = [
-        "-fexceptions",
-        "-Wno-undefined-inline",
-        "-Wno-pragma-once-outside-header",
-    ],
-    includes = ["include"],
-    visibility = ["//visibility:public"],
-)
diff --git a/third-party/benchmark/bindings/python/python_headers.BUILD b/third-party/benchmark/bindings/python/python_headers.BUILD
index 9c34cf6ca4bd36..8f139f8621e015 100644
--- a/third-party/benchmark/bindings/python/python_headers.BUILD
+++ b/third-party/benchmark/bindings/python/python_headers.BUILD
@@ -1,3 +1,7 @@
+licenses(["notice"])
+
+package(default_visibility = ["//visibility:public"])
+
 cc_library(
     name = "python_headers",
     hdrs = glob(["**/*.h"]),
diff --git a/third-party/benchmark/bindings/python/requirements.txt b/third-party/benchmark/bindings/python/requirements.txt
deleted file mode 100644
index f5bbe7eca5ceac..00000000000000
--- a/third-party/benchmark/bindings/python/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-absl-py>=0.7.1
-
diff --git a/third-party/benchmark/cmake/CXXFeatureCheck.cmake b/third-party/benchmark/cmake/CXXFeatureCheck.cmake
index 62e6741fe3de0b..e51482659b0f10 100644
--- a/third-party/benchmark/cmake/CXXFeatureCheck.cmake
+++ b/third-party/benchmark/cmake/CXXFeatureCheck.cmake
@@ -17,6 +17,8 @@ if(__cxx_feature_check)
 endif()
 set(__cxx_feature_check INCLUDED)
 
+option(CXXFEATURECHECK_DEBUG OFF)
+
 function(cxx_feature_check FILE)
   string(TOLOWER ${FILE} FILE)
   string(TOUPPER ${FILE} VAR)
@@ -27,18 +29,22 @@ function(cxx_feature_check FILE)
     return()
   endif()
 
+  set(FEATURE_CHECK_CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
   if (ARGC GREATER 1)
     message(STATUS "Enabling additional flags: ${ARGV1}")
-    list(APPEND BENCHMARK_CXX_LINKER_FLAGS ${ARGV1})
+    list(APPEND FEATURE_CHECK_CMAKE_FLAGS ${ARGV1})
   endif()
 
   if (NOT DEFINED COMPILE_${FEATURE})
-    message(STATUS "Performing Test ${FEATURE}")
     if(CMAKE_CROSSCOMPILING)
+      message(STATUS "Cross-compiling to test ${FEATURE}")
       try_compile(COMPILE_${FEATURE}
               ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-              CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
-              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+              CXX_STANDARD 11
+              CXX_STANDARD_REQUIRED ON
+              CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
+              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
+              OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
       if(COMPILE_${FEATURE})
         message(WARNING
               "If you see build failures due to cross compilation, try setting HAVE_${VAR} to 0")
@@ -47,11 +53,14 @@ function(cxx_feature_check FILE)
         set(RUN_${FEATURE} 1 CACHE INTERNAL "")
       endif()
     else()
-      message(STATUS "Performing Test ${FEATURE}")
+      message(STATUS "Compiling and running to test ${FEATURE}")
       try_run(RUN_${FEATURE} COMPILE_${FEATURE}
               ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-              CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
-              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+              CXX_STANDARD 11
+              CXX_STANDARD_REQUIRED ON
+              CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
+              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
+              COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
     endif()
   endif()
 
@@ -61,7 +70,11 @@ function(cxx_feature_check FILE)
     add_definitions(-DHAVE_${VAR})
   else()
     if(NOT COMPILE_${FEATURE})
-      message(STATUS "Performing Test ${FEATURE} -- failed to compile")
+      if(CXXFEATURECHECK_DEBUG)
+        message(STATUS "Performing Test ${FEATURE} -- failed to compile: ${COMPILE_OUTPUT_VAR}")
+      else()
+        message(STATUS "Performing Test ${FEATURE} -- failed to compile")
+      endif()
     else()
       message(STATUS "Performing Test ${FEATURE} -- compiled but failed to run")
     endif()
diff --git a/third-party/benchmark/cmake/GetGitVersion.cmake b/third-party/benchmark/cmake/GetGitVersion.cmake
index 04a1f9b70d683f..b0210103b2ccc6 100644
--- a/third-party/benchmark/cmake/GetGitVersion.cmake
+++ b/third-party/benchmark/cmake/GetGitVersion.cmake
@@ -20,38 +20,16 @@ set(__get_git_version INCLUDED)
 
 function(get_git_version var)
   if(GIT_EXECUTABLE)
-      execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
+      execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8 --dirty
           WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
           RESULT_VARIABLE status
-          OUTPUT_VARIABLE GIT_DESCRIBE_VERSION
+          OUTPUT_VARIABLE GIT_VERSION
           ERROR_QUIET)
       if(status)
-          set(GIT_DESCRIBE_VERSION "v0.0.0")
+          set(GIT_VERSION "v0.0.0")
       endif()
-      
-      string(STRIP ${GIT_DESCRIBE_VERSION} GIT_DESCRIBE_VERSION)
-      if(GIT_DESCRIBE_VERSION MATCHES v[^-]*-) 
-         string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2"  GIT_VERSION ${GIT_DESCRIBE_VERSION})
-      else()
-         string(REGEX REPLACE "v(.*)" "\\1" GIT_VERSION ${GIT_DESCRIBE_VERSION})
-      endif()
-
-      # Work out if the repository is dirty
-      execute_process(COMMAND ${GIT_EXECUTABLE} update-index -q --refresh
-          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-          OUTPUT_QUIET
-          ERROR_QUIET)
-      execute_process(COMMAND ${GIT_EXECUTABLE} diff-index --name-only HEAD --
-          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-          OUTPUT_VARIABLE GIT_DIFF_INDEX
-          ERROR_QUIET)
-      string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
-      if (${GIT_DIRTY})
-          set(GIT_DESCRIBE_VERSION "${GIT_DESCRIBE_VERSION}-dirty")
-      endif()
-      message(STATUS "git version: ${GIT_DESCRIBE_VERSION} normalized to ${GIT_VERSION}")
   else()
-      set(GIT_VERSION "0.0.0")
+      set(GIT_VERSION "v0.0.0")
   endif()
 
   set(${var} ${GIT_VERSION} PARENT_SCOPE)
diff --git a/third-party/benchmark/cmake/GoogleTest.cmake b/third-party/benchmark/cmake/GoogleTest.cmake
index 66cb91008b733b..e66e9d1a2076ea 100644
--- a/third-party/benchmark/cmake/GoogleTest.cmake
+++ b/third-party/benchmark/cmake/GoogleTest.cmake
@@ -29,15 +29,25 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
 
 include(${GOOGLETEST_PREFIX}/googletest-paths.cmake)
 
-# googletest doesn't seem to want to stay build warning clean so let's not hurt ourselves.
-add_compile_options(-w)
-
 # Add googletest directly to our build. This defines
 # the gtest and gtest_main targets.
 add_subdirectory(${GOOGLETEST_SOURCE_DIR}
                  ${GOOGLETEST_BINARY_DIR}
                  EXCLUDE_FROM_ALL)
 
+# googletest doesn't seem to want to stay build warning clean so let's not hurt ourselves.
+if (MSVC)
+  target_compile_options(gtest PRIVATE "/wd4244" "/wd4722")
+  target_compile_options(gtest_main PRIVATE "/wd4244" "/wd4722")
+  target_compile_options(gmock PRIVATE "/wd4244" "/wd4722")
+  target_compile_options(gmock_main PRIVATE "/wd4244" "/wd4722")
+else()
+  target_compile_options(gtest PRIVATE "-w")
+  target_compile_options(gtest_main PRIVATE "-w")
+  target_compile_options(gmock PRIVATE "-w")
+  target_compile_options(gmock_main PRIVATE "-w")
+endif()
+
 if(NOT DEFINED GTEST_COMPILE_COMMANDS)
     set(GTEST_COMPILE_COMMANDS ON)
 endif()
diff --git a/third-party/benchmark/cmake/Modules/FindPFM.cmake b/third-party/benchmark/cmake/Modules/FindPFM.cmake
index cf807a1ee9e96b..4c1ce938f9f7a0 100644
--- a/third-party/benchmark/cmake/Modules/FindPFM.cmake
+++ b/third-party/benchmark/cmake/Modules/FindPFM.cmake
@@ -1,26 +1,28 @@
 # If successful, the following variables will be defined:
-# HAVE_LIBPFM.
-# Set BENCHMARK_ENABLE_LIBPFM to 0 to disable, regardless of libpfm presence.
-include(CheckIncludeFile)
-include(CheckLibraryExists)
+# PFM_FOUND.
+# PFM_LIBRARIES
+# PFM_INCLUDE_DIRS
+# the following target will be defined:
+# PFM::libpfm
+
 include(FeatureSummary)
-enable_language(C)
+include(FindPackageHandleStandardArgs)
 
 set_package_properties(PFM PROPERTIES
                        URL http://perfmon2.sourceforge.net/
-                       DESCRIPTION "a helper library to develop monitoring tools"
+                       DESCRIPTION "A helper library to develop monitoring tools"
                        PURPOSE "Used to program specific performance monitoring events")
 
-check_library_exists(libpfm.a pfm_initialize "" HAVE_LIBPFM_INITIALIZE)
-if(HAVE_LIBPFM_INITIALIZE)
-  check_include_file(perfmon/perf_event.h HAVE_PERFMON_PERF_EVENT_H)
-  check_include_file(perfmon/pfmlib.h HAVE_PERFMON_PFMLIB_H)
-  check_include_file(perfmon/pfmlib_perf_event.h HAVE_PERFMON_PFMLIB_PERF_EVENT_H)
-  if(HAVE_PERFMON_PERF_EVENT_H AND HAVE_PERFMON_PFMLIB_H AND HAVE_PERFMON_PFMLIB_PERF_EVENT_H)
-    message("Using Perf Counters.")
-    set(HAVE_LIBPFM 1)
-    set(PFM_FOUND 1)
-  endif()
-else()
-  message("Perf Counters support requested, but was unable to find libpfm.")
+find_library(PFM_LIBRARY NAMES pfm)
+find_path(PFM_INCLUDE_DIR NAMES perfmon/pfmlib.h)
+
+find_package_handle_standard_args(PFM REQUIRED_VARS PFM_LIBRARY PFM_INCLUDE_DIR)
+
+if (PFM_FOUND AND NOT TARGET PFM::libpfm)
+    add_library(PFM::libpfm UNKNOWN IMPORTED)
+    set_target_properties(PFM::libpfm PROPERTIES
+        IMPORTED_LOCATION "${PFM_LIBRARY}"
+        INTERFACE_INCLUDE_DIRECTORIES "${PFM_INCLUDE_DIR}")
 endif()
+
+mark_as_advanced(PFM_LIBRARY PFM_INCLUDE_DIR)
diff --git a/third-party/benchmark/cmake/benchmark.pc.in b/third-party/benchmark/cmake/benchmark.pc.in
index 34beb012eef1a9..9dae881c79f940 100644
--- a/third-party/benchmark/cmake/benchmark.pc.in
+++ b/third-party/benchmark/cmake/benchmark.pc.in
@@ -1,7 +1,7 @@
 prefix=@CMAKE_INSTALL_PREFIX@
 exec_prefix=${prefix}
-libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
-includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
 
 Name: @PROJECT_NAME@
 Description: Google microbenchmark framework
diff --git a/third-party/benchmark/docs/AssemblyTests.md b/third-party/benchmark/docs/AssemblyTests.md
index 1fbdc269b53d66..89df7ca520df85 100644
--- a/third-party/benchmark/docs/AssemblyTests.md
+++ b/third-party/benchmark/docs/AssemblyTests.md
@@ -111,6 +111,7 @@ between compilers or compiler versions. A common example of this
 is matching stack frame addresses. In this case regular expressions
 can be used to match the differing bits of output. For example:
 
+<!-- {% raw %} -->
 ```c++
 int ExternInt;
 struct Point { int x, y, z; };
@@ -127,6 +128,7 @@ extern "C" void test_store_point() {
     // CHECK: ret
 }
 ```
+<!-- {% endraw %} -->
 
 ## Current Requirements and Limitations
 
diff --git a/third-party/benchmark/docs/_config.yml b/third-party/benchmark/docs/_config.yml
index 2f7efbeab578c8..32f9f2e0ddc653 100644
--- a/third-party/benchmark/docs/_config.yml
+++ b/third-party/benchmark/docs/_config.yml
@@ -1 +1,3 @@
-theme: jekyll-theme-minimal
\ No newline at end of file
+theme: jekyll-theme-minimal
+logo: /assets/images/icon_black.png
+show_downloads: true
diff --git a/third-party/benchmark/docs/dependencies.md b/third-party/benchmark/docs/dependencies.md
index 7af52b95bd86e3..07760e10e37098 100644
--- a/third-party/benchmark/docs/dependencies.md
+++ b/third-party/benchmark/docs/dependencies.md
@@ -1,19 +1,13 @@
 # Build tool dependency policy
 
-To ensure the broadest compatibility when building the benchmark library, but
-still allow forward progress, we require any build tooling to be available for:
+We follow the [Foundational C++ support policy](https://opensource.google/documentation/policies/cplusplus-support) for our build tools. In
+particular the ["Build Systems" section](https://opensource.google/documentation/policies/cplusplus-support#build-systems).
 
-* Debian stable _and_
-* The last two Ubuntu LTS releases
+## CMake
 
-Currently, this means using build tool versions that are available for Ubuntu
-18.04 (Bionic Beaver), Ubuntu 20.04 (Focal Fossa), and Debian 11 (bullseye).
+The current supported version is CMake 3.10 as of 2023-08-10. Most modern
+distributions include newer versions, for example:
 
-_Note, CI also runs ubuntu-16.04 and ubuntu-14.04 to ensure best effort support
-for older versions._
-
-## cmake
-The current supported version is cmake 3.5.1 as of 2018-06-06.
-
-_Note, this version is also available for Ubuntu 14.04, an older Ubuntu LTS
-release, as `cmake3`._
+* Ubuntu 20.04 provides CMake 3.16.3
+* Debian 11.4 provides CMake 3.18.4
+* Ubuntu 22.04 provides CMake 3.22.1
diff --git a/third-party/benchmark/docs/index.md b/third-party/benchmark/docs/index.md
index eb82eff9eee009..9cada9688b1ddc 100644
--- a/third-party/benchmark/docs/index.md
+++ b/third-party/benchmark/docs/index.md
@@ -4,7 +4,9 @@
 * [Dependencies](dependencies.md)
 * [Perf Counters](perf_counters.md)
 * [Platform Specific Build Instructions](platform_specific_build_instructions.md)
+* [Python Bindings](python_bindings.md)
 * [Random Interleaving](random_interleaving.md)
+* [Reducing Variance](reducing_variance.md)
 * [Releasing](releasing.md)
 * [Tools](tools.md)
-* [User Guide](user_guide.md)
\ No newline at end of file
+* [User Guide](user_guide.md)
diff --git a/third-party/benchmark/docs/perf_counters.md b/third-party/benchmark/docs/perf_counters.md
index 74560e9669712a..f342092c99704c 100644
--- a/third-party/benchmark/docs/perf_counters.md
+++ b/third-party/benchmark/docs/perf_counters.md
@@ -12,16 +12,17 @@ This feature is available if:
 * The benchmark is run on an architecture featuring a Performance Monitoring
   Unit (PMU),
 * The benchmark is compiled with support for collecting counters. Currently,
-  this requires [libpfm](http://perfmon2.sourceforge.net/) be available at build
-  time
+  this requires [libpfm](http://perfmon2.sourceforge.net/), which is built as a
+  dependency via Bazel.
 
 The feature does not require modifying benchmark code. Counter collection is
 handled at the boundaries where timer collection is also handled. 
 
 To opt-in:
-
-*  Install `libpfm4-dev`, e.g. `apt-get install libpfm4-dev`.
-*  Enable the cmake flag BENCHMARK_ENABLE_LIBPFM.
+* If using a Bazel build, add `--define pfm=1` to your build flags
+* If using CMake:
+  * Install `libpfm4-dev`, e.g. `apt-get install libpfm4-dev`.
+  * Enable the CMake flag `BENCHMARK_ENABLE_LIBPFM` in `CMakeLists.txt`.
 
 To use, pass a comma-separated list of counter names through the
 `--benchmark_perf_counters` flag. The names are decoded through libpfm - meaning,
@@ -31,4 +32,4 @@ mapped by libpfm to platform-specifics - see libpfm
 
 The counter values are reported back through the [User Counters](../README.md#custom-counters)
 mechanism, meaning, they are available in all the formats (e.g. JSON) supported
-by User Counters.
\ No newline at end of file
+by User Counters.
diff --git a/third-party/benchmark/docs/releasing.md b/third-party/benchmark/docs/releasing.md
index 334f935393813f..09bf93764d009d 100644
--- a/third-party/benchmark/docs/releasing.md
+++ b/third-party/benchmark/docs/releasing.md
@@ -1,30 +1,23 @@
 # How to release
 
 * Make sure you're on main and synced to HEAD
-* Ensure the project builds and tests run (sanity check only, obviously)
+* Ensure the project builds and tests run
     * `parallel -j0 exec ::: test/*_test` can help ensure everything at least
       passes
 * Prepare release notes
     * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
       commits between the last annotated tag and HEAD
     * Pick the most interesting.
-* Create one last commit that updates the version saved in `CMakeLists.txt` and the
-  `__version__` variable in `bindings/python/google_benchmark/__init__.py`to the release
-  version you're creating. (This version will be used if benchmark is installed from the
-  archive you'll be creating in the next step.)
+* Create one last commit that updates the version saved in `CMakeLists.txt` and `MODULE.bazel`
+  to the release version you're creating. (This version will be used if benchmark is installed
+  from the archive you'll be creating in the next step.)
 
 ```
-project (benchmark VERSION 1.6.0 LANGUAGES CXX)
+project (benchmark VERSION 1.8.0 LANGUAGES CXX)
 ```
 
-```python
-# bindings/python/google_benchmark/__init__.py
-
-# ...
-
-__version__ = "1.6.0"  # <-- change this to the release version you are creating
-
-# ...
+```
+module(name = "com_github_google_benchmark", version="1.8.0")
 ```
 
 * Create a release through github's interface
@@ -33,3 +26,6 @@ __version__ = "1.6.0"  # <-- change this to the release version you are creating
       * `git pull --tags`
       * `git tag -a -f <tag> <tag>`
       * `git push --force --tags origin`
+* Confirm that the "Build and upload Python wheels" action runs to completion
+    * Run it manually if it hasn't run.
+    * IMPORTANT: When re-running manually, make sure to select the newly created `<tag>` as the workflow version in the "Run workflow" tab on the GitHub Actions page. 
diff --git a/third-party/benchmark/docs/tools.md b/third-party/benchmark/docs/tools.md
index f2d0c497f3fc7e..411f41d405ffc8 100644
--- a/third-party/benchmark/docs/tools.md
+++ b/third-party/benchmark/docs/tools.md
@@ -186,6 +186,146 @@ Benchmark                               Time             CPU      Time Old
 This is a mix of the previous two modes, two (potentially different) benchmark binaries are run, and a different filter is applied to each one.
 As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
 
+### Note: Interpreting the output
+
+Performance measurements are an art, and performance comparisons are doubly so.
+Results are often noisy, and the absolute differences are not necessarily
+large, so visual inspection alone rarely makes it apparent whether two sets of
+measurements actually show a performance change. It becomes even harder with
+multiple benchmark repetitions.
+
+Thankfully, we can run statistical tests on the results to determine whether
+the performance has changed in a statistically significant way. `compare.py`
+uses the [Mann–Whitney U
+test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test), with the null
+hypothesis being that there is no difference in performance.
+
+**The below output is a summary of a benchmark comparison with statistics
+provided for a multi-threaded process.**
+```
+Benchmark                                               Time        CPU    Time Old      Time New       CPU Old       CPU New
+-----------------------------------------------------------------------------------------------------------------------------
+benchmark/threads:1/process_time/real_time_pvalue     0.0000     0.0000    U Test, Repetitions: 27 vs 27
+benchmark/threads:1/process_time/real_time_mean      -0.1442    -0.1442          90            77            90            77
+benchmark/threads:1/process_time/real_time_median    -0.1444    -0.1444          90            77            90            77
+benchmark/threads:1/process_time/real_time_stddev    +0.3974    +0.3933           0             0             0             0
+benchmark/threads:1/process_time/real_time_cv        +0.6329    +0.6280           0             0             0             0
+OVERALL_GEOMEAN                                      -0.1442    -0.1442           0             0             0             0
+```
+--------------------------------------------
+Here's a breakdown of each row:
+
+**benchmark/threads:1/process_time/real_time_pvalue**: This shows the _p-value_ for
+the statistical test comparing the performance of the process running with one
+thread. A value of 0.0000 suggests a statistically significant difference in
+performance. The comparison was conducted using the U Test (Mann-Whitney
+U Test) with 27 repetitions for each case.
+
+**benchmark/threads:1/process_time/real_time_mean**: This shows the relative
+difference in mean execution time between two different cases. The negative
+value (-0.1442) implies that the new process is faster by about 14.42%. The old
+time was 90 units, while the new time is 77 units.
+
+**benchmark/threads:1/process_time/real_time_median**: Similarly, this shows the
+relative difference in the median execution time. Again, the new process is
+faster by 14.44%.
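+(With the rounded times shown, this is just the relative-difference formula from
+above: `(77 - 90) / |90| = -0.1444`, matching the value in the median row.)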
+
+**benchmark/threads:1/process_time/real_time_stddev**: This is the relative
+difference in the standard deviation of the execution time, which is a measure
+of how much variation or dispersion there is from the mean. A positive value
+(+0.3974) implies there is more variance in the execution time in the new
+process.
+
+**benchmark/threads:1/process_time/real_time_cv**: CV stands for Coefficient of
+Variation. It is the ratio of the standard deviation to the mean. It provides a
+standardized measure of dispersion. An increase (+0.6329) indicates more
+relative variability in the new process.
+
+**OVERALL_GEOMEAN**: Geomean stands for geometric mean, a type of average that is
+less influenced by outliers. The negative value indicates a general improvement
+in the new process. However, given the values are all zero for the old and new
+times, this seems to be a mistake or placeholder in the output.
+
+-----------------------------------------
+
+Let's now look at what the different columns in the above `compare.py`
+output represent:
+
+  1. **Benchmark:** The name of the function being benchmarked, along with the
+     size of the input (after the slash).
+
+  2. **Time:** The average time per operation, across all iterations.
+
+  3. **CPU:** The average CPU time per operation, across all iterations.
+
+  4. **Iterations:** The number of iterations the benchmark was run to get a
+     stable estimate.
+
+  5. **Time Old and Time New:** These represent the average time it takes for a
+     function to run in two different scenarios or versions. For example, you
+     might be comparing how fast a function runs before and after you make some
+     changes to it.
+
+  6. **CPU Old and CPU New:** These show the average amount of CPU time that the
+     function uses in two different scenarios or versions. This is similar to
+     Time Old and Time New, but focuses on CPU usage instead of overall time.
+
+In the comparison section, the relative differences in both time and CPU time
+are displayed for each input size.
+
+
+A statistically-significant difference is determined by a **p-value**, which is
+a measure of the probability that the observed difference could have occurred
+just by random chance. A smaller p-value indicates stronger evidence against the
+null hypothesis. 
+
+**Therefore:**
+  1. If the p-value is less than the chosen significance level (alpha), we
+     reject the null hypothesis and conclude the benchmarks are significantly
+     different.
+  2. If the p-value is greater than or equal to alpha, we fail to reject the
+     null hypothesis and treat the two benchmarks as similar.
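+
+For example, with the conventional significance level alpha = 0.05, the p-value
+of 0.0000 reported in the table above falls well below the threshold, so the
+two benchmarks would be treated as statistically different.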
+
+The result of the statistical test is additionally communicated through color
+coding:
+```diff
++ Green:
+```
+  The benchmarks are _**statistically different**_. This could mean the
+  performance has either **significantly improved** or **significantly
+  deteriorated**. You should look at the actual performance numbers to see which
+  is the case.
+```diff
+- Red:
+```
+  The benchmarks are _**statistically similar**_. This means the performance
+  **hasn't significantly changed**.
+
+In statistical terms, **'green'** means we reject the null hypothesis that
+there's no difference in performance, and **'red'** means we fail to reject the
+null hypothesis. This might seem counter-intuitive if you're expecting 'green'
+to mean 'improved performance' and 'red' to mean 'worsened performance'. 
+But remember, in this context:
+
+  * 'Success' means 'successfully finding a difference'.
+  * 'Failure' means 'failing to find a difference'.
+
+Also, please note that **even if** we determine that there **is** a
+statistically-significant difference between the two measurements, it does not
+_necessarily_ mean that the actual benchmarks that were measured **are**
+different. The converse also holds: even if we determine that there is **no**
+statistically-significant difference between the two measurements, it does not
+necessarily mean that the actual benchmarks that were measured **are not**
+different.
+
 ### U test
 
 If there is a sufficient repetition count of the benchmarks, the tool can do
diff --git a/third-party/benchmark/docs/user_guide.md b/third-party/benchmark/docs/user_guide.md
index 34bea6904240aa..d22a9069091f64 100644
--- a/third-party/benchmark/docs/user_guide.md
+++ b/third-party/benchmark/docs/user_guide.md
@@ -28,6 +28,8 @@
 
 [Templated Benchmarks](#templated-benchmarks)
 
+[Templated Benchmarks that take arguments](#templated-benchmarks-with-arguments)
+
 [Fixtures](#fixtures)
 
 [Custom Counters](#custom-counters)
@@ -50,14 +52,19 @@
 
 [Custom Statistics](#custom-statistics)
 
+[Memory Usage](#memory-usage)
+
 [Using RegisterBenchmark](#using-register-benchmark)
 
 [Exiting with an Error](#exiting-with-an-error)
 
-[A Faster KeepRunning Loop](#a-faster-keep-running-loop)
+[A Faster `KeepRunning` Loop](#a-faster-keep-running-loop)
+
+## Benchmarking Tips
 
 [Disabling CPU Frequency Scaling](#disabling-cpu-frequency-scaling)
 
+[Reducing Variance in Benchmarks](reducing_variance.md)
 
 <a name="output-formats" />
 
@@ -180,6 +187,12 @@ BM_memcpy/32          12 ns         12 ns   54687500
 BM_memcpy/32k       1834 ns       1837 ns     357143
 ```
 
+## Disabling Benchmarks
+
+It is possible to temporarily disable benchmarks by renaming the benchmark
+function to have the prefix "DISABLED_". This will cause the benchmark to
+be skipped at runtime.
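+
+For example (the benchmark name here is purely illustrative):
+
+```c++
+// Renaming BM_SlowPath to DISABLED_BM_SlowPath keeps it registered, but it
+// will be skipped when the benchmarks are run.
+static void DISABLED_BM_SlowPath(benchmark::State& state) {
+  for (auto _ : state) {
+    // benchmark body
+  }
+}
+BENCHMARK(DISABLED_BM_SlowPath);
+```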
+
 <a name="result-comparison" />
 
 ## Result comparison
@@ -232,6 +245,19 @@ iterations is at least one, not more than 1e9, until CPU time is greater than
 the minimum time, or the wallclock time is 5x minimum time. The minimum time is
 set per benchmark by calling `MinTime` on the registered benchmark object.
 
+Furthermore, warming up a benchmark might be necessary to obtain stable
+results, e.g. because of caching effects in the code under benchmark.
+Warming up means running the benchmark for a given amount of time before
+results are actually taken into account. The warmup duration can be set per
+benchmark by calling `MinWarmUpTime` on the registered benchmark object, or
+for all benchmarks using the `--benchmark_min_warmup_time` command-line
+option. Note that `MinWarmUpTime` overrides the value of
+`--benchmark_min_warmup_time` for that single benchmark. How many iterations
+the warmup run of each benchmark takes is determined the same way as described
+in the paragraph above. By default the warmup phase is set to 0 seconds and is
+therefore disabled.
+
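+As a minimal illustration (reusing the `BM_memcpy` benchmark from the examples
+above), a per-benchmark warmup could be requested like this:
+
+```c++
+// Run BM_memcpy for at least 0.5 seconds of warmup before the measured runs.
+BENCHMARK(BM_memcpy)->Arg(8<<10)->MinWarmUpTime(0.5);
+```
+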
 Average timings are then reported over the iterations run. If multiple
 repetitions are requested using the `--benchmark_repetitions` command-line
 option, or at registration time, the benchmark function will be run several
@@ -247,10 +273,12 @@ information about the machine on which the benchmarks are run.
 Global setup/teardown specific to each benchmark can be done by
 passing a callback to Setup/Teardown:
 
-The setup/teardown callbacks will be invoked once for each benchmark.
-If the benchmark is multi-threaded (will run in k threads), they will be invoked exactly once before
-each run with k threads.
-If the benchmark uses different size groups of threads, the above will be true for each size group.
+The setup/teardown callbacks will be invoked once for each benchmark. If the
+benchmark is multi-threaded (will run in k threads), they will be invoked
+exactly once before each run with k threads.
+
+If the benchmark uses different size groups of threads, the above will be true
+for each size group.
 
 Eg.,
 
@@ -293,7 +321,7 @@ static void BM_memcpy(benchmark::State& state) {
   delete[] src;
   delete[] dst;
 }
-BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
+BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(8<<10);
 ```
 
 The preceding code is quite repetitive, and can be replaced with the following
@@ -322,7 +350,8 @@ the performance of `std::vector` initialization for uniformly increasing sizes.
 static void BM_DenseRange(benchmark::State& state) {
   for(auto _ : state) {
     std::vector<int> v(state.range(0), state.range(0));
-    benchmark::DoNotOptimize(v.data());
+    auto data = v.data();
+    benchmark::DoNotOptimize(data);
     benchmark::ClobberMemory();
   }
 }
@@ -362,17 +391,17 @@ short-hand. The following macro will pick a few appropriate arguments in the
 product of the two specified ranges and will generate a benchmark for each such
 pair.
 
-{% raw %}
+<!-- {% raw %} -->
 ```c++
 BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}});
 ```
-{% endraw %}
+<!-- {% endraw %} -->
 
 Some benchmarks may require specific argument values that cannot be expressed
 with `Ranges`. In this case, `ArgsProduct` offers the ability to generate a
 benchmark input for each combination in the product of the supplied vectors.
 
-{% raw %}
+<!-- {% raw %} -->
 ```c++
 BENCHMARK(BM_SetInsert)
     ->ArgsProduct({{1<<10, 3<<10, 8<<10}, {20, 40, 60, 80}})
@@ -391,7 +420,7 @@ BENCHMARK(BM_SetInsert)
     ->Args({3<<10, 80})
     ->Args({8<<10, 80});
 ```
-{% endraw %}
+<!-- {% endraw %} -->
 
 For the most common scenarios, helper methods for creating a list of
 integers for a given sparse or dense range are provided.
@@ -434,13 +463,22 @@ The `test_case_name` is appended to the name of the benchmark and
 should describe the values passed.
 
 ```c++
-template <class ...ExtraArgs>
-void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
-  [...]
+template <class ...Args>
+void BM_takes_args(benchmark::State& state, Args&&... args) {
+  auto args_tuple = std::make_tuple(std::move(args)...);
+  for (auto _ : state) {
+    std::cout << std::get<0>(args_tuple) << ": " << std::get<1>(args_tuple)
+              << '\n';
+    [...]
+  }
 }
 // Registers a benchmark named "BM_takes_args/int_string_test" that passes
-// the specified values to `extra_args`.
+// the specified values to `args`.
 BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
+
+// Registers a second benchmark, named "BM_takes_args/int_test", that passes
+// the specified values to `args`.
+BENCHMARK_CAPTURE(BM_takes_args, int_test, 42, 43);
 ```
 
 Note that elements of `...args` may refer to global variables. Users should
@@ -459,7 +497,8 @@ static void BM_StringCompare(benchmark::State& state) {
   std::string s1(state.range(0), '-');
   std::string s2(state.range(0), '-');
   for (auto _ : state) {
-    benchmark::DoNotOptimize(s1.compare(s2));
+    auto comparison_result = s1.compare(s2);
+    benchmark::DoNotOptimize(comparison_result);
   }
   state.SetComplexityN(state.range(0));
 }
@@ -537,6 +576,30 @@ Three macros are provided for adding benchmark templates.
 #define BENCHMARK_TEMPLATE2(func, arg1, arg2)
 ```
 
+<a name="templated-benchmarks-with-arguments" />
+
+## Templated Benchmarks that take arguments
+
+Sometimes there is a need to both template a benchmark and provide arguments to it.
+
+```c++
+template <class Q> void BM_Sequential_With_Step(benchmark::State& state, int step) {
+  Q q;
+  typename Q::value_type v;
+  for (auto _ : state) {
+    for (int i = state.range(0); i-=step; )
+      q.push(v);
+    for (int e = state.range(0); e-=step; )
+      q.Wait(&v);
+  }
+  // actually messages, not bytes:
+  state.SetBytesProcessed(
+      static_cast<int64_t>(state.iterations())*state.range(0));
+}
+
+BENCHMARK_TEMPLATE1_CAPTURE(BM_Sequential_With_Step, WaitQueue<int>, Step1, 1)
+    ->Range(1<<0, 1<<10);
+```
+
 <a name="fixtures" />
 
 ## Fixtures
@@ -554,10 +617,10 @@ For Example:
 ```c++
 class MyFixture : public benchmark::Fixture {
 public:
-  void SetUp(const ::benchmark::State& state) {
+  void SetUp(::benchmark::State& state) {
   }
 
-  void TearDown(const ::benchmark::State& state) {
+  void TearDown(::benchmark::State& state) {
   }
 };
 
@@ -668,7 +731,7 @@ is 1k a 1000 (default, `benchmark::Counter::OneK::kIs1000`), or 1024
 When you're compiling in C++11 mode or later you can use `insert()` with
 `std::initializer_list`:
 
-{% raw %}
+<!-- {% raw %} -->
 ```c++
   // With C++11, this can be done:
   state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
@@ -677,7 +740,7 @@ When you're compiling in C++11 mode or later you can use `insert()` with
   state.counters["Bar"] = numBars;
   state.counters["Baz"] = numBazs;
 ```
-{% endraw %}
+<!-- {% endraw %} -->
 
 ### Counter Reporting
 
@@ -773,6 +836,16 @@ static void BM_MultiThreaded(benchmark::State& state) {
 BENCHMARK(BM_MultiThreaded)->Threads(2);
 ```
 
+To run the benchmark across a range of thread counts, instead of `Threads`, use
+`ThreadRange`. This takes two parameters (`min_threads` and `max_threads`) and
+runs the benchmark once for values in the inclusive range. For example:
+
+```c++
+BENCHMARK(BM_MultiThreaded)->ThreadRange(1, 8);
+```
+
+will run `BM_MultiThreaded` with thread counts 1, 2, 4, and 8.
+
 If the benchmarked code itself uses threads and you want to compare it to
 single-threaded code, you may want to use real-time ("wallclock") measurements
 for latency comparisons:
@@ -814,7 +887,7 @@ BENCHMARK(BM_OpenMP)->Range(8, 8<<10);
 
 // Measure the user-visible time, the wall clock (literally, the time that
 // has passed on the clock on the wall), use it to decide for how long to
-// run the benchmark loop. This will always be meaningful, an will match the
+// run the benchmark loop. This will always be meaningful, and will match the
 // time spent by the main thread in single-threaded case, in general decreasing
 // with the number of internal threads doing the work.
 BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->UseRealTime();
@@ -836,7 +909,7 @@ is measured. But sometimes, it is necessary to do some work inside of
 that loop, every iteration, but without counting that time to the benchmark time.
 That is possible, although it is not recommended, since it has high overhead.
 
-{% raw %}
+<!-- {% raw %} -->
 ```c++
 static void BM_SetInsert_With_Timer_Control(benchmark::State& state) {
   std::set<int> data;
@@ -851,7 +924,7 @@ static void BM_SetInsert_With_Timer_Control(benchmark::State& state) {
 }
 BENCHMARK(BM_SetInsert_With_Timer_Control)->Ranges({{1<<10, 8<<10}, {128, 512}});
 ```
-{% endraw %}
+<!-- {% endraw %} -->
 
 <a name="manual-timing" />
 
@@ -906,6 +979,10 @@ order to manually set the time unit, you can specify it manually:
 BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 ```
 
+Additionally the default time unit can be set globally with the
+`--benchmark_time_unit={ns|us|ms|s}` command line argument. The argument only
+affects benchmarks where the time unit is not set explicitly.
+
 <a name="preventing-optimization" />
 
 ## Preventing Optimization
@@ -958,7 +1035,8 @@ static void BM_vector_push_back(benchmark::State& state) {
   for (auto _ : state) {
     std::vector<int> v;
     v.reserve(1);
-    benchmark::DoNotOptimize(v.data()); // Allow v.data() to be clobbered.
+    auto data = v.data();           // Allow v.data() to be clobbered. Pass as non-const
+    benchmark::DoNotOptimize(data); // lvalue to avoid undesired compiler optimizations
     v.push_back(42);
     benchmark::ClobberMemory(); // Force 42 to be written to memory.
   }
@@ -1037,10 +1115,25 @@ void BM_spin_empty(benchmark::State& state) {
 BENCHMARK(BM_spin_empty)
   ->ComputeStatistics("ratio", [](const std::vector<double>& v) -> double {
     return std::begin(v) / std::end(v);
-  }, benchmark::StatisticUnit::Percentage)
+  }, benchmark::StatisticUnit::kPercentage)
   ->Arg(512);
 ```
 
+<a name="memory-usage" />
+
+## Memory Usage
+
+It's often useful to also track memory usage for benchmarks, alongside CPU
+performance. For this reason, benchmark offers the `RegisterMemoryManager`
+method that allows a custom `MemoryManager` to be injected.
+
+If set, the `MemoryManager::Start` and `MemoryManager::Stop` methods will be
+called at the start and end of benchmark runs to allow user code to fill out
+a report on the number of allocations, bytes used, etc.
+
+This data will then be reported alongside other performance data, currently
+only when using JSON output.
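+
+A minimal sketch (the `TracingMemoryManager` class and its bookkeeping are
+illustrative; `benchmark::MemoryManager`, its `Start`/`Stop` hooks and
+`RegisterMemoryManager` are the library pieces being exercised):
+
+```c++
+#include <benchmark/benchmark.h>
+
+class TracingMemoryManager : public benchmark::MemoryManager {
+ public:
+  void Start() override {
+    // Reset whatever allocation tracking this manager maintains.
+  }
+  void Stop(Result& result) override {
+    // Fill out `result` (e.g. allocation counts) from that tracking so it
+    // can be reported alongside the run.
+  }
+};
+
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+  TracingMemoryManager manager;
+  benchmark::RegisterMemoryManager(&manager);
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+  return 0;
+}
+```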
+
 <a name="using-register-benchmark" />
 
 ## Using RegisterBenchmark(name, fn, args...)
@@ -1077,7 +1170,7 @@ int main(int argc, char** argv) {
 
 When errors caused by external influences, such as file I/O and network
 communication, occur within a benchmark the
-`State::SkipWithError(const char* msg)` function can be used to skip that run
+`State::SkipWithError(const std::string& msg)` function can be used to skip that run
 of the benchmark and report the error. Note that only future iterations of the
 `KeepRunning()` loop are skipped. For the ranged-for version of the benchmark
 loop, users must explicitly exit the loop; otherwise all iterations will be
 performed.
@@ -1188,13 +1281,12 @@ the benchmark loop should be preferred.
 If you see this error:
 
 ```
-***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may
+be noisy and will incur extra overhead.
 ```
 
-you might want to disable the CPU frequency scaling while running the benchmark:
+you might want to disable the CPU frequency scaling while running the
+benchmark, as well as consider other ways to stabilize the performance of
+your system while benchmarking.
 
-```bash
-sudo cpupower frequency-set --governor performance
-./mybench
-sudo cpupower frequency-set --governor powersave
-```
+See [Reducing Variance](reducing_variance.md) for more information.
diff --git a/third-party/benchmark/include/benchmark/benchmark.h b/third-party/benchmark/include/benchmark/benchmark.h
index 6287c0afbdcf8c..08cfe29da344ec 100644
--- a/third-party/benchmark/include/benchmark/benchmark.h
+++ b/third-party/benchmark/include/benchmark/benchmark.h
@@ -187,6 +187,8 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 #include <utility>
 #include <vector>
 
+#include "benchmark/export.h"
+
 #if defined(BENCHMARK_HAS_CXX11)
 #include <atomic>
 #include <initializer_list>
@@ -216,37 +218,45 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 #define BENCHMARK_UNUSED
 #endif
 
+// Used to annotate functions, methods and classes so they
+// are not optimized by the compiler. Useful for tests
+// where you expect loops to stay in place churning cycles
+#if defined(__clang__)
+#define BENCHMARK_DONT_OPTIMIZE __attribute__((optnone))
+#elif defined(__GNUC__) || defined(__GNUG__)
+#define BENCHMARK_DONT_OPTIMIZE __attribute__((optimize(0)))
+#else
+// MSVC & Intel do not have a no-optimize attribute, only line pragmas
+#define BENCHMARK_DONT_OPTIMIZE
+#endif
+
 #if defined(__GNUC__) || defined(__clang__)
 #define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
-#define BENCHMARK_NOEXCEPT noexcept
-#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
 #elif defined(_MSC_VER) && !defined(__clang__)
 #define BENCHMARK_ALWAYS_INLINE __forceinline
-#if _MSC_VER >= 1900
-#define BENCHMARK_NOEXCEPT noexcept
-#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
-#else
-#define BENCHMARK_NOEXCEPT
-#define BENCHMARK_NOEXCEPT_OP(x)
-#endif
 #define __func__ __FUNCTION__
 #else
 #define BENCHMARK_ALWAYS_INLINE
-#define BENCHMARK_NOEXCEPT
-#define BENCHMARK_NOEXCEPT_OP(x)
 #endif
 
 #define BENCHMARK_INTERNAL_TOSTRING2(x) #x
 #define BENCHMARK_INTERNAL_TOSTRING(x) BENCHMARK_INTERNAL_TOSTRING2(x)
 
 // clang-format off
-#if defined(__GNUC__) || defined(__clang__)
+#if (defined(__GNUC__) && !defined(__NVCC__) && !defined(__NVCOMPILER)) || defined(__clang__)
 #define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
 #define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
 #define BENCHMARK_DISABLE_DEPRECATED_WARNING \
   _Pragma("GCC diagnostic push")             \
   _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
 #define BENCHMARK_RESTORE_DEPRECATED_WARNING _Pragma("GCC diagnostic pop")
+#elif defined(__NVCOMPILER)
+#define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
+#define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
+#define BENCHMARK_DISABLE_DEPRECATED_WARNING \
+  _Pragma("diagnostic push") \
+  _Pragma("diag_suppress deprecated_entity_with_custom_message")
+#define BENCHMARK_RESTORE_DEPRECATED_WARNING _Pragma("diagnostic pop")
 #else
 #define BENCHMARK_BUILTIN_EXPECT(x, y) x
 #define BENCHMARK_DEPRECATED_MSG(msg)
@@ -280,18 +290,47 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 #define BENCHMARK_OVERRIDE
 #endif
 
+#if defined(_MSC_VER)
+#pragma warning(push)
+// C4251: <symbol> needs to have dll-interface to be used by clients of class
+#pragma warning(disable : 4251)
+#endif
+
 namespace benchmark {
 class BenchmarkReporter;
 
-void Initialize(int* argc, char** argv);
-void Shutdown();
+// Default minimum benchmark running time, in seconds.
+const char kDefaultMinTimeStr[] = "0.5s";
+
+// Returns the version of the library.
+BENCHMARK_EXPORT std::string GetBenchmarkVersion();
+
+BENCHMARK_EXPORT void PrintDefaultHelp();
+
+BENCHMARK_EXPORT void Initialize(int* argc, char** argv,
+                                 void (*HelperPrinterf)() = PrintDefaultHelp);
+BENCHMARK_EXPORT void Shutdown();
 
 // Report to stdout all arguments in 'argv' as unrecognized except the first.
 // Returns true if there is at least one unrecognized argument (i.e. 'argc' > 1).
-bool ReportUnrecognizedArguments(int argc, char** argv);
+BENCHMARK_EXPORT bool ReportUnrecognizedArguments(int argc, char** argv);
 
 // Returns the current value of --benchmark_filter.
-std::string GetBenchmarkFilter();
+BENCHMARK_EXPORT std::string GetBenchmarkFilter();
+
+// Sets a new value to --benchmark_filter. (This will override this flag's
+// current value).
+// Should be called after `benchmark::Initialize()`, as
+// `benchmark::Initialize()` will override the flag's value.
+BENCHMARK_EXPORT void SetBenchmarkFilter(std::string value);
+
+// Returns the current value of --v (command line value for verbosity).
+BENCHMARK_EXPORT int32_t GetBenchmarkVerbosity();
+
+// Creates a default display reporter. Used by the library when no display
+// reporter is provided, but also made available for external use in case a
+// custom reporter should respect the `--benchmark_format` flag as a fallback
+BENCHMARK_EXPORT BenchmarkReporter* CreateDefaultDisplayReporter();
 
 // Generate a list of benchmarks matching the specified --benchmark_filter flag
 // and if --benchmark_list_tests is specified return after printing the name
@@ -305,22 +344,33 @@ std::string GetBenchmarkFilter();
 // The second and third overload use the specified 'display_reporter' and
 //  'file_reporter' respectively. 'file_reporter' will write to the file
 //  specified
-//   by '--benchmark_output'. If '--benchmark_output' is not given the
+//   by '--benchmark_out'. If '--benchmark_out' is not given the
 //  'file_reporter' is ignored.
 //
 // RETURNS: The number of matching benchmarks.
-size_t RunSpecifiedBenchmarks();
-size_t RunSpecifiedBenchmarks(std::string spec);
+BENCHMARK_EXPORT size_t RunSpecifiedBenchmarks();
+BENCHMARK_EXPORT size_t RunSpecifiedBenchmarks(std::string spec);
+
+BENCHMARK_EXPORT size_t
+RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter);
+BENCHMARK_EXPORT size_t
+RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, std::string spec);
 
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter);
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
-                              std::string spec);
+BENCHMARK_EXPORT size_t RunSpecifiedBenchmarks(
+    BenchmarkReporter* display_reporter, BenchmarkReporter* file_reporter);
+BENCHMARK_EXPORT size_t
+RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
+                       BenchmarkReporter* file_reporter, std::string spec);
 
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
-                              BenchmarkReporter* file_reporter);
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
-                              BenchmarkReporter* file_reporter,
-                              std::string spec);
+// TimeUnit is passed to a benchmark in order to specify the order of magnitude
+// for the measured time.
+enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond, kSecond };
+
+BENCHMARK_EXPORT TimeUnit GetDefaultTimeUnit();
+
+// Sets the default time unit the benchmarks use
+// Has to be called before the benchmark loop to take effect
+BENCHMARK_EXPORT void SetDefaultTimeUnit(TimeUnit unit);
 
 // If a MemoryManager is registered (via RegisterMemoryManager()),
 // it can be used to collect and report allocation metrics for a run of the
@@ -358,20 +408,16 @@ class MemoryManager {
   virtual void Start() = 0;
 
   // Implement this to stop recording and fill out the given Result structure.
-  BENCHMARK_DEPRECATED_MSG("Use Stop(Result&) instead")
-  virtual void Stop(Result* result) = 0;
-
-  // FIXME(vyng): Make this pure virtual once we've migrated current users.
-  BENCHMARK_DISABLE_DEPRECATED_WARNING
-  virtual void Stop(Result& result) { Stop(&result); }
-  BENCHMARK_RESTORE_DEPRECATED_WARNING
+  virtual void Stop(Result& result) = 0;
 };
 
 // Register a MemoryManager instance that will be used to collect and report
 // allocation measurements for benchmark runs.
+BENCHMARK_EXPORT
 void RegisterMemoryManager(MemoryManager* memory_manager);
 
 // Add a key-value pair to output as part of the context stanza in the report.
+BENCHMARK_EXPORT
 void AddCustomContext(const std::string& key, const std::string& value);
 
 namespace internal {
@@ -379,14 +425,17 @@ class Benchmark;
 class BenchmarkImp;
 class BenchmarkFamilies;
 
+BENCHMARK_EXPORT std::map<std::string, std::string>*& GetGlobalContext();
+
+BENCHMARK_EXPORT
 void UseCharPointer(char const volatile*);
 
 // Take ownership of the pointer and register the benchmark. Return the
 // registered benchmark.
-Benchmark* RegisterBenchmarkInternal(Benchmark*);
+BENCHMARK_EXPORT Benchmark* RegisterBenchmarkInternal(Benchmark*);
 
 // Ensure that the standard streams are properly initialized in every TU.
-int InitializeStreams();
+BENCHMARK_EXPORT int InitializeStreams();
 BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
 
 }  // namespace internal
@@ -409,7 +458,11 @@ inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
 // intended to add little to no overhead.
 // See: https://youtu.be/nXaxk27zwlk?t=2441
 #ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
+#if !defined(__GNUC__) || defined(__llvm__) || defined(__INTEL_COMPILER)
 template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+    "The const-ref version of this method can permit "
+    "undesired compiler optimizations in benchmarks")
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
   asm volatile("" : : "r,m"(value) : "memory");
 }
@@ -423,6 +476,98 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
 #endif
 }
 
+#ifdef BENCHMARK_HAS_CXX11
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) {
+#if defined(__clang__)
+  asm volatile("" : "+r,m"(value) : : "memory");
+#else
+  asm volatile("" : "+m,r"(value) : : "memory");
+#endif
+}
+#endif
+#elif defined(BENCHMARK_HAS_CXX11) && (__GNUC__ >= 5)
+// Workaround for a bug with full argument copy overhead with GCC.
+// See: #1340 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105519
+template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+    "The const-ref version of this method can permit "
+    "undesired compiler optimizations in benchmarks")
+inline BENCHMARK_ALWAYS_INLINE
+    typename std::enable_if<std::is_trivially_copyable<Tp>::value &&
+                            (sizeof(Tp) <= sizeof(Tp*))>::type
+    DoNotOptimize(Tp const& value) {
+  asm volatile("" : : "r,m"(value) : "memory");
+}
+
+template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+    "The const-ref version of this method can permit "
+    "undesired compiler optimizations in benchmarks")
+inline BENCHMARK_ALWAYS_INLINE
+    typename std::enable_if<!std::is_trivially_copyable<Tp>::value ||
+                            (sizeof(Tp) > sizeof(Tp*))>::type
+    DoNotOptimize(Tp const& value) {
+  asm volatile("" : : "m"(value) : "memory");
+}
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE
+    typename std::enable_if<std::is_trivially_copyable<Tp>::value &&
+                            (sizeof(Tp) <= sizeof(Tp*))>::type
+    DoNotOptimize(Tp& value) {
+  asm volatile("" : "+m,r"(value) : : "memory");
+}
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE
+    typename std::enable_if<!std::is_trivially_copyable<Tp>::value ||
+                            (sizeof(Tp) > sizeof(Tp*))>::type
+    DoNotOptimize(Tp& value) {
+  asm volatile("" : "+m"(value) : : "memory");
+}
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE
+    typename std::enable_if<std::is_trivially_copyable<Tp>::value &&
+                            (sizeof(Tp) <= sizeof(Tp*))>::type
+    DoNotOptimize(Tp&& value) {
+  asm volatile("" : "+m,r"(value) : : "memory");
+}
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE
+    typename std::enable_if<!std::is_trivially_copyable<Tp>::value ||
+                            (sizeof(Tp) > sizeof(Tp*))>::type
+    DoNotOptimize(Tp&& value) {
+  asm volatile("" : "+m"(value) : : "memory");
+}
+
+#else
+// Fallback for GCC < 5. Can add some overhead because the compiler is forced
+// to use memory operations instead of operations with registers.
+// TODO: Remove once GCC < 5 is no longer supported.
+template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+    "The const-ref version of this method can permit "
+    "undesired compiler optimizations in benchmarks")
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+  asm volatile("" : : "m"(value) : "memory");
+}
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
+  asm volatile("" : "+m"(value) : : "memory");
+}
+
+#ifdef BENCHMARK_HAS_CXX11
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) {
+  asm volatile("" : "+m"(value) : : "memory");
+}
+#endif
+#endif
+
 #ifndef BENCHMARK_HAS_CXX11
 inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
   asm volatile("" : : : "memory");
@@ -430,6 +575,9 @@ inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
 #endif
 #elif defined(_MSC_VER)
 template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+    "The const-ref version of this method can permit "
+    "undesired compiler optimizations in benchmarks")
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
   internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
   _ReadWriteBarrier();
@@ -439,10 +587,25 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
 inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { _ReadWriteBarrier(); }
 #endif
 #else
+#ifdef BENCHMARK_HAS_CXX11
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) {
+  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
+}
+#else
 template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+    "The const-ref version of this method can permit "
+    "undesired compiler optimizations in benchmarks")
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
   internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
 }
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
+  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
+}
+#endif
 // FIXME Add ClobberMemory() for non-gnu and non-msvc compilers, before C++11.
 #endif
 
@@ -506,23 +669,21 @@ Counter::Flags inline operator|(const Counter::Flags& LHS,
 // This is the container for the user-defined counters.
 typedef std::map<std::string, Counter> UserCounters;
 
-// TimeUnit is passed to a benchmark in order to specify the order of magnitude
-// for the measured time.
-enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond, kSecond };
-
 // BigO is passed to a benchmark in order to specify the asymptotic
 // computational
 // complexity for the benchmark. In case oAuto is selected, complexity will be
 // calculated automatically to the best fit.
 enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda };
 
-typedef uint64_t IterationCount;
+typedef int64_t ComplexityN;
+
+typedef int64_t IterationCount;
 
 enum StatisticUnit { kTime, kPercentage };
 
 // BigOFunc is passed to a benchmark in order to specify the asymptotic
 // computational complexity for the benchmark.
-typedef double(BigOFunc)(IterationCount);
+typedef double(BigOFunc)(ComplexityN);
 
 // StatisticsFunc is passed to a benchmark in order to compute some descriptive
 // statistics over all the measurements of some type
@@ -564,11 +725,21 @@ enum AggregationReportMode
       ARM_FileReportAggregatesOnly | ARM_DisplayReportAggregatesOnly
 };
 
+enum Skipped
+#if defined(BENCHMARK_HAS_CXX11)
+    : unsigned
+#endif
+{
+  NotSkipped = 0,
+  SkippedWithMessage,
+  SkippedWithError
+};
+
 }  // namespace internal
 
 // State is passed to a running Benchmark and contains state for the
 // benchmark to use.
-class State {
+class BENCHMARK_EXPORT State {
  public:
   struct StateIterator;
   friend struct StateIterator;
@@ -580,13 +751,13 @@ class State {
   // have been called previously.
   //
   // NOTE: KeepRunning may not be used after calling either of these functions.
-  BENCHMARK_ALWAYS_INLINE StateIterator begin();
-  BENCHMARK_ALWAYS_INLINE StateIterator end();
+  inline BENCHMARK_ALWAYS_INLINE StateIterator begin();
+  inline BENCHMARK_ALWAYS_INLINE StateIterator end();
 
   // Returns true if the benchmark should continue through another iteration.
   // NOTE: A benchmark may not return from the test until KeepRunning() has
   // returned false.
-  bool KeepRunning();
+  inline bool KeepRunning();
 
   // Returns true iff the benchmark should run n more iterations.
   // REQUIRES: 'n' > 0.
@@ -598,10 +769,10 @@ class State {
   //   while (state.KeepRunningBatch(1000)) {
   //     // process 1000 elements
   //   }
-  bool KeepRunningBatch(IterationCount n);
+  inline bool KeepRunningBatch(IterationCount n);
 
-  // REQUIRES: timer is running and 'SkipWithError(...)' has not been called
-  //           by the current thread.
+  // REQUIRES: timer is running and 'SkipWithMessage(...)' or
+  //   'SkipWithError(...)' has not been called by the current thread.
   // Stop the benchmark timer.  If not called, the timer will be
   // automatically stopped after the last iteration of the benchmark loop.
   //
@@ -616,8 +787,8 @@ class State {
   // within each benchmark iteration, if possible.
   void PauseTiming();
 
-  // REQUIRES: timer is not running and 'SkipWithError(...)' has not been called
-  //           by the current thread.
+  // REQUIRES: timer is not running and 'SkipWithMessage(...)' or
+  //   'SkipWithError(...)' has not been called by the current thread.
   // Start the benchmark timer.  The timer is NOT running on entrance to the
   // benchmark function. It begins running after control flow enters the
   // benchmark loop.
@@ -627,8 +798,30 @@ class State {
   // within each benchmark iteration, if possible.
   void ResumeTiming();
 
-  // REQUIRES: 'SkipWithError(...)' has not been called previously by the
-  //            current thread.
+  // REQUIRES: 'SkipWithMessage(...)' or 'SkipWithError(...)' has not been
+  //            called previously by the current thread.
+  // Report the benchmark as resulting in being skipped with the specified
+  // 'msg'.
+  // After this call the user may explicitly 'return' from the benchmark.
+  //
+  // If the ranged-for style of benchmark loop is used, the user must explicitly
+  // break from the loop, otherwise all future iterations will be run.
+  // If the 'KeepRunning()' loop is used the current thread will automatically
+  // exit the loop at the end of the current iteration.
+  //
+  // For threaded benchmarks only the current thread stops executing and future
+  // calls to `KeepRunning()` will block until all threads have completed
+  // the `KeepRunning()` loop. If multiple threads report being skipped only the
+  // first skip message is used.
+  //
+  // NOTE: Calling 'SkipWithMessage(...)' does not cause the benchmark to exit
+  // the current scope immediately. If the function is called from within
+  // the 'KeepRunning()' loop the current iteration will finish. It is the user's
+  // responsibility to exit the scope as needed.
+  void SkipWithMessage(const std::string& msg);
+
+  // REQUIRES: 'SkipWithMessage(...)' or 'SkipWithError(...)' has not been
+  //            called previously by the current thread.
   // Report the benchmark as resulting in an error with the specified 'msg'.
   // After this call the user may explicitly 'return' from the benchmark.
   //
@@ -646,10 +839,13 @@ class State {
   // the current scope immediately. If the function is called from within
   // the 'KeepRunning()' loop the current iteration will finish. It is the user's
   // responsibility to exit the scope as needed.
-  void SkipWithError(const char* msg);
+  void SkipWithError(const std::string& msg);
+
+  // Returns true if 'SkipWithMessage(...)' or 'SkipWithError(...)' was called.
+  bool skipped() const { return internal::NotSkipped != skipped_; }
 
   // Returns true if an error has been reported with 'SkipWithError(...)'.
-  bool error_occurred() const { return error_occurred_; }
+  bool error_occurred() const { return internal::SkippedWithError == skipped_; }
 
   // REQUIRES: called exactly once per iteration of the benchmarking loop.
   // Set the manually measured time for this benchmark iteration, which
@@ -684,10 +880,12 @@ class State {
   // and complexity_n will
   // represent the length of N.
   BENCHMARK_ALWAYS_INLINE
-  void SetComplexityN(int64_t complexity_n) { complexity_n_ = complexity_n; }
+  void SetComplexityN(ComplexityN complexity_n) {
+    complexity_n_ = complexity_n;
+  }
 
   BENCHMARK_ALWAYS_INLINE
-  int64_t complexity_length_n() const { return complexity_n_; }
+  ComplexityN complexity_length_n() const { return complexity_n_; }
 
   // If this routine is called with items > 0, then an items/s
   // label is printed on the benchmark report line for the currently
@@ -720,11 +918,7 @@ class State {
   //  BM_Compress   50         50   14115038  compress:27.3%
   //
   // REQUIRES: a benchmark has exited its benchmarking loop.
-  void SetLabel(const char* label);
-
-  void BENCHMARK_ALWAYS_INLINE SetLabel(const std::string& str) {
-    this->SetLabel(str.c_str());
-  }
+  void SetLabel(const std::string& label);
 
   // Range arguments for this run. CHECKs if the argument has been set.
   BENCHMARK_ALWAYS_INLINE
@@ -755,6 +949,9 @@ class State {
     return max_iterations - total_iterations_ + batch_leftover_;
   }
 
+  BENCHMARK_ALWAYS_INLINE
+  std::string name() const { return name_; }
+
  private:
   // items we expect on the first cache line (ie 64 bytes of the struct)
   // When total_iterations_ is 0, KeepRunning() and friends will return false.
@@ -772,29 +969,30 @@ class State {
  private:
   bool started_;
   bool finished_;
-  bool error_occurred_;
+  internal::Skipped skipped_;
 
   // items we don't need on the first cache line
   std::vector<int64_t> range_;
 
-  int64_t complexity_n_;
+  ComplexityN complexity_n_;
 
  public:
   // Container for user-defined counters.
   UserCounters counters;
 
  private:
-  State(IterationCount max_iters, const std::vector<int64_t>& ranges,
-        int thread_i, int n_threads, internal::ThreadTimer* timer,
-        internal::ThreadManager* manager,
+  State(std::string name, IterationCount max_iters,
+        const std::vector<int64_t>& ranges, int thread_i, int n_threads,
+        internal::ThreadTimer* timer, internal::ThreadManager* manager,
         internal::PerfCountersMeasurement* perf_counters_measurement);
 
   void StartKeepRunning();
   // Implementation of KeepRunning() and KeepRunningBatch().
   // is_batch must be true unless n is 1.
-  bool KeepRunningInternal(IterationCount n, bool is_batch);
+  inline bool KeepRunningInternal(IterationCount n, bool is_batch);
   void FinishKeepRunning();
 
+  const std::string name_;
   const int thread_index_;
   const int threads_;
 
@@ -826,7 +1024,7 @@ inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningInternal(IterationCount n,
   }
   if (!started_) {
     StartKeepRunning();
-    if (!error_occurred_ && total_iterations_ >= n) {
+    if (!skipped() && total_iterations_ >= n) {
       total_iterations_ -= n;
       return true;
     }
@@ -856,7 +1054,7 @@ struct State::StateIterator {
 
   BENCHMARK_ALWAYS_INLINE
   explicit StateIterator(State* st)
-      : cached_(st->error_occurred_ ? 0 : st->max_iterations), parent_(st) {}
+      : cached_(st->skipped() ? 0 : st->max_iterations), parent_(st) {}
 
  public:
   BENCHMARK_ALWAYS_INLINE
@@ -899,7 +1097,7 @@ typedef void(Function)(State&);
 // be called on this object to change the properties of the benchmark.
 // Each method returns "this" so that multiple method calls can
 // chained into one expression.
-class Benchmark {
+class BENCHMARK_EXPORT Benchmark {
  public:
   virtual ~Benchmark();
 
@@ -971,7 +1169,7 @@ class Benchmark {
 
   // Have "setup" and/or "teardown" invoked once for every benchmark run.
   // If the benchmark is multi-threaded (will run in k threads concurrently),
-  // the setup callback will be invoked exactly once (not k times) before
+  // the setup callback will be invoked exactly once (not k times) before
   // each run with k threads. Time allowing (e.g. for a short benchmark), there
   // may be multiple such runs per benchmark, each run with its own
   // "setup"/"teardown".
@@ -1000,12 +1198,19 @@ class Benchmark {
   // REQUIRES: `t > 0` and `Iterations` has not been called on this benchmark.
   Benchmark* MinTime(double t);
 
+  // Set the minimum amount of time to run the benchmark before taking runtimes
+  // of this benchmark into account. This option overrides the
+  // `benchmark_min_warmup_time` flag.
+  // REQUIRES: `t >= 0` and `Iterations` has not been called on this benchmark.
+  Benchmark* MinWarmUpTime(double t);
+
   // Specify the amount of iterations that should be run by this benchmark.
+  // This option overrides the `benchmark_min_time` flag.
   // REQUIRES: 'n > 0' and `MinTime` has not been called on this benchmark.
   //
   // NOTE: This function should only be used when *exact* iteration control is
   //   needed and never to control or limit how long a benchmark runs, where
-  // `--benchmark_min_time=N` or `MinTime(...)` should be used instead.
+  // `--benchmark_min_time=<N>s` or `MinTime(...)` should be used instead.
   Benchmark* Iterations(IterationCount n);
 
   // Specify the amount of times to repeat this benchmark. This option overrides
@@ -1025,7 +1230,7 @@ class Benchmark {
   // By default, the CPU time is measured only for the main thread, which may
   // be unrepresentative if the benchmark uses threads internally. If called,
   // the total CPU time spent by all the threads will be measured instead.
-  // By default, the only the main thread CPU time will be measured.
+  // By default, only the main thread CPU time will be measured.
   Benchmark* MeasureProcessCPUTime();
 
   // If a particular benchmark should use the Wall clock instead of the CPU time
@@ -1090,12 +1295,16 @@ class Benchmark {
 
   virtual void Run(State& state) = 0;
 
+  TimeUnit GetTimeUnit() const;
+
  protected:
-  explicit Benchmark(const char* name);
-  Benchmark(Benchmark const&);
-  void SetName(const char* name);
+  explicit Benchmark(const std::string& name);
+  void SetName(const std::string& name);
 
+ public:
+  const char* GetName() const;
   int ArgsCnt() const;
+  const char* GetArgName(int arg) const;
 
  private:
   friend class BenchmarkFamilies;
@@ -1105,9 +1314,13 @@ class Benchmark {
   AggregationReportMode aggregation_report_mode_;
   std::vector<std::string> arg_names_;       // Args for all benchmark runs
   std::vector<std::vector<int64_t> > args_;  // Args for all benchmark runs
+
   TimeUnit time_unit_;
+  bool use_default_time_unit_;
+
   int range_multiplier_;
   double min_time_;
+  double min_warmup_time_;
   IterationCount iterations_;
   int repetitions_;
   bool measure_process_cpu_time_;
@@ -1122,7 +1335,17 @@ class Benchmark {
   callback_function setup_;
   callback_function teardown_;
 
-  Benchmark& operator=(Benchmark const&);
+  Benchmark(Benchmark const&)
+#if defined(BENCHMARK_HAS_CXX11)
+      = delete
+#endif
+      ;
+
+  Benchmark& operator=(Benchmark const&)
+#if defined(BENCHMARK_HAS_CXX11)
+      = delete
+#endif
+      ;
 };
 
 }  // namespace internal
@@ -1131,27 +1354,27 @@ class Benchmark {
 // the specified functor 'fn'.
 //
 // RETURNS: A pointer to the registered benchmark.
-internal::Benchmark* RegisterBenchmark(const char* name,
+internal::Benchmark* RegisterBenchmark(const std::string& name,
                                        internal::Function* fn);
 
 #if defined(BENCHMARK_HAS_CXX11)
 template <class Lambda>
-internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn);
+internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn);
 #endif
 
 // Remove all registered benchmarks. All pointers to previously registered
 // benchmarks are invalidated.
-void ClearRegisteredBenchmarks();
+BENCHMARK_EXPORT void ClearRegisteredBenchmarks();
 
 namespace internal {
 // The class used to hold all Benchmarks created from static function.
 // (ie those created using the BENCHMARK(...) macros.
-class FunctionBenchmark : public Benchmark {
+class BENCHMARK_EXPORT FunctionBenchmark : public Benchmark {
  public:
-  FunctionBenchmark(const char* name, Function* func)
+  FunctionBenchmark(const std::string& name, Function* func)
       : Benchmark(name), func_(func) {}
 
-  virtual void Run(State& st) BENCHMARK_OVERRIDE;
+  void Run(State& st) BENCHMARK_OVERRIDE;
 
  private:
   Function* func_;
@@ -1161,35 +1384,38 @@ class FunctionBenchmark : public Benchmark {
 template <class Lambda>
 class LambdaBenchmark : public Benchmark {
  public:
-  virtual void Run(State& st) BENCHMARK_OVERRIDE { lambda_(st); }
+  void Run(State& st) BENCHMARK_OVERRIDE { lambda_(st); }
 
  private:
   template <class OLambda>
-  LambdaBenchmark(const char* name, OLambda&& lam)
+  LambdaBenchmark(const std::string& name, OLambda&& lam)
       : Benchmark(name), lambda_(std::forward<OLambda>(lam)) {}
 
   LambdaBenchmark(LambdaBenchmark const&) = delete;
 
   template <class Lam>  // NOLINTNEXTLINE(readability-redundant-declaration)
-  friend Benchmark* ::benchmark::RegisterBenchmark(const char*, Lam&&);
+  friend Benchmark* ::benchmark::RegisterBenchmark(const std::string&, Lam&&);
 
   Lambda lambda_;
 };
 #endif
-
 }  // namespace internal
 
-inline internal::Benchmark* RegisterBenchmark(const char* name,
+inline internal::Benchmark* RegisterBenchmark(const std::string& name,
                                               internal::Function* fn) {
+  // FIXME: this should be a `std::make_unique<>()` but we don't have C++14.
+  // codechecker_intentional [cplusplus.NewDeleteLeaks]
   return internal::RegisterBenchmarkInternal(
       ::new internal::FunctionBenchmark(name, fn));
 }
 
 #ifdef BENCHMARK_HAS_CXX11
 template <class Lambda>
-internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn) {
+internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn) {
   using BenchType =
       internal::LambdaBenchmark<typename std::decay<Lambda>::type>;
+  // FIXME: this should be a `std::make_unique<>()` but we don't have C++14.
+  // codechecker_intentional [cplusplus.NewDeleteLeaks]
   return internal::RegisterBenchmarkInternal(
       ::new BenchType(name, std::forward<Lambda>(fn)));
 }
@@ -1198,7 +1424,7 @@ internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn) {
 #if defined(BENCHMARK_HAS_CXX11) && \
     (!defined(BENCHMARK_GCC_VERSION) || BENCHMARK_GCC_VERSION >= 409)
 template <class Lambda, class... Args>
-internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn,
+internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn,
                                        Args&&... args) {
   return benchmark::RegisterBenchmark(
       name, [=](benchmark::State& st) { fn(st, args...); });
@@ -1212,7 +1438,7 @@ class Fixture : public internal::Benchmark {
  public:
   Fixture() : internal::Benchmark("") {}
 
-  virtual void Run(State& st) BENCHMARK_OVERRIDE {
+  void Run(State& st) BENCHMARK_OVERRIDE {
     this->SetUp(st);
     this->BenchmarkCase(st);
     this->TearDown(st);
@@ -1228,7 +1454,6 @@ class Fixture : public internal::Benchmark {
  protected:
   virtual void BenchmarkCase(State&) = 0;
 };
-
 }  // namespace benchmark
 
 // ------------------------------------------------------
@@ -1268,7 +1493,7 @@ class Fixture : public internal::Benchmark {
   BENCHMARK_PRIVATE_DECLARE(_benchmark_) =                           \
       (::benchmark::internal::RegisterBenchmarkInternal(             \
           new ::benchmark::internal::FunctionBenchmark(#__VA_ARGS__, \
-                                                       &__VA_ARGS__)))
+                                                       __VA_ARGS__)))
 #else
 #define BENCHMARK(n)                                     \
   BENCHMARK_PRIVATE_DECLARE(n) =                         \
@@ -1298,7 +1523,7 @@ class Fixture : public internal::Benchmark {
 // /* Registers a benchmark named "BM_takes_args/int_string_test` */
 // BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
 #define BENCHMARK_CAPTURE(func, test_case_name, ...)     \
-  BENCHMARK_PRIVATE_DECLARE(func) =                      \
+  BENCHMARK_PRIVATE_DECLARE(_benchmark_) =               \
       (::benchmark::internal::RegisterBenchmarkInternal( \
           new ::benchmark::internal::FunctionBenchmark(  \
               #func "/" #test_case_name,                 \
@@ -1335,37 +1560,62 @@ class Fixture : public internal::Benchmark {
 #define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
 #endif
 
-#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)                  \
-  class BaseClass##_##Method##_Benchmark : public BaseClass {           \
-   public:                                                              \
-    BaseClass##_##Method##_Benchmark() {                                \
-      this->SetName(#BaseClass "/" #Method);                            \
-    }                                                                   \
-                                                                        \
-   protected:                                                           \
-    virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
+#ifdef BENCHMARK_HAS_CXX11
+// This will register a benchmark for a templatized function,
+// with the additional arguments specified by `...`.
+//
+// For example:
+//
+// template <typename T, class ...ExtraArgs>
+// void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
+//  [...]
+// }
+// /* Registers a benchmark named "BM_takes_args<void>/int_string_test" */
+// BENCHMARK_TEMPLATE1_CAPTURE(BM_takes_args, void, int_string_test, 42,
+//                             std::string("abc"));
+#define BENCHMARK_TEMPLATE1_CAPTURE(func, a, test_case_name, ...) \
+  BENCHMARK_CAPTURE(func<a>, test_case_name, __VA_ARGS__)
+
+#define BENCHMARK_TEMPLATE2_CAPTURE(func, a, b, test_case_name, ...) \
+  BENCHMARK_PRIVATE_DECLARE(func) =                                  \
+      (::benchmark::internal::RegisterBenchmarkInternal(             \
+          new ::benchmark::internal::FunctionBenchmark(              \
+              #func "<" #a "," #b ">"                                \
+                    "/" #test_case_name,                             \
+              [](::benchmark::State& st) { func<a, b>(st, __VA_ARGS__); })))
+#endif  // BENCHMARK_HAS_CXX11
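
Illustrative only, not part of the patch: a minimal sketch of how the two new capture macros above might be used. BM_construct and the captured element counts are hypothetical.

    #include <cstddef>
    #include <tuple>
    #include <vector>
    #include "benchmark/benchmark.h"

    template <typename T, class... ExtraArgs>
    void BM_construct(benchmark::State& state, ExtraArgs&&... extra_args) {
      // The captured argument is the number of elements to construct.
      const auto size = std::get<0>(std::make_tuple(extra_args...));
      for (auto _ : state) {
        std::vector<T> v(static_cast<std::size_t>(size));
        benchmark::DoNotOptimize(v.data());
      }
    }
    // Registers "BM_construct<int>/small" with one captured argument.
    BENCHMARK_TEMPLATE1_CAPTURE(BM_construct, int, small, 64);
    // Registers "BM_construct<int,long>/large"; the second template argument
    // is absorbed by the ExtraArgs pack.
    BENCHMARK_TEMPLATE2_CAPTURE(BM_construct, int, long, large, 1 << 20);
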
+
+#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)          \
+  class BaseClass##_##Method##_Benchmark : public BaseClass {   \
+   public:                                                      \
+    BaseClass##_##Method##_Benchmark() {                        \
+      this->SetName(#BaseClass "/" #Method);                    \
+    }                                                           \
+                                                                \
+   protected:                                                   \
+    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
   };
 
-#define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a)     \
-  class BaseClass##_##Method##_Benchmark : public BaseClass<a> {        \
-   public:                                                              \
-    BaseClass##_##Method##_Benchmark() {                                \
-      this->SetName(#BaseClass "<" #a ">/" #Method);                    \
-    }                                                                   \
-                                                                        \
-   protected:                                                           \
-    virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
+#define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
+  class BaseClass##_##Method##_Benchmark : public BaseClass<a> {    \
+   public:                                                          \
+    BaseClass##_##Method##_Benchmark() {                            \
+      this->SetName(#BaseClass "<" #a ">/" #Method);                \
+    }                                                               \
+                                                                    \
+   protected:                                                       \
+    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE;     \
   };
 
-#define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b)  \
-  class BaseClass##_##Method##_Benchmark : public BaseClass<a, b> {     \
-   public:                                                              \
-    BaseClass##_##Method##_Benchmark() {                                \
-      this->SetName(#BaseClass "<" #a "," #b ">/" #Method);             \
-    }                                                                   \
-                                                                        \
-   protected:                                                           \
-    virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
+#define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
+  class BaseClass##_##Method##_Benchmark : public BaseClass<a, b> {    \
+   public:                                                             \
+    BaseClass##_##Method##_Benchmark() {                               \
+      this->SetName(#BaseClass "<" #a "," #b ">/" #Method);            \
+    }                                                                  \
+                                                                       \
+   protected:                                                          \
+    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE;        \
   };
 
 #ifdef BENCHMARK_HAS_CXX11
@@ -1377,7 +1627,7 @@ class Fixture : public internal::Benchmark {
     }                                                                      \
                                                                            \
    protected:                                                              \
-    virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE;    \
+    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE;            \
   };
 #else
 #define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(n, a) \
@@ -1439,8 +1689,15 @@ class Fixture : public internal::Benchmark {
 #endif
 
 // Helper macro to create a main routine in a test that runs the benchmarks
+// Note the workaround for Hexagon simulator passing argc != 0, argv = NULL.
 #define BENCHMARK_MAIN()                                                \
   int main(int argc, char** argv) {                                     \
+    char arg0_default[] = "benchmark";                                  \
+    char* args_default = arg0_default;                                  \
+    if (!argv) {                                                        \
+      argc = 1;                                                         \
+      argv = &args_default;                                             \
+    }                                                                   \
     ::benchmark::Initialize(&argc, argv);                               \
     if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \
     ::benchmark::RunSpecifiedBenchmarks();                              \
@@ -1454,7 +1711,7 @@ class Fixture : public internal::Benchmark {
 
 namespace benchmark {
 
-struct CPUInfo {
+struct BENCHMARK_EXPORT CPUInfo {
   struct CacheInfo {
     std::string type;
     int level;
@@ -1478,7 +1735,7 @@ struct CPUInfo {
 };
 
 // Adding Struct for System Information
-struct SystemInfo {
+struct BENCHMARK_EXPORT SystemInfo {
   std::string name;
   static const SystemInfo& Get();
 
@@ -1490,10 +1747,11 @@ struct SystemInfo {
 // BenchmarkName contains the components of the Benchmark's name
 // which allows individual fields to be modified or cleared before
 // building the final name using 'str()'.
-struct BenchmarkName {
+struct BENCHMARK_EXPORT BenchmarkName {
   std::string function_name;
   std::string args;
   std::string min_time;
+  std::string min_warmup_time;
   std::string iterations;
   std::string repetitions;
   std::string time_type;
@@ -1509,7 +1767,7 @@ struct BenchmarkName {
 // can control the destination of the reports by calling
 // RunSpecifiedBenchmarks and passing it a custom reporter object.
 // The reporter object must implement the following interface.
-class BenchmarkReporter {
+class BENCHMARK_EXPORT BenchmarkReporter {
  public:
   struct Context {
     CPUInfo const& cpu_info;
@@ -1520,20 +1778,21 @@ class BenchmarkReporter {
     Context();
   };
 
-  struct Run {
+  struct BENCHMARK_EXPORT Run {
     static const int64_t no_repetition_index = -1;
     enum RunType { RT_Iteration, RT_Aggregate };
 
     Run()
         : run_type(RT_Iteration),
           aggregate_unit(kTime),
-          error_occurred(false),
+          skipped(internal::NotSkipped),
           iterations(1),
           threads(1),
-          time_unit(kNanosecond),
+          time_unit(GetDefaultTimeUnit()),
           real_accumulated_time(0),
           cpu_accumulated_time(0),
           max_heapbytes_used(0),
+          use_real_time_for_initial_big_o(false),
           complexity(oNone),
           complexity_lambda(),
           complexity_n(0),
@@ -1550,8 +1809,8 @@ class BenchmarkReporter {
     std::string aggregate_name;
     StatisticUnit aggregate_unit;
     std::string report_label;  // Empty if not set by benchmark.
-    bool error_occurred;
-    std::string error_message;
+    internal::Skipped skipped;
+    std::string skip_message;
 
     IterationCount iterations;
     int64_t threads;
@@ -1576,10 +1835,14 @@ class BenchmarkReporter {
     // This is set to 0.0 if memory tracing is not enabled.
     double max_heapbytes_used;
 
+    // By default Big-O is computed for CPU time, but that is not what you want
+    // to happen when manual time was requested, which is stored as real time.
+    bool use_real_time_for_initial_big_o;
+
     // Keep track of arguments to compute asymptotic complexity
     BigO complexity;
     BigOFunc* complexity_lambda;
-    int64_t complexity_n;
+    ComplexityN complexity_n;
 
     // what statistics to compute from the measurements
     const std::vector<internal::Statistics>* statistics;
@@ -1620,6 +1883,12 @@ class BenchmarkReporter {
   // to skip runs based on the context information.
   virtual bool ReportContext(const Context& context) = 0;
 
+  // Called once for each group of benchmark runs, gives information about
+  // the configurations of the runs.
+  virtual void ReportRunsConfig(double /*min_time*/,
+                                bool /*has_explicit_iters*/,
+                                IterationCount /*iters*/) {}
+
   // Called once for each group of benchmark runs, gives information about
   // cpu-time and heap memory usage during the benchmark run. If the group
   // of runs contained more than two entries then 'report' contains additional
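
Illustrative only, not part of the patch: a sketch of a reporter that hooks the new ReportRunsConfig() callback. VerboseReporter is a hypothetical name; deriving from ConsoleReporter keeps the normal console output.

    #include "benchmark/benchmark.h"

    class VerboseReporter : public benchmark::ConsoleReporter {
     public:
      void ReportRunsConfig(double min_time, bool has_explicit_iters,
                            benchmark::IterationCount iters) override {
        GetErrorStream() << "run config: min_time=" << min_time
                         << " explicit_iters=" << has_explicit_iters
                         << " iters=" << iters << "\n";
      }
    };

    // Usage: VerboseReporter rep; benchmark::RunSpecifiedBenchmarks(&rep);
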
@@ -1665,7 +1934,7 @@ class BenchmarkReporter {
 
 // Simple reporter that outputs benchmark data to the console. This is the
 // default reporter used by RunSpecifiedBenchmarks().
-class ConsoleReporter : public BenchmarkReporter {
+class BENCHMARK_EXPORT ConsoleReporter : public BenchmarkReporter {
  public:
   enum OutputOptions {
     OO_None = 0,
@@ -1677,8 +1946,8 @@ class ConsoleReporter : public BenchmarkReporter {
   explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults)
       : output_options_(opts_), name_field_width_(0), printed_header_(false) {}
 
-  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
-  virtual void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
+  bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
+  void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
 
  protected:
   virtual void PrintRunData(const Run& report);
@@ -1690,12 +1959,12 @@ class ConsoleReporter : public BenchmarkReporter {
   bool printed_header_;
 };
 
-class JSONReporter : public BenchmarkReporter {
+class BENCHMARK_EXPORT JSONReporter : public BenchmarkReporter {
  public:
   JSONReporter() : first_report_(true) {}
-  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
-  virtual void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
-  virtual void Finalize() BENCHMARK_OVERRIDE;
+  bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
+  void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
+  void Finalize() BENCHMARK_OVERRIDE;
 
  private:
   void PrintRunData(const Run& report);
@@ -1703,13 +1972,13 @@ class JSONReporter : public BenchmarkReporter {
   bool first_report_;
 };
 
-class BENCHMARK_DEPRECATED_MSG(
+class BENCHMARK_EXPORT BENCHMARK_DEPRECATED_MSG(
     "The CSV Reporter will be removed in a future release") CSVReporter
     : public BenchmarkReporter {
  public:
   CSVReporter() : printed_header_(false) {}
-  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
-  virtual void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
+  bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
+  void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
 
  private:
   void PrintRunData(const Run& report);
@@ -1748,18 +2017,24 @@ inline double GetTimeUnitMultiplier(TimeUnit unit) {
 
 // Creates a list of integer values for the given range and multiplier.
 // This can be used together with ArgsProduct() to allow multiple ranges
-// with different multiplers.
+// with different multipliers.
 // Example:
 // ArgsProduct({
 //   CreateRange(0, 1024, /*multi=*/32),
 //   CreateRange(0, 100, /*multi=*/4),
 //   CreateDenseRange(0, 4, /*step=*/1),
 // });
+BENCHMARK_EXPORT
 std::vector<int64_t> CreateRange(int64_t lo, int64_t hi, int multi);
 
 // Creates a list of integer values for the given range and step.
+BENCHMARK_EXPORT
 std::vector<int64_t> CreateDenseRange(int64_t start, int64_t limit, int step);
 
 }  // namespace benchmark
 
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
 #endif  // BENCHMARK_BENCHMARK_H_
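
Illustrative only, not part of the patch: the helpers declared above combined with ArgsProduct(). BM_grid is a hypothetical benchmark that receives one value from each range per run.

    #include "benchmark/benchmark.h"

    static void BM_grid(benchmark::State& state) {
      for (auto _ : state) {
        benchmark::DoNotOptimize(state.range(0) + state.range(1) + state.range(2));
      }
    }
    BENCHMARK(BM_grid)->ArgsProduct({
        benchmark::CreateRange(8, 1024, /*multi=*/8),   // 8, 64, 512, 1024
        benchmark::CreateRange(1, 64, /*multi=*/4),     // 1, 4, 16, 64
        benchmark::CreateDenseRange(0, 4, /*step=*/1),  // 0, 1, 2, 3, 4
    });
    BENCHMARK_MAIN();
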
diff --git a/third-party/benchmark/requirements.txt b/third-party/benchmark/requirements.txt
deleted file mode 100644
index e451894e2356c6..00000000000000
--- a/third-party/benchmark/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-numpy == 1.19.4
-scipy == 1.5.4
-pandas == 1.1.5
diff --git a/third-party/benchmark/setup.py b/third-party/benchmark/setup.py
index 83069e56683977..cb20042da51230 100644
--- a/third-party/benchmark/setup.py
+++ b/third-party/benchmark/setup.py
@@ -1,56 +1,50 @@
+import contextlib
 import os
-import posixpath
 import platform
-import re
 import shutil
-import sys
+import sysconfig
+from pathlib import Path
+from typing import Generator
 
-from distutils import sysconfig
 import setuptools
 from setuptools.command import build_ext
 
+PYTHON_INCLUDE_PATH_PLACEHOLDER = "<PYTHON_INCLUDE_PATH>"
 
-HERE = os.path.dirname(os.path.abspath(__file__))
+IS_WINDOWS = platform.system() == "Windows"
+IS_MAC = platform.system() == "Darwin"
 
 
-IS_WINDOWS = sys.platform.startswith("win")
-
-
-def _get_version():
-    """Parse the version string from __init__.py."""
-    with open(
-        os.path.join(HERE, "bindings", "python", "google_benchmark", "__init__.py")
-    ) as init_file:
+ at contextlib.contextmanager
+def temp_fill_include_path(fp: str) -> Generator[None, None, None]:
+    """Temporarily set the Python include path in a file."""
+    with open(fp, "r+") as f:
         try:
-            version_line = next(
-                line for line in init_file if line.startswith("__version__")
+            content = f.read()
+            replaced = content.replace(
+                PYTHON_INCLUDE_PATH_PLACEHOLDER,
+                Path(sysconfig.get_paths()["include"]).as_posix(),
             )
-        except StopIteration:
-            raise ValueError("__version__ not defined in __init__.py")
-        else:
-            namespace = {}
-            exec(version_line, namespace)  # pylint: disable=exec-used
-            return namespace["__version__"]
-
-
-def _parse_requirements(path):
-    with open(os.path.join(HERE, path)) as requirements:
-        return [
-            line.rstrip()
-            for line in requirements
-            if not (line.isspace() or line.startswith("#"))
-        ]
+            f.seek(0)
+            f.write(replaced)
+            f.truncate()
+            yield
+        finally:
+            # revert to the original content after exit
+            f.seek(0)
+            f.write(content)
+            f.truncate()
 
 
 class BazelExtension(setuptools.Extension):
     """A C/C++ extension that is defined as a Bazel BUILD target."""
 
-    def __init__(self, name, bazel_target):
+    def __init__(self, name: str, bazel_target: str):
+        super().__init__(name=name, sources=[])
+
         self.bazel_target = bazel_target
-        self.relpath, self.target_name = posixpath.relpath(bazel_target, "//").split(
-            ":"
-        )
-        setuptools.Extension.__init__(self, name, sources=[])
+        stripped_target = bazel_target.split("//")[-1]
+        self.relpath, self.target_name = stripped_target.split(":")
 
 
 class BuildBazelExtension(build_ext.build_ext):
@@ -59,88 +53,71 @@ class BuildBazelExtension(build_ext.build_ext):
     def run(self):
         for ext in self.extensions:
             self.bazel_build(ext)
-        build_ext.build_ext.run(self)
-
-    def bazel_build(self, ext):
+        super().run()
+        # explicitly call `bazel shutdown` for graceful exit
+        self.spawn(["bazel", "shutdown"])
+
+    def copy_extensions_to_source(self):
+        """
+        Copy generated extensions into the source tree.
+        This is done in the ``bazel_build`` method, so it's not necessary to
+        do it again in the ``build_ext`` base class.
+        """
+        pass
+
+    def bazel_build(self, ext: BazelExtension) -> None:
         """Runs the bazel build to create the package."""
-        with open("WORKSPACE", "r") as workspace:
-            workspace_contents = workspace.read()
-
-        with open("WORKSPACE", "w") as workspace:
-            workspace.write(
-                re.sub(
-                    r'(?<=path = ").*(?=",  # May be overwritten by setup\.py\.)',
-                    sysconfig.get_python_inc().replace(os.path.sep, posixpath.sep),
-                    workspace_contents,
-                )
+        with temp_fill_include_path("WORKSPACE"):
+            temp_path = Path(self.build_temp)
+
+            bazel_argv = [
+                "bazel",
+                "build",
+                ext.bazel_target,
+                "--enable_bzlmod=false",
+                f"--symlink_prefix={temp_path / 'bazel-'}",
+                f"--compilation_mode={'dbg' if self.debug else 'opt'}",
+                # C++17 is required by nanobind
+                f"--cxxopt={'/std:c++17' if IS_WINDOWS else '-std=c++17'}",
+            ]
+
+            if IS_WINDOWS:
+                # Link with python*.lib.
+                for library_dir in self.library_dirs:
+                    bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
+            elif IS_MAC:
+                if platform.machine() == "x86_64":
+                    # C++17 needs macOS 10.14 at minimum
+                    bazel_argv.append("--macos_minimum_os=10.14")
+
+                    # cross-compilation for Mac ARM64 on GitHub Mac x86 runners.
+                    # ARCHFLAGS is set by cibuildwheel before macOS wheel builds.
+                    archflags = os.getenv("ARCHFLAGS", "")
+                    if "arm64" in archflags:
+                        bazel_argv.append("--cpu=darwin_arm64")
+                        bazel_argv.append("--macos_cpus=arm64")
+
+                elif platform.machine() == "arm64":
+                    bazel_argv.append("--macos_minimum_os=11.0")
+
+            self.spawn(bazel_argv)
+
+            shared_lib_suffix = ".dll" if IS_WINDOWS else ".so"
+            ext_name = ext.target_name + shared_lib_suffix
+            ext_bazel_bin_path = (
+                temp_path / "bazel-bin" / ext.relpath / ext_name
             )
 
-        if not os.path.exists(self.build_temp):
-            os.makedirs(self.build_temp)
-
-        bazel_argv = [
-            "bazel",
-            "build",
-            ext.bazel_target,
-            "--symlink_prefix=" + os.path.join(self.build_temp, "bazel-"),
-            "--compilation_mode=" + ("dbg" if self.debug else "opt"),
-        ]
-
-        if IS_WINDOWS:
-            # Link with python*.lib.
-            for library_dir in self.library_dirs:
-                bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
-        elif sys.platform == "darwin" and platform.machine() == "x86_64":
-            bazel_argv.append("--macos_minimum_os=10.9")
-
-        self.spawn(bazel_argv)
-
-        shared_lib_suffix = ".dll" if IS_WINDOWS else ".so"
-        ext_bazel_bin_path = os.path.join(
-            self.build_temp,
-            "bazel-bin",
-            ext.relpath,
-            ext.target_name + shared_lib_suffix,
-        )
-
-        ext_dest_path = self.get_ext_fullpath(ext.name)
-        ext_dest_dir = os.path.dirname(ext_dest_path)
-        if not os.path.exists(ext_dest_dir):
-            os.makedirs(ext_dest_dir)
-        shutil.copyfile(ext_bazel_bin_path, ext_dest_path)
+            ext_dest_path = Path(self.get_ext_fullpath(ext.name))
+            shutil.copyfile(ext_bazel_bin_path, ext_dest_path)
 
 
 setuptools.setup(
-    name="google_benchmark",
-    version=_get_version(),
-    url="https://github.com/google/benchmark",
-    description="A library to benchmark code snippets.",
-    author="Google",
-    author_email="benchmark-py at google.com",
-    # Contained modules and scripts.
-    package_dir={"": "bindings/python"},
-    packages=setuptools.find_packages("bindings/python"),
-    install_requires=_parse_requirements("bindings/python/requirements.txt"),
     cmdclass=dict(build_ext=BuildBazelExtension),
     ext_modules=[
         BazelExtension(
-            "google_benchmark._benchmark",
-            "//bindings/python/google_benchmark:_benchmark",
+            name="google_benchmark._benchmark",
+            bazel_target="//bindings/python/google_benchmark:_benchmark",
         )
     ],
-    zip_safe=False,
-    # PyPI package information.
-    classifiers=[
-        "Development Status :: 4 - Beta",
-        "Intended Audience :: Developers",
-        "Intended Audience :: Science/Research",
-        "License :: OSI Approved :: Apache Software License",
-        "Programming Language :: Python :: 3.6",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
-        "Topic :: Software Development :: Testing",
-        "Topic :: System :: Benchmark",
-    ],
-    license="Apache 2.0",
-    keywords="benchmark",
 )
diff --git a/third-party/benchmark/src/CMakeLists.txt b/third-party/benchmark/src/CMakeLists.txt
index e814a4e00f7c2a..943594b70bcd0c 100644
--- a/third-party/benchmark/src/CMakeLists.txt
+++ b/third-party/benchmark/src/CMakeLists.txt
@@ -25,12 +25,25 @@ set_target_properties(benchmark PROPERTIES
   SOVERSION ${GENERIC_LIB_SOVERSION}
 )
 target_include_directories(benchmark PUBLIC
-  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>)
+  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
+)
+
+set_property(
+  SOURCE benchmark.cc
+  APPEND
+  PROPERTY COMPILE_DEFINITIONS
+  BENCHMARK_VERSION="${VERSION}"
+)
 
 # libpfm, if available
-if (HAVE_LIBPFM)
-  target_link_libraries(benchmark PRIVATE pfm)
-  add_definitions(-DHAVE_LIBPFM)
+if (PFM_FOUND)
+  target_link_libraries(benchmark PRIVATE PFM::libpfm)
+  target_compile_definitions(benchmark PRIVATE -DHAVE_LIBPFM)
+endif()
+
+# pthread affinity, if available
+if(HAVE_PTHREAD_AFFINITY)
+  target_compile_definitions(benchmark PRIVATE -DBENCHMARK_HAS_PTHREAD_AFFINITY)
 endif()
 
 # Link threads.
@@ -53,6 +66,10 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
   target_link_libraries(benchmark PRIVATE kstat)
 endif()
 
+if (NOT BUILD_SHARED_LIBS)
+  target_compile_definitions(benchmark PUBLIC -DBENCHMARK_STATIC_DEFINE)
+endif()
+
 # Benchmark main library
 add_library(benchmark_main "benchmark_main.cc")
 add_library(benchmark::benchmark_main ALIAS benchmark_main)
@@ -60,10 +77,10 @@ set_target_properties(benchmark_main PROPERTIES
   OUTPUT_NAME "benchmark_main"
   VERSION ${GENERIC_LIB_VERSION}
   SOVERSION ${GENERIC_LIB_SOVERSION}
+  DEFINE_SYMBOL benchmark_EXPORTS
 )
 target_link_libraries(benchmark_main PUBLIC benchmark::benchmark)
 
-
 set(generated_dir "${PROJECT_BINARY_DIR}")
 
 set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
@@ -107,6 +124,7 @@ if (BENCHMARK_ENABLE_INSTALL)
 
   install(
     DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
+              "${PROJECT_BINARY_DIR}/include/benchmark"
     DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
     FILES_MATCHING PATTERN "*.*h")
 
diff --git a/third-party/benchmark/src/benchmark.cc b/third-party/benchmark/src/benchmark.cc
index 473151136202d3..31f2cde8ff1061 100644
--- a/third-party/benchmark/src/benchmark.cc
+++ b/third-party/benchmark/src/benchmark.cc
@@ -19,7 +19,7 @@
 #include "internal_macros.h"
 
 #ifndef BENCHMARK_OS_WINDOWS
-#ifndef BENCHMARK_OS_FUCHSIA
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
 #include <sys/resource.h>
 #endif
 #include <sys/time.h>
@@ -65,12 +65,28 @@ BM_DEFINE_bool(benchmark_list_tests, false);
 // linked into the binary are run.
 BM_DEFINE_string(benchmark_filter, "");
 
-// Minimum number of seconds we should run benchmark before results are
-// considered significant.  For cpu-time based tests, this is the lower bound
+// Specification of how long to run the benchmark.
+//
+// It can be either an exact number of iterations (specified as `<integer>x`),
+// or a minimum number of seconds (specified as `<float>s`). If the latter
+// format (i.e., min seconds) is used, the system may run the benchmark longer
+// until the results are considered significant.
+//
+// For backward compatibility, the `s` suffix may be omitted, in which case,
+// the specified number is interpreted as the number of seconds.
+//
+// For cpu-time based tests, this is the lower bound
 // on the total cpu time used by all threads that make up the test.  For
 // real-time based tests, this is the lower bound on the elapsed time of the
 // benchmark execution, regardless of number of threads.
-BM_DEFINE_double(benchmark_min_time, 0.5);
+BM_DEFINE_string(benchmark_min_time, kDefaultMinTimeStr);
+
+// Minimum number of seconds a benchmark should be run before results are
+// taken into account. This can be necessary, e.g., for benchmarks of code
+// that needs to fill some form of cache before its performance is of interest.
+// Note: results gathered within this period are discarded and not used for
+// the reported result.
+BM_DEFINE_double(benchmark_min_warmup_time, 0.0);
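
Illustrative only, not part of the patch: the per-benchmark counterparts of these two flags. BM_parse is hypothetical.

    static void BM_parse(benchmark::State& state) {
      for (auto _ : state) {
        // ... code whose caches need priming before measurement ...
      }
    }
    // At least 1 second of discarded warmup, then at least 2 seconds of
    // measured running time.
    BENCHMARK(BM_parse)->MinWarmUpTime(1.0)->MinTime(2.0);
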
 
 // The number of runs of each benchmark. If greater than 1, the mean and
 // standard deviation of the runs will be reported.
@@ -121,6 +137,10 @@ BM_DEFINE_string(benchmark_perf_counters, "");
 // pairs. Kept internal as it's only used for parsing from env/command line.
 BM_DEFINE_kvpairs(benchmark_context, {});
 
+// Set the default time unit to use for reports
+// Valid values are 'ns', 'us', 'ms' or 's'
+BM_DEFINE_string(benchmark_time_unit, "");
+
 // The level of verbose logging to output
 BM_DEFINE_int32(v, 0);
 
@@ -128,23 +148,28 @@ namespace internal {
 
 std::map<std::string, std::string>* global_context = nullptr;
 
+BENCHMARK_EXPORT std::map<std::string, std::string>*& GetGlobalContext() {
+  return global_context;
+}
+
 // FIXME: wouldn't LTO mess this up?
 void UseCharPointer(char const volatile*) {}
 
 }  // namespace internal
 
-State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
-             int thread_i, int n_threads, internal::ThreadTimer* timer,
-             internal::ThreadManager* manager,
+State::State(std::string name, IterationCount max_iters,
+             const std::vector<int64_t>& ranges, int thread_i, int n_threads,
+             internal::ThreadTimer* timer, internal::ThreadManager* manager,
              internal::PerfCountersMeasurement* perf_counters_measurement)
     : total_iterations_(0),
       batch_leftover_(0),
       max_iterations(max_iters),
       started_(false),
       finished_(false),
-      error_occurred_(false),
+      skipped_(internal::NotSkipped),
       range_(ranges),
       complexity_n_(0),
+      name_(std::move(name)),
       thread_index_(thread_i),
       threads_(n_threads),
       timer_(timer),
@@ -154,6 +179,17 @@ State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
   BM_CHECK_LT(thread_index_, threads_)
       << "thread_index must be less than threads";
 
+  // Add counters with correct flag now.  If added with `counters[name]` in
+  // `PauseTiming`, a new `Counter` will be inserted the first time, which
+  // won't have the flag.  Inserting them now also reduces the allocations
+  // during the benchmark.
+  if (perf_counters_measurement_) {
+    for (const std::string& counter_name :
+         perf_counters_measurement_->names()) {
+      counters[counter_name] = Counter(0.0, Counter::kAvgIterations);
+    }
+  }
+
   // Note: The use of offsetof below is technically undefined until C++17
   // because State is not a standard layout type. However, all compilers
   // currently provide well-defined behavior as an extension (which is
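
Illustrative only, not part of the patch: user-defined counters go through the same Counter machinery the constructor above now pre-populates for perf counters. BM_copy and DoOneCopy() are hypothetical.

    #include "benchmark/benchmark.h"

    double DoOneCopy();  // hypothetical helper: returns bytes copied

    static void BM_copy(benchmark::State& state) {
      double bytes = 0;
      for (auto _ : state) {
        bytes += DoOneCopy();
      }
      // Reported as an average over iterations, like the perf counters above.
      state.counters["bytes"] =
          benchmark::Counter(bytes, benchmark::Counter::kAvgIterations);
    }
    BENCHMARK(BM_copy);
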
@@ -166,55 +202,79 @@ State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
 #elif defined(__GNUC__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Winvalid-offsetof"
-#elif defined(__clang__)
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Winvalid-offsetof"
+#endif
+#if defined(__NVCC__)
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 1427
+#endif
+#if defined(__NVCOMPILER)
+#pragma diagnostic push
+#pragma diag_suppress offset_in_non_POD_nonstandard
 #endif
   // Offset tests to ensure commonly accessed data is on the first cache line.
   const int cache_line_size = 64;
-  static_assert(offsetof(State, error_occurred_) <=
-                    (cache_line_size - sizeof(error_occurred_)),
-                "");
+  static_assert(
+      offsetof(State, skipped_) <= (cache_line_size - sizeof(skipped_)), "");
 #if defined(__INTEL_COMPILER)
 #pragma warning pop
 #elif defined(__GNUC__)
 #pragma GCC diagnostic pop
-#elif defined(__clang__)
-#pragma clang diagnostic pop
+#endif
+#if defined(__NVCC__)
+#pragma nv_diagnostic pop
+#endif
+#if defined(__NVCOMPILER)
+#pragma diagnostic pop
 #endif
 }
 
 void State::PauseTiming() {
   // Add in time accumulated so far
-  BM_CHECK(started_ && !finished_ && !error_occurred_);
+  BM_CHECK(started_ && !finished_ && !skipped());
   timer_->StopTimer();
   if (perf_counters_measurement_) {
-    auto measurements = perf_counters_measurement_->StopAndGetMeasurements();
+    std::vector<std::pair<std::string, double>> measurements;
+    if (!perf_counters_measurement_->Stop(measurements)) {
+      BM_CHECK(false) << "Perf counters read the value failed.";
+    }
     for (const auto& name_and_measurement : measurements) {
-      auto name = name_and_measurement.first;
-      auto measurement = name_and_measurement.second;
-      BM_CHECK_EQ(counters[name], 0.0);
-      counters[name] = Counter(measurement, Counter::kAvgIterations);
+      const std::string& name = name_and_measurement.first;
+      const double measurement = name_and_measurement.second;
+      // Counter was inserted with `kAvgIterations` flag by the constructor.
+      assert(counters.find(name) != counters.end());
+      counters[name].value += measurement;
     }
   }
 }
 
 void State::ResumeTiming() {
-  BM_CHECK(started_ && !finished_ && !error_occurred_);
+  BM_CHECK(started_ && !finished_ && !skipped());
   timer_->StartTimer();
   if (perf_counters_measurement_) {
     perf_counters_measurement_->Start();
   }
 }
 
-void State::SkipWithError(const char* msg) {
-  BM_CHECK(msg);
-  error_occurred_ = true;
+void State::SkipWithMessage(const std::string& msg) {
+  skipped_ = internal::SkippedWithMessage;
+  {
+    MutexLock l(manager_->GetBenchmarkMutex());
+    if (internal::NotSkipped == manager_->results.skipped_) {
+      manager_->results.skip_message_ = msg;
+      manager_->results.skipped_ = skipped_;
+    }
+  }
+  total_iterations_ = 0;
+  if (timer_->running()) timer_->StopTimer();
+}
+
+void State::SkipWithError(const std::string& msg) {
+  skipped_ = internal::SkippedWithError;
   {
     MutexLock l(manager_->GetBenchmarkMutex());
-    if (manager_->results.has_error_ == false) {
-      manager_->results.error_message_ = msg;
-      manager_->results.has_error_ = true;
+    if (internal::NotSkipped == manager_->results.skipped_) {
+      manager_->results.skip_message_ = msg;
+      manager_->results.skipped_ = skipped_;
     }
   }
   total_iterations_ = 0;
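
Illustrative only, not part of the patch: how a benchmark body might use the two skip entry points above. HasGpu() and LaunchKernel() are hypothetical helpers.

    #include "benchmark/benchmark.h"

    bool HasGpu();       // hypothetical capability check
    bool LaunchKernel(); // hypothetical launch that may fail

    static void BM_gpu_kernel(benchmark::State& state) {
      if (!HasGpu()) {
        // A benign skip: reported, but not treated as an error.
        state.SkipWithMessage("no GPU available on this machine");
        return;
      }
      for (auto _ : state) {
        if (!LaunchKernel()) {
          state.SkipWithError("kernel launch failed");
          break;
        }
      }
    }
    BENCHMARK(BM_gpu_kernel);
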
@@ -225,7 +285,7 @@ void State::SetIterationTime(double seconds) {
   timer_->SetIterationTime(seconds);
 }
 
-void State::SetLabel(const char* label) {
+void State::SetLabel(const std::string& label) {
   MutexLock l(manager_->GetBenchmarkMutex());
   manager_->results.report_label_ = label;
 }
@@ -233,14 +293,14 @@ void State::SetLabel(const char* label) {
 void State::StartKeepRunning() {
   BM_CHECK(!started_ && !finished_);
   started_ = true;
-  total_iterations_ = error_occurred_ ? 0 : max_iterations;
+  total_iterations_ = skipped() ? 0 : max_iterations;
   manager_->StartStopBarrier();
-  if (!error_occurred_) ResumeTiming();
+  if (!skipped()) ResumeTiming();
 }
 
 void State::FinishKeepRunning() {
-  BM_CHECK(started_ && (!finished_ || error_occurred_));
-  if (!error_occurred_) {
+  BM_CHECK(started_ && (!finished_ || skipped()));
+  if (!skipped()) {
     PauseTiming();
   }
   // Total iterations has now wrapped around past 0. Fix this.
@@ -318,14 +378,26 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
 
     size_t num_repetitions_total = 0;
 
+    // This perfcounters object needs to be created before the runners vector
+    // below so it outlasts their lifetime.
+    PerfCountersMeasurement perfcounters(
+        StrSplit(FLAGS_benchmark_perf_counters, ','));
+
+    // Vector of runners, one per benchmark to run
     std::vector<internal::BenchmarkRunner> runners;
     runners.reserve(benchmarks.size());
+
+    // Count the number of benchmarks with threads to warn the user in case
+    // performance counters are used.
+    int benchmarks_with_threads = 0;
+
+    // Loop through all benchmarks
     for (const BenchmarkInstance& benchmark : benchmarks) {
       BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
       if (benchmark.complexity() != oNone)
         reports_for_family = &per_family_reports[benchmark.family_index()];
-
-      runners.emplace_back(benchmark, reports_for_family);
+      benchmarks_with_threads += (benchmark.threads() > 1);
+      runners.emplace_back(benchmark, &perfcounters, reports_for_family);
       int num_repeats_of_this_instance = runners.back().GetNumRepeats();
       num_repetitions_total += num_repeats_of_this_instance;
       if (reports_for_family)
@@ -333,6 +405,17 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
     }
     assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
 
+    // The use of performance counters with threads would be unintuitive for
+    // the average user, so we warn them about this case.
+    if ((benchmarks_with_threads > 0) && (perfcounters.num_counters() > 0)) {
+      GetErrorLogInstance()
+          << "***WARNING*** There are " << benchmarks_with_threads
+          << " benchmarks with threads and " << perfcounters.num_counters()
+          << " performance counters were requested. Beware counters will "
+             "reflect the combined usage across all "
+             "threads.\n";
+    }
+
     std::vector<size_t> repetition_indices;
     repetition_indices.reserve(num_repetitions_total);
     for (size_t runner_index = 0, num_runners = runners.size();
@@ -356,6 +439,12 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
       if (runner.HasRepeatsRemaining()) continue;
       // FIXME: report each repetition separately, not all of them in bulk.
 
+      display_reporter->ReportRunsConfig(
+          runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters());
+      if (file_reporter)
+        file_reporter->ReportRunsConfig(
+            runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters());
+
       RunResults run_results = runner.GetResults();
 
       // Maybe calculate complexity report
@@ -389,14 +478,15 @@ std::unique_ptr<BenchmarkReporter> CreateReporter(
   typedef std::unique_ptr<BenchmarkReporter> PtrType;
   if (name == "console") {
     return PtrType(new ConsoleReporter(output_opts));
-  } else if (name == "json") {
-    return PtrType(new JSONReporter);
-  } else if (name == "csv") {
-    return PtrType(new CSVReporter);
-  } else {
-    std::cerr << "Unexpected format: '" << name << "'\n";
-    std::exit(1);
   }
+  if (name == "json") {
+    return PtrType(new JSONReporter());
+  }
+  if (name == "csv") {
+    return PtrType(new CSVReporter());
+  }
+  std::cerr << "Unexpected format: '" << name << "'\n";
+  std::exit(1);
 }
 
 BENCHMARK_RESTORE_DEPRECATED_WARNING
@@ -433,6 +523,14 @@ ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) {
 
 }  // end namespace internal
 
+BenchmarkReporter* CreateDefaultDisplayReporter() {
+  static auto default_display_reporter =
+      internal::CreateReporter(FLAGS_benchmark_format,
+                               internal::GetOutputOptions())
+          .release();
+  return default_display_reporter;
+}
+
 size_t RunSpecifiedBenchmarks() {
   return RunSpecifiedBenchmarks(nullptr, nullptr, FLAGS_benchmark_filter);
 }
@@ -468,8 +566,7 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
   std::unique_ptr<BenchmarkReporter> default_display_reporter;
   std::unique_ptr<BenchmarkReporter> default_file_reporter;
   if (!display_reporter) {
-    default_display_reporter = internal::CreateReporter(
-        FLAGS_benchmark_format, internal::GetOutputOptions());
+    default_display_reporter.reset(CreateDefaultDisplayReporter());
     display_reporter = default_display_reporter.get();
   }
   auto& Out = display_reporter->GetOutputStream();
@@ -480,17 +577,23 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
     Err << "A custom file reporter was provided but "
            "--benchmark_out=<file> was not specified."
         << std::endl;
+    Out.flush();
+    Err.flush();
     std::exit(1);
   }
   if (!fname.empty()) {
     output_file.open(fname);
     if (!output_file.is_open()) {
       Err << "invalid file name: '" << fname << "'" << std::endl;
+      Out.flush();
+      Err.flush();
       std::exit(1);
     }
     if (!file_reporter) {
       default_file_reporter = internal::CreateReporter(
-          FLAGS_benchmark_out_format, ConsoleReporter::OO_None);
+          FLAGS_benchmark_out_format, FLAGS_benchmark_counters_tabular
+                                          ? ConsoleReporter::OO_Tabular
+                                          : ConsoleReporter::OO_None);
       file_reporter = default_file_reporter.get();
     }
     file_reporter->SetOutputStream(&output_file);
@@ -498,10 +601,16 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
   }
 
   std::vector<internal::BenchmarkInstance> benchmarks;
-  if (!FindBenchmarksInternal(spec, &benchmarks, &Err)) return 0;
+  if (!FindBenchmarksInternal(spec, &benchmarks, &Err)) {
+    Out.flush();
+    Err.flush();
+    return 0;
+  }
 
   if (benchmarks.empty()) {
     Err << "Failed to match any benchmarks against regex: " << spec << "\n";
+    Out.flush();
+    Err.flush();
     return 0;
   }
 
@@ -512,11 +621,28 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
     internal::RunBenchmarks(benchmarks, display_reporter, file_reporter);
   }
 
+  Out.flush();
+  Err.flush();
   return benchmarks.size();
 }
 
+namespace {
+// stores the time unit benchmarks use by default
+TimeUnit default_time_unit = kNanosecond;
+}  // namespace
+
+TimeUnit GetDefaultTimeUnit() { return default_time_unit; }
+
+void SetDefaultTimeUnit(TimeUnit unit) { default_time_unit = unit; }
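
Illustrative only, not part of the patch: choosing a process-wide default unit before the runs. Per-benchmark Unit() calls still override it, and an explicit --benchmark_time_unit passed to Initialize() replaces it.

    #include "benchmark/benchmark.h"

    int main(int argc, char** argv) {
      // Report in milliseconds unless a benchmark sets ->Unit(...) itself or
      // the user passes --benchmark_time_unit on the command line.
      benchmark::SetDefaultTimeUnit(benchmark::kMillisecond);
      benchmark::Initialize(&argc, argv);
      benchmark::RunSpecifiedBenchmarks();
      benchmark::Shutdown();
      return 0;
    }
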
+
 std::string GetBenchmarkFilter() { return FLAGS_benchmark_filter; }
 
+void SetBenchmarkFilter(std::string value) {
+  FLAGS_benchmark_filter = std::move(value);
+}
+
+int32_t GetBenchmarkVerbosity() { return FLAGS_v; }
+
 void RegisterMemoryManager(MemoryManager* manager) {
   internal::memory_manager = manager;
 }
@@ -533,27 +659,31 @@ void AddCustomContext(const std::string& key, const std::string& value) {
 
 namespace internal {
 
+void (*HelperPrintf)();
+
 void PrintUsageAndExit() {
-  fprintf(stdout,
-          "benchmark"
-          " [--benchmark_list_tests={true|false}]\n"
-          "          [--benchmark_filter=<regex>]\n"
-          "          [--benchmark_min_time=<min_time>]\n"
-          "          [--benchmark_repetitions=<num_repetitions>]\n"
-          "          [--benchmark_enable_random_interleaving={true|false}]\n"
-          "          [--benchmark_report_aggregates_only={true|false}]\n"
-          "          [--benchmark_display_aggregates_only={true|false}]\n"
-          "          [--benchmark_format=<console|json|csv>]\n"
-          "          [--benchmark_out=<filename>]\n"
-          "          [--benchmark_out_format=<json|console|csv>]\n"
-          "          [--benchmark_color={auto|true|false}]\n"
-          "          [--benchmark_counters_tabular={true|false}]\n"
-          "          [--benchmark_perf_counters=<counter>,...]\n"
-          "          [--benchmark_context=<key>=<value>,...]\n"
-          "          [--v=<verbosity>]\n");
+  HelperPrintf();
   exit(0);
 }
 
+void SetDefaultTimeUnitFromFlag(const std::string& time_unit_flag) {
+  if (time_unit_flag == "s") {
+    return SetDefaultTimeUnit(kSecond);
+  }
+  if (time_unit_flag == "ms") {
+    return SetDefaultTimeUnit(kMillisecond);
+  }
+  if (time_unit_flag == "us") {
+    return SetDefaultTimeUnit(kMicrosecond);
+  }
+  if (time_unit_flag == "ns") {
+    return SetDefaultTimeUnit(kNanosecond);
+  }
+  if (!time_unit_flag.empty()) {
+    PrintUsageAndExit();
+  }
+}
+
 void ParseCommandLineFlags(int* argc, char** argv) {
   using namespace benchmark;
   BenchmarkReporter::Context::executable_name =
@@ -562,8 +692,10 @@ void ParseCommandLineFlags(int* argc, char** argv) {
     if (ParseBoolFlag(argv[i], "benchmark_list_tests",
                       &FLAGS_benchmark_list_tests) ||
         ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
-        ParseDoubleFlag(argv[i], "benchmark_min_time",
+        ParseStringFlag(argv[i], "benchmark_min_time",
                         &FLAGS_benchmark_min_time) ||
+        ParseDoubleFlag(argv[i], "benchmark_min_warmup_time",
+                        &FLAGS_benchmark_min_warmup_time) ||
         ParseInt32Flag(argv[i], "benchmark_repetitions",
                        &FLAGS_benchmark_repetitions) ||
         ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving",
@@ -583,6 +715,8 @@ void ParseCommandLineFlags(int* argc, char** argv) {
                         &FLAGS_benchmark_perf_counters) ||
         ParseKeyValueFlag(argv[i], "benchmark_context",
                           &FLAGS_benchmark_context) ||
+        ParseStringFlag(argv[i], "benchmark_time_unit",
+                        &FLAGS_benchmark_time_unit) ||
         ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
       for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1];
 
@@ -598,6 +732,7 @@ void ParseCommandLineFlags(int* argc, char** argv) {
       PrintUsageAndExit();
     }
   }
+  SetDefaultTimeUnitFromFlag(FLAGS_benchmark_time_unit);
   if (FLAGS_benchmark_color.empty()) {
     PrintUsageAndExit();
   }
@@ -613,7 +748,34 @@ int InitializeStreams() {
 
 }  // end namespace internal
 
-void Initialize(int* argc, char** argv) {
+std::string GetBenchmarkVersion() { return {BENCHMARK_VERSION}; }
+
+void PrintDefaultHelp() {
+  fprintf(stdout,
+          "benchmark"
+          " [--benchmark_list_tests={true|false}]\n"
+          "          [--benchmark_filter=<regex>]\n"
+          "          [--benchmark_min_time=`<integer>x` OR `<float>s` ]\n"
+          "          [--benchmark_min_warmup_time=<min_warmup_time>]\n"
+          "          [--benchmark_repetitions=<num_repetitions>]\n"
+          "          [--benchmark_enable_random_interleaving={true|false}]\n"
+          "          [--benchmark_report_aggregates_only={true|false}]\n"
+          "          [--benchmark_display_aggregates_only={true|false}]\n"
+          "          [--benchmark_format=<console|json|csv>]\n"
+          "          [--benchmark_out=<filename>]\n"
+          "          [--benchmark_out_format=<json|console|csv>]\n"
+          "          [--benchmark_color={auto|true|false}]\n"
+          "          [--benchmark_counters_tabular={true|false}]\n"
+#if defined HAVE_LIBPFM
+          "          [--benchmark_perf_counters=<counter>,...]\n"
+#endif
+          "          [--benchmark_context=<key>=<value>,...]\n"
+          "          [--benchmark_time_unit={ns|us|ms|s}]\n"
+          "          [--v=<verbosity>]\n");
+}
+
+void Initialize(int* argc, char** argv, void (*HelperPrintf)()) {
+  internal::HelperPrintf = HelperPrintf;
   internal::ParseCommandLineFlags(argc, argv);
   internal::LogLevel() = FLAGS_v;
 }
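
Illustrative only, not part of the patch: Initialize() now takes an optional usage printer, so a wrapper binary with its own flags can replace the text produced by PrintDefaultHelp(). MyUsage and --my_flag are hypothetical.

    #include <cstdio>
    #include "benchmark/benchmark.h"

    static void MyUsage() {
      std::printf("mybench [--my_flag=<value>] [--benchmark_filter=<regex>] ...\n");
    }

    int main(int argc, char** argv) {
      benchmark::Initialize(&argc, argv, MyUsage);
      benchmark::RunSpecifiedBenchmarks();
      benchmark::Shutdown();
      return 0;
    }
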
diff --git a/third-party/benchmark/src/benchmark_api_internal.cc b/third-party/benchmark/src/benchmark_api_internal.cc
index 4de36e3c8ba3f2..286f9865308594 100644
--- a/third-party/benchmark/src/benchmark_api_internal.cc
+++ b/third-party/benchmark/src/benchmark_api_internal.cc
@@ -16,7 +16,7 @@ BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
       per_family_instance_index_(per_family_instance_idx),
       aggregation_report_mode_(benchmark_.aggregation_report_mode_),
       args_(args),
-      time_unit_(benchmark_.time_unit_),
+      time_unit_(benchmark_.GetTimeUnit()),
       measure_process_cpu_time_(benchmark_.measure_process_cpu_time_),
       use_real_time_(benchmark_.use_real_time_),
       use_manual_time_(benchmark_.use_manual_time_),
@@ -25,6 +25,7 @@ BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
       statistics_(benchmark_.statistics_),
       repetitions_(benchmark_.repetitions_),
       min_time_(benchmark_.min_time_),
+      min_warmup_time_(benchmark_.min_warmup_time_),
       iterations_(benchmark_.iterations_),
       threads_(thread_count) {
   name_.function_name = benchmark_.name_;
@@ -50,6 +51,11 @@ BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
     name_.min_time = StrFormat("min_time:%0.3f", benchmark_.min_time_);
   }
 
+  if (!IsZero(benchmark->min_warmup_time_)) {
+    name_.min_warmup_time =
+        StrFormat("min_warmup_time:%0.3f", benchmark_.min_warmup_time_);
+  }
+
   if (benchmark_.iterations_ != 0) {
     name_.iterations = StrFormat(
         "iterations:%lu", static_cast<unsigned long>(benchmark_.iterations_));
@@ -87,24 +93,24 @@ State BenchmarkInstance::Run(
     IterationCount iters, int thread_id, internal::ThreadTimer* timer,
     internal::ThreadManager* manager,
     internal::PerfCountersMeasurement* perf_counters_measurement) const {
-  State st(iters, args_, thread_id, threads_, timer, manager,
-           perf_counters_measurement);
+  State st(name_.function_name, iters, args_, thread_id, threads_, timer,
+           manager, perf_counters_measurement);
   benchmark_.Run(st);
   return st;
 }
 
 void BenchmarkInstance::Setup() const {
   if (setup_) {
-    State st(/*iters*/ 1, args_, /*thread_id*/ 0, threads_, nullptr, nullptr,
-             nullptr);
+    State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
+             nullptr, nullptr, nullptr);
     setup_(st);
   }
 }
 
 void BenchmarkInstance::Teardown() const {
   if (teardown_) {
-    State st(/*iters*/ 1, args_, /*thread_id*/ 0, threads_, nullptr, nullptr,
-             nullptr);
+    State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
+             nullptr, nullptr, nullptr);
     teardown_(st);
   }
 }
diff --git a/third-party/benchmark/src/benchmark_api_internal.h b/third-party/benchmark/src/benchmark_api_internal.h
index 94c2b2972bb186..94f516531bc4fa 100644
--- a/third-party/benchmark/src/benchmark_api_internal.h
+++ b/third-party/benchmark/src/benchmark_api_internal.h
@@ -36,6 +36,7 @@ class BenchmarkInstance {
   const std::vector<Statistics>& statistics() const { return statistics_; }
   int repetitions() const { return repetitions_; }
   double min_time() const { return min_time_; }
+  double min_warmup_time() const { return min_warmup_time_; }
   IterationCount iterations() const { return iterations_; }
   int threads() const { return threads_; }
   void Setup() const;
@@ -62,6 +63,7 @@ class BenchmarkInstance {
   const std::vector<Statistics>& statistics_;
   int repetitions_;
   double min_time_;
+  double min_warmup_time_;
   IterationCount iterations_;
   int threads_;  // Number of concurrent threads to use
 
@@ -76,6 +78,7 @@ bool FindBenchmarksInternal(const std::string& re,
 
 bool IsZero(double n);
 
+BENCHMARK_EXPORT
 ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false);
 
 }  // end namespace internal
diff --git a/third-party/benchmark/src/benchmark_main.cc b/third-party/benchmark/src/benchmark_main.cc
index b3b247831496f6..cd61cd2ad50692 100644
--- a/third-party/benchmark/src/benchmark_main.cc
+++ b/third-party/benchmark/src/benchmark_main.cc
@@ -14,4 +14,5 @@
 
 #include "benchmark/benchmark.h"
 
+BENCHMARK_EXPORT int main(int, char**);
 BENCHMARK_MAIN();
diff --git a/third-party/benchmark/src/benchmark_name.cc b/third-party/benchmark/src/benchmark_name.cc
index 2a17ebce277f56..01676bbc84df42 100644
--- a/third-party/benchmark/src/benchmark_name.cc
+++ b/third-party/benchmark/src/benchmark_name.cc
@@ -51,8 +51,9 @@ std::string join(char delimiter, const Ts&... ts) {
 }
 }  // namespace
 
+BENCHMARK_EXPORT
 std::string BenchmarkName::str() const {
-  return join('/', function_name, args, min_time, iterations, repetitions,
-              time_type, threads);
+  return join('/', function_name, args, min_time, min_warmup_time, iterations,
+              repetitions, time_type, threads);
 }
 }  // namespace benchmark
diff --git a/third-party/benchmark/src/benchmark_register.cc b/third-party/benchmark/src/benchmark_register.cc
index 61a0c26178e3d4..e447c9a2d39ba4 100644
--- a/third-party/benchmark/src/benchmark_register.cc
+++ b/third-party/benchmark/src/benchmark_register.cc
@@ -15,7 +15,7 @@
 #include "benchmark_register.h"
 
 #ifndef BENCHMARK_OS_WINDOWS
-#ifndef BENCHMARK_OS_FUCHSIA
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
 #include <sys/resource.h>
 #endif
 #include <sys/time.h>
@@ -53,10 +53,13 @@ namespace benchmark {
 
 namespace {
 // For non-dense Range, intermediate values are powers of kRangeMultiplier.
-static const int kRangeMultiplier = 8;
+static constexpr int kRangeMultiplier = 8;
+
 // The size of a benchmark family determines the number of inputs to repeat
 // the benchmark on. If this is "large" then warn the user during configuration.
-static const size_t kMaxFamilySize = 100;
+static constexpr size_t kMaxFamilySize = 100;
+
+static constexpr char kDisabledPrefix[] = "DISABLED_";
 }  // end namespace
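
Illustrative only, not part of the patch: any benchmark whose generated name starts with the prefix above is dropped during filtering, whatever --benchmark_filter says. BM_flaky is hypothetical.

    #include "benchmark/benchmark.h"

    static void DISABLED_BM_flaky(benchmark::State& state) {
      for (auto _ : state) {
        // ... temporarily disabled work ...
      }
    }
    // Still registered, but FindBenchmarks() will never select it to run.
    BENCHMARK(DISABLED_BM_flaky);
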
 
 namespace internal {
@@ -116,10 +119,10 @@ bool BenchmarkFamilies::FindBenchmarks(
   // Make regular expression out of command-line flag
   std::string error_msg;
   Regex re;
-  bool isNegativeFilter = false;
+  bool is_negative_filter = false;
   if (spec[0] == '-') {
     spec.replace(0, 1, "");
-    isNegativeFilter = true;
+    is_negative_filter = true;
   }
   if (!re.Init(spec, &error_msg)) {
     Err << "Could not compile benchmark re: " << error_msg << std::endl;
@@ -154,7 +157,8 @@ bool BenchmarkFamilies::FindBenchmarks(
           << " will be repeated at least " << family_size << " times.\n";
     }
     // reserve in the special case the regex ".", since we know the final
-    // family size.
+    // family size.  This doesn't take into account any disabled benchmarks,
+    // so in the worst case we reserve more than we need.
     if (spec == ".") benchmarks->reserve(benchmarks->size() + family_size);
 
     for (auto const& args : family->args_) {
@@ -164,8 +168,9 @@ bool BenchmarkFamilies::FindBenchmarks(
                                    num_threads);
 
         const auto full_name = instance.name().str();
-        if ((re.Match(full_name) && !isNegativeFilter) ||
-            (!re.Match(full_name) && isNegativeFilter)) {
+        if (full_name.rfind(kDisabledPrefix, 0) != 0 &&
+            ((re.Match(full_name) && !is_negative_filter) ||
+             (!re.Match(full_name) && is_negative_filter))) {
           benchmarks->push_back(std::move(instance));
 
           ++per_family_instance_index;
@@ -199,12 +204,14 @@ bool FindBenchmarksInternal(const std::string& re,
 //                               Benchmark
 //=============================================================================//
 
-Benchmark::Benchmark(const char* name)
+Benchmark::Benchmark(const std::string& name)
     : name_(name),
       aggregation_report_mode_(ARM_Unspecified),
-      time_unit_(kNanosecond),
+      time_unit_(GetDefaultTimeUnit()),
+      use_default_time_unit_(true),
       range_multiplier_(kRangeMultiplier),
       min_time_(0),
+      min_warmup_time_(0),
       iterations_(0),
       repetitions_(0),
       measure_process_cpu_time_(false),
@@ -223,7 +230,7 @@ Benchmark::Benchmark(const char* name)
 Benchmark::~Benchmark() {}
 
 Benchmark* Benchmark::Name(const std::string& name) {
-  SetName(name.c_str());
+  SetName(name);
   return this;
 }
 
@@ -235,6 +242,7 @@ Benchmark* Benchmark::Arg(int64_t x) {
 
 Benchmark* Benchmark::Unit(TimeUnit unit) {
   time_unit_ = unit;
+  use_default_time_unit_ = false;
   return this;
 }
 
@@ -348,9 +356,17 @@ Benchmark* Benchmark::MinTime(double t) {
   return this;
 }
 
+Benchmark* Benchmark::MinWarmUpTime(double t) {
+  BM_CHECK(t >= 0.0);
+  BM_CHECK(iterations_ == 0);
+  min_warmup_time_ = t;
+  return this;
+}
+
 Benchmark* Benchmark::Iterations(IterationCount n) {
   BM_CHECK(n > 0);
   BM_CHECK(IsZero(min_time_));
+  BM_CHECK(IsZero(min_warmup_time_));
   iterations_ = n;
   return this;
 }
@@ -452,7 +468,9 @@ Benchmark* Benchmark::ThreadPerCpu() {
   return this;
 }
 
-void Benchmark::SetName(const char* name) { name_ = name; }
+void Benchmark::SetName(const std::string& name) { name_ = name; }
+
+const char* Benchmark::GetName() const { return name_.c_str(); }
 
 int Benchmark::ArgsCnt() const {
   if (args_.empty()) {
@@ -462,6 +480,16 @@ int Benchmark::ArgsCnt() const {
   return static_cast<int>(args_.front().size());
 }
 
+const char* Benchmark::GetArgName(int arg) const {
+  BM_CHECK_GE(arg, 0);
+  BM_CHECK_LT(arg, static_cast<int>(arg_names_.size()));
+  return arg_names_[arg].c_str();
+}
+
+TimeUnit Benchmark::GetTimeUnit() const {
+  return use_default_time_unit_ ? GetDefaultTimeUnit() : time_unit_;
+}
+
 //=============================================================================//
 //                            FunctionBenchmark
 //=============================================================================//
diff --git a/third-party/benchmark/src/benchmark_register.h b/third-party/benchmark/src/benchmark_register.h
index d3f4974e9074df..53367c707cf41b 100644
--- a/third-party/benchmark/src/benchmark_register.h
+++ b/third-party/benchmark/src/benchmark_register.h
@@ -1,6 +1,7 @@
 #ifndef BENCHMARK_REGISTER_H
 #define BENCHMARK_REGISTER_H
 
+#include <algorithm>
 #include <limits>
 #include <vector>
 
@@ -23,7 +24,7 @@ typename std::vector<T>::iterator AddPowers(std::vector<T>* dst, T lo, T hi,
   static const T kmax = std::numeric_limits<T>::max();
 
   // Space out the values in multiples of "mult"
-  for (T i = static_cast<T>(1); i <= hi; i *= mult) {
+  for (T i = static_cast<T>(1); i <= hi; i *= static_cast<T>(mult)) {
     if (i >= lo) {
       dst->push_back(i);
     }
@@ -32,7 +33,7 @@ typename std::vector<T>::iterator AddPowers(std::vector<T>* dst, T lo, T hi,
     if (i > kmax / mult) break;
   }
 
-  return dst->begin() + start_offset;
+  return dst->begin() + static_cast<int>(start_offset);
 }
 
 template <typename T>
diff --git a/third-party/benchmark/src/benchmark_runner.cc b/third-party/benchmark/src/benchmark_runner.cc
index eac807b066f10e..dcddb437e37d70 100644
--- a/third-party/benchmark/src/benchmark_runner.cc
+++ b/third-party/benchmark/src/benchmark_runner.cc
@@ -19,7 +19,7 @@
 #include "internal_macros.h"
 
 #ifndef BENCHMARK_OS_WINDOWS
-#ifndef BENCHMARK_OS_FUCHSIA
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
 #include <sys/resource.h>
 #endif
 #include <sys/time.h>
@@ -28,11 +28,14 @@
 
 #include <algorithm>
 #include <atomic>
+#include <climits>
+#include <cmath>
 #include <condition_variable>
 #include <cstdio>
 #include <cstdlib>
 #include <fstream>
 #include <iostream>
+#include <limits>
 #include <memory>
 #include <string>
 #include <thread>
@@ -61,7 +64,9 @@ MemoryManager* memory_manager = nullptr;
 
 namespace {
 
-static constexpr IterationCount kMaxIterations = 1000000000;
+static constexpr IterationCount kMaxIterations = 1000000000000;
+const double kDefaultMinTime =
+    std::strtod(::benchmark::kDefaultMinTimeStr, /*p_end*/ nullptr);
 
 BenchmarkReporter::Run CreateRunReport(
     const benchmark::internal::BenchmarkInstance& b,
@@ -75,8 +80,8 @@ BenchmarkReporter::Run CreateRunReport(
   report.run_name = b.name();
   report.family_index = b.family_index();
   report.per_family_instance_index = b.per_family_instance_index();
-  report.error_occurred = results.has_error_;
-  report.error_message = results.error_message_;
+  report.skipped = results.skipped_;
+  report.skip_message = results.skip_message_;
   report.report_label = results.report_label_;
   // This is the total iterations across all threads.
   report.iterations = results.iterations;
@@ -85,12 +90,13 @@ BenchmarkReporter::Run CreateRunReport(
   report.repetition_index = repetition_index;
   report.repetitions = repeats;
 
-  if (!report.error_occurred) {
+  if (!report.skipped) {
     if (b.use_manual_time()) {
       report.real_accumulated_time = results.manual_time_used;
     } else {
       report.real_accumulated_time = results.real_time_used;
     }
+    report.use_real_time_for_initial_big_o = b.use_manual_time();
     report.cpu_accumulated_time = results.cpu_time_used;
     report.complexity_n = results.complexity_n;
     report.complexity = b.complexity();
@@ -103,7 +109,7 @@ BenchmarkReporter::Run CreateRunReport(
       report.memory_result = memory_result;
       report.allocs_per_iter =
           memory_iterations ? static_cast<double>(memory_result->num_allocs) /
-                                  memory_iterations
+                                  static_cast<double>(memory_iterations)
                             : 0;
     }
 
@@ -122,9 +128,10 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,
       b->measure_process_cpu_time()
           ? internal::ThreadTimer::CreateProcessCpuTime()
           : internal::ThreadTimer::Create());
+
   State st =
       b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
-  BM_CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
+  BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations)
       << "Benchmark returned before State::KeepRunning() returned false!";
   {
     MutexLock l(manager->GetBenchmarkMutex());
@@ -139,24 +146,100 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,
   manager->NotifyThreadComplete();
 }
 
+double ComputeMinTime(const benchmark::internal::BenchmarkInstance& b,
+                      const BenchTimeType& iters_or_time) {
+  if (!IsZero(b.min_time())) return b.min_time();
+  // If the flag was used to specify number of iters, then return the default
+  // min_time.
+  if (iters_or_time.tag == BenchTimeType::ITERS) return kDefaultMinTime;
+
+  return iters_or_time.time;
+}
+
+IterationCount ComputeIters(const benchmark::internal::BenchmarkInstance& b,
+                            const BenchTimeType& iters_or_time) {
+  if (b.iterations() != 0) return b.iterations();
+
+  // We've already concluded that this flag is currently used to pass
+  // iters but do a check here again anyway.
+  BM_CHECK(iters_or_time.tag == BenchTimeType::ITERS);
+  return iters_or_time.iters;
+}
+
 }  // end namespace
 
+BenchTimeType ParseBenchMinTime(const std::string& value) {
+  BenchTimeType ret;
+
+  if (value.empty()) {
+    ret.tag = BenchTimeType::TIME;
+    ret.time = 0.0;
+    return ret;
+  }
+
+  if (value.back() == 'x') {
+    char* p_end;
+    // Reset errno before it's changed by strtol.
+    errno = 0;
+    IterationCount num_iters = std::strtol(value.c_str(), &p_end, 10);
+
+    // After a valid parse, p_end should have been set to
+    // point to the 'x' suffix.
+    BM_CHECK(errno == 0 && p_end != nullptr && *p_end == 'x')
+        << "Malformed iters value passed to --benchmark_min_time: `" << value
+        << "`. Expected --benchmark_min_time=<integer>x.";
+
+    ret.tag = BenchTimeType::ITERS;
+    ret.iters = num_iters;
+    return ret;
+  }
+
+  bool has_suffix = value.back() == 's';
+  if (!has_suffix) {
+    BM_VLOG(0) << "Value passed to --benchmark_min_time should have a suffix. "
+                  "E.g., `30s` for 30 seconds.";

+  }
+
+  char* p_end;
+  // Reset errno before it's changed by strtod.
+  errno = 0;
+  double min_time = std::strtod(value.c_str(), &p_end);
+
+  // After a successful parse, p_end should point to the suffix 's',
+  // or the end of the string if the suffix was omitted.
+  BM_CHECK(errno == 0 && p_end != nullptr &&
+           ((has_suffix && *p_end == 's') || *p_end == '\0'))
+      << "Malformed seconds value passed to --benchmark_min_time: `" << value
+      << "`. Expected --benchmark_min_time=<float>s.";
+
+  ret.tag = BenchTimeType::TIME;
+  ret.time = min_time;
+
+  return ret;
+}
+
 BenchmarkRunner::BenchmarkRunner(
     const benchmark::internal::BenchmarkInstance& b_,
+    PerfCountersMeasurement* pcm_,
     BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
     : b(b_),
       reports_for_family(reports_for_family_),
-      min_time(!IsZero(b.min_time()) ? b.min_time() : FLAGS_benchmark_min_time),
+      parsed_benchtime_flag(ParseBenchMinTime(FLAGS_benchmark_min_time)),
+      min_time(ComputeMinTime(b_, parsed_benchtime_flag)),
+      min_warmup_time((!IsZero(b.min_time()) && b.min_warmup_time() > 0.0)
+                          ? b.min_warmup_time()
+                          : FLAGS_benchmark_min_warmup_time),
+      warmup_done(!(min_warmup_time > 0.0)),
       repeats(b.repetitions() != 0 ? b.repetitions()
                                    : FLAGS_benchmark_repetitions),
-      has_explicit_iteration_count(b.iterations() != 0),
+      has_explicit_iteration_count(b.iterations() != 0 ||
+                                   parsed_benchtime_flag.tag ==
+                                       BenchTimeType::ITERS),
       pool(b.threads() - 1),
-      iters(has_explicit_iteration_count ? b.iterations() : 1),
-      perf_counters_measurement(
-          PerfCounters::Create(StrSplit(FLAGS_benchmark_perf_counters, ','))),
-      perf_counters_measurement_ptr(perf_counters_measurement.IsValid()
-                                        ? &perf_counters_measurement
-                                        : nullptr) {
+      iters(has_explicit_iteration_count
+                ? ComputeIters(b_, parsed_benchtime_flag)
+                : 1),
+      perf_counters_measurement_ptr(pcm_) {
   run_results.display_report_aggregates_only =
       (FLAGS_benchmark_report_aggregates_only ||
        FLAGS_benchmark_display_aggregates_only);
@@ -169,7 +252,7 @@ BenchmarkRunner::BenchmarkRunner(
     run_results.file_report_aggregates_only =
         (b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
     BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
-             perf_counters_measurement.IsValid())
+             (perf_counters_measurement_ptr->num_counters() == 0))
         << "Perf counters were requested but could not be set up.";
   }
 }
@@ -232,20 +315,20 @@ IterationCount BenchmarkRunner::PredictNumItersNeeded(
     const IterationResults& i) const {
   // See how much iterations should be increased by.
   // Note: Avoid division by zero with max(seconds, 1ns).
-  double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
+  double multiplier = GetMinTimeToApply() * 1.4 / std::max(i.seconds, 1e-9);
   // If our last run was at least 10% of FLAGS_benchmark_min_time then we
   // use the multiplier directly.
   // Otherwise we use at most 10 times expansion.
   // NOTE: When the last run was at least 10% of the min time the max
   // expansion should be 14x.
-  bool is_significant = (i.seconds / min_time) > 0.1;
+  const bool is_significant = (i.seconds / GetMinTimeToApply()) > 0.1;
   multiplier = is_significant ? multiplier : 10.0;
 
   // So what seems to be the sufficiently-large iteration count? Round up.
   const IterationCount max_next_iters = static_cast<IterationCount>(
-      std::lround(std::max(multiplier * static_cast<double>(i.iters),
-                           static_cast<double>(i.iters) + 1.0)));
-  // But we do have *some* sanity limits though..
+      std::llround(std::max(multiplier * static_cast<double>(i.iters),
+                            static_cast<double>(i.iters) + 1.0)));
+  // But we do have *some* limits, though.
   const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
 
   BM_VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
@@ -257,21 +340,80 @@ bool BenchmarkRunner::ShouldReportIterationResults(
   // Determine if this run should be reported;
   // Either it has run for a sufficient amount of time
   // or because an error was reported.
-  return i.results.has_error_ ||
+  return i.results.skipped_ ||
          i.iters >= kMaxIterations ||  // Too many iterations already.
-         i.seconds >= min_time ||      // The elapsed time is large enough.
+         i.seconds >=
+             GetMinTimeToApply() ||  // The elapsed time is large enough.
          // CPU time is specified but the elapsed real time greatly exceeds
          // the minimum time.
-         // Note that user provided timers are except from this sanity check.
-         ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time());
+         // Note that user provided timers are exempt from this test.
+         ((i.results.real_time_used >= 5 * GetMinTimeToApply()) &&
+          !b.use_manual_time());
+}
+
+double BenchmarkRunner::GetMinTimeToApply() const {
+  // To reuse the code that runs and measures a benchmark for the warmup phase
+  // as well, we need a way of telling whether to apply min_time or
+  // min_warmup_time. This function determines whether we are still in the
+  // warmup phase (and therefore need to apply min_warmup_time) or already in
+  // the benchmarking phase (where min_time applies).
+  return warmup_done ? min_time : min_warmup_time;
+}
+
+void BenchmarkRunner::FinishWarmUp(const IterationCount& i) {
+  warmup_done = true;
+  iters = i;
+}
+
+void BenchmarkRunner::RunWarmUp() {
+  // Use the same mechanisms for warming up the benchmark as used for actually
+  // running and measuring the benchmark.
+  IterationResults i_warmup;
+  // Don't use the iteration count determined in the warmup phase for the
+  // actual measured benchmark phase. While it may be a good starting point
+  // (and would remove the need to figure out how many iterations are needed
+  // when min_time is set), it may also be a completely wrong guess, since the
+  // warmup loops might be considerably slower (e.g. because of caching
+  // effects).
+  const IterationCount i_backup = iters;
+
+  for (;;) {
+    b.Setup();
+    i_warmup = DoNIterations();
+    b.Teardown();
+
+    const bool finish = ShouldReportIterationResults(i_warmup);
+
+    if (finish) {
+      FinishWarmUp(i_backup);
+      break;
+    }
+
+    // Although this is "only" a warmup phase, where running enough iterations
+    // at once without measuring time isn't as important as it is for the
+    // benchmarking phase, we still do it the same way; otherwise it would be
+    // very confusing for the user to choose a proper value for
+    // min_warmup_time if a different approach to running it were used.
+    iters = PredictNumItersNeeded(i_warmup);
+    assert(iters > i_warmup.iters &&
+           "if we did more iterations than we want to do the next time, "
+           "then we should have accepted the current iteration run.");
+  }
 }
 
 void BenchmarkRunner::DoOneRepetition() {
   assert(HasRepeatsRemaining() && "Already done all repetitions?");
 
   const bool is_the_first_repetition = num_repetitions_done == 0;
-  IterationResults i;
 
+  // In case a warmup phase is requested by the benchmark, run it now.
+  // After running the warmup phase the BenchmarkRunner should be in a state
+  // as if the warmup never happened, except that warmup_done is set. Any
+  // other manipulation of the BenchmarkRunner instance would be a bug! Please
+  // fix it.
+  if (!warmup_done) RunWarmUp();
+
+  IterationResults i;
   // We *may* be gradually increasing the length (iteration count)
   // of the benchmark until we decide the results are significant.
   // And once we do, we report those last results and exit.
@@ -324,10 +466,7 @@ void BenchmarkRunner::DoOneRepetition() {
     manager->WaitForAllThreads();
     manager.reset();
     b.Teardown();
-
-    BENCHMARK_DISABLE_DEPRECATED_WARNING
-    memory_manager->Stop(memory_result);
-    BENCHMARK_RESTORE_DEPRECATED_WARNING
+    memory_manager->Stop(*memory_result);
   }
 
   // Ok, now actually report.
@@ -337,7 +476,7 @@ void BenchmarkRunner::DoOneRepetition() {
 
   if (reports_for_family) {
     ++reports_for_family->num_runs_done;
-    if (!report.error_occurred) reports_for_family->Runs.push_back(report);
+    if (!report.skipped) reports_for_family->Runs.push_back(report);
   }
 
   run_results.non_aggregates.push_back(report);
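
The BenchTimeType/ParseBenchMinTime machinery above is what gives
--benchmark_min_time its two spellings. A small sketch of the internal helper,
purely for illustration (equivalent to passing --benchmark_min_time=100x or
--benchmark_min_time=2.5s on the command line; needs <cassert> and the
internal benchmark_runner.h header):

    using benchmark::internal::BenchTimeType;
    using benchmark::internal::ParseBenchMinTime;

    BenchTimeType by_iters = ParseBenchMinTime("100x");  // explicit iteration count
    assert(by_iters.tag == BenchTimeType::ITERS && by_iters.iters == 100);

    BenchTimeType by_time = ParseBenchMinTime("2.5s");   // wall-time lower bound
    assert(by_time.tag == BenchTimeType::TIME && by_time.time == 2.5);
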
diff --git a/third-party/benchmark/src/benchmark_runner.h b/third-party/benchmark/src/benchmark_runner.h
index 752eefdc26fa09..db2fa04396c50d 100644
--- a/third-party/benchmark/src/benchmark_runner.h
+++ b/third-party/benchmark/src/benchmark_runner.h
@@ -25,7 +25,8 @@
 
 namespace benchmark {
 
-BM_DECLARE_double(benchmark_min_time);
+BM_DECLARE_string(benchmark_min_time);
+BM_DECLARE_double(benchmark_min_warmup_time);
 BM_DECLARE_int32(benchmark_repetitions);
 BM_DECLARE_bool(benchmark_report_aggregates_only);
 BM_DECLARE_bool(benchmark_display_aggregates_only);
@@ -43,9 +44,21 @@ struct RunResults {
   bool file_report_aggregates_only = false;
 };
 
+struct BENCHMARK_EXPORT BenchTimeType {
+  enum { ITERS, TIME } tag;
+  union {
+    IterationCount iters;
+    double time;
+  };
+};
+
+BENCHMARK_EXPORT
+BenchTimeType ParseBenchMinTime(const std::string& value);
+
 class BenchmarkRunner {
  public:
   BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
+                  benchmark::internal::PerfCountersMeasurement* pmc_,
                   BenchmarkReporter::PerFamilyRunReports* reports_for_family);
 
   int GetNumRepeats() const { return repeats; }
@@ -62,13 +75,22 @@ class BenchmarkRunner {
     return reports_for_family;
   }
 
+  double GetMinTime() const { return min_time; }
+
+  bool HasExplicitIters() const { return has_explicit_iteration_count; }
+
+  IterationCount GetIters() const { return iters; }
+
  private:
   RunResults run_results;
 
   const benchmark::internal::BenchmarkInstance& b;
   BenchmarkReporter::PerFamilyRunReports* reports_for_family;
 
+  BenchTimeType parsed_benchtime_flag;
   const double min_time;
+  const double min_warmup_time;
+  bool warmup_done;
   const int repeats;
   const bool has_explicit_iteration_count;
 
@@ -82,8 +104,7 @@ class BenchmarkRunner {
   // So only the first repetition has to find/calculate it,
   // the other repetitions will just use that precomputed iteration count.
 
-  PerfCountersMeasurement perf_counters_measurement;
-  PerfCountersMeasurement* const perf_counters_measurement_ptr;
+  PerfCountersMeasurement* const perf_counters_measurement_ptr = nullptr;
 
   struct IterationResults {
     internal::ThreadManager::Result results;
@@ -95,6 +116,12 @@ class BenchmarkRunner {
   IterationCount PredictNumItersNeeded(const IterationResults& i) const;
 
   bool ShouldReportIterationResults(const IterationResults& i) const;
+
+  double GetMinTimeToApply() const;
+
+  void FinishWarmUp(const IterationCount& i);
+
+  void RunWarmUp();
 };
 
 }  // namespace internal
diff --git a/third-party/benchmark/src/check.h b/third-party/benchmark/src/check.h
index 0efd13ff4db6d9..c1cd5e85e44cfc 100644
--- a/third-party/benchmark/src/check.h
+++ b/third-party/benchmark/src/check.h
@@ -5,18 +5,34 @@
 #include <cstdlib>
 #include <ostream>
 
+#include "benchmark/export.h"
 #include "internal_macros.h"
 #include "log.h"
 
+#if defined(__GNUC__) || defined(__clang__)
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#elif defined(_MSC_VER) && !defined(__clang__)
+#if _MSC_VER >= 1900
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#else
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+#define __func__ __FUNCTION__
+#else
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+
 namespace benchmark {
 namespace internal {
 
 typedef void(AbortHandlerT)();
 
-inline AbortHandlerT*& GetAbortHandler() {
-  static AbortHandlerT* handler = &std::abort;
-  return handler;
-}
+BENCHMARK_EXPORT
+AbortHandlerT*& GetAbortHandler();
 
 BENCHMARK_NORETURN inline void CallAbortHandler() {
   GetAbortHandler()();
@@ -36,10 +52,17 @@ class CheckHandler {
 
   LogType& GetLog() { return log_; }
 
+#if defined(COMPILER_MSVC)
+#pragma warning(push)
+#pragma warning(disable : 4722)
+#endif
   BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
     log_ << std::endl;
     CallAbortHandler();
   }
+#if defined(COMPILER_MSVC)
+#pragma warning(pop)
+#endif
 
   CheckHandler& operator=(const CheckHandler&) = delete;
   CheckHandler(const CheckHandler&) = delete;
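
GetAbortHandler() now being an exported accessor (returning a mutable pointer
reference) means embedders and tests can override what a failing BM_CHECK
does. A hedged sketch of that pattern, not something this patch itself does:

    #include <stdexcept>

    // Make a failed BM_CHECK throw instead of calling std::abort(); the
    // CheckHandler destructor is noexcept(false), so this is permitted.
    benchmark::internal::GetAbortHandler() = +[]() {
      throw std::runtime_error("BM_CHECK failed");
    };
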
diff --git a/third-party/benchmark/src/colorprint.cc b/third-party/benchmark/src/colorprint.cc
index 1a000a0637368c..abc71492f77aa1 100644
--- a/third-party/benchmark/src/colorprint.cc
+++ b/third-party/benchmark/src/colorprint.cc
@@ -96,18 +96,18 @@ std::string FormatString(const char* msg, va_list args) {
   // currently there is no error handling for failure, so this is hack.
   BM_CHECK(ret >= 0);
 
-  if (ret == 0)  // handle empty expansion
+  if (ret == 0) {  // handle empty expansion
     return {};
-  else if (static_cast<size_t>(ret) < size)
+  }
+  if (static_cast<size_t>(ret) < size) {
     return local_buff;
-  else {
-    // we did not provide a long enough buffer on our first attempt.
-    size = static_cast<size_t>(ret) + 1;  // + 1 for the null byte
-    std::unique_ptr<char[]> buff(new char[size]);
-    ret = vsnprintf(buff.get(), size, msg, args);
-    BM_CHECK(ret > 0 && (static_cast<size_t>(ret)) < size);
-    return buff.get();
   }
+  // we did not provide a long enough buffer on our first attempt.
+  size = static_cast<size_t>(ret) + 1;  // + 1 for the null byte
+  std::unique_ptr<char[]> buff(new char[size]);
+  ret = vsnprintf(buff.get(), size, msg, args);
+  BM_CHECK(ret > 0 && (static_cast<size_t>(ret)) < size);
+  return buff.get();
 }
 
 std::string FormatString(const char* msg, ...) {
@@ -140,12 +140,12 @@ void ColorPrintf(std::ostream& out, LogColor color, const char* fmt,
   // We need to flush the stream buffers into the console before each
   // SetConsoleTextAttribute call lest it affect the text that is already
   // printed but has not yet reached the console.
-  fflush(stdout);
+  out.flush();
   SetConsoleTextAttribute(stdout_handle,
                           GetPlatformColorCode(color) | FOREGROUND_INTENSITY);
-  vprintf(fmt, args);
+  out << FormatString(fmt, args);
 
-  fflush(stdout);
+  out.flush();
   // Restores the text color.
   SetConsoleTextAttribute(stdout_handle, old_color_attrs);
 #else
@@ -163,12 +163,24 @@ bool IsColorTerminal() {
 #else
   // On non-Windows platforms, we rely on the TERM variable. This list of
   // supported TERM values is copied from Google Test:
-  // <https://github.com/google/googletest/blob/master/googletest/src/gtest.cc#L2925>.
+  // <https://github.com/google/googletest/blob/v1.13.0/googletest/src/gtest.cc#L3225-L3259>.
   const char* const SUPPORTED_TERM_VALUES[] = {
-      "xterm",         "xterm-color",     "xterm-256color",
-      "screen",        "screen-256color", "tmux",
-      "tmux-256color", "rxvt-unicode",    "rxvt-unicode-256color",
-      "linux",         "cygwin",
+      "xterm",
+      "xterm-color",
+      "xterm-256color",
+      "screen",
+      "screen-256color",
+      "tmux",
+      "tmux-256color",
+      "rxvt-unicode",
+      "rxvt-unicode-256color",
+      "linux",
+      "cygwin",
+      "xterm-kitty",
+      "alacritty",
+      "foot",
+      "foot-extra",
+      "wezterm",
   };
 
   const char* const term = getenv("TERM");
diff --git a/third-party/benchmark/src/commandlineflags.cc b/third-party/benchmark/src/commandlineflags.cc
index 9615e351ffaed4..dcb414959df4ea 100644
--- a/third-party/benchmark/src/commandlineflags.cc
+++ b/third-party/benchmark/src/commandlineflags.cc
@@ -121,12 +121,14 @@ static std::string FlagToEnvVar(const char* flag) {
 
 }  // namespace
 
+BENCHMARK_EXPORT
 bool BoolFromEnv(const char* flag, bool default_val) {
   const std::string env_var = FlagToEnvVar(flag);
   const char* const value_str = getenv(env_var.c_str());
   return value_str == nullptr ? default_val : IsTruthyFlagValue(value_str);
 }
 
+BENCHMARK_EXPORT
 int32_t Int32FromEnv(const char* flag, int32_t default_val) {
   const std::string env_var = FlagToEnvVar(flag);
   const char* const value_str = getenv(env_var.c_str());
@@ -139,6 +141,7 @@ int32_t Int32FromEnv(const char* flag, int32_t default_val) {
   return value;
 }
 
+BENCHMARK_EXPORT
 double DoubleFromEnv(const char* flag, double default_val) {
   const std::string env_var = FlagToEnvVar(flag);
   const char* const value_str = getenv(env_var.c_str());
@@ -151,12 +154,14 @@ double DoubleFromEnv(const char* flag, double default_val) {
   return value;
 }
 
+BENCHMARK_EXPORT
 const char* StringFromEnv(const char* flag, const char* default_val) {
   const std::string env_var = FlagToEnvVar(flag);
   const char* const value = getenv(env_var.c_str());
   return value == nullptr ? default_val : value;
 }
 
+BENCHMARK_EXPORT
 std::map<std::string, std::string> KvPairsFromEnv(
     const char* flag, std::map<std::string, std::string> default_val) {
   const std::string env_var = FlagToEnvVar(flag);
@@ -201,6 +206,7 @@ const char* ParseFlagValue(const char* str, const char* flag,
   return flag_end + 1;
 }
 
+BENCHMARK_EXPORT
 bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
   // Gets the value of the flag as a string.
   const char* const value_str = ParseFlagValue(str, flag, true);
@@ -213,6 +219,7 @@ bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
   return true;
 }
 
+BENCHMARK_EXPORT
 bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
   // Gets the value of the flag as a string.
   const char* const value_str = ParseFlagValue(str, flag, false);
@@ -225,6 +232,7 @@ bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
                     value);
 }
 
+BENCHMARK_EXPORT
 bool ParseDoubleFlag(const char* str, const char* flag, double* value) {
   // Gets the value of the flag as a string.
   const char* const value_str = ParseFlagValue(str, flag, false);
@@ -237,6 +245,7 @@ bool ParseDoubleFlag(const char* str, const char* flag, double* value) {
                      value);
 }
 
+BENCHMARK_EXPORT
 bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
   // Gets the value of the flag as a string.
   const char* const value_str = ParseFlagValue(str, flag, false);
@@ -248,6 +257,7 @@ bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
   return true;
 }
 
+BENCHMARK_EXPORT
 bool ParseKeyValueFlag(const char* str, const char* flag,
                        std::map<std::string, std::string>* value) {
   const char* const value_str = ParseFlagValue(str, flag, false);
@@ -263,23 +273,26 @@ bool ParseKeyValueFlag(const char* str, const char* flag,
   return true;
 }
 
+BENCHMARK_EXPORT
 bool IsFlag(const char* str, const char* flag) {
   return (ParseFlagValue(str, flag, true) != nullptr);
 }
 
+BENCHMARK_EXPORT
 bool IsTruthyFlagValue(const std::string& value) {
   if (value.size() == 1) {
     char v = value[0];
     return isalnum(v) &&
            !(v == '0' || v == 'f' || v == 'F' || v == 'n' || v == 'N');
-  } else if (!value.empty()) {
+  }
+  if (!value.empty()) {
     std::string value_lower(value);
     std::transform(value_lower.begin(), value_lower.end(), value_lower.begin(),
                    [](char c) { return static_cast<char>(::tolower(c)); });
     return !(value_lower == "false" || value_lower == "no" ||
              value_lower == "off");
-  } else
-    return true;
+  }
+  return true;
 }
 
 }  // end namespace benchmark
diff --git a/third-party/benchmark/src/commandlineflags.h b/third-party/benchmark/src/commandlineflags.h
index 5baaf11784df6a..7882628975eacc 100644
--- a/third-party/benchmark/src/commandlineflags.h
+++ b/third-party/benchmark/src/commandlineflags.h
@@ -5,28 +5,33 @@
 #include <map>
 #include <string>
 
+#include "benchmark/export.h"
+
 // Macro for referencing flags.
 #define FLAG(name) FLAGS_##name
 
 // Macros for declaring flags.
-#define BM_DECLARE_bool(name) extern bool FLAG(name)
-#define BM_DECLARE_int32(name) extern int32_t FLAG(name)
-#define BM_DECLARE_double(name) extern double FLAG(name)
-#define BM_DECLARE_string(name) extern std::string FLAG(name)
+#define BM_DECLARE_bool(name) BENCHMARK_EXPORT extern bool FLAG(name)
+#define BM_DECLARE_int32(name) BENCHMARK_EXPORT extern int32_t FLAG(name)
+#define BM_DECLARE_double(name) BENCHMARK_EXPORT extern double FLAG(name)
+#define BM_DECLARE_string(name) BENCHMARK_EXPORT extern std::string FLAG(name)
 #define BM_DECLARE_kvpairs(name) \
-  extern std::map<std::string, std::string> FLAG(name)
+  BENCHMARK_EXPORT extern std::map<std::string, std::string> FLAG(name)
 
 // Macros for defining flags.
 #define BM_DEFINE_bool(name, default_val) \
-  bool FLAG(name) = benchmark::BoolFromEnv(#name, default_val)
+  BENCHMARK_EXPORT bool FLAG(name) = benchmark::BoolFromEnv(#name, default_val)
 #define BM_DEFINE_int32(name, default_val) \
-  int32_t FLAG(name) = benchmark::Int32FromEnv(#name, default_val)
+  BENCHMARK_EXPORT int32_t FLAG(name) =    \
+      benchmark::Int32FromEnv(#name, default_val)
 #define BM_DEFINE_double(name, default_val) \
-  double FLAG(name) = benchmark::DoubleFromEnv(#name, default_val)
+  BENCHMARK_EXPORT double FLAG(name) =      \
+      benchmark::DoubleFromEnv(#name, default_val)
 #define BM_DEFINE_string(name, default_val) \
-  std::string FLAG(name) = benchmark::StringFromEnv(#name, default_val)
-#define BM_DEFINE_kvpairs(name, default_val)      \
-  std::map<std::string, std::string> FLAG(name) = \
+  BENCHMARK_EXPORT std::string FLAG(name) = \
+      benchmark::StringFromEnv(#name, default_val)
+#define BM_DEFINE_kvpairs(name, default_val)                       \
+  BENCHMARK_EXPORT std::map<std::string, std::string> FLAG(name) = \
       benchmark::KvPairsFromEnv(#name, default_val)
 
 namespace benchmark {
@@ -35,6 +40,7 @@ namespace benchmark {
 //
 // If the variable exists, returns IsTruthyFlagValue() value;  if not,
 // returns the given default value.
+BENCHMARK_EXPORT
 bool BoolFromEnv(const char* flag, bool default_val);
 
 // Parses an Int32 from the environment variable corresponding to the given
@@ -42,6 +48,7 @@ bool BoolFromEnv(const char* flag, bool default_val);
 //
 // If the variable exists, returns ParseInt32() value;  if not, returns
 // the given default value.
+BENCHMARK_EXPORT
 int32_t Int32FromEnv(const char* flag, int32_t default_val);
 
 // Parses an Double from the environment variable corresponding to the given
@@ -49,6 +56,7 @@ int32_t Int32FromEnv(const char* flag, int32_t default_val);
 //
 // If the variable exists, returns ParseDouble();  if not, returns
 // the given default value.
+BENCHMARK_EXPORT
 double DoubleFromEnv(const char* flag, double default_val);
 
 // Parses a string from the environment variable corresponding to the given
@@ -56,6 +64,7 @@ double DoubleFromEnv(const char* flag, double default_val);
 //
 // If variable exists, returns its value;  if not, returns
 // the given default value.
+BENCHMARK_EXPORT
 const char* StringFromEnv(const char* flag, const char* default_val);
 
 // Parses a set of kvpairs from the environment variable corresponding to the
@@ -63,6 +72,7 @@ const char* StringFromEnv(const char* flag, const char* default_val);
 //
 // If variable exists, returns its value;  if not, returns
 // the given default value.
+BENCHMARK_EXPORT
 std::map<std::string, std::string> KvPairsFromEnv(
     const char* flag, std::map<std::string, std::string> default_val);
 
@@ -75,40 +85,47 @@ std::map<std::string, std::string> KvPairsFromEnv(
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
+BENCHMARK_EXPORT
 bool ParseBoolFlag(const char* str, const char* flag, bool* value);
 
 // Parses a string for an Int32 flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
+BENCHMARK_EXPORT
 bool ParseInt32Flag(const char* str, const char* flag, int32_t* value);
 
 // Parses a string for a Double flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
+BENCHMARK_EXPORT
 bool ParseDoubleFlag(const char* str, const char* flag, double* value);
 
 // Parses a string for a string flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
+BENCHMARK_EXPORT
 bool ParseStringFlag(const char* str, const char* flag, std::string* value);
 
 // Parses a string for a kvpairs flag in the form "--flag=key=value,key=value"
 //
 // On success, stores the value of the flag in *value and returns true. On
 // failure returns false, though *value may have been mutated.
+BENCHMARK_EXPORT
 bool ParseKeyValueFlag(const char* str, const char* flag,
                        std::map<std::string, std::string>* value);
 
 // Returns true if the string matches the flag.
+BENCHMARK_EXPORT
 bool IsFlag(const char* str, const char* flag);
 
 // Returns true unless value starts with one of: '0', 'f', 'F', 'n' or 'N', or
 // some non-alphanumeric character. Also returns false if the value matches
 // one of 'no', 'false', 'off' (case-insensitive). As a special case, also
 // returns true if value is the empty string.
+BENCHMARK_EXPORT
 bool IsTruthyFlagValue(const std::string& value);
 
 }  // end namespace benchmark
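
Since IsTruthyFlagValue() is the function that decides how both command-line
values and environment variables are interpreted, a quick illustration of the
rules documented above:

    #include <cassert>

    using benchmark::IsTruthyFlagValue;

    assert(IsTruthyFlagValue(""));      // empty string counts as true
    assert(IsTruthyFlagValue("1"));
    assert(IsTruthyFlagValue("yes"));
    assert(!IsTruthyFlagValue("0"));    // single '0', 'f', 'F', 'n', 'N' are false
    assert(!IsTruthyFlagValue("off"));  // as are "no", "false", "off" (any case)
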
diff --git a/third-party/benchmark/src/complexity.cc b/third-party/benchmark/src/complexity.cc
index 825c57394a8ca3..eee3122646f953 100644
--- a/third-party/benchmark/src/complexity.cc
+++ b/third-party/benchmark/src/complexity.cc
@@ -37,12 +37,14 @@ BigOFunc* FittingCurve(BigO complexity) {
       return [](IterationCount n) -> double { return std::pow(n, 3); };
     case oLogN:
       /* Note: can't use log2 because Android's GNU STL lacks it */
-      return
-          [](IterationCount n) { return kLog2E * log(static_cast<double>(n)); };
+      return [](IterationCount n) {
+        return kLog2E * std::log(static_cast<double>(n));
+      };
     case oNLogN:
       /* Note: can't use log2 because Android's GNU STL lacks it */
       return [](IterationCount n) {
-        return kLog2E * n * log(static_cast<double>(n));
+        return kLog2E * static_cast<double>(n) *
+               std::log(static_cast<double>(n));
       };
     case o1:
     default:
@@ -75,12 +77,12 @@ std::string GetBigOString(BigO complexity) {
 // given by the lambda expression.
 //   - n             : Vector containing the size of the benchmark tests.
 //   - time          : Vector containing the times for the benchmark tests.
-//   - fitting_curve : lambda expression (e.g. [](int64_t n) {return n; };).
+//   - fitting_curve : lambda expression (e.g. [](ComplexityN n) {return n; };).
 
 // For a deeper explanation on the algorithm logic, please refer to
 // https://en.wikipedia.org/wiki/Least_squares#Least_squares,_regression_analysis_and_statistics
 
-LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
+LeastSq MinimalLeastSq(const std::vector<ComplexityN>& n,
                        const std::vector<double>& time,
                        BigOFunc* fitting_curve) {
   double sigma_gn_squared = 0.0;
@@ -105,12 +107,12 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
   double rms = 0.0;
   for (size_t i = 0; i < n.size(); ++i) {
     double fit = result.coef * fitting_curve(n[i]);
-    rms += pow((time[i] - fit), 2);
+    rms += std::pow((time[i] - fit), 2);
   }
 
   // Normalized RMS by the mean of the observed values
-  double mean = sigma_time / n.size();
-  result.rms = sqrt(rms / n.size()) / mean;
+  double mean = sigma_time / static_cast<double>(n.size());
+  result.rms = std::sqrt(rms / static_cast<double>(n.size())) / mean;
 
   return result;
 }
@@ -122,7 +124,7 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
 //   - complexity : If different than oAuto, the fitting curve will stick to
 //                  this one. If it is oAuto, it will be calculated the best
 //                  fitting curve.
-LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
+LeastSq MinimalLeastSq(const std::vector<ComplexityN>& n,
                        const std::vector<double>& time, const BigO complexity) {
   BM_CHECK_EQ(n.size(), time.size());
   BM_CHECK_GE(n.size(), 2);  // Do not compute fitting curve is less than two
@@ -162,7 +164,7 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
   if (reports.size() < 2) return results;
 
   // Accumulators.
-  std::vector<int64_t> n;
+  std::vector<ComplexityN> n;
   std::vector<double> real_time;
   std::vector<double> cpu_time;
 
@@ -171,8 +173,10 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
     BM_CHECK_GT(run.complexity_n, 0)
         << "Did you forget to call SetComplexityN?";
     n.push_back(run.complexity_n);
-    real_time.push_back(run.real_accumulated_time / run.iterations);
-    cpu_time.push_back(run.cpu_accumulated_time / run.iterations);
+    real_time.push_back(run.real_accumulated_time /
+                        static_cast<double>(run.iterations));
+    cpu_time.push_back(run.cpu_accumulated_time /
+                       static_cast<double>(run.iterations));
   }
 
   LeastSq result_cpu;
@@ -182,8 +186,19 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
     result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity_lambda);
     result_real = MinimalLeastSq(n, real_time, reports[0].complexity_lambda);
   } else {
-    result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity);
-    result_real = MinimalLeastSq(n, real_time, result_cpu.complexity);
+    const BigO* InitialBigO = &reports[0].complexity;
+    const bool use_real_time_for_initial_big_o =
+        reports[0].use_real_time_for_initial_big_o;
+    if (use_real_time_for_initial_big_o) {
+      result_real = MinimalLeastSq(n, real_time, *InitialBigO);
+      InitialBigO = &result_real.complexity;
+      // The Big-O complexity for CPU time must have the same Big-O function!
+    }
+    result_cpu = MinimalLeastSq(n, cpu_time, *InitialBigO);
+    InitialBigO = &result_cpu.complexity;
+    if (!use_real_time_for_initial_big_o) {
+      result_real = MinimalLeastSq(n, real_time, *InitialBigO);
+    }
   }
 
   // Drop the 'args' when reporting complexity.
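
The ComputeBigO() path above only sees what benchmarks record via
SetComplexityN(); for context, the public-API side looks roughly like this
(BM_Sort is a made-up benchmark; needs <algorithm>, <numeric>, <vector>):

    static void BM_Sort(benchmark::State& state) {
      for (auto _ : state) {
        std::vector<int> v(static_cast<size_t>(state.range(0)));
        std::iota(v.rbegin(), v.rend(), 0);  // reverse-sorted input
        std::sort(v.begin(), v.end());
      }
      state.SetComplexityN(state.range(0));
    }
    // oAuto lets MinimalLeastSq pick the best-fitting curve;
    // benchmark::oNLogN would pin it instead.
    BENCHMARK(BM_Sort)
        ->RangeMultiplier(4)
        ->Range(1 << 10, 1 << 16)
        ->Complexity(benchmark::oAuto);
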
diff --git a/third-party/benchmark/src/complexity.h b/third-party/benchmark/src/complexity.h
index df29b48d29b4e5..0a0679b48bc872 100644
--- a/third-party/benchmark/src/complexity.h
+++ b/third-party/benchmark/src/complexity.h
@@ -31,7 +31,7 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
     const std::vector<BenchmarkReporter::Run>& reports);
 
 // This data structure will contain the result returned by MinimalLeastSq
-//   - coef        : Estimated coeficient for the high-order term as
+//   - coef        : Estimated coefficient for the high-order term as
 //                   interpolated from data.
 //   - rms         : Normalized Root Mean Squared Error.
 //   - complexity  : Scalability form (e.g. oN, oNLogN). In case a scalability
diff --git a/third-party/benchmark/src/console_reporter.cc b/third-party/benchmark/src/console_reporter.cc
index 04cc0b74e58ef4..35c3de2a4dbaea 100644
--- a/third-party/benchmark/src/console_reporter.cc
+++ b/third-party/benchmark/src/console_reporter.cc
@@ -33,6 +33,7 @@
 
 namespace benchmark {
 
+BENCHMARK_EXPORT
 bool ConsoleReporter::ReportContext(const Context& context) {
   name_field_width_ = context.name_field_width;
   printed_header_ = false;
@@ -41,17 +42,22 @@ bool ConsoleReporter::ReportContext(const Context& context) {
   PrintBasicContext(&GetErrorStream(), context);
 
 #ifdef BENCHMARK_OS_WINDOWS
-  if ((output_options_ & OO_Color) && &std::cout != &GetOutputStream()) {
-    GetErrorStream()
-        << "Color printing is only supported for stdout on windows."
-           " Disabling color printing\n";
-    output_options_ = static_cast<OutputOptions>(output_options_ & ~OO_Color);
+  if ((output_options_ & OO_Color)) {
+    auto stdOutBuf = std::cout.rdbuf();
+    auto outStreamBuf = GetOutputStream().rdbuf();
+    if (stdOutBuf != outStreamBuf) {
+      GetErrorStream()
+          << "Color printing is only supported for stdout on windows."
+             " Disabling color printing\n";
+      output_options_ = static_cast<OutputOptions>(output_options_ & ~OO_Color);
+    }
   }
 #endif
 
   return true;
 }
 
+BENCHMARK_EXPORT
 void ConsoleReporter::PrintHeader(const Run& run) {
   std::string str =
       FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
@@ -69,6 +75,7 @@ void ConsoleReporter::PrintHeader(const Run& run) {
   GetOutputStream() << line << "\n" << str << "\n" << line << "\n";
 }
 
+BENCHMARK_EXPORT
 void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
   for (const auto& run : reports) {
     // print the header:
@@ -99,6 +106,9 @@ static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt,
 }
 
 static std::string FormatTime(double time) {
+  // For the time columns of the console printer 13 characters are reserved.
+  // One of them is a space and at most two of them are the time unit (e.g.
+  // ns). That leaves 10 characters usable for the number itself.
   // Align decimal places...
   if (time < 1.0) {
     return FormatString("%10.3f", time);
@@ -109,9 +119,15 @@ static std::string FormatTime(double time) {
   if (time < 100.0) {
     return FormatString("%10.1f", time);
   }
+  // Assuming the time is at max 9.9999e+99 and we have 10 digits for the
+  // number, we get 10-1(.)-1(e)-1(sign)-2(exponent) = 5 digits to print.
+  if (time > 9999999999 /*max 10 digit number*/) {
+    return FormatString("%1.4e", time);
+  }
   return FormatString("%10.0f", time);
 }
 
+BENCHMARK_EXPORT
 void ConsoleReporter::PrintRunData(const Run& result) {
   typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...);
   auto& Out = GetOutputStream();
@@ -123,9 +139,13 @@ void ConsoleReporter::PrintRunData(const Run& result) {
   printer(Out, name_color, "%-*s ", name_field_width_,
           result.benchmark_name().c_str());
 
-  if (result.error_occurred) {
+  if (internal::SkippedWithError == result.skipped) {
     printer(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'",
-            result.error_message.c_str());
+            result.skip_message.c_str());
+    printer(Out, COLOR_DEFAULT, "\n");
+    return;
+  } else if (internal::SkippedWithMessage == result.skipped) {
+    printer(Out, COLOR_WHITE, "SKIPPED: \'%s\'", result.skip_message.c_str());
     printer(Out, COLOR_DEFAULT, "\n");
     return;
   }
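
The skipped-vs-error distinction the console reporter now prints comes from
the two skip entry points on State (SkipWithError is long-standing;
SkipWithMessage is the newer counterpart that maps to SkippedWithMessage, as
far as I can tell from this update). A sketch, with FeatureAvailable/DoWork as
stand-in helpers:

    static void BM_MaybeSkip(benchmark::State& state) {
      if (!FeatureAvailable()) {                         // hypothetical helper
        state.SkipWithMessage("feature not available");  // reported as SKIPPED
        return;
      }
      for (auto _ : state) {
        if (!DoWork()) {                                 // hypothetical helper
          state.SkipWithError("DoWork failed");          // reported as ERROR OCCURRED
          return;
        }
      }
    }
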
diff --git a/third-party/benchmark/src/counter.cc b/third-party/benchmark/src/counter.cc
index cf5b78ee3ac6b4..aa14cd8092f94e 100644
--- a/third-party/benchmark/src/counter.cc
+++ b/third-party/benchmark/src/counter.cc
@@ -27,10 +27,10 @@ double Finish(Counter const& c, IterationCount iterations, double cpu_time,
     v /= num_threads;
   }
   if (c.flags & Counter::kIsIterationInvariant) {
-    v *= iterations;
+    v *= static_cast<double>(iterations);
   }
   if (c.flags & Counter::kAvgIterations) {
-    v /= iterations;
+    v /= static_cast<double>(iterations);
   }
 
   if (c.flags & Counter::kInvert) {  // Invert is *always* last.
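
The flags handled in Finish() above are the same ones users attach to
state.counters; for example (total_misses is a hypothetical per-run tally
computed inside the benchmark body):

    // kAvgIterations divides the recorded total by the iteration count, so the
    // reported value is a per-iteration average; kIsIterationInvariant instead
    // multiplies an invariant per-iteration value by the iteration count.
    state.counters["CacheMisses"] = benchmark::Counter(
        static_cast<double>(total_misses), benchmark::Counter::kAvgIterations);
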
diff --git a/third-party/benchmark/src/csv_reporter.cc b/third-party/benchmark/src/csv_reporter.cc
index 1c5e9fa6689c0e..4b39e2c52fb91a 100644
--- a/third-party/benchmark/src/csv_reporter.cc
+++ b/third-party/benchmark/src/csv_reporter.cc
@@ -52,11 +52,13 @@ std::string CsvEscape(const std::string& s) {
   return '"' + tmp + '"';
 }
 
+BENCHMARK_EXPORT
 bool CSVReporter::ReportContext(const Context& context) {
   PrintBasicContext(&GetErrorStream(), context);
   return true;
 }
 
+BENCHMARK_EXPORT
 void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
   std::ostream& Out = GetOutputStream();
 
@@ -103,13 +105,14 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
   }
 }
 
+BENCHMARK_EXPORT
 void CSVReporter::PrintRunData(const Run& run) {
   std::ostream& Out = GetOutputStream();
   Out << CsvEscape(run.benchmark_name()) << ",";
-  if (run.error_occurred) {
+  if (run.skipped) {
     Out << std::string(elements.size() - 3, ',');
-    Out << "true,";
-    Out << CsvEscape(run.error_message) << "\n";
+    Out << std::boolalpha << (internal::SkippedWithError == run.skipped) << ",";
+    Out << CsvEscape(run.skip_message) << "\n";
     return;
   }
 
@@ -119,13 +122,21 @@ void CSVReporter::PrintRunData(const Run& run) {
   }
   Out << ",";
 
-  Out << run.GetAdjustedRealTime() << ",";
-  Out << run.GetAdjustedCPUTime() << ",";
+  if (run.run_type != Run::RT_Aggregate ||
+      run.aggregate_unit == StatisticUnit::kTime) {
+    Out << run.GetAdjustedRealTime() << ",";
+    Out << run.GetAdjustedCPUTime() << ",";
+  } else {
+    assert(run.aggregate_unit == StatisticUnit::kPercentage);
+    Out << run.real_accumulated_time << ",";
+    Out << run.cpu_accumulated_time << ",";
+  }
 
   // Do not print timeLabel on bigO and RMS report
   if (run.report_big_o) {
     Out << GetBigOString(run.complexity);
-  } else if (!run.report_rms) {
+  } else if (!run.report_rms &&
+             run.aggregate_unit != StatisticUnit::kPercentage) {
     Out << GetTimeUnitString(run.time_unit);
   }
   Out << ",";
diff --git a/third-party/benchmark/src/cycleclock.h b/third-party/benchmark/src/cycleclock.h
index d65d32a39d3d3b..eff563e7fac223 100644
--- a/third-party/benchmark/src/cycleclock.h
+++ b/third-party/benchmark/src/cycleclock.h
@@ -36,7 +36,8 @@
 // declarations of some other intrinsics, breaking compilation.
 // Therefore, we simply declare __rdtsc ourselves. See also
 // http://connect.microsoft.com/VisualStudio/feedback/details/262047
-#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64)
+#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64) && \
+    !defined(_M_ARM64EC)
 extern "C" uint64_t __rdtsc();
 #pragma intrinsic(__rdtsc)
 #endif
@@ -114,7 +115,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   // when I know it will work.  Otherwise, I'll use __rdtsc and hope
   // the code is being compiled with a non-ancient compiler.
   _asm rdtsc
-#elif defined(COMPILER_MSVC) && defined(_M_ARM64)
+#elif defined(COMPILER_MSVC) && (defined(_M_ARM64) || defined(_M_ARM64EC))
   // See // https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics
   // and https://reviews.llvm.org/D53115
   int64_t virtual_timer_value;
@@ -132,7 +133,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
 
   // Native Client does not provide any API to access cycle counter.
   // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
-  // because is provides nanosecond resolution (which is noticable at
+  // because it provides nanosecond resolution (which is noticeable at
   // least for PNaCl modules running on x86 Mac & Linux).
   // Initialize to always return 0 if clock_gettime fails.
   struct timespec ts = {0, 0};
@@ -173,7 +174,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
-#elif defined(__loongarch__)
+#elif defined(__loongarch__) || defined(__csky__)
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
@@ -188,15 +189,16 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
 #endif
   return tsc;
 #elif defined(__riscv)  // RISC-V
-  // Use RDCYCLE (and RDCYCLEH on riscv32)
+  // Use RDTIME (and RDTIMEH on riscv32).
+  // RDCYCLE is a privileged instruction since Linux 6.6.
 #if __riscv_xlen == 32
   uint32_t cycles_lo, cycles_hi0, cycles_hi1;
   // This asm also includes the PowerPC overflow handling strategy, as above.
   // Implemented in assembly because Clang insisted on branching.
   asm volatile(
-      "rdcycleh %0\n"
-      "rdcycle %1\n"
-      "rdcycleh %2\n"
+      "rdtimeh %0\n"
+      "rdtime %1\n"
+      "rdtimeh %2\n"
       "sub %0, %0, %2\n"
       "seqz %0, %0\n"
       "sub %0, zero, %0\n"
@@ -205,17 +207,31 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   return (static_cast<uint64_t>(cycles_hi1) << 32) | cycles_lo;
 #else
   uint64_t cycles;
-  asm volatile("rdcycle %0" : "=r"(cycles));
+  asm volatile("rdtime %0" : "=r"(cycles));
   return cycles;
 #endif
 #elif defined(__e2k__) || defined(__elbrus__)
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__hexagon__)
+  uint64_t pcycle;
+  asm volatile("%0 = C15:14" : "=r"(pcycle));
+  return static_cast<double>(pcycle);
+#elif defined(__alpha__)
+  // Alpha has a cycle counter, the PCC register, but it is an unsigned 32-bit
+  // integer and thus wraps every ~4s, which makes using it for tick counts
+  // unreliable beyond this time range.  The real-time clock is low-precision,
+  // roughly ~1ms, but it is the only option that can reasonably count
+  // indefinitely.
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
 #else
-// The soft failover to a generic implementation is automatic only for ARM.
-// For other platforms the developer is expected to make an attempt to create
-// a fast implementation and use generic version if nothing better is available.
+  // The soft failover to a generic implementation is automatic only for ARM.
+  // For other platforms the developer is expected to make an attempt to create
+  // a fast implementation and use generic version if nothing better is
+  // available.
 #error You need to define CycleTimer for your OS and CPU
 #endif
 }
diff --git a/third-party/benchmark/src/internal_macros.h b/third-party/benchmark/src/internal_macros.h
index 91f367b894bcd8..8dd7d0c6502e86 100644
--- a/third-party/benchmark/src/internal_macros.h
+++ b/third-party/benchmark/src/internal_macros.h
@@ -1,8 +1,6 @@
 #ifndef BENCHMARK_INTERNAL_MACROS_H_
 #define BENCHMARK_INTERNAL_MACROS_H_
 
-#include "benchmark/benchmark.h"
-
 /* Needed to detect STL */
 #include <cstdlib>
 
@@ -44,6 +42,19 @@
   #define BENCHMARK_OS_CYGWIN 1
 #elif defined(_WIN32)
   #define BENCHMARK_OS_WINDOWS 1
+  // WINAPI_FAMILY_PARTITION is defined in winapifamily.h.
+  // We include windows.h which implicitly includes winapifamily.h for compatibility.
+  #ifndef NOMINMAX
+    #define NOMINMAX
+  #endif
+  #include <windows.h>
+  #if defined(WINAPI_FAMILY_PARTITION)
+    #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+      #define BENCHMARK_OS_WINDOWS_WIN32 1
+    #elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+      #define BENCHMARK_OS_WINDOWS_RT 1
+    #endif
+  #endif
   #if defined(__MINGW32__)
     #define BENCHMARK_OS_MINGW 1
   #endif
@@ -80,6 +91,8 @@
 #define BENCHMARK_OS_QNX 1
 #elif defined(__MVS__)
 #define BENCHMARK_OS_ZOS 1
+#elif defined(__hexagon__)
+#define BENCHMARK_OS_QURT 1
 #endif
 
 #if defined(__ANDROID__) && defined(__GLIBCXX__)
diff --git a/third-party/benchmark/src/json_reporter.cc b/third-party/benchmark/src/json_reporter.cc
index e84a4ed24f9d86..b8c8c94c08a0f0 100644
--- a/third-party/benchmark/src/json_reporter.cc
+++ b/third-party/benchmark/src/json_reporter.cc
@@ -28,10 +28,6 @@
 #include "timers.h"
 
 namespace benchmark {
-namespace internal {
-extern std::map<std::string, std::string>* global_context;
-}
-
 namespace {
 
 std::string StrEscape(const std::string& s) {
@@ -89,12 +85,6 @@ std::string FormatKV(std::string const& key, int64_t value) {
   return ss.str();
 }
 
-std::string FormatKV(std::string const& key, IterationCount value) {
-  std::stringstream ss;
-  ss << '"' << StrEscape(key) << "\": " << value;
-  return ss.str();
-}
-
 std::string FormatKV(std::string const& key, double value) {
   std::stringstream ss;
   ss << '"' << StrEscape(key) << "\": ";
@@ -177,15 +167,25 @@ bool JSONReporter::ReportContext(const Context& context) {
   }
   out << "],\n";
 
+  out << indent << FormatKV("library_version", GetBenchmarkVersion());
+  out << ",\n";
+
 #if defined(NDEBUG)
   const char build_type[] = "release";
 #else
   const char build_type[] = "debug";
 #endif
   out << indent << FormatKV("library_build_type", build_type);
+  out << ",\n";
+
+  // NOTE: our json schema is not strictly tied to the library version!
+  out << indent << FormatKV("json_schema_version", int64_t(1));
+
+  std::map<std::string, std::string>* global_context =
+      internal::GetGlobalContext();
 
-  if (internal::global_context != nullptr) {
-    for (const auto& kv : *internal::global_context) {
+  if (global_context != nullptr) {
+    for (const auto& kv : *global_context) {
       out << ",\n";
       out << indent << FormatKV(kv.first, kv.second);
     }
@@ -261,9 +261,12 @@ void JSONReporter::PrintRunData(Run const& run) {
       BENCHMARK_UNREACHABLE();
     }()) << ",\n";
   }
-  if (run.error_occurred) {
-    out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n";
-    out << indent << FormatKV("error_message", run.error_message) << ",\n";
+  if (internal::SkippedWithError == run.skipped) {
+    out << indent << FormatKV("error_occurred", true) << ",\n";
+    out << indent << FormatKV("error_message", run.skip_message) << ",\n";
+  } else if (internal::SkippedWithMessage == run.skipped) {
+    out << indent << FormatKV("skipped", true) << ",\n";
+    out << indent << FormatKV("skip_message", run.skip_message) << ",\n";
   }
   if (!run.report_big_o && !run.report_rms) {
     out << indent << FormatKV("iterations", run.iterations) << ",\n";
@@ -301,7 +304,8 @@ void JSONReporter::PrintRunData(Run const& run) {
     out << ",\n"
         << indent << FormatKV("max_bytes_used", memory_result.max_bytes_used);
 
-    auto report_if_present = [&out, &indent](const char* label, int64_t val) {
+    auto report_if_present = [&out, &indent](const std::string& label,
+                                             int64_t val) {
       if (val != MemoryManager::TombstoneValue)
         out << ",\n" << indent << FormatKV(label, val);
     };
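
The key/value pairs the JSON reporter pulls from internal::GetGlobalContext()
are normally populated via the public AddCustomContext() call (or the
--benchmark_context=key=value flag); e.g.:

    // These end up in the top-level "context" object next to library_version etc.
    benchmark::AddCustomContext("compiler", "clang");
    benchmark::AddCustomContext("dataset", "synthetic-1M");
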
diff --git a/third-party/benchmark/src/log.h b/third-party/benchmark/src/log.h
index 48c071aded8f30..9a21400b096d53 100644
--- a/third-party/benchmark/src/log.h
+++ b/third-party/benchmark/src/log.h
@@ -4,7 +4,12 @@
 #include <iostream>
 #include <ostream>
 
-#include "benchmark/benchmark.h"
+// NOTE: this is also defined in benchmark.h but we're trying to avoid a
+// dependency.
+// The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer.
+#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
+#define BENCHMARK_HAS_CXX11
+#endif
 
 namespace benchmark {
 namespace internal {
@@ -23,7 +28,16 @@ class LogType {
  private:
   LogType(std::ostream* out) : out_(out) {}
   std::ostream* out_;
-  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(LogType);
+
+  // NOTE: we could use BENCHMARK_DISALLOW_COPY_AND_ASSIGN but we shouldn't have
+  // a dependency on benchmark.h from here.
+#ifndef BENCHMARK_HAS_CXX11
+  LogType(const LogType&);
+  LogType& operator=(const LogType&);
+#else
+  LogType(const LogType&) = delete;
+  LogType& operator=(const LogType&) = delete;
+#endif
 };
 
 template <class Tp>
@@ -47,13 +61,13 @@ inline int& LogLevel() {
 }
 
 inline LogType& GetNullLogInstance() {
-  static LogType log(nullptr);
-  return log;
+  static LogType null_log(static_cast<std::ostream*>(nullptr));
+  return null_log;
 }
 
 inline LogType& GetErrorLogInstance() {
-  static LogType log(&std::clog);
-  return log;
+  static LogType error_log(&std::clog);
+  return error_log;
 }
 
 inline LogType& GetLogInstanceForLevel(int level) {
diff --git a/third-party/benchmark/src/perf_counters.cc b/third-party/benchmark/src/perf_counters.cc
index b2ac7687efef65..d466e27e86f942 100644
--- a/third-party/benchmark/src/perf_counters.cc
+++ b/third-party/benchmark/src/perf_counters.cc
@@ -15,6 +15,7 @@
 #include "perf_counters.h"
 
 #include <cstring>
+#include <memory>
 #include <vector>
 
 #if defined HAVE_LIBPFM
@@ -28,105 +29,254 @@ namespace internal {
 constexpr size_t PerfCounterValues::kMaxCounters;
 
 #if defined HAVE_LIBPFM
+
+size_t PerfCounterValues::Read(const std::vector<int>& leaders) {
+  // Create a pointer for multiple reads
+  const size_t bufsize = values_.size() * sizeof(values_[0]);
+  char* ptr = reinterpret_cast<char*>(values_.data());
+  size_t size = bufsize;
+  for (int lead : leaders) {
+    auto read_bytes = ::read(lead, ptr, size);
+    if (read_bytes >= ssize_t(sizeof(uint64_t))) {
+      // Actual data bytes are all bytes minus initial padding
+      std::size_t data_bytes = read_bytes - sizeof(uint64_t);
+      // This should be very cheap since it's in hot cache
+      std::memmove(ptr, ptr + sizeof(uint64_t), data_bytes);
+      // Increment our counters
+      ptr += data_bytes;
+      size -= data_bytes;
+    } else {
+      int err = errno;
+      GetErrorLogInstance() << "Error reading lead " << lead << " errno:" << err
+                            << " " << ::strerror(err) << "\n";
+      return 0;
+    }
+  }
+  return (bufsize - size) / sizeof(uint64_t);
+}
+
 const bool PerfCounters::kSupported = true;
 
-bool PerfCounters::Initialize() { return pfm_initialize() == PFM_SUCCESS; }
+// Initializes libpfm only on the first call.  Returns whether that single
+// initialization was successful.
+bool PerfCounters::Initialize() {
+  // Function-scope static gets initialized only once on first call.
+  static const bool success = []() {
+    return pfm_initialize() == PFM_SUCCESS;
+  }();
+  return success;
+}
+
+bool PerfCounters::IsCounterSupported(const std::string& name) {
+  Initialize();
+  perf_event_attr_t attr;
+  std::memset(&attr, 0, sizeof(attr));
+  pfm_perf_encode_arg_t arg;
+  std::memset(&arg, 0, sizeof(arg));
+  arg.attr = &attr;
+  const int mode = PFM_PLM3;  // user mode only
+  int ret = pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT_EXT,
+                                      &arg);
+  return (ret == PFM_SUCCESS);
+}
 
 PerfCounters PerfCounters::Create(
     const std::vector<std::string>& counter_names) {
-  if (counter_names.empty()) {
-    return NoCounters();
-  }
-  if (counter_names.size() > PerfCounterValues::kMaxCounters) {
-    GetErrorLogInstance()
-        << counter_names.size()
-        << " counters were requested. The minimum is 1, the maximum is "
-        << PerfCounterValues::kMaxCounters << "\n";
-    return NoCounters();
+  if (!counter_names.empty()) {
+    Initialize();
   }
-  std::vector<int> counter_ids(counter_names.size());
 
-  const int mode = PFM_PLM3;  // user mode only
+  // Valid counters will populate these arrays but we start empty
+  std::vector<std::string> valid_names;
+  std::vector<int> counter_ids;
+  std::vector<int> leader_ids;
+
+  // Resize to the maximum possible
+  valid_names.reserve(counter_names.size());
+  counter_ids.reserve(counter_names.size());
+
+  const int kCounterMode = PFM_PLM3;  // user mode only
+
+  // Group leads will be assigned on demand. The idea is that once we cannot
+  // create a counter descriptor, the reason is that this group has maxed out
+  // so we set the group_id again to -1 and retry - giving the algorithm a
+  // chance to create a new group leader to hold the next set of counters.
+  int group_id = -1;
+
+  // Loop through all performance counters
   for (size_t i = 0; i < counter_names.size(); ++i) {
-    const bool is_first = i == 0;
-    struct perf_event_attr attr {};
-    attr.size = sizeof(attr);
-    const int group_id = !is_first ? counter_ids[0] : -1;
+    // We are about to push into the valid names vector;
+    // check that we have not already reached the maximum.
+    if (valid_names.size() == PerfCounterValues::kMaxCounters) {
+      // Log a message if we maxed out and stop adding
+      GetErrorLogInstance()
+          << counter_names.size() << " counters were requested. The maximum is "
+          << PerfCounterValues::kMaxCounters << " and " << valid_names.size()
+          << " were already added. All remaining counters will be ignored\n";
+      // stop the loop and return what we have already
+      break;
+    }
+
+    // Check if this name is empty
     const auto& name = counter_names[i];
     if (name.empty()) {
-      GetErrorLogInstance() << "A counter name was the empty string\n";
-      return NoCounters();
+      GetErrorLogInstance()
+          << "A performance counter name was the empty string\n";
+      continue;
     }
+
+    // Here "first" means first in a group, i.e. the group leader
+    const bool is_first = (group_id < 0);
+
+    // This struct will be populated by libpfm from the counter string
+    // and then fed into the syscall perf_event_open
+    struct perf_event_attr attr {};
+    attr.size = sizeof(attr);
+
+    // This is the input struct to libpfm.
     pfm_perf_encode_arg_t arg{};
     arg.attr = &attr;
-
-    const int pfm_get =
-        pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT, &arg);
+    const int pfm_get = pfm_get_os_event_encoding(name.c_str(), kCounterMode,
+                                                  PFM_OS_PERF_EVENT, &arg);
     if (pfm_get != PFM_SUCCESS) {
-      GetErrorLogInstance() << "Unknown counter name: " << name << "\n";
-      return NoCounters();
+      GetErrorLogInstance()
+          << "Unknown performance counter name: " << name << "\n";
+      continue;
     }
-    attr.disabled = is_first;
-    // Note: the man page for perf_event_create suggests inerit = true and
+
+    // We then proceed to populate the remaining fields in our attribute struct
+    // Note: the man page for perf_event_create suggests inherit = true and
     // read_format = PERF_FORMAT_GROUP don't work together, but that's not the
     // case.
+    attr.disabled = is_first;
     attr.inherit = true;
     attr.pinned = is_first;
     attr.exclude_kernel = true;
     attr.exclude_user = false;
     attr.exclude_hv = true;
-    // Read all counters in one read.
+
+    // Read all counters in a group in one read.
     attr.read_format = PERF_FORMAT_GROUP;
 
     int id = -1;
-    static constexpr size_t kNrOfSyscallRetries = 5;
-    // Retry syscall as it was interrupted often (b/64774091).
-    for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
-         ++num_retries) {
-      id = perf_event_open(&attr, 0, -1, group_id, 0);
-      if (id >= 0 || errno != EINTR) {
-        break;
+    while (id < 0) {
+      static constexpr size_t kNrOfSyscallRetries = 5;
+      // Retry syscall as it was interrupted often (b/64774091).
+      for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
+           ++num_retries) {
+        id = perf_event_open(&attr, 0, -1, group_id, 0);
+        if (id >= 0 || errno != EINTR) {
+          break;
+        }
+      }
+      if (id < 0) {
+        // If the file descriptor is negative we might have reached a limit
+        // in the current group. Set the group_id to -1 and retry
+        if (group_id >= 0) {
+          // Create a new group
+          group_id = -1;
+        } else {
+          // At this point we have already retried with a new group id and
+          // failed. We then give up.
+          break;
+        }
       }
     }
+
+    // We failed to get a new file descriptor. We might have reached a
+    // hardware limit that cannot be worked around even with group multiplexing
     if (id < 0) {
-      GetErrorLogInstance()
-          << "Failed to get a file descriptor for " << name << "\n";
-      return NoCounters();
-    }
+      GetErrorLogInstance() << "***WARNING** Failed to get a file descriptor "
+                               "for performance counter "
+                            << name << ". Ignoring\n";
 
-    counter_ids[i] = id;
+      // We give up on this counter but try to keep going
+      // as the others would be fine
+      continue;
+    }
+    if (group_id < 0) {
+      // This is a group leader: remember it and make it the current group
+      leader_ids.push_back(id);
+      group_id = id;
+    }
+    // This is a valid counter, add it to our descriptor's list
+    counter_ids.push_back(id);
+    valid_names.push_back(name);
   }
-  if (ioctl(counter_ids[0], PERF_EVENT_IOC_ENABLE) != 0) {
-    GetErrorLogInstance() << "Failed to start counters\n";
-    return NoCounters();
+
+  // Loop through all group leaders, activating them.
+  // There is another option of starting ALL counters in a process, but
+  // that would be too far-reaching an intrusion. If the user is using PMCs
+  // themselves then this would have a side effect on them. It is
+  // friendlier to loop through all groups individually.
+  for (int lead : leader_ids) {
+    if (ioctl(lead, PERF_EVENT_IOC_ENABLE) != 0) {
+      // This should never happen but if it does, we give up on the
+      // entire batch as recovery would be a mess.
+      GetErrorLogInstance() << "***WARNING*** Failed to start counters. "
+                               "Clearing out all counters.\n";
+
+      // Close all performance counters
+      for (int id : counter_ids) {
+        ::close(id);
+      }
+
+      // Return an empty object so our internal state is still good and
+      // the process can continue normally without impact
+      return NoCounters();
+    }
   }
 
-  return PerfCounters(counter_names, std::move(counter_ids));
+  return PerfCounters(std::move(valid_names), std::move(counter_ids),
+                      std::move(leader_ids));
 }
 
-PerfCounters::~PerfCounters() {
+void PerfCounters::CloseCounters() const {
   if (counter_ids_.empty()) {
     return;
   }
-  ioctl(counter_ids_[0], PERF_EVENT_IOC_DISABLE);
+  for (int lead : leader_ids_) {
+    ioctl(lead, PERF_EVENT_IOC_DISABLE);
+  }
   for (int fd : counter_ids_) {
     close(fd);
   }
 }
 #else   // defined HAVE_LIBPFM
+size_t PerfCounterValues::Read(const std::vector<int>&) { return 0; }
+
 const bool PerfCounters::kSupported = false;
 
 bool PerfCounters::Initialize() { return false; }
 
+bool PerfCounters::IsCounterSupported(const std::string&) { return false; }
+
 PerfCounters PerfCounters::Create(
     const std::vector<std::string>& counter_names) {
   if (!counter_names.empty()) {
-    GetErrorLogInstance() << "Performance counters not supported.";
+    GetErrorLogInstance() << "Performance counters not supported.\n";
   }
   return NoCounters();
 }
 
-PerfCounters::~PerfCounters() = default;
+void PerfCounters::CloseCounters() const {}
 #endif  // defined HAVE_LIBPFM
+
+PerfCountersMeasurement::PerfCountersMeasurement(
+    const std::vector<std::string>& counter_names)
+    : start_values_(counter_names.size()), end_values_(counter_names.size()) {
+  counters_ = PerfCounters::Create(counter_names);
+}
+
+PerfCounters& PerfCounters::operator=(PerfCounters&& other) noexcept {
+  if (this != &other) {
+    CloseCounters();
+
+    counter_ids_ = std::move(other.counter_ids_);
+    leader_ids_ = std::move(other.leader_ids_);
+    counter_names_ = std::move(other.counter_names_);
+  }
+  return *this;
+}
 }  // namespace internal
 }  // namespace benchmark
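
An illustrative sketch, not part of the patch: with read_format = PERF_FORMAT_GROUP and no PERF_FORMAT_TOTAL_TIME_* or PERF_FORMAT_ID bits (the configuration used above), a single ::read() on a group leader returns a leading "u64 nr" word followed by one u64 value per counter, per perf_event_open(2). The hypothetical helper below shows how such a buffer would be unpacked; PerfCounterValues::Read() does the equivalent in place with memmove so the values from all groups end up contiguous.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

std::vector<uint64_t> UnpackGroupRead(const char* buf, std::size_t buf_bytes) {
  uint64_t nr = 0;
  std::memcpy(&nr, buf, sizeof(nr));  // leading word: number of counters
  std::vector<uint64_t> values;
  if (buf_bytes < (nr + 1) * sizeof(uint64_t)) return values;  // short read
  values.resize(nr);
  // Counter values follow immediately after the `nr` header word.
  std::memcpy(values.data(), buf + sizeof(nr), nr * sizeof(uint64_t));
  return values;
}
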
diff --git a/third-party/benchmark/src/perf_counters.h b/third-party/benchmark/src/perf_counters.h
index 47ca1385e24dd7..bf5eb6bc3aec9d 100644
--- a/third-party/benchmark/src/perf_counters.h
+++ b/third-party/benchmark/src/perf_counters.h
@@ -17,16 +17,25 @@
 
 #include <array>
 #include <cstdint>
+#include <cstring>
+#include <memory>
 #include <vector>
 
 #include "benchmark/benchmark.h"
 #include "check.h"
 #include "log.h"
+#include "mutex.h"
 
 #ifndef BENCHMARK_OS_WINDOWS
 #include <unistd.h>
 #endif
 
+#if defined(_MSC_VER)
+#pragma warning(push)
+// C4251: <symbol> needs to have dll-interface to be used by clients of class
+#pragma warning(disable : 4251)
+#endif
+
 namespace benchmark {
 namespace internal {
 
@@ -36,18 +45,21 @@ namespace internal {
 // The implementation ensures the storage is inlined, and allows 0-based
 // indexing into the counter values.
 // The object is used in conjunction with a PerfCounters object, by passing it
-// to Snapshot(). The values are populated such that
-// perfCounters->names()[i]'s value is obtained at position i (as given by
-// operator[]) of this object.
-class PerfCounterValues {
+// to Snapshot(). The Read() method compacts the individual group reads,
+// discarding each group leader's initial padding in the values buffer, so
+// that all user accesses through the [] operator are correct.
+class BENCHMARK_EXPORT PerfCounterValues {
  public:
   explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) {
     BM_CHECK_LE(nr_counters_, kMaxCounters);
   }
 
-  uint64_t operator[](size_t pos) const { return values_[kPadding + pos]; }
+  // Read() already compacts the values, so operator[] no longer skips padding
+  uint64_t operator[](size_t pos) const { return values_[pos]; }
 
-  static constexpr size_t kMaxCounters = 3;
+  // The maximum is raised only to 32 since the buffer
+  // is std::array<> backed
+  static constexpr size_t kMaxCounters = 32;
 
  private:
   friend class PerfCounters;
@@ -58,7 +70,14 @@ class PerfCounterValues {
             sizeof(uint64_t) * (kPadding + nr_counters_)};
   }
 
-  static constexpr size_t kPadding = 1;
+  // The read logic is complex, and since the goal of this class is to
+  // abstract away the intricacies of the reading process, this is
+  // a better place for it
+  size_t Read(const std::vector<int>& leaders);
+
+  // The padding is raised to 2 because of the reading algorithm (the 1st
+  // padding plus a current-read padding)
+  static constexpr size_t kPadding = 2;
   std::array<uint64_t, kPadding + kMaxCounters> values_;
   const size_t nr_counters_;
 };
@@ -66,27 +85,34 @@ class PerfCounterValues {
 // Collect PMU counters. The object, once constructed, is ready to be used by
 // calling read(). PMU counter collection is enabled from the time create() is
 // called, to obtain the object, until the object's destructor is called.
-class PerfCounters final {
+class BENCHMARK_EXPORT PerfCounters final {
  public:
   // True iff this platform supports performance counters.
   static const bool kSupported;
 
-  bool IsValid() const { return is_valid_; }
+  // Returns an empty object
   static PerfCounters NoCounters() { return PerfCounters(); }
 
-  ~PerfCounters();
+  ~PerfCounters() { CloseCounters(); }
+  PerfCounters() = default;
   PerfCounters(PerfCounters&&) = default;
   PerfCounters(const PerfCounters&) = delete;
+  PerfCounters& operator=(PerfCounters&&) noexcept;
+  PerfCounters& operator=(const PerfCounters&) = delete;
 
   // Platform-specific implementations may choose to do some library
   // initialization here.
   static bool Initialize();
 
+  // Checks whether the given counter is supported, in case the app wants
+  // to verify it before passing it in
+  static bool IsCounterSupported(const std::string& name);
+
   // Return a PerfCounters object ready to read the counters with the names
   // specified. The values are user-mode only. The counter name format is
   // implementation and OS specific.
-  // TODO: once we move to C++-17, this should be a std::optional, and then the
-  // IsValid() boolean can be dropped.
+  // In case of failure, this method will in the worst case return an
+  // empty object whose state will still be valid.
   static PerfCounters Create(const std::vector<std::string>& counter_names);
 
   // Take a snapshot of the current value of the counters into the provided
@@ -95,10 +121,7 @@ class PerfCounters final {
   BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) const {
 #ifndef BENCHMARK_OS_WINDOWS
     assert(values != nullptr);
-    assert(IsValid());
-    auto buffer = values->get_data_buffer();
-    auto read_bytes = ::read(counter_ids_[0], buffer.first, buffer.second);
-    return static_cast<size_t>(read_bytes) == buffer.second;
+    return values->Read(leader_ids_) == counter_ids_.size();
 #else
     (void)values;
     return false;
@@ -110,63 +133,68 @@ class PerfCounters final {
 
  private:
   PerfCounters(const std::vector<std::string>& counter_names,
-               std::vector<int>&& counter_ids)
+               std::vector<int>&& counter_ids, std::vector<int>&& leader_ids)
       : counter_ids_(std::move(counter_ids)),
-        counter_names_(counter_names),
-        is_valid_(true) {}
-  PerfCounters() : is_valid_(false) {}
+        leader_ids_(std::move(leader_ids)),
+        counter_names_(counter_names) {}
+
+  void CloseCounters() const;
 
   std::vector<int> counter_ids_;
-  const std::vector<std::string> counter_names_;
-  const bool is_valid_;
+  std::vector<int> leader_ids_;
+  std::vector<std::string> counter_names_;
 };
 
 // Typical usage of the above primitives.
-class PerfCountersMeasurement final {
+class BENCHMARK_EXPORT PerfCountersMeasurement final {
  public:
-  PerfCountersMeasurement(PerfCounters&& c)
-      : counters_(std::move(c)),
-        start_values_(counters_.IsValid() ? counters_.names().size() : 0),
-        end_values_(counters_.IsValid() ? counters_.names().size() : 0) {}
+  PerfCountersMeasurement(const std::vector<std::string>& counter_names);
+
+  size_t num_counters() const { return counters_.num_counters(); }
 
-  bool IsValid() const { return counters_.IsValid(); }
+  std::vector<std::string> names() const { return counters_.names(); }
 
-  BENCHMARK_ALWAYS_INLINE void Start() {
-    assert(IsValid());
+  BENCHMARK_ALWAYS_INLINE bool Start() {
+    if (num_counters() == 0) return true;
     // Tell the compiler to not move instructions above/below where we take
     // the snapshot.
     ClobberMemory();
-    counters_.Snapshot(&start_values_);
+    valid_read_ &= counters_.Snapshot(&start_values_);
     ClobberMemory();
+
+    return valid_read_;
   }
 
-  BENCHMARK_ALWAYS_INLINE std::vector<std::pair<std::string, double>>
-  StopAndGetMeasurements() {
-    assert(IsValid());
+  BENCHMARK_ALWAYS_INLINE bool Stop(
+      std::vector<std::pair<std::string, double>>& measurements) {
+    if (num_counters() == 0) return true;
     // Tell the compiler to not move instructions above/below where we take
     // the snapshot.
     ClobberMemory();
-    counters_.Snapshot(&end_values_);
+    valid_read_ &= counters_.Snapshot(&end_values_);
     ClobberMemory();
 
-    std::vector<std::pair<std::string, double>> ret;
     for (size_t i = 0; i < counters_.names().size(); ++i) {
       double measurement = static_cast<double>(end_values_[i]) -
                            static_cast<double>(start_values_[i]);
-      ret.push_back({counters_.names()[i], measurement});
+      measurements.push_back({counters_.names()[i], measurement});
     }
-    return ret;
+
+    return valid_read_;
   }
 
  private:
   PerfCounters counters_;
+  bool valid_read_ = true;
   PerfCounterValues start_values_;
   PerfCounterValues end_values_;
 };
 
-BENCHMARK_UNUSED static bool perf_init_anchor = PerfCounters::Initialize();
-
 }  // namespace internal
 }  // namespace benchmark
 
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
 #endif  // BENCHMARK_PERF_COUNTERS_H
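
As a usage illustration of the reworked PerfCountersMeasurement interface declared above (a hypothetical sketch, not taken from the patch; the counter names and the include path are placeholder assumptions, and any event names must be valid libpfm names on the target machine):

#include <string>
#include <utility>
#include <vector>

#include "perf_counters.h"  // benchmark/src internal header (assumed path)

void MeasureRegion() {
  using benchmark::internal::PerfCountersMeasurement;
  // Placeholder event names; availability depends on libpfm and the CPU.
  PerfCountersMeasurement pcm({"CYCLES", "INSTRUCTIONS"});
  std::vector<std::pair<std::string, double>> deltas;
  if (!pcm.Start()) return;  // snapshot failures are now reported via bool
  // ... region under measurement ...
  if (!pcm.Stop(deltas)) return;
  // deltas[i] is {counter name, end value - start value} for the region.
}
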
diff --git a/third-party/benchmark/src/re.h b/third-party/benchmark/src/re.h
index 630046782dc487..9afb869bea27ee 100644
--- a/third-party/benchmark/src/re.h
+++ b/third-party/benchmark/src/re.h
@@ -33,7 +33,7 @@
 // Prefer C regex libraries when compiling w/o exceptions so that we can
 // correctly report errors.
 #if defined(BENCHMARK_HAS_NO_EXCEPTIONS) && \
-    defined(BENCHMARK_HAVE_STD_REGEX) && \
+    defined(HAVE_STD_REGEX) && \
     (defined(HAVE_GNU_POSIX_REGEX) || defined(HAVE_POSIX_REGEX))
   #undef HAVE_STD_REGEX
 #endif
diff --git a/third-party/benchmark/src/reporter.cc b/third-party/benchmark/src/reporter.cc
index 1d2df17b90f013..076bc31a2eccc2 100644
--- a/third-party/benchmark/src/reporter.cc
+++ b/third-party/benchmark/src/reporter.cc
@@ -25,9 +25,6 @@
 #include "timers.h"
 
 namespace benchmark {
-namespace internal {
-extern std::map<std::string, std::string> *global_context;
-}
 
 BenchmarkReporter::BenchmarkReporter()
     : output_stream_(&std::cout), error_stream_(&std::cerr) {}
@@ -39,7 +36,11 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
   BM_CHECK(out) << "cannot be null";
   auto &Out = *out;
 
+#ifndef BENCHMARK_OS_QURT
+  // Date/time information is not available on QuRT.
+  // Attempting to get it via this call causes the binary to crash.
   Out << LocalDateTimeString() << "\n";
+#endif
 
   if (context.executable_name)
     Out << "Running " << context.executable_name << "\n";
@@ -67,8 +68,11 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
     Out << "\n";
   }
 
-  if (internal::global_context != nullptr) {
-    for (const auto &kv : *internal::global_context) {
+  std::map<std::string, std::string> *global_context =
+      internal::GetGlobalContext();
+
+  if (global_context != nullptr) {
+    for (const auto &kv : *global_context) {
       Out << kv.first << ": " << kv.second << "\n";
     }
   }
diff --git a/third-party/benchmark/src/sleep.cc b/third-party/benchmark/src/sleep.cc
deleted file mode 100644
index ab59000f24adf8..00000000000000
--- a/third-party/benchmark/src/sleep.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "sleep.h"
-
-#include <cerrno>
-#include <cstdlib>
-#include <ctime>
-
-#include "internal_macros.h"
-
-#ifdef BENCHMARK_OS_WINDOWS
-#include <windows.h>
-#endif
-
-#ifdef BENCHMARK_OS_ZOS
-#include <unistd.h>
-#endif
-
-namespace benchmark {
-#ifdef BENCHMARK_OS_WINDOWS
-// Window's Sleep takes milliseconds argument.
-void SleepForMilliseconds(int milliseconds) { Sleep(milliseconds); }
-void SleepForSeconds(double seconds) {
-  SleepForMilliseconds(static_cast<int>(kNumMillisPerSecond * seconds));
-}
-#else  // BENCHMARK_OS_WINDOWS
-void SleepForMicroseconds(int microseconds) {
-#ifdef BENCHMARK_OS_ZOS
-  // z/OS does not support nanosleep. Instead call sleep() and then usleep() to
-  // sleep for the remaining microseconds because usleep() will fail if its
-  // argument is greater than 1000000.
-  div_t sleepTime = div(microseconds, kNumMicrosPerSecond);
-  int seconds = sleepTime.quot;
-  while (seconds != 0) seconds = sleep(seconds);
-  while (usleep(sleepTime.rem) == -1 && errno == EINTR)
-    ;
-#else
-  struct timespec sleep_time;
-  sleep_time.tv_sec = microseconds / kNumMicrosPerSecond;
-  sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro;
-  while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR)
-    ;  // Ignore signals and wait for the full interval to elapse.
-#endif
-}
-
-void SleepForMilliseconds(int milliseconds) {
-  SleepForMicroseconds(milliseconds * kNumMicrosPerMilli);
-}
-
-void SleepForSeconds(double seconds) {
-  SleepForMicroseconds(static_cast<int>(seconds * kNumMicrosPerSecond));
-}
-#endif  // BENCHMARK_OS_WINDOWS
-}  // end namespace benchmark
diff --git a/third-party/benchmark/src/sleep.h b/third-party/benchmark/src/sleep.h
deleted file mode 100644
index f98551afe28491..00000000000000
--- a/third-party/benchmark/src/sleep.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef BENCHMARK_SLEEP_H_
-#define BENCHMARK_SLEEP_H_
-
-namespace benchmark {
-const int kNumMillisPerSecond = 1000;
-const int kNumMicrosPerMilli = 1000;
-const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000;
-const int kNumNanosPerMicro = 1000;
-const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;
-
-void SleepForMilliseconds(int milliseconds);
-void SleepForSeconds(double seconds);
-}  // end namespace benchmark
-
-#endif  // BENCHMARK_SLEEP_H_
diff --git a/third-party/benchmark/src/statistics.cc b/third-party/benchmark/src/statistics.cc
index 3e5ef099397138..261dcb299a6773 100644
--- a/third-party/benchmark/src/statistics.cc
+++ b/third-party/benchmark/src/statistics.cc
@@ -32,7 +32,7 @@ auto StatisticsSum = [](const std::vector<double>& v) {
 
 double StatisticsMean(const std::vector<double>& v) {
   if (v.empty()) return 0.0;
-  return StatisticsSum(v) * (1.0 / v.size());
+  return StatisticsSum(v) * (1.0 / static_cast<double>(v.size()));
 }
 
 double StatisticsMedian(const std::vector<double>& v) {
@@ -42,13 +42,13 @@ double StatisticsMedian(const std::vector<double>& v) {
   auto center = copy.begin() + v.size() / 2;
   std::nth_element(copy.begin(), center, copy.end());
 
-  // did we have an odd number of samples?
-  // if yes, then center is the median
-  // it no, then we are looking for the average between center and the value
-  // before
+  // Did we have an odd number of samples?  If yes, then center is the median.
+  // If not, then we are looking for the average between center and the value
+  // before.  Instead of re-sorting, we just look for the max value before it,
+  // which is not necessarily the element immediately preceding `center`,
+  // since `copy` is only partially sorted by `nth_element`.
   if (v.size() % 2 == 1) return *center;
-  auto center2 = copy.begin() + v.size() / 2 - 1;
-  std::nth_element(copy.begin(), center2, copy.end());
+  auto center2 = std::max_element(copy.begin(), center);
   return (*center + *center2) / 2.0;
 }
 
@@ -71,8 +71,11 @@ double StatisticsStdDev(const std::vector<double>& v) {
   // Sample standard deviation is undefined for n = 1
   if (v.size() == 1) return 0.0;
 
-  const double avg_squares = SumSquares(v) * (1.0 / v.size());
-  return Sqrt(v.size() / (v.size() - 1.0) * (avg_squares - Sqr(mean)));
+  const double avg_squares =
+      SumSquares(v) * (1.0 / static_cast<double>(v.size()));
+  return Sqrt(static_cast<double>(v.size()) /
+              (static_cast<double>(v.size()) - 1.0) *
+              (avg_squares - Sqr(mean)));
 }
 
 double StatisticsCV(const std::vector<double>& v) {
@@ -81,6 +84,8 @@ double StatisticsCV(const std::vector<double>& v) {
   const auto stddev = StatisticsStdDev(v);
   const auto mean = StatisticsMean(v);
 
+  if (std::fpclassify(mean) == FP_ZERO) return 0.0;
+
   return stddev / mean;
 }
 
@@ -89,9 +94,8 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
   typedef BenchmarkReporter::Run Run;
   std::vector<Run> results;
 
-  auto error_count =
-      std::count_if(reports.begin(), reports.end(),
-                    [](Run const& run) { return run.error_occurred; });
+  auto error_count = std::count_if(reports.begin(), reports.end(),
+                                   [](Run const& run) { return run.skipped; });
 
   if (reports.size() - error_count < 2) {
     // We don't report aggregated data if there was a single run.
@@ -118,11 +122,13 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
     for (auto const& cnt : r.counters) {
       auto it = counter_stats.find(cnt.first);
       if (it == counter_stats.end()) {
-        counter_stats.insert({cnt.first, {cnt.second, std::vector<double>{}}});
-        it = counter_stats.find(cnt.first);
+        it = counter_stats
+                 .emplace(cnt.first,
+                          CounterStat{cnt.second, std::vector<double>{}})
+                 .first;
         it->second.s.reserve(reports.size());
       } else {
-        BM_CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
+        BM_CHECK_EQ(it->second.c.flags, cnt.second.flags);
       }
     }
   }
@@ -131,7 +137,7 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
   for (Run const& run : reports) {
     BM_CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
     BM_CHECK_EQ(run_iterations, run.iterations);
-    if (run.error_occurred) continue;
+    if (run.skipped) continue;
     real_accumulated_time_stat.emplace_back(run.real_accumulated_time);
     cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time);
     // user counters
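
A small worked example of the new even-count median path in StatisticsMedian (an illustrative sketch, not from the patch): nth_element places the upper-middle value at center and partitions everything before it to be no larger, so the lower-middle value is simply the maximum of that prefix and no second nth_element is needed.

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<double> copy = {5.0, 1.0, 4.0, 2.0};       // sorted: 1 2 4 5
  auto center = copy.begin() + copy.size() / 2;           // upper-middle slot
  std::nth_element(copy.begin(), center, copy.end());     // *center == 4
  auto center2 = std::max_element(copy.begin(), center);  // *center2 == 2
  assert((*center + *center2) / 2.0 == 3.0);               // median of {1,2,4,5}
  return 0;
}
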
diff --git a/third-party/benchmark/src/statistics.h b/third-party/benchmark/src/statistics.h
index a9545a58c64844..6e5560e8f19f9a 100644
--- a/third-party/benchmark/src/statistics.h
+++ b/third-party/benchmark/src/statistics.h
@@ -22,15 +22,21 @@
 
 namespace benchmark {
 
-// Return a vector containing the mean, median and standard devation information
-// (and any user-specified info) for the specified list of reports. If 'reports'
-// contains less than two non-errored runs an empty vector is returned
+// Return a vector containing the mean, median and standard deviation
+// information (and any user-specified info) for the specified list of reports.
+// If 'reports' contains fewer than two non-errored runs an empty vector is
+// returned.
+BENCHMARK_EXPORT
 std::vector<BenchmarkReporter::Run> ComputeStats(
     const std::vector<BenchmarkReporter::Run>& reports);
 
+BENCHMARK_EXPORT
 double StatisticsMean(const std::vector<double>& v);
+BENCHMARK_EXPORT
 double StatisticsMedian(const std::vector<double>& v);
+BENCHMARK_EXPORT
 double StatisticsStdDev(const std::vector<double>& v);
+BENCHMARK_EXPORT
 double StatisticsCV(const std::vector<double>& v);
 
 }  // end namespace benchmark
diff --git a/third-party/benchmark/src/string_util.cc b/third-party/benchmark/src/string_util.cc
index 401fa13df7afe5..c69e40a8133cc5 100644
--- a/third-party/benchmark/src/string_util.cc
+++ b/third-party/benchmark/src/string_util.cc
@@ -11,16 +11,17 @@
 #include <sstream>
 
 #include "arraysize.h"
+#include "benchmark/benchmark.h"
 
 namespace benchmark {
 namespace {
-
 // kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta.
-const char kBigSIUnits[] = "kMGTPEZY";
+const char* const kBigSIUnits[] = {"k", "M", "G", "T", "P", "E", "Z", "Y"};
 // Kibi, Mebi, Gibi, Tebi, Pebi, Exbi, Zebi, Yobi.
-const char kBigIECUnits[] = "KMGTPEZY";
+const char* const kBigIECUnits[] = {"Ki", "Mi", "Gi", "Ti",
+                                    "Pi", "Ei", "Zi", "Yi"};
 // milli, micro, nano, pico, femto, atto, zepto, yocto.
-const char kSmallSIUnits[] = "munpfazy";
+const char* const kSmallSIUnits[] = {"m", "u", "n", "p", "f", "a", "z", "y"};
 
 // We require that all three arrays have the same size.
 static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits),
@@ -30,9 +31,8 @@ static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits),
 
 static const int64_t kUnitsSize = arraysize(kBigSIUnits);
 
-void ToExponentAndMantissa(double val, double thresh, int precision,
-                           double one_k, std::string* mantissa,
-                           int64_t* exponent) {
+void ToExponentAndMantissa(double val, int precision, double one_k,
+                           std::string* mantissa, int64_t* exponent) {
   std::stringstream mantissa_stream;
 
   if (val < 0) {
@@ -43,8 +43,8 @@ void ToExponentAndMantissa(double val, double thresh, int precision,
   // Adjust threshold so that it never excludes things which can't be rendered
   // in 'precision' digits.
   const double adjusted_threshold =
-      std::max(thresh, 1.0 / std::pow(10.0, precision));
-  const double big_threshold = adjusted_threshold * one_k;
+      std::max(1.0, 1.0 / std::pow(10.0, precision));
+  const double big_threshold = (adjusted_threshold * one_k) - 1;
   const double small_threshold = adjusted_threshold;
   // Values in ]simple_threshold,small_threshold[ will be printed as-is
   const double simple_threshold = 0.01;
@@ -92,37 +92,20 @@ std::string ExponentToPrefix(int64_t exponent, bool iec) {
   const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1);
   if (index >= kUnitsSize) return "";
 
-  const char* array =
+  const char* const* array =
       (exponent > 0 ? (iec ? kBigIECUnits : kBigSIUnits) : kSmallSIUnits);
-  if (iec)
-    return array[index] + std::string("i");
-  else
-    return std::string(1, array[index]);
+
+  return std::string(array[index]);
 }
 
-std::string ToBinaryStringFullySpecified(double value, double threshold,
-                                         int precision, double one_k = 1024.0) {
+std::string ToBinaryStringFullySpecified(double value, int precision,
+                                         Counter::OneK one_k) {
   std::string mantissa;
   int64_t exponent;
-  ToExponentAndMantissa(value, threshold, precision, one_k, &mantissa,
+  ToExponentAndMantissa(value, precision,
+                        one_k == Counter::kIs1024 ? 1024.0 : 1000.0, &mantissa,
                         &exponent);
-  return mantissa + ExponentToPrefix(exponent, false);
-}
-
-}  // end namespace
-
-void AppendHumanReadable(int n, std::string* str) {
-  std::stringstream ss;
-  // Round down to the nearest SI prefix.
-  ss << ToBinaryStringFullySpecified(n, 1.0, 0);
-  *str += ss.str();
-}
-
-std::string HumanReadableNumber(double n, double one_k) {
-  // 1.1 means that figures up to 1.1k should be shown with the next unit down;
-  // this softens edge effects.
-  // 1 means that we should show one decimal place of precision.
-  return ToBinaryStringFullySpecified(n, 1.1, 1, one_k);
+  return mantissa + ExponentToPrefix(exponent, one_k == Counter::kIs1024);
 }
 
 std::string StrFormatImp(const char* msg, va_list args) {
@@ -133,21 +116,21 @@ std::string StrFormatImp(const char* msg, va_list args) {
   // TODO(ericwf): use std::array for first attempt to avoid one memory
   // allocation guess what the size might be
   std::array<char, 256> local_buff;
-  std::size_t size = local_buff.size();
+
   // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation
   // in the android-ndk
-  auto ret = vsnprintf(local_buff.data(), size, msg, args_cp);
+  auto ret = vsnprintf(local_buff.data(), local_buff.size(), msg, args_cp);
 
   va_end(args_cp);
 
   // handle empty expansion
   if (ret == 0) return std::string{};
-  if (static_cast<std::size_t>(ret) < size)
+  if (static_cast<std::size_t>(ret) < local_buff.size())
     return std::string(local_buff.data());
 
   // we did not provide a long enough buffer on our first attempt.
   // add 1 to size to account for null-byte in size cast to prevent overflow
-  size = static_cast<std::size_t>(ret) + 1;
+  std::size_t size = static_cast<std::size_t>(ret) + 1;
   auto buff_ptr = std::unique_ptr<char[]>(new char[size]);
   // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation
   // in the android-ndk
@@ -155,6 +138,12 @@ std::string StrFormatImp(const char* msg, va_list args) {
   return std::string(buff_ptr.get());
 }
 
+}  // end namespace
+
+std::string HumanReadableNumber(double n, Counter::OneK one_k) {
+  return ToBinaryStringFullySpecified(n, 1, one_k);
+}
+
 std::string StrFormat(const char* format, ...) {
   va_list args;
   va_start(args, format);
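
For reference, a self-contained sketch (not from the patch) of the prefix lookup that ExponentToPrefix now performs with the string tables above: a positive exponent selects from the big SI or IEC tables and a negative one from the small SI table, with the table entries copied here for illustration.

#include <cassert>
#include <cstdint>
#include <string>

std::string PrefixFor(int64_t exponent, bool iec) {
  static const char* const kBigSI[] = {"k", "M", "G", "T", "P", "E", "Z", "Y"};
  static const char* const kBigIEC[] = {"Ki", "Mi", "Gi", "Ti",
                                        "Pi", "Ei", "Zi", "Yi"};
  static const char* const kSmallSI[] = {"m", "u", "n", "p",
                                         "f", "a", "z", "y"};
  if (exponent == 0) return "";
  const int64_t index = exponent > 0 ? exponent - 1 : -exponent - 1;
  if (index >= 8) return "";
  const char* const* table =
      exponent > 0 ? (iec ? kBigIEC : kBigSI) : kSmallSI;
  return table[index];
}

int main() {
  assert(PrefixFor(2, /*iec=*/true) == "Mi");   // e.g. 1024^2 -> "Mi"
  assert(PrefixFor(2, /*iec=*/false) == "M");   // e.g. 1000^2 -> "M"
  assert(PrefixFor(-1, /*iec=*/false) == "m");  // e.g. 1000^-1 -> "m"
  return 0;
}
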
diff --git a/third-party/benchmark/src/string_util.h b/third-party/benchmark/src/string_util.h
index ff3b7da47d6cb9..731aa2c04c3e0a 100644
--- a/third-party/benchmark/src/string_util.h
+++ b/third-party/benchmark/src/string_util.h
@@ -4,15 +4,19 @@
 #include <sstream>
 #include <string>
 #include <utility>
+#include <vector>
 
+#include "benchmark/benchmark.h"
+#include "benchmark/export.h"
+#include "check.h"
 #include "internal_macros.h"
 
 namespace benchmark {
 
-void AppendHumanReadable(int n, std::string* str);
-
-std::string HumanReadableNumber(double n, double one_k = 1024.0);
+BENCHMARK_EXPORT
+std::string HumanReadableNumber(double n, Counter::OneK one_k);
 
+BENCHMARK_EXPORT
 #if defined(__MINGW32__)
 __attribute__((format(__MINGW_PRINTF_FORMAT, 1, 2)))
 #elif defined(__GNUC__)
@@ -38,6 +42,7 @@ inline std::string StrCat(Args&&... args) {
   return ss.str();
 }
 
+BENCHMARK_EXPORT
 std::vector<std::string> StrSplit(const std::string& str, char delim);
 
 // Disable lint checking for this block since it re-implements C functions.
diff --git a/third-party/benchmark/src/sysinfo.cc b/third-party/benchmark/src/sysinfo.cc
index 3a56e8cace4858..daeb98b026d18f 100644
--- a/third-party/benchmark/src/sysinfo.cc
+++ b/third-party/benchmark/src/sysinfo.cc
@@ -12,16 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#if defined(_MSC_VER)
-// FIXME: This must be defined before any other includes to disable deprecation
-// warnings for use of codecvt from C++17. We should remove our reliance on
-// the deprecated functionality instead.
-#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
-#endif
-
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
+#if !defined(WINVER) || WINVER < 0x0600
+#undef WINVER
+#define WINVER 0x0600
+#endif  // WINVER handling
 #include <shlwapi.h>
 #undef StrCat  // Don't let StrCat in string_util.h be renamed to lstrcatA
 #include <versionhelpers.h>
@@ -30,7 +27,7 @@
 #include <codecvt>
 #else
 #include <fcntl.h>
-#ifndef BENCHMARK_OS_FUCHSIA
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
 #include <sys/resource.h>
 #endif
 #include <sys/time.h>
@@ -45,10 +42,17 @@
 #endif
 #if defined(BENCHMARK_OS_SOLARIS)
 #include <kstat.h>
+#include <netdb.h>
 #endif
 #if defined(BENCHMARK_OS_QNX)
 #include <sys/syspage.h>
 #endif
+#if defined(BENCHMARK_OS_QURT)
+#include <qurt.h>
+#endif
+#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
+#include <pthread.h>
+#endif
 
 #include <algorithm>
 #include <array>
@@ -65,15 +69,17 @@
 #include <limits>
 #include <locale>
 #include <memory>
+#include <random>
 #include <sstream>
 #include <utility>
 
+#include "benchmark/benchmark.h"
 #include "check.h"
 #include "cycleclock.h"
 #include "internal_macros.h"
 #include "log.h"
-#include "sleep.h"
 #include "string_util.h"
+#include "timers.h"
 
 namespace benchmark {
 namespace {
@@ -98,67 +104,59 @@ BENCHMARK_NORETURN void PrintErrorAndDie(Args&&... args) {
 /// `sysctl` with the result type it's to be interpreted as.
 struct ValueUnion {
   union DataT {
-    uint32_t uint32_value;
-    uint64_t uint64_value;
+    int32_t int32_value;
+    int64_t int64_value;
     // For correct aliasing of union members from bytes.
     char bytes[8];
   };
   using DataPtr = std::unique_ptr<DataT, decltype(&std::free)>;
 
   // The size of the data union member + its trailing array size.
-  size_t Size;
-  DataPtr Buff;
+  std::size_t size;
+  DataPtr buff;
 
  public:
-  ValueUnion() : Size(0), Buff(nullptr, &std::free) {}
+  ValueUnion() : size(0), buff(nullptr, &std::free) {}
 
-  explicit ValueUnion(size_t BuffSize)
-      : Size(sizeof(DataT) + BuffSize),
-        Buff(::new (std::malloc(Size)) DataT(), &std::free) {}
+  explicit ValueUnion(std::size_t buff_size)
+      : size(sizeof(DataT) + buff_size),
+        buff(::new (std::malloc(size)) DataT(), &std::free) {}
 
   ValueUnion(ValueUnion&& other) = default;
 
-  explicit operator bool() const { return bool(Buff); }
+  explicit operator bool() const { return bool(buff); }
 
-  char* data() const { return Buff->bytes; }
+  char* data() const { return buff->bytes; }
 
   std::string GetAsString() const { return std::string(data()); }
 
   int64_t GetAsInteger() const {
-    if (Size == sizeof(Buff->uint32_value))
-      return static_cast<int32_t>(Buff->uint32_value);
-    else if (Size == sizeof(Buff->uint64_value))
-      return static_cast<int64_t>(Buff->uint64_value);
-    BENCHMARK_UNREACHABLE();
-  }
-
-  uint64_t GetAsUnsigned() const {
-    if (Size == sizeof(Buff->uint32_value))
-      return Buff->uint32_value;
-    else if (Size == sizeof(Buff->uint64_value))
-      return Buff->uint64_value;
+    if (size == sizeof(buff->int32_value))
+      return buff->int32_value;
+    else if (size == sizeof(buff->int64_value))
+      return buff->int64_value;
     BENCHMARK_UNREACHABLE();
   }
 
   template <class T, int N>
   std::array<T, N> GetAsArray() {
-    const int ArrSize = sizeof(T) * N;
-    BM_CHECK_LE(ArrSize, Size);
-    std::array<T, N> Arr;
-    std::memcpy(Arr.data(), data(), ArrSize);
-    return Arr;
+    const int arr_size = sizeof(T) * N;
+    BM_CHECK_LE(arr_size, size);
+    std::array<T, N> arr;
+    std::memcpy(arr.data(), data(), arr_size);
+    return arr;
   }
 };
 
-ValueUnion GetSysctlImp(std::string const& Name) {
+ValueUnion GetSysctlImp(std::string const& name) {
 #if defined BENCHMARK_OS_OPENBSD
   int mib[2];
 
   mib[0] = CTL_HW;
-  if ((Name == "hw.ncpu") || (Name == "hw.cpuspeed")) {
+  if ((name == "hw.ncpu") || (name == "hw.cpuspeed")) {
     ValueUnion buff(sizeof(int));
 
-    if (Name == "hw.ncpu") {
+    if (name == "hw.ncpu") {
       mib[1] = HW_NCPU;
     } else {
       mib[1] = HW_CPUSPEED;
@@ -171,41 +169,41 @@ ValueUnion GetSysctlImp(std::string const& Name) {
   }
   return ValueUnion();
 #else
-  size_t CurBuffSize = 0;
-  if (sysctlbyname(Name.c_str(), nullptr, &CurBuffSize, nullptr, 0) == -1)
+  std::size_t cur_buff_size = 0;
+  if (sysctlbyname(name.c_str(), nullptr, &cur_buff_size, nullptr, 0) == -1)
     return ValueUnion();
 
-  ValueUnion buff(CurBuffSize);
-  if (sysctlbyname(Name.c_str(), buff.data(), &buff.Size, nullptr, 0) == 0)
+  ValueUnion buff(cur_buff_size);
+  if (sysctlbyname(name.c_str(), buff.data(), &buff.size, nullptr, 0) == 0)
     return buff;
   return ValueUnion();
 #endif
 }
 
 BENCHMARK_MAYBE_UNUSED
-bool GetSysctl(std::string const& Name, std::string* Out) {
-  Out->clear();
-  auto Buff = GetSysctlImp(Name);
-  if (!Buff) return false;
-  Out->assign(Buff.data());
+bool GetSysctl(std::string const& name, std::string* out) {
+  out->clear();
+  auto buff = GetSysctlImp(name);
+  if (!buff) return false;
+  out->assign(buff.data());
   return true;
 }
 
 template <class Tp,
           class = typename std::enable_if<std::is_integral<Tp>::value>::type>
-bool GetSysctl(std::string const& Name, Tp* Out) {
-  *Out = 0;
-  auto Buff = GetSysctlImp(Name);
-  if (!Buff) return false;
-  *Out = static_cast<Tp>(Buff.GetAsUnsigned());
+bool GetSysctl(std::string const& name, Tp* out) {
+  *out = 0;
+  auto buff = GetSysctlImp(name);
+  if (!buff) return false;
+  *out = static_cast<Tp>(buff.GetAsInteger());
   return true;
 }
 
 template <class Tp, size_t N>
-bool GetSysctl(std::string const& Name, std::array<Tp, N>* Out) {
-  auto Buff = GetSysctlImp(Name);
-  if (!Buff) return false;
-  *Out = Buff.GetAsArray<Tp, N>();
+bool GetSysctl(std::string const& name, std::array<Tp, N>* out) {
+  auto buff = GetSysctlImp(name);
+  if (!buff) return false;
+  *out = buff.GetAsArray<Tp, N>();
   return true;
 }
 #endif
@@ -241,21 +239,21 @@ CPUInfo::Scaling CpuScaling(int num_cpus) {
 #endif
 }
 
-int CountSetBitsInCPUMap(std::string Val) {
-  auto CountBits = [](std::string Part) {
+int CountSetBitsInCPUMap(std::string val) {
+  auto CountBits = [](std::string part) {
     using CPUMask = std::bitset<sizeof(std::uintptr_t) * CHAR_BIT>;
-    Part = "0x" + Part;
-    CPUMask Mask(benchmark::stoul(Part, nullptr, 16));
-    return static_cast<int>(Mask.count());
+    part = "0x" + part;
+    CPUMask mask(benchmark::stoul(part, nullptr, 16));
+    return static_cast<int>(mask.count());
   };
-  size_t Pos;
+  std::size_t pos;
   int total = 0;
-  while ((Pos = Val.find(',')) != std::string::npos) {
-    total += CountBits(Val.substr(0, Pos));
-    Val = Val.substr(Pos + 1);
+  while ((pos = val.find(',')) != std::string::npos) {
+    total += CountBits(val.substr(0, pos));
+    val = val.substr(pos + 1);
   }
-  if (!Val.empty()) {
-    total += CountBits(Val);
+  if (!val.empty()) {
+    total += CountBits(val);
   }
   return total;
 }
@@ -264,16 +262,16 @@ BENCHMARK_MAYBE_UNUSED
 std::vector<CPUInfo::CacheInfo> GetCacheSizesFromKVFS() {
   std::vector<CPUInfo::CacheInfo> res;
   std::string dir = "/sys/devices/system/cpu/cpu0/cache/";
-  int Idx = 0;
+  int idx = 0;
   while (true) {
     CPUInfo::CacheInfo info;
-    std::string FPath = StrCat(dir, "index", Idx++, "/");
-    std::ifstream f(StrCat(FPath, "size").c_str());
+    std::string fpath = StrCat(dir, "index", idx++, "/");
+    std::ifstream f(StrCat(fpath, "size").c_str());
     if (!f.is_open()) break;
     std::string suffix;
     f >> info.size;
     if (f.fail())
-      PrintErrorAndDie("Failed while reading file '", FPath, "size'");
+      PrintErrorAndDie("Failed while reading file '", fpath, "size'");
     if (f.good()) {
       f >> suffix;
       if (f.bad())
@@ -284,13 +282,13 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesFromKVFS() {
       else if (suffix == "K")
         info.size *= 1024;
     }
-    if (!ReadFromFile(StrCat(FPath, "type"), &info.type))
-      PrintErrorAndDie("Failed to read from file ", FPath, "type");
-    if (!ReadFromFile(StrCat(FPath, "level"), &info.level))
-      PrintErrorAndDie("Failed to read from file ", FPath, "level");
+    if (!ReadFromFile(StrCat(fpath, "type"), &info.type))
+      PrintErrorAndDie("Failed to read from file ", fpath, "type");
+    if (!ReadFromFile(StrCat(fpath, "level"), &info.level))
+      PrintErrorAndDie("Failed to read from file ", fpath, "level");
     std::string map_str;
-    if (!ReadFromFile(StrCat(FPath, "shared_cpu_map"), &map_str))
-      PrintErrorAndDie("Failed to read from file ", FPath, "shared_cpu_map");
+    if (!ReadFromFile(StrCat(fpath, "shared_cpu_map"), &map_str))
+      PrintErrorAndDie("Failed to read from file ", fpath, "shared_cpu_map");
     info.num_sharing = CountSetBitsInCPUMap(map_str);
     res.push_back(info);
   }
@@ -301,26 +299,26 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesFromKVFS() {
 #ifdef BENCHMARK_OS_MACOSX
 std::vector<CPUInfo::CacheInfo> GetCacheSizesMacOSX() {
   std::vector<CPUInfo::CacheInfo> res;
-  std::array<uint64_t, 4> CacheCounts{{0, 0, 0, 0}};
-  GetSysctl("hw.cacheconfig", &CacheCounts);
+  std::array<int, 4> cache_counts{{0, 0, 0, 0}};
+  GetSysctl("hw.cacheconfig", &cache_counts);
 
   struct {
     std::string name;
     std::string type;
     int level;
-    uint64_t num_sharing;
-  } Cases[] = {{"hw.l1dcachesize", "Data", 1, CacheCounts[1]},
-               {"hw.l1icachesize", "Instruction", 1, CacheCounts[1]},
-               {"hw.l2cachesize", "Unified", 2, CacheCounts[2]},
-               {"hw.l3cachesize", "Unified", 3, CacheCounts[3]}};
-  for (auto& C : Cases) {
+    int num_sharing;
+  } cases[] = {{"hw.l1dcachesize", "Data", 1, cache_counts[1]},
+               {"hw.l1icachesize", "Instruction", 1, cache_counts[1]},
+               {"hw.l2cachesize", "Unified", 2, cache_counts[2]},
+               {"hw.l3cachesize", "Unified", 3, cache_counts[3]}};
+  for (auto& c : cases) {
     int val;
-    if (!GetSysctl(C.name, &val)) continue;
+    if (!GetSysctl(c.name, &val)) continue;
     CPUInfo::CacheInfo info;
-    info.type = C.type;
-    info.level = C.level;
+    info.type = c.type;
+    info.level = c.level;
     info.size = val;
-    info.num_sharing = static_cast<int>(C.num_sharing);
+    info.num_sharing = c.num_sharing;
     res.push_back(std::move(info));
   }
   return res;
@@ -334,7 +332,7 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
 
   using UPtr = std::unique_ptr<PInfo, decltype(&std::free)>;
   GetLogicalProcessorInformation(nullptr, &buffer_size);
-  UPtr buff((PInfo*)malloc(buffer_size), &std::free);
+  UPtr buff(static_cast<PInfo*>(std::malloc(buffer_size)), &std::free);
   if (!GetLogicalProcessorInformation(buff.get(), &buffer_size))
     PrintErrorAndDie("Failed during call to GetLogicalProcessorInformation: ",
                      GetLastError());
@@ -345,16 +343,16 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
   for (; it != end; ++it) {
     if (it->Relationship != RelationCache) continue;
     using BitSet = std::bitset<sizeof(ULONG_PTR) * CHAR_BIT>;
-    BitSet B(it->ProcessorMask);
+    BitSet b(it->ProcessorMask);
     // To prevent duplicates, only consider caches where CPU 0 is specified
-    if (!B.test(0)) continue;
-    CInfo* Cache = &it->Cache;
+    if (!b.test(0)) continue;
+    const CInfo& cache = it->Cache;
     CPUInfo::CacheInfo C;
-    C.num_sharing = static_cast<int>(B.count());
-    C.level = Cache->Level;
-    C.size = Cache->Size;
+    C.num_sharing = static_cast<int>(b.count());
+    C.level = cache.Level;
+    C.size = cache.Size;
     C.type = "Unknown";
-    switch (Cache->Type) {
+    switch (cache.Type) {
       case CacheUnified:
         C.type = "Unified";
         break;
@@ -417,6 +415,8 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizes() {
   return GetCacheSizesWindows();
 #elif defined(BENCHMARK_OS_QNX)
   return GetCacheSizesQNX();
+#elif defined(BENCHMARK_OS_QURT)
+  return std::vector<CPUInfo::CacheInfo>();
 #else
   return GetCacheSizesFromKVFS();
 #endif
@@ -425,23 +425,32 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizes() {
 std::string GetSystemName() {
 #if defined(BENCHMARK_OS_WINDOWS)
   std::string str;
-  const unsigned COUNT = MAX_COMPUTERNAME_LENGTH + 1;
+  static constexpr int COUNT = MAX_COMPUTERNAME_LENGTH + 1;
   TCHAR hostname[COUNT] = {'\0'};
   DWORD DWCOUNT = COUNT;
   if (!GetComputerName(hostname, &DWCOUNT)) return std::string("");
 #ifndef UNICODE
   str = std::string(hostname, DWCOUNT);
 #else
-  // Using wstring_convert, Is deprecated in C++17
-  using convert_type = std::codecvt_utf8<wchar_t>;
-  std::wstring_convert<convert_type, wchar_t> converter;
-  std::wstring wStr(hostname, DWCOUNT);
-  str = converter.to_bytes(wStr);
+  // `WideCharToMultiByte` returns `0` when conversion fails.
+  int len = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, hostname,
+                                DWCOUNT, NULL, 0, NULL, NULL);
+  str.resize(len);
+  WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, hostname, DWCOUNT, &str[0],
+                      str.size(), NULL, NULL);
 #endif
   return str;
-#else  // defined(BENCHMARK_OS_WINDOWS)
+#elif defined(BENCHMARK_OS_QURT)
+  std::string str = "Hexagon DSP";
+  qurt_arch_version_t arch_version_struct;
+  if (qurt_sysenv_get_arch_version(&arch_version_struct) == QURT_EOK) {
+    str += " v";
+    str += std::to_string(arch_version_struct.arch_version);
+  }
+  return str;
+#else
 #ifndef HOST_NAME_MAX
-#ifdef BENCHMARK_HAS_SYSCTL  // BSD/Mac Doesnt have HOST_NAME_MAX defined
+#ifdef BENCHMARK_HAS_SYSCTL  // BSD/Mac doesn't have HOST_NAME_MAX defined
 #define HOST_NAME_MAX 64
 #elif defined(BENCHMARK_OS_NACL)
 #define HOST_NAME_MAX 64
@@ -449,6 +458,8 @@ std::string GetSystemName() {
 #define HOST_NAME_MAX 154
 #elif defined(BENCHMARK_OS_RTEMS)
 #define HOST_NAME_MAX 256
+#elif defined(BENCHMARK_OS_SOLARIS)
+#define HOST_NAME_MAX MAXHOSTNAMELEN
 #elif defined(BENCHMARK_OS_ZOS)
 #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
 #else
@@ -463,12 +474,11 @@ std::string GetSystemName() {
 #endif  // Catch-all POSIX block.
 }
 
-int GetNumCPUs() {
+int GetNumCPUsImpl() {
 #ifdef BENCHMARK_HAS_SYSCTL
-  int NumCPU = -1;
-  if (GetSysctl("hw.ncpu", &NumCPU)) return NumCPU;
-  fprintf(stderr, "Err: %s\n", strerror(errno));
-  std::exit(EXIT_FAILURE);
+  int num_cpu = -1;
+  if (GetSysctl("hw.ncpu", &num_cpu)) return num_cpu;
+  PrintErrorAndDie("Err: ", strerror(errno));
 #elif defined(BENCHMARK_OS_WINDOWS)
   SYSTEM_INFO sysinfo;
   // Use memset as opposed to = {} to avoid GCC missing initializer false
@@ -480,64 +490,155 @@ int GetNumCPUs() {
                                         // group
 #elif defined(BENCHMARK_OS_SOLARIS)
   // Returns -1 in case of a failure.
-  int NumCPU = sysconf(_SC_NPROCESSORS_ONLN);
-  if (NumCPU < 0) {
-    fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n",
-            strerror(errno));
+  long num_cpu = sysconf(_SC_NPROCESSORS_ONLN);
+  if (num_cpu < 0) {
+    PrintErrorAndDie("sysconf(_SC_NPROCESSORS_ONLN) failed with error: ",
+                     strerror(errno));
   }
-  return NumCPU;
+  return (int)num_cpu;
 #elif defined(BENCHMARK_OS_QNX)
   return static_cast<int>(_syspage_ptr->num_cpu);
+#elif defined(BENCHMARK_OS_QURT)
+  qurt_sysenv_max_hthreads_t hardware_threads;
+  if (qurt_sysenv_get_max_hw_threads(&hardware_threads) != QURT_EOK) {
+    hardware_threads.max_hthreads = 1;
+  }
+  return hardware_threads.max_hthreads;
 #else
-  int NumCPUs = 0;
-  int MaxID = -1;
+  int num_cpus = 0;
+  int max_id = -1;
   std::ifstream f("/proc/cpuinfo");
   if (!f.is_open()) {
-    std::cerr << "failed to open /proc/cpuinfo\n";
-    return -1;
+    PrintErrorAndDie("Failed to open /proc/cpuinfo");
   }
+#if defined(__alpha__)
+  const std::string Key = "cpus detected";
+#else
   const std::string Key = "processor";
+#endif
   std::string ln;
   while (std::getline(f, ln)) {
     if (ln.empty()) continue;
-    size_t SplitIdx = ln.find(':');
+    std::size_t split_idx = ln.find(':');
     std::string value;
 #if defined(__s390__)
     // s390 has another format in /proc/cpuinfo
     // it needs to be parsed differently
-    if (SplitIdx != std::string::npos)
-      value = ln.substr(Key.size() + 1, SplitIdx - Key.size() - 1);
+    if (split_idx != std::string::npos)
+      value = ln.substr(Key.size() + 1, split_idx - Key.size() - 1);
 #else
-    if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1);
+    if (split_idx != std::string::npos) value = ln.substr(split_idx + 1);
 #endif
     if (ln.size() >= Key.size() && ln.compare(0, Key.size(), Key) == 0) {
-      NumCPUs++;
+      num_cpus++;
       if (!value.empty()) {
-        int CurID = benchmark::stoi(value);
-        MaxID = std::max(CurID, MaxID);
+        const int cur_id = benchmark::stoi(value);
+        max_id = std::max(cur_id, max_id);
       }
     }
   }
   if (f.bad()) {
-    std::cerr << "Failure reading /proc/cpuinfo\n";
-    return -1;
+    PrintErrorAndDie("Failure reading /proc/cpuinfo");
   }
   if (!f.eof()) {
-    std::cerr << "Failed to read to end of /proc/cpuinfo\n";
-    return -1;
+    PrintErrorAndDie("Failed to read to end of /proc/cpuinfo");
   }
   f.close();
 
-  if ((MaxID + 1) != NumCPUs) {
+  if ((max_id + 1) != num_cpus) {
     fprintf(stderr,
             "CPU ID assignments in /proc/cpuinfo seem messed up."
             " This is usually caused by a bad BIOS.\n");
   }
-  return NumCPUs;
+  return num_cpus;
 #endif
   BENCHMARK_UNREACHABLE();
 }
 
+int GetNumCPUs() {
+  const int num_cpus = GetNumCPUsImpl();
+  if (num_cpus < 1) {
+    PrintErrorAndDie(
+        "Unable to extract number of CPUs.  If your platform uses "
+        "/proc/cpuinfo, custom support may need to be added.");
+  }
+  return num_cpus;
+}
+
+class ThreadAffinityGuard final {
+ public:
+  ThreadAffinityGuard() : reset_affinity(SetAffinity()) {
+    if (!reset_affinity)
+      std::cerr << "***WARNING*** Failed to set thread affinity. Estimated CPU "
+                   "frequency may be incorrect."
+                << std::endl;
+  }
+
+  ~ThreadAffinityGuard() {
+    if (!reset_affinity) return;
+
+#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
+    int ret = pthread_setaffinity_np(self, sizeof(previous_affinity),
+                                     &previous_affinity);
+    if (ret == 0) return;
+#elif defined(BENCHMARK_OS_WINDOWS_WIN32)
+    DWORD_PTR ret = SetThreadAffinityMask(self, previous_affinity);
+    if (ret != 0) return;
+#endif  // def BENCHMARK_HAS_PTHREAD_AFFINITY
+    PrintErrorAndDie("Failed to reset thread affinity");
+  }
+
+  ThreadAffinityGuard(ThreadAffinityGuard&&) = delete;
+  ThreadAffinityGuard(const ThreadAffinityGuard&) = delete;
+  ThreadAffinityGuard& operator=(ThreadAffinityGuard&&) = delete;
+  ThreadAffinityGuard& operator=(const ThreadAffinityGuard&) = delete;
+
+ private:
+  bool SetAffinity() {
+#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
+    int ret;
+    self = pthread_self();
+    ret = pthread_getaffinity_np(self, sizeof(previous_affinity),
+                                 &previous_affinity);
+    if (ret != 0) return false;
+
+    cpu_set_t affinity;
+    memcpy(&affinity, &previous_affinity, sizeof(affinity));
+
+    bool is_first_cpu = true;
+
+    for (int i = 0; i < CPU_SETSIZE; ++i)
+      if (CPU_ISSET(i, &affinity)) {
+        if (is_first_cpu)
+          is_first_cpu = false;
+        else
+          CPU_CLR(i, &affinity);
+      }
+
+    if (is_first_cpu) return false;
+
+    ret = pthread_setaffinity_np(self, sizeof(affinity), &affinity);
+    return ret == 0;
+#elif defined(BENCHMARK_OS_WINDOWS_WIN32)
+    self = GetCurrentThread();
+    DWORD_PTR mask = static_cast<DWORD_PTR>(1) << GetCurrentProcessorNumber();
+    previous_affinity = SetThreadAffinityMask(self, mask);
+    return previous_affinity != 0;
+#else
+    return false;
+#endif  // def BENCHMARK_HAS_PTHREAD_AFFINITY
+  }
+
+#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
+  pthread_t self;
+  cpu_set_t previous_affinity;
+#elif defined(BENCHMARK_OS_WINDOWS_WIN32)
+  HANDLE self;
+  DWORD_PTR previous_affinity;
+#endif  // def BENCHMARK_HAS_PTHREAD_AFFINITY
+  bool reset_affinity;
+};
+
 double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
   // Currently, scaling is only used on linux path here,
   // suppress diagnostics about it being unused on other paths.
@@ -566,7 +667,7 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
                       &freq)) {
     // The value is in kHz (as the file name suggests).  For example, on a
     // 2GHz warpstation, the file contains the value "2000000".
-    return freq * 1000.0;
+    return static_cast<double>(freq) * 1000.0;
   }
 
   const double error_value = -1;
@@ -578,7 +679,7 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
     return error_value;
   }
 
-  auto startsWithKey = [](std::string const& Value, std::string const& Key) {
+  auto StartsWithKey = [](std::string const& Value, std::string const& Key) {
     if (Key.size() > Value.size()) return false;
     auto Cmp = [&](char X, char Y) {
       return std::tolower(X) == std::tolower(Y);
@@ -589,18 +690,18 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
   std::string ln;
   while (std::getline(f, ln)) {
     if (ln.empty()) continue;
-    size_t SplitIdx = ln.find(':');
+    std::size_t split_idx = ln.find(':');
     std::string value;
-    if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1);
+    if (split_idx != std::string::npos) value = ln.substr(split_idx + 1);
     // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only
     // accept positive values. Some environments (virtual machines) report zero,
     // which would cause infinite looping in WallTime_Init.
-    if (startsWithKey(ln, "cpu MHz")) {
+    if (StartsWithKey(ln, "cpu MHz")) {
       if (!value.empty()) {
         double cycles_per_second = benchmark::stod(value) * 1000000.0;
         if (cycles_per_second > 0) return cycles_per_second;
       }
-    } else if (startsWithKey(ln, "bogomips")) {
+    } else if (StartsWithKey(ln, "bogomips")) {
       if (!value.empty()) {
         bogo_clock = benchmark::stod(value) * 1000000.0;
         if (bogo_clock < 0.0) bogo_clock = error_value;
@@ -622,7 +723,7 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
   if (bogo_clock >= 0.0) return bogo_clock;
 
 #elif defined BENCHMARK_HAS_SYSCTL
-  constexpr auto* FreqStr =
+  constexpr auto* freqStr =
 #if defined(BENCHMARK_OS_FREEBSD) || defined(BENCHMARK_OS_NETBSD)
       "machdep.tsc_freq";
 #elif defined BENCHMARK_OS_OPENBSD
@@ -634,14 +735,17 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
 #endif
   unsigned long long hz = 0;
 #if defined BENCHMARK_OS_OPENBSD
-  if (GetSysctl(FreqStr, &hz)) return hz * 1000000;
+  if (GetSysctl(freqStr, &hz)) return hz * 1000000;
 #else
-  if (GetSysctl(FreqStr, &hz)) return hz;
+  if (GetSysctl(freqStr, &hz)) return hz;
 #endif
   fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n",
-          FreqStr, strerror(errno));
+          freqStr, strerror(errno));
+  fprintf(stderr,
+          "This does not affect benchmark measurements, only the "
+          "metadata output.\n");
 
-#elif defined BENCHMARK_OS_WINDOWS
+#elif defined BENCHMARK_OS_WINDOWS_WIN32
   // In NT, read MHz from the registry. If we fail to do so or we're in win9x
   // then make a crude estimate.
   DWORD data, data_size = sizeof(data);
@@ -650,15 +754,16 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
           SHGetValueA(HKEY_LOCAL_MACHINE,
                       "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
                       "~MHz", nullptr, &data, &data_size)))
-    return static_cast<double>((int64_t)data *
-                               (int64_t)(1000 * 1000));  // was mhz
+    return static_cast<double>(static_cast<int64_t>(data) *
+                               static_cast<int64_t>(1000 * 1000));  // was mhz
 #elif defined(BENCHMARK_OS_SOLARIS)
   kstat_ctl_t* kc = kstat_open();
   if (!kc) {
     std::cerr << "failed to open /dev/kstat\n";
     return -1;
   }
-  kstat_t* ksp = kstat_lookup(kc, (char*)"cpu_info", -1, (char*)"cpu_info0");
+  kstat_t* ksp = kstat_lookup(kc, const_cast<char*>("cpu_info"), -1,
+                              const_cast<char*>("cpu_info0"));
   if (!ksp) {
     std::cerr << "failed to lookup in /dev/kstat\n";
     return -1;
@@ -667,8 +772,8 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
     std::cerr << "failed to read from /dev/kstat\n";
     return -1;
   }
-  kstat_named_t* knp =
-      (kstat_named_t*)kstat_data_lookup(ksp, (char*)"current_clock_Hz");
+  kstat_named_t* knp = (kstat_named_t*)kstat_data_lookup(
+      ksp, const_cast<char*>("current_clock_Hz"));
   if (!knp) {
     std::cerr << "failed to lookup data in /dev/kstat\n";
     return -1;
@@ -682,22 +787,55 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
   kstat_close(kc);
   return clock_hz;
 #elif defined(BENCHMARK_OS_QNX)
-  return static_cast<double>((int64_t)(SYSPAGE_ENTRY(cpuinfo)->speed) *
-                             (int64_t)(1000 * 1000));
+  return static_cast<double>(
+      static_cast<int64_t>(SYSPAGE_ENTRY(cpuinfo)->speed) *
+      static_cast<int64_t>(1000 * 1000));
+#elif defined(BENCHMARK_OS_QURT)
+  // QuRT doesn't provide any API to query Hexagon frequency.
+  return 1000000000;
 #endif
   // If we've fallen through, attempt to roughly estimate the CPU clock rate.
-  const int estimate_time_ms = 1000;
+
+  // Make sure to use the same cycle counter when starting and stopping the
+  // cycle timer. We just pin the current thread to a cpu in the previous
+  // affinity set.
+  ThreadAffinityGuard affinity_guard;
+
+  static constexpr double estimate_time_s = 1.0;
+  const double start_time = ChronoClockNow();
   const auto start_ticks = cycleclock::Now();
-  SleepForMilliseconds(estimate_time_ms);
-  return static_cast<double>(cycleclock::Now() - start_ticks);
+
+  // Impose load instead of calling sleep() to make sure the cycle counter
+  // works.
+  using PRNG = std::minstd_rand;
+  using Result = PRNG::result_type;
+  PRNG rng(static_cast<Result>(start_ticks));
+
+  Result state = 0;
+
+  do {
+    static constexpr size_t batch_size = 10000;
+    rng.discard(batch_size);
+    state += rng();
+
+  } while (ChronoClockNow() - start_time < estimate_time_s);
+
+  DoNotOptimize(state);
+
+  const auto end_ticks = cycleclock::Now();
+  const double end_time = ChronoClockNow();
+
+  return static_cast<double>(end_ticks - start_ticks) / (end_time - start_time);
+  // The previous thread affinity is restored when affinity_guard goes out of
+  // scope.
 }
 
 std::vector<double> GetLoadAvg() {
 #if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) ||     \
      defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD ||      \
      defined BENCHMARK_OS_OPENBSD || defined BENCHMARK_OS_DRAGONFLY) && \
-    !defined(__ANDROID__)
-  constexpr int kMaxSamples = 3;
+    !(defined(__ANDROID__) && __ANDROID_API__ < 29)
+  static constexpr int kMaxSamples = 3;
   std::vector<double> res(kMaxSamples, 0.0);
   const int nelem = getloadavg(res.data(), kMaxSamples);
   if (nelem < 1) {
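For reference, a minimal standalone sketch of the new fallback strategy above: spin a PRNG for a fixed wall-clock window and divide elapsed ticks by elapsed seconds. It assumes an x86 target built with GCC or Clang so that __rdtsc() is available; the patch itself uses the internal cycleclock::Now() together with the ThreadAffinityGuard added earlier, and EstimateTicksPerSecond is an illustrative name, not part of the patch.

#include <chrono>
#include <cstdint>
#include <random>
#include <x86intrin.h>  // __rdtsc(); assumption: x86 target, GCC/Clang

// Illustrative stand-in for the fallback above: busy-spin for ~1s of wall
// time and report ticks per second.
static double EstimateTicksPerSecond() {
  using Clock = std::chrono::steady_clock;
  const auto start_time = Clock::now();
  const std::uint64_t start_ticks = __rdtsc();

  // Impose load (instead of sleeping) so the cycle counter keeps advancing.
  std::minstd_rand rng(static_cast<std::minstd_rand::result_type>(start_ticks));
  volatile std::uint64_t sink = 0;
  while (std::chrono::duration<double>(Clock::now() - start_time).count() < 1.0) {
    rng.discard(10000);
    sink = sink + rng();
  }

  const std::uint64_t end_ticks = __rdtsc();
  const double elapsed_s =
      std::chrono::duration<double>(Clock::now() - start_time).count();
  return static_cast<double>(end_ticks - start_ticks) / elapsed_s;
}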
diff --git a/third-party/benchmark/src/thread_manager.h b/third-party/benchmark/src/thread_manager.h
index 4680285089401c..819b3c44db6626 100644
--- a/third-party/benchmark/src/thread_manager.h
+++ b/third-party/benchmark/src/thread_manager.h
@@ -43,8 +43,8 @@ class ThreadManager {
     double manual_time_used = 0;
     int64_t complexity_n = 0;
     std::string report_label_;
-    std::string error_message_;
-    bool has_error_ = false;
+    std::string skip_message_;
+    internal::Skipped skipped_ = internal::NotSkipped;
     UserCounters counters;
   };
   GUARDED_BY(GetBenchmarkMutex()) Result results;
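The ThreadManager result now records a skip state and message instead of a plain error flag. For context, a minimal sketch of how a benchmark ends up in that state through the public API; BM_NeedsFeature and the feature check are illustrative, not taken from the patch.

#include <benchmark/benchmark.h>

static void BM_NeedsFeature(benchmark::State& state) {
  const bool feature_available = false;  // stand-in for a real runtime check
  if (!feature_available) {
    // Recorded by ThreadManager as skipped_ with skip_message_ set.
    state.SkipWithError("required feature not available");
    return;
  }
  for (auto _ : state) {
  }
}
BENCHMARK(BM_NeedsFeature);
BENCHMARK_MAIN();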
diff --git a/third-party/benchmark/src/timers.cc b/third-party/benchmark/src/timers.cc
index ed35c01f5540f8..667e7b2eef3c39 100644
--- a/third-party/benchmark/src/timers.cc
+++ b/third-party/benchmark/src/timers.cc
@@ -23,7 +23,7 @@
 #include <windows.h>
 #else
 #include <fcntl.h>
-#ifndef BENCHMARK_OS_FUCHSIA
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
 #include <sys/resource.h>
 #endif
 #include <sys/time.h>
@@ -38,6 +38,9 @@
 #include <mach/mach_port.h>
 #include <mach/thread_act.h>
 #endif
+#if defined(BENCHMARK_OS_QURT)
+#include <qurt.h>
+#endif
 #endif
 
 #ifdef BENCHMARK_OS_EMSCRIPTEN
@@ -56,7 +59,6 @@
 
 #include "check.h"
 #include "log.h"
-#include "sleep.h"
 #include "string_util.h"
 
 namespace benchmark {
@@ -65,6 +67,9 @@ namespace benchmark {
 #if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Wunused-function"
 #endif
+#if defined(__NVCOMPILER)
+#pragma diag_suppress declared_but_not_referenced
+#endif
 
 namespace {
 #if defined(BENCHMARK_OS_WINDOWS)
@@ -79,7 +84,7 @@ double MakeTime(FILETIME const& kernel_time, FILETIME const& user_time) {
           static_cast<double>(user.QuadPart)) *
          1e-7;
 }
-#elif !defined(BENCHMARK_OS_FUCHSIA)
+#elif !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
 double MakeTime(struct rusage const& ru) {
   return (static_cast<double>(ru.ru_utime.tv_sec) +
           static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
@@ -97,7 +102,8 @@ double MakeTime(thread_basic_info_data_t const& info) {
 #endif
 #if defined(CLOCK_PROCESS_CPUTIME_ID) || defined(CLOCK_THREAD_CPUTIME_ID)
 double MakeTime(struct timespec const& ts) {
-  return ts.tv_sec + (static_cast<double>(ts.tv_nsec) * 1e-9);
+  return static_cast<double>(ts.tv_sec) +
+         (static_cast<double>(ts.tv_nsec) * 1e-9);
 }
 #endif
 
@@ -119,11 +125,15 @@ double ProcessCPUUsage() {
                       &user_time))
     return MakeTime(kernel_time, user_time);
   DiagnoseAndExit("GetProccessTimes() failed");
+#elif defined(BENCHMARK_OS_QURT)
+  return static_cast<double>(
+             qurt_timer_timetick_to_us(qurt_timer_get_ticks())) *
+         1.0e-6;
 #elif defined(BENCHMARK_OS_EMSCRIPTEN)
   // clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) returns 0 on Emscripten.
   // Use Emscripten-specific API. Reported CPU time would be exactly the
   // same as total time, but this is ok because there aren't long-latency
-  // syncronous system calls in Emscripten.
+  // synchronous system calls in Emscripten.
   return emscripten_get_now() * 1e-3;
 #elif defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
   // FIXME We want to use clock_gettime, but its not available in MacOS 10.11.
@@ -149,6 +159,10 @@ double ThreadCPUUsage() {
   GetThreadTimes(this_thread, &creation_time, &exit_time, &kernel_time,
                  &user_time);
   return MakeTime(kernel_time, user_time);
+#elif defined(BENCHMARK_OS_QURT)
+  return static_cast<double>(
+             qurt_timer_timetick_to_us(qurt_timer_get_ticks())) *
+         1.0e-6;
 #elif defined(BENCHMARK_OS_MACOSX)
   // FIXME We want to use clock_gettime, but its not available in MacOS 10.11.
   // See https://github.com/google/benchmark/pull/292
diff --git a/third-party/benchmark/test/AssemblyTests.cmake b/third-party/benchmark/test/AssemblyTests.cmake
index 3d078586f1de14..c43c711faf87e9 100644
--- a/third-party/benchmark/test/AssemblyTests.cmake
+++ b/third-party/benchmark/test/AssemblyTests.cmake
@@ -1,3 +1,23 @@
+set(CLANG_SUPPORTED_VERSION "5.0.0")
+set(GCC_SUPPORTED_VERSION "5.5.0")
+
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL ${CLANG_SUPPORTED_VERSION})
+    message (WARNING
+      "Unsupported Clang version " ${CMAKE_CXX_COMPILER_VERSION}
+      ". Expected is " ${CLANG_SUPPORTED_VERSION}
+      ". Assembly tests may be broken.")
+  endif()
+elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL ${GCC_SUPPORTED_VERSION})
+    message (WARNING
+      "Unsupported GCC version " ${CMAKE_CXX_COMPILER_VERSION}
+      ". Expected is " ${GCC_SUPPORTED_VERSION}
+      ". Assembly tests may be broken.")
+  endif()
+else()
+  message (WARNING "Unsupported compiler. Assembly tests may be broken.")
+endif()
 
 include(split_list)
 
@@ -23,6 +43,7 @@ string(TOUPPER "${CMAKE_CXX_COMPILER_ID}" ASM_TEST_COMPILER)
 macro(add_filecheck_test name)
   cmake_parse_arguments(ARG "" "" "CHECK_PREFIXES" ${ARGV})
   add_library(${name} OBJECT ${name}.cc)
+  target_link_libraries(${name} PRIVATE benchmark::benchmark)
   set_target_properties(${name} PROPERTIES COMPILE_FLAGS "-S ${ASM_TEST_FLAGS}")
   set(ASM_OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${name}.s")
   add_custom_target(copy_${name} ALL
diff --git a/third-party/benchmark/test/CMakeLists.txt b/third-party/benchmark/test/CMakeLists.txt
index 162af53f80f76f..1de175f98d3421 100644
--- a/third-party/benchmark/test/CMakeLists.txt
+++ b/third-party/benchmark/test/CMakeLists.txt
@@ -1,8 +1,12 @@
 # Enable the tests
 
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+
 find_package(Threads REQUIRED)
 include(CheckCXXCompilerFlag)
 
+add_cxx_compiler_flag(-Wno-unused-variable)
+
 # NOTE: Some tests use `<cassert>` to perform the test. Therefore we must
 # strip -DNDEBUG from the default CMake flags in DEBUG mode.
 string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
@@ -22,6 +26,10 @@ if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
   endforeach()
 endif()
 
+if (NOT BUILD_SHARED_LIBS)
+  add_definitions(-DBENCHMARK_STATIC_DEFINE)
+endif()
+
 check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG)
 set(BENCHMARK_O3_FLAG "")
 if (BENCHMARK_HAS_O3_FLAG)
@@ -35,10 +43,14 @@ if (DEFINED BENCHMARK_CXX_LINKER_FLAGS)
 endif()
 
 add_library(output_test_helper STATIC output_test_helper.cc output_test.h)
+target_link_libraries(output_test_helper PRIVATE benchmark::benchmark)
 
 macro(compile_benchmark_test name)
   add_executable(${name} "${name}.cc")
   target_link_libraries(${name} benchmark::benchmark ${CMAKE_THREAD_LIBS_INIT})
+  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "NVHPC")
+  target_compile_options( ${name} PRIVATE --diag_suppress partial_override )
+  endif()
 endmacro(compile_benchmark_test)
 
 macro(compile_benchmark_test_with_main name)
@@ -48,26 +60,43 @@ endmacro(compile_benchmark_test_with_main)
 
 macro(compile_output_test name)
   add_executable(${name} "${name}.cc" output_test.h)
-  target_link_libraries(${name} output_test_helper benchmark::benchmark
+  target_link_libraries(${name} output_test_helper benchmark::benchmark_main
           ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endmacro(compile_output_test)
 
+macro(benchmark_add_test)
+  add_test(${ARGV})
+  if(WIN32 AND BUILD_SHARED_LIBS)
+    cmake_parse_arguments(TEST "" "NAME" "" ${ARGN})
+    set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT_MODIFICATION "PATH=path_list_prepend:$<TARGET_FILE_DIR:benchmark::benchmark>")
+  endif()
+endmacro(benchmark_add_test)
+
 # Demonstration executable
 compile_benchmark_test(benchmark_test)
-add_test(NAME benchmark COMMAND benchmark_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME benchmark COMMAND benchmark_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(spec_arg_test)
-add_test(NAME spec_arg COMMAND spec_arg_test --benchmark_filter=BM_NotChosen)
+benchmark_add_test(NAME spec_arg COMMAND spec_arg_test --benchmark_filter=BM_NotChosen)
+
+compile_benchmark_test(spec_arg_verbosity_test)
+benchmark_add_test(NAME spec_arg_verbosity COMMAND spec_arg_verbosity_test --v=42)
 
 compile_benchmark_test(benchmark_setup_teardown_test)
-add_test(NAME benchmark_setup_teardown COMMAND benchmark_setup_teardown_test)
+benchmark_add_test(NAME benchmark_setup_teardown COMMAND benchmark_setup_teardown_test)
 
 compile_benchmark_test(filter_test)
 macro(add_filter_test name filter expect)
-  add_test(NAME ${name} COMMAND filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect})
-  add_test(NAME ${name}_list_only COMMAND filter_test --benchmark_list_tests --benchmark_filter=${filter} ${expect})
+  benchmark_add_test(NAME ${name} COMMAND filter_test --benchmark_min_time=0.01s --benchmark_filter=${filter} ${expect})
+  benchmark_add_test(NAME ${name}_list_only COMMAND filter_test --benchmark_list_tests --benchmark_filter=${filter} ${expect})
 endmacro(add_filter_test)
 
+compile_benchmark_test(benchmark_min_time_flag_time_test)
+benchmark_add_test(NAME min_time_flag_time COMMAND benchmark_min_time_flag_time_test)
+
+compile_benchmark_test(benchmark_min_time_flag_iters_test)
+benchmark_add_test(NAME min_time_flag_iters COMMAND benchmark_min_time_flag_iters_test)
+
 add_filter_test(filter_simple "Foo" 3)
 add_filter_test(filter_simple_negative "-Foo" 2)
 add_filter_test(filter_suffix "BM_.*" 4)
@@ -88,78 +117,83 @@ add_filter_test(filter_regex_end ".*Ba$" 1)
 add_filter_test(filter_regex_end_negative "-.*Ba$" 4)
 
 compile_benchmark_test(options_test)
-add_test(NAME options_benchmarks COMMAND options_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME options_benchmarks COMMAND options_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(basic_test)
-add_test(NAME basic_benchmark COMMAND basic_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME basic_benchmark COMMAND basic_test --benchmark_min_time=0.01s)
 
 compile_output_test(repetitions_test)
-add_test(NAME repetitions_benchmark COMMAND repetitions_test --benchmark_min_time=0.01 --benchmark_repetitions=3)
+benchmark_add_test(NAME repetitions_benchmark COMMAND repetitions_test --benchmark_min_time=0.01s --benchmark_repetitions=3)
 
 compile_benchmark_test(diagnostics_test)
-add_test(NAME diagnostics_test COMMAND diagnostics_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME diagnostics_test COMMAND diagnostics_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(skip_with_error_test)
-add_test(NAME skip_with_error_test COMMAND skip_with_error_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME skip_with_error_test COMMAND skip_with_error_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(donotoptimize_test)
+# Enable errors for deprecated declarations (DoNotOptimize(Tp const& value)).
+check_cxx_compiler_flag(-Werror=deprecated-declarations BENCHMARK_HAS_DEPRECATED_DECLARATIONS_FLAG)
+if (BENCHMARK_HAS_DEPRECATED_DECLARATIONS_FLAG)
+  target_compile_options (donotoptimize_test PRIVATE "-Werror=deprecated-declarations")
+endif()
 # Some of the issues with DoNotOptimize only occur when optimization is enabled
 check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG)
 if (BENCHMARK_HAS_O3_FLAG)
   set_target_properties(donotoptimize_test PROPERTIES COMPILE_FLAGS "-O3")
 endif()
-add_test(NAME donotoptimize_test COMMAND donotoptimize_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME donotoptimize_test COMMAND donotoptimize_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(fixture_test)
-add_test(NAME fixture_test COMMAND fixture_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME fixture_test COMMAND fixture_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(register_benchmark_test)
-add_test(NAME register_benchmark_test COMMAND register_benchmark_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME register_benchmark_test COMMAND register_benchmark_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(map_test)
-add_test(NAME map_test COMMAND map_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME map_test COMMAND map_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(multiple_ranges_test)
-add_test(NAME multiple_ranges_test COMMAND multiple_ranges_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME multiple_ranges_test COMMAND multiple_ranges_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(args_product_test)
-add_test(NAME args_product_test COMMAND args_product_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME args_product_test COMMAND args_product_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test_with_main(link_main_test)
-add_test(NAME link_main_test COMMAND link_main_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME link_main_test COMMAND link_main_test --benchmark_min_time=0.01s)
 
 compile_output_test(reporter_output_test)
-add_test(NAME reporter_output_test COMMAND reporter_output_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME reporter_output_test COMMAND reporter_output_test --benchmark_min_time=0.01s)
 
 compile_output_test(templated_fixture_test)
-add_test(NAME templated_fixture_test COMMAND templated_fixture_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME templated_fixture_test COMMAND templated_fixture_test --benchmark_min_time=0.01s)
 
 compile_output_test(user_counters_test)
-add_test(NAME user_counters_test COMMAND user_counters_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME user_counters_test COMMAND user_counters_test --benchmark_min_time=0.01s)
 
 compile_output_test(perf_counters_test)
-add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time=0.01 --benchmark_perf_counters=CYCLES,BRANCHES)
+benchmark_add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time=0.01s --benchmark_perf_counters=CYCLES,INSTRUCTIONS)
 
 compile_output_test(internal_threading_test)
-add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01s)
 
 compile_output_test(report_aggregates_only_test)
-add_test(NAME report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01s)
 
 compile_output_test(display_aggregates_only_test)
-add_test(NAME display_aggregates_only_test COMMAND display_aggregates_only_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME display_aggregates_only_test COMMAND display_aggregates_only_test --benchmark_min_time=0.01s)
 
 compile_output_test(user_counters_tabular_test)
-add_test(NAME user_counters_tabular_test COMMAND user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01)
+benchmark_add_test(NAME user_counters_tabular_test COMMAND user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01s)
 
 compile_output_test(user_counters_thousands_test)
-add_test(NAME user_counters_thousands_test COMMAND user_counters_thousands_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME user_counters_thousands_test COMMAND user_counters_thousands_test --benchmark_min_time=0.01s)
 
 compile_output_test(memory_manager_test)
-add_test(NAME memory_manager_test COMMAND memory_manager_test --benchmark_min_time=0.01)
+benchmark_add_test(NAME memory_manager_test COMMAND memory_manager_test --benchmark_min_time=0.01s)
 
-check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG)
-if (BENCHMARK_HAS_CXX03_FLAG)
+# MSVC does not allow setting the language standard to C++98/03.
+if(NOT (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC"))
   compile_benchmark_test(cxx03_test)
   set_target_properties(cxx03_test
       PROPERTIES
@@ -170,22 +204,22 @@ if (BENCHMARK_HAS_CXX03_FLAG)
   # causing the test to fail to compile. To prevent this we explicitly disable
   # the warning.
   check_cxx_compiler_flag(-Wno-odr BENCHMARK_HAS_WNO_ODR)
-  if (BENCHMARK_ENABLE_LTO AND BENCHMARK_HAS_WNO_ODR)
-    set_target_properties(cxx03_test
-        PROPERTIES
-        LINK_FLAGS "-Wno-odr")
+  check_cxx_compiler_flag(-Wno-lto-type-mismatch BENCHMARK_HAS_WNO_LTO_TYPE_MISMATCH)
+  # Cannot set_target_properties multiple times here because the warnings will
+  # be overwritten on each call
+  set (DISABLE_LTO_WARNINGS "")
+  if (BENCHMARK_HAS_WNO_ODR)
+    set(DISABLE_LTO_WARNINGS "${DISABLE_LTO_WARNINGS} -Wno-odr")
   endif()
-  add_test(NAME cxx03 COMMAND cxx03_test --benchmark_min_time=0.01)
+  if (BENCHMARK_HAS_WNO_LTO_TYPE_MISMATCH)
+    set(DISABLE_LTO_WARNINGS "${DISABLE_LTO_WARNINGS} -Wno-lto-type-mismatch")
+  endif()
+  set_target_properties(cxx03_test PROPERTIES LINK_FLAGS "${DISABLE_LTO_WARNINGS}")
+  benchmark_add_test(NAME cxx03 COMMAND cxx03_test --benchmark_min_time=0.01s)
 endif()
 
-# Attempt to work around flaky test failures when running on Appveyor servers.
-if (DEFINED ENV{APPVEYOR})
-  set(COMPLEXITY_MIN_TIME "0.5")
-else()
-  set(COMPLEXITY_MIN_TIME "0.01")
-endif()
 compile_output_test(complexity_test)
-add_test(NAME complexity_benchmark COMMAND complexity_test --benchmark_min_time=${COMPLEXITY_MIN_TIME})
+benchmark_add_test(NAME complexity_benchmark COMMAND complexity_test --benchmark_min_time=1000000x)
 
 ###############################################################################
 # GoogleTest Unit Tests
@@ -200,7 +234,12 @@ if (BENCHMARK_ENABLE_GTEST_TESTS)
 
   macro(add_gtest name)
     compile_gtest(${name})
-    add_test(NAME ${name} COMMAND ${name})
+    benchmark_add_test(NAME ${name} COMMAND ${name})
+    if(WIN32 AND BUILD_SHARED_LIBS)
+      set_tests_properties(${name} PROPERTIES
+        ENVIRONMENT_MODIFICATION "PATH=path_list_prepend:$<TARGET_FILE_DIR:benchmark::benchmark>;PATH=path_list_prepend:$<TARGET_FILE_DIR:gmock_main>"
+      )
+    endif()
   endmacro()
 
   add_gtest(benchmark_gtest)
@@ -210,6 +249,8 @@ if (BENCHMARK_ENABLE_GTEST_TESTS)
   add_gtest(statistics_gtest)
   add_gtest(string_util_gtest)
   add_gtest(perf_counters_gtest)
+  add_gtest(time_unit_gtest)
+  add_gtest(min_time_parse_gtest)
 endif(BENCHMARK_ENABLE_GTEST_TESTS)
 
 ###############################################################################
diff --git a/third-party/benchmark/test/args_product_test.cc b/third-party/benchmark/test/args_product_test.cc
index d44f391f748006..63b8b71e45a069 100644
--- a/third-party/benchmark/test/args_product_test.cc
+++ b/third-party/benchmark/test/args_product_test.cc
@@ -23,7 +23,7 @@ class ArgsProductFixture : public ::benchmark::Fixture {
                         {2, 15, 10, 9},
                         {4, 5, 6, 11}}) {}
 
-  void SetUp(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
+  void SetUp(const ::benchmark::State& state) override {
     std::vector<int64_t> ranges = {state.range(0), state.range(1),
                                    state.range(2), state.range(3)};
 
@@ -34,7 +34,7 @@ class ArgsProductFixture : public ::benchmark::Fixture {
 
   // NOTE: This is not TearDown as we want to check after _all_ runs are
   // complete.
-  virtual ~ArgsProductFixture() {
+  ~ArgsProductFixture() override {
     if (actualValues != expectedValues) {
       std::cout << "EXPECTED\n";
       for (const auto& v : expectedValues) {
diff --git a/third-party/benchmark/test/basic_test.cc b/third-party/benchmark/test/basic_test.cc
index 3a8fd42a8cdd09..c25bec7ddd58cf 100644
--- a/third-party/benchmark/test/basic_test.cc
+++ b/third-party/benchmark/test/basic_test.cc
@@ -5,7 +5,8 @@
 
 void BM_empty(benchmark::State& state) {
   for (auto _ : state) {
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
 }
 BENCHMARK(BM_empty);
@@ -147,7 +148,7 @@ void BM_OneTemplateFunc(benchmark::State& state) {
   auto arg = state.range(0);
   T sum = 0;
   for (auto _ : state) {
-    sum += arg;
+    sum += static_cast<T>(arg);
   }
 }
 BENCHMARK(BM_OneTemplateFunc<int>)->Arg(1);
@@ -159,8 +160,8 @@ void BM_TwoTemplateFunc(benchmark::State& state) {
   A sum = 0;
   B prod = 1;
   for (auto _ : state) {
-    sum += arg;
-    prod *= arg;
+    sum += static_cast<A>(arg);
+    prod *= static_cast<B>(arg);
   }
 }
 BENCHMARK(BM_TwoTemplateFunc<int, double>)->Arg(1);
diff --git a/third-party/benchmark/test/benchmark_gtest.cc b/third-party/benchmark/test/benchmark_gtest.cc
index 14a885ba46da4d..2c9e555d92dcdd 100644
--- a/third-party/benchmark/test/benchmark_gtest.cc
+++ b/third-party/benchmark/test/benchmark_gtest.cc
@@ -3,12 +3,12 @@
 #include <vector>
 
 #include "../src/benchmark_register.h"
+#include "benchmark/benchmark.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
 namespace benchmark {
 namespace internal {
-extern std::map<std::string, std::string>* global_context;
 
 namespace {
 
@@ -38,8 +38,9 @@ TEST(AddRangeTest, Advanced64) {
 
 TEST(AddRangeTest, FullRange8) {
   std::vector<int8_t> dst;
-  AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), 8);
-  EXPECT_THAT(dst, testing::ElementsAre(1, 8, 64, 127));
+  AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), int8_t{8});
+  EXPECT_THAT(
+      dst, testing::ElementsAre(int8_t{1}, int8_t{8}, int8_t{64}, int8_t{127}));
 }
 
 TEST(AddRangeTest, FullRange64) {
@@ -129,11 +130,13 @@ TEST(AddRangeTest, FullNegativeRange64) {
 
 TEST(AddRangeTest, Simple8) {
   std::vector<int8_t> dst;
-  AddRange<int8_t>(&dst, 1, 8, 2);
-  EXPECT_THAT(dst, testing::ElementsAre(1, 2, 4, 8));
+  AddRange<int8_t>(&dst, int8_t{1}, int8_t{8}, int8_t{2});
+  EXPECT_THAT(dst,
+              testing::ElementsAre(int8_t{1}, int8_t{2}, int8_t{4}, int8_t{8}));
 }
 
 TEST(AddCustomContext, Simple) {
+  std::map<std::string, std::string> *&global_context = GetGlobalContext();
   EXPECT_THAT(global_context, nullptr);
 
   AddCustomContext("foo", "bar");
@@ -148,6 +151,7 @@ TEST(AddCustomContext, Simple) {
 }
 
 TEST(AddCustomContext, DuplicateKey) {
+  std::map<std::string, std::string> *&global_context = GetGlobalContext();
   EXPECT_THAT(global_context, nullptr);
 
   AddCustomContext("foo", "bar");
diff --git a/third-party/benchmark/test/benchmark_name_gtest.cc b/third-party/benchmark/test/benchmark_name_gtest.cc
index afb401c1f5328c..0a6746d04df7a4 100644
--- a/third-party/benchmark/test/benchmark_name_gtest.cc
+++ b/third-party/benchmark/test/benchmark_name_gtest.cc
@@ -32,6 +32,14 @@ TEST(BenchmarkNameTest, MinTime) {
   EXPECT_EQ(name.str(), "function_name/some_args:3/4/min_time:3.4s");
 }
 
+TEST(BenchmarkNameTest, MinWarmUpTime) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.args = "some_args:3/4";
+  name.min_warmup_time = "min_warmup_time:3.5s";
+  EXPECT_EQ(name.str(), "function_name/some_args:3/4/min_warmup_time:3.5s");
+}
+
 TEST(BenchmarkNameTest, Iterations) {
   auto name = BenchmarkName();
   name.function_name = "function_name";
diff --git a/third-party/benchmark/test/benchmark_random_interleaving_gtest.cc b/third-party/benchmark/test/benchmark_random_interleaving_gtest.cc
index d04befa8e38109..7f2086750d5346 100644
--- a/third-party/benchmark/test/benchmark_random_interleaving_gtest.cc
+++ b/third-party/benchmark/test/benchmark_random_interleaving_gtest.cc
@@ -51,10 +51,9 @@ class BenchmarkTest : public testing::Test {
   void Execute(const std::string& pattern) {
     queue->Clear();
 
-    BenchmarkReporter* reporter = new NullReporter;
+    std::unique_ptr<BenchmarkReporter> reporter(new NullReporter());
     FLAGS_benchmark_filter = pattern;
-    RunSpecifiedBenchmarks(reporter);
-    delete reporter;
+    RunSpecifiedBenchmarks(reporter.get());
 
     queue->Put("DONE");  // End marker
   }
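A minimal sketch of the ownership pattern the test above switches to: hold the reporter in a std::unique_ptr and hand RunSpecifiedBenchmarks a non-owning raw pointer. NullReporter is re-declared here only to keep the sketch self-contained, and BM_Noop is an illustrative benchmark, not part of the patch.

#include <benchmark/benchmark.h>

#include <memory>
#include <vector>

class NullReporter : public benchmark::BenchmarkReporter {
 public:
  bool ReportContext(const Context&) override { return true; }
  void ReportRuns(const std::vector<Run>&) override {}
};

static void BM_Noop(benchmark::State& state) {
  for (auto _ : state) {
  }
}
BENCHMARK(BM_Noop);

int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);
  std::unique_ptr<benchmark::BenchmarkReporter> reporter(new NullReporter());
  benchmark::RunSpecifiedBenchmarks(reporter.get());  // non-owning pointer
  benchmark::Shutdown();
}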
diff --git a/third-party/benchmark/test/benchmark_setup_teardown_test.cc b/third-party/benchmark/test/benchmark_setup_teardown_test.cc
index efa34e15c129b7..6c3cc2e58fbdeb 100644
--- a/third-party/benchmark/test/benchmark_setup_teardown_test.cc
+++ b/third-party/benchmark/test/benchmark_setup_teardown_test.cc
@@ -10,19 +10,19 @@
 
 // Test that Setup() and Teardown() are called exactly once
 // for each benchmark run (single-threaded).
-namespace single {
+namespace singlethreaded {
 static int setup_call = 0;
 static int teardown_call = 0;
-}  // namespace single
+}  // namespace singlethreaded
 static void DoSetup1(const benchmark::State& state) {
-  ++single::setup_call;
+  ++singlethreaded::setup_call;
 
   // Setup/Teardown should never be called with any thread_idx != 0.
   assert(state.thread_index() == 0);
 }
 
 static void DoTeardown1(const benchmark::State& state) {
-  ++single::teardown_call;
+  ++singlethreaded::teardown_call;
   assert(state.thread_index() == 0);
 }
 
@@ -80,11 +80,11 @@ int fixture_setup = 0;
 
 class FIXTURE_BECHMARK_NAME : public ::benchmark::Fixture {
  public:
-  void SetUp(const ::benchmark::State&) BENCHMARK_OVERRIDE {
+  void SetUp(const ::benchmark::State&) override {
     fixture_interaction::fixture_setup++;
   }
 
-  ~FIXTURE_BECHMARK_NAME() {}
+  ~FIXTURE_BECHMARK_NAME() override {}
 };
 
 BENCHMARK_F(FIXTURE_BECHMARK_NAME, BM_WithFixture)(benchmark::State& st) {
@@ -134,8 +134,8 @@ int main(int argc, char** argv) {
   assert(ret > 0);
 
   // Setup/Teardown is called once for each arg group (1,3,5,7).
-  assert(single::setup_call == 4);
-  assert(single::teardown_call == 4);
+  assert(singlethreaded::setup_call == 4);
+  assert(singlethreaded::teardown_call == 4);
 
   // 3 group of threads calling this function (3,5,10).
   assert(concurrent::setup_call.load(std::memory_order_relaxed) == 3);
@@ -145,7 +145,7 @@ int main(int argc, char** argv) {
 
   // Setup is called 4 times, once for each arg group (1,3,5,7)
   assert(fixture_interaction::setup == 4);
-  // Fixture::Setup is called everytime the bm routine is run.
+  // Fixture::Setup is called every time the bm routine is run.
   // The exact number is indeterministic, so we just assert that
   // it's more than setup.
   assert(fixture_interaction::fixture_setup > fixture_interaction::setup);
diff --git a/third-party/benchmark/test/benchmark_test.cc b/third-party/benchmark/test/benchmark_test.cc
index 2906cdcde997d8..8b14017d03a584 100644
--- a/third-party/benchmark/test/benchmark_test.cc
+++ b/third-party/benchmark/test/benchmark_test.cc
@@ -5,6 +5,7 @@
 #include <stdint.h>
 
 #include <chrono>
+#include <complex>
 #include <cstdlib>
 #include <iostream>
 #include <limits>
@@ -15,6 +16,7 @@
 #include <sstream>
 #include <string>
 #include <thread>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
@@ -26,7 +28,7 @@
 
 namespace {
 
-int BENCHMARK_NOINLINE Factorial(uint32_t n) {
+int BENCHMARK_NOINLINE Factorial(int n) {
   return (n == 1) ? 1 : n * Factorial(n - 1);
 }
 
@@ -74,7 +76,8 @@ BENCHMARK_RANGE(BM_CalculatePiRange, 1, 1024 * 1024);
 static void BM_CalculatePi(benchmark::State& state) {
   static const int depth = 1024;
   for (auto _ : state) {
-    benchmark::DoNotOptimize(CalculatePi(static_cast<int>(depth)));
+    double pi = CalculatePi(static_cast<int>(depth));
+    benchmark::DoNotOptimize(pi);
   }
 }
 BENCHMARK(BM_CalculatePi)->Threads(8);
@@ -90,7 +93,8 @@ static void BM_SetInsert(benchmark::State& state) {
     for (int j = 0; j < state.range(1); ++j) data.insert(rand());
   }
   state.SetItemsProcessed(state.iterations() * state.range(1));
-  state.SetBytesProcessed(state.iterations() * state.range(1) * sizeof(int));
+  state.SetBytesProcessed(state.iterations() * state.range(1) *
+                          static_cast<int64_t>(sizeof(int)));
 }
 
 // Test many inserts at once to reduce the total iterations needed. Otherwise,
@@ -108,7 +112,7 @@ static void BM_Sequential(benchmark::State& state) {
   }
   const int64_t items_processed = state.iterations() * state.range(0);
   state.SetItemsProcessed(items_processed);
-  state.SetBytesProcessed(items_processed * sizeof(v));
+  state.SetBytesProcessed(items_processed * static_cast<int64_t>(sizeof(v)));
 }
 BENCHMARK_TEMPLATE2(BM_Sequential, std::vector<int>, int)
     ->Range(1 << 0, 1 << 10);
@@ -122,7 +126,10 @@ static void BM_StringCompare(benchmark::State& state) {
   size_t len = static_cast<size_t>(state.range(0));
   std::string s1(len, '-');
   std::string s2(len, '-');
-  for (auto _ : state) benchmark::DoNotOptimize(s1.compare(s2));
+  for (auto _ : state) {
+    auto comp = s1.compare(s2);
+    benchmark::DoNotOptimize(comp);
+  }
 }
 BENCHMARK(BM_StringCompare)->Range(1, 1 << 20);
 
@@ -169,7 +176,7 @@ static void BM_ParallelMemset(benchmark::State& state) {
     for (int i = from; i < to; i++) {
       // No need to lock test_vector_mu as ranges
       // do not overlap between threads.
-      benchmark::DoNotOptimize(test_vector->at(i) = 1);
+      benchmark::DoNotOptimize(test_vector->at(static_cast<size_t>(i)) = 1);
     }
   }
 
@@ -220,6 +227,31 @@ void BM_non_template_args(benchmark::State& state, int, double) {
 }
 BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);
 
+template <class T, class U, class... ExtraArgs>
+void BM_template2_capture(benchmark::State& state, ExtraArgs&&... extra_args) {
+  static_assert(std::is_same<T, void>::value, "");
+  static_assert(std::is_same<U, char*>::value, "");
+  static_assert(std::is_same<ExtraArgs..., unsigned int>::value, "");
+  unsigned int dummy[sizeof...(ExtraArgs)] = {extra_args...};
+  assert(dummy[0] == 42);
+  for (auto _ : state) {
+  }
+}
+BENCHMARK_TEMPLATE2_CAPTURE(BM_template2_capture, void, char*, foo, 42U);
+BENCHMARK_CAPTURE((BM_template2_capture<void, char*>), foo, 42U);
+
+template <class T, class... ExtraArgs>
+void BM_template1_capture(benchmark::State& state, ExtraArgs&&... extra_args) {
+  static_assert(std::is_same<T, void>::value, "");
+  static_assert(std::is_same<ExtraArgs..., unsigned long>::value, "");
+  unsigned long dummy[sizeof...(ExtraArgs)] = {extra_args...};
+  assert(dummy[0] == 24);
+  for (auto _ : state) {
+  }
+}
+BENCHMARK_TEMPLATE1_CAPTURE(BM_template1_capture, void, foo, 24UL);
+BENCHMARK_CAPTURE(BM_template1_capture<void>, foo, 24UL);
+
 #endif  // BENCHMARK_HAS_CXX11
 
 static void BM_DenseThreadRanges(benchmark::State& st) {
@@ -244,4 +276,25 @@ BENCHMARK(BM_DenseThreadRanges)->Arg(1)->DenseThreadRange(1, 3);
 BENCHMARK(BM_DenseThreadRanges)->Arg(2)->DenseThreadRange(1, 4, 2);
 BENCHMARK(BM_DenseThreadRanges)->Arg(3)->DenseThreadRange(5, 14, 3);
 
+static void BM_BenchmarkName(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+
+  // Check that the benchmark name is passed correctly to `state`.
+  assert("BM_BenchmarkName" == state.name());
+}
+BENCHMARK(BM_BenchmarkName);
+
+// regression test for #1446
+template <typename type>
+static void BM_templated_test(benchmark::State& state) {
+  for (auto _ : state) {
+    type created_string;
+    benchmark::DoNotOptimize(created_string);
+  }
+}
+
+static auto BM_templated_test_double = BM_templated_test<std::complex<double>>;
+BENCHMARK(BM_templated_test_double);
+
 BENCHMARK_MAIN();
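A minimal usage sketch of the BENCHMARK_TEMPLATE1_CAPTURE macro exercised above, next to the pre-existing explicit-instantiation spelling it is equivalent to; BM_Fill, its template argument, and the captured size are illustrative, not from the patch.

#include <benchmark/benchmark.h>

#include <cstddef>
#include <string>

template <class Container, class... ExtraArgs>
void BM_Fill(benchmark::State& state, ExtraArgs&&... extra_args) {
  const std::size_t n[] = {static_cast<std::size_t>(extra_args)...};
  for (auto _ : state) {
    Container c(n[0], typename Container::value_type{});
    benchmark::DoNotOptimize(c);
  }
}
// Both registrations run the same templated body; only the spelling differs.
BENCHMARK_TEMPLATE1_CAPTURE(BM_Fill, std::string, filled_16, 16U);
BENCHMARK_CAPTURE(BM_Fill<std::string>, filled_16_alt, 16U);

BENCHMARK_MAIN();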
diff --git a/third-party/benchmark/test/clobber_memory_assembly_test.cc b/third-party/benchmark/test/clobber_memory_assembly_test.cc
index ab269130cd5cc3..54e26ccdadf4c1 100644
--- a/third-party/benchmark/test/clobber_memory_assembly_test.cc
+++ b/third-party/benchmark/test/clobber_memory_assembly_test.cc
@@ -3,6 +3,7 @@
 #ifdef __clang__
 #pragma clang diagnostic ignored "-Wreturn-type"
 #endif
+BENCHMARK_DISABLE_DEPRECATED_WARNING
 
 extern "C" {
 
diff --git a/third-party/benchmark/test/complexity_test.cc b/third-party/benchmark/test/complexity_test.cc
index ea268b54598800..fb4ad1ad53a982 100644
--- a/third-party/benchmark/test/complexity_test.cc
+++ b/third-party/benchmark/test/complexity_test.cc
@@ -26,7 +26,7 @@ int AddComplexityTest(const std::string &test_name,
   AddCases(
       TC_ConsoleOut,
       {{"^%bigo_name %bigo_str %bigo_str[ ]*$"},
-       {"^%bigo_name", MR_Not},  // Assert we didn't only matched a name.
+       {"^%bigo_name", MR_Not},  // Assert we we didn't only matched a name.
        {"^%rms_name %rms %rms[ ]*$", MR_Next}});
   AddCases(
       TC_JSONOut,
@@ -69,35 +69,44 @@ int AddComplexityTest(const std::string &test_name,
 
 void BM_Complexity_O1(benchmark::State &state) {
   for (auto _ : state) {
-    for (int i = 0; i < 1024; ++i) {
-      benchmark::DoNotOptimize(&i);
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+    long tmp = state.iterations();
+    benchmark::DoNotOptimize(tmp);
+    for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
+      benchmark::DoNotOptimize(state.iterations());
+      tmp *= state.iterations();
+      benchmark::DoNotOptimize(tmp);
     }
+
+    // always 42ns per iteration
+    state.SetIterationTime(42 * 1e-9);
   }
   state.SetComplexityN(state.range(0));
 }
-BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1);
-BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity();
 BENCHMARK(BM_Complexity_O1)
     ->Range(1, 1 << 18)
+    ->UseManualTime()
+    ->Complexity(benchmark::o1);
+BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->UseManualTime()->Complexity();
+BENCHMARK(BM_Complexity_O1)
+    ->Range(1, 1 << 18)
+    ->UseManualTime()
     ->Complexity([](benchmark::IterationCount) { return 1.0; });
 
-const char *one_test_name = "BM_Complexity_O1";
-const char *big_o_1_test_name = "BM_Complexity_O1_BigO";
-const char *rms_o_1_test_name = "BM_Complexity_O1_RMS";
-const char *enum_big_o_1 = "\\([0-9]+\\)";
-// FIXME: Tolerate both '(1)' and 'lgN' as output when the complexity is auto
-// deduced.
-// See https://github.com/google/benchmark/issues/272
-const char *auto_big_o_1 = "(\\([0-9]+\\))|(lgN)";
+const char *one_test_name = "BM_Complexity_O1/manual_time";
+const char *big_o_1_test_name = "BM_Complexity_O1/manual_time_BigO";
+const char *rms_o_1_test_name = "BM_Complexity_O1/manual_time_RMS";
+const char *enum_auto_big_o_1 = "\\([0-9]+\\)";
 const char *lambda_big_o_1 = "f\\(N\\)";
 
 // Add enum tests
 ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
-                     enum_big_o_1, /*family_index=*/0);
+                     enum_auto_big_o_1, /*family_index=*/0);
 
-// Add auto enum tests
+// Add auto tests
 ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
-                     auto_big_o_1, /*family_index=*/1);
+                     enum_auto_big_o_1, /*family_index=*/1);
 
 // Add lambda tests
 ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
@@ -107,42 +116,44 @@ ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
 // --------------------------- Testing BigO O(N) --------------------------- //
 // ========================================================================= //
 
-std::vector<int> ConstructRandomVector(int64_t size) {
-  std::vector<int> v;
-  v.reserve(static_cast<int>(size));
-  for (int i = 0; i < size; ++i) {
-    v.push_back(static_cast<int>(std::rand() % size));
-  }
-  return v;
-}
-
 void BM_Complexity_O_N(benchmark::State &state) {
-  auto v = ConstructRandomVector(state.range(0));
-  // Test worst case scenario (item not in vector)
-  const int64_t item_not_in_vector = state.range(0) * 2;
   for (auto _ : state) {
-    benchmark::DoNotOptimize(std::find(v.begin(), v.end(), item_not_in_vector));
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+    long tmp = state.iterations();
+    benchmark::DoNotOptimize(tmp);
+    for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
+      benchmark::DoNotOptimize(state.iterations());
+      tmp *= state.iterations();
+      benchmark::DoNotOptimize(tmp);
+    }
+
+    // 42ns per iteration per entry
+    state.SetIterationTime(static_cast<double>(state.range(0)) * 42.0 * 1e-9);
   }
   state.SetComplexityN(state.range(0));
 }
 BENCHMARK(BM_Complexity_O_N)
     ->RangeMultiplier(2)
-    ->Range(1 << 10, 1 << 16)
+    ->Range(1 << 10, 1 << 20)
+    ->UseManualTime()
     ->Complexity(benchmark::oN);
 BENCHMARK(BM_Complexity_O_N)
     ->RangeMultiplier(2)
-    ->Range(1 << 10, 1 << 16)
+    ->Range(1 << 10, 1 << 20)
+    ->UseManualTime()
+    ->Complexity();
+BENCHMARK(BM_Complexity_O_N)
+    ->RangeMultiplier(2)
+    ->Range(1 << 10, 1 << 20)
+    ->UseManualTime()
     ->Complexity([](benchmark::IterationCount n) -> double {
       return static_cast<double>(n);
     });
-BENCHMARK(BM_Complexity_O_N)
-    ->RangeMultiplier(2)
-    ->Range(1 << 10, 1 << 16)
-    ->Complexity();
 
-const char *n_test_name = "BM_Complexity_O_N";
-const char *big_o_n_test_name = "BM_Complexity_O_N_BigO";
-const char *rms_o_n_test_name = "BM_Complexity_O_N_RMS";
+const char *n_test_name = "BM_Complexity_O_N/manual_time";
+const char *big_o_n_test_name = "BM_Complexity_O_N/manual_time_BigO";
+const char *rms_o_n_test_name = "BM_Complexity_O_N/manual_time_RMS";
 const char *enum_auto_big_o_n = "N";
 const char *lambda_big_o_n = "f\\(N\\)";
 
@@ -150,40 +161,57 @@ const char *lambda_big_o_n = "f\\(N\\)";
 ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
                      enum_auto_big_o_n, /*family_index=*/3);
 
+// Add auto tests
+ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
+                     enum_auto_big_o_n, /*family_index=*/4);
+
 // Add lambda tests
 ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
-                     lambda_big_o_n, /*family_index=*/4);
+                     lambda_big_o_n, /*family_index=*/5);
 
 // ========================================================================= //
-// ------------------------- Testing BigO O(N*lgN) ------------------------- //
+// ------------------------- Testing BigO O(NlgN) ------------------------- //
 // ========================================================================= //
 
+static const double kLog2E = 1.44269504088896340736;
 static void BM_Complexity_O_N_log_N(benchmark::State &state) {
-  auto v = ConstructRandomVector(state.range(0));
   for (auto _ : state) {
-    std::sort(v.begin(), v.end());
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+    long tmp = state.iterations();
+    benchmark::DoNotOptimize(tmp);
+    for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
+      benchmark::DoNotOptimize(state.iterations());
+      tmp *= state.iterations();
+      benchmark::DoNotOptimize(tmp);
+    }
+
+    state.SetIterationTime(static_cast<double>(state.range(0)) * kLog2E *
+                           std::log(state.range(0)) * 42.0 * 1e-9);
   }
   state.SetComplexityN(state.range(0));
 }
-static const double kLog2E = 1.44269504088896340736;
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
-    ->Range(1 << 10, 1 << 16)
+    ->Range(1 << 10, 1U << 24)
+    ->UseManualTime()
     ->Complexity(benchmark::oNLogN);
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
-    ->Range(1 << 10, 1 << 16)
-    ->Complexity([](benchmark::IterationCount n) {
-      return kLog2E * n * log(static_cast<double>(n));
-    });
+    ->Range(1 << 10, 1U << 24)
+    ->UseManualTime()
+    ->Complexity();
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
-    ->Range(1 << 10, 1 << 16)
-    ->Complexity();
+    ->Range(1 << 10, 1U << 24)
+    ->UseManualTime()
+    ->Complexity([](benchmark::IterationCount n) {
+      return kLog2E * static_cast<double>(n) * std::log(static_cast<double>(n));
+    });
 
-const char *n_lg_n_test_name = "BM_Complexity_O_N_log_N";
-const char *big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_BigO";
-const char *rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_RMS";
+const char *n_lg_n_test_name = "BM_Complexity_O_N_log_N/manual_time";
+const char *big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N/manual_time_BigO";
+const char *rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N/manual_time_RMS";
 const char *enum_auto_big_o_n_lg_n = "NlgN";
 const char *lambda_big_o_n_lg_n = "f\\(N\\)";
 
@@ -192,11 +220,16 @@ ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
                      rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n,
                      /*family_index=*/6);
 
-// Add lambda tests
+// NOTE: auto big-o is wrong.
 ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
-                     rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n,
+                     rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n,
                      /*family_index=*/7);
 
+// Add lambda tests
+ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
+                     rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n,
+                     /*family_index=*/8);
+
 // ========================================================================= //
 // -------- Testing formatting of Complexity with captured args ------------ //
 // ========================================================================= //
@@ -205,19 +238,30 @@ void BM_ComplexityCaptureArgs(benchmark::State &state, int n) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
     benchmark::DoNotOptimize(state.iterations());
+    long tmp = state.iterations();
+    benchmark::DoNotOptimize(tmp);
+    for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
+      benchmark::DoNotOptimize(state.iterations());
+      tmp *= state.iterations();
+      benchmark::DoNotOptimize(tmp);
+    }
+
+    state.SetIterationTime(static_cast<double>(state.range(0)) * 42.0 * 1e-9);
   }
   state.SetComplexityN(n);
 }
 
 BENCHMARK_CAPTURE(BM_ComplexityCaptureArgs, capture_test, 100)
+    ->UseManualTime()
     ->Complexity(benchmark::oN)
     ->Ranges({{1, 2}, {3, 4}});
 
 const std::string complexity_capture_name =
-    "BM_ComplexityCaptureArgs/capture_test";
+    "BM_ComplexityCaptureArgs/capture_test/manual_time";
 
 ADD_COMPLEXITY_CASES(complexity_capture_name, complexity_capture_name + "_BigO",
-                     complexity_capture_name + "_RMS", "N", /*family_index=*/9);
+                     complexity_capture_name + "_RMS", "N",
+                     /*family_index=*/9);
 
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
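A minimal sketch of the manual-time pattern the reworked complexity tests rely on: report a synthetic, deterministic per-iteration time through SetIterationTime together with UseManualTime so the fitted Big-O stays stable. BM_SyntheticLinear and the 42ns-per-element cost are illustrative, not from the patch.

#include <benchmark/benchmark.h>

static void BM_SyntheticLinear(benchmark::State& state) {
  for (auto _ : state) {
    // Pretend each iteration costs 42ns per element of the problem size.
    state.SetIterationTime(static_cast<double>(state.range(0)) * 42e-9);
  }
  state.SetComplexityN(state.range(0));
}
BENCHMARK(BM_SyntheticLinear)
    ->Range(1 << 10, 1 << 20)
    ->UseManualTime()
    ->Complexity(benchmark::oN);

BENCHMARK_MAIN();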
diff --git a/third-party/benchmark/test/diagnostics_test.cc b/third-party/benchmark/test/diagnostics_test.cc
index c54d5b0d708a15..7c68a98929d851 100644
--- a/third-party/benchmark/test/diagnostics_test.cc
+++ b/third-party/benchmark/test/diagnostics_test.cc
@@ -49,7 +49,8 @@ void BM_diagnostic_test(benchmark::State& state) {
   if (called_once == false) try_invalid_pause_resume(state);
 
   for (auto _ : state) {
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
 
   if (called_once == false) try_invalid_pause_resume(state);
@@ -64,7 +65,8 @@ void BM_diagnostic_test_keep_running(benchmark::State& state) {
   if (called_once == false) try_invalid_pause_resume(state);
 
   while (state.KeepRunning()) {
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
 
   if (called_once == false) try_invalid_pause_resume(state);
@@ -74,7 +76,16 @@ void BM_diagnostic_test_keep_running(benchmark::State& state) {
 BENCHMARK(BM_diagnostic_test_keep_running);
 
 int main(int argc, char* argv[]) {
+#ifdef NDEBUG
+  // This test is exercising functionality for debug builds, which are not
+  // available in release builds. Skip the test if we are in that environment
+  // to avoid a test failure.
+  std::cout << "Diagnostic test disabled in release build" << std::endl;
+  (void)argc;
+  (void)argv;
+#else
   benchmark::internal::GetAbortHandler() = &TestHandler;
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
+#endif
 }
diff --git a/third-party/benchmark/test/donotoptimize_assembly_test.cc b/third-party/benchmark/test/donotoptimize_assembly_test.cc
index 2e86a51e223423..dc286f53e20f60 100644
--- a/third-party/benchmark/test/donotoptimize_assembly_test.cc
+++ b/third-party/benchmark/test/donotoptimize_assembly_test.cc
@@ -3,12 +3,16 @@
 #ifdef __clang__
 #pragma clang diagnostic ignored "-Wreturn-type"
 #endif
+BENCHMARK_DISABLE_DEPRECATED_WARNING
 
 extern "C" {
 
 extern int ExternInt;
 extern int ExternInt2;
 extern int ExternInt3;
+extern int BigArray[2049];
+
+const int ConstBigArray[2049]{};
 
 inline int Add42(int x) { return x + 42; }
 
@@ -23,7 +27,15 @@ struct Large {
   int value;
   int data[2];
 };
+
+struct ExtraLarge {
+  int arr[2049];
+};
 }
+
+extern ExtraLarge ExtraLargeObj;
+const ExtraLarge ConstExtraLargeObj{};
+
 // CHECK-LABEL: test_with_rvalue:
 extern "C" void test_with_rvalue() {
   benchmark::DoNotOptimize(Add42(0));
@@ -68,6 +80,22 @@ extern "C" void test_with_large_lvalue() {
   // CHECK: ret
 }
 
+// CHECK-LABEL: test_with_extra_large_lvalue_with_op:
+extern "C" void test_with_extra_large_lvalue_with_op() {
+  ExtraLargeObj.arr[16] = 42;
+  benchmark::DoNotOptimize(ExtraLargeObj);
+  // CHECK: movl $42, ExtraLargeObj+64(%rip)
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_big_array_with_op
+extern "C" void test_with_big_array_with_op() {
+  BigArray[16] = 42;
+  benchmark::DoNotOptimize(BigArray);
+  // CHECK: movl $42, BigArray+64(%rip)
+  // CHECK: ret
+}
+
 // CHECK-LABEL: test_with_non_trivial_lvalue:
 extern "C" void test_with_non_trivial_lvalue() {
   NotTriviallyCopyable NTC(ExternInt);
@@ -96,6 +124,18 @@ extern "C" void test_with_large_const_lvalue() {
   // CHECK: ret
 }
 
+// CHECK-LABEL: test_with_const_extra_large_obj:
+extern "C" void test_with_const_extra_large_obj() {
+  benchmark::DoNotOptimize(ConstExtraLargeObj);
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_const_big_array
+extern "C" void test_with_const_big_array() {
+  benchmark::DoNotOptimize(ConstBigArray);
+  // CHECK: ret
+}
+
 // CHECK-LABEL: test_with_non_trivial_const_lvalue:
 extern "C" void test_with_non_trivial_const_lvalue() {
   const NotTriviallyCopyable Obj(ExternInt);
diff --git a/third-party/benchmark/test/donotoptimize_test.cc b/third-party/benchmark/test/donotoptimize_test.cc
index c321f156a1e00f..04ec9386a3b404 100644
--- a/third-party/benchmark/test/donotoptimize_test.cc
+++ b/third-party/benchmark/test/donotoptimize_test.cc
@@ -4,9 +4,9 @@
 
 namespace {
 #if defined(__GNUC__)
-std::uint64_t double_up(const std::uint64_t x) __attribute__((const));
+std::int64_t double_up(const std::int64_t x) __attribute__((const));
 #endif
-std::uint64_t double_up(const std::uint64_t x) { return x * 2; }
+std::int64_t double_up(const std::int64_t x) { return x * 2; }
 }  // namespace
 
 // Using DoNotOptimize on types like BitRef seem to cause a lot of problems
@@ -29,6 +29,15 @@ struct BitRef {
 int main(int, char*[]) {
   // this test verifies compilation of DoNotOptimize() for some types
 
+  char buffer1[1] = "";
+  benchmark::DoNotOptimize(buffer1);
+
+  char buffer2[2] = "";
+  benchmark::DoNotOptimize(buffer2);
+
+  char buffer3[3] = "";
+  benchmark::DoNotOptimize(buffer3);
+
   char buffer8[8] = "";
   benchmark::DoNotOptimize(buffer8);
 
@@ -37,17 +46,24 @@ int main(int, char*[]) {
 
   char buffer1024[1024] = "";
   benchmark::DoNotOptimize(buffer1024);
-  benchmark::DoNotOptimize(&buffer1024[0]);
+  char* bptr = &buffer1024[0];
+  benchmark::DoNotOptimize(bptr);
 
   int x = 123;
   benchmark::DoNotOptimize(x);
-  benchmark::DoNotOptimize(&x);
+  int* xp = &x;
+  benchmark::DoNotOptimize(xp);
   benchmark::DoNotOptimize(x += 42);
 
-  benchmark::DoNotOptimize(double_up(x));
+  std::int64_t y = double_up(x);
+  benchmark::DoNotOptimize(y);
 
   // These tests are to e
-  benchmark::DoNotOptimize(BitRef::Make());
   BitRef lval = BitRef::Make();
   benchmark::DoNotOptimize(lval);
+
+#ifdef BENCHMARK_HAS_CXX11
+  // Check that an rvalue is accepted.
+  benchmark::DoNotOptimize(BitRef::Make());
+#endif
 }
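A minimal sketch of the DoNotOptimize idiom the updated tests settle on: bind the result of an expression to a named local and pass the lvalue to the sink, instead of passing the temporary directly. BM_LookupIdiom and the map contents are illustrative, not from the patch.

#include <benchmark/benchmark.h>

#include <map>

static void BM_LookupIdiom(benchmark::State& state) {
  const std::map<int, int> m{{1, 2}, {3, 4}};
  for (auto _ : state) {
    auto it = m.find(3);           // bind the result first...
    benchmark::DoNotOptimize(it);  // ...then hand the lvalue to the sink
  }
}
BENCHMARK(BM_LookupIdiom);
BENCHMARK_MAIN();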
diff --git a/third-party/benchmark/test/filter_test.cc b/third-party/benchmark/test/filter_test.cc
index a567de2dd58e42..4c8b8ea488ad0d 100644
--- a/third-party/benchmark/test/filter_test.cc
+++ b/third-party/benchmark/test/filter_test.cc
@@ -14,28 +14,27 @@ namespace {
 
 class TestReporter : public benchmark::ConsoleReporter {
  public:
-  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
+  bool ReportContext(const Context& context) override {
     return ConsoleReporter::ReportContext(context);
   };
 
-  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
+  void ReportRuns(const std::vector<Run>& report) override {
     ++count_;
-    max_family_index_ =
-        std::max<size_t>(max_family_index_, report[0].family_index);
+    max_family_index_ = std::max(max_family_index_, report[0].family_index);
     ConsoleReporter::ReportRuns(report);
   };
 
   TestReporter() : count_(0), max_family_index_(0) {}
 
-  virtual ~TestReporter() {}
+  ~TestReporter() override {}
 
-  size_t GetCount() const { return count_; }
+  int GetCount() const { return count_; }
 
-  size_t GetMaxFamilyIndex() const { return max_family_index_; }
+  int64_t GetMaxFamilyIndex() const { return max_family_index_; }
 
  private:
-  mutable size_t count_;
-  mutable size_t max_family_index_;
+  mutable int count_;
+  mutable int64_t max_family_index_;
 };
 
 }  // end namespace
@@ -79,13 +78,13 @@ int main(int argc, char** argv) {
   benchmark::Initialize(&argc, argv);
 
   TestReporter test_reporter;
-  const size_t returned_count =
-      benchmark::RunSpecifiedBenchmarks(&test_reporter);
+  const int64_t returned_count =
+      static_cast<int64_t>(benchmark::RunSpecifiedBenchmarks(&test_reporter));
 
   if (argc == 2) {
     // Make sure we ran all of the tests
     std::stringstream ss(argv[1]);
-    size_t expected_return;
+    int64_t expected_return;
     ss >> expected_return;
 
     if (returned_count != expected_return) {
@@ -95,8 +94,8 @@ int main(int argc, char** argv) {
       return -1;
     }
 
-    const size_t expected_reports = list_only ? 0 : expected_return;
-    const size_t reports_count = test_reporter.GetCount();
+    const int64_t expected_reports = list_only ? 0 : expected_return;
+    const int64_t reports_count = test_reporter.GetCount();
     if (reports_count != expected_reports) {
       std::cerr << "ERROR: Expected " << expected_reports
                 << " tests to be run but reported_count = " << reports_count
@@ -104,8 +103,8 @@ int main(int argc, char** argv) {
       return -1;
     }
 
-    const size_t max_family_index = test_reporter.GetMaxFamilyIndex();
-    const size_t num_families = reports_count == 0 ? 0 : 1 + max_family_index;
+    const int64_t max_family_index = test_reporter.GetMaxFamilyIndex();
+    const int64_t num_families = reports_count == 0 ? 0 : 1 + max_family_index;
     if (num_families != expected_reports) {
       std::cerr << "ERROR: Expected " << expected_reports
                 << " test families to be run but num_families = "
diff --git a/third-party/benchmark/test/fixture_test.cc b/third-party/benchmark/test/fixture_test.cc
index af650dbd0661a7..d1093ebf52fc83 100644
--- a/third-party/benchmark/test/fixture_test.cc
+++ b/third-party/benchmark/test/fixture_test.cc
@@ -8,21 +8,21 @@
 
 class FIXTURE_BECHMARK_NAME : public ::benchmark::Fixture {
  public:
-  void SetUp(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
+  void SetUp(const ::benchmark::State& state) override {
     if (state.thread_index() == 0) {
       assert(data.get() == nullptr);
       data.reset(new int(42));
     }
   }
 
-  void TearDown(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
+  void TearDown(const ::benchmark::State& state) override {
     if (state.thread_index() == 0) {
       assert(data.get() != nullptr);
       data.reset();
     }
   }
 
-  ~FIXTURE_BECHMARK_NAME() { assert(data == nullptr); }
+  ~FIXTURE_BECHMARK_NAME() override { assert(data == nullptr); }
 
   std::unique_ptr<int> data;
 };
diff --git a/third-party/benchmark/test/link_main_test.cc b/third-party/benchmark/test/link_main_test.cc
index 241ad5c3905e9f..131937eebca9d7 100644
--- a/third-party/benchmark/test/link_main_test.cc
+++ b/third-party/benchmark/test/link_main_test.cc
@@ -2,7 +2,8 @@
 
 void BM_empty(benchmark::State& state) {
   for (auto _ : state) {
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
 }
 BENCHMARK(BM_empty);
diff --git a/third-party/benchmark/test/map_test.cc b/third-party/benchmark/test/map_test.cc
index 509613457c1b89..0fdba7c87c4fe9 100644
--- a/third-party/benchmark/test/map_test.cc
+++ b/third-party/benchmark/test/map_test.cc
@@ -24,7 +24,8 @@ static void BM_MapLookup(benchmark::State& state) {
     m = ConstructRandomMap(size);
     state.ResumeTiming();
     for (int i = 0; i < size; ++i) {
-      benchmark::DoNotOptimize(m.find(std::rand() % size));
+      auto it = m.find(std::rand() % size);
+      benchmark::DoNotOptimize(it);
     }
   }
   state.SetItemsProcessed(state.iterations() * size);
@@ -34,11 +35,11 @@ BENCHMARK(BM_MapLookup)->Range(1 << 3, 1 << 12);
 // Using fixtures.
 class MapFixture : public ::benchmark::Fixture {
  public:
-  void SetUp(const ::benchmark::State& st) BENCHMARK_OVERRIDE {
+  void SetUp(const ::benchmark::State& st) override {
     m = ConstructRandomMap(static_cast<int>(st.range(0)));
   }
 
-  void TearDown(const ::benchmark::State&) BENCHMARK_OVERRIDE { m.clear(); }
+  void TearDown(const ::benchmark::State&) override { m.clear(); }
 
   std::map<int, int> m;
 };
@@ -47,7 +48,8 @@ BENCHMARK_DEFINE_F(MapFixture, Lookup)(benchmark::State& state) {
   const int size = static_cast<int>(state.range(0));
   for (auto _ : state) {
     for (int i = 0; i < size; ++i) {
-      benchmark::DoNotOptimize(m.find(std::rand() % size));
+      auto it = m.find(std::rand() % size);
+      benchmark::DoNotOptimize(it);
     }
   }
   state.SetItemsProcessed(state.iterations() * size);
diff --git a/third-party/benchmark/test/memory_manager_test.cc b/third-party/benchmark/test/memory_manager_test.cc
index f0c192fcbd00d9..4df674d586ed73 100644
--- a/third-party/benchmark/test/memory_manager_test.cc
+++ b/third-party/benchmark/test/memory_manager_test.cc
@@ -5,16 +5,17 @@
 #include "output_test.h"
 
 class TestMemoryManager : public benchmark::MemoryManager {
-  void Start() BENCHMARK_OVERRIDE {}
-  void Stop(Result* result) BENCHMARK_OVERRIDE {
-    result->num_allocs = 42;
-    result->max_bytes_used = 42000;
+  void Start() override {}
+  void Stop(Result& result) override {
+    result.num_allocs = 42;
+    result.max_bytes_used = 42000;
   }
 };
 
 void BM_empty(benchmark::State& state) {
   for (auto _ : state) {
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
 }
 BENCHMARK(BM_empty);
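
The hunk above moves MemoryManager::Stop from a Result* to a Result&
parameter. A minimal sketch of a manager written against the new signature and
hooked up via benchmark::RegisterMemoryManager; the class name and the zero
values are illustrative, not part of the patch:

    #include "benchmark/benchmark.h"

    // Custom memory manager using the new Stop(Result&) signature.
    class NullMemoryManager : public benchmark::MemoryManager {
     public:
      void Start() override {}              // called before each measured run
      void Stop(Result& result) override {  // called after; report the totals
        result.num_allocs = 0;
        result.max_bytes_used = 0;
      }
    };

    int main(int argc, char** argv) {
      NullMemoryManager mm;
      benchmark::RegisterMemoryManager(&mm);
      benchmark::Initialize(&argc, argv);
      benchmark::RunSpecifiedBenchmarks();
      benchmark::RegisterMemoryManager(nullptr);  // unhook before mm goes away
      return 0;
    }
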
diff --git a/third-party/benchmark/test/multiple_ranges_test.cc b/third-party/benchmark/test/multiple_ranges_test.cc
index 7618c4da0892a2..5300a96036c1b1 100644
--- a/third-party/benchmark/test/multiple_ranges_test.cc
+++ b/third-party/benchmark/test/multiple_ranges_test.cc
@@ -28,7 +28,7 @@ class MultipleRangesFixture : public ::benchmark::Fixture {
                         {2, 7, 15},
                         {7, 6, 3}}) {}
 
-  void SetUp(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
+  void SetUp(const ::benchmark::State& state) override {
     std::vector<int64_t> ranges = {state.range(0), state.range(1),
                                    state.range(2)};
 
@@ -39,7 +39,7 @@ class MultipleRangesFixture : public ::benchmark::Fixture {
 
   // NOTE: This is not TearDown as we want to check after _all_ runs are
   // complete.
-  virtual ~MultipleRangesFixture() {
+  ~MultipleRangesFixture() override {
     if (actualValues != expectedValues) {
       std::cout << "EXPECTED\n";
       for (const auto& v : expectedValues) {
diff --git a/third-party/benchmark/test/options_test.cc b/third-party/benchmark/test/options_test.cc
index d424d40b9518d1..a1b209f3eb334a 100644
--- a/third-party/benchmark/test/options_test.cc
+++ b/third-party/benchmark/test/options_test.cc
@@ -33,6 +33,8 @@ BENCHMARK(BM_basic)->DenseRange(10, 15);
 BENCHMARK(BM_basic)->Args({42, 42});
 BENCHMARK(BM_basic)->Ranges({{64, 512}, {64, 512}});
 BENCHMARK(BM_basic)->MinTime(0.7);
+BENCHMARK(BM_basic)->MinWarmUpTime(0.8);
+BENCHMARK(BM_basic)->MinTime(0.1)->MinWarmUpTime(0.2);
 BENCHMARK(BM_basic)->UseRealTime();
 BENCHMARK(BM_basic)->ThreadRange(2, 4);
 BENCHMARK(BM_basic)->ThreadPerCpu();
@@ -65,8 +67,8 @@ void BM_explicit_iteration_count(benchmark::State& state) {
 
   // Test that the requested iteration count is respected.
   assert(state.max_iterations == 42);
-  size_t actual_iterations = 0;
-  for (auto _ : state) ++actual_iterations;
+  for (auto _ : state) {
+  }
   assert(state.iterations() == state.max_iterations);
   assert(state.iterations() == 42);
 }
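
Among the options newly exercised in this file is MinWarmUpTime, which asks
the runner to spend a minimum amount of un-measured warm-up time on a
benchmark before the timed phase. A short usage sketch (the benchmark name and
durations are illustrative):

    #include "benchmark/benchmark.h"

    static void BM_WarmedUp(benchmark::State& state) {
      for (auto _ : state) {
      }
    }
    // Warm up for at least 0.5 s, then measure for at least 1 s.
    BENCHMARK(BM_WarmedUp)->MinWarmUpTime(0.5)->MinTime(1.0);
    BENCHMARK_MAIN();
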
diff --git a/third-party/benchmark/test/output_test.h b/third-party/benchmark/test/output_test.h
index c6ff8ef2d3039a..c08fe1d87e6c8a 100644
--- a/third-party/benchmark/test/output_test.h
+++ b/third-party/benchmark/test/output_test.h
@@ -85,7 +85,7 @@ std::string GetFileReporterOutput(int argc, char* argv[]);
 struct Results;
 typedef std::function<void(Results const&)> ResultsCheckFn;
 
-size_t AddChecker(const char* bm_name_pattern, const ResultsCheckFn& fn);
+size_t AddChecker(const std::string& bm_name_pattern, const ResultsCheckFn& fn);
 
 // Class holding the results of a benchmark.
 // It is passed in calls to checker functions.
@@ -117,7 +117,7 @@ struct Results {
 
   // get the string for a result by name, or nullptr if the name
   // is not found
-  const std::string* Get(const char* entry_name) const {
+  const std::string* Get(const std::string& entry_name) const {
     auto it = values.find(entry_name);
     if (it == values.end()) return nullptr;
     return &it->second;
@@ -126,12 +126,12 @@ struct Results {
   // get a result by name, parsed as a specific type.
   // NOTE: for counters, use GetCounterAs instead.
   template <class T>
-  T GetAs(const char* entry_name) const;
+  T GetAs(const std::string& entry_name) const;
 
   // counters are written as doubles, so they have to be read first
   // as a double, and only then converted to the asked type.
   template <class T>
-  T GetCounterAs(const char* entry_name) const {
+  T GetCounterAs(const std::string& entry_name) const {
     double dval = GetAs<double>(entry_name);
     T tval = static_cast<T>(dval);
     return tval;
@@ -139,7 +139,7 @@ struct Results {
 };
 
 template <class T>
-T Results::GetAs(const char* entry_name) const {
+T Results::GetAs(const std::string& entry_name) const {
   auto* sv = Get(entry_name);
   BM_CHECK(sv != nullptr && !sv->empty());
   std::stringstream ss;
diff --git a/third-party/benchmark/test/output_test_helper.cc b/third-party/benchmark/test/output_test_helper.cc
index 81584cbf778b03..265f28aae7c7cb 100644
--- a/third-party/benchmark/test/output_test_helper.cc
+++ b/third-party/benchmark/test/output_test_helper.cc
@@ -45,7 +45,7 @@ SubMap& GetSubstitutions() {
   static SubMap map = {
       {"%float", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"},
       // human-readable float
-      {"%hrfloat", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?[kMGTPEZYmunpfazy]?"},
+      {"%hrfloat", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?[kKMGTPEZYmunpfazy]?i?"},
       {"%percentage", percentage_re},
       {"%int", "[ ]*[0-9]+"},
       {" %s ", "[ ]+"},
@@ -65,6 +65,7 @@ SubMap& GetSubstitutions() {
       {"%csv_us_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",us,,,,,"},
       {"%csv_ms_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ms,,,,,"},
       {"%csv_s_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",s,,,,,"},
+      {"%csv_cv_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",,,,,,"},
       {"%csv_bytes_report",
        "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re + ",,,,"},
       {"%csv_items_report",
@@ -143,7 +144,7 @@ class TestReporter : public benchmark::BenchmarkReporter {
   TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
       : reporters_(std::move(reps)) {}
 
-  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
+  bool ReportContext(const Context& context) override {
     bool last_ret = false;
     bool first = true;
     for (auto rep : reporters_) {
@@ -157,10 +158,10 @@ class TestReporter : public benchmark::BenchmarkReporter {
     return last_ret;
   }
 
-  void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
+  void ReportRuns(const std::vector<Run>& report) override {
     for (auto rep : reporters_) rep->ReportRuns(report);
   }
-  void Finalize() BENCHMARK_OVERRIDE {
+  void Finalize() override {
     for (auto rep : reporters_) rep->Finalize();
   }
 
@@ -248,9 +249,8 @@ void ResultsChecker::CheckResults(std::stringstream& output) {
       if (!p.regex->Match(r.name)) {
         BM_VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n";
         continue;
-      } else {
-        BM_VLOG(2) << p.regex_str << " is matched by " << r.name << "\n";
       }
+      BM_VLOG(2) << p.regex_str << " is matched by " << r.name << "\n";
       BM_VLOG(1) << "Checking results of " << r.name << ": ... \n";
       p.fn(r);
       BM_VLOG(1) << "Checking results of " << r.name << ": OK.\n";
@@ -300,7 +300,7 @@ std::vector<std::string> ResultsChecker::SplitCsv_(const std::string& line) {
 
 }  // end namespace internal
 
-size_t AddChecker(const char* bm_name, const ResultsCheckFn& fn) {
+size_t AddChecker(const std::string& bm_name, const ResultsCheckFn& fn) {
   auto& rc = internal::GetResultsChecker();
   rc.Add(bm_name, fn);
   return rc.results.size();
@@ -328,16 +328,18 @@ double Results::GetTime(BenchmarkTime which) const {
   BM_CHECK(unit);
   if (*unit == "ns") {
     return val * 1.e-9;
-  } else if (*unit == "us") {
+  }
+  if (*unit == "us") {
     return val * 1.e-6;
-  } else if (*unit == "ms") {
+  }
+  if (*unit == "ms") {
     return val * 1.e-3;
-  } else if (*unit == "s") {
+  }
+  if (*unit == "s") {
     return val;
-  } else {
-    BM_CHECK(1 == 0) << "unknown time unit: " << *unit;
-    return 0;
   }
+  BM_CHECK(1 == 0) << "unknown time unit: " << *unit;
+  return 0;
 }
 
 // ========================================================================= //
@@ -393,14 +395,14 @@ void RunOutputTests(int argc, char* argv[]) {
   benchmark::JSONReporter JR;
   benchmark::CSVReporter CSVR;
   struct ReporterTest {
-    const char* name;
+    std::string name;
     std::vector<TestCase>& output_cases;
     std::vector<TestCase>& error_cases;
     benchmark::BenchmarkReporter& reporter;
     std::stringstream out_stream;
     std::stringstream err_stream;
 
-    ReporterTest(const char* n, std::vector<TestCase>& out_tc,
+    ReporterTest(const std::string& n, std::vector<TestCase>& out_tc,
                  std::vector<TestCase>& err_tc,
                  benchmark::BenchmarkReporter& br)
         : name(n), output_cases(out_tc), error_cases(err_tc), reporter(br) {
@@ -408,12 +410,12 @@ void RunOutputTests(int argc, char* argv[]) {
       reporter.SetErrorStream(&err_stream);
     }
   } TestCases[] = {
-      {"ConsoleReporter", GetTestCaseList(TC_ConsoleOut),
+      {std::string("ConsoleReporter"), GetTestCaseList(TC_ConsoleOut),
        GetTestCaseList(TC_ConsoleErr), CR},
-      {"JSONReporter", GetTestCaseList(TC_JSONOut), GetTestCaseList(TC_JSONErr),
-       JR},
-      {"CSVReporter", GetTestCaseList(TC_CSVOut), GetTestCaseList(TC_CSVErr),
-       CSVR},
+      {std::string("JSONReporter"), GetTestCaseList(TC_JSONOut),
+       GetTestCaseList(TC_JSONErr), JR},
+      {std::string("CSVReporter"), GetTestCaseList(TC_CSVOut),
+       GetTestCaseList(TC_CSVErr), CSVR},
   };
 
   // Create the test reporter and run the benchmarks.
@@ -422,7 +424,8 @@ void RunOutputTests(int argc, char* argv[]) {
   benchmark::RunSpecifiedBenchmarks(&test_rep);
 
   for (auto& rep_test : TestCases) {
-    std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n";
+    std::string msg =
+        std::string("\nTesting ") + rep_test.name + std::string(" Output\n");
     std::string banner(msg.size() - 1, '-');
     std::cout << banner << msg << banner << "\n";
 
@@ -439,7 +442,7 @@ void RunOutputTests(int argc, char* argv[]) {
   // the checks to subscribees.
   auto& csv = TestCases[2];
   // would use == but gcc spits a warning
-  BM_CHECK(std::strcmp(csv.name, "CSVReporter") == 0);
+  BM_CHECK(csv.name == std::string("CSVReporter"));
   internal::GetResultsChecker().CheckResults(csv.out_stream);
 }
 
diff --git a/third-party/benchmark/test/perf_counters_gtest.cc b/third-party/benchmark/test/perf_counters_gtest.cc
index 3eac62463bc5b3..2e63049285d751 100644
--- a/third-party/benchmark/test/perf_counters_gtest.cc
+++ b/third-party/benchmark/test/perf_counters_gtest.cc
@@ -1,6 +1,8 @@
+#include <random>
 #include <thread>
 
 #include "../src/perf_counters.h"
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
 #ifndef GTEST_SKIP
@@ -11,12 +13,15 @@ struct MsgHandler {
 #endif
 
 using benchmark::internal::PerfCounters;
+using benchmark::internal::PerfCountersMeasurement;
 using benchmark::internal::PerfCounterValues;
+using ::testing::AllOf;
+using ::testing::Gt;
+using ::testing::Lt;
 
 namespace {
 const char kGenericPerfEvent1[] = "CYCLES";
-const char kGenericPerfEvent2[] = "BRANCHES";
-const char kGenericPerfEvent3[] = "INSTRUCTIONS";
+const char kGenericPerfEvent2[] = "INSTRUCTIONS";
 
 TEST(PerfCountersTest, Init) {
   EXPECT_EQ(PerfCounters::Initialize(), PerfCounters::kSupported);
@@ -27,7 +32,7 @@ TEST(PerfCountersTest, OneCounter) {
     GTEST_SKIP() << "Performance counters not supported.\n";
   }
   EXPECT_TRUE(PerfCounters::Initialize());
-  EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1}).IsValid());
+  EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1}).num_counters(), 1);
 }
 
 TEST(PerfCountersTest, NegativeTest) {
@@ -36,29 +41,44 @@ TEST(PerfCountersTest, NegativeTest) {
     return;
   }
   EXPECT_TRUE(PerfCounters::Initialize());
-  EXPECT_FALSE(PerfCounters::Create({}).IsValid());
-  EXPECT_FALSE(PerfCounters::Create({""}).IsValid());
-  EXPECT_FALSE(PerfCounters::Create({"not a counter name"}).IsValid());
+  // Safety checks
+  // Create() will always return a valid object, even when passed no
+  // arguments or invalid names, since the new behavior is to warn about
+  // and drop unsupported counters.
+  EXPECT_EQ(PerfCounters::Create({}).num_counters(), 0);
+  EXPECT_EQ(PerfCounters::Create({""}).num_counters(), 0);
+  EXPECT_EQ(PerfCounters::Create({"not a counter name"}).num_counters(), 0);
   {
-    EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
-                                      kGenericPerfEvent3})
-                    .IsValid());
-  }
-  EXPECT_FALSE(
-      PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1})
-          .IsValid());
-  EXPECT_FALSE(PerfCounters::Create({kGenericPerfEvent3, "not a counter name",
-                                     kGenericPerfEvent1})
-                   .IsValid());
+    // Try sneaking in a bad egg to see if it is filtered out. The
+    // number of counters has to be two, not zero
+    auto counter =
+        PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1});
+    EXPECT_EQ(counter.num_counters(), 2);
+    EXPECT_EQ(counter.names(), std::vector<std::string>(
+                                   {kGenericPerfEvent2, kGenericPerfEvent1}));
+  }
+  {
+    // Try sneaking in an outrageous counter, like a fat finger mistake
+    auto counter = PerfCounters::Create(
+        {kGenericPerfEvent2, "not a counter name", kGenericPerfEvent1});
+    EXPECT_EQ(counter.num_counters(), 2);
+    EXPECT_EQ(counter.names(), std::vector<std::string>(
+                                   {kGenericPerfEvent2, kGenericPerfEvent1}));
+  }
   {
-    EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
-                                      kGenericPerfEvent3})
-                    .IsValid());
-  }
-  EXPECT_FALSE(
-      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
-                            kGenericPerfEvent3, "MISPREDICTED_BRANCH_RETIRED"})
-          .IsValid());
+    // Finally try a golden input - it should accept both of them
+    EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2})
+                  .num_counters(),
+              2);
+  }
+  {
+    // Add a bad apple at the end of the chain to check the edges
+    auto counter = PerfCounters::Create(
+        {kGenericPerfEvent1, kGenericPerfEvent2, "bad event name"});
+    EXPECT_EQ(counter.num_counters(), 2);
+    EXPECT_EQ(counter.names(), std::vector<std::string>(
+                                   {kGenericPerfEvent1, kGenericPerfEvent2}));
+  }
 }
 
 TEST(PerfCountersTest, Read1Counter) {
@@ -67,7 +87,7 @@ TEST(PerfCountersTest, Read1Counter) {
   }
   EXPECT_TRUE(PerfCounters::Initialize());
   auto counters = PerfCounters::Create({kGenericPerfEvent1});
-  EXPECT_TRUE(counters.IsValid());
+  EXPECT_EQ(counters.num_counters(), 1);
   PerfCounterValues values1(1);
   EXPECT_TRUE(counters.Snapshot(&values1));
   EXPECT_GT(values1[0], 0);
@@ -84,7 +104,7 @@ TEST(PerfCountersTest, Read2Counters) {
   EXPECT_TRUE(PerfCounters::Initialize());
   auto counters =
       PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
-  EXPECT_TRUE(counters.IsValid());
+  EXPECT_EQ(counters.num_counters(), 2);
   PerfCounterValues values1(2);
   EXPECT_TRUE(counters.Snapshot(&values1));
   EXPECT_GT(values1[0], 0);
@@ -95,30 +115,121 @@ TEST(PerfCountersTest, Read2Counters) {
   EXPECT_GT(values2[1], 0);
 }
 
-size_t do_work() {
-  size_t res = 0;
-  for (size_t i = 0; i < 100000000; ++i) res += i * i;
-  return res;
+TEST(PerfCountersTest, ReopenExistingCounters) {
+  // This test works on recent and old Intel hardware, Pixel 3, and Pixel 6.
+  // However, we cannot assume more than 2 HW counters due to Pixel 6.
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  std::vector<std::string> kMetrics({kGenericPerfEvent1});
+  std::vector<PerfCounters> counters(2);
+  for (auto& counter : counters) {
+    counter = PerfCounters::Create(kMetrics);
+  }
+  PerfCounterValues values(1);
+  EXPECT_TRUE(counters[0].Snapshot(&values));
+  EXPECT_TRUE(counters[1].Snapshot(&values));
 }
 
-void measure(size_t threadcount, PerfCounterValues* values1,
-             PerfCounterValues* values2) {
-  BM_CHECK_NE(values1, nullptr);
-  BM_CHECK_NE(values2, nullptr);
+TEST(PerfCountersTest, CreateExistingMeasurements) {
+  // This test works (i.e. causes reads to fail) under the current
+  // assumptions about hardware capabilities (i.e. a small number (2)
+  // of hardware counters), the same assumptions as the previous test,
+  // ReopenExistingCounters.
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+
+  // This means we will try 10 counters, but at this time we can only
+  // guarantee that 3 will work. Perhaps in the future we could use
+  // libpfm to query the hardware limits on this particular
+  // platform.
+  const int kMaxCounters = 10;
+  const int kMinValidCounters = 2;
+
+  // Let's use a ubiquitous counter that is guaranteed to work
+  // on all platforms
+  const std::vector<std::string> kMetrics{"cycles"};
+
+  // Cannot create a vector of actual objects because the copy
+  // constructor of PerfCounters is deleted - and so it is implicitly
+  // deleted for PerfCountersMeasurement too.
+  std::vector<std::unique_ptr<PerfCountersMeasurement>>
+      perf_counter_measurements;
+
+  perf_counter_measurements.reserve(kMaxCounters);
+  for (int j = 0; j < kMaxCounters; ++j) {
+    perf_counter_measurements.emplace_back(
+        new PerfCountersMeasurement(kMetrics));
+  }
+
+  std::vector<std::pair<std::string, double>> measurements;
+
+  // Start all counters together to see if they hold
+  size_t max_counters = kMaxCounters;
+  for (size_t i = 0; i < kMaxCounters; ++i) {
+    auto& counter(*perf_counter_measurements[i]);
+    EXPECT_EQ(counter.num_counters(), 1);
+    if (!counter.Start()) {
+      max_counters = i;
+      break;
+    };
+  }
+
+  ASSERT_GE(max_counters, kMinValidCounters);
+
+  // Stop all of them together
+  for (size_t i = 0; i < max_counters; ++i) {
+    auto& counter(*perf_counter_measurements[i]);
+    EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
+  }
+
+  // Start/stop individually
+  for (size_t i = 0; i < max_counters; ++i) {
+    auto& counter(*perf_counter_measurements[i]);
+    measurements.clear();
+    counter.Start();
+    EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
+  }
+}
+
+// We try to do some meaningful work here, but the compiler
+// insists on optimizing away our loop, so we had to add a
+// no-optimize macro. In case that fails, we also added some
+// entropy to the workload.
+
+BENCHMARK_DONT_OPTIMIZE size_t do_work() {
+  static std::mt19937 rd{std::random_device{}()};
+  static std::uniform_int_distribution<size_t> mrand(0, 10);
+  const size_t kNumLoops = 1000000;
+  size_t sum = 0;
+  for (size_t j = 0; j < kNumLoops; ++j) {
+    sum += mrand(rd);
+  }
+  benchmark::DoNotOptimize(sum);
+  return sum;
+}
+
+void measure(size_t threadcount, PerfCounterValues* before,
+             PerfCounterValues* after) {
+  BM_CHECK_NE(before, nullptr);
+  BM_CHECK_NE(after, nullptr);
   std::vector<std::thread> threads(threadcount);
   auto work = [&]() { BM_CHECK(do_work() > 1000); };
 
   // We need to first set up the counters, then start the threads, so the
-  // threads would inherit the counters. But later, we need to first destroy the
-  // thread pool (so all the work finishes), then measure the counters. So the
-  // scopes overlap, and we need to explicitly control the scope of the
+  // threads would inherit the counters. But later, we need to first destroy
+  // the thread pool (so all the work finishes), then measure the counters. So
+  // the scopes overlap, and we need to explicitly control the scope of the
   // threadpool.
   auto counters =
-      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent3});
+      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
   for (auto& t : threads) t = std::thread(work);
-  counters.Snapshot(values1);
+  counters.Snapshot(before);
   for (auto& t : threads) t.join();
-  counters.Snapshot(values2);
+  counters.Snapshot(after);
 }
 
 TEST(PerfCountersTest, MultiThreaded) {
@@ -126,20 +237,71 @@ TEST(PerfCountersTest, MultiThreaded) {
     GTEST_SKIP() << "Test skipped because libpfm is not supported.";
   }
   EXPECT_TRUE(PerfCounters::Initialize());
-  PerfCounterValues values1(2);
-  PerfCounterValues values2(2);
+  PerfCounterValues before(2);
+  PerfCounterValues after(2);
 
-  measure(2, &values1, &values2);
-  std::vector<double> D1{static_cast<double>(values2[0] - values1[0]),
-                         static_cast<double>(values2[1] - values1[1])};
+  // Notice that this test will work even if we taskset it to a single CPU;
+  // in this case the threads will run sequentially.
+  // Start two threads and measure the combined number of cycles and
+  // instructions.
+  measure(2, &before, &after);
+  std::vector<double> Elapsed2Threads{
+      static_cast<double>(after[0] - before[0]),
+      static_cast<double>(after[1] - before[1])};
 
-  measure(4, &values1, &values2);
-  std::vector<double> D2{static_cast<double>(values2[0] - values1[0]),
-                         static_cast<double>(values2[1] - values1[1])};
+  // Start four threads and measure the combined number of cycles and
+  // instructions.
+  measure(4, &before, &after);
+  std::vector<double> Elapsed4Threads{
+      static_cast<double>(after[0] - before[0]),
+      static_cast<double>(after[1] - before[1])};
 
-  // Some extra work will happen on the main thread - like joining the threads
-  // - so the ratio won't be quite 2.0, but very close.
-  EXPECT_GE(D2[0], 1.9 * D1[0]);
-  EXPECT_GE(D2[1], 1.9 * D1[1]);
+  // Expecting a fixed ratio fails (at least on a beefy workstation with lots
+  // of cpus) - it seems that in some circumstances the runtime with 4 threads
+  // can even be better than with 2.
+  // So instead of expecting 4 threads to be slower, let's just make sure the
+  // two runs do not differ too much in general (neither is more than 10x the
+  // other).
+  EXPECT_THAT(Elapsed4Threads[0] / Elapsed2Threads[0], AllOf(Gt(0.1), Lt(10)));
+  EXPECT_THAT(Elapsed4Threads[1] / Elapsed2Threads[1], AllOf(Gt(0.1), Lt(10)));
 }
+
+TEST(PerfCountersTest, HardwareLimits) {
+  // This test works (i.e. causes reads to fail) under the current
+  // assumptions about hardware capabilities (i.e. a small number (3-4)
+  // of hardware counters), the same assumptions as the previous test,
+  // ReopenExistingCounters.
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+
+  // Taken from `perf list`, but focuses only on those HW events that were
+  // actually reported when running `sudo perf stat -a sleep 10`, intersected
+  // over several platforms. HW events listed by the first command but not
+  // reported by the second seem not to work. This is sad, as we don't really
+  // get to test the grouping here (groups can contain up to 6 members)...
+  std::vector<std::string> counter_names{
+      "cycles",         // leader
+      "instructions",   //
+      "branch-misses",  //
+  };
+
+  // On the off chance that some of these values are not supported,
+  // we filter them out so the test will complete without failure,
+  // although it might not actually test the grouping on that platform.
+  std::vector<std::string> valid_names;
+  for (const std::string& name : counter_names) {
+    if (PerfCounters::IsCounterSupported(name)) {
+      valid_names.push_back(name);
+    }
+  }
+  PerfCountersMeasurement counter(valid_names);
+
+  std::vector<std::pair<std::string, double>> measurements;
+
+  counter.Start();
+  EXPECT_TRUE(counter.Stop(measurements));
+}
+
 }  // namespace
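
The rewritten expectations above reflect the new PerfCounters::Create()
behavior: instead of returning an invalid object when any requested counter is
unknown, it warns, drops the unsupported names, and exposes what survived via
num_counters() and names(). A minimal sketch against this internal API,
following the test's relative include; the counter names are illustrative:

    #include "../src/perf_counters.h"

    using benchmark::internal::PerfCounters;

    void SketchCreateFiltering() {
      if (!PerfCounters::kSupported) return;  // e.g. libpfm not available
      PerfCounters::Initialize();
      // "not a counter name" is dropped with a warning; only the supported
      // event survives, so num_counters() is expected to be 1 and
      // names() == {"CYCLES"}.
      auto counters = PerfCounters::Create({"CYCLES", "not a counter name"});
      (void)counters;
    }
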
diff --git a/third-party/benchmark/test/perf_counters_test.cc b/third-party/benchmark/test/perf_counters_test.cc
index 3017a452fe2759..3cc593e629d806 100644
--- a/third-party/benchmark/test/perf_counters_test.cc
+++ b/third-party/benchmark/test/perf_counters_test.cc
@@ -1,27 +1,92 @@
+#include <cstdarg>
 #undef NDEBUG
 
+#include "../src/commandlineflags.h"
 #include "../src/perf_counters.h"
-
 #include "benchmark/benchmark.h"
 #include "output_test.h"
 
+namespace benchmark {
+
+BM_DECLARE_string(benchmark_perf_counters);
+
+}  // namespace benchmark
+
 static void BM_Simple(benchmark::State& state) {
   for (auto _ : state) {
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
 }
 BENCHMARK(BM_Simple);
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Simple\",$"}});
 
+const int kIters = 1000000;
+
+void BM_WithoutPauseResume(benchmark::State& state) {
+  int n = 0;
+
+  for (auto _ : state) {
+    for (auto i = 0; i < kIters; ++i) {
+      n = 1 - n;
+      benchmark::DoNotOptimize(n);
+    }
+  }
+}
+
+BENCHMARK(BM_WithoutPauseResume);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_WithoutPauseResume\",$"}});
+
+void BM_WithPauseResume(benchmark::State& state) {
+  int m = 0, n = 0;
+
+  for (auto _ : state) {
+    for (auto i = 0; i < kIters; ++i) {
+      n = 1 - n;
+      benchmark::DoNotOptimize(n);
+    }
+
+    state.PauseTiming();
+    for (auto j = 0; j < kIters; ++j) {
+      m = 1 - m;
+      benchmark::DoNotOptimize(m);
+    }
+    state.ResumeTiming();
+  }
+}
+
+BENCHMARK(BM_WithPauseResume);
+
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_WithPauseResume\",$"}});
+
 static void CheckSimple(Results const& e) {
   CHECK_COUNTER_VALUE(e, double, "CYCLES", GT, 0);
-  CHECK_COUNTER_VALUE(e, double, "BRANCHES", GT, 0.0);
 }
+
+double withoutPauseResumeInstrCount = 0.0;
+double withPauseResumeInstrCount = 0.0;
+
+static void SaveInstrCountWithoutResume(Results const& e) {
+  withoutPauseResumeInstrCount = e.GetAs<double>("INSTRUCTIONS");
+}
+
+static void SaveInstrCountWithResume(Results const& e) {
+  withPauseResumeInstrCount = e.GetAs<double>("INSTRUCTIONS");
+}
+
 CHECK_BENCHMARK_RESULTS("BM_Simple", &CheckSimple);
+CHECK_BENCHMARK_RESULTS("BM_WithoutPauseResume", &SaveInstrCountWithoutResume);
+CHECK_BENCHMARK_RESULTS("BM_WithPauseResume", &SaveInstrCountWithResume);
 
 int main(int argc, char* argv[]) {
   if (!benchmark::internal::PerfCounters::kSupported) {
     return 0;
   }
+  benchmark::FLAGS_benchmark_perf_counters = "CYCLES,INSTRUCTIONS";
+  benchmark::internal::PerfCounters::Initialize();
   RunOutputTests(argc, argv);
+
+  BM_CHECK_GT(withPauseResumeInstrCount, kIters);
+  BM_CHECK_GT(withoutPauseResumeInstrCount, kIters);
+  BM_CHECK_LT(withPauseResumeInstrCount, 1.5 * withoutPauseResumeInstrCount);
 }
diff --git a/third-party/benchmark/test/register_benchmark_test.cc b/third-party/benchmark/test/register_benchmark_test.cc
index 602405b67e8de8..d69d144a4e171a 100644
--- a/third-party/benchmark/test/register_benchmark_test.cc
+++ b/third-party/benchmark/test/register_benchmark_test.cc
@@ -10,7 +10,7 @@ namespace {
 
 class TestReporter : public benchmark::ConsoleReporter {
  public:
-  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
+  void ReportRuns(const std::vector<Run>& report) override {
     all_runs_.insert(all_runs_.end(), begin(report), end(report));
     ConsoleReporter::ReportRuns(report);
   }
@@ -19,11 +19,11 @@ class TestReporter : public benchmark::ConsoleReporter {
 };
 
 struct TestCase {
-  std::string name;
-  const char* label;
+  const std::string name;
+  const std::string label;
   // Note: not explicit as we rely on it being converted through ADD_CASES.
-  TestCase(const char* xname) : TestCase(xname, nullptr) {}
-  TestCase(const char* xname, const char* xlabel)
+  TestCase(const std::string& xname) : TestCase(xname, "") {}
+  TestCase(const std::string& xname, const std::string& xlabel)
       : name(xname), label(xlabel) {}
 
   typedef benchmark::BenchmarkReporter::Run Run;
@@ -32,7 +32,7 @@ struct TestCase {
     // clang-format off
     BM_CHECK(name == run.benchmark_name()) << "expected " << name << " got "
                                       << run.benchmark_name();
-    if (label) {
+    if (!label.empty()) {
       BM_CHECK(run.report_label == label) << "expected " << label << " got "
                                        << run.report_label;
     } else {
@@ -95,6 +95,18 @@ ADD_CASES({"test1", "One"}, {"test2", "Two"}, {"test3", "Three"});
 
 #endif  // BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
 
+//----------------------------------------------------------------------------//
+// Test RegisterBenchmark with DISABLED_ benchmark
+//----------------------------------------------------------------------------//
+void DISABLED_BM_function(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(DISABLED_BM_function);
+ReturnVal dummy3 = benchmark::RegisterBenchmark("DISABLED_BM_function_manual",
+                                                DISABLED_BM_function);
+// No need to add cases because we don't expect them to run.
+
 //----------------------------------------------------------------------------//
 // Test RegisterBenchmark with different callable types
 //----------------------------------------------------------------------------//
@@ -111,7 +123,7 @@ void TestRegistrationAtRuntime() {
   {
     CustomFixture fx;
     benchmark::RegisterBenchmark("custom_fixture", fx);
-    AddCases({"custom_fixture"});
+    AddCases({std::string("custom_fixture")});
   }
 #endif
 #ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
diff --git a/third-party/benchmark/test/reporter_output_test.cc b/third-party/benchmark/test/reporter_output_test.cc
index 2b6e6543dd2ef4..7867165d1f3df4 100644
--- a/third-party/benchmark/test/reporter_output_test.cc
+++ b/third-party/benchmark/test/reporter_output_test.cc
@@ -17,7 +17,7 @@ static int AddContextCases() {
   AddCases(TC_ConsoleErr,
            {
                {"^%int-%int-%intT%int:%int:%int[-+]%int:%int$", MR_Default},
-               {"Running .*/reporter_output_test(\\.exe)?$", MR_Next},
+               {"Running .*(/|\\\\)reporter_output_test(\\.exe)?$", MR_Next},
                {"Run on \\(%int X %float MHz CPU s?\\)", MR_Next},
            });
   AddCases(TC_JSONOut,
@@ -55,6 +55,9 @@ static int AddContextCases() {
              {{"Load Average: (%float, ){0,2}%float$", MR_Next}});
   }
   AddCases(TC_JSONOut, {{"\"load_avg\": \\[(%float,?){0,3}],$", MR_Next}});
+  AddCases(TC_JSONOut, {{"\"library_version\": \".*\",$", MR_Next}});
+  AddCases(TC_JSONOut, {{"\"library_build_type\": \".*\",$", MR_Next}});
+  AddCases(TC_JSONOut, {{"\"json_schema_version\": 1$", MR_Next}});
   return 0;
 }
 int dummy_register = AddContextCases();
@@ -93,7 +96,8 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_basic\",%csv_report$"}});
 void BM_bytes_per_second(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
   state.SetBytesProcessed(1);
 }
@@ -124,7 +128,8 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_bytes_per_second\",%csv_bytes_report$"}});
 void BM_items_per_second(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
   state.SetItemsProcessed(1);
 }
@@ -318,7 +323,7 @@ ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_no_arg_name/3\",$"},
 ADD_CASES(TC_CSVOut, {{"^\"BM_no_arg_name/3\",%csv_report$"}});
 
 // ========================================================================= //
-// ------------------------ Testing Arg Name Output ----------------------- //
+// ------------------------ Testing Arg Name Output ------------------------ //
 // ========================================================================= //
 
 void BM_arg_name(benchmark::State& state) {
@@ -404,7 +409,8 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_BigArgs/1073741824 %console_report$"},
 void BM_Complexity_O1(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
   state.SetComplexityN(state.range(0));
 }
@@ -1085,7 +1091,7 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_UserPercentStats/iterations:5/repeats:3/"
                       {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
                        "manual_time_stddev\",%csv_report$"},
                       {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
-                       "manual_time_\",%csv_report$"}});
+                       "manual_time_\",%csv_cv_report$"}});
 
 // ========================================================================= //
 // ------------------------- Testing StrEscape JSON ------------------------ //
diff --git a/third-party/benchmark/test/skip_with_error_test.cc b/third-party/benchmark/test/skip_with_error_test.cc
index 026d4791335074..2139a19e250717 100644
--- a/third-party/benchmark/test/skip_with_error_test.cc
+++ b/third-party/benchmark/test/skip_with_error_test.cc
@@ -10,17 +10,17 @@ namespace {
 
 class TestReporter : public benchmark::ConsoleReporter {
  public:
-  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
+  bool ReportContext(const Context& context) override {
     return ConsoleReporter::ReportContext(context);
   };
 
-  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
+  void ReportRuns(const std::vector<Run>& report) override {
     all_runs_.insert(all_runs_.end(), begin(report), end(report));
     ConsoleReporter::ReportRuns(report);
   }
 
   TestReporter() {}
-  virtual ~TestReporter() {}
+  ~TestReporter() override {}
 
   mutable std::vector<Run> all_runs_;
 };
@@ -35,8 +35,9 @@ struct TestCase {
   void CheckRun(Run const& run) const {
     BM_CHECK(name == run.benchmark_name())
         << "expected " << name << " got " << run.benchmark_name();
-    BM_CHECK(error_occurred == run.error_occurred);
-    BM_CHECK(error_message == run.error_message);
+    BM_CHECK_EQ(error_occurred,
+                benchmark::internal::SkippedWithError == run.skipped);
+    BM_CHECK(error_message == run.skip_message);
     if (error_occurred) {
       // BM_CHECK(run.iterations == 0);
     } else {
@@ -47,7 +48,8 @@ struct TestCase {
 
 std::vector<TestCase> ExpectedResults;
 
-int AddCases(const char* base_name, std::initializer_list<TestCase> const& v) {
+int AddCases(const std::string& base_name,
+             std::initializer_list<TestCase> const& v) {
   for (auto TC : v) {
     TC.name = base_name + TC.name;
     ExpectedResults.push_back(std::move(TC));
@@ -141,7 +143,8 @@ ADD_CASES("BM_error_during_running_ranged_for",
 
 void BM_error_after_running(benchmark::State& state) {
   for (auto _ : state) {
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
   if (state.thread_index() <= (state.threads() / 2))
     state.SkipWithError("error message");
diff --git a/third-party/benchmark/test/spec_arg_test.cc b/third-party/benchmark/test/spec_arg_test.cc
index 043db1be47a2e4..06aafbeb9b5eeb 100644
--- a/third-party/benchmark/test/spec_arg_test.cc
+++ b/third-party/benchmark/test/spec_arg_test.cc
@@ -17,11 +17,11 @@ namespace {
 
 class TestReporter : public benchmark::ConsoleReporter {
  public:
-  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
+  bool ReportContext(const Context& context) override {
     return ConsoleReporter::ReportContext(context);
   };
 
-  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
+  void ReportRuns(const std::vector<Run>& report) override {
     assert(report.size() == 1);
     matched_functions.push_back(report[0].run_name.function_name);
     ConsoleReporter::ReportRuns(report);
@@ -29,7 +29,7 @@ class TestReporter : public benchmark::ConsoleReporter {
 
   TestReporter() {}
 
-  virtual ~TestReporter() {}
+  ~TestReporter() override {}
 
   const std::vector<std::string>& GetMatchedFunctions() const {
     return matched_functions;
@@ -91,5 +91,15 @@ int main(int argc, char** argv) {
               << matched_functions.front() << "]\n";
     return 2;
   }
+
+  // Test that SetBenchmarkFilter works.
+  const std::string golden_value = "golden_value";
+  benchmark::SetBenchmarkFilter(golden_value);
+  std::string current_value = benchmark::GetBenchmarkFilter();
+  if (golden_value != current_value) {
+    std::cerr << "Expected [" << golden_value
+              << "] for --benchmark_filter but got [" << current_value << "]\n";
+    return 3;
+  }
   return 0;
 }
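
The new block above exercises the SetBenchmarkFilter/GetBenchmarkFilter pair
added to the public API. A minimal sketch of using it to narrow a run
programmatically; the filter regex is illustrative:

    #include "benchmark/benchmark.h"

    int main(int argc, char** argv) {
      benchmark::Initialize(&argc, argv);
      // Equivalent to passing --benchmark_filter=BM_Spec.* on the command line.
      benchmark::SetBenchmarkFilter("BM_Spec.*");
      benchmark::RunSpecifiedBenchmarks();
      benchmark::Shutdown();
      return 0;
    }
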
diff --git a/third-party/benchmark/test/statistics_gtest.cc b/third-party/benchmark/test/statistics_gtest.cc
index 1de2d87d4ba559..48c77260fd5353 100644
--- a/third-party/benchmark/test/statistics_gtest.cc
+++ b/third-party/benchmark/test/statistics_gtest.cc
@@ -28,8 +28,8 @@ TEST(StatisticsTest, StdDev) {
 TEST(StatisticsTest, CV) {
   EXPECT_DOUBLE_EQ(benchmark::StatisticsCV({101, 101, 101, 101}), 0.0);
   EXPECT_DOUBLE_EQ(benchmark::StatisticsCV({1, 2, 3}), 1. / 2.);
-  EXPECT_DOUBLE_EQ(benchmark::StatisticsCV({2.5, 2.4, 3.3, 4.2, 5.1}),
-                   0.32888184094918121);
+  ASSERT_NEAR(benchmark::StatisticsCV({2.5, 2.4, 3.3, 4.2, 5.1}),
+              0.32888184094918121, 1e-15);
 }
 
 }  // end namespace
diff --git a/third-party/benchmark/test/string_util_gtest.cc b/third-party/benchmark/test/string_util_gtest.cc
index 698f2d43eb88fe..67b4bc0c24f262 100644
--- a/third-party/benchmark/test/string_util_gtest.cc
+++ b/third-party/benchmark/test/string_util_gtest.cc
@@ -1,9 +1,12 @@
 //===---------------------------------------------------------------------===//
-// statistics_test - Unit tests for src/statistics.cc
+// string_util_test - Unit tests for src/string_util.cc
 //===---------------------------------------------------------------------===//
 
+#include <tuple>
+
 #include "../src/internal_macros.h"
 #include "../src/string_util.h"
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
 namespace {
@@ -63,7 +66,10 @@ TEST(StringUtilTest, stoul) {
     EXPECT_EQ(4ul, pos);
   }
 #ifndef BENCHMARK_HAS_NO_EXCEPTIONS
-  { ASSERT_THROW(benchmark::stoul("this is a test"), std::invalid_argument); }
+  {
+    ASSERT_THROW(std::ignore = benchmark::stoul("this is a test"),
+                 std::invalid_argument);
+  }
 #endif
 }
 
@@ -107,7 +113,10 @@ EXPECT_EQ(1ul, pos);
   EXPECT_EQ(4ul, pos);
 }
 #ifndef BENCHMARK_HAS_NO_EXCEPTIONS
-{ ASSERT_THROW(benchmark::stoi("this is a test"), std::invalid_argument); }
+{
+  ASSERT_THROW(std::ignore = benchmark::stoi("this is a test"),
+               std::invalid_argument);
+}
 #endif
 }
 
@@ -137,7 +146,10 @@ EXPECT_EQ(1ul, pos);
   EXPECT_EQ(8ul, pos);
 }
 #ifndef BENCHMARK_HAS_NO_EXCEPTIONS
-{ ASSERT_THROW(benchmark::stod("this is a test"), std::invalid_argument); }
+{
+  ASSERT_THROW(std::ignore = benchmark::stod("this is a test"),
+               std::invalid_argument);
+}
 #endif
 }
 
@@ -149,4 +161,39 @@ TEST(StringUtilTest, StrSplit) {
             std::vector<std::string>({"hello", "there", "is", "more"}));
 }
 
+using HumanReadableFixture = ::testing::TestWithParam<
+    std::tuple<double, benchmark::Counter::OneK, std::string>>;
+
+INSTANTIATE_TEST_SUITE_P(
+    HumanReadableTests, HumanReadableFixture,
+    ::testing::Values(
+        std::make_tuple(0.0, benchmark::Counter::kIs1024, "0"),
+        std::make_tuple(999.0, benchmark::Counter::kIs1024, "999"),
+        std::make_tuple(1000.0, benchmark::Counter::kIs1024, "1000"),
+        std::make_tuple(1024.0, benchmark::Counter::kIs1024, "1Ki"),
+        std::make_tuple(1000 * 1000.0, benchmark::Counter::kIs1024,
+                        "976\\.56.Ki"),
+        std::make_tuple(1024 * 1024.0, benchmark::Counter::kIs1024, "1Mi"),
+        std::make_tuple(1000 * 1000 * 1000.0, benchmark::Counter::kIs1024,
+                        "953\\.674Mi"),
+        std::make_tuple(1024 * 1024 * 1024.0, benchmark::Counter::kIs1024,
+                        "1Gi"),
+        std::make_tuple(0.0, benchmark::Counter::kIs1000, "0"),
+        std::make_tuple(999.0, benchmark::Counter::kIs1000, "999"),
+        std::make_tuple(1000.0, benchmark::Counter::kIs1000, "1k"),
+        std::make_tuple(1024.0, benchmark::Counter::kIs1000, "1.024k"),
+        std::make_tuple(1000 * 1000.0, benchmark::Counter::kIs1000, "1M"),
+        std::make_tuple(1024 * 1024.0, benchmark::Counter::kIs1000,
+                        "1\\.04858M"),
+        std::make_tuple(1000 * 1000 * 1000.0, benchmark::Counter::kIs1000,
+                        "1G"),
+        std::make_tuple(1024 * 1024 * 1024.0, benchmark::Counter::kIs1000,
+                        "1\\.07374G")));
+
+TEST_P(HumanReadableFixture, HumanReadableNumber) {
+  std::string str = benchmark::HumanReadableNumber(std::get<0>(GetParam()),
+                                                   std::get<1>(GetParam()));
+  ASSERT_THAT(str, ::testing::MatchesRegex(std::get<2>(GetParam())));
+}
+
 }  // end namespace
diff --git a/third-party/benchmark/test/user_counters_tabular_test.cc b/third-party/benchmark/test/user_counters_tabular_test.cc
index 45ac043d519301..cfc1ab069c78a6 100644
--- a/third-party/benchmark/test/user_counters_tabular_test.cc
+++ b/third-party/benchmark/test/user_counters_tabular_test.cc
@@ -63,6 +63,9 @@ ADD_CASES(TC_CSVOut, {{"%csv_header,"
 
 void BM_Counters_Tabular(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
   state.counters.insert({
@@ -330,7 +333,7 @@ ADD_CASES(TC_CSVOut,
           {{"^\"BM_Counters_Tabular/repeats:2/threads:1_stddev\",%csv_report,"
             "%float,%float,%float,%float,%float,%float$"}});
 ADD_CASES(TC_CSVOut,
-          {{"^\"BM_Counters_Tabular/repeats:2/threads:1_cv\",%csv_report,"
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1_cv\",%csv_cv_report,"
             "%float,%float,%float,%float,%float,%float$"}});
 ADD_CASES(TC_CSVOut,
           {{"^\"BM_Counters_Tabular/repeats:2/threads:2\",%csv_report,"
@@ -348,7 +351,7 @@ ADD_CASES(TC_CSVOut,
           {{"^\"BM_Counters_Tabular/repeats:2/threads:2_stddev\",%csv_report,"
             "%float,%float,%float,%float,%float,%float$"}});
 ADD_CASES(TC_CSVOut,
-          {{"^\"BM_Counters_Tabular/repeats:2/threads:2_cv\",%csv_report,"
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2_cv\",%csv_cv_report,"
             "%float,%float,%float,%float,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
@@ -372,7 +375,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/repeats:2/threads:2$",
 void BM_CounterRates_Tabular(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
   state.counters.insert({
diff --git a/third-party/benchmark/test/user_counters_test.cc b/third-party/benchmark/test/user_counters_test.cc
index 1cc74552a1bd17..22252acbf6a224 100644
--- a/third-party/benchmark/test/user_counters_test.cc
+++ b/third-party/benchmark/test/user_counters_test.cc
@@ -67,7 +67,8 @@ int num_calls1 = 0;
 void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
   state.counters["foo"] = 1;
   state.counters["bar"] = ++num_calls1;
@@ -118,7 +119,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_WithBytesAndItemsPSec",
 void BM_Counters_Rate(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{1, bm::Counter::kIsRate};
@@ -161,7 +163,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_Rate", &CheckRate);
 void BM_Invert(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{0.0001, bm::Counter::kInvert};
@@ -195,14 +198,14 @@ void CheckInvert(Results const& e) {
 CHECK_BENCHMARK_RESULTS("BM_Invert", &CheckInvert);
 
 // ========================================================================= //
-// ------------------------- InvertedRate Counters Output
-// -------------------------- //
+// --------------------- InvertedRate Counters Output ---------------------- //
 // ========================================================================= //
 
 void BM_Counters_InvertedRate(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
   state.counters["foo"] =
@@ -330,7 +333,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreads/threads:%int",
 void BM_Counters_AvgThreadsRate(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreadsRate};
@@ -417,7 +421,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_IterationInvariant",
 void BM_Counters_kIsIterationInvariantRate(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
   state.counters["foo"] =
@@ -460,7 +465,7 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_kIsIterationInvariantRate",
                         &CheckIsIterationInvariantRate);
 
 // ========================================================================= //
-// ------------------- AvgIterations Counters Output ------------------ //
+// --------------------- AvgIterations Counters Output --------------------- //
 // ========================================================================= //
 
 void BM_Counters_AvgIterations(benchmark::State& state) {
@@ -502,13 +507,14 @@ void CheckAvgIterations(Results const& e) {
 CHECK_BENCHMARK_RESULTS("BM_Counters_AvgIterations", &CheckAvgIterations);
 
 // ========================================================================= //
-// ----------------- AvgIterationsRate Counters Output ---------------- //
+// ------------------- AvgIterationsRate Counters Output ------------------- //
 // ========================================================================= //
 
 void BM_Counters_kAvgIterationsRate(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = double(state.iterations()) * double(state.iterations());
+    benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgIterationsRate};
diff --git a/third-party/benchmark/test/user_counters_thousands_test.cc b/third-party/benchmark/test/user_counters_thousands_test.cc
index a42683b32fa7b2..fc153835f80062 100644
--- a/third-party/benchmark/test/user_counters_thousands_test.cc
+++ b/third-party/benchmark/test/user_counters_thousands_test.cc
@@ -16,13 +16,13 @@ void BM_Counters_Thousands(benchmark::State& state) {
       {"t0_1000000DefaultBase",
        bm::Counter(1000 * 1000, bm::Counter::kDefaults)},
       {"t1_1000000Base1000", bm::Counter(1000 * 1000, bm::Counter::kDefaults,
-                                         benchmark::Counter::OneK::kIs1000)},
+                                         bm::Counter::OneK::kIs1000)},
       {"t2_1000000Base1024", bm::Counter(1000 * 1000, bm::Counter::kDefaults,
-                                         benchmark::Counter::OneK::kIs1024)},
+                                         bm::Counter::OneK::kIs1024)},
       {"t3_1048576Base1000", bm::Counter(1024 * 1024, bm::Counter::kDefaults,
-                                         benchmark::Counter::OneK::kIs1000)},
+                                         bm::Counter::OneK::kIs1000)},
       {"t4_1048576Base1024", bm::Counter(1024 * 1024, bm::Counter::kDefaults,
-                                         benchmark::Counter::OneK::kIs1024)},
+                                         bm::Counter::OneK::kIs1024)},
   });
 }
 BENCHMARK(BM_Counters_Thousands)->Repetitions(2);
@@ -30,21 +30,21 @@ ADD_CASES(
     TC_ConsoleOut,
     {
         {"^BM_Counters_Thousands/repeats:2 %console_report "
-         "t0_1000000DefaultBase=1000k "
-         "t1_1000000Base1000=1000k t2_1000000Base1024=976.56[23]k "
-         "t3_1048576Base1000=1048.58k t4_1048576Base1024=1024k$"},
+         "t0_1000000DefaultBase=1M "
+         "t1_1000000Base1000=1M t2_1000000Base1024=976.56[23]Ki "
+         "t3_1048576Base1000=1.04858M t4_1048576Base1024=1Mi$"},
         {"^BM_Counters_Thousands/repeats:2 %console_report "
-         "t0_1000000DefaultBase=1000k "
-         "t1_1000000Base1000=1000k t2_1000000Base1024=976.56[23]k "
-         "t3_1048576Base1000=1048.58k t4_1048576Base1024=1024k$"},
+         "t0_1000000DefaultBase=1M "
+         "t1_1000000Base1000=1M t2_1000000Base1024=976.56[23]Ki "
+         "t3_1048576Base1000=1.04858M t4_1048576Base1024=1Mi$"},
         {"^BM_Counters_Thousands/repeats:2_mean %console_report "
-         "t0_1000000DefaultBase=1000k t1_1000000Base1000=1000k "
-         "t2_1000000Base1024=976.56[23]k t3_1048576Base1000=1048.58k "
-         "t4_1048576Base1024=1024k$"},
+         "t0_1000000DefaultBase=1M t1_1000000Base1000=1M "
+         "t2_1000000Base1024=976.56[23]Ki t3_1048576Base1000=1.04858M "
+         "t4_1048576Base1024=1Mi$"},
         {"^BM_Counters_Thousands/repeats:2_median %console_report "
-         "t0_1000000DefaultBase=1000k t1_1000000Base1000=1000k "
-         "t2_1000000Base1024=976.56[23]k t3_1048576Base1000=1048.58k "
-         "t4_1048576Base1024=1024k$"},
+         "t0_1000000DefaultBase=1M t1_1000000Base1000=1M "
+         "t2_1000000Base1024=976.56[23]Ki t3_1048576Base1000=1.04858M "
+         "t4_1048576Base1024=1Mi$"},
         {"^BM_Counters_Thousands/repeats:2_stddev %console_time_only_report [ "
          "]*2 t0_1000000DefaultBase=0 t1_1000000Base1000=0 "
          "t2_1000000Base1024=0 t3_1048576Base1000=0 t4_1048576Base1024=0$"},
diff --git a/third-party/benchmark/tools/compare.py b/third-party/benchmark/tools/compare.py
index f1504c96fa2ba8..7572520cc0ca10 100755
--- a/third-party/benchmark/tools/compare.py
+++ b/third-party/benchmark/tools/compare.py
@@ -1,29 +1,35 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
-import unittest
+# type: ignore
 
 """
 compare.py - versatile benchmark output compare tool
 """
 
 import argparse
-from argparse import ArgumentParser
 import json
+import os
 import sys
+import unittest
+from argparse import ArgumentParser
+
 import gbench
-from gbench import util, report
-from gbench.util import *
+from gbench import report, util
 
 
 def check_inputs(in1, in2, flags):
     """
     Perform checking on the user provided inputs and diagnose any abnormalities
     """
-    in1_kind, in1_err = classify_input_file(in1)
-    in2_kind, in2_err = classify_input_file(in2)
-    output_file = find_benchmark_flag("--benchmark_out=", flags)
-    output_type = find_benchmark_flag("--benchmark_out_format=", flags)
-    if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file:
+    in1_kind, in1_err = util.classify_input_file(in1)
+    in2_kind, in2_err = util.classify_input_file(in2)
+    output_file = util.find_benchmark_flag("--benchmark_out=", flags)
+    output_type = util.find_benchmark_flag("--benchmark_out_format=", flags)
+    if (
+        in1_kind == util.IT_Executable
+        and in2_kind == util.IT_Executable
+        and output_file
+    ):
         print(
             (
                 "WARNING: '--benchmark_out=%s' will be passed to both "
@@ -31,11 +37,14 @@ def check_inputs(in1, in2, flags):
             )
             % output_file
         )
-    if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0:
-        print(
-            "WARNING: passing optional flags has no effect since both "
-            "inputs are JSON"
-        )
+    if in1_kind == util.IT_JSON and in2_kind == util.IT_JSON:
+        # When both sides are JSON the only supported flag is
+        # --benchmark_filter=
+        for flag in util.remove_benchmark_flags("--benchmark_filter=", flags):
+            print(
+                "WARNING: passing %s has no effect since both "
+                "inputs are JSON" % flag
+            )
     if output_type is not None and output_type != "json":
         print(
             (
@@ -48,7 +57,9 @@ def check_inputs(in1, in2, flags):
 
 
 def create_parser():
-    parser = ArgumentParser(description="versatile benchmark output compare tool")
+    parser = ArgumentParser(
+        description="versatile benchmark output compare tool"
+    )
 
     parser.add_argument(
         "-a",
@@ -294,7 +305,9 @@ def main():
     # Now, filter the benchmarks so that the difference report can work
     if filter_baseline and filter_contender:
         replacement = "[%s vs. %s]" % (filter_baseline, filter_contender)
-        json1 = gbench.report.filter_benchmark(json1_orig, filter_baseline, replacement)
+        json1 = gbench.report.filter_benchmark(
+            json1_orig, filter_baseline, replacement
+        )
         json2 = gbench.report.filter_benchmark(
             json2_orig, filter_contender, replacement
         )
@@ -314,7 +327,7 @@ def main():
     # Optionally, diff and output to JSON
     if args.dump_to_json is not None:
         with open(args.dump_to_json, "w") as f_json:
-            json.dump(diff_report, f_json)
+            json.dump(diff_report, f_json, indent=1)
 
 
 class TestParser(unittest.TestCase):
@@ -423,7 +436,9 @@ def test_filters_basic(self):
         self.assertFalse(parsed.benchmark_options)
 
     def test_filters_with_remainder(self):
-        parsed = self.parser.parse_args(["filters", self.testInput0, "c", "d", "e"])
+        parsed = self.parser.parse_args(
+            ["filters", self.testInput0, "c", "d", "e"]
+        )
         self.assertFalse(parsed.display_aggregates_only)
         self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, "filters")
@@ -459,7 +474,14 @@ def test_benchmarksfiltered_basic(self):
 
     def test_benchmarksfiltered_with_remainder(self):
         parsed = self.parser.parse_args(
-            ["benchmarksfiltered", self.testInput0, "c", self.testInput1, "e", "f"]
+            [
+                "benchmarksfiltered",
+                self.testInput0,
+                "c",
+                self.testInput1,
+                "e",
+                "f",
+            ]
         )
         self.assertFalse(parsed.display_aggregates_only)
         self.assertTrue(parsed.utest)
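
The check_inputs() change above narrows the old blanket JSON-vs-JSON warning: --benchmark_filter= is now honored even when both inputs are JSON, and every other benchmark flag gets its own warning. A self-contained sketch of that behavior (warn_json_only_flags is an illustrative name, not part of the patch):

def warn_json_only_flags(flags):
    # Mirrors the remove_benchmark_flags("--benchmark_filter=", flags) loop:
    # keep the filter flag, complain about everything else.
    for flag in (f for f in flags if not f.startswith("--benchmark_filter=")):
        print(
            "WARNING: passing %s has no effect since both "
            "inputs are JSON" % flag
        )

warn_json_only_flags(["--benchmark_filter=BM_memcpy", "--benchmark_repetitions=5"])
# -> warns only about --benchmark_repetitions=5
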
diff --git a/third-party/benchmark/tools/gbench/Inputs/test1_run1.json b/third-party/benchmark/tools/gbench/Inputs/test1_run1.json
index 601e327aefb596..9daed0bcc6c417 100644
--- a/third-party/benchmark/tools/gbench/Inputs/test1_run1.json
+++ b/third-party/benchmark/tools/gbench/Inputs/test1_run1.json
@@ -114,6 +114,14 @@
       "real_time": 1,
       "cpu_time": 1,
       "time_unit": "s"
+    },
+    {
+      "name": "BM_hasLabel",
+      "label": "a label",
+      "iterations": 1,
+      "real_time": 1,
+      "cpu_time": 1,
+      "time_unit": "s"
     }
   ]
 }
diff --git a/third-party/benchmark/tools/gbench/Inputs/test1_run2.json b/third-party/benchmark/tools/gbench/Inputs/test1_run2.json
index 3cbcf39b0c9384..dc52970abf8b89 100644
--- a/third-party/benchmark/tools/gbench/Inputs/test1_run2.json
+++ b/third-party/benchmark/tools/gbench/Inputs/test1_run2.json
@@ -114,6 +114,14 @@
       "real_time": 1,
       "cpu_time": 1,
       "time_unit": "ns"
+    },
+    {
+      "name": "BM_hasLabel",
+      "label": "a label",
+      "iterations": 1,
+      "real_time": 1,
+      "cpu_time": 1,
+      "time_unit": "s"
     }
   ]
 }
diff --git a/third-party/benchmark/tools/gbench/__init__.py b/third-party/benchmark/tools/gbench/__init__.py
index ffca396b4c3f24..921256881491c0 100644
--- a/third-party/benchmark/tools/gbench/__init__.py
+++ b/third-party/benchmark/tools/gbench/__init__.py
@@ -5,4 +5,4 @@
 __versioninfo__ = (0, 5, 0)
 __version__ = ".".join(str(v) for v in __versioninfo__) + "dev"
 
-__all__ = []
+__all__ = []  # type: ignore
diff --git a/third-party/benchmark/tools/gbench/report.py b/third-party/benchmark/tools/gbench/report.py
index 5092b0bf1469c3..7158fd1654cb10 100644
--- a/third-party/benchmark/tools/gbench/report.py
+++ b/third-party/benchmark/tools/gbench/report.py
@@ -1,15 +1,17 @@
-"""report.py - Utilities for reporting statistics about benchmark results
+# type: ignore
+
+"""
+report.py - Utilities for reporting statistics about benchmark results
 """
 
-import unittest
-import os
-import re
 import copy
+import os
 import random
+import re
+import unittest
 
-from scipy.stats import mannwhitneyu, gmean
 from numpy import array
-from pandas import Timedelta
+from scipy.stats import gmean, mannwhitneyu
 
 
 class BenchmarkColor(object):
@@ -42,6 +44,13 @@ def __format__(self, format):
 UTEST_OPTIMAL_REPETITIONS = 9  # Lowest reasonable number, More is better.
 UTEST_COL_NAME = "_pvalue"
 
+_TIME_UNIT_TO_SECONDS_MULTIPLIER = {
+    "s": 1.0,
+    "ms": 1e-3,
+    "us": 1e-6,
+    "ns": 1e-9,
+}
+
 
 def color_format(use_color, fmt_str, *args, **kwargs):
     """
@@ -52,7 +61,10 @@ def color_format(use_color, fmt_str, *args, **kwargs):
     """
     assert use_color is True or use_color is False
     if not use_color:
-        args = [arg if not isinstance(arg, BenchmarkColor) else BC_NONE for arg in args]
+        args = [
+            arg if not isinstance(arg, BenchmarkColor) else BC_NONE
+            for arg in args
+        ]
         kwargs = {
             key: arg if not isinstance(arg, BenchmarkColor) else BC_NONE
             for key, arg in kwargs.items()
@@ -165,9 +177,9 @@ def get_timedelta_field_as_seconds(benchmark, field_name):
     Get value of field_name field of benchmark, which is time with time unit
     time_unit, as time in seconds.
     """
-    time_unit = benchmark["time_unit"] if "time_unit" in benchmark else "s"
-    dt = Timedelta(benchmark[field_name], time_unit)
-    return dt / Timedelta(1, "s")
+    timedelta = benchmark[field_name]
+    time_unit = benchmark.get("time_unit", "s")
+    return timedelta * _TIME_UNIT_TO_SECONDS_MULTIPLIER.get(time_unit)
 
 
 def calculate_geomean(json):
@@ -273,6 +285,7 @@ def get_difference_report(json1, json2, utest=False):
     partitions = partition_benchmarks(json1, json2)
     for partition in partitions:
         benchmark_name = partition[0][0]["name"]
+        label = partition[0][0]["label"] if "label" in partition[0][0] else ""
         time_unit = partition[0][0]["time_unit"]
         measurements = []
         utest_results = {}
@@ -286,8 +299,12 @@ def get_difference_report(json1, json2, utest=False):
                     "cpu_time": bn["cpu_time"],
                     "real_time_other": other_bench["real_time"],
                     "cpu_time_other": other_bench["cpu_time"],
-                    "time": calculate_change(bn["real_time"], other_bench["real_time"]),
-                    "cpu": calculate_change(bn["cpu_time"], other_bench["cpu_time"]),
+                    "time": calculate_change(
+                        bn["real_time"], other_bench["real_time"]
+                    ),
+                    "cpu": calculate_change(
+                        bn["cpu_time"], other_bench["cpu_time"]
+                    ),
                 }
             )
 
@@ -298,7 +315,7 @@ def get_difference_report(json1, json2, utest=False):
             have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(
                 timings_cpu, timings_time
             )
-            if cpu_pvalue and time_pvalue:
+            if cpu_pvalue is not None and time_pvalue is not None:
                 utest_results = {
                     "have_optimal_repetitions": have_optimal_repetitions,
                     "cpu_pvalue": cpu_pvalue,
@@ -313,16 +330,20 @@ def get_difference_report(json1, json2, utest=False):
         # benchmark suite.
         if measurements:
             run_type = (
-                partition[0][0]["run_type"] if "run_type" in partition[0][0] else ""
+                partition[0][0]["run_type"]
+                if "run_type" in partition[0][0]
+                else ""
             )
             aggregate_name = (
                 partition[0][0]["aggregate_name"]
-                if run_type == "aggregate" and "aggregate_name" in partition[0][0]
+                if run_type == "aggregate"
+                and "aggregate_name" in partition[0][0]
                 else ""
             )
             diff_report.append(
                 {
                     "name": benchmark_name,
+                    "label": label,
                     "measurements": measurements,
                     "time_unit": time_unit,
                     "run_type": run_type,
@@ -337,6 +358,7 @@ def get_difference_report(json1, json2, utest=False):
         diff_report.append(
             {
                 "name": "OVERALL_GEOMEAN",
+                "label": "",
                 "measurements": [
                     {
                         "real_time": lhs_gmean[0],
@@ -392,7 +414,7 @@ def get_color(res):
         # and if it is non-aggregate, then don't print it.
         if (
             not include_aggregates_only
-            or not "run_type" in benchmark
+            or "run_type" not in benchmark
             or benchmark["run_type"] == "aggregate"
         ):
             for measurement in benchmark["measurements"]:
@@ -438,7 +460,9 @@ class TestGetUniqueBenchmarkNames(unittest.TestCase):
     def load_results(self):
         import json
 
-        testInputs = os.path.join(os.path.dirname(os.path.realpath(__file__)), "Inputs")
+        testInputs = os.path.join(
+            os.path.dirname(os.path.realpath(__file__)), "Inputs"
+        )
         testOutput = os.path.join(testInputs, "test3_run0.json")
         with open(testOutput, "r") as f:
             json = json.load(f)
@@ -485,16 +509,73 @@ def test_json_diff_report_pretty_printing(self):
             ["BM_SameTimes", "+0.0000", "+0.0000", "10", "10", "10", "10"],
             ["BM_2xFaster", "-0.5000", "-0.5000", "50", "25", "50", "25"],
             ["BM_2xSlower", "+1.0000", "+1.0000", "50", "100", "50", "100"],
-            ["BM_1PercentFaster", "-0.0100", "-0.0100", "100", "99", "100", "99"],
-            ["BM_1PercentSlower", "+0.0100", "+0.0100", "100", "101", "100", "101"],
-            ["BM_10PercentFaster", "-0.1000", "-0.1000", "100", "90", "100", "90"],
-            ["BM_10PercentSlower", "+0.1000", "+0.1000", "100", "110", "100", "110"],
-            ["BM_100xSlower", "+99.0000", "+99.0000", "100", "10000", "100", "10000"],
-            ["BM_100xFaster", "-0.9900", "-0.9900", "10000", "100", "10000", "100"],
-            ["BM_10PercentCPUToTime", "+0.1000", "-0.1000", "100", "110", "100", "90"],
+            [
+                "BM_1PercentFaster",
+                "-0.0100",
+                "-0.0100",
+                "100",
+                "99",
+                "100",
+                "99",
+            ],
+            [
+                "BM_1PercentSlower",
+                "+0.0100",
+                "+0.0100",
+                "100",
+                "101",
+                "100",
+                "101",
+            ],
+            [
+                "BM_10PercentFaster",
+                "-0.1000",
+                "-0.1000",
+                "100",
+                "90",
+                "100",
+                "90",
+            ],
+            [
+                "BM_10PercentSlower",
+                "+0.1000",
+                "+0.1000",
+                "100",
+                "110",
+                "100",
+                "110",
+            ],
+            [
+                "BM_100xSlower",
+                "+99.0000",
+                "+99.0000",
+                "100",
+                "10000",
+                "100",
+                "10000",
+            ],
+            [
+                "BM_100xFaster",
+                "-0.9900",
+                "-0.9900",
+                "10000",
+                "100",
+                "10000",
+                "100",
+            ],
+            [
+                "BM_10PercentCPUToTime",
+                "+0.1000",
+                "-0.1000",
+                "100",
+                "110",
+                "100",
+                "90",
+            ],
             ["BM_ThirdFaster", "-0.3333", "-0.3334", "100", "67", "100", "67"],
             ["BM_NotBadTimeUnit", "-0.9000", "+0.2000", "0", "0", "0", "1"],
-            ["OVERALL_GEOMEAN", "-0.8344", "-0.8026", "0", "0", "0", "0"],
+            ["BM_hasLabel", "+0.0000", "+0.0000", "1", "1", "1", "1"],
+            ["OVERALL_GEOMEAN", "-0.8113", "-0.7779", "0", "0", "0", "0"],
         ]
         output_lines_with_header = print_difference_report(
             self.json_diff_report, use_color=False
@@ -512,6 +593,7 @@ def test_json_diff_report_output(self):
         expected_output = [
             {
                 "name": "BM_SameTimes",
+                "label": "",
                 "measurements": [
                     {
                         "time": 0.0000,
@@ -527,6 +609,7 @@ def test_json_diff_report_output(self):
             },
             {
                 "name": "BM_2xFaster",
+                "label": "",
                 "measurements": [
                     {
                         "time": -0.5000,
@@ -542,6 +625,7 @@ def test_json_diff_report_output(self):
             },
             {
                 "name": "BM_2xSlower",
+                "label": "",
                 "measurements": [
                     {
                         "time": 1.0000,
@@ -557,6 +641,7 @@ def test_json_diff_report_output(self):
             },
             {
                 "name": "BM_1PercentFaster",
+                "label": "",
                 "measurements": [
                     {
                         "time": -0.0100,
@@ -572,6 +657,7 @@ def test_json_diff_report_output(self):
             },
             {
                 "name": "BM_1PercentSlower",
+                "label": "",
                 "measurements": [
                     {
                         "time": 0.0100,
@@ -587,6 +673,7 @@ def test_json_diff_report_output(self):
             },
             {
                 "name": "BM_10PercentFaster",
+                "label": "",
                 "measurements": [
                     {
                         "time": -0.1000,
@@ -602,6 +689,7 @@ def test_json_diff_report_output(self):
             },
             {
                 "name": "BM_10PercentSlower",
+                "label": "",
                 "measurements": [
                     {
                         "time": 0.1000,
@@ -617,6 +705,7 @@ def test_json_diff_report_output(self):
             },
             {
                 "name": "BM_100xSlower",
+                "label": "",
                 "measurements": [
                     {
                         "time": 99.0000,
@@ -632,6 +721,7 @@ def test_json_diff_report_output(self):
             },
             {
                 "name": "BM_100xFaster",
+                "label": "",
                 "measurements": [
                     {
                         "time": -0.9900,
@@ -647,6 +737,7 @@ def test_json_diff_report_output(self):
             },
             {
                 "name": "BM_10PercentCPUToTime",
+                "label": "",
                 "measurements": [
                     {
                         "time": 0.1000,
@@ -662,6 +753,7 @@ def test_json_diff_report_output(self):
             },
             {
                 "name": "BM_ThirdFaster",
+                "label": "",
                 "measurements": [
                     {
                         "time": -0.3333,
@@ -677,6 +769,7 @@ def test_json_diff_report_output(self):
             },
             {
                 "name": "BM_NotBadTimeUnit",
+                "label": "",
                 "measurements": [
                     {
                         "time": -0.9000,
@@ -690,16 +783,33 @@ def test_json_diff_report_output(self):
                 "time_unit": "s",
                 "utest": {},
             },
+            {
+                "name": "BM_hasLabel",
+                "label": "a label",
+                "measurements": [
+                    {
+                        "time": 0.0000,
+                        "cpu": 0.0000,
+                        "real_time": 1,
+                        "real_time_other": 1,
+                        "cpu_time": 1,
+                        "cpu_time_other": 1,
+                    }
+                ],
+                "time_unit": "s",
+                "utest": {},
+            },
             {
                 "name": "OVERALL_GEOMEAN",
+                "label": "",
                 "measurements": [
                     {
-                        "real_time": 1.193776641714438e-06,
-                        "cpu_time": 1.2144445585302297e-06,
+                        "real_time": 3.1622776601683826e-06,
+                        "cpu_time": 3.2130844755623912e-06,
                         "real_time_other": 1.9768988699420897e-07,
                         "cpu_time_other": 2.397447755209533e-07,
-                        "time": -0.834399601997324,
-                        "cpu": -0.8025889499549471,
+                        "time": -0.8112976497120911,
+                        "cpu": -0.7778551721181174,
                     }
                 ],
                 "time_unit": "s",
@@ -711,6 +821,7 @@ def test_json_diff_report_output(self):
         self.assertEqual(len(self.json_diff_report), len(expected_output))
         for out, expected in zip(self.json_diff_report, expected_output):
             self.assertEqual(out["name"], expected["name"])
+            self.assertEqual(out["label"], expected["label"])
             self.assertEqual(out["time_unit"], expected["time_unit"])
             assert_utest(self, out, expected)
             assert_measurements(self, out, expected)
@@ -1086,7 +1197,9 @@ def test_json_diff_report(self):
             assert_measurements(self, out, expected)
 
 
-class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(unittest.TestCase):
+class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
+    unittest.TestCase
+):
     @classmethod
     def setUpClass(cls):
         def load_results():
@@ -1369,12 +1482,108 @@ def test_json_diff_report_pretty_printing(self):
 
         for n in range(len(self.json["benchmarks"]) ** 2):
             random.shuffle(self.json["benchmarks"])
-            sorted_benchmarks = util.sort_benchmark_results(self.json)["benchmarks"]
+            sorted_benchmarks = util.sort_benchmark_results(self.json)[
+                "benchmarks"
+            ]
             self.assertEqual(len(expected_names), len(sorted_benchmarks))
             for out, expected in zip(sorted_benchmarks, expected_names):
                 self.assertEqual(out["name"], expected)
 
 
+class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly2(
+    unittest.TestCase
+):
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+
+            testInputs = os.path.join(
+                os.path.dirname(os.path.realpath(__file__)), "Inputs"
+            )
+            testOutput1 = os.path.join(testInputs, "test5_run0.json")
+            testOutput2 = os.path.join(testInputs, "test5_run1.json")
+            with open(testOutput1, "r") as f:
+                json1 = json.load(f)
+                json1["benchmarks"] = [
+                    json1["benchmarks"][0] for i in range(1000)
+                ]
+            with open(testOutput2, "r") as f:
+                json2 = json.load(f)
+                json2["benchmarks"] = [
+                    json2["benchmarks"][0] for i in range(1000)
+                ]
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(json1, json2, utest=True)
+
+    def test_json_diff_report_pretty_printing(self):
+        expect_line = [
+            "BM_ManyRepetitions_pvalue",
+            "0.0000",
+            "0.0000",
+            "U",
+            "Test,",
+            "Repetitions:",
+            "1000",
+            "vs",
+            "1000",
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False
+        )
+        output_lines = output_lines_with_header[2:]
+        found = False
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(" ") if x]
+            found = expect_line == parts
+            if found:
+                break
+        self.assertTrue(found)
+
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                "name": "BM_ManyRepetitions",
+                "label": "",
+                "time_unit": "s",
+                "run_type": "",
+                "aggregate_name": "",
+                "utest": {
+                    "have_optimal_repetitions": True,
+                    "cpu_pvalue": 0.0,
+                    "time_pvalue": 0.0,
+                    "nr_of_repetitions": 1000,
+                    "nr_of_repetitions_other": 1000,
+                },
+            },
+            {
+                "name": "OVERALL_GEOMEAN",
+                "label": "",
+                "measurements": [
+                    {
+                        "real_time": 1.0,
+                        "cpu_time": 1000.000000000069,
+                        "real_time_other": 1000.000000000069,
+                        "cpu_time_other": 1.0,
+                        "time": 999.000000000069,
+                        "cpu": -0.9990000000000001,
+                    }
+                ],
+                "time_unit": "s",
+                "run_type": "aggregate",
+                "aggregate_name": "geomean",
+                "utest": {},
+            },
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(self.json_diff_report, expected_output):
+            self.assertEqual(out["name"], expected["name"])
+            self.assertEqual(out["time_unit"], expected["time_unit"])
+            assert_utest(self, out, expected)
+
+
 def assert_utest(unittest_instance, lhs, rhs):
     if lhs["utest"]:
         unittest_instance.assertAlmostEqual(
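
The get_timedelta_field_as_seconds() hunk above drops the pandas Timedelta dependency in favor of a plain unit-to-seconds table. A standalone sketch of the same conversion, with direct dict indexing instead of .get() for brevity:

_TIME_UNIT_TO_SECONDS_MULTIPLIER = {"s": 1.0, "ms": 1e-3, "us": 1e-6, "ns": 1e-9}

def get_timedelta_field_as_seconds(benchmark, field_name):
    # Benchmarks that report no time_unit default to seconds.
    time_unit = benchmark.get("time_unit", "s")
    return benchmark[field_name] * _TIME_UNIT_TO_SECONDS_MULTIPLIER[time_unit]

print(get_timedelta_field_as_seconds({"real_time": 250, "time_unit": "ns"}, "real_time"))
# -> 2.5e-07
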
diff --git a/third-party/benchmark/tools/gbench/util.py b/third-party/benchmark/tools/gbench/util.py
index a46903a6d248d2..4d061a3a1e3447 100644
--- a/third-party/benchmark/tools/gbench/util.py
+++ b/third-party/benchmark/tools/gbench/util.py
@@ -2,10 +2,10 @@
 """
 import json
 import os
-import tempfile
+import re
 import subprocess
 import sys
-import functools
+import tempfile
 
 # Input file type enumeration
 IT_Invalid = 0
@@ -58,7 +58,7 @@ def classify_input_file(filename):
     """
     Return a tuple (type, msg) where 'type' specifies the classified type
     of 'filename'. If 'type' is 'IT_Invalid' then 'msg' is a human readable
-    string represeting the error.
+    string representing the error.
     """
     ftype = IT_Invalid
     err_msg = None
@@ -72,7 +72,8 @@ def classify_input_file(filename):
         ftype = IT_JSON
     else:
         err_msg = (
-            "'%s' does not name a valid benchmark executable or JSON file" % filename
+            "'%s' does not name a valid benchmark executable or JSON file"
+            % filename
         )
     return ftype, err_msg
 
@@ -113,13 +114,41 @@ def remove_benchmark_flags(prefix, benchmark_flags):
     return [f for f in benchmark_flags if not f.startswith(prefix)]
 
 
-def load_benchmark_results(fname):
+def load_benchmark_results(fname, benchmark_filter):
     """
     Read benchmark output from a file and return the JSON object.
+
+    Apply benchmark_filter, a regular expression, with nearly the same
+    semantics as the --benchmark_filter argument.  May be None.
+    Note: the Python regular expression engine is used instead of the
+    one used by the C++ code, which may produce different results
+    in complex cases.
+
     REQUIRES: 'fname' names a file containing JSON benchmark output.
     """
+
+    def benchmark_wanted(benchmark):
+        if benchmark_filter is None:
+            return True
+        name = benchmark.get("run_name", None) or benchmark["name"]
+        return re.search(benchmark_filter, name) is not None
+
     with open(fname, "r") as f:
-        return json.load(f)
+        results = json.load(f)
+        if "context" in results:
+            if "json_schema_version" in results["context"]:
+                json_schema_version = results["context"]["json_schema_version"]
+                if json_schema_version != 1:
+                    print(
+                        "In %s, got unsupported JSON schema version: %i, expected 1"
+                        % (fname, json_schema_version)
+                    )
+                    sys.exit(1)
+        if "benchmarks" in results:
+            results["benchmarks"] = list(
+                filter(benchmark_wanted, results["benchmarks"])
+            )
+        return results
 
 
 def sort_benchmark_results(result):
@@ -168,7 +197,9 @@ def run_benchmark(exe_name, benchmark_flags):
         is_temp_output = True
         thandle, output_name = tempfile.mkstemp()
         os.close(thandle)
-        benchmark_flags = list(benchmark_flags) + ["--benchmark_out=%s" % output_name]
+        benchmark_flags = list(benchmark_flags) + [
+            "--benchmark_out=%s" % output_name
+        ]
 
     cmd = [exe_name] + benchmark_flags
     print("RUNNING: %s" % " ".join(cmd))
@@ -176,7 +207,7 @@ def run_benchmark(exe_name, benchmark_flags):
     if exitCode != 0:
         print("TEST FAILED...")
         sys.exit(exitCode)
-    json_res = load_benchmark_results(output_name)
+    json_res = load_benchmark_results(output_name, None)
     if is_temp_output:
         os.unlink(output_name)
     return json_res
@@ -191,7 +222,10 @@ def run_or_load_benchmark(filename, benchmark_flags):
     """
     ftype = check_input_file(filename)
     if ftype == IT_JSON:
-        return load_benchmark_results(filename)
+        benchmark_filter = find_benchmark_flag(
+            "--benchmark_filter=", benchmark_flags
+        )
+        return load_benchmark_results(filename, benchmark_filter)
     if ftype == IT_Executable:
         return run_benchmark(filename, benchmark_flags)
     raise ValueError("Unknown file type %s" % ftype)
diff --git a/third-party/benchmark/tools/requirements.txt b/third-party/benchmark/tools/requirements.txt
index 3b3331b5af1273..f32f35b8fbfdad 100644
--- a/third-party/benchmark/tools/requirements.txt
+++ b/third-party/benchmark/tools/requirements.txt
@@ -1 +1,2 @@
-scipy>=1.5.0
\ No newline at end of file
+numpy == 1.25
+scipy == 1.10.0
diff --git a/third-party/benchmark/tools/strip_asm.py b/third-party/benchmark/tools/strip_asm.py
index 086255dc657781..bc3a774a793207 100755
--- a/third-party/benchmark/tools/strip_asm.py
+++ b/third-party/benchmark/tools/strip_asm.py
@@ -1,20 +1,20 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 """
 strip_asm.py - Cleanup ASM output for the specified file
 """
 
-from argparse import ArgumentParser
-import sys
 import os
 import re
+import sys
+from argparse import ArgumentParser
 
 
 def find_used_labels(asm):
     found = set()
-    label_re = re.compile("\s*j[a-z]+\s+\.L([a-zA-Z0-9][a-zA-Z0-9_]*)")
-    for l in asm.splitlines():
-        m = label_re.match(l)
+    label_re = re.compile(r"\s*j[a-z]+\s+\.L([a-zA-Z0-9][a-zA-Z0-9_]*)")
+    for line in asm.splitlines():
+        m = label_re.match(line)
         if m:
             found.add(".L%s" % m.group(1))
     return found
@@ -23,8 +23,8 @@ def find_used_labels(asm):
 def normalize_labels(asm):
     decls = set()
     label_decl = re.compile("^[.]{0,1}L([a-zA-Z0-9][a-zA-Z0-9_]*)(?=:)")
-    for l in asm.splitlines():
-        m = label_decl.match(l)
+    for line in asm.splitlines():
+        m = label_decl.match(line)
         if m:
             decls.add(m.group(0))
     if len(decls) == 0:
@@ -33,7 +33,7 @@ def normalize_labels(asm):
     if not needs_dot:
         return asm
     for ld in decls:
-        asm = re.sub("(^|\s+)" + ld + "(?=:|\s)", "\\1." + ld, asm)
+        asm = re.sub(r"(^|\s+)" + ld + r"(?=:|\s)", "\\1." + ld, asm)
     return asm
 
 
@@ -41,11 +41,11 @@ def transform_labels(asm):
     asm = normalize_labels(asm)
     used_decls = find_used_labels(asm)
     new_asm = ""
-    label_decl = re.compile("^\.L([a-zA-Z0-9][a-zA-Z0-9_]*)(?=:)")
-    for l in asm.splitlines():
-        m = label_decl.match(l)
+    label_decl = re.compile(r"^\.L([a-zA-Z0-9][a-zA-Z0-9_]*)(?=:)")
+    for line in asm.splitlines():
+        m = label_decl.match(line)
         if not m or m.group(0) in used_decls:
-            new_asm += l
+            new_asm += line
             new_asm += "\n"
     return new_asm
 
@@ -63,21 +63,24 @@ def is_identifier(tk):
     return True
 
 
-def process_identifiers(l):
+def process_identifiers(line):
     """
     process_identifiers - process all identifiers and modify them to have
     consistent names across all platforms; specifically across ELF and MachO.
     For example, MachO inserts an additional understore at the beginning of
     names. This function removes that.
     """
-    parts = re.split(r"([a-zA-Z0-9_]+)", l)
+    parts = re.split(r"([a-zA-Z0-9_]+)", line)
     new_line = ""
     for tk in parts:
         if is_identifier(tk):
             if tk.startswith("__Z"):
                 tk = tk[1:]
             elif (
-                tk.startswith("_") and len(tk) > 1 and tk[1].isalpha() and tk[1] != "Z"
+                tk.startswith("_")
+                and len(tk) > 1
+                and tk[1].isalpha()
+                and tk[1] != "Z"
             ):
                 tk = tk[1:]
         new_line += tk
@@ -93,33 +96,35 @@ def process_asm(asm):
 
     # TODO: Add more things we want to remove
     discard_regexes = [
-        re.compile("\s+\..*$"),  # directive
-        re.compile("\s*#(NO_APP|APP)$"),  # inline ASM
-        re.compile("\s*#.*$"),  # comment line
-        re.compile("\s*\.globa?l\s*([.a-zA-Z_][a-zA-Z0-9$_.]*)"),  # global directive
+        re.compile(r"\s+\..*$"),  # directive
+        re.compile(r"\s*#(NO_APP|APP)$"),  # inline ASM
+        re.compile(r"\s*#.*$"),  # comment line
+        re.compile(
+            r"\s*\.globa?l\s*([.a-zA-Z_][a-zA-Z0-9$_.]*)"
+        ),  # global directive
         re.compile(
-            "\s*\.(string|asciz|ascii|[1248]?byte|short|word|long|quad|value|zero)"
+            r"\s*\.(string|asciz|ascii|[1248]?byte|short|word|long|quad|value|zero)"
         ),
     ]
-    keep_regexes = []
+    keep_regexes: list[re.Pattern] = []
     fn_label_def = re.compile("^[a-zA-Z_][a-zA-Z0-9_.]*:")
-    for l in asm.splitlines():
+    for line in asm.splitlines():
         # Remove Mach-O attribute
-        l = l.replace("@GOTPCREL", "")
+        line = line.replace("@GOTPCREL", "")
         add_line = True
         for reg in discard_regexes:
-            if reg.match(l) is not None:
+            if reg.match(line) is not None:
                 add_line = False
                 break
         for reg in keep_regexes:
-            if reg.match(l) is not None:
+            if reg.match(line) is not None:
                 add_line = True
                 break
         if add_line:
-            if fn_label_def.match(l) and len(new_contents) != 0:
+            if fn_label_def.match(line) and len(new_contents) != 0:
                 new_contents += "\n"
-            l = process_identifiers(l)
-            new_contents += l
+            line = process_identifiers(line)
+            new_contents += line
             new_contents += "\n"
     return new_contents
 
@@ -127,7 +132,11 @@ def process_asm(asm):
 def main():
     parser = ArgumentParser(description="generate a stripped assembly file")
     parser.add_argument(
-        "input", metavar="input", type=str, nargs=1, help="An input assembly file"
+        "input",
+        metavar="input",
+        type=str,
+        nargs=1,
+        help="An input assembly file",
     )
     parser.add_argument(
         "out", metavar="output", type=str, nargs=1, help="The output file"
@@ -136,9 +145,9 @@ def main():
     input = args.input[0]
     output = args.out[0]
     if not os.path.isfile(input):
-        print(("ERROR: input file '%s' does not exist") % input)
+        print("ERROR: input file '%s' does not exist" % input)
         sys.exit(1)
-    contents = None
+
     with open(input, "r") as f:
         contents = f.read()
     new_contents = process_asm(contents)
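
Most of the strip_asm.py hunks above are mechanical (raw-string regexes, l renamed to line), but the identifier handling is the interesting part: Mach-O prefixes symbol names with an extra underscore, which process_identifiers() strips so ELF and Mach-O assembly compare equal. A simplified standalone sketch of that step (without the full is_identifier() guard from the script):

import re

def strip_macho_underscore(line):
    # Split into identifier and non-identifier tokens, then drop the leading
    # underscore Mach-O adds ("__Z..." mangled names and "_symbol" alike).
    parts = re.split(r"([a-zA-Z0-9_]+)", line)
    out = ""
    for tk in parts:
        if tk.startswith("__Z"):
            tk = tk[1:]
        elif tk.startswith("_") and len(tk) > 1 and tk[1].isalpha() and tk[1] != "Z":
            tk = tk[1:]
        out += tk
    return out

print(strip_macho_underscore("callq __ZN3foo3barEv"))  # -> callq _ZN3foo3barEv
print(strip_macho_underscore("callq _memcpy"))         # -> callq memcpy
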

>From 1c69b4c1d7a413cd9872a18dfe1ae1753b2f2f54 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Thu, 29 Feb 2024 13:19:58 -0800
Subject: [PATCH 2/2] re-patched previous local changes

---
 third-party/benchmark/src/benchmark.cc | 5 +++++
 third-party/benchmark/src/sysinfo.cc   | 7 +++++++
 2 files changed, 12 insertions(+)

diff --git a/third-party/benchmark/src/benchmark.cc b/third-party/benchmark/src/benchmark.cc
index 31f2cde8ff1061..495944db29ff78 100644
--- a/third-party/benchmark/src/benchmark.cc
+++ b/third-party/benchmark/src/benchmark.cc
@@ -202,6 +202,9 @@ State::State(std::string name, IterationCount max_iters,
 #elif defined(__GNUC__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Winvalid-offsetof"
+#elif defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Winvalid-offsetof"
 #endif
 #if defined(__NVCC__)
 #pragma nv_diagnostic push
@@ -219,6 +222,8 @@ State::State(std::string name, IterationCount max_iters,
 #pragma warning pop
 #elif defined(__GNUC__)
 #pragma GCC diagnostic pop
+#elif defined(__clang__)
+#pragma clang diagnostic pop
 #endif
 #if defined(__NVCC__)
 #pragma nv_diagnostic pop
diff --git a/third-party/benchmark/src/sysinfo.cc b/third-party/benchmark/src/sysinfo.cc
index daeb98b026d18f..46df973b069a70 100644
--- a/third-party/benchmark/src/sysinfo.cc
+++ b/third-party/benchmark/src/sysinfo.cc
@@ -12,6 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#if defined(_MSC_VER)
+// FIXME: This must be defined before any other includes to disable deprecation
+// warnings for use of codecvt from C++17. We should remove our reliance on
+// the deprecated functionality instead.
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS


