[libcxx-commits] [libcxx] 5208ec5 - [libc++] Update Google benchmark to v 1.5.5

Louis Dionne via libcxx-commits <libcxx-commits@lists.llvm.org>
Mon Jul 12 10:59:15 PDT 2021


Author: Louis Dionne
Date: 2021-07-12T13:59:03-04:00
New Revision: 5208ec5c66dc610a6cf4af999bb9211b945e1b33

URL: https://github.com/llvm/llvm-project/commit/5208ec5c66dc610a6cf4af999bb9211b945e1b33
DIFF: https://github.com/llvm/llvm-project/commit/5208ec5c66dc610a6cf4af999bb9211b945e1b33.diff

LOG: [libc++] Update Google benchmark to v 1.5.5

Added: 
    libcxx/utils/google-benchmark/.github/.libcxx-setup.sh
    libcxx/utils/google-benchmark/.github/ISSUE_TEMPLATE/bug_report.md
    libcxx/utils/google-benchmark/.github/ISSUE_TEMPLATE/feature_request.md
    libcxx/utils/google-benchmark/.github/workflows/bazel.yml
    libcxx/utils/google-benchmark/.github/workflows/build-and-test-perfcounters.yml
    libcxx/utils/google-benchmark/.github/workflows/build-and-test.yml
    libcxx/utils/google-benchmark/.github/workflows/pylint.yml
    libcxx/utils/google-benchmark/.github/workflows/sanitizer.yml
    libcxx/utils/google-benchmark/.github/workflows/test_bindings.yml
    libcxx/utils/google-benchmark/BUILD.bazel
    libcxx/utils/google-benchmark/_config.yml
    libcxx/utils/google-benchmark/bindings/python/BUILD
    libcxx/utils/google-benchmark/bindings/python/build_defs.bzl
    libcxx/utils/google-benchmark/bindings/python/google_benchmark/BUILD
    libcxx/utils/google-benchmark/bindings/python/google_benchmark/__init__.py
    libcxx/utils/google-benchmark/bindings/python/google_benchmark/benchmark.cc
    libcxx/utils/google-benchmark/bindings/python/google_benchmark/example.py
    libcxx/utils/google-benchmark/bindings/python/pybind11.BUILD
    libcxx/utils/google-benchmark/bindings/python/python_headers.BUILD
    libcxx/utils/google-benchmark/bindings/python/requirements.txt
    libcxx/utils/google-benchmark/cmake/GoogleTest.cmake
    libcxx/utils/google-benchmark/cmake/GoogleTest.cmake.in
    libcxx/utils/google-benchmark/dependencies.md
    libcxx/utils/google-benchmark/docs/_config.yml
    libcxx/utils/google-benchmark/docs/perf_counters.md
    libcxx/utils/google-benchmark/docs/random_interleaving.md
    libcxx/utils/google-benchmark/docs/releasing.md
    libcxx/utils/google-benchmark/requirements.txt
    libcxx/utils/google-benchmark/setup.py
    libcxx/utils/google-benchmark/src/benchmark_name.cc
    libcxx/utils/google-benchmark/src/perf_counters.cc
    libcxx/utils/google-benchmark/src/perf_counters.h
    libcxx/utils/google-benchmark/test/BUILD
    libcxx/utils/google-benchmark/test/args_product_test.cc
    libcxx/utils/google-benchmark/test/benchmark_name_gtest.cc
    libcxx/utils/google-benchmark/test/benchmark_random_interleaving_gtest.cc
    libcxx/utils/google-benchmark/test/commandlineflags_gtest.cc
    libcxx/utils/google-benchmark/test/internal_threading_test.cc
    libcxx/utils/google-benchmark/test/perf_counters_gtest.cc
    libcxx/utils/google-benchmark/test/perf_counters_test.cc
    libcxx/utils/google-benchmark/test/repetitions_test.cc
    libcxx/utils/google-benchmark/tools/BUILD.bazel
    libcxx/utils/google-benchmark/tools/gbench/Inputs/test4_run.json
    libcxx/utils/google-benchmark/tools/requirements.txt

Modified: 
    libcxx/utils/google-benchmark/.clang-format
    libcxx/utils/google-benchmark/.gitignore
    libcxx/utils/google-benchmark/.travis.yml
    libcxx/utils/google-benchmark/AUTHORS
    libcxx/utils/google-benchmark/CMakeLists.txt
    libcxx/utils/google-benchmark/CONTRIBUTORS
    libcxx/utils/google-benchmark/README.md
    libcxx/utils/google-benchmark/WORKSPACE
    libcxx/utils/google-benchmark/appveyor.yml
    libcxx/utils/google-benchmark/cmake/AddCXXCompilerFlag.cmake
    libcxx/utils/google-benchmark/cmake/CXXFeatureCheck.cmake
    libcxx/utils/google-benchmark/cmake/GetGitVersion.cmake
    libcxx/utils/google-benchmark/cmake/benchmark.pc.in
    libcxx/utils/google-benchmark/cmake/gnu_posix_regex.cpp
    libcxx/utils/google-benchmark/cmake/posix_regex.cpp
    libcxx/utils/google-benchmark/cmake/std_regex.cpp
    libcxx/utils/google-benchmark/docs/AssemblyTests.md
    libcxx/utils/google-benchmark/docs/tools.md
    libcxx/utils/google-benchmark/include/benchmark/benchmark.h
    libcxx/utils/google-benchmark/src/CMakeLists.txt
    libcxx/utils/google-benchmark/src/benchmark.cc
    libcxx/utils/google-benchmark/src/benchmark_api_internal.cc
    libcxx/utils/google-benchmark/src/benchmark_api_internal.h
    libcxx/utils/google-benchmark/src/benchmark_register.cc
    libcxx/utils/google-benchmark/src/benchmark_register.h
    libcxx/utils/google-benchmark/src/benchmark_runner.cc
    libcxx/utils/google-benchmark/src/benchmark_runner.h
    libcxx/utils/google-benchmark/src/commandlineflags.cc
    libcxx/utils/google-benchmark/src/commandlineflags.h
    libcxx/utils/google-benchmark/src/complexity.cc
    libcxx/utils/google-benchmark/src/console_reporter.cc
    libcxx/utils/google-benchmark/src/counter.cc
    libcxx/utils/google-benchmark/src/counter.h
    libcxx/utils/google-benchmark/src/csv_reporter.cc
    libcxx/utils/google-benchmark/src/cycleclock.h
    libcxx/utils/google-benchmark/src/internal_macros.h
    libcxx/utils/google-benchmark/src/json_reporter.cc
    libcxx/utils/google-benchmark/src/mutex.h
    libcxx/utils/google-benchmark/src/reporter.cc
    libcxx/utils/google-benchmark/src/sleep.cc
    libcxx/utils/google-benchmark/src/statistics.cc
    libcxx/utils/google-benchmark/src/string_util.cc
    libcxx/utils/google-benchmark/src/string_util.h
    libcxx/utils/google-benchmark/src/sysinfo.cc
    libcxx/utils/google-benchmark/src/thread_manager.h
    libcxx/utils/google-benchmark/src/thread_timer.h
    libcxx/utils/google-benchmark/src/timers.cc
    libcxx/utils/google-benchmark/test/AssemblyTests.cmake
    libcxx/utils/google-benchmark/test/CMakeLists.txt
    libcxx/utils/google-benchmark/test/basic_test.cc
    libcxx/utils/google-benchmark/test/benchmark_gtest.cc
    libcxx/utils/google-benchmark/test/complexity_test.cc
    libcxx/utils/google-benchmark/test/cxx03_test.cc
    libcxx/utils/google-benchmark/test/filter_test.cc
    libcxx/utils/google-benchmark/test/fixture_test.cc
    libcxx/utils/google-benchmark/test/map_test.cc
    libcxx/utils/google-benchmark/test/memory_manager_test.cc
    libcxx/utils/google-benchmark/test/multiple_ranges_test.cc
    libcxx/utils/google-benchmark/test/options_test.cc
    libcxx/utils/google-benchmark/test/output_test.h
    libcxx/utils/google-benchmark/test/output_test_helper.cc
    libcxx/utils/google-benchmark/test/register_benchmark_test.cc
    libcxx/utils/google-benchmark/test/reporter_output_test.cc
    libcxx/utils/google-benchmark/test/skip_with_error_test.cc
    libcxx/utils/google-benchmark/test/state_assembly_test.cc
    libcxx/utils/google-benchmark/test/statistics_gtest.cc
    libcxx/utils/google-benchmark/test/string_util_gtest.cc
    libcxx/utils/google-benchmark/test/user_counters_tabular_test.cc
    libcxx/utils/google-benchmark/test/user_counters_test.cc
    libcxx/utils/google-benchmark/test/user_counters_thousands_test.cc
    libcxx/utils/google-benchmark/tools/compare.py
    libcxx/utils/google-benchmark/tools/gbench/Inputs/test1_run1.json
    libcxx/utils/google-benchmark/tools/gbench/Inputs/test1_run2.json
    libcxx/utils/google-benchmark/tools/gbench/report.py
    libcxx/utils/google-benchmark/tools/gbench/util.py

Removed: 
    libcxx/utils/google-benchmark/.travis-libcxx-setup.sh
    libcxx/utils/google-benchmark/README.LLVM
    libcxx/utils/google-benchmark/cmake/HandleGTest.cmake
    libcxx/utils/google-benchmark/mingw.py
    libcxx/utils/google-benchmark/releasing.md


################################################################################
diff --git a/libcxx/utils/google-benchmark/.clang-format b/libcxx/utils/google-benchmark/.clang-format
index 06ea346a1067b..e7d00feaa08a9 100644
--- a/libcxx/utils/google-benchmark/.clang-format
+++ b/libcxx/utils/google-benchmark/.clang-format
@@ -1,4 +1,5 @@
 ---
 Language:        Cpp
 BasedOnStyle:  Google
+PointerAlignment: Left
 ...

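The new `PointerAlignment: Left` setting makes clang-format bind `*` and `&` to the type rather than to the declared name. A minimal sketch of the effect (illustrative declarations, not taken from the benchmark sources):

```c++
// With PointerAlignment: Left, clang-format produces:
int* count;              // rather than: int *count;
void Run(State& state);  // rather than: void Run(State &state);
```
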
diff --git a/libcxx/utils/google-benchmark/.github/.libcxx-setup.sh b/libcxx/utils/google-benchmark/.github/.libcxx-setup.sh
new file mode 100755
index 0000000000000..56008403ae921
--- /dev/null
+++ b/libcxx/utils/google-benchmark/.github/.libcxx-setup.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# Checkout LLVM sources
+git clone --depth=1 https://github.com/llvm/llvm-project.git llvm-project
+
+# Setup libc++ options
+if [ -z "$BUILD_32_BITS" ]; then
+  export BUILD_32_BITS=OFF && echo disabling 32 bit build
+fi
+
+# Build and install libc++ (Use unstable ABI for better sanitizer coverage)
+cd ./llvm-project
+cmake -DCMAKE_C_COMPILER=${C_COMPILER}          \
+      -DCMAKE_CXX_COMPILER=${COMPILER}          \
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo         \
+      -DCMAKE_INSTALL_PREFIX=/usr               \
+      -DLIBCXX_ABI_UNSTABLE=OFF                 \
+      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER}  \
+      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS}     \
+      -DLLVM_ENABLE_PROJECTS='libcxx;libcxxabi' \
+      -S llvm -B llvm-build -G "Unix Makefiles"
+make -C llvm-build -j3 cxx cxxabi
+sudo make -C llvm-build install-cxx install-cxxabi
+cd ..

diff --git a/libcxx/utils/google-benchmark/.github/ISSUE_TEMPLATE/bug_report.md b/libcxx/utils/google-benchmark/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000000000..6c2ced9b2ec5b
--- /dev/null
+++ b/libcxx/utils/google-benchmark/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,32 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: "[BUG]"
+labels: ''
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**System**
+Which OS, compiler, and compiler version are you using:
+  - OS: 
+  - Compiler and version: 
+
+**To reproduce**
+Steps to reproduce the behavior:
+1. sync to commit ...
+2. cmake/bazel...
+3. make ...
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Additional context**
+Add any other context about the problem here.

diff --git a/libcxx/utils/google-benchmark/.github/ISSUE_TEMPLATE/feature_request.md b/libcxx/utils/google-benchmark/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000000000..9e8ab6a673f6b
--- /dev/null
+++ b/libcxx/utils/google-benchmark/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: "[FR]"
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.

diff --git a/libcxx/utils/google-benchmark/.github/workflows/bazel.yml b/libcxx/utils/google-benchmark/.github/workflows/bazel.yml
new file mode 100644
index 0000000000000..a53661b2f9b11
--- /dev/null
+++ b/libcxx/utils/google-benchmark/.github/workflows/bazel.yml
@@ -0,0 +1,30 @@
+name: bazel
+
+on:
+  push: {}
+  pull_request: {}
+
+jobs:
+  build-and-test:
+    runs-on: ubuntu-latest
+    
+    steps:
+    - uses: actions/checkout@v1
+
+    - name: mount bazel cache
+      uses: actions/cache@v2.0.0
+      env:
+        cache-name: bazel-cache
+      with:
+        path: "~/.cache/bazel"
+        key: ${{ env.cache-name }}-${{ runner.os }}-${{ github.ref }}
+        restore-keys: |
+          ${{ env.cache-name }}-${{ runner.os }}-main
+
+    - name: build
+      run: |
+        bazel build //:benchmark //:benchmark_main //test/...
+
+    - name: test
+      run: |
+        bazel test --test_output=all //test/...

diff --git a/libcxx/utils/google-benchmark/.github/workflows/build-and-test-perfcounters.yml b/libcxx/utils/google-benchmark/.github/workflows/build-and-test-perfcounters.yml
new file mode 100644
index 0000000000000..b2b541919766f
--- /dev/null
+++ b/libcxx/utils/google-benchmark/.github/workflows/build-and-test-perfcounters.yml
@@ -0,0 +1,44 @@
+name: build-and-test-perfcounters
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  job:
+    # TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
+    name: ${{ matrix.os }}.${{ matrix.build_type }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, ubuntu-16.04, ubuntu-20.04]
+        build_type: ['Release', 'Debug']
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: install libpfm
+      run: sudo apt install libpfm4-dev
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake  -DBENCHMARK_ENABLE_LIBPFM=1 -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+    - name: build
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake --build . --config ${{ matrix.build_type }}
+
+    # Skip testing, for now. It seems perf_event_open does not succeed on the
+    # hosting machine, very likely a permissions issue.
+    # TODO(mtrofin): Enable test.
+    # - name: test
+    #   shell: bash
+    #   working-directory: ${{ runner.workspace }}/_build
+    #   run: sudo ctest -C ${{ matrix.build_type }} --rerun-failed --output-on-failure

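The disabled test step above points at `perf_event_open` failing on the CI host. As background, whether a process may open hardware counters is governed by kernel policy (e.g. `/proc/sys/kernel/perf_event_paranoid`); the sketch below is a hypothetical stand-alone probe, not part of the library or this workflow, that reproduces the check:

```c++
// Probe whether the kernel lets an unprivileged process open a basic
// hardware counter; returns false in the "permissions issue" case the
// workflow comment describes.
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <cstring>

static bool CanOpenCycleCounter() {
  perf_event_attr attr;
  std::memset(&attr, 0, sizeof(attr));
  attr.type = PERF_TYPE_HARDWARE;
  attr.size = sizeof(attr);
  attr.config = PERF_COUNT_HW_CPU_CYCLES;
  attr.disabled = 1;
  attr.exclude_kernel = 1;  // keep the request unprivileged
  long fd = syscall(SYS_perf_event_open, &attr, /*pid=*/0, /*cpu=*/-1,
                    /*group_fd=*/-1, /*flags=*/0UL);
  if (fd < 0) return false;
  close(static_cast<int>(fd));
  return true;
}
```
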
diff --git a/libcxx/utils/google-benchmark/.github/workflows/build-and-test.yml b/libcxx/utils/google-benchmark/.github/workflows/build-and-test.yml
new file mode 100644
index 0000000000000..9e5be3b1dc172
--- /dev/null
+++ b/libcxx/utils/google-benchmark/.github/workflows/build-and-test.yml
@@ -0,0 +1,110 @@
+name: build-and-test
+
+on:
+  push: {}
+  pull_request: {}
+
+jobs:
+  # TODO: add 32-bit builds (g++ and clang++) for ubuntu
+  #   (requires g++-multilib and libc6:i386)
+  # TODO: add coverage build (requires lcov)
+  # TODO: add clang + libc++ builds for ubuntu
+  # TODO: add clang + ubsan/asan/msan + libc++ builds for ubuntu
+  job:
+    name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.compiler }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, ubuntu-16.04, ubuntu-20.04, macos-latest]
+        build_type: ['Release', 'Debug']
+        compiler: [g++, clang++]
+        include:
+          - displayTargetName: windows-latest-release
+            os: windows-latest
+            build_type: 'Release'
+          - displayTargetName: windows-latest-debug
+            os: windows-latest
+            build_type: 'Debug'
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: create build environment
+        run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+      - name: configure cmake
+        env:
+          CXX: ${{ matrix.compiler }}
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: >
+          cmake $GITHUB_WORKSPACE
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+      - name: build
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: cmake --build . --config ${{ matrix.build_type }}
+
+      - name: test
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: ctest -C ${{ matrix.build_type }} -VV
+
+  ubuntu-14_04:
+    name: ubuntu-14.04.${{ matrix.build_type }}.${{ matrix.compiler }}
+    runs-on: [ubuntu-latest]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: ['Release', 'Debug']
+        compiler: [g++-4.8, clang++-3.6]
+        include:
+          - compiler: g++-6
+            build_type: 'Debug'
+            run_tests: true
+          - compiler: g++-6
+            build_type: 'Release'
+            run_tests: true
+    container: ubuntu:14.04
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: install required bits
+        run: |
+          sudo apt update
+          sudo apt -y install clang-3.6 cmake3 g++-4.8 git
+
+      - name: install other bits
+        if: ${{ matrix.compiler }} == g++-6
+        run: |
+          sudo apt -y install software-properties-common
+          sudo add-apt-repository -y "ppa:ubuntu-toolchain-r/test"
+          sudo apt update
+          sudo apt -y install g++-6
+
+      - name: create build environment
+        run: cmake -E make_directory $GITHUB_WORKSPACE/_build
+
+      - name: configure cmake
+        env:
+          CXX: ${{ matrix.compiler }}
+        shell: bash
+        working-directory: ${{ github.workspace }}/_build
+        run: >
+          cmake $GITHUB_WORKSPACE
+          -DBENCHMARK_ENABLE_TESTING=${{ matrix.run_tests }}
+          -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=${{ matrix.run_tests }}
+
+      - name: build
+        shell: bash
+        working-directory: ${{ github.workspace }}/_build
+        run: cmake --build . --config ${{ matrix.build_type }}
+
+      - name: test
+        if: ${{ matrix.run_tests }}
+        shell: bash
+        working-directory: ${{ github.workspace }}/_build
+        run: ctest -C ${{ matrix.build_type }} -VV

diff --git a/libcxx/utils/google-benchmark/.github/workflows/pylint.yml b/libcxx/utils/google-benchmark/.github/workflows/pylint.yml
new file mode 100644
index 0000000000000..0f73a5823206e
--- /dev/null
+++ b/libcxx/utils/google-benchmark/.github/workflows/pylint.yml
@@ -0,0 +1,26 @@
+name: pylint
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  pylint:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.8
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pylint pylint-exit conan
+    - name: Run pylint
+      run: |
+        pylint `find . -name '*.py'|xargs` || pylint-exit $?

diff --git a/libcxx/utils/google-benchmark/.github/workflows/sanitizer.yml b/libcxx/utils/google-benchmark/.github/workflows/sanitizer.yml
new file mode 100644
index 0000000000000..fbc984492df68
--- /dev/null
+++ b/libcxx/utils/google-benchmark/.github/workflows/sanitizer.yml
@@ -0,0 +1,78 @@
+name: sanitizer
+
+on:
+  push: {}
+  pull_request: {}
+
+env:
+  CC: clang
+  CXX: clang++
+  EXTRA_CXX_FLAGS: "-stdlib=libc++"
+  UBSAN_OPTIONS: "print_stacktrace=1"
+
+jobs:
+  job:
+    name: ${{ matrix.sanitizer }}.${{ matrix.build_type }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: ['Debug', 'RelWithDebInfo']
+        sanitizer: ['asan', 'ubsan', 'tsan']
+        # TODO: add 'msan' above. currently failing and needs investigation.
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: configure msan env
+      if: matrix.sanitizer == 'msan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=MemoryWithOrigins" >> $GITHUB_ENV
+
+    - name: configure ubsan env
+      if: matrix.sanitizer == 'ubsan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Undefined" >> $GITHUB_ENV
+
+    - name: configure asan env
+      if: matrix.sanitizer == 'asan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=address -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Address" >> $GITHUB_ENV
+
+    - name: configure tsan env
+      if: matrix.sanitizer == 'tsan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Thread" >> $GITHUB_ENV
+
+    - name: install llvm stuff
+      run: "${GITHUB_WORKSPACE}/.github/.libcxx-setup.sh"
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: >
+        cmake $GITHUB_WORKSPACE
+        -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
+        -DBENCHMARK_ENABLE_LIBPFM=OFF
+        -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+        -DCMAKE_C_COMPILER=${{ env.CC }}
+        -DCMAKE_CXX_COMPILER=${{ env.CXX }}
+        -DCMAKE_C_FLAGS="${{ env.EXTRA_FLAGS }}"
+        -DCMAKE_CXX_FLAGS="${{ env.EXTRA_FLAGS }} ${{ env.EXTRA_CXX_FLAGS }}"
+        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+    - name: build
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake --build . --config ${{ matrix.build_type }}
+
+    - name: test
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: ctest -C ${{ matrix.build_type }} -VV

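Note the role of `-fno-sanitize-recover=all` in each `EXTRA_FLAGS` line: it makes any sanitizer finding abort the process, so `ctest` reports a failure instead of a benchmark that merely printed a warning. A toy example (illustrative only, not from the test suite) that the ubsan job would turn into a hard failure:

```c++
// Built with -fsanitize=undefined -fno-sanitize-recover=all, the signed
// overflow below terminates the program with a diagnostic rather than
// continuing with an undefined value.
#include <climits>

int main() {
  int x = INT_MAX;
  x += 1;  // signed integer overflow: undefined behavior
  return x & 1;
}
```
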
diff --git a/libcxx/utils/google-benchmark/.github/workflows/test_bindings.yml b/libcxx/utils/google-benchmark/.github/workflows/test_bindings.yml
new file mode 100644
index 0000000000000..4a580ebe047a4
--- /dev/null
+++ b/libcxx/utils/google-benchmark/.github/workflows/test_bindings.yml
@@ -0,0 +1,24 @@
+name: test-bindings
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  python_bindings:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v1
+        with:
+          python-version: 3.8
+      - name: Install benchmark
+        run:
+          python setup.py install
+      - name: Run example bindings
+        run:
+          python bindings/python/google_benchmark/example.py

diff --git a/libcxx/utils/google-benchmark/.gitignore b/libcxx/utils/google-benchmark/.gitignore
index 8c30e28f53a06..be55d774e21bd 100644
--- a/libcxx/utils/google-benchmark/.gitignore
+++ b/libcxx/utils/google-benchmark/.gitignore
@@ -8,6 +8,7 @@
 !/cmake/*.cmake
 !/test/AssemblyTests.cmake
 *~
+*.swp
 *.pyc
 __pycache__
 
@@ -56,3 +57,10 @@ build*/
 # Visual Studio 2015/2017 cache/options directory
 .vs/
 CMakeSettings.json
+
+# Visual Studio Code cache/options directory
+.vscode/
+
+# Python build stuff
+dist/
+*.egg-info*

diff --git a/libcxx/utils/google-benchmark/.travis-libcxx-setup.sh b/libcxx/utils/google-benchmark/.travis-libcxx-setup.sh
deleted file mode 100644
index a591743c6a6ba..0000000000000
--- a/libcxx/utils/google-benchmark/.travis-libcxx-setup.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-
-# Install a newer CMake version
-curl -sSL https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh -o install-cmake.sh
-chmod +x install-cmake.sh
-sudo ./install-cmake.sh --prefix=/usr/local --skip-license
-
-# Checkout LLVM sources
-git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source
-git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx
-git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi
-
-# Setup libc++ options
-if [ -z "$BUILD_32_BITS" ]; then
-  export BUILD_32_BITS=OFF && echo disabling 32 bit build
-fi
-
-# Build and install libc++ (Use unstable ABI for better sanitizer coverage)
-mkdir llvm-build && cd llvm-build
-cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \
-      -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \
-      -DLIBCXX_ABI_UNSTABLE=ON \
-      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
-      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
-      ../llvm-source
-make cxx -j2
-sudo make install-cxxabi install-cxx
-cd ../

diff --git a/libcxx/utils/google-benchmark/.travis.yml b/libcxx/utils/google-benchmark/.travis.yml
index 4625dfb0878ff..8cfed3d10dab5 100644
--- a/libcxx/utils/google-benchmark/.travis.yml
+++ b/libcxx/utils/google-benchmark/.travis.yml
@@ -2,10 +2,6 @@ sudo: required
 dist: trusty
 language: cpp
 
-env:
-  global:
-    - /usr/local/bin:$PATH
-
 matrix:
   include:
     - compiler: gcc
@@ -14,133 +10,146 @@ matrix:
           packages:
             - lcov
       env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage
-    - compiler: gcc
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug
-    - compiler: gcc
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release
     - compiler: gcc
       addons:
         apt:
           packages:
             - g++-multilib
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug BUILD_32_BITS=ON
+            - libc6:i386
+      env:
+        - COMPILER=g++
+        - C_COMPILER=gcc
+        - BUILD_TYPE=Debug
+        - BUILD_32_BITS=ON
+        - EXTRA_FLAGS="-m32"
     - compiler: gcc
       addons:
         apt:
           packages:
             - g++-multilib
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release BUILD_32_BITS=ON
+            - libc6:i386
+      env:
+        - COMPILER=g++
+        - C_COMPILER=gcc
+        - BUILD_TYPE=Release
+        - BUILD_32_BITS=ON
+        - EXTRA_FLAGS="-m32"
     - compiler: gcc
       env:
         - INSTALL_GCC6_FROM_PPA=1
         - COMPILER=g++-6 C_COMPILER=gcc-6  BUILD_TYPE=Debug
         - ENABLE_SANITIZER=1
         - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
-    - compiler: clang
-      env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug
-    - compiler: clang
-      env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release
     # Clang w/ libc++
     - compiler: clang
+      dist: xenial
       addons:
         apt:
           packages:
             clang-3.8
       env:
+        - INSTALL_GCC6_FROM_PPA=1
         - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
         - LIBCXX_BUILD=1
-        - EXTRA_FLAGS="-stdlib=libc++"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
     - compiler: clang
+      dist: xenial
       addons:
         apt:
           packages:
             clang-3.8
       env:
+        - INSTALL_GCC6_FROM_PPA=1
         - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
         - LIBCXX_BUILD=1
-        - EXTRA_FLAGS="-stdlib=libc++"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
     # Clang w/ 32bit libc++
     - compiler: clang
+      dist: xenial
       addons:
         apt:
           packages:
             - clang-3.8
             - g++-multilib
+            - libc6:i386
       env:
+        - INSTALL_GCC6_FROM_PPA=1
         - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
         - LIBCXX_BUILD=1
         - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-stdlib=libc++ -m32"
+        - EXTRA_FLAGS="-m32"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
     # Clang w/ 32bit libc++
     - compiler: clang
+      dist: xenial
       addons:
         apt:
           packages:
             - clang-3.8
             - g++-multilib
+            - libc6:i386
       env:
+        - INSTALL_GCC6_FROM_PPA=1
         - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
         - LIBCXX_BUILD=1
         - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-stdlib=libc++ -m32"
+        - EXTRA_FLAGS="-m32"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
     # Clang w/ libc++, ASAN, UBSAN
     - compiler: clang
+      dist: xenial
       addons:
         apt:
           packages:
             clang-3.8
       env:
+        - INSTALL_GCC6_FROM_PPA=1
         - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
         - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address"
         - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all"
+        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
         - UBSAN_OPTIONS=print_stacktrace=1
     # Clang w/ libc++ and MSAN
     - compiler: clang
+      dist: xenial
       addons:
         apt:
           packages:
             clang-3.8
       env:
+        - INSTALL_GCC6_FROM_PPA=1
         - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
         - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins
         - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins"
+        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
     # Clang w/ libc++ and MSAN
     - compiler: clang
+      dist: xenial
       addons:
         apt:
           packages:
             clang-3.8
       env:
+        - INSTALL_GCC6_FROM_PPA=1
         - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo
         - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread
         - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
-        - COMPILER=clang++ BUILD_TYPE=Debug
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
-        - COMPILER=clang++ BUILD_TYPE=Release
+        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
     - os: osx
       osx_image: xcode8.3
       compiler: clang
       env:
-        - COMPILER=clang++ BUILD_TYPE=Release BUILD_32_BITS=ON
-    - os: osx
-      osx_image: xcode8.3
-      compiler: gcc
-      env:
-        - COMPILER=g++-7 C_COMPILER=gcc-7  BUILD_TYPE=Debug
+        - COMPILER=clang++
+        - BUILD_TYPE=Release
+        - BUILD_32_BITS=ON
+        - EXTRA_FLAGS="-m32"
 
 before_script:
   - if [ -n "${LIBCXX_BUILD}" ]; then
-      source .travis-libcxx-setup.sh;
+      source .libcxx-setup.sh;
     fi
   - if [ -n "${ENABLE_SANITIZER}" ]; then
       export EXTRA_OPTIONS="-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF";
@@ -178,17 +187,17 @@ install:
     fi
   - if [ "${TRAVIS_OS_NAME}" == "linux" ]; then
       sudo apt-get update -qq;
-      sudo apt-get install -qq unzip;
-      wget https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-linux-x86_64.sh --output-document bazel-installer.sh;
+      sudo apt-get install -qq unzip cmake3;
+      wget https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-linux-x86_64.sh --output-document bazel-installer.sh;
       travis_wait sudo bash bazel-installer.sh;
     fi
   - if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
-      curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-darwin-x86_64.sh;
+      curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-darwin-x86_64.sh;
       travis_wait sudo bash bazel-installer.sh;
     fi
 
 script:
-  - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS}" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ${EXTRA_OPTIONS} ..
+  - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_C_FLAGS="${EXTRA_FLAGS}" -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS} ${EXTRA_CXX_FLAGS}" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ${EXTRA_OPTIONS} ..
   - make
   - ctest -C ${BUILD_TYPE} --output-on-failure
   - bazel test -c dbg --define google_benchmark.have_regex=posix --announce_rc --verbose_failures --test_output=errors --keep_going //test/...

diff --git a/libcxx/utils/google-benchmark/AUTHORS b/libcxx/utils/google-benchmark/AUTHORS
index 3593870661ec4..838dd4f5bd5e7 100644
--- a/libcxx/utils/google-benchmark/AUTHORS
+++ b/libcxx/utils/google-benchmark/AUTHORS
@@ -9,24 +9,32 @@
 # Please keep the list sorted.
 
 Albert Pretorius <pretoalb@gmail.com>
+Alex Steele <steeleal123@gmail.com>
+Andriy Berestovskyy <berestovskyy@gmail.com>
 Arne Beer <arne@twobeer.de>
 Carto
+Christian Wassermann <christian_wassermann@web.de>
 Christopher Seymour <chris.j.seymour@hotmail.com>
+Colin Braley <braley.colin@gmail.com>
+Daniel Harvey <danielharvey458@gmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
 Deniz Evrenci <denizevrenci@gmail.com>
 Dirac Research
 Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
 Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Gergő Szitár <szitar.gergo@gmail.com>
 Google Inc.
 International Business Machines Corporation
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
 Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
+Jordan Williams <jwillikers@protonmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kaito Udagawa <umireon@gmail.com>
 Kishan Kumar <kumar.kishan@outlook.com>
@@ -35,14 +43,17 @@ Matt Clarkson <mattyclarkson@gmail.com>
 Maxim Vafin <maxvafin@gmail.com>
 MongoDB Inc.
 Nick Hutchinson <nshutchinson@gmail.com>
+Norman Heino <norman.heino@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Ori Livneh <ori.livneh@gmail.com>
 Paul Redmond <paul.redmond@gmail.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
 Roman Lebedev <lebedev.ri@gmail.com>
+Sayan Bhattacharjee <aero.sayan@gmail.com>
 Shuo Chen <chenshuo@chenshuo.com>
 Steinar H. Gunderson <sgunderson@bigfoot.com>
 Stripe, Inc.
+Tobias Schmidt <tobias.schmidt@in.tum.de>
 Yixuan Qiu <yixuanq@gmail.com>
 Yusuke Suzuki <utatane.tea@gmail.com>
 Zbigniew Skowron <zbychs@gmail.com>

diff --git a/libcxx/utils/google-benchmark/BUILD.bazel b/libcxx/utils/google-benchmark/BUILD.bazel
new file mode 100644
index 0000000000000..eb35b62730c67
--- /dev/null
+++ b/libcxx/utils/google-benchmark/BUILD.bazel
@@ -0,0 +1,44 @@
+load("@rules_cc//cc:defs.bzl", "cc_library")
+
+licenses(["notice"])
+
+config_setting(
+    name = "windows",
+    values = {
+        "cpu": "x64_windows",
+    },
+    visibility = [":__subpackages__"],
+)
+
+cc_library(
+    name = "benchmark",
+    srcs = glob(
+        [
+            "src/*.cc",
+            "src/*.h",
+        ],
+        exclude = ["src/benchmark_main.cc"],
+    ),
+    hdrs = ["include/benchmark/benchmark.h"],
+    linkopts = select({
+        ":windows": ["-DEFAULTLIB:shlwapi.lib"],
+        "//conditions:default": ["-pthread"],
+    }),
+    strip_include_prefix = "include",
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "benchmark_main",
+    srcs = ["src/benchmark_main.cc"],
+    hdrs = ["include/benchmark/benchmark.h"],
+    strip_include_prefix = "include",
+    visibility = ["//visibility:public"],
+    deps = [":benchmark"],
+)
+
+cc_library(
+    name = "benchmark_internal_headers",
+    hdrs = glob(["src/*.h"]),
+    visibility = ["//test:__pkg__"],
+)

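The `benchmark_main` target defined above supplies `main()`, so a consuming benchmark registers its cases and omits `BENCHMARK_MAIN()`. A minimal consumer translation unit, as a sketch (the file and benchmark names are illustrative):

```c++
// mybench.cc -- link against benchmark_main (Bazel //:benchmark_main, or
// -lbenchmark_main elsewhere), which provides main() and runs the registry.
#include <benchmark/benchmark.h>

static void BM_Noop(benchmark::State& state) {
  for (auto _ : state) {
    benchmark::DoNotOptimize(state.iterations());
  }
}
BENCHMARK(BM_Noop);
// No BENCHMARK_MAIN() here: benchmark_main's main() takes care of it.
```
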
diff --git a/libcxx/utils/google-benchmark/CMakeLists.txt b/libcxx/utils/google-benchmark/CMakeLists.txt
index 310c7ee9f6b2b..ef8dcdc68cfb8 100644
--- a/libcxx/utils/google-benchmark/CMakeLists.txt
+++ b/libcxx/utils/google-benchmark/CMakeLists.txt
@@ -1,17 +1,20 @@
-cmake_minimum_required (VERSION 2.8.12)
-
-project (benchmark)
+cmake_minimum_required (VERSION 3.5.1)
 
 foreach(p
+    CMP0048 # OK to clear PROJECT_VERSION on project()
     CMP0054 # CMake 3.1
     CMP0056 # export EXE_LINKER_FLAGS to try_run
     CMP0057 # Support no if() IN_LIST operator
+    CMP0063 # Honor visibility properties for all targets
+    CMP0077 # Allow option() overrides in importing projects
     )
   if(POLICY ${p})
     cmake_policy(SET ${p} NEW)
   endif()
 endforeach()
 
+project (benchmark VERSION 1.5.4 LANGUAGES CXX)
+
 option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
 option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
 option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
@@ -31,6 +34,20 @@ option(BENCHMARK_DOWNLOAD_DEPENDENCIES "Allow the downloading and in-tree buildi
 # in cases where it is not possible to build or find a valid version of gtest.
 option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON)
 
+option(BENCHMARK_ENABLE_LIBPFM "Enable performance counters provided by libpfm" OFF)
+
+set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+if(MSVC)
+    # As of CMake 3.18, CMAKE_SYSTEM_PROCESSOR is not set properly for MSVC and
+    # cross-compilation (e.g. Host=x86_64, target=aarch64) requires using the
+    # undocumented, but working variable.
+    # See https://gitlab.kitware.com/cmake/cmake/-/issues/15170
+    set(CMAKE_SYSTEM_PROCESSOR ${MSVC_CXX_ARCHITECTURE_ID})
+    if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ARM")
+      set(CMAKE_CROSSCOMPILING TRUE)
+    endif()
+endif()
+
 set(ENABLE_ASSEMBLY_TESTS_DEFAULT OFF)
 function(should_enable_assembly_tests)
   if(CMAKE_BUILD_TYPE)
@@ -77,8 +94,14 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 include(GetGitVersion)
 get_git_version(GIT_VERSION)
 
+# If no git version can be determined, use the version
+# from the project() command
+if ("${GIT_VERSION}" STREQUAL "0.0.0")
+  set(VERSION "${benchmark_VERSION}")
+else()
+  set(VERSION "${GIT_VERSION}")
+endif()
 # Tell the user what versions we are using
-string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION})
 message(STATUS "Version: ${VERSION}")
 
 # The version of the libraries
@@ -140,6 +163,10 @@ else()
   add_cxx_compiler_flag(-Werror RELEASE)
   add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
   add_cxx_compiler_flag(-Werror MINSIZEREL)
+  if (NOT BENCHMARK_ENABLE_TESTING)
+    # Disable warning when compiling tests as gtest does not use 'override'.
+    add_cxx_compiler_flag(-Wsuggest-override)
+  endif()
   add_cxx_compiler_flag(-pedantic)
   add_cxx_compiler_flag(-pedantic-errors)
   add_cxx_compiler_flag(-Wshorten-64-to-32)
@@ -182,10 +209,15 @@ else()
     add_definitions(-D_GNU_SOURCE=1)
   endif()
 
+  if (QNXNTO)
+    add_definitions(-D_QNX_SOURCE)
+  endif()
+
   # Link time optimisation
   if (BENCHMARK_ENABLE_LTO)
     add_cxx_compiler_flag(-flto)
-    if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
+    add_cxx_compiler_flag(-Wno-lto-type-mismatch)
+    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
       find_program(GCC_AR gcc-ar)
       if (GCC_AR)
         set(CMAKE_AR ${GCC_AR})
@@ -194,7 +226,7 @@ else()
       if (GCC_RANLIB)
         set(CMAKE_RANLIB ${GCC_RANLIB})
       endif()
-    elseif("${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
+    elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
       include(llvm-toolchain)
     endif()
   endif()
@@ -236,11 +268,17 @@ if (BENCHMARK_USE_LIBCXX)
   endif()
 endif(BENCHMARK_USE_LIBCXX)
 
+set(EXTRA_CXX_FLAGS "")
+if (WIN32 AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+  # Clang on Windows fails to compile the regex feature check under C++11
+  set(EXTRA_CXX_FLAGS "-DCMAKE_CXX_STANDARD=14")
+endif()
+
 # C++ feature checks
 # Determine the correct regular expression engine to use
-cxx_feature_check(STD_REGEX)
-cxx_feature_check(GNU_POSIX_REGEX)
-cxx_feature_check(POSIX_REGEX)
+cxx_feature_check(STD_REGEX ${EXTRA_CXX_FLAGS})
+cxx_feature_check(GNU_POSIX_REGEX ${EXTRA_CXX_FLAGS})
+cxx_feature_check(POSIX_REGEX ${EXTRA_CXX_FLAGS})
 if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
   message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
 endif()
@@ -248,10 +286,16 @@ if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX
         AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
   message(WARNING "Using std::regex with exceptions disabled is not fully supported")
 endif()
+
 cxx_feature_check(STEADY_CLOCK)
 # Ensure we have pthreads
+set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 
+if (BENCHMARK_ENABLE_LIBPFM)
+  find_package(PFM)
+endif()
+
 # Set up directories
 include_directories(${PROJECT_SOURCE_DIR}/include)
 
@@ -260,8 +304,10 @@ add_subdirectory(src)
 
 if (BENCHMARK_ENABLE_TESTING)
   enable_testing()
-  if (BENCHMARK_ENABLE_GTEST_TESTS)
-    include(HandleGTest)
+  if (BENCHMARK_ENABLE_GTEST_TESTS AND
+      NOT (TARGET gtest AND TARGET gtest_main AND
+           TARGET gmock AND TARGET gmock_main))
+    include(GoogleTest)
   endif()
   add_subdirectory(test)
 endif()

diff --git a/libcxx/utils/google-benchmark/CONTRIBUTORS b/libcxx/utils/google-benchmark/CONTRIBUTORS
index f684c7d4b2d4c..7489731de5a82 100644
--- a/libcxx/utils/google-benchmark/CONTRIBUTORS
+++ b/libcxx/utils/google-benchmark/CONTRIBUTORS
@@ -22,34 +22,47 @@
 #
 # Please keep the list sorted.
 
+Abhina Sreeskantharajan <abhina.sreeskantharajan@ibm.com>
 Albert Pretorius <pretoalb@gmail.com>
+Alex Steele <steelal123@gmail.com>
+Andriy Berestovskyy <berestovskyy@gmail.com>
 Arne Beer <arne@twobeer.de>
 Billy Robert O'Neal III <billy.oneal@gmail.com> <bion@microsoft.com>
 Chris Kennelly <ckennelly@google.com> <ckennelly@ckennelly.com>
+Christian Wassermann <christian_wassermann@web.de>
 Christopher Seymour <chris.j.seymour@hotmail.com>
+Colin Braley <braley.colin@gmail.com>
 Cyrille Faucheux <cyrille.faucheux@gmail.com>
+Daniel Harvey <danielharvey458@gmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
 Deniz Evrenci <denizevrenci@gmail.com>
 Dominic Hamon <dma@stripysock.com> <dominic@google.com>
 Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
+Fanbo Meng <fanbo.meng@ibm.com>
 Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Geoffrey Martin-Noble <gcmn@google.com> <gmngeoffrey@gmail.com>
+Gergő Szitár <szitar.gergo@gmail.com>
+Hannes Hauswedell <h2@fsfe.org>
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
 Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
 John Millikin <jmillikin@stripe.com>
+Jordan Williams <jwillikers@protonmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kai Wolf <kai.wolf@gmail.com>
-Kishan Kumar <kumar.kishan@outlook.com>
 Kaito Udagawa <umireon@gmail.com>
+Kishan Kumar <kumar.kishan@outlook.com>
 Lei Xu <eddyxu@gmail.com>
 Matt Clarkson <mattyclarkson@gmail.com>
 Maxim Vafin <maxvafin@gmail.com>
 Nick Hutchinson <nshutchinson@gmail.com>
+Norman Heino <norman.heino@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Ori Livneh <ori.livneh@gmail.com>
 Pascal Leroy <phl@google.com>
@@ -60,8 +73,10 @@ Raul Marin <rmrodriguez@cartodb.com>
 Ray Glover <ray.glover@uk.ibm.com>
 Robert Guo <robert.guo@mongodb.com>
 Roman Lebedev <lebedev.ri@gmail.com>
+Sayan Bhattacharjee <aero.sayan@gmail.com>
 Shuo Chen <chenshuo@chenshuo.com>
 Steven Wan <wan.yu@ibm.com>
+Tobias Schmidt <tobias.schmidt@in.tum.de>
 Tobias Ulvgård <tobias.ulvgard@dirac.se>
 Tom Madams <tom.ej.madams@gmail.com> <tmadams@google.com>
 Yixuan Qiu <yixuanq@gmail.com>

diff --git a/libcxx/utils/google-benchmark/README.LLVM b/libcxx/utils/google-benchmark/README.LLVM
deleted file mode 100644
index 34afcc02b2947..0000000000000
--- a/libcxx/utils/google-benchmark/README.LLVM
+++ /dev/null
@@ -1,28 +0,0 @@
-LLVM notes
-----------
-
-This directory contains the Google Benchmark source code with some unnecessary
-files removed. Note that this directory is under a different license than
-libc++.
-
-Changes:
-* https://github.com/google/benchmark/commit/4abdfbb802d1b514703223f5f852ce4a507d32d2
-  is applied on top of
-  https://github.com/google/benchmark/commit/4528c76b718acc9b57956f63069c699ae21edcab
-  to add RISC-V timer support.
-* https://github.com/google/benchmark/commit/8e48105d465c586068dd8e248fe75a8971c6ba3a
-  is applied on top of
-  https://github.com/google/benchmark/commit/4528c76b718acc9b57956f63069c699ae21edcab
-  to fix cross-build from linux to windows via MinGW.
-* https://github.com/google/benchmark/commit/a77d5f70efaebe2b7e8c10134526a23a7ce7ef35
-  and
-  https://github.com/google/benchmark/commit/ecc1685340f58f7fe6b707036bc0bb1fccabb0c1
-  are applied on top of
-  https://github.com/google/benchmark/commit/8e48105d465c586068dd8e248fe75a8971c6ba3a
-  to fix timestamp-related inline asm issues and 32-bit RISC-V build failures.
-  The second cherrypicked commit fixes formatting issues introduced by the
-  preceding change.
-* https://github.com/google/benchmark/commit/ffe1342eb2faa7d2e7c35b4db2ccf99fab81ec20
-  is applied to add the CycleTimer implementation for M68k
-* https://github.com/google/benchmark/commit/d9abf017632be4a00b92cf4289539b353fcea5d2
-  is applied to rename 'mftbl' to 'mftb'.

diff --git a/libcxx/utils/google-benchmark/README.md b/libcxx/utils/google-benchmark/README.md
index 858ea2334ef50..aa61cef1b162f 100644
--- a/libcxx/utils/google-benchmark/README.md
+++ b/libcxx/utils/google-benchmark/README.md
@@ -1,32 +1,111 @@
-# benchmark
+# Benchmark
+
+[![build-and-test](https://github.com/google/benchmark/workflows/build-and-test/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Abuild-and-test)
+[![bazel](https://github.com/google/benchmark/actions/workflows/bazel.yml/badge.svg)](https://github.com/google/benchmark/actions/workflows/bazel.yml)
+[![pylint](https://github.com/google/benchmark/workflows/pylint/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Apylint)
+[![test-bindings](https://github.com/google/benchmark/workflows/test-bindings/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Atest-bindings)
+
 [![Build Status](https://travis-ci.org/google/benchmark.svg?branch=master)](https://travis-ci.org/google/benchmark)
-[![Build status](https://ci.appveyor.com/api/projects/status/u0qsyp7t1tk7cpxs/branch/master?svg=true)](https://ci.appveyor.com/project/google/benchmark/branch/master)
 [![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark)
-[![slackin](https://slackin-iqtfqnpzxd.now.sh/badge.svg)](https://slackin-iqtfqnpzxd.now.sh/)
 
-A library to support the benchmarking of functions, similar to unit-tests.
+
+A library to benchmark code snippets, similar to unit tests. Example:
+
+```c++
+#include <benchmark/benchmark.h>
+
+static void BM_SomeFunction(benchmark::State& state) {
+  // Perform setup here
+  for (auto _ : state) {
+    // This code gets timed
+    SomeFunction();
+  }
+}
+// Register the function as a benchmark
+BENCHMARK(BM_SomeFunction);
+// Run the benchmark
+BENCHMARK_MAIN();
+```
+
+To get started, see [Requirements](#requirements) and
+[Installation](#installation). See [Usage](#usage) for a full example and the
+[User Guide](#user-guide) for a more comprehensive feature overview.
+
+It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/master/docs/primer.md)
+as some of the structural aspects of the APIs are similar.
+
+### Resources
 
 [Discussion group](https://groups.google.com/d/forum/benchmark-discuss)
 
-IRC channel: [freenode](https://freenode.net) #googlebenchmark
+IRC channels:
+* [libera](https://libera.chat) #benchmark
 
 [Additional Tooling Documentation](docs/tools.md)
 
 [Assembly Testing Documentation](docs/AssemblyTests.md)
 
+## Requirements
+
+The library can be used with C++03. However, it requires C++11 to build,
+including compiler and standard library support.
+
+The following minimum versions are required to build the library:
+
+* GCC 4.8
+* Clang 3.4
+* Visual Studio 14 2015
+* Intel 2015 Update 1
+
+See [Platform-Specific Build Instructions](#platform-specific-build-instructions).
 
-## Building
+## Installation
 
-The basic steps for configuring and building the library look like this:
+This describes the installation process using cmake. As pre-requisites, you'll
+need git and cmake installed.
+
+_See [dependencies.md](dependencies.md) for more details regarding supported
+versions of build tools._
 
 ```bash
+# Check out the library.
 $ git clone https://github.com/google/benchmark.git
 # Benchmark requires Google Test as a dependency. Add the source tree as a subdirectory.
 $ git clone https://github.com/google/googletest.git benchmark/googletest
-$ mkdir build && cd build
-$ cmake -G <generator> [options] ../benchmark
-# Assuming a makefile generator was used
-$ make
+# Go to the library root directory
+$ cd benchmark
+# Make a build directory to place the build output.
+$ cmake -E make_directory "build"
+# Generate build system files with cmake.
+$ cmake -E chdir "build" cmake -DCMAKE_BUILD_TYPE=Release ../
+# or, starting with CMake 3.13, use a simpler form:
+# cmake -DCMAKE_BUILD_TYPE=Release -S . -B "build"
+# Build the library.
+$ cmake --build "build" --config Release
+```
+This builds the `benchmark` and `benchmark_main` libraries and tests.
+On a unix system, the build directory should now look something like this:
+
+```
+/benchmark
+  /build
+    /src
+      /libbenchmark.a
+      /libbenchmark_main.a
+    /test
+      ...
+```
+
+Next, you can run the tests to check the build.
+
+```bash
+$ cmake -E chdir "build" ctest --build-config Release
+```
+
+If you want to install the library globally, also run:
+
+```
+sudo cmake --build "build" --config Release --target install
 ```
 
 Note that Google Benchmark requires Google Test to build and run the tests. This
@@ -40,37 +119,25 @@ dependency can be provided two ways:
 If you do not wish to build and run the tests, add `-DBENCHMARK_ENABLE_GTEST_TESTS=OFF`
 to `CMAKE_ARGS`.
 
+### Debug vs Release
 
-## Installation Guide
-
-For Ubuntu and Debian Based System
-
-First make sure you have git and cmake installed (If not please install them)
-
-```
-sudo apt-get install git cmake
-```
-
-Now, let's clone the repository and build it
+By default, benchmark builds as a debug library. You will see a warning in the
+output when this is the case. To build it as a release library instead, add
+`-DCMAKE_BUILD_TYPE=Release` when generating the build system files, as shown
+above. The use of `--config Release` in build commands is needed to properly
+support multi-configuration tools (like Visual Studio for example) and can be
+skipped for other build systems (like Makefile).
 
-```
-git clone https://github.com/google/benchmark.git
-cd benchmark
-# If you want to build tests and don't use BENCHMARK_DOWNLOAD_DEPENDENCIES, then
-# git clone https://github.com/google/googletest.git
-mkdir build
-cd build
-cmake .. -DCMAKE_BUILD_TYPE=RELEASE
-make
-```
+To enable link-time optimisation, also add `-DBENCHMARK_ENABLE_LTO=true` when
+generating the build system files.
 
-If you need to install the library globally
+If you are using gcc, you might need to set `GCC_AR` and `GCC_RANLIB` cmake
+cache variables, if autodetection fails.
 
-```
-sudo make install
-```
+If you are using clang, you may need to set `LLVMAR_EXECUTABLE`,
+`LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables.
 
-## Stable and Experimental Library Versions
+### Stable and Experimental Library Versions
 
 The main branch contains the latest stable version of the benchmarking library;
 the API of which can be considered largely stable, with source breaking changes
@@ -82,16 +149,13 @@ to use, test, and provide feedback on the new features are encouraged to try
 this branch. However, this branch provides no stability guarantees and reserves
 the right to change and break the API at any time.
 
-## Further knowledge
-
-It may help to read the [Google Test documentation](https://github.com/google/googletest/blob/master/googletest/docs/primer.md)
-as some of the structural aspects of the APIs are similar.
+## Usage
 
-## Example usage
 ### Basic usage
-Define a function that executes the code to be measured, register it as a
-benchmark function using the `BENCHMARK` macro, and ensure an appropriate `main`
-function is available:
+
+Define a function that executes the code to measure, register it as a benchmark
+function using the `BENCHMARK` macro, and ensure an appropriate `main` function
+is available:
 
 ```c++
 #include <benchmark/benchmark.h>
@@ -114,15 +178,47 @@ BENCHMARK(BM_StringCopy);
 BENCHMARK_MAIN();
 ```
 
-Don't forget to inform your linker to add benchmark library e.g. through 
-`-lbenchmark` compilation flag. Alternatively, you may leave out the 
-`BENCHMARK_MAIN();` at the end of the source file and link against 
-`-lbenchmark_main` to get the same default behavior.
+To run the benchmark, compile and link against the `benchmark` library
+(libbenchmark.a/.so). If you followed the build steps above, this library will 
+be under the build directory you created.
+
+```bash
+# Example on linux after running the build steps above. Assumes the
+# `benchmark` and `build` directories are under the current directory.
+$ g++ mybenchmark.cc -std=c++11 -isystem benchmark/include \
+  -Lbenchmark/build/src -lbenchmark -lpthread -o mybenchmark
+```
+
+Alternatively, link against the `benchmark_main` library and remove
+`BENCHMARK_MAIN();` above to get the same behavior.
+
+The compiled executable will run all benchmarks by default. Pass the `--help`
+flag for option information or see the guide below.
+
+### Usage with CMake
+
+If using CMake, it is recommended to link against the project-provided
+`benchmark::benchmark` and `benchmark::benchmark_main` targets using
+`target_link_libraries`.
+It is possible to use ```find_package``` to import an installed version of the
+library.
+```cmake
+find_package(benchmark REQUIRED)
+```
+Alternatively, ```add_subdirectory``` will incorporate the library directly in
+to one's CMake project.
+```cmake
+add_subdirectory(benchmark)
+```
+Either way, link to the library as follows.
+```cmake
+target_link_libraries(MyTarget benchmark::benchmark)
+```
+
+## Platform Specific Build Instructions
 
-The benchmark library will measure and report the timing for code within the
-`for(...)` loop.
+### Building with GCC
 
-#### Platform-specific libraries
 When the library is built using GCC it is necessary to link with the pthread
 library due to how GCC implements `std::thread`. Failing to link to pthread will
 lead to runtime exceptions (unless you're using libc++), not linker errors. See
@@ -131,13 +227,282 @@ can link to pthread by adding `-pthread` to your linker command. Note, you can
 also use `-lpthread`, but there are potential issues with ordering of command
 line parameters if you use that.
 
-If you're running benchmarks on Windows, the shlwapi library (`-lshlwapi`) is
-also required.
+### Building with Visual Studio 2015 or 2017
+
+The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following:
+
+```
+// Alternatively, can add libraries using linker options.
+#ifdef _WIN32
+#pragma comment ( lib, "Shlwapi.lib" )
+#ifdef _DEBUG
+#pragma comment ( lib, "benchmarkd.lib" )
+#else
+#pragma comment ( lib, "benchmark.lib" )
+#endif
+#endif
+```
+
+Can also use the graphical version of CMake:
+* Open `CMake GUI`.
+* Under `Where to build the binaries`, same path as source plus `build`.
+* Under `CMAKE_INSTALL_PREFIX`, same path as source plus `install`.
+* Click `Configure`, `Generate`, `Open Project`.
+* If build fails, try deleting entire directory and starting again, or unticking options to build less.
+
+### Building with Intel 2015 Update 1 or Intel System Studio Update 4
+
+See instructions for building with Visual Studio. Once built, right click on the solution and change the build to Intel.
+
+### Building on Solaris
 
 If you're running benchmarks on Solaris, you'll want the kstat library linked in
 too (`-lkstat`).
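 
 For example, the link line from the hypothetical build example above would
 gain `-lkstat`:
 
 ```bash
 # Hypothetical Solaris link line; paths follow the earlier example.
 $ g++ mybenchmark.cc -std=c++11 -isystem benchmark/include \
   -Lbenchmark/build/src -lbenchmark -lkstat -o mybenchmark
 ```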
 
-### Passing arguments
+## User Guide
+
+### Command Line
+
+[Output Formats](#output-formats)
+
+[Output Files](#output-files)
+
+[Running Benchmarks](#running-benchmarks)
+
+[Running a Subset of Benchmarks](#running-a-subset-of-benchmarks)
+
+[Result Comparison](#result-comparison)
+
+[Extra Context](#extra-context)
+
+### Library
+
+[Runtime and Reporting Considerations](#runtime-and-reporting-considerations)
+
+[Passing Arguments](#passing-arguments)
+
+[Custom Benchmark Name](#custom-benchmark-name)
+
+[Calculating Asymptotic Complexity](#asymptotic-complexity)
+
+[Templated Benchmarks](#templated-benchmarks)
+
+[Fixtures](#fixtures)
+
+[Custom Counters](#custom-counters)
+
+[Multithreaded Benchmarks](#multithreaded-benchmarks)
+
+[CPU Timers](#cpu-timers)
+
+[Manual Timing](#manual-timing)
+
+[Setting the Time Unit](#setting-the-time-unit)
+
+[Random Interleaving](docs/random_interleaving.md)
+
+[User-Requested Performance Counters](docs/perf_counters.md)
+
+[Preventing Optimization](#preventing-optimization)
+
+[Reporting Statistics](#reporting-statistics)
+
+[Custom Statistics](#custom-statistics)
+
+[Using RegisterBenchmark](#using-register-benchmark)
+
+[Exiting with an Error](#exiting-with-an-error)
+
+[A Faster KeepRunning Loop](#a-faster-keep-running-loop)
+
+[Disabling CPU Frequency Scaling](#disabling-cpu-frequency-scaling)
+
+
+<a name="output-formats" />
+
+### Output Formats
+
+The library supports multiple output formats. Use the
+`--benchmark_format=<console|json|csv>` flag (or set the
+`BENCHMARK_FORMAT=<console|json|csv>` environment variable) to set
+the format type. `console` is the default format.
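+
+For example, to request JSON output (`mybenchmark` is a placeholder binary
+name):
+
+```bash
+$ ./mybenchmark --benchmark_format=json
+```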
+
+The Console format is intended to be human-readable. By default
+the format generates color output. Context is output on stderr and the
+tabular data on stdout. Example tabular output looks like:
+
+```
+Benchmark                               Time(ns)    CPU(ns) Iterations
+----------------------------------------------------------------------
+BM_SetInsert/1024/1                        28928      29349      23853  133.097kB/s   33.2742k items/s
+BM_SetInsert/1024/8                        32065      32913      21375  949.487kB/s   237.372k items/s
+BM_SetInsert/1024/10                       33157      33648      21431  1.13369MB/s   290.225k items/s
+```
+
+The JSON format outputs human-readable JSON split into two top-level attributes.
+The `context` attribute contains information about the run in general, including
+information about the CPU and the date.
+The `benchmarks` attribute contains a list of every benchmark run. Example json
+output looks like:
+
+```json
+{
+  "context": {
+    "date": "2015/03/17-18:40:25",
+    "num_cpus": 40,
+    "mhz_per_cpu": 2801,
+    "cpu_scaling_enabled": false,
+    "build_type": "debug"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_SetInsert/1024/1",
+      "iterations": 94877,
+      "real_time": 29275,
+      "cpu_time": 29836,
+      "bytes_per_second": 134066,
+      "items_per_second": 33516
+    },
+    {
+      "name": "BM_SetInsert/1024/8",
+      "iterations": 21609,
+      "real_time": 32317,
+      "cpu_time": 32429,
+      "bytes_per_second": 986770,
+      "items_per_second": 246693
+    },
+    {
+      "name": "BM_SetInsert/1024/10",
+      "iterations": 21393,
+      "real_time": 32724,
+      "cpu_time": 33355,
+      "bytes_per_second": 1199226,
+      "items_per_second": 299807
+    }
+  ]
+}
+```
+
+The CSV format outputs comma-separated values. The `context` is output on stderr
+and the CSV itself on stdout. Example CSV output looks like:
+
+```
+name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label
+"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942,
+"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115,
+"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06,
+```
+
+<a name="output-files" />
+
+### Output Files
+
+Write benchmark results to a file with the `--benchmark_out=<filename>` option
+(or set `BENCHMARK_OUT`). Specify the output format with
+`--benchmark_out_format={json|console|csv}` (or set
+`BENCHMARK_OUT_FORMAT={json|console|csv}`). Note that the 'csv' reporter is
+deprecated and the saved `.csv` file 
+[is not parsable](https://github.com/google/benchmark/issues/794) by csv 
+parsers.
+
+Specifying `--benchmark_out` does not suppress the console output.
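+
+For example, the following sketch writes JSON results to `results.json` while
+the console report still prints (`mybenchmark` is a placeholder):
+
+```bash
+$ ./mybenchmark --benchmark_out=results.json --benchmark_out_format=json
+```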
+
+<a name="running-benchmarks" />
+
+### Running Benchmarks
+
+Benchmarks are executed by running the produced binaries. Benchmark binaries,
+by default, accept options that may be specified either through their command
+line interface or by setting environment variables before execution. For every
+`--option_flag=<value>` CLI switch, a corresponding environment variable
+`OPTION_FLAG=<value>` exists and is used as the default if set (the CLI switch
+always prevails). A complete list of CLI options is available by running
+benchmarks with the `--help` switch.
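+
+For example, the following invocations are equivalent; if both the environment
+variable and the switch are given, the switch takes precedence (`mybenchmark`
+is a placeholder):
+
+```bash
+$ BENCHMARK_FILTER=BM_memcpy ./mybenchmark
+$ ./mybenchmark --benchmark_filter=BM_memcpy
+```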
+
+<a name="running-a-subset-of-benchmarks" />
+
+### Running a Subset of Benchmarks
+
+The `--benchmark_filter=<regex>` option (or `BENCHMARK_FILTER=<regex>`
+environment variable) can be used to only run the benchmarks that match
+the specified `<regex>`. For example:
+
+```bash
+$ ./run_benchmarks.x --benchmark_filter=BM_memcpy/32
+Run on (1 X 2300 MHz CPU )
+2016-06-25 19:34:24
+Benchmark              Time           CPU Iterations
+----------------------------------------------------
+BM_memcpy/32          11 ns         11 ns   79545455
+BM_memcpy/32k       2181 ns       2185 ns     324074
+BM_memcpy/32          12 ns         12 ns   54687500
+BM_memcpy/32k       1834 ns       1837 ns     357143
+```
+
+<a name="result-comparison" />
+
+### Result Comparison
+
+It is possible to compare the benchmarking results.
+See the [Additional Tooling Documentation](docs/tools.md).
+
+<a name="extra-context" />
+
+### Extra Context
+
+Sometimes it's useful to add extra context to the content printed before the
+results. By default this section includes information about the CPU on which
+the benchmarks are running. If you want to add more context, you can use
+the `--benchmark_context` command-line flag:
+
+```bash
+$ ./run_benchmarks --benchmark_context=pwd=`pwd`
+Run on (1 x 2300 MHz CPU)
+pwd: /home/user/benchmark/
+Benchmark              Time           CPU Iterations
+----------------------------------------------------
+BM_memcpy/32          11 ns         11 ns   79545455
+BM_memcpy/32k       2181 ns       2185 ns     324074
+```
+
+You can get the same effect with the API:
+
+```c++
+  benchmark::AddCustomContext("foo", "bar");
+```
+
+Note that attempts to add a second value with the same key will fail with an
+error message.
+
+<a name="runtime-and-reporting-considerations" />
+
+### Runtime and Reporting Considerations
+
+When the benchmark binary is executed, each benchmark function is run serially.
+The number of iterations to run is determined dynamically by running the
+benchmark a few times, measuring the time taken, and ensuring that the
+ultimate result will be statistically stable. As such, faster benchmark
+functions will be run for more iterations than slower benchmark functions, and
+the number of iterations is thus reported.
+
+In all cases, the number of iterations for which the benchmark is run is
+governed by the amount of time the benchmark takes. Concretely, the number of
+iterations is at least one and not more than 1e9; iteration continues until
+the CPU time exceeds the minimum time, or the wall-clock time reaches 5x the
+minimum time. The minimum time is set per benchmark by calling `MinTime` on
+the registered benchmark object.
+
+Average timings are then reported over the iterations run. If multiple
+repetitions are requested using the `--benchmark_repetitions` command-line
+option, or at registration time, the benchmark function will be run several
+times and statistical results across these repetitions will also be reported.
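+
+For example, a sketch that raises the minimum measurement time and requests
+repetitions at registration time (`BM_noisy` is a hypothetical benchmark
+function, not from the original):
+
+```c++
+// Run for at least 2 seconds of CPU time and repeat the whole
+// measurement 4 times; BM_noisy is hypothetical.
+BENCHMARK(BM_noisy)->MinTime(2.0)->Repetitions(4);
+```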
+
+As well as the per-benchmark entries, a preamble in the report will include
+information about the machine on which the benchmarks are run.
+
+<a name="passing-arguments" />
+
+### Passing Arguments
+
 Sometimes a family of benchmarks can be implemented with just one routine that
 takes an extra argument to specify which one of the family of benchmarks to
 run. For example, the following code defines a family of benchmarks for
@@ -173,8 +538,26 @@ range multiplier is changed to multiples of two.
 ```c++
 BENCHMARK(BM_memcpy)->RangeMultiplier(2)->Range(8, 8<<10);
 ```
+
 Now arguments generated are [ 8, 16, 32, 64, 128, 256, 512, 1024, 2k, 4k, 8k ].
 
+The preceding code shows a method of defining a sparse range.  The following
+example shows a method of defining a dense range. It is then used to benchmark
+the performance of `std::vector` initialization for uniformly increasing sizes.
+
+```c++
+static void BM_DenseRange(benchmark::State& state) {
+  for(auto _ : state) {
+    std::vector<int> v(state.range(0), state.range(0));
+    benchmark::DoNotOptimize(v.data());
+    benchmark::ClobberMemory();
+  }
+}
+BENCHMARK(BM_DenseRange)->DenseRange(0, 1024, 128);
+```
+
+Now arguments generated are [ 0, 128, 256, 384, 512, 640, 768, 896, 1024 ].
+
 You might have a benchmark that depends on two or more inputs. For example, the
 following code defines a family of benchmarks for measuring the speed of set
 insertion.
@@ -210,6 +593,29 @@ pair.
 BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}});
 ```
 
+Some benchmarks may require specific argument values that cannot be expressed
+with `Ranges`. In this case, `ArgsProduct` offers the ability to generate a
+benchmark input for each combination in the product of the supplied vectors.
+
+```c++
+BENCHMARK(BM_SetInsert)
+    ->ArgsProduct({{1<<10, 3<<10, 8<<10}, {20, 40, 60, 80}})
+// would generate the same benchmark arguments as
+BENCHMARK(BM_SetInsert)
+    ->Args({1<<10, 20})
+    ->Args({3<<10, 20})
+    ->Args({8<<10, 20})
+    ->Args({3<<10, 40})
+    ->Args({8<<10, 40})
+    ->Args({1<<10, 40})
+    ->Args({1<<10, 60})
+    ->Args({3<<10, 60})
+    ->Args({8<<10, 60})
+    ->Args({1<<10, 80})
+    ->Args({3<<10, 80})
+    ->Args({8<<10, 80});
+```
+
 For more complex patterns of inputs, passing a custom function to `Apply` allows
 programmatic specification of an arbitrary set of arguments on which to run the
 benchmark. The following example enumerates a dense range on one parameter,
@@ -224,7 +630,32 @@ static void CustomArguments(benchmark::internal::Benchmark* b) {
 BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
 ```
 
-### Calculate asymptotic complexity (Big O)
+#### Passing Arbitrary Arguments to a Benchmark
+
+In C++11 it is possible to define a benchmark that takes an arbitrary number
+of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)`
+macro creates a benchmark that invokes `func` with the `benchmark::State` as
+the first argument followed by the specified `args...`.
+The `test_case_name` is appended to the name of the benchmark and
+should describe the values passed.
+
+```c++
+template <class ...ExtraArgs>
+void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
+  [...]
+}
+// Registers a benchmark named "BM_takes_args/int_string_test" that passes
+// the specified values to `extra_args`.
+BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
+```
+
+Note that elements of `...args` may refer to global variables. Users should
+avoid modifying global state inside of a benchmark.
+
+<a name="asymptotic-complexity" />
+
+### Calculating Asymptotic Complexity (Big O)
+
 Asymptotic complexity might be calculated for a family of benchmarks. The
 following code will calculate the coefficient for the high-order term in the
 running time and the normalized root-mean square error of string comparison.
@@ -255,13 +686,28 @@ that might be used to customize high-order term calculation.
 
 ```c++
 BENCHMARK(BM_StringCompare)->RangeMultiplier(2)
-    ->Range(1<<10, 1<<18)->Complexity([](int64_t n)->double{return n; });
+    ->Range(1<<10, 1<<18)->Complexity([](benchmark::IterationCount n)->double{return n; });
+```
+
+<a name="custom-benchmark-name" />
+
+### Custom Benchmark Name
+
+You can change the benchmark's name as follows:
+
+```c++
+BENCHMARK(BM_memcpy)->Name("memcpy")->RangeMultiplier(2)->Range(8, 8<<10);
 ```
 
-### Templated benchmarks
-Templated benchmarks work the same way: This example produces and consumes
-messages of size `sizeof(v)` `range_x` times. It also outputs throughput in the
-absence of multiprogramming.
+This invocation will execute the benchmark as before using `BM_memcpy`, but
+will change the prefix in the report to `memcpy`.
+
+<a name="templated-benchmarks" />
+
+### Templated Benchmarks
+
+This example produces and consumes messages of size `sizeof(v)` `range_x`
+times. It also outputs throughput in the absence of multiprogramming.
 
 ```c++
 template <class Q> void BM_Sequential(benchmark::State& state) {
@@ -292,219 +738,392 @@ Three macros are provided for adding benchmark templates.
 #define BENCHMARK_TEMPLATE2(func, arg1, arg2)
 ```
 
-### A Faster KeepRunning loop
+<a name="fixtures" />
 
-In C++11 mode, a ranged-based for loop should be used in preference to
-the `KeepRunning` loop for running the benchmarks. For example:
-
-```c++
-static void BM_Fast(benchmark::State &state) {
-  for (auto _ : state) {
-    FastOperation();
-  }
-}
-BENCHMARK(BM_Fast);
-```
-
-The reason the ranged-for loop is faster than using `KeepRunning`, is
-because `KeepRunning` requires a memory load and store of the iteration count
-ever iteration, whereas the ranged-for variant is able to keep the iteration count
-in a register.
+### Fixtures
 
-For example, an empty inner loop of using the ranged-based for method looks like:
+Fixture tests are created by first defining a type that derives from
+`::benchmark::Fixture` and then creating/registering the tests using the
+following macros:
 
-```asm
-# Loop Init
-  mov rbx, qword ptr [r14 + 104]
-  call benchmark::State::StartKeepRunning()
-  test rbx, rbx
-  je .LoopEnd
-.LoopHeader: # =>This Inner Loop Header: Depth=1
-  add rbx, -1
-  jne .LoopHeader
-.LoopEnd:
-```
+* `BENCHMARK_F(ClassName, Method)`
+* `BENCHMARK_DEFINE_F(ClassName, Method)`
+* `BENCHMARK_REGISTER_F(ClassName, Method)`
 
-Compared to an empty `KeepRunning` loop, which looks like:
+For example:
 
-```asm
-.LoopHeader: # in Loop: Header=BB0_3 Depth=1
-  cmp byte ptr [rbx], 1
-  jne .LoopInit
-.LoopBody: # =>This Inner Loop Header: Depth=1
-  mov rax, qword ptr [rbx + 8]
-  lea rcx, [rax + 1]
-  mov qword ptr [rbx + 8], rcx
-  cmp rax, qword ptr [rbx + 104]
-  jb .LoopHeader
-  jmp .LoopEnd
-.LoopInit:
-  mov rdi, rbx
-  call benchmark::State::StartKeepRunning()
-  jmp .LoopBody
-.LoopEnd:
-```
+```c++
+class MyFixture : public benchmark::Fixture {
+public:
+  void SetUp(const ::benchmark::State& state) {
+  }
 
-Unless C++03 compatibility is required, the ranged-for variant of writing
-the benchmark loop should be preferred.  
+  void TearDown(const ::benchmark::State& state) {
+  }
+};
 
-## Passing arbitrary arguments to a benchmark
-In C++11 it is possible to define a benchmark that takes an arbitrary number
-of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)`
-macro creates a benchmark that invokes `func`  with the `benchmark::State` as
-the first argument followed by the specified `args...`.
-The `test_case_name` is appended to the name of the benchmark and
-should describe the values passed.
+BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
+   for (auto _ : st) {
+     ...
+  }
+}
 
-```c++
-template <class ...ExtraArgs>
-void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
-  [...]
+BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) {
+   for (auto _ : st) {
+     ...
+  }
 }
-// Registers a benchmark named "BM_takes_args/int_string_test" that passes
-// the specified values to `extra_args`.
-BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
+/* BarTest is NOT registered */
+BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2);
+/* BarTest is now registered */
 ```
-Note that elements of `...args` may refer to global variables. Users should
-avoid modifying global state inside of a benchmark.
 
-## Using RegisterBenchmark(name, fn, args...)
+#### Templated Fixtures
 
-The `RegisterBenchmark(name, func, args...)` function provides an alternative
-way to create and register benchmarks.
-`RegisterBenchmark(name, func, args...)` creates, registers, and returns a
-pointer to a new benchmark with the specified `name` that invokes
-`func(st, args...)` where `st` is a `benchmark::State` object.
+You can also create templated fixtures by using the following macros:
 
-Unlike the `BENCHMARK` registration macros, which can only be used at the global
-scope, the `RegisterBenchmark` can be called anywhere. This allows for
-benchmark tests to be registered programmatically.
+* `BENCHMARK_TEMPLATE_F(ClassName, Method, ...)`
+* `BENCHMARK_TEMPLATE_DEFINE_F(ClassName, Method, ...)`
 
-Additionally `RegisterBenchmark` allows any callable object to be registered
-as a benchmark. Including capturing lambdas and function objects.
+For example:
 
-For Example:
 ```c++
-auto BM_test = [](benchmark::State& st, auto Inputs) { /* ... */ };
+template<typename T>
+class MyFixture : public benchmark::Fixture {};
 
-int main(int argc, char** argv) {
-  for (auto& test_input : { /* ... */ })
-      benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input);
-  benchmark::Initialize(&argc, argv);
-  benchmark::RunSpecifiedBenchmarks();
+BENCHMARK_TEMPLATE_F(MyFixture, IntTest, int)(benchmark::State& st) {
+   for (auto _ : st) {
+     ...
+  }
 }
-```
-
-### Multithreaded benchmarks
-In a multithreaded test (benchmark invoked by multiple threads simultaneously),
-it is guaranteed that none of the threads will start until all have reached
-the start of the benchmark loop, and all will have finished before any thread
-exits the benchmark loop. (This behavior is also provided by the `KeepRunning()`
-API) As such, any global setup or teardown can be wrapped in a check against the thread
-index:
 
-```c++
-static void BM_MultiThreaded(benchmark::State& state) {
-  if (state.thread_index == 0) {
-    // Setup code here.
-  }
-  for (auto _ : state) {
-    // Run the test as normal.
-  }
-  if (state.thread_index == 0) {
-    // Teardown code here.
+BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, DoubleTest, double)(benchmark::State& st) {
+   for (auto _ : st) {
+     ...
   }
 }
-BENCHMARK(BM_MultiThreaded)->Threads(2);
-```
-
-If the benchmarked code itself uses threads and you want to compare it to
-single-threaded code, you may want to use real-time ("wallclock") measurements
-for latency comparisons:
 
-```c++
-BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();
+BENCHMARK_REGISTER_F(MyFixture, DoubleTest)->Threads(2);
 ```
 
-Without `UseRealTime`, CPU time is used by default.
+<a name="custom-counters" />
 
-## Controlling timers
-Normally, the entire duration of the work loop (`for (auto _ : state) {}`)
-is measured. But sometimes, it is nessesary to do some work inside of
-that loop, every iteration, but without counting that time to the benchmark time.
-That is possible, althought it is not recommended, since it has high overhead.
+### Custom Counters
+
+You can add your own counters with user-defined names. The example below
+will add columns "Foo", "Bar" and "Baz" in its output:
 
 ```c++
-static void BM_SetInsert_With_Timer_Control(benchmark::State& state) {
-  std::set<int> data;
+static void UserCountersExample1(benchmark::State& state) {
+  double numFoos = 0, numBars = 0, numBazs = 0;
   for (auto _ : state) {
-    state.PauseTiming(); // Stop timers. They will not count until they are resumed.
-    data = ConstructRandomSet(state.range(0)); // Do something that should not be measured
-    state.ResumeTiming(); // And resume timers. They are now counting again.
-    // The rest will be measured.
-    for (int j = 0; j < state.range(1); ++j)
-      data.insert(RandomNumber());
+    // ... count Foo,Bar,Baz events
   }
+  state.counters["Foo"] = numFoos;
+  state.counters["Bar"] = numBars;
+  state.counters["Baz"] = numBazs;
 }
-BENCHMARK(BM_SetInsert_With_Timer_Control)->Ranges({{1<<10, 8<<10}, {128, 512}});
 ```
 
-## Manual timing
-For benchmarking something for which neither CPU time nor real-time are
-correct or accurate enough, completely manual timing is supported using
-the `UseManualTime` function.
+The `state.counters` object is a `std::map` with `std::string` keys
+and `Counter` values. The latter is a `double`-like class, via an implicit
+conversion to `double&`. Thus you can use all of the standard arithmetic
+assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter.
 
-When `UseManualTime` is used, the benchmarked code must call
-`SetIterationTime` once per iteration of the benchmark loop to
-report the manually measured time.
+In multithreaded benchmarks, each counter is set on the calling thread only.
+When the benchmark finishes, the counters from each thread will be summed;
+the resulting sum is the value which will be shown for the benchmark.
 
-An example use case for this is benchmarking GPU execution (e.g. OpenCL
-or CUDA kernels, OpenGL or Vulkan or Direct3D draw calls), which cannot
-be accurately measured using CPU time or real-time. Instead, they can be
-measured accurately using a dedicated API, and these measurement results
-can be reported back with `SetIterationTime`.
+The `Counter` constructor accepts three parameters: the value as a `double`;
+a bit flag which allows you to show counters as rates, and/or as per-thread
+iterations, and/or as per-thread averages, and/or iteration invariants,
+and/or finally inverting the result; and a flag specifying the 'unit', i.e.
+whether 1k means 1000 (default, `benchmark::Counter::OneK::kIs1000`) or 1024
+(`benchmark::Counter::OneK::kIs1024`).
 
 ```c++
-static void BM_ManualTiming(benchmark::State& state) {
-  int microseconds = state.range(0);
-  std::chrono::duration<double, std::micro> sleep_duration {
-    static_cast<double>(microseconds)
-  };
+  // sets a simple counter
+  state.counters["Foo"] = numFoos;
 
-  for (auto _ : state) {
-    auto start = std::chrono::high_resolution_clock::now();
-    // Simulate some useful workload with a sleep
-    std::this_thread::sleep_for(sleep_duration);
-    auto end   = std::chrono::high_resolution_clock::now();
+  // Set the counter as a rate. It will be presented divided
+  // by the duration of the benchmark.
+  // Meaning: per one second, how many 'foo's are processed?
+  state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate);
 
-    auto elapsed_seconds =
-      std::chrono::duration_cast<std::chrono::duration<double>>(
-        end - start);
+  // Set the counter as a rate. It will be presented divided
+  // by the duration of the benchmark, and the result inverted.
+  // Meaning: how many seconds it takes to process one 'foo'?
+  state.counters["FooInvRate"] = Counter(numFoos, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
 
-    state.SetIterationTime(elapsed_seconds.count());
-  }
-}
-BENCHMARK(BM_ManualTiming)->Range(1, 1<<17)->UseManualTime();
-```
+  // Set the counter as a thread-average quantity. It will
+  // be presented divided by the number of threads.
+  state.counters["FooAvg"] = Counter(numFoos, benchmark::Counter::kAvgThreads);
 
-### Preventing optimisation
-To prevent a value or expression from being optimized away by the compiler
-the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()`
-functions can be used.
+  // There's also a combined flag:
+  state.counters["FooAvgRate"] = Counter(numFoos,benchmark::Counter::kAvgThreadsRate);
 
-```c++
-static void BM_test(benchmark::State& state) {
-  for (auto _ : state) {
-      int x = 0;
-      for (int i=0; i < 64; ++i) {
-        benchmark::DoNotOptimize(x += i);
-      }
-  }
-}
+  // This says that we process with the rate of state.range(0) bytes every iteration:
+  state.counters["BytesProcessed"] = Counter(state.range(0), benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024);
 ```
 
-`DoNotOptimize(<expr>)` forces the  *result* of `<expr>` to be stored in either
-memory or a register. For GNU based compilers it acts as read/write barrier
+When you're compiling in C++11 mode or later, you can use `insert()` with
+`std::initializer_list`:
+
+```c++
+  // With C++11, this can be done:
+  state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
+  // ... instead of:
+  state.counters["Foo"] = numFoos;
+  state.counters["Bar"] = numBars;
+  state.counters["Baz"] = numBazs;
+```
+
+#### Counter Reporting
+
+When using the console reporter, by default, user counters are printed at
+the end after the table, the same way as ``bytes_processed`` and
+``items_processed``. This is best for cases in which there are few counters,
+or where there are only a couple of lines per benchmark. Here's an example of
+the default output:
+
+```
+------------------------------------------------------------------------------
+Benchmark                        Time           CPU Iterations UserCounters...
+------------------------------------------------------------------------------
+BM_UserCounter/threads:8      2248 ns      10277 ns      68808 Bar=16 Bat=40 Baz=24 Foo=8
+BM_UserCounter/threads:1      9797 ns       9788 ns      71523 Bar=2 Bat=5 Baz=3 Foo=1024m
+BM_UserCounter/threads:2      4924 ns       9842 ns      71036 Bar=4 Bat=10 Baz=6 Foo=2
+BM_UserCounter/threads:4      2589 ns      10284 ns      68012 Bar=8 Bat=20 Baz=12 Foo=4
+BM_UserCounter/threads:8      2212 ns      10287 ns      68040 Bar=16 Bat=40 Baz=24 Foo=8
+BM_UserCounter/threads:16     1782 ns      10278 ns      68144 Bar=32 Bat=80 Baz=48 Foo=16
+BM_UserCounter/threads:32     1291 ns      10296 ns      68256 Bar=64 Bat=160 Baz=96 Foo=32
+BM_UserCounter/threads:4      2615 ns      10307 ns      68040 Bar=8 Bat=20 Baz=12 Foo=4
+BM_Factorial                    26 ns         26 ns   26608979 40320
+BM_Factorial/real_time          26 ns         26 ns   26587936 40320
+BM_CalculatePiRange/1           16 ns         16 ns   45704255 0
+BM_CalculatePiRange/8           73 ns         73 ns    9520927 3.28374
+BM_CalculatePiRange/64         609 ns        609 ns    1140647 3.15746
+BM_CalculatePiRange/512       4900 ns       4901 ns     142696 3.14355
+```
+
+If this doesn't suit you, you can print each counter as a table column by
+passing the flag `--benchmark_counters_tabular=true` to the benchmark
+application. This is best for cases in which there are a lot of counters, or
+a lot of lines per individual benchmark. Note that this will trigger a
+reprinting of the table header any time the counter set changes between
+individual benchmarks. Here's an example of corresponding output when
+`--benchmark_counters_tabular=true` is passed:
+
+```
+---------------------------------------------------------------------------------------
+Benchmark                        Time           CPU Iterations    Bar   Bat   Baz   Foo
+---------------------------------------------------------------------------------------
+BM_UserCounter/threads:8      2198 ns       9953 ns      70688     16    40    24     8
+BM_UserCounter/threads:1      9504 ns       9504 ns      73787      2     5     3     1
+BM_UserCounter/threads:2      4775 ns       9550 ns      72606      4    10     6     2
+BM_UserCounter/threads:4      2508 ns       9951 ns      70332      8    20    12     4
+BM_UserCounter/threads:8      2055 ns       9933 ns      70344     16    40    24     8
+BM_UserCounter/threads:16     1610 ns       9946 ns      70720     32    80    48    16
+BM_UserCounter/threads:32     1192 ns       9948 ns      70496     64   160    96    32
+BM_UserCounter/threads:4      2506 ns       9949 ns      70332      8    20    12     4
+--------------------------------------------------------------
+Benchmark                        Time           CPU Iterations
+--------------------------------------------------------------
+BM_Factorial                    26 ns         26 ns   26392245 40320
+BM_Factorial/real_time          26 ns         26 ns   26494107 40320
+BM_CalculatePiRange/1           15 ns         15 ns   45571597 0
+BM_CalculatePiRange/8           74 ns         74 ns    9450212 3.28374
+BM_CalculatePiRange/64         595 ns        595 ns    1173901 3.15746
+BM_CalculatePiRange/512       4752 ns       4752 ns     147380 3.14355
+BM_CalculatePiRange/4k       37970 ns      37972 ns      18453 3.14184
+BM_CalculatePiRange/32k     303733 ns     303744 ns       2305 3.14162
+BM_CalculatePiRange/256k   2434095 ns    2434186 ns        288 3.1416
+BM_CalculatePiRange/1024k  9721140 ns    9721413 ns         71 3.14159
+BM_CalculatePi/threads:8      2255 ns       9943 ns      70936
+```
+
+Note above the additional header printed when the benchmark changes from
+``BM_UserCounter`` to ``BM_Factorial``. This is because ``BM_Factorial`` does
+not have the same counter set as ``BM_UserCounter``.
+
+<a name="multithreaded-benchmarks"/>
+
+### Multithreaded Benchmarks
+
+In a multithreaded test (benchmark invoked by multiple threads simultaneously),
+it is guaranteed that none of the threads will start until all have reached
+the start of the benchmark loop, and all will have finished before any thread
+exits the benchmark loop. (This behavior is also provided by the `KeepRunning()`
+API.) As such, any global setup or teardown can be wrapped in a check against
+the thread index:
+
+```c++
+static void BM_MultiThreaded(benchmark::State& state) {
+  if (state.thread_index == 0) {
+    // Setup code here.
+  }
+  for (auto _ : state) {
+    // Run the test as normal.
+  }
+  if (state.thread_index == 0) {
+    // Teardown code here.
+  }
+}
+BENCHMARK(BM_MultiThreaded)->Threads(2);
+```
+
+If the benchmarked code itself uses threads and you want to compare it to
+single-threaded code, you may want to use real-time ("wallclock") measurements
+for latency comparisons:
+
+```c++
+BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();
+```
+
+Without `UseRealTime`, CPU time is used by default.
+
+<a name="cpu-timers" />
+
+### CPU Timers
+
+By default, the CPU timer only measures the time spent by the main thread.
+If the benchmark itself uses threads internally, this measurement may not
+be what you are looking for. Instead, there is a way to measure the total
+CPU usage of the process, by all the threads.
+
+```c++
+void callee(int i);
+
+static void MyMain(int size) {
+#pragma omp parallel for
+  for(int i = 0; i < size; i++)
+    callee(i);
+}
+
+static void BM_OpenMP(benchmark::State& state) {
+  for (auto _ : state)
+    MyMain(state.range(0));
+}
+
+// Measure the time spent by the main thread, and use it to decide for how long
+// to run the benchmark loop. Depending on internal implementation details, this
+// may measure anywhere from near-zero (the overhead spent before/after work
+// handoff to worker thread[s]) to the whole single-thread time.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10);
+
+// Measure the user-visible time, the wall clock (literally, the time that
+// has passed on the clock on the wall), and use it to decide for how long to
+// run the benchmark loop. This will always be meaningful, and will match the
+// time spent by the main thread in the single-threaded case, in general
+// decreasing with the number of internal threads doing the work.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->UseRealTime();
+
+// Measure the total CPU consumption, use it to decide for how long to
+// run the benchmark loop. This will always measure no less than the
+// time spent by the main thread in the single-threaded case.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime();
+
+// A mixture of the last two. Measure the total CPU consumption, but use the
+// wall clock to decide for how long to run the benchmark loop.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime()->UseRealTime();
+```
+
+#### Controlling Timers
+
+Normally, the entire duration of the work loop (`for (auto _ : state) {}`)
+is measured. But sometimes, it is necessary to do some work inside of
+that loop, every iteration, but without counting that time to the benchmark time.
+That is possible, although it is not recommended, since it has high overhead.
+
+```c++
+static void BM_SetInsert_With_Timer_Control(benchmark::State& state) {
+  std::set<int> data;
+  for (auto _ : state) {
+    state.PauseTiming(); // Stop timers. They will not count until they are resumed.
+    data = ConstructRandomSet(state.range(0)); // Do something that should not be measured
+    state.ResumeTiming(); // And resume timers. They are now counting again.
+    // The rest will be measured.
+    for (int j = 0; j < state.range(1); ++j)
+      data.insert(RandomNumber());
+  }
+}
+BENCHMARK(BM_SetInsert_With_Timer_Control)->Ranges({{1<<10, 8<<10}, {128, 512}});
+```
+
+<a name="manual-timing" />
+
+### Manual Timing
+
+For benchmarking something for which neither CPU time nor real-time is
+correct or accurate enough, completely manual timing is supported using
+the `UseManualTime` function.
+
+When `UseManualTime` is used, the benchmarked code must call
+`SetIterationTime` once per iteration of the benchmark loop to
+report the manually measured time.
+
+An example use case for this is benchmarking GPU execution (e.g. OpenCL
+or CUDA kernels, OpenGL or Vulkan or Direct3D draw calls), which cannot
+be accurately measured using CPU time or real-time. Instead, they can be
+measured accurately using a dedicated API, and these measurement results
+can be reported back with `SetIterationTime`.
+
+```c++
+static void BM_ManualTiming(benchmark::State& state) {
+  int microseconds = state.range(0);
+  std::chrono::duration<double, std::micro> sleep_duration {
+    static_cast<double>(microseconds)
+  };
+
+  for (auto _ : state) {
+    auto start = std::chrono::high_resolution_clock::now();
+    // Simulate some useful workload with a sleep
+    std::this_thread::sleep_for(sleep_duration);
+    auto end = std::chrono::high_resolution_clock::now();
+
+    auto elapsed_seconds =
+      std::chrono::duration_cast<std::chrono::duration<double>>(
+        end - start);
+
+    state.SetIterationTime(elapsed_seconds.count());
+  }
+}
+BENCHMARK(BM_ManualTiming)->Range(1, 1<<17)->UseManualTime();
+```
+
+<a name="setting-the-time-unit" />
+
+### Setting the Time Unit
+
+If a benchmark runs for a few milliseconds, it may be hard to visually compare
+the measured times, since the output data is given in nanoseconds by default.
+To set the time unit, you can specify it at registration time:
+
+```c++
+BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
+```
+
+<a name="preventing-optimization" />
+
+### Preventing Optimization
+
+To prevent a value or expression from being optimized away by the compiler
+the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()`
+functions can be used.
+
+```c++
+static void BM_test(benchmark::State& state) {
+  for (auto _ : state) {
+      int x = 0;
+      for (int i=0; i < 64; ++i) {
+        benchmark::DoNotOptimize(x += i);
+      }
+  }
+}
+```
+
+`DoNotOptimize(<expr>)` forces the *result* of `<expr>` to be stored in either
+memory or a register. For GNU-based compilers it acts as a read/write barrier
 for global memory. More specifically it forces the compiler to flush pending
 writes to memory and reload any other values as necessary.
 
@@ -545,16 +1164,10 @@ static void BM_vector_push_back(benchmark::State& state) {
 
 Note that `ClobberMemory()` is only available for GNU or MSVC based compilers.
 
-### Set time unit manually
-If a benchmark runs a few milliseconds it may be hard to visually compare the
-measured times, since the output data is given in nanoseconds per default. In
-order to manually set the time unit, you can specify it manually:
+<a name="reporting-statistics" />
 
-```c++
-BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
-```
+### Statistics: Reporting the Mean, Median and Standard Deviation of Repeated Benchmarks
 
-### Reporting the mean, median and standard deviation by repeated benchmarks
 By default each benchmark is run once and that single result is reported.
 However benchmarks are often noisy and a single result may not be representative
 of the overall behavior. For this reason it's possible to repeatedly rerun the
@@ -580,10 +1193,13 @@ Calling `ReportAggregatesOnly(bool)` / `DisplayAggregatesOnly(bool)` on a
 registered benchmark object overrides the value of the appropriate flag for that
 benchmark.
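 
 For example, a sketch that repeats a benchmark and reports only the aggregate
 statistics (`BM_noisy` is a hypothetical benchmark function, not from the
 original):
 
 ```c++
 // Run 10 repetitions but report only mean/median/stddev aggregates;
 // BM_noisy is hypothetical.
 BENCHMARK(BM_noisy)->Repetitions(10)->ReportAggregatesOnly(true);
 ```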
 
-## User-defined statistics for repeated benchmarks
+<a name="custom-statistics" />
+
+### Custom Statistics
+
 While having mean, median and standard deviation is nice, this may not be
-enough for everyone. For example you may want to know what is the largest
-observation, e.g. because you have some real-time constraints. This is easy.
+enough for everyone. For example, you may want to know what the largest
+observation is, e.g. because you have some real-time constraints. This is easy.
 The following code will specify a custom statistic to be calculated, defined
 by a lambda function.
 
@@ -603,194 +1219,39 @@ BENCHMARK(BM_spin_empty)
   ->Arg(512);
 ```
 
-## Fixtures
-Fixture tests are created by
-first defining a type that derives from `::benchmark::Fixture` and then
-creating/registering the tests using the following macros:
+<a name="using-register-benchmark" />
 
-* `BENCHMARK_F(ClassName, Method)`
-* `BENCHMARK_DEFINE_F(ClassName, Method)`
-* `BENCHMARK_REGISTER_F(ClassName, Method)`
+### Using RegisterBenchmark(name, fn, args...)
 
-For Example:
+The `RegisterBenchmark(name, func, args...)` function provides an alternative
+way to create and register benchmarks.
+`RegisterBenchmark(name, func, args...)` creates, registers, and returns a
+pointer to a new benchmark with the specified `name` that invokes
+`func(st, args...)` where `st` is a `benchmark::State` object.
 
-```c++
-class MyFixture : public benchmark::Fixture {};
+Unlike the `BENCHMARK` registration macros, which can only be used at the global
+scope, `RegisterBenchmark` can be called anywhere. This allows
+benchmark tests to be registered programmatically.
 
-BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
-   for (auto _ : st) {
-     ...
-  }
-}
+Additionally, `RegisterBenchmark` allows any callable object, including
+capturing lambdas and function objects, to be registered as a benchmark.
 
-BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) {
-   for (auto _ : st) {
-     ...
-  }
+For example:
+```c++
+auto BM_test = [](benchmark::State& st, auto Inputs) { /* ... */ };
+
+int main(int argc, char** argv) {
+  for (auto& test_input : { /* ... */ })
+      benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input);
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
 }
-/* BarTest is NOT registered */
-BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2);
-/* BarTest is now registered */
 ```
 
-### Templated fixtures
-Also you can create templated fixture by using the following macros:
+<a name="exiting-with-an-error" />
 
-* `BENCHMARK_TEMPLATE_F(ClassName, Method, ...)`
-* `BENCHMARK_TEMPLATE_DEFINE_F(ClassName, Method, ...)`
-
-For example:
-```c++
-template<typename T>
-class MyFixture : public benchmark::Fixture {};
-
-BENCHMARK_TEMPLATE_F(MyFixture, IntTest, int)(benchmark::State& st) {
-   for (auto _ : st) {
-     ...
-  }
-}
-
-BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, DoubleTest, double)(benchmark::State& st) {
-   for (auto _ : st) {
-     ...
-  }
-}
-
-BENCHMARK_REGISTER_F(MyFixture, DoubleTest)->Threads(2);
-```
-
-## User-defined counters
-
-You can add your own counters with user-defined names. The example below
-will add columns "Foo", "Bar" and "Baz" in its output:
-
-```c++
-static void UserCountersExample1(benchmark::State& state) {
-  double numFoos = 0, numBars = 0, numBazs = 0;
-  for (auto _ : state) {
-    // ... count Foo,Bar,Baz events
-  }
-  state.counters["Foo"] = numFoos;
-  state.counters["Bar"] = numBars;
-  state.counters["Baz"] = numBazs;
-}
-```
-
-The `state.counters` object is a `std::map` with `std::string` keys
-and `Counter` values. The latter is a `double`-like class, via an implicit
-conversion to `double&`. Thus you can use all of the standard arithmetic
-assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter.
-
-In multithreaded benchmarks, each counter is set on the calling thread only.
-When the benchmark finishes, the counters from each thread will be summed;
-the resulting sum is the value which will be shown for the benchmark.
-
-The `Counter` constructor accepts three parameters: the value as a `double`
-; a bit flag which allows you to show counters as rates, and/or as per-thread
-iteration, and/or as per-thread averages, and/or iteration invariants;
-and a flag specifying the 'unit' - i.e. is 1k a 1000 (default,
-`benchmark::Counter::OneK::kIs1000`), or 1024
-(`benchmark::Counter::OneK::kIs1024`)?
-
-```c++
-  // sets a simple counter
-  state.counters["Foo"] = numFoos;
-
-  // Set the counter as a rate. It will be presented divided
-  // by the duration of the benchmark.
-  state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate);
-
-  // Set the counter as a thread-average quantity. It will
-  // be presented divided by the number of threads.
-  state.counters["FooAvg"] = Counter(numFoos, benchmark::Counter::kAvgThreads);
-
-  // There's also a combined flag:
-  state.counters["FooAvgRate"] = Counter(numFoos,benchmark::Counter::kAvgThreadsRate);
-
-  // This says that we process with the rate of state.range(0) bytes every iteration:
-  state.counters["BytesProcessed"] = Counter(state.range(0), benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024);
-```
-
-When you're compiling in C++11 mode or later you can use `insert()` with
-`std::initializer_list`:
-
-```c++
-  // With C++11, this can be done:
-  state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
-  // ... instead of:
-  state.counters["Foo"] = numFoos;
-  state.counters["Bar"] = numBars;
-  state.counters["Baz"] = numBazs;
-```
-
-### Counter reporting
-
-When using the console reporter, by default, user counters are are printed at
-the end after the table, the same way as ``bytes_processed`` and
-``items_processed``. This is best for cases in which there are few counters,
-or where there are only a couple of lines per benchmark. Here's an example of
-the default output:
-
-```
-------------------------------------------------------------------------------
-Benchmark                        Time           CPU Iterations UserCounters...
-------------------------------------------------------------------------------
-BM_UserCounter/threads:8      2248 ns      10277 ns      68808 Bar=16 Bat=40 Baz=24 Foo=8
-BM_UserCounter/threads:1      9797 ns       9788 ns      71523 Bar=2 Bat=5 Baz=3 Foo=1024m
-BM_UserCounter/threads:2      4924 ns       9842 ns      71036 Bar=4 Bat=10 Baz=6 Foo=2
-BM_UserCounter/threads:4      2589 ns      10284 ns      68012 Bar=8 Bat=20 Baz=12 Foo=4
-BM_UserCounter/threads:8      2212 ns      10287 ns      68040 Bar=16 Bat=40 Baz=24 Foo=8
-BM_UserCounter/threads:16     1782 ns      10278 ns      68144 Bar=32 Bat=80 Baz=48 Foo=16
-BM_UserCounter/threads:32     1291 ns      10296 ns      68256 Bar=64 Bat=160 Baz=96 Foo=32
-BM_UserCounter/threads:4      2615 ns      10307 ns      68040 Bar=8 Bat=20 Baz=12 Foo=4
-BM_Factorial                    26 ns         26 ns   26608979 40320
-BM_Factorial/real_time          26 ns         26 ns   26587936 40320
-BM_CalculatePiRange/1           16 ns         16 ns   45704255 0
-BM_CalculatePiRange/8           73 ns         73 ns    9520927 3.28374
-BM_CalculatePiRange/64         609 ns        609 ns    1140647 3.15746
-BM_CalculatePiRange/512       4900 ns       4901 ns     142696 3.14355
-```
-
-If this doesn't suit you, you can print each counter as a table column by
-passing the flag `--benchmark_counters_tabular=true` to the benchmark
-application. This is best for cases in which there are a lot of counters, or
-a lot of lines per individual benchmark. Note that this will trigger a
-reprinting of the table header any time the counter set changes between
-individual benchmarks. Here's an example of corresponding output when
-`--benchmark_counters_tabular=true` is passed:
-
-```
----------------------------------------------------------------------------------------
-Benchmark                        Time           CPU Iterations    Bar   Bat   Baz   Foo
----------------------------------------------------------------------------------------
-BM_UserCounter/threads:8      2198 ns       9953 ns      70688     16    40    24     8
-BM_UserCounter/threads:1      9504 ns       9504 ns      73787      2     5     3     1
-BM_UserCounter/threads:2      4775 ns       9550 ns      72606      4    10     6     2
-BM_UserCounter/threads:4      2508 ns       9951 ns      70332      8    20    12     4
-BM_UserCounter/threads:8      2055 ns       9933 ns      70344     16    40    24     8
-BM_UserCounter/threads:16     1610 ns       9946 ns      70720     32    80    48    16
-BM_UserCounter/threads:32     1192 ns       9948 ns      70496     64   160    96    32
-BM_UserCounter/threads:4      2506 ns       9949 ns      70332      8    20    12     4
---------------------------------------------------------------
-Benchmark                        Time           CPU Iterations
---------------------------------------------------------------
-BM_Factorial                    26 ns         26 ns   26392245 40320
-BM_Factorial/real_time          26 ns         26 ns   26494107 40320
-BM_CalculatePiRange/1           15 ns         15 ns   45571597 0
-BM_CalculatePiRange/8           74 ns         74 ns    9450212 3.28374
-BM_CalculatePiRange/64         595 ns        595 ns    1173901 3.15746
-BM_CalculatePiRange/512       4752 ns       4752 ns     147380 3.14355
-BM_CalculatePiRange/4k       37970 ns      37972 ns      18453 3.14184
-BM_CalculatePiRange/32k     303733 ns     303744 ns       2305 3.14162
-BM_CalculatePiRange/256k   2434095 ns    2434186 ns        288 3.1416
-BM_CalculatePiRange/1024k  9721140 ns    9721413 ns         71 3.14159
-BM_CalculatePi/threads:8      2255 ns       9943 ns      70936
-```
-Note above the additional header printed when the benchmark changes from
-``BM_UserCounter`` to ``BM_Factorial``. This is because ``BM_Factorial`` does
-not have the same counter set as ``BM_UserCounter``.
-
-## Exiting Benchmarks in Error
+### Exiting with an Error
 
 When errors caused by external influences, such as file I/O and network
 communication, occur within a benchmark the
@@ -801,7 +1262,9 @@ Users must explicitly exit the loop, otherwise all iterations will be performed.
 Users may explicitly return to exit the benchmark immediately.
 
 The `SkipWithError(...)` function may be used at any point within the benchmark,
-including before and after the benchmark loop.
+including before and after the benchmark loop. Moreover, once `SkipWithError(...)`
+has been used, the benchmark loop need not be reached, and the benchmark
+function may return early.
 
 For example:
 
@@ -809,188 +1272,105 @@ For example:
 static void BM_test(benchmark::State& state) {
   auto resource = GetResource();
   if (!resource.good()) {
-      state.SkipWithError("Resource is not good!");
-      // KeepRunning() loop will not be entered.
+    state.SkipWithError("Resource is not good!");
+    // KeepRunning() loop will not be entered.
   }
-  for (state.KeepRunning()) {
-      auto data = resource.read_data();
-      if (!resource.good()) {
-        state.SkipWithError("Failed to read data!");
-        break; // Needed to skip the rest of the iteration.
-     }
-     do_stuff(data);
+  while (state.KeepRunning()) {
+    auto data = resource.read_data();
+    if (!resource.good()) {
+      state.SkipWithError("Failed to read data!");
+      break; // Needed to skip the rest of the iteration.
+    }
+    do_stuff(data);
   }
 }
 
 static void BM_test_ranged_fo(benchmark::State & state) {
-  state.SkipWithError("test will not be entered");
+  auto resource = GetResource();
+  if (!resource.good()) {
+    state.SkipWithError("Resource is not good!");
+    return; // Early return is allowed when SkipWithError() has been used.
+  }
   for (auto _ : state) {
-    state.SkipWithError("Failed!");
-    break; // REQUIRED to prevent all further iterations.
+    auto data = resource.read_data();
+    if (!resource.good()) {
+      state.SkipWithError("Failed to read data!");
+      break; // REQUIRED to prevent all further iterations.
+    }
+    do_stuff(data);
   }
 }
 ```
+<a name="a-faster-keep-running-loop" />
 
-## Running a subset of the benchmarks
-
-The `--benchmark_filter=<regex>` option can be used to only run the benchmarks
-which match the specified `<regex>`. For example:
+### A Faster KeepRunning Loop
 
-```bash
-$ ./run_benchmarks.x --benchmark_filter=BM_memcpy/32
-Run on (1 X 2300 MHz CPU )
-2016-06-25 19:34:24
-Benchmark              Time           CPU Iterations
-----------------------------------------------------
-BM_memcpy/32          11 ns         11 ns   79545455
-BM_memcpy/32k       2181 ns       2185 ns     324074
-BM_memcpy/32          12 ns         12 ns   54687500
-BM_memcpy/32k       1834 ns       1837 ns     357143
-```
-
-## Runtime and reporting considerations
-When the benchmark binary is executed, each benchmark function is run serially.
-The number of iterations to run is determined dynamically by running the
-benchmark a few times and measuring the time taken and ensuring that the
-ultimate result will be statistically stable. As such, faster benchmark
-functions will be run for more iterations than slower benchmark functions, and
-the number of iterations is thus reported.
-
-In all cases, the number of iterations for which the benchmark is run is
-governed by the amount of time the benchmark takes. Concretely, the number of
-iterations is at least one, not more than 1e9, until CPU time is greater than
-the minimum time, or the wallclock time is 5x minimum time. The minimum time is
-set per benchmark by calling `MinTime` on the registered benchmark object.
-
-Average timings are then reported over the iterations run. If multiple
-repetitions are requested using the `--benchmark_repetitions` command-line
-option, or at registration time, the benchmark function will be run several
-times and statistical results across these repetitions will also be reported.
-
-As well as the per-benchmark entries, a preamble in the report will include
-information about the machine on which the benchmarks are run.
-
-### Output Formats
-The library supports multiple output formats. Use the
-`--benchmark_format=<console|json|csv>` flag to set the format type. `console`
-is the default format.
-
-The Console format is intended to be a human readable format. By default
-the format generates color output. Context is output on stderr and the
-tabular data on stdout. Example tabular output looks like:
-```
-Benchmark                               Time(ns)    CPU(ns) Iterations
-----------------------------------------------------------------------
-BM_SetInsert/1024/1                        28928      29349      23853  133.097kB/s   33.2742k items/s
-BM_SetInsert/1024/8                        32065      32913      21375  949.487kB/s   237.372k items/s
-BM_SetInsert/1024/10                       33157      33648      21431  1.13369MB/s   290.225k items/s
-```
+In C++11 mode, a range-based for loop should be used in preference to
+the `KeepRunning` loop for running the benchmarks. For example:
 
-The JSON format outputs human readable json split into two top level attributes.
-The `context` attribute contains information about the run in general, including
-information about the CPU and the date.
-The `benchmarks` attribute contains a list of every benchmark run. Example json
-output looks like:
-```json
-{
-  "context": {
-    "date": "2015/03/17-18:40:25",
-    "num_cpus": 40,
-    "mhz_per_cpu": 2801,
-    "cpu_scaling_enabled": false,
-    "build_type": "debug"
-  },
-  "benchmarks": [
-    {
-      "name": "BM_SetInsert/1024/1",
-      "iterations": 94877,
-      "real_time": 29275,
-      "cpu_time": 29836,
-      "bytes_per_second": 134066,
-      "items_per_second": 33516
-    },
-    {
-      "name": "BM_SetInsert/1024/8",
-      "iterations": 21609,
-      "real_time": 32317,
-      "cpu_time": 32429,
-      "bytes_per_second": 986770,
-      "items_per_second": 246693
-    },
-    {
-      "name": "BM_SetInsert/1024/10",
-      "iterations": 21393,
-      "real_time": 32724,
-      "cpu_time": 33355,
-      "bytes_per_second": 1199226,
-      "items_per_second": 299807
-    }
-  ]
+```c++
+static void BM_Fast(benchmark::State &state) {
+  for (auto _ : state) {
+    FastOperation();
+  }
 }
+BENCHMARK(BM_Fast);
 ```
 
-The CSV format outputs comma-separated values. The `context` is output on stderr
-and the CSV itself on stdout. Example CSV output looks like:
-```
-name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label
-"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942,
-"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115,
-"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06,
-```
-
-### Output Files
-The library supports writing the output of the benchmark to a file specified
-by `--benchmark_out=<filename>`. The format of the output can be specified
-using `--benchmark_out_format={json|console|csv}`. Specifying
-`--benchmark_out` does not suppress the console output.
-
-## Result comparison
-
-It is possible to compare the benchmarking results. See [Additional Tooling Documentation](docs/tools.md)
+The ranged-for loop is faster than `KeepRunning` because `KeepRunning`
+requires a memory load and store of the iteration count every iteration,
+whereas the ranged-for variant is able to keep the iteration count
+in a register.
 
-## Debug vs Release
-By default, benchmark builds as a debug library. You will see a warning in the
-output when this is the case. To build it as a release library instead, use:
+For example, an empty inner loop using the range-based for method looks like:
 
-```
-cmake -DCMAKE_BUILD_TYPE=Release
+```asm
+# Loop Init
+  mov rbx, qword ptr [r14 + 104]
+  call benchmark::State::StartKeepRunning()
+  test rbx, rbx
+  je .LoopEnd
+.LoopHeader: # =>This Inner Loop Header: Depth=1
+  add rbx, -1
+  jne .LoopHeader
+.LoopEnd:
 ```
 
-To enable link-time optimisation, use
+Compared to an empty `KeepRunning` loop, which looks like:
 
+```asm
+.LoopHeader: # in Loop: Header=BB0_3 Depth=1
+  cmp byte ptr [rbx], 1
+  jne .LoopInit
+.LoopBody: # =>This Inner Loop Header: Depth=1
+  mov rax, qword ptr [rbx + 8]
+  lea rcx, [rax + 1]
+  mov qword ptr [rbx + 8], rcx
+  cmp rax, qword ptr [rbx + 104]
+  jb .LoopHeader
+  jmp .LoopEnd
+.LoopInit:
+  mov rdi, rbx
+  call benchmark::State::StartKeepRunning()
+  jmp .LoopBody
+.LoopEnd:
 ```
-cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_LTO=true
-```
-
-If you are using gcc, you might need to set `GCC_AR` and `GCC_RANLIB` cmake
-cache variables, if autodetection fails.
-
-If you are using clang, you may need to set `LLVMAR_EXECUTABLE`,
-`LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables.
-
-## Compiler Support
 
-Google Benchmark uses C++11 when building the library. As such we require
-a modern C++ toolchain, both compiler and standard library.
-
-The following minimum versions are strongly recommended build the library:
-
-* GCC 4.8
-* Clang 3.4
-* Visual Studio 2013
-* Intel 2015 Update 1
+Unless C++03 compatibility is required, the ranged-for variant of writing
+the benchmark loop should be preferred.
 
-Anything older *may* work.
+<a name="disabling-cpu-frequency-scaling" />
 
-Note: Using the library and its headers in C++03 is supported. C++11 is only
-required to build the library.
+### Disabling CPU Frequency Scaling
 
-## Disable CPU frequency scaling
 If you see this error:
+
 ```
 ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
 ```
+
 you might want to disable the CPU frequency scaling while running the benchmark:
+
 ```bash
 sudo cpupower frequency-set --governor performance
 ./mybench

diff  --git a/libcxx/utils/google-benchmark/WORKSPACE b/libcxx/utils/google-benchmark/WORKSPACE
index 54734f1ea55e7..631f3ba05de53 100644
--- a/libcxx/utils/google-benchmark/WORKSPACE
+++ b/libcxx/utils/google-benchmark/WORKSPACE
@@ -1,7 +1,51 @@
 workspace(name = "com_github_google_benchmark")
 
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+http_archive(
+    name = "rules_cc",
+    strip_prefix = "rules_cc-a508235df92e71d537fcbae0c7c952ea6957a912",
+    urls = ["https://github.com/bazelbuild/rules_cc/archive/a508235df92e71d537fcbae0c7c952ea6957a912.zip"],
+    sha256 = "d7dc12c1d5bc1a87474de8e3d17b7731a4dcebcfb8aa3990fe8ac7734ef12f2f",
+)
+
+http_archive(
+    name = "com_google_absl",
+    sha256 = "f41868f7a938605c92936230081175d1eae87f6ea2c248f41077c8f88316f111",
+    strip_prefix = "abseil-cpp-20200225.2",
+    urls = ["https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz"],
+)
+
+http_archive(
+    name = "com_google_googletest",
+    strip_prefix = "googletest-3f0cf6b62ad1eb50d8736538363d3580dd640c3e",
+    urls = ["https://github.com/google/googletest/archive/3f0cf6b62ad1eb50d8736538363d3580dd640c3e.zip"],
+    sha256 = "8f827dd550db8b4fdf73904690df0be9fccc161017c9038a724bc9a0617a1bc8",
+)
+
 http_archive(
-     name = "com_google_googletest",
-     urls = ["https://github.com/google/googletest/archive/3f0cf6b62ad1eb50d8736538363d3580dd640c3e.zip"],
-     strip_prefix = "googletest-3f0cf6b62ad1eb50d8736538363d3580dd640c3e",
+    name = "pybind11",
+    build_file = "@//bindings/python:pybind11.BUILD",
+    sha256 = "1eed57bc6863190e35637290f97a20c81cfe4d9090ac0a24f3bbf08f265eb71d",
+    strip_prefix = "pybind11-2.4.3",
+    urls = ["https://github.com/pybind/pybind11/archive/v2.4.3.tar.gz"],
+)
+
+new_local_repository(
+    name = "python_headers",
+    build_file = "@//bindings/python:python_headers.BUILD",
+    path = "/usr/include/python3.6",  # May be overwritten by setup.py.
+)
+
+http_archive(
+    name = "rules_python",
+    url = "https://github.com/bazelbuild/rules_python/releases/download/0.1.0/rules_python-0.1.0.tar.gz",
+    sha256 = "b6d46438523a3ec0f3cead544190ee13223a52f6a6765a29eae7b7cc24cc83a0",
+)
+
+load("@rules_python//python:pip.bzl", pip3_install="pip_install")
+
+pip3_install(
+   name = "py_deps",
+   requirements = "//:requirements.txt",
 )

diff  --git a/libcxx/utils/google-benchmark/_config.yml b/libcxx/utils/google-benchmark/_config.yml
new file mode 100644
index 0000000000000..1fa5ff852bda8
--- /dev/null
+++ b/libcxx/utils/google-benchmark/_config.yml
@@ -0,0 +1,2 @@
+theme: jekyll-theme-midnight
+markdown: GFM

diff  --git a/libcxx/utils/google-benchmark/appveyor.yml b/libcxx/utils/google-benchmark/appveyor.yml
index cf240190bea64..81da955f02815 100644
--- a/libcxx/utils/google-benchmark/appveyor.yml
+++ b/libcxx/utils/google-benchmark/appveyor.yml
@@ -41,7 +41,7 @@ build_script:
   - cmake --build . --config %configuration%
 
 test_script:
-  - ctest -c %configuration% --timeout 300 --output-on-failure
+  - ctest --build-config %configuration% --timeout 300 --output-on-failure
 
 artifacts:
   - path: '_build/CMakeFiles/*.log'

diff  --git a/libcxx/utils/google-benchmark/bindings/python/BUILD b/libcxx/utils/google-benchmark/bindings/python/BUILD
new file mode 100644
index 0000000000000..9559a76b30a95
--- /dev/null
+++ b/libcxx/utils/google-benchmark/bindings/python/BUILD
@@ -0,0 +1,3 @@
+exports_files(glob(["*.BUILD"]))
+exports_files(["build_defs.bzl"])
+

diff  --git a/libcxx/utils/google-benchmark/bindings/python/build_defs.bzl b/libcxx/utils/google-benchmark/bindings/python/build_defs.bzl
new file mode 100644
index 0000000000000..45907aaa5e2d8
--- /dev/null
+++ b/libcxx/utils/google-benchmark/bindings/python/build_defs.bzl
@@ -0,0 +1,25 @@
+_SHARED_LIB_SUFFIX = {
+    "//conditions:default": ".so",
+    "//:windows": ".dll",
+}
+
+def py_extension(name, srcs, hdrs = [], copts = [], features = [], deps = []):
+    for shared_lib_suffix in _SHARED_LIB_SUFFIX.values():
+        shared_lib_name = name + shared_lib_suffix
+        native.cc_binary(
+            name = shared_lib_name,
+            linkshared = 1,
+            linkstatic = 1,
+            srcs = srcs + hdrs,
+            copts = copts,
+            features = features,
+            deps = deps,
+        )
+
+    return native.py_library(
+        name = name,
+        data = select({
+            platform: [name + shared_lib_suffix]
+            for platform, shared_lib_suffix in _SHARED_LIB_SUFFIX.items()
+        }),
+    )

diff  --git a/libcxx/utils/google-benchmark/bindings/python/google_benchmark/BUILD b/libcxx/utils/google-benchmark/bindings/python/google_benchmark/BUILD
new file mode 100644
index 0000000000000..3c1561f48eeed
--- /dev/null
+++ b/libcxx/utils/google-benchmark/bindings/python/google_benchmark/BUILD
@@ -0,0 +1,38 @@
+load("//bindings/python:build_defs.bzl", "py_extension")
+
+py_library(
+    name = "google_benchmark",
+    srcs = ["__init__.py"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":_benchmark",
+        # pip; absl:app
+    ],
+)
+
+py_extension(
+    name = "_benchmark",
+    srcs = ["benchmark.cc"],
+    copts = [
+        "-fexceptions",
+        "-fno-strict-aliasing",
+    ],
+    features = ["-use_header_modules"],
+    deps = [
+        "//:benchmark",
+        "@pybind11",
+        "@python_headers",
+    ],
+)
+
+py_test(
+    name = "example",
+    srcs = ["example.py"],
+    python_version = "PY3",
+    srcs_version = "PY3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":google_benchmark",
+    ],
+)
+

diff  --git a/libcxx/utils/google-benchmark/bindings/python/google_benchmark/__init__.py b/libcxx/utils/google-benchmark/bindings/python/google_benchmark/__init__.py
new file mode 100644
index 0000000000000..1055bf2418569
--- /dev/null
+++ b/libcxx/utils/google-benchmark/bindings/python/google_benchmark/__init__.py
@@ -0,0 +1,158 @@
+# Copyright 2020 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Python benchmarking utilities.
+
+Example usage:
+  import google_benchmark as benchmark
+
+  @benchmark.register
+  def my_benchmark(state):
+      ...  # Code executed outside `while` loop is not timed.
+
+      while state:
+        ...  # Code executed within `while` loop is timed.
+
+  if __name__ == '__main__':
+    benchmark.main()
+"""
+
+from absl import app
+from google_benchmark import _benchmark
+from google_benchmark._benchmark import (
+    Counter,
+    kNanosecond,
+    kMicrosecond,
+    kMillisecond,
+    kSecond,
+    oNone,
+    o1,
+    oN,
+    oNSquared,
+    oNCubed,
+    oLogN,
+    oNLogN,
+    oAuto,
+    oLambda,
+)
+
+
+__all__ = [
+    "register",
+    "main",
+    "Counter",
+    "kNanosecond",
+    "kMicrosecond",
+    "kMillisecond",
+    "kSecond",
+    "oNone",
+    "o1",
+    "oN",
+    "oNSquared",
+    "oNCubed",
+    "oLogN",
+    "oNLogN",
+    "oAuto",
+    "oLambda",
+]
+
+__version__ = "0.2.0"
+
+
+class __OptionMaker:
+    """A stateless class to collect benchmark options.
+
+    Collect all decorator calls like @option.range(start=0, limit=1<<5).
+    """
+
+    class Options:
+        """Pure data class to store options calls, along with the benchmarked function."""
+
+        def __init__(self, func):
+            self.func = func
+            self.builder_calls = []
+
+    @classmethod
+    def make(cls, func_or_options):
+        """Make Options from Options or the benchmarked function."""
+        if isinstance(func_or_options, cls.Options):
+            return func_or_options
+        return cls.Options(func_or_options)
+
+    def __getattr__(self, builder_name):
+        """Append option call in the Options."""
+
+        # The function that gets returned on @option.range(start=0, limit=1<<5).
+        def __builder_method(*args, **kwargs):
+
+            # The decorator that gets called, either with the benchmarked
+            # function or the previous Options.
+            def __decorator(func_or_options):
+                options = self.make(func_or_options)
+                options.builder_calls.append((builder_name, args, kwargs))
+                # The decorator returns Options so it is not technically a
+                # decorator and needs a final call to @register.
+                return options
+
+            return __decorator
+
+        return __builder_method
+
+
+# Alias for nicer API.
+# We have to instantiate an object, even if stateless, to be able to use __getattr__
+# on option.range
+option = __OptionMaker()
+
+
+def register(undefined=None, *, name=None):
+    """Register function for benchmarking."""
+    if undefined is None:
+        # Decorator is called without parentheses, so we return a decorator.
+        return lambda f: register(f, name=name)
+
+    # We have either the function to benchmark (simple case) or an instance of Options
+    # (@option._ case).
+    options = __OptionMaker.make(undefined)
+
+    if name is None:
+        name = options.func.__name__
+
+    # We register the benchmark and reproduce all the @option._ calls onto the
+    # benchmark builder pattern
+    benchmark = _benchmark.RegisterBenchmark(name, options.func)
+    for name, args, kwargs in options.builder_calls[::-1]:
+        getattr(benchmark, name)(*args, **kwargs)
+
+    # return the benchmarked function because the decorator does not modify it
+    return options.func
+
+
+def _flags_parser(argv):
+    argv = _benchmark.Initialize(argv)
+    return app.parse_flags_with_usage(argv)
+
+
+def _run_benchmarks(argv):
+    if len(argv) > 1:
+        raise app.UsageError("Too many command-line arguments.")
+    return _benchmark.RunSpecifiedBenchmarks()
+
+
+def main(argv=None):
+    return app.run(_run_benchmarks, argv=argv, flags_parser=_flags_parser)
+
+
+# Methods for use with custom main function.
+initialize = _benchmark.Initialize
+run_benchmarks = _benchmark.RunSpecifiedBenchmarks

diff  --git a/libcxx/utils/google-benchmark/bindings/python/google_benchmark/benchmark.cc b/libcxx/utils/google-benchmark/bindings/python/google_benchmark/benchmark.cc
new file mode 100644
index 0000000000000..1b01fe7f7f0f7
--- /dev/null
+++ b/libcxx/utils/google-benchmark/bindings/python/google_benchmark/benchmark.cc
@@ -0,0 +1,181 @@
+// Benchmark for Python.
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "pybind11/operators.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+#include "pybind11/stl_bind.h"
+
+#include "benchmark/benchmark.h"
+
+PYBIND11_MAKE_OPAQUE(benchmark::UserCounters);
+
+namespace {
+namespace py = ::pybind11;
+
+std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
+  // The `argv` pointers here become invalid when this function returns, but
+  // benchmark holds the pointer to `argv[0]`. We create a static copy of it
+  // so it persists, and replace the pointer below.
+  static std::string executable_name(argv[0]);
+  std::vector<char*> ptrs;
+  ptrs.reserve(argv.size());
+  for (auto& arg : argv) {
+    ptrs.push_back(const_cast<char*>(arg.c_str()));
+  }
+  ptrs[0] = const_cast<char*>(executable_name.c_str());
+  int argc = static_cast<int>(argv.size());
+  benchmark::Initialize(&argc, ptrs.data());
+  std::vector<std::string> remaining_argv;
+  remaining_argv.reserve(argc);
+  for (int i = 0; i < argc; ++i) {
+    remaining_argv.emplace_back(ptrs[i]);
+  }
+  return remaining_argv;
+}
+
+benchmark::internal::Benchmark* RegisterBenchmark(const char* name,
+                                                  py::function f) {
+  return benchmark::RegisterBenchmark(
+      name, [f](benchmark::State& state) { f(&state); });
+}
+
+PYBIND11_MODULE(_benchmark, m) {
+  using benchmark::TimeUnit;
+  py::enum_<TimeUnit>(m, "TimeUnit")
+      .value("kNanosecond", TimeUnit::kNanosecond)
+      .value("kMicrosecond", TimeUnit::kMicrosecond)
+      .value("kMillisecond", TimeUnit::kMillisecond)
+      .value("kSecond", TimeUnit::kSecond)
+      .export_values();
+
+  using benchmark::BigO;
+  py::enum_<BigO>(m, "BigO")
+      .value("oNone", BigO::oNone)
+      .value("o1", BigO::o1)
+      .value("oN", BigO::oN)
+      .value("oNSquared", BigO::oNSquared)
+      .value("oNCubed", BigO::oNCubed)
+      .value("oLogN", BigO::oLogN)
+      .value("oNLogN", BigO::oLogN)
+      .value("oAuto", BigO::oAuto)
+      .value("oLambda", BigO::oLambda)
+      .export_values();
+
+  using benchmark::internal::Benchmark;
+  py::class_<Benchmark>(m, "Benchmark")
+      // For methods returning a pointer to the current object, reference
+      // return policy is used to ask pybind not to take ownership of the
+      // returned object and avoid calling delete on it.
+      // https://pybind11.readthedocs.io/en/stable/advanced/functions.html#return-value-policies
+      //
+      // For methods taking a const std::vector<...>&, a copy is created
+      // because it is bound to a Python list.
+      // https://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html
+      .def("unit", &Benchmark::Unit, py::return_value_policy::reference)
+      .def("arg", &Benchmark::Arg, py::return_value_policy::reference)
+      .def("args", &Benchmark::Args, py::return_value_policy::reference)
+      .def("range", &Benchmark::Range, py::return_value_policy::reference,
+           py::arg("start"), py::arg("limit"))
+      .def("dense_range", &Benchmark::DenseRange,
+           py::return_value_policy::reference, py::arg("start"),
+           py::arg("limit"), py::arg("step") = 1)
+      .def("ranges", &Benchmark::Ranges, py::return_value_policy::reference)
+      .def("args_product", &Benchmark::ArgsProduct,
+           py::return_value_policy::reference)
+      .def("arg_name", &Benchmark::ArgName, py::return_value_policy::reference)
+      .def("arg_names", &Benchmark::ArgNames,
+           py::return_value_policy::reference)
+      .def("range_pair", &Benchmark::RangePair,
+           py::return_value_policy::reference, py::arg("lo1"), py::arg("hi1"),
+           py::arg("lo2"), py::arg("hi2"))
+      .def("range_multiplier", &Benchmark::RangeMultiplier,
+           py::return_value_policy::reference)
+      .def("min_time", &Benchmark::MinTime, py::return_value_policy::reference)
+      .def("iterations", &Benchmark::Iterations,
+           py::return_value_policy::reference)
+      .def("repetitions", &Benchmark::Repetitions,
+           py::return_value_policy::reference)
+      .def("report_aggregates_only", &Benchmark::ReportAggregatesOnly,
+           py::return_value_policy::reference, py::arg("value") = true)
+      .def("display_aggregates_only", &Benchmark::DisplayAggregatesOnly,
+           py::return_value_policy::reference, py::arg("value") = true)
+      .def("measure_process_cpu_time", &Benchmark::MeasureProcessCPUTime,
+           py::return_value_policy::reference)
+      .def("use_real_time", &Benchmark::UseRealTime,
+           py::return_value_policy::reference)
+      .def("use_manual_time", &Benchmark::UseManualTime,
+           py::return_value_policy::reference)
+      .def(
+          "complexity",
+          (Benchmark * (Benchmark::*)(benchmark::BigO)) & Benchmark::Complexity,
+          py::return_value_policy::reference,
+          py::arg("complexity") = benchmark::oAuto);
+
+  using benchmark::Counter;
+  py::class_<Counter> py_counter(m, "Counter");
+
+  py::enum_<Counter::Flags>(py_counter, "Flags")
+      .value("kDefaults", Counter::Flags::kDefaults)
+      .value("kIsRate", Counter::Flags::kIsRate)
+      .value("kAvgThreads", Counter::Flags::kAvgThreads)
+      .value("kAvgThreadsRate", Counter::Flags::kAvgThreadsRate)
+      .value("kIsIterationInvariant", Counter::Flags::kIsIterationInvariant)
+      .value("kIsIterationInvariantRate",
+             Counter::Flags::kIsIterationInvariantRate)
+      .value("kAvgIterations", Counter::Flags::kAvgIterations)
+      .value("kAvgIterationsRate", Counter::Flags::kAvgIterationsRate)
+      .value("kInvert", Counter::Flags::kInvert)
+      .export_values()
+      .def(py::self | py::self);
+
+  py::enum_<Counter::OneK>(py_counter, "OneK")
+      .value("kIs1000", Counter::OneK::kIs1000)
+      .value("kIs1024", Counter::OneK::kIs1024)
+      .export_values();
+
+  py_counter
+      .def(py::init<double, Counter::Flags, Counter::OneK>(),
+           py::arg("value") = 0., py::arg("flags") = Counter::kDefaults,
+           py::arg("k") = Counter::kIs1000)
+      .def(py::init([](double value) { return Counter(value); }))
+      .def_readwrite("value", &Counter::value)
+      .def_readwrite("flags", &Counter::flags)
+      .def_readwrite("oneK", &Counter::oneK);
+  py::implicitly_convertible<py::float_, Counter>();
+  py::implicitly_convertible<py::int_, Counter>();
+
+  py::bind_map<benchmark::UserCounters>(m, "UserCounters");
+
+  using benchmark::State;
+  py::class_<State>(m, "State")
+      .def("__bool__", &State::KeepRunning)
+      .def_property_readonly("keep_running", &State::KeepRunning)
+      .def("pause_timing", &State::PauseTiming)
+      .def("resume_timing", &State::ResumeTiming)
+      .def("skip_with_error", &State::SkipWithError)
+      .def_property_readonly("error_occurred", &State::error_occurred)
+      .def("set_iteration_time", &State::SetIterationTime)
+      .def_property("bytes_processed", &State::bytes_processed,
+                    &State::SetBytesProcessed)
+      .def_property("complexity_n", &State::complexity_length_n,
+                    &State::SetComplexityN)
+      .def_property("items_processed", &State::items_processed,
+                    &State::SetItemsProcessed)
+      .def("set_label", (void (State::*)(const char*)) & State::SetLabel)
+      .def("range", &State::range, py::arg("pos") = 0)
+      .def_property_readonly("iterations", &State::iterations)
+      .def_readwrite("counters", &State::counters)
+      .def_readonly("thread_index", &State::thread_index)
+      .def_readonly("threads", &State::threads);
+
+  m.def("Initialize", Initialize);
+  m.def("RegisterBenchmark", RegisterBenchmark,
+        py::return_value_policy::reference);
+  m.def("RunSpecifiedBenchmarks",
+        []() { benchmark::RunSpecifiedBenchmarks(); });
+};
+}  // namespace

diff  --git a/libcxx/utils/google-benchmark/bindings/python/google_benchmark/example.py b/libcxx/utils/google-benchmark/bindings/python/google_benchmark/example.py
new file mode 100644
index 0000000000000..9134e8cffeafb
--- /dev/null
+++ b/libcxx/utils/google-benchmark/bindings/python/google_benchmark/example.py
@@ -0,0 +1,136 @@
+# Copyright 2020 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Example of Python using C++ benchmark framework.
+
+To run this example, you must first install the `google_benchmark` Python package.
+
+To install using `setup.py`, download and extract the `google_benchmark` source.
+In the extracted directory, execute:
+  python setup.py install
+"""
+
+import random
+import time
+
+import google_benchmark as benchmark
+from google_benchmark import Counter
+
+
+@benchmark.register
+def empty(state):
+    while state:
+        pass
+
+
+@benchmark.register
+def sum_million(state):
+    while state:
+        sum(range(1_000_000))
+
+@benchmark.register
+def pause_timing(state):
+    """Pause timing every iteration."""
+    while state:
+        # Construct a list of random ints every iteration without timing it
+        state.pause_timing()
+        random_list = [random.randint(0, 100) for _ in range(100)]
+        state.resume_timing()
+        # Time the in-place sorting algorithm.
+        random_list.sort()
+
+
+@benchmark.register
+def skipped(state):
+    if True:  # Test some predicate here.
+        state.skip_with_error("some error")
+        return  # NOTE: You must explicitly return, or benchmark will continue.
+
+    ...  # Benchmark code would be here.
+
+
+@benchmark.register
+def manual_timing(state):
+    while state:
+        # Manually count Python CPU time
+        start = time.perf_counter()  # perf_counter_ns() in Python 3.7+
+        # Something to benchmark
+        time.sleep(0.01)
+        end = time.perf_counter()
+        state.set_iteration_time(end - start)
+
+
+@benchmark.register
+def custom_counters(state):
+    """Collect cutom metric using benchmark.Counter."""
+    num_foo = 0.0
+    while state:
+        # Benchmark some code here
+        pass
+        # Collect some custom metric named foo
+        num_foo += 0.13
+
+    # Automatic Counter from numbers.
+    state.counters["foo"] = num_foo
+    # Set a counter as a rate.
+    state.counters["foo_rate"] = Counter(num_foo, Counter.kIsRate)
+    # Set a counter as the inverse of a rate.
+    state.counters["foo_inv_rate"] = Counter(num_foo, Counter.kIsRate | Counter.kInvert)
+    # Set a counter as a thread-average quantity.
+    state.counters["foo_avg"] = Counter(num_foo, Counter.kAvgThreads)
+    # There's also a combined flag:
+    state.counters["foo_avg_rate"] = Counter(num_foo, Counter.kAvgThreadsRate)
+
+
+@benchmark.register
+@benchmark.option.measure_process_cpu_time()
+@benchmark.option.use_real_time()
+def with_options(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register(name="sum_million_microseconds")
+@benchmark.option.unit(benchmark.kMicrosecond)
+def with_options(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register
+@benchmark.option.arg(100)
+@benchmark.option.arg(1000)
+def passing_argument(state):
+    while state:
+        sum(range(state.range(0)))
+
+
+@benchmark.register
+@benchmark.option.range(8, limit=8 << 10)
+def using_range(state):
+    while state:
+        sum(range(state.range(0)))
+
+
+@benchmark.register
+@benchmark.option.range_multiplier(2)
+@benchmark.option.range(1 << 10, 1 << 18)
+@benchmark.option.complexity(benchmark.oN)
+def computing_complexity(state):
+    while state:
+        sum(range(state.range(0)))
+    state.complexity_n = state.range(0)
+
+
+if __name__ == "__main__":
+    benchmark.main()

diff  --git a/libcxx/utils/google-benchmark/bindings/python/pybind11.BUILD b/libcxx/utils/google-benchmark/bindings/python/pybind11.BUILD
new file mode 100644
index 0000000000000..bc833500383a2
--- /dev/null
+++ b/libcxx/utils/google-benchmark/bindings/python/pybind11.BUILD
@@ -0,0 +1,20 @@
+cc_library(
+    name = "pybind11",
+    hdrs = glob(
+        include = [
+            "include/pybind11/*.h",
+            "include/pybind11/detail/*.h",
+        ],
+        exclude = [
+            "include/pybind11/common.h",
+            "include/pybind11/eigen.h",
+        ],
+    ),
+    copts = [
+        "-fexceptions",
+        "-Wno-undefined-inline",
+        "-Wno-pragma-once-outside-header",
+    ],
+    includes = ["include"],
+    visibility = ["//visibility:public"],
+)

diff  --git a/libcxx/utils/google-benchmark/bindings/python/python_headers.BUILD b/libcxx/utils/google-benchmark/bindings/python/python_headers.BUILD
new file mode 100644
index 0000000000000..9c34cf6ca4bd3
--- /dev/null
+++ b/libcxx/utils/google-benchmark/bindings/python/python_headers.BUILD
@@ -0,0 +1,6 @@
+cc_library(
+    name = "python_headers",
+    hdrs = glob(["**/*.h"]),
+    includes = ["."],
+    visibility = ["//visibility:public"],
+)

diff  --git a/libcxx/utils/google-benchmark/bindings/python/requirements.txt b/libcxx/utils/google-benchmark/bindings/python/requirements.txt
new file mode 100644
index 0000000000000..f5bbe7eca5cea
--- /dev/null
+++ b/libcxx/utils/google-benchmark/bindings/python/requirements.txt
@@ -0,0 +1,2 @@
+absl-py>=0.7.1
+

diff  --git a/libcxx/utils/google-benchmark/cmake/AddCXXCompilerFlag.cmake b/libcxx/utils/google-benchmark/cmake/AddCXXCompilerFlag.cmake
index d0d2099814402..858589e9775c6 100644
--- a/libcxx/utils/google-benchmark/cmake/AddCXXCompilerFlag.cmake
+++ b/libcxx/utils/google-benchmark/cmake/AddCXXCompilerFlag.cmake
@@ -34,9 +34,11 @@ function(add_cxx_compiler_flag FLAG)
   check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
   set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
   if(${MANGLED_FLAG})
-    set(VARIANT ${ARGV1})
-    if(ARGV1)
+    if(ARGC GREATER 1)
+      set(VARIANT ${ARGV1})
       string(TOUPPER "_${VARIANT}" VARIANT)
+    else()
+      set(VARIANT "")
     endif()
     set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${BENCHMARK_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
   endif()
@@ -49,9 +51,11 @@ function(add_required_cxx_compiler_flag FLAG)
   check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
   set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
   if(${MANGLED_FLAG})
-    set(VARIANT ${ARGV1})
-    if(ARGV1)
+    if(ARGC GREATER 1)
+      set(VARIANT ${ARGV1})
       string(TOUPPER "_${VARIANT}" VARIANT)
+    else()
+      set(VARIANT "")
     endif()
     set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)

diff  --git a/libcxx/utils/google-benchmark/cmake/CXXFeatureCheck.cmake b/libcxx/utils/google-benchmark/cmake/CXXFeatureCheck.cmake
index 99b56dd623904..62e6741fe3de0 100644
--- a/libcxx/utils/google-benchmark/cmake/CXXFeatureCheck.cmake
+++ b/libcxx/utils/google-benchmark/cmake/CXXFeatureCheck.cmake
@@ -27,6 +27,11 @@ function(cxx_feature_check FILE)
     return()
   endif()
 
+  if (ARGC GREATER 1)
+    message(STATUS "Enabling additional flags: ${ARGV1}")
+    list(APPEND BENCHMARK_CXX_LINKER_FLAGS ${ARGV1})
+  endif()
+
   if (NOT DEFINED COMPILE_${FEATURE})
     message(STATUS "Performing Test ${FEATURE}")
     if(CMAKE_CROSSCOMPILING)
@@ -37,9 +42,9 @@ function(cxx_feature_check FILE)
       if(COMPILE_${FEATURE})
         message(WARNING
               "If you see build failures due to cross compilation, try setting HAVE_${VAR} to 0")
-        set(RUN_${FEATURE} 0)
+        set(RUN_${FEATURE} 0 CACHE INTERNAL "")
       else()
-        set(RUN_${FEATURE} 1)
+        set(RUN_${FEATURE} 1 CACHE INTERNAL "")
       endif()
     else()
       message(STATUS "Performing Test ${FEATURE}")

diff  --git a/libcxx/utils/google-benchmark/cmake/GetGitVersion.cmake b/libcxx/utils/google-benchmark/cmake/GetGitVersion.cmake
index 4f10f226d7a78..04a1f9b70d683 100644
--- a/libcxx/utils/google-benchmark/cmake/GetGitVersion.cmake
+++ b/libcxx/utils/google-benchmark/cmake/GetGitVersion.cmake
@@ -20,16 +20,20 @@ set(__get_git_version INCLUDED)
 
 function(get_git_version var)
   if(GIT_EXECUTABLE)
-      execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
+      execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
           WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
           RESULT_VARIABLE status
-          OUTPUT_VARIABLE GIT_VERSION
+          OUTPUT_VARIABLE GIT_DESCRIBE_VERSION
           ERROR_QUIET)
-      if(${status})
-          set(GIT_VERSION "v0.0.0")
+      if(status)
+          set(GIT_DESCRIBE_VERSION "v0.0.0")
+      endif()
+      
+      string(STRIP ${GIT_DESCRIBE_VERSION} GIT_DESCRIBE_VERSION)
+      if(GIT_DESCRIBE_VERSION MATCHES v[^-]*-) 
+         string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2"  GIT_VERSION ${GIT_DESCRIBE_VERSION})
       else()
-          string(STRIP ${GIT_VERSION} GIT_VERSION)
-          string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION ${GIT_VERSION})
+         string(REGEX REPLACE "v(.*)" "\\1" GIT_VERSION ${GIT_DESCRIBE_VERSION})
       endif()
 
       # Work out if the repository is dirty
@@ -43,12 +47,12 @@ function(get_git_version var)
           ERROR_QUIET)
       string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
       if (${GIT_DIRTY})
-          set(GIT_VERSION "${GIT_VERSION}-dirty")
+          set(GIT_DESCRIBE_VERSION "${GIT_DESCRIBE_VERSION}-dirty")
       endif()
+      message(STATUS "git version: ${GIT_DESCRIBE_VERSION} normalized to ${GIT_VERSION}")
   else()
-      set(GIT_VERSION "v0.0.0")
+      set(GIT_VERSION "0.0.0")
   endif()
 
-  message(STATUS "git Version: ${GIT_VERSION}")
   set(${var} ${GIT_VERSION} PARENT_SCOPE)
 endfunction()

diff  --git a/libcxx/utils/google-benchmark/cmake/GoogleTest.cmake b/libcxx/utils/google-benchmark/cmake/GoogleTest.cmake
new file mode 100644
index 0000000000000..dd611fc875f19
--- /dev/null
+++ b/libcxx/utils/google-benchmark/cmake/GoogleTest.cmake
@@ -0,0 +1,41 @@
+# Download and unpack googletest at configure time
+set(GOOGLETEST_PREFIX "${benchmark_BINARY_DIR}/third_party/googletest")
+configure_file(${benchmark_SOURCE_DIR}/cmake/GoogleTest.cmake.in ${GOOGLETEST_PREFIX}/CMakeLists.txt @ONLY)
+
+set(GOOGLETEST_PATH "${CMAKE_CURRENT_SOURCE_DIR}/googletest" CACHE PATH "") # Mind the quotes
+execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}"
+  -DALLOW_DOWNLOADING_GOOGLETEST=${BENCHMARK_DOWNLOAD_DEPENDENCIES} -DGOOGLETEST_PATH:PATH=${GOOGLETEST_PATH} .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${GOOGLETEST_PREFIX}
+)
+
+if(result)
+  message(FATAL_ERROR "CMake step for googletest failed: ${result}")
+endif()
+
+execute_process(
+  COMMAND ${CMAKE_COMMAND} --build .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${GOOGLETEST_PREFIX}
+)
+
+if(result)
+  message(FATAL_ERROR "Build step for googletest failed: ${result}")
+endif()
+
+# Prevent overriding the parent project's compiler/linker
+# settings on Windows
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+
+include(${GOOGLETEST_PREFIX}/googletest-paths.cmake)
+
+# Add googletest directly to our build. This defines
+# the gtest and gtest_main targets.
+add_subdirectory(${GOOGLETEST_SOURCE_DIR}
+                 ${GOOGLETEST_BINARY_DIR}
+                 EXCLUDE_FROM_ALL)
+
+set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES>)
+set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES>)
+set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES>)
+set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES>)

diff  --git a/libcxx/utils/google-benchmark/cmake/GoogleTest.cmake.in b/libcxx/utils/google-benchmark/cmake/GoogleTest.cmake.in
new file mode 100644
index 0000000000000..fd957ff564095
--- /dev/null
+++ b/libcxx/utils/google-benchmark/cmake/GoogleTest.cmake.in
@@ -0,0 +1,58 @@
+cmake_minimum_required(VERSION 2.8.12)
+
+project(googletest-download NONE)
+
+# Enable ExternalProject CMake module
+include(ExternalProject)
+
+option(ALLOW_DOWNLOADING_GOOGLETEST "If the googletest source tree is not found in the location specified by GOOGLETEST_PATH, fetch the archive from the internet" OFF)
+set(GOOGLETEST_PATH "/usr/src/googletest" CACHE PATH
+                    "Path to the googletest root tree. Should contain googletest and googlemock subdirs, a CMakeLists.txt in the root, and one in each of these subdirs.")
+
+# Download and install GoogleTest
+
+message(STATUS "Looking for Google Test sources")
+message(STATUS "Looking for Google Test sources in ${GOOGLETEST_PATH}")
+if(EXISTS "${GOOGLETEST_PATH}"            AND IS_DIRECTORY "${GOOGLETEST_PATH}"            AND EXISTS "${GOOGLETEST_PATH}/CMakeLists.txt" AND
+   EXISTS "${GOOGLETEST_PATH}/googletest" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googletest" AND EXISTS "${GOOGLETEST_PATH}/googletest/CMakeLists.txt" AND
+   EXISTS "${GOOGLETEST_PATH}/googlemock" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googlemock" AND EXISTS "${GOOGLETEST_PATH}/googlemock/CMakeLists.txt")
+  message(STATUS "Found Google Test in ${GOOGLETEST_PATH}")
+
+  ExternalProject_Add(
+    googletest
+    PREFIX            "${CMAKE_BINARY_DIR}"
+    DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
+    SOURCE_DIR        "${GOOGLETEST_PATH}" # use existing src dir.
+    BINARY_DIR        "${CMAKE_BINARY_DIR}/build"
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    INSTALL_COMMAND   ""
+    TEST_COMMAND      ""
+  )
+else()
+  if(NOT ALLOW_DOWNLOADING_GOOGLETEST)
+    message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
+  else()
+    message(WARNING "Did not find Google Test sources! Fetching from web...")
+    ExternalProject_Add(
+      googletest
+      GIT_REPOSITORY    https://github.com/google/googletest.git
+      GIT_TAG           master
+      PREFIX            "${CMAKE_BINARY_DIR}"
+      STAMP_DIR         "${CMAKE_BINARY_DIR}/stamp"
+      DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
+      SOURCE_DIR        "${CMAKE_BINARY_DIR}/src"
+      BINARY_DIR        "${CMAKE_BINARY_DIR}/build"
+      CONFIGURE_COMMAND ""
+      BUILD_COMMAND     ""
+      INSTALL_COMMAND   ""
+      TEST_COMMAND      ""
+    )
+  endif()
+endif()
+
+ExternalProject_Get_Property(googletest SOURCE_DIR BINARY_DIR)
+file(WRITE googletest-paths.cmake
+"set(GOOGLETEST_SOURCE_DIR \"${SOURCE_DIR}\")
+set(GOOGLETEST_BINARY_DIR \"${BINARY_DIR}\")
+")

diff  --git a/libcxx/utils/google-benchmark/cmake/HandleGTest.cmake b/libcxx/utils/google-benchmark/cmake/HandleGTest.cmake
deleted file mode 100644
index b9c14436dbfa4..0000000000000
--- a/libcxx/utils/google-benchmark/cmake/HandleGTest.cmake
+++ /dev/null
@@ -1,113 +0,0 @@
-
-include(split_list)
-
-macro(build_external_gtest)
-  include(ExternalProject)
-  set(GTEST_FLAGS "")
-  if (BENCHMARK_USE_LIBCXX)
-    if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-      list(APPEND GTEST_FLAGS -stdlib=libc++)
-    else()
-      message(WARNING "Unsupported compiler (${CMAKE_CXX_COMPILER}) when using libc++")
-    endif()
-  endif()
-  if (BENCHMARK_BUILD_32_BITS)
-    list(APPEND GTEST_FLAGS -m32)
-  endif()
-  if (NOT "${CMAKE_CXX_FLAGS}" STREQUAL "")
-    list(APPEND GTEST_FLAGS ${CMAKE_CXX_FLAGS})
-  endif()
-  string(TOUPPER "${CMAKE_BUILD_TYPE}" GTEST_BUILD_TYPE)
-  if ("${GTEST_BUILD_TYPE}" STREQUAL "COVERAGE")
-    set(GTEST_BUILD_TYPE "DEBUG")
-  endif()
-  # FIXME: Since 10/Feb/2017 the googletest trunk has had a bug where
-  # -Werror=unused-function fires during the build on OS X. This is a temporary
-  # workaround to keep our travis bots from failing. It should be removed
-  # once gtest is fixed.
-  if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-    list(APPEND GTEST_FLAGS "-Wno-unused-function")
-  endif()
-  split_list(GTEST_FLAGS)
-  set(EXCLUDE_FROM_ALL_OPT "")
-  set(EXCLUDE_FROM_ALL_VALUE "")
-  if (${CMAKE_VERSION} VERSION_GREATER "3.0.99")
-      set(EXCLUDE_FROM_ALL_OPT "EXCLUDE_FROM_ALL")
-      set(EXCLUDE_FROM_ALL_VALUE "ON")
-  endif()
-  ExternalProject_Add(googletest
-      ${EXCLUDE_FROM_ALL_OPT} ${EXCLUDE_FROM_ALL_VALUE}
-      GIT_REPOSITORY https://github.com/google/googletest.git
-      GIT_TAG master
-      PREFIX "${CMAKE_BINARY_DIR}/googletest"
-      INSTALL_DIR "${CMAKE_BINARY_DIR}/googletest"
-      CMAKE_CACHE_ARGS
-        -DCMAKE_BUILD_TYPE:STRING=${GTEST_BUILD_TYPE}
-        -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-        -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
-        -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
-        -DCMAKE_INSTALL_LIBDIR:PATH=<INSTALL_DIR>/lib
-        -DCMAKE_CXX_FLAGS:STRING=${GTEST_FLAGS}
-        -Dgtest_force_shared_crt:BOOL=ON
-      )
-
-  ExternalProject_Get_Property(googletest install_dir)
-  set(GTEST_INCLUDE_DIRS ${install_dir}/include)
-  file(MAKE_DIRECTORY ${GTEST_INCLUDE_DIRS})
-
-  set(LIB_SUFFIX "${CMAKE_STATIC_LIBRARY_SUFFIX}")
-  set(LIB_PREFIX "${CMAKE_STATIC_LIBRARY_PREFIX}")
-  if("${GTEST_BUILD_TYPE}" STREQUAL "DEBUG")
-    set(LIB_SUFFIX "d${CMAKE_STATIC_LIBRARY_SUFFIX}")
-  endif()
-
-  # Use gmock_main instead of gtest_main because it initializes gtest as well.
-  # Note: The libraries are listed in reverse order of their dependancies.
-  foreach(LIB gtest gmock gmock_main)
-    add_library(${LIB} UNKNOWN IMPORTED)
-    set_target_properties(${LIB} PROPERTIES
-      IMPORTED_LOCATION ${install_dir}/lib/${LIB_PREFIX}${LIB}${LIB_SUFFIX}
-      INTERFACE_INCLUDE_DIRECTORIES ${GTEST_INCLUDE_DIRS}
-      INTERFACE_LINK_LIBRARIES "${GTEST_BOTH_LIBRARIES}"
-    )
-    add_dependencies(${LIB} googletest)
-    list(APPEND GTEST_BOTH_LIBRARIES ${LIB})
-  endforeach()
-endmacro(build_external_gtest)
-
-if (BENCHMARK_ENABLE_GTEST_TESTS)
-  if (IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/googletest)
-    set(GTEST_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/googletest")
-    set(INSTALL_GTEST OFF CACHE INTERNAL "")
-    set(INSTALL_GMOCK OFF CACHE INTERNAL "")
-    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/googletest)
-    set(GTEST_BOTH_LIBRARIES gtest gmock gmock_main)
-    foreach(HEADER test mock)
-      # CMake 2.8 and older don't respect INTERFACE_INCLUDE_DIRECTORIES, so we
-      # have to add the paths ourselves.
-      set(HFILE g${HEADER}/g${HEADER}.h)
-      set(HPATH ${GTEST_ROOT}/google${HEADER}/include)
-      find_path(HEADER_PATH_${HEADER} ${HFILE}
-          NO_DEFAULT_PATHS
-          HINTS ${HPATH}
-      )
-      if (NOT HEADER_PATH_${HEADER})
-        message(FATAL_ERROR "Failed to find header ${HFILE} in ${HPATH}")
-      endif()
-      list(APPEND GTEST_INCLUDE_DIRS ${HEADER_PATH_${HEADER}})
-    endforeach()
-  elseif(BENCHMARK_DOWNLOAD_DEPENDENCIES)
-    build_external_gtest()
-  else()
-    find_package(GTest REQUIRED)
-    find_path(GMOCK_INCLUDE_DIRS gmock/gmock.h
-        HINTS ${GTEST_INCLUDE_DIRS})
-    if (NOT GMOCK_INCLUDE_DIRS)
-      message(FATAL_ERROR "Failed to find header gmock/gmock.h with hint ${GTEST_INCLUDE_DIRS}")
-    endif()
-    set(GTEST_INCLUDE_DIRS ${GTEST_INCLUDE_DIRS} ${GMOCK_INCLUDE_DIRS})
-    # FIXME: We don't currently require the gmock library to build the tests,
-    # and it's likely we won't find it, so we don't try. As long as we've
-    # found the gmock/gmock.h header and gtest_main that should be good enough.
-  endif()
-endif()

diff  --git a/libcxx/utils/google-benchmark/cmake/benchmark.pc.in b/libcxx/utils/google-benchmark/cmake/benchmark.pc.in
index 1e84bff68d811..34beb012eef1a 100644
--- a/libcxx/utils/google-benchmark/cmake/benchmark.pc.in
+++ b/libcxx/utils/google-benchmark/cmake/benchmark.pc.in
@@ -1,11 +1,12 @@
 prefix=@CMAKE_INSTALL_PREFIX@
 exec_prefix=${prefix}
-libdir=${prefix}/lib
-includedir=${prefix}/include
+libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
+includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
 
 Name: @PROJECT_NAME@
 Description: Google microbenchmark framework
 Version: @VERSION@
 
 Libs: -L${libdir} -lbenchmark
+Libs.private: -lpthread
 Cflags: -I${includedir}

diff  --git a/libcxx/utils/google-benchmark/cmake/gnu_posix_regex.cpp b/libcxx/utils/google-benchmark/cmake/gnu_posix_regex.cpp
index 105189f02ee6f..b5b91cdab7c2a 100644
--- a/libcxx/utils/google-benchmark/cmake/gnu_posix_regex.cpp
+++ b/libcxx/utils/google-benchmark/cmake/gnu_posix_regex.cpp
@@ -9,3 +9,4 @@ int main() {
   }
   return regexec(&re, str.c_str(), 0, nullptr, 0) ? -1 : 0;
 }
+

diff  --git a/libcxx/utils/google-benchmark/cmake/posix_regex.cpp b/libcxx/utils/google-benchmark/cmake/posix_regex.cpp
index 02f6dfc278a7c..466dc62560a27 100644
--- a/libcxx/utils/google-benchmark/cmake/posix_regex.cpp
+++ b/libcxx/utils/google-benchmark/cmake/posix_regex.cpp
@@ -11,3 +11,4 @@ int main() {
   regfree(&re);
   return ret;
 }
+

diff  --git a/libcxx/utils/google-benchmark/cmake/std_regex.cpp b/libcxx/utils/google-benchmark/cmake/std_regex.cpp
index 8177c482e838b..696f2a26bce02 100644
--- a/libcxx/utils/google-benchmark/cmake/std_regex.cpp
+++ b/libcxx/utils/google-benchmark/cmake/std_regex.cpp
@@ -7,3 +7,4 @@ int main() {
        std::regex_constants::extended | std::regex_constants::nosubs);
   return std::regex_search(str, re) ? 0 : -1;
 }
+

diff  --git a/libcxx/utils/google-benchmark/dependencies.md b/libcxx/utils/google-benchmark/dependencies.md
new file mode 100644
index 0000000000000..6289b4e3548bb
--- /dev/null
+++ b/libcxx/utils/google-benchmark/dependencies.md
@@ -0,0 +1,18 @@
+# Build tool dependency policy
+
+To ensure the broadest compatibility when building the benchmark library, but
+still allow forward progress, we require any build tooling to be available for:
+
+* Debian stable AND
+* The last two Ubuntu LTS releases
+
+Currently, this means using build tool versions that are available for Ubuntu
+16.04 (Xenial), Ubuntu 18.04 (Bionic), and Debian stretch.
+
+_Note, [travis](.travis.yml) runs under Ubuntu 14.04 (Trusty) for Linux builds._
+
+## cmake
+The current supported version is cmake 3.5.1 as of 2018-06-06.
+
+_Note, this version is also available for Ubuntu 14.04, the previous Ubuntu LTS
+release, as `cmake3`._

diff  --git a/libcxx/utils/google-benchmark/docs/AssemblyTests.md b/libcxx/utils/google-benchmark/docs/AssemblyTests.md
index 0d06f50ac652d..1fbdc269b53d6 100644
--- a/libcxx/utils/google-benchmark/docs/AssemblyTests.md
+++ b/libcxx/utils/google-benchmark/docs/AssemblyTests.md
@@ -144,3 +144,4 @@ tests to other architectures and compilers (using `CHECK` prefixes).
 
 Furthermore, the tests fail for builds which specify additional flags
 that modify code generation, including `--coverage` or `-fsanitize=`.
+

diff  --git a/libcxx/utils/google-benchmark/docs/_config.yml b/libcxx/utils/google-benchmark/docs/_config.yml
new file mode 100644
index 0000000000000..fc24e7a62dc28
--- /dev/null
+++ b/libcxx/utils/google-benchmark/docs/_config.yml
@@ -0,0 +1 @@
+theme: jekyll-theme-hacker
\ No newline at end of file

diff  --git a/libcxx/utils/google-benchmark/docs/perf_counters.md b/libcxx/utils/google-benchmark/docs/perf_counters.md
new file mode 100644
index 0000000000000..74560e9669712
--- /dev/null
+++ b/libcxx/utils/google-benchmark/docs/perf_counters.md
@@ -0,0 +1,34 @@
+<a name="perf-counters" />
+
+# User-Requested Performance Counters
+
+When running benchmarks, the user may choose to request collection of
+performance counters. This may be useful in investigation scenarios - narrowing
+down the cause of a regression, or verifying that the underlying cause of a
+performance improvement matches expectations.
+
+This feature is available if:
+
+* The benchmark is run on an architecture featuring a Performance Monitoring
+  Unit (PMU),
+* The benchmark is compiled with support for collecting counters. Currently,
+  this requires [libpfm](http://perfmon2.sourceforge.net/) to be available at
+  build time.
+
+The feature does not require modifying benchmark code. Counter collection is
+handled at the boundaries where timer collection is also handled. 
+
+To opt in:
+
+*  Install `libpfm4-dev`, e.g. `apt-get install libpfm4-dev`.
+*  Enable the cmake flag BENCHMARK_ENABLE_LIBPFM.
+
+To use, pass a comma-separated list of counter names through the
+`--benchmark_perf_counters` flag. The names are decoded through libpfm, meaning
+they are platform specific, but some generic names (e.g. `CYCLES` or
+`INSTRUCTIONS`) are mapped by libpfm to their platform-specific equivalents -
+see the libpfm [documentation](http://perfmon2.sourceforge.net/docs.html) for
+more details.
+
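+For example, to collect the two generic counters mentioned above (the binary
+name is hypothetical):
+`./mybench --benchmark_perf_counters=CYCLES,INSTRUCTIONS`.
+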
+The counter values are reported back through the [User Counters](../README.md#custom-counters)
+mechanism, meaning they are available in all the formats (e.g. JSON) supported
+by User Counters.
\ No newline at end of file

diff  --git a/libcxx/utils/google-benchmark/docs/random_interleaving.md b/libcxx/utils/google-benchmark/docs/random_interleaving.md
new file mode 100644
index 0000000000000..c083036841480
--- /dev/null
+++ b/libcxx/utils/google-benchmark/docs/random_interleaving.md
@@ -0,0 +1,13 @@
+<a name="interleaving" />
+
+# Random Interleaving
+
+[Random Interleaving](https://github.com/google/benchmark/issues/1051) is a
+technique to lower run-to-run variance. It randomly interleaves repetitions of a
+microbenchmark with repetitions from other microbenchmarks in the same benchmark
+test. Data shows it is able to lower run-to-run variance by
+[40%](https://github.com/google/benchmark/issues/1051) on average.
+
+To use it, you mainly need to set `--benchmark_enable_random_interleaving=true`;
+optionally, specify a non-zero repetition count (`--benchmark_repetitions=9`)
+and decrease the per-repetition time (`--benchmark_min_time=0.1`).
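+
+Putting the flags together (the benchmark binary name is hypothetical):
+`./mybench --benchmark_enable_random_interleaving=true --benchmark_repetitions=9 --benchmark_min_time=0.1`.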

diff  --git a/libcxx/utils/google-benchmark/releasing.md b/libcxx/utils/google-benchmark/docs/releasing.md
similarity index 64%
rename from libcxx/utils/google-benchmark/releasing.md
rename to libcxx/utils/google-benchmark/docs/releasing.md
index f0cd7010e3a90..7a6dfc4017b28 100644
--- a/libcxx/utils/google-benchmark/releasing.md
+++ b/libcxx/utils/google-benchmark/docs/releasing.md
@@ -1,6 +1,6 @@
 # How to release
 
-* Make sure you're on master and synced to HEAD
+* Make sure you're on main and synced to HEAD
 * Ensure the project builds and tests run (sanity check only, obviously)
     * `parallel -j0 exec ::: test/*_test` can help ensure everything at least
       passes
@@ -8,6 +8,12 @@
     * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
       commits between the last annotated tag and HEAD
     * Pick the most interesting.
+* Create one last commit that updates the version saved in `CMakeLists.txt` to the release version you're creating. (This version will be used if benchmark is installed from the archive you'll be creating in the next step.)
+
+```
+project (benchmark VERSION 1.5.3 LANGUAGES CXX)
+```
+
 * Create a release through github's interface
     * Note this will create a lightweight tag.
     * Update this to an annotated tag:

diff  --git a/libcxx/utils/google-benchmark/docs/tools.md b/libcxx/utils/google-benchmark/docs/tools.md
index 4a3b2e9bd2c9c..f2d0c497f3fc7 100644
--- a/libcxx/utils/google-benchmark/docs/tools.md
+++ b/libcxx/utils/google-benchmark/docs/tools.md
@@ -4,7 +4,11 @@
 
 The `compare.py` script can be used to compare the results of benchmarks.
 
-**NOTE**: the utility relies on the scipy package which can be installed using [these instructions](https://www.scipy.org/install.html).
+### Dependencies
+The utility relies on the [scipy](https://www.scipy.org) package which can be installed using pip:
+```bash
+pip3 install -r requirements.txt
+```
 
 ### Displaying aggregates only
 

diff  --git a/libcxx/utils/google-benchmark/include/benchmark/benchmark.h b/libcxx/utils/google-benchmark/include/benchmark/benchmark.h
index a0fd7c6e1cade..9b5480244d6fb 100644
--- a/libcxx/utils/google-benchmark/include/benchmark/benchmark.h
+++ b/libcxx/utils/google-benchmark/include/benchmark/benchmark.h
@@ -42,6 +42,7 @@ BENCHMARK(BM_StringCopy);
 int main(int argc, char** argv) {
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
   return 0;
 }
 
@@ -56,8 +57,7 @@ static void BM_memcpy(benchmark::State& state) {
   memset(src, 'x', state.range(0));
   for (auto _ : state)
     memcpy(dst, src, state.range(0));
-  state.SetBytesProcessed(int64_t(state.iterations()) *
-                          int64_t(state.range(0)));
+  state.SetBytesProcessed(state.iterations() * state.range(0));
   delete[] src; delete[] dst;
 }
 BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
@@ -122,8 +122,7 @@ template <class Q> int BM_Sequential(benchmark::State& state) {
       q.Wait(&v);
   }
   // actually messages, not bytes:
-  state.SetBytesProcessed(
-      static_cast<int64_t>(state.iterations())*state.range(0));
+  state.SetBytesProcessed(state.iterations() * state.range(0));
 }
 BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
 
@@ -169,6 +168,12 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 #define BENCHMARK_HAS_CXX11
 #endif
 
+// This _MSC_VER check should detect VS 2017 v15.3 and newer.
+#if __cplusplus >= 201703L || \
+    (defined(_MSC_VER) && _MSC_VER >= 1911 && _MSVC_LANG >= 201703L)
+#define BENCHMARK_HAS_CXX17
+#endif
+
 #include <stdint.h>
 
 #include <algorithm>
@@ -178,6 +183,7 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 #include <map>
 #include <set>
 #include <string>
+#include <utility>
 #include <vector>
 
 #if defined(BENCHMARK_HAS_CXX11)
@@ -200,13 +206,19 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
   TypeName& operator=(const TypeName&) = delete
 #endif
 
-#if defined(__GNUC__)
+#ifdef BENCHMARK_HAS_CXX17
+#define BENCHMARK_UNUSED [[maybe_unused]]
+#elif defined(__GNUC__) || defined(__clang__)
 #define BENCHMARK_UNUSED __attribute__((unused))
+#else
+#define BENCHMARK_UNUSED
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
 #define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
 #define BENCHMARK_NOEXCEPT noexcept
 #define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
 #elif defined(_MSC_VER) && !defined(__clang__)
-#define BENCHMARK_UNUSED
 #define BENCHMARK_ALWAYS_INLINE __forceinline
 #if _MSC_VER >= 1900
 #define BENCHMARK_NOEXCEPT noexcept
@@ -217,7 +229,6 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 #endif
 #define __func__ __FUNCTION__
 #else
-#define BENCHMARK_UNUSED
 #define BENCHMARK_ALWAYS_INLINE
 #define BENCHMARK_NOEXCEPT
 #define BENCHMARK_NOEXCEPT_OP(x)
@@ -246,11 +257,17 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 #endif
 
 #if defined(__GNUC__) || __has_builtin(__builtin_unreachable)
-  #define BENCHMARK_UNREACHABLE() __builtin_unreachable()
+#define BENCHMARK_UNREACHABLE() __builtin_unreachable()
 #elif defined(_MSC_VER)
-  #define BENCHMARK_UNREACHABLE() __assume(false)
+#define BENCHMARK_UNREACHABLE() __assume(false)
+#else
+#define BENCHMARK_UNREACHABLE() ((void)0)
+#endif
+
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_OVERRIDE override
 #else
-  #define BENCHMARK_UNREACHABLE() ((void)0)
+#define BENCHMARK_OVERRIDE
 #endif
 
 namespace benchmark {
@@ -258,6 +275,7 @@ class BenchmarkReporter;
 class MemoryManager;
 
 void Initialize(int* argc, char** argv);
+void Shutdown();
 
 // Report to stdout all arguments in 'argv' as unrecognized except the first.
 // Returns true if there is at least one unrecognized argument (i.e. 'argc' > 1).
@@ -284,6 +302,9 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
 // allocation measurements for benchmark runs.
 void RegisterMemoryManager(MemoryManager* memory_manager);
 
+// Add a key-value pair to output as part of the context stanza in the report.
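+// Illustrative usage sketch (key and value are hypothetical):
+//   benchmark::AddCustomContext("machine", "perf-test-rig");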
+void AddCustomContext(const std::string& key, const std::string& value);
+
 namespace internal {
 class Benchmark;
 class BenchmarkImp;
@@ -370,7 +391,10 @@ class Counter {
     // It will be presented divided by the number of iterations.
     kAvgIterations = 1U << 3U,
     // Mark the counter as an iteration-average rate. See above.
-    kAvgIterationsRate = kIsRate | kAvgIterations
+    kAvgIterationsRate = kIsRate | kAvgIterations,
+
+    // In the end, invert the result. This is always done last!
+    kInvert = 1U << 31U
   };
 
   enum OneK {
@@ -405,7 +429,7 @@ typedef std::map<std::string, Counter> UserCounters;
 
 // TimeUnit is passed to a benchmark in order to specify the order of magnitude
 // for the measured time.
-enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond };
+enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond, kSecond };
 
 // BigO is passed to a benchmark in order to specify the asymptotic
 // computational
@@ -413,14 +437,17 @@ enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond };
 // calculated automatically to the best fit.
 enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda };
 
+typedef uint64_t IterationCount;
+
 // BigOFunc is passed to a benchmark in order to specify the asymptotic
 // computational complexity for the benchmark.
-typedef double(BigOFunc)(int64_t);
+typedef double(BigOFunc)(IterationCount);
 
 // StatisticsFunc is passed to a benchmark in order to compute some descriptive
 // statistics over all the measurements of some type
 typedef double(StatisticsFunc)(const std::vector<double>&);
 
+namespace internal {
 struct Statistics {
   std::string name_;
   StatisticsFunc* compute_;
@@ -429,10 +456,10 @@ struct Statistics {
       : name_(name), compute_(compute) {}
 };
 
-namespace internal {
-struct BenchmarkInstance;
+class BenchmarkInstance;
 class ThreadTimer;
 class ThreadManager;
+class PerfCountersMeasurement;
 
 enum AggregationReportMode
 #if defined(BENCHMARK_HAS_CXX11)
@@ -488,7 +515,7 @@ class State {
   //   while (state.KeepRunningBatch(1000)) {
   //     // process 1000 elements
   //   }
-  bool KeepRunningBatch(size_t n);
+  bool KeepRunningBatch(IterationCount n);
 
   // REQUIRES: timer is running and 'SkipWithError(...)' has not been called
   //           by the current thread.
@@ -538,6 +565,9 @@ class State {
   // responsibility to exit the scope as needed.
   void SkipWithError(const char* msg);
 
+  // Returns true if an error has been reported with 'SkipWithError(...)'.
+  bool error_occurred() const { return error_occurred_; }
+
   // REQUIRES: called exactly once per iteration of the benchmarking loop.
   // Set the manually measured time for this benchmark iteration, which
   // is used instead of automatically measured time if UseManualTime() was
@@ -574,7 +604,7 @@ class State {
   void SetComplexityN(int64_t complexity_n) { complexity_n_ = complexity_n; }
 
   BENCHMARK_ALWAYS_INLINE
-  int64_t complexity_length_n() { return complexity_n_; }
+  int64_t complexity_length_n() const { return complexity_n_; }
 
   // If this routine is called with items > 0, then an items/s
   // label is printed on the benchmark report line for the currently
@@ -627,7 +657,7 @@ class State {
   int64_t range_y() const { return range(1); }
 
   BENCHMARK_ALWAYS_INLINE
-  size_t iterations() const {
+  IterationCount iterations() const {
     if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
       return 0;
     }
@@ -638,15 +668,15 @@ class State {
      :  // items we expect on the first cache line (ie 64 bytes of the struct)
   // When total_iterations_ is 0, KeepRunning() and friends will return false.
   // May be larger than max_iterations.
-  size_t total_iterations_;
+  IterationCount total_iterations_;
 
   // When using KeepRunningBatch(), batch_leftover_ holds the number of
   // iterations beyond max_iters that were run. Used to track
   // completed_iterations_ accurately.
-  size_t batch_leftover_;
+  IterationCount batch_leftover_;
 
  public:
-  const size_t max_iterations;
+  const IterationCount max_iterations;
 
  private:
   bool started_;
@@ -667,30 +697,32 @@ class State {
   const int threads;
 
  private:
-  State(size_t max_iters, const std::vector<int64_t>& ranges, int thread_i,
-        int n_threads, internal::ThreadTimer* timer,
-        internal::ThreadManager* manager);
+  State(IterationCount max_iters, const std::vector<int64_t>& ranges,
+        int thread_i, int n_threads, internal::ThreadTimer* timer,
+        internal::ThreadManager* manager,
+        internal::PerfCountersMeasurement* perf_counters_measurement);
 
   void StartKeepRunning();
   // Implementation of KeepRunning() and KeepRunningBatch().
   // is_batch must be true unless n is 1.
-  bool KeepRunningInternal(size_t n, bool is_batch);
+  bool KeepRunningInternal(IterationCount n, bool is_batch);
   void FinishKeepRunning();
-  internal::ThreadTimer* timer_;
-  internal::ThreadManager* manager_;
+  internal::ThreadTimer* const timer_;
+  internal::ThreadManager* const manager_;
+  internal::PerfCountersMeasurement* const perf_counters_measurement_;
 
-  friend struct internal::BenchmarkInstance;
+  friend class internal::BenchmarkInstance;
 };
 
 inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() {
   return KeepRunningInternal(1, /*is_batch=*/false);
 }
 
-inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningBatch(size_t n) {
+inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningBatch(IterationCount n) {
   return KeepRunningInternal(n, /*is_batch=*/true);
 }
 
-inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningInternal(size_t n,
+inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningInternal(IterationCount n,
                                                                bool is_batch) {
   // total_iterations_ is set to 0 by the constructor, and always set to a
   // nonzero value by StartKeepRunning().
@@ -754,7 +786,7 @@ struct State::StateIterator {
   }
 
  private:
-  size_t cached_;
+  IterationCount cached_;
   State* const parent_;
 };
 
@@ -783,6 +815,9 @@ class Benchmark {
   // Note: the following methods all return "this" so that multiple
   // method calls can be chained together in one expression.
 
+  // Specify the name of the benchmark
+  Benchmark* Name(const std::string& name);
+
   // Run this benchmark once with "x" as the extra argument passed
   // to the function.
   // REQUIRES: The function passed to the constructor must accept an arg1.
@@ -821,6 +856,11 @@ class Benchmark {
   // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
   Benchmark* Ranges(const std::vector<std::pair<int64_t, int64_t> >& ranges);
 
+  // Run this benchmark once for each combination of values in the (cartesian)
+  // product of the supplied argument lists.
+  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
+  Benchmark* ArgsProduct(const std::vector<std::vector<int64_t> >& arglists);
+
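A short sketch of registering with the new ArgsProduct() (argument values and benchmark body are illustrative):

    #include <benchmark/benchmark.h>

    static void BM_Access(benchmark::State& state) {
      const int64_t size = state.range(0);
      const int64_t stride = state.range(1);
      for (auto _ : state)
        benchmark::DoNotOptimize(size * stride);  // stand-in for real work
    }
    // Registers 3 * 2 = 6 runs, one per element of {8,16,32} x {1,4}.
    BENCHMARK(BM_Access)->ArgsProduct({{8, 16, 32}, {1, 4}});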
   // Equivalent to ArgNames({name})
   Benchmark* ArgName(const std::string& name);
 
@@ -858,7 +898,7 @@ class Benchmark {
   // NOTE: This function should only be used when *exact* iteration control is
   //   needed and never to control or limit how long a benchmark runs, where
   // `--benchmark_min_time=N` or `MinTime(...)` should be used instead.
-  Benchmark* Iterations(size_t n);
+  Benchmark* Iterations(IterationCount n);
 
   // Specify the amount of times to repeat this benchmark. This option overrides
   // the `benchmark_repetitions` flag.
@@ -874,11 +914,18 @@ class Benchmark {
   // Same as ReportAggregatesOnly(), but applies to display reporter only.
   Benchmark* DisplayAggregatesOnly(bool value = true);
 
-  // If a particular benchmark is I/O bound, runs multiple threads internally or
-  // if for some reason CPU timings are not representative, call this method. If
-  // called, the elapsed time will be used to control how many iterations are
-  // run, and in the printing of items/second or MB/seconds values.  If not
-  // called, the cpu time used by the benchmark will be used.
+  // By default, the CPU time is measured only for the main thread, which may
+  // be unrepresentative if the benchmark uses threads internally. If this
+  // method is called, the total CPU time spent by all the threads will be
+  // measured instead.
+  Benchmark* MeasureProcessCPUTime();
+
+  // If a particular benchmark should use the wall clock instead of the CPU
+  // time (be it the CPU time of the main thread only (the default), or the
+  // total CPU usage of the benchmark), call this method. If called, the
+  // elapsed (wall) time will be used to control how many iterations are run,
+  // and in the printing of items/second or MB/second values.
+  // If not called, the CPU time used by the benchmark will be used.
   Benchmark* UseRealTime();
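For a benchmark that spawns its own threads, the two options above are commonly combined; a sketch (the worker count and body are illustrative):

    #include <benchmark/benchmark.h>
    #include <thread>
    #include <vector>

    static void BM_SpawnWorkers(benchmark::State& state) {
      for (auto _ : state) {
        std::vector<std::thread> workers;
        for (int i = 0; i < 4; ++i)
          workers.emplace_back([] { /* worker body elided */ });
        for (auto& t : workers) t.join();
      }
    }
    // Count CPU time across all threads, and size the run by wall-clock time.
    BENCHMARK(BM_SpawnWorkers)->MeasureProcessCPUTime()->UseRealTime();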
 
   // If a benchmark must measure time manually (e.g. if GPU execution time is
@@ -942,6 +989,7 @@ class Benchmark {
 
  private:
   friend class BenchmarkFamilies;
+  friend class BenchmarkInstance;
 
   std::string name_;
   AggregationReportMode aggregation_report_mode_;
@@ -950,8 +998,9 @@ class Benchmark {
   TimeUnit time_unit_;
   int range_multiplier_;
   double min_time_;
-  size_t iterations_;
+  IterationCount iterations_;
   int repetitions_;
+  bool measure_process_cpu_time_;
   bool use_real_time_;
   bool use_manual_time_;
   BigO complexity_;
@@ -988,7 +1037,7 @@ class FunctionBenchmark : public Benchmark {
   FunctionBenchmark(const char* name, Function* func)
       : Benchmark(name), func_(func) {}
 
-  virtual void Run(State& st);
+  virtual void Run(State& st) BENCHMARK_OVERRIDE;
 
  private:
   Function* func_;
@@ -998,7 +1047,7 @@ class FunctionBenchmark : public Benchmark {
 template <class Lambda>
 class LambdaBenchmark : public Benchmark {
  public:
-  virtual void Run(State& st) { lambda_(st); }
+  virtual void Run(State& st) BENCHMARK_OVERRIDE { lambda_(st); }
 
  private:
   template <class OLambda>
@@ -1050,7 +1099,7 @@ class Fixture : public internal::Benchmark {
  public:
   Fixture() : internal::Benchmark("") {}
 
-  virtual void Run(State& st) {
+  virtual void Run(State& st) BENCHMARK_OVERRIDE {
     this->SetUp(st);
     this->BenchmarkCase(st);
     this->TearDown(st);
@@ -1083,9 +1132,12 @@ class Fixture : public internal::Benchmark {
 
 // Helpers for generating unique variable names
 #define BENCHMARK_PRIVATE_NAME(n) \
-  BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
+  BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
 #define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
 #define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
+// Helper for concatenation with macro name expansion
+#define BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method) \
+    BaseClass##_##Method##_Benchmark
 
 #define BENCHMARK_PRIVATE_DECLARE(n)                                 \
   static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \
@@ -1155,37 +1207,37 @@ class Fixture : public internal::Benchmark {
 #define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
 #endif
 
-#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)        \
-  class BaseClass##_##Method##_Benchmark : public BaseClass { \
-   public:                                                    \
-    BaseClass##_##Method##_Benchmark() : BaseClass() {        \
-      this->SetName(#BaseClass "/" #Method);                  \
-    }                                                         \
-                                                              \
-   protected:                                                 \
-    virtual void BenchmarkCase(::benchmark::State&);          \
+#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)                  \
+  class BaseClass##_##Method##_Benchmark : public BaseClass {           \
+   public:                                                              \
+    BaseClass##_##Method##_Benchmark() : BaseClass() {                  \
+      this->SetName(#BaseClass "/" #Method);                            \
+    }                                                                   \
+                                                                        \
+   protected:                                                           \
+    virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
   };
 
-#define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
-  class BaseClass##_##Method##_Benchmark : public BaseClass<a> {    \
-   public:                                                          \
-    BaseClass##_##Method##_Benchmark() : BaseClass<a>() {           \
-      this->SetName(#BaseClass "<" #a ">/" #Method);                \
-    }                                                               \
-                                                                    \
-   protected:                                                       \
-    virtual void BenchmarkCase(::benchmark::State&);                \
+#define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a)     \
+  class BaseClass##_##Method##_Benchmark : public BaseClass<a> {        \
+   public:                                                              \
+    BaseClass##_##Method##_Benchmark() : BaseClass<a>() {               \
+      this->SetName(#BaseClass "<" #a ">/" #Method);                    \
+    }                                                                   \
+                                                                        \
+   protected:                                                           \
+    virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
   };
 
-#define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
-  class BaseClass##_##Method##_Benchmark : public BaseClass<a, b> {    \
-   public:                                                             \
-    BaseClass##_##Method##_Benchmark() : BaseClass<a, b>() {           \
-      this->SetName(#BaseClass "<" #a "," #b ">/" #Method);            \
-    }                                                                  \
-                                                                       \
-   protected:                                                          \
-    virtual void BenchmarkCase(::benchmark::State&);                   \
+#define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b)  \
+  class BaseClass##_##Method##_Benchmark : public BaseClass<a, b> {     \
+   public:                                                              \
+    BaseClass##_##Method##_Benchmark() : BaseClass<a, b>() {            \
+      this->SetName(#BaseClass "<" #a "," #b ">/" #Method);             \
+    }                                                                   \
+                                                                        \
+   protected:                                                           \
+    virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
   };
 
 #ifdef BENCHMARK_HAS_CXX11
@@ -1197,7 +1249,7 @@ class Fixture : public internal::Benchmark {
     }                                                                      \
                                                                            \
    protected:                                                              \
-    virtual void BenchmarkCase(::benchmark::State&);                       \
+    virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE;    \
   };
 #else
 #define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(n, a) \
@@ -1206,27 +1258,27 @@ class Fixture : public internal::Benchmark {
 
 #define BENCHMARK_DEFINE_F(BaseClass, Method)    \
   BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 
 #define BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a)    \
   BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 
 #define BENCHMARK_TEMPLATE2_DEFINE_F(BaseClass, Method, a, b)    \
   BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 
 #ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, ...)            \
   BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 #else
 #define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, a) \
   BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a)
 #endif
 
 #define BENCHMARK_REGISTER_F(BaseClass, Method) \
-  BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark)
+  BENCHMARK_PRIVATE_REGISTER_F(BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method))
 
 #define BENCHMARK_PRIVATE_REGISTER_F(TestName) \
   BENCHMARK_PRIVATE_DECLARE(TestName) =        \
@@ -1236,23 +1288,23 @@ class Fixture : public internal::Benchmark {
 #define BENCHMARK_F(BaseClass, Method)           \
   BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
   BENCHMARK_REGISTER_F(BaseClass, Method);       \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 
 #define BENCHMARK_TEMPLATE1_F(BaseClass, Method, a)           \
   BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
   BENCHMARK_REGISTER_F(BaseClass, Method);                    \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 
 #define BENCHMARK_TEMPLATE2_F(BaseClass, Method, a, b)           \
   BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
   BENCHMARK_REGISTER_F(BaseClass, Method);                       \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 
 #ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE_F(BaseClass, Method, ...)                   \
   BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
   BENCHMARK_REGISTER_F(BaseClass, Method);                             \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 #else
 #define BENCHMARK_TEMPLATE_F(BaseClass, Method, a) \
   BENCHMARK_TEMPLATE1_F(BaseClass, Method, a)
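A sketch of the fixture macros these helpers expand for (the fixture contents are illustrative):

    #include <benchmark/benchmark.h>
    #include <numeric>
    #include <vector>

    class VectorFixture : public benchmark::Fixture {
     public:
      void SetUp(const benchmark::State&) override { data.assign(1024, 1); }
      void TearDown(const benchmark::State&) override { data.clear(); }
      std::vector<int> data;
    };

    // The macro names this class VectorFixture_Sum_Benchmark, via
    // BENCHMARK_PRIVATE_CONCAT_NAME, and registers it.
    BENCHMARK_F(VectorFixture, Sum)(benchmark::State& state) {
      for (auto _ : state) {
        long total = std::accumulate(data.begin(), data.end(), 0L);
        benchmark::DoNotOptimize(total);
      }
    }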
@@ -1264,6 +1316,8 @@ class Fixture : public internal::Benchmark {
     ::benchmark::Initialize(&argc, argv);                               \
     if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \
     ::benchmark::RunSpecifiedBenchmarks();                              \
+    ::benchmark::Shutdown();                                            \
+    return 0;                                                           \
   }                                                                     \
   int main(int, char**)
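Applications that provide their own main() rather than using the macro will want to mirror the new sequence, including the Shutdown() call added here; a minimal sketch:

    #include <benchmark/benchmark.h>

    int main(int argc, char** argv) {
      ::benchmark::Initialize(&argc, argv);
      if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1;
      ::benchmark::RunSpecifiedBenchmarks();
      ::benchmark::Shutdown();  // new in this version: tears down global state
      return 0;
    }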
 
@@ -1280,10 +1334,16 @@ struct CPUInfo {
     int num_sharing;
   };
 
+  enum Scaling {
+    UNKNOWN,
+    ENABLED,
+    DISABLED
+  };
+
   int num_cpus;
+  Scaling scaling;
   double cycles_per_second;
   std::vector<CacheInfo> caches;
-  bool scaling_enabled;
   std::vector<double> load_avg;
 
   static const CPUInfo& Get();
@@ -1293,15 +1353,33 @@ struct CPUInfo {
   BENCHMARK_DISALLOW_COPY_AND_ASSIGN(CPUInfo);
 };
 
-//Adding Struct for System Information
+// Adding Struct for System Information
 struct SystemInfo {
   std::string name;
   static const SystemInfo& Get();
+
  private:
   SystemInfo();
   BENCHMARK_DISALLOW_COPY_AND_ASSIGN(SystemInfo);
 };
 
+// BenchmarkName contains the components of the Benchmark's name
+// which allows individual fields to be modified or cleared before
+// building the final name using 'str()'.
+struct BenchmarkName {
+  std::string function_name;
+  std::string args;
+  std::string min_time;
+  std::string iterations;
+  std::string repetitions;
+  std::string time_type;
+  std::string threads;
+
+  // Return the full name of the benchmark with each non-empty
+  // field separated by a '/'
+  std::string str() const;
+};
+
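A sketch of composing a name with the new struct, assuming only what the comment above states (the field values are illustrative):

    #include <benchmark/benchmark.h>
    #include <iostream>

    int main() {
      benchmark::BenchmarkName name;
      name.function_name = "BM_Copy";
      name.args = "8/64";
      name.threads = "threads:2";
      // Empty fields (min_time, iterations, ...) are omitted by str().
      std::cout << name.str() << "\n";  // expected: BM_Copy/8/64/threads:2
      return 0;
    }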
 // Interface for custom benchmark result printers.
 // By default, benchmark reports are printed to stdout. However an application
 // can control the destination of the reports by calling
@@ -1319,12 +1397,14 @@ class BenchmarkReporter {
   };
 
   struct Run {
+    static const int64_t no_repetition_index = -1;
     enum RunType { RT_Iteration, RT_Aggregate };
 
     Run()
         : run_type(RT_Iteration),
           error_occurred(false),
           iterations(1),
+          threads(1),
           time_unit(kNanosecond),
           real_accumulated_time(0),
           cpu_accumulated_time(0),
@@ -1340,14 +1420,19 @@ class BenchmarkReporter {
           max_bytes_used(0) {}
 
     std::string benchmark_name() const;
-    std::string run_name;
-    RunType run_type;          // is this a measurement, or an aggregate?
+    BenchmarkName run_name;
+    int64_t family_index;
+    int64_t per_family_instance_index;
+    RunType run_type;
     std::string aggregate_name;
     std::string report_label;  // Empty if not set by benchmark.
     bool error_occurred;
     std::string error_message;
 
-    int64_t iterations;
+    IterationCount iterations;
+    int64_t threads;
+    int64_t repetition_index;
+    int64_t repetitions;
     TimeUnit time_unit;
     double real_accumulated_time;
     double cpu_accumulated_time;
@@ -1373,7 +1458,7 @@ class BenchmarkReporter {
     int64_t complexity_n;
 
     // what statistics to compute from the measurements
-    const std::vector<Statistics>* statistics;
+    const std::vector<internal::Statistics>* statistics;
 
     // Inform print function whether the current run is a complexity report
     bool report_big_o;
@@ -1387,6 +1472,19 @@ class BenchmarkReporter {
     int64_t max_bytes_used;
   };
 
+  struct PerFamilyRunReports {
+    PerFamilyRunReports() : num_runs_total(0), num_runs_done(0) {}
+
+    // How many runs will all instances of this benchmark perform?
+    int num_runs_total;
+
+    // How many runs have happened already?
+    int num_runs_done;
+
+    // The reports about (non-erroneous!) runs of this family.
+    std::vector<BenchmarkReporter::Run> Runs;
+  };
+
   // Construct a BenchmarkReporter with the output stream set to 'std::cout'
   // and the error stream set to 'std::cerr'
   BenchmarkReporter();
@@ -1459,8 +1557,8 @@ class ConsoleReporter : public BenchmarkReporter {
         prev_counters_(),
         printed_header_(false) {}
 
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
+  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
+  virtual void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
 
  protected:
   virtual void PrintRunData(const Run& report);
@@ -1475,9 +1573,9 @@ class ConsoleReporter : public BenchmarkReporter {
 class JSONReporter : public BenchmarkReporter {
  public:
   JSONReporter() : first_report_(true) {}
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
-  virtual void Finalize();
+  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
+  virtual void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
+  virtual void Finalize() BENCHMARK_OVERRIDE;
 
  private:
   void PrintRunData(const Run& report);
@@ -1485,12 +1583,13 @@ class JSONReporter : public BenchmarkReporter {
   bool first_report_;
 };
 
-class BENCHMARK_DEPRECATED_MSG("The CSV Reporter will be removed in a future release")
-      CSVReporter : public BenchmarkReporter {
+class BENCHMARK_DEPRECATED_MSG(
+    "The CSV Reporter will be removed in a future release") CSVReporter
+    : public BenchmarkReporter {
  public:
   CSVReporter() : printed_header_(false) {}
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
+  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
+  virtual void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
 
  private:
   void PrintRunData(const Run& report);
@@ -1524,6 +1623,8 @@ class MemoryManager {
 
 inline const char* GetTimeUnitString(TimeUnit unit) {
   switch (unit) {
+    case kSecond:
+      return "s";
     case kMillisecond:
       return "ms";
     case kMicrosecond:
@@ -1536,6 +1637,8 @@ inline const char* GetTimeUnitString(TimeUnit unit) {
 
 inline double GetTimeUnitMultiplier(TimeUnit unit) {
   switch (unit) {
+    case kSecond:
+      return 1;
     case kMillisecond:
       return 1e3;
     case kMicrosecond:

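With kSecond wired into both helpers, a quick sketch of formatting a duration in a chosen unit (the values are illustrative):

    #include <benchmark/benchmark.h>
    #include <cstdio>

    int main() {
      const double seconds = 0.000123;
      const benchmark::TimeUnit unit = benchmark::kMicrosecond;
      // Scale seconds into the chosen unit and print its suffix: "123.0 us".
      std::printf("%.1f %s\n",
                  seconds * benchmark::GetTimeUnitMultiplier(unit),
                  benchmark::GetTimeUnitString(unit));
      return 0;
    }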
diff --git a/libcxx/utils/google-benchmark/mingw.py b/libcxx/utils/google-benchmark/mingw.py
deleted file mode 100644
index 0b69692ca2a40..0000000000000
--- a/libcxx/utils/google-benchmark/mingw.py
+++ /dev/null
@@ -1,320 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-import argparse
-import errno
-import logging
-import os
-import platform
-import re
-import sys
-import subprocess
-import tempfile
-
-try:
-    import winreg
-except ImportError:
-    import _winreg as winreg
-try:
-    import urllib.request as request
-except ImportError:
-    import urllib as request
-try:
-    import urllib.parse as parse
-except ImportError:
-    import urlparse as parse
-
-class EmptyLogger(object):
-    '''
-    Provides an implementation that performs no logging
-    '''
-    def debug(self, *k, **kw):
-        pass
-    def info(self, *k, **kw):
-        pass
-    def warn(self, *k, **kw):
-        pass
-    def error(self, *k, **kw):
-        pass
-    def critical(self, *k, **kw):
-        pass
-    def setLevel(self, *k, **kw):
-        pass
-
-urls = (
-    'http://downloads.sourceforge.net/project/mingw-w64/Toolchains%20'
-        'targetting%20Win32/Personal%20Builds/mingw-builds/installer/'
-        'repository.txt',
-    'http://downloads.sourceforge.net/project/mingwbuilds/host-windows/'
-        'repository.txt'
-)
-'''
-A list of mingw-build repositories
-'''
-
-def repository(urls = urls, log = EmptyLogger()):
-    '''
-    Downloads and parse mingw-build repository files and parses them
-    '''
-    log.info('getting mingw-builds repository')
-    versions = {}
-    re_sourceforge = re.compile(r'http://sourceforge.net/projects/([^/]+)/files')
-    re_sub = r'http://downloads.sourceforge.net/project/\1'
-    for url in urls:
-        log.debug(' - requesting: %s', url)
-        socket = request.urlopen(url)
-        repo = socket.read()
-        if not isinstance(repo, str):
-            repo = repo.decode();
-        socket.close()
-        for entry in repo.split('\n')[:-1]:
-            value = entry.split('|')
-            version = tuple([int(n) for n in value[0].strip().split('.')])
-            version = versions.setdefault(version, {})
-            arch = value[1].strip()
-            if arch == 'x32':
-                arch = 'i686'
-            elif arch == 'x64':
-                arch = 'x86_64'
-            arch = version.setdefault(arch, {})
-            threading = arch.setdefault(value[2].strip(), {})
-            exceptions = threading.setdefault(value[3].strip(), {})
-            revision = exceptions.setdefault(int(value[4].strip()[3:]),
-                re_sourceforge.sub(re_sub, value[5].strip()))
-    return versions
-
-def find_in_path(file, path=None):
-    '''
-    Attempts to find an executable in the path
-    '''
-    if platform.system() == 'Windows':
-        file += '.exe'
-    if path is None:
-        path = os.environ.get('PATH', '')
-    if type(path) is type(''):
-        path = path.split(os.pathsep)
-    return list(filter(os.path.exists,
-        map(lambda dir, file=file: os.path.join(dir, file), path)))
-
-def find_7zip(log = EmptyLogger()):
-    '''
-    Attempts to find 7zip for unpacking the mingw-build archives
-    '''
-    log.info('finding 7zip')
-    path = find_in_path('7z')
-    if not path:
-        key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\7-Zip')
-        path, _ = winreg.QueryValueEx(key, 'Path')
-        path = [os.path.join(path, '7z.exe')]
-    log.debug('found \'%s\'', path[0])
-    return path[0]
-
-def unpack(archive, location, log = EmptyLogger()):
-    '''
-    Unpacks a mingw-builds archive
-    '''
-    sevenzip = find_7zip(log)
-    log.info('unpacking %s', os.path.basename(archive))
-    cmd = [sevenzip, 'x', archive, '-o' + location, '-y']
-    log.debug(' - %r', cmd)
-    with open(os.devnull, 'w') as devnull:
-        subprocess.check_call(cmd, stdout = devnull)
-
-def download(url, location, log = EmptyLogger()):
-    '''
-    Downloads and unpacks a mingw-builds archive
-    '''
-    log.info('downloading MinGW')
-    log.debug(' - url: %s', url)
-    log.debug(' - location: %s', location)
-
-    re_content = re.compile(r'attachment;[ \t]*filename=(")?([^"]*)(")?[\r\n]*')
-
-    stream = request.urlopen(url)
-    try:
-        content = stream.getheader('Content-Disposition') or ''
-    except AttributeError:
-        content = stream.headers.getheader('Content-Disposition') or ''
-    matches = re_content.match(content)
-    if matches:
-        filename = matches.group(2)
-    else:
-        parsed = parse.urlparse(stream.geturl())
-        filename = os.path.basename(parsed.path)
-
-    try:
-        os.makedirs(location)
-    except OSError as e:
-        if e.errno == errno.EEXIST and os.path.isdir(location):
-            pass
-        else:
-            raise
-
-    archive = os.path.join(location, filename)
-    with open(archive, 'wb') as out:
-        while True:
-            buf = stream.read(1024)
-            if not buf:
-                break
-            out.write(buf)
-    unpack(archive, location, log = log)
-    os.remove(archive)
-
-    possible = os.path.join(location, 'mingw64')
-    if not os.path.exists(possible):
-        possible = os.path.join(location, 'mingw32')
-        if not os.path.exists(possible):
-            raise ValueError('Failed to find unpacked MinGW: ' + possible)
-    return possible
-
-def root(location = None, arch = None, version = None, threading = None,
-        exceptions = None, revision = None, log = EmptyLogger()):
-    '''
-    Returns the root folder of a specific version of the mingw-builds variant
-    of gcc. Will download the compiler if needed
-    '''
-
-    # Get the repository if we don't have all the information
-    if not (arch and version and threading and exceptions and revision):
-        versions = repository(log = log)
-
-    # Determine some defaults
-    version = version or max(versions.keys())
-    if not arch:
-        arch = platform.machine().lower()
-        if arch == 'x86':
-            arch = 'i686'
-        elif arch == 'amd64':
-            arch = 'x86_64'
-    if not threading:
-        keys = versions[version][arch].keys()
-        if 'posix' in keys:
-            threading = 'posix'
-        elif 'win32' in keys:
-            threading = 'win32'
-        else:
-            threading = keys[0]
-    if not exceptions:
-        keys = versions[version][arch][threading].keys()
-        if 'seh' in keys:
-            exceptions = 'seh'
-        elif 'sjlj' in keys:
-            exceptions = 'sjlj'
-        else:
-            exceptions = keys[0]
-    if revision == None:
-        revision = max(versions[version][arch][threading][exceptions].keys())
-    if not location:
-        location = os.path.join(tempfile.gettempdir(), 'mingw-builds')
-
-    # Get the download url
-    url = versions[version][arch][threading][exceptions][revision]
-
-    # Tell the user whatzzup
-    log.info('finding MinGW %s', '.'.join(str(v) for v in version))
-    log.debug(' - arch: %s', arch)
-    log.debug(' - threading: %s', threading)
-    log.debug(' - exceptions: %s', exceptions)
-    log.debug(' - revision: %s', revision)
-    log.debug(' - url: %s', url)
-
-    # Store each specific revision differently
-    slug = '{version}-{arch}-{threading}-{exceptions}-rev{revision}'
-    slug = slug.format(
-        version = '.'.join(str(v) for v in version),
-        arch = arch,
-        threading = threading,
-        exceptions = exceptions,
-        revision = revision
-    )
-    if arch == 'x86_64':
-        root_dir = os.path.join(location, slug, 'mingw64')
-    elif arch == 'i686':
-        root_dir = os.path.join(location, slug, 'mingw32')
-    else:
-        raise ValueError('Unknown MinGW arch: ' + arch)
-
-    # Download if needed
-    if not os.path.exists(root_dir):
-        downloaded = download(url, os.path.join(location, slug), log = log)
-        if downloaded != root_dir:
-            raise ValueError('The location of mingw did not match\n%s\n%s'
-                % (downloaded, root_dir))
-
-    return root_dir
-
-def str2ver(string):
-    '''
-    Converts a version string into a tuple
-    '''
-    try:
-        version = tuple(int(v) for v in string.split('.'))
-        if len(version) is not 3:
-            raise ValueError()
-    except ValueError:
-        raise argparse.ArgumentTypeError(
-            'please provide a three digit version string')
-    return version
-
-def main():
-    '''
-    Invoked when the script is run directly by the python interpreter
-    '''
-    parser = argparse.ArgumentParser(
-        description = 'Downloads a specific version of MinGW',
-        formatter_class = argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument('--location',
-        help = 'the location to download the compiler to',
-        default = os.path.join(tempfile.gettempdir(), 'mingw-builds'))
-    parser.add_argument('--arch', required = True, choices = ['i686', 'x86_64'],
-        help = 'the target MinGW architecture string')
-    parser.add_argument('--version', type = str2ver,
-        help = 'the version of GCC to download')
-    parser.add_argument('--threading', choices = ['posix', 'win32'],
-        help = 'the threading type of the compiler')
-    parser.add_argument('--exceptions', choices = ['sjlj', 'seh', 'dwarf'],
-        help = 'the method to throw exceptions')
-    parser.add_argument('--revision', type=int,
-        help = 'the revision of the MinGW release')
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument('-v', '--verbose', action='store_true',
-        help='increase the script output verbosity')
-    group.add_argument('-q', '--quiet', action='store_true',
-        help='only print errors and warning')
-    args = parser.parse_args()
-
-    # Create the logger
-    logger = logging.getLogger('mingw')
-    handler = logging.StreamHandler()
-    formatter = logging.Formatter('%(message)s')
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-    logger.setLevel(logging.INFO)
-    if args.quiet:
-        logger.setLevel(logging.WARN)
-    if args.verbose:
-        logger.setLevel(logging.DEBUG)
-
-    # Get MinGW
-    root_dir = root(location = args.location, arch = args.arch,
-        version = args.version, threading = args.threading,
-        exceptions = args.exceptions, revision = args.revision,
-        log = logger)
-
-    sys.stdout.write('%s\n' % os.path.join(root_dir, 'bin'))
-
-if __name__ == '__main__':
-    try:
-        main()
-    except IOError as e:
-        sys.stderr.write('IO error: %s\n' % e)
-        sys.exit(1)
-    except OSError as e:
-        sys.stderr.write('OS error: %s\n' % e)
-        sys.exit(1)
-    except KeyboardInterrupt as e:
-        sys.stderr.write('Killed\n')
-        sys.exit(1)

diff --git a/libcxx/utils/google-benchmark/requirements.txt b/libcxx/utils/google-benchmark/requirements.txt
new file mode 100644
index 0000000000000..85e8986040685
--- /dev/null
+++ b/libcxx/utils/google-benchmark/requirements.txt
@@ -0,0 +1,2 @@
+numpy == 1.19.4
+scipy == 1.5.4

diff --git a/libcxx/utils/google-benchmark/setup.py b/libcxx/utils/google-benchmark/setup.py
new file mode 100644
index 0000000000000..5cdab10cf77c7
--- /dev/null
+++ b/libcxx/utils/google-benchmark/setup.py
@@ -0,0 +1,140 @@
+import os
+import posixpath
+import re
+import shutil
+import sys
+
+from distutils import sysconfig
+import setuptools
+from setuptools.command import build_ext
+
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+
+
+IS_WINDOWS = sys.platform.startswith("win")
+
+
+def _get_version():
+    """Parse the version string from __init__.py."""
+    with open(
+        os.path.join(HERE, "bindings", "python", "google_benchmark", "__init__.py")
+    ) as init_file:
+        try:
+            version_line = next(
+                line for line in init_file if line.startswith("__version__")
+            )
+        except StopIteration:
+            raise ValueError("__version__ not defined in __init__.py")
+        else:
+            namespace = {}
+            exec(version_line, namespace)  # pylint: disable=exec-used
+            return namespace["__version__"]
+
+
+def _parse_requirements(path):
+    with open(os.path.join(HERE, path)) as requirements:
+        return [
+            line.rstrip()
+            for line in requirements
+            if not (line.isspace() or line.startswith("#"))
+        ]
+
+
+class BazelExtension(setuptools.Extension):
+    """A C/C++ extension that is defined as a Bazel BUILD target."""
+
+    def __init__(self, name, bazel_target):
+        self.bazel_target = bazel_target
+        self.relpath, self.target_name = posixpath.relpath(bazel_target, "//").split(
+            ":"
+        )
+        setuptools.Extension.__init__(self, name, sources=[])
+
+
+class BuildBazelExtension(build_ext.build_ext):
+    """A command that runs Bazel to build a C/C++ extension."""
+
+    def run(self):
+        for ext in self.extensions:
+            self.bazel_build(ext)
+        build_ext.build_ext.run(self)
+
+    def bazel_build(self, ext):
+        """Runs the bazel build to create the package."""
+        with open("WORKSPACE", "r") as workspace:
+            workspace_contents = workspace.read()
+
+        with open("WORKSPACE", "w") as workspace:
+            workspace.write(
+                re.sub(
+                    r'(?<=path = ").*(?=",  # May be overwritten by setup\.py\.)',
+                    sysconfig.get_python_inc().replace(os.path.sep, posixpath.sep),
+                    workspace_contents,
+                )
+            )
+
+        if not os.path.exists(self.build_temp):
+            os.makedirs(self.build_temp)
+
+        bazel_argv = [
+            "bazel",
+            "build",
+            ext.bazel_target,
+            "--symlink_prefix=" + os.path.join(self.build_temp, "bazel-"),
+            "--compilation_mode=" + ("dbg" if self.debug else "opt"),
+        ]
+
+        if IS_WINDOWS:
+            # Link with python*.lib.
+            for library_dir in self.library_dirs:
+                bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
+
+        self.spawn(bazel_argv)
+
+        shared_lib_suffix = '.dll' if IS_WINDOWS else '.so'
+        ext_bazel_bin_path = os.path.join(
+            self.build_temp, 'bazel-bin',
+            ext.relpath, ext.target_name + shared_lib_suffix)
+
+        ext_dest_path = self.get_ext_fullpath(ext.name)
+        ext_dest_dir = os.path.dirname(ext_dest_path)
+        if not os.path.exists(ext_dest_dir):
+            os.makedirs(ext_dest_dir)
+        shutil.copyfile(ext_bazel_bin_path, ext_dest_path)
+
+
+setuptools.setup(
+    name="google_benchmark",
+    version=_get_version(),
+    url="https://github.com/google/benchmark",
+    description="A library to benchmark code snippets.",
+    author="Google",
+    author_email="benchmark-py at google.com",
+    # Contained modules and scripts.
+    package_dir={"": "bindings/python"},
+    packages=setuptools.find_packages("bindings/python"),
+    install_requires=_parse_requirements("bindings/python/requirements.txt"),
+    cmdclass=dict(build_ext=BuildBazelExtension),
+    ext_modules=[
+        BazelExtension(
+            "google_benchmark._benchmark",
+            "//bindings/python/google_benchmark:_benchmark",
+        )
+    ],
+    zip_safe=False,
+    # PyPI package information.
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: Apache Software License",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Topic :: Software Development :: Testing",
+        "Topic :: System :: Benchmark",
+    ],
+    license="Apache 2.0",
+    keywords="benchmark",
+)

diff --git a/libcxx/utils/google-benchmark/src/CMakeLists.txt b/libcxx/utils/google-benchmark/src/CMakeLists.txt
index 7a77fdf41de90..a6c8e9a7a0b7e 100644
--- a/libcxx/utils/google-benchmark/src/CMakeLists.txt
+++ b/libcxx/utils/google-benchmark/src/CMakeLists.txt
@@ -1,4 +1,5 @@
 # Allow the source files to find headers in src/
+include(GNUInstallDirs)
 include_directories(${PROJECT_SOURCE_DIR}/src)
 
 if (DEFINED BENCHMARK_CXX_LINKER_FLAGS)
@@ -17,6 +18,7 @@ foreach(item ${BENCHMARK_MAIN})
 endforeach()
 
 add_library(benchmark ${SOURCE_FILES})
+add_library(benchmark::benchmark ALIAS benchmark)
 set_target_properties(benchmark PROPERTIES
   OUTPUT_NAME "benchmark"
   VERSION ${GENERIC_LIB_VERSION}
@@ -26,6 +28,12 @@ target_include_directories(benchmark PUBLIC
     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
     )
 
+# libpfm, if available
+if (HAVE_LIBPFM)
+  target_link_libraries(benchmark libpfm.a)
+  add_definitions(-DHAVE_LIBPFM)
+endif()
+
 # Link threads.
 target_link_libraries(benchmark  ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 find_library(LIBRT rt)
@@ -33,6 +41,14 @@ if(LIBRT)
   target_link_libraries(benchmark ${LIBRT})
 endif()
 
+if(CMAKE_BUILD_TYPE)
+  string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPER)
+endif()
+if(NOT CMAKE_THREAD_LIBS_INIT AND "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}" MATCHES ".*-fsanitize=[^ ]*address.*")
+  message(WARNING "CMake's FindThreads.cmake did not fail, but CMAKE_THREAD_LIBS_INIT ended up being empty. This was fixed in https://github.com/Kitware/CMake/commit/d53317130e84898c5328c237186dbd995aaf1c12, so let's guess that -pthread is sufficient.")
+  target_link_libraries(benchmark -pthread)
+endif()
+
 # We need extra libraries on Windows
 if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
   target_link_libraries(benchmark shlwapi)
@@ -45,6 +61,7 @@ endif()
 
 # Benchmark main library
 add_library(benchmark_main "benchmark_main.cc")
+add_library(benchmark::benchmark_main ALIAS benchmark_main)
 set_target_properties(benchmark_main PROPERTIES
   OUTPUT_NAME "benchmark_main"
   VERSION ${GENERIC_LIB_VERSION}
@@ -53,13 +70,8 @@ set_target_properties(benchmark_main PROPERTIES
 target_include_directories(benchmark PUBLIC
     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
     )
-target_link_libraries(benchmark_main benchmark)
+target_link_libraries(benchmark_main benchmark::benchmark)
 
-set(include_install_dir "include")
-set(lib_install_dir "lib/")
-set(bin_install_dir "bin/")
-set(config_install_dir "lib/cmake/${PROJECT_NAME}")
-set(pkgconfig_install_dir "lib/pkgconfig")
 
 set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
 
@@ -83,26 +95,26 @@ if (BENCHMARK_ENABLE_INSTALL)
   install(
     TARGETS benchmark benchmark_main
     EXPORT ${targets_export_name}
-    ARCHIVE DESTINATION ${lib_install_dir}
-    LIBRARY DESTINATION ${lib_install_dir}
-    RUNTIME DESTINATION ${bin_install_dir}
-    INCLUDES DESTINATION ${include_install_dir})
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 
   install(
     DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
-    DESTINATION ${include_install_dir}
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
     FILES_MATCHING PATTERN "*.*h")
 
   install(
       FILES "${project_config}" "${version_config}"
-      DESTINATION "${config_install_dir}")
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
 
   install(
       FILES "${pkg_config}"
-      DESTINATION "${pkgconfig_install_dir}")
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
 
   install(
       EXPORT "${targets_export_name}"
       NAMESPACE "${namespace}"
-      DESTINATION "${config_install_dir}")
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
 endif()

diff --git a/libcxx/utils/google-benchmark/src/benchmark.cc b/libcxx/utils/google-benchmark/src/benchmark.cc
index aab07500af422..89f64967bf18f 100644
--- a/libcxx/utils/google-benchmark/src/benchmark.cc
+++ b/libcxx/utils/google-benchmark/src/benchmark.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "benchmark/benchmark.h"
+
 #include "benchmark_api_internal.h"
 #include "benchmark_runner.h"
 #include "internal_macros.h"
@@ -32,7 +33,10 @@
 #include <cstdlib>
 #include <fstream>
 #include <iostream>
+#include <limits>
+#include <map>
 #include <memory>
+#include <random>
 #include <string>
 #include <thread>
 #include <utility>
@@ -45,85 +49,94 @@
 #include "internal_macros.h"
 #include "log.h"
 #include "mutex.h"
+#include "perf_counters.h"
 #include "re.h"
 #include "statistics.h"
 #include "string_util.h"
 #include "thread_manager.h"
 #include "thread_timer.h"
 
-DEFINE_bool(benchmark_list_tests, false,
-            "Print a list of benchmarks. This option overrides all other "
-            "options.");
-
-DEFINE_string(benchmark_filter, ".",
-              "A regular expression that specifies the set of benchmarks "
-              "to execute.  If this flag is empty, or if this flag is the "
-              "string \"all\", all benchmarks linked into the binary are "
-              "run.");
-
-DEFINE_double(benchmark_min_time, 0.5,
-              "Minimum number of seconds we should run benchmark before "
-              "results are considered significant.  For cpu-time based "
-              "tests, this is the lower bound on the total cpu time "
-              "used by all threads that make up the test.  For real-time "
-              "based tests, this is the lower bound on the elapsed time "
-              "of the benchmark execution, regardless of number of "
-              "threads.");
-
-DEFINE_int32(benchmark_repetitions, 1,
-             "The number of runs of each benchmark. If greater than 1, the "
-             "mean and standard deviation of the runs will be reported.");
-
-DEFINE_bool(
-    benchmark_report_aggregates_only, false,
-    "Report the result of each benchmark repetitions. When 'true' is specified "
-    "only the mean, standard deviation, and other statistics are reported for "
-    "repeated benchmarks. Affects all reporters.");
-
-DEFINE_bool(
-    benchmark_display_aggregates_only, false,
-    "Display the result of each benchmark repetitions. When 'true' is "
-    "specified only the mean, standard deviation, and other statistics are "
-    "displayed for repeated benchmarks. Unlike "
-    "benchmark_report_aggregates_only, only affects the display reporter, but "
-    "*NOT* file reporter, which will still contain all the output.");
-
-DEFINE_string(benchmark_format, "console",
-              "The format to use for console output. Valid values are "
-              "'console', 'json', or 'csv'.");
-
-DEFINE_string(benchmark_out_format, "json",
-              "The format to use for file output. Valid values are "
-              "'console', 'json', or 'csv'.");
-
-DEFINE_string(benchmark_out, "", "The file to write additional output to");
-
-DEFINE_string(benchmark_color, "auto",
-              "Whether to use colors in the output.  Valid values: "
-              "'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use "
-              "colors if the output is being sent to a terminal and the TERM "
-              "environment variable is set to a terminal type that supports "
-              "colors.");
-
-DEFINE_bool(benchmark_counters_tabular, false,
-            "Whether to use tabular format when printing user counters to "
-            "the console.  Valid values: 'true'/'yes'/1, 'false'/'no'/0."
-            "Defaults to false.");
-
-DEFINE_int32(v, 0, "The level of verbose logging to output");
+// Print a list of benchmarks. This option overrides all other options.
+DEFINE_bool(benchmark_list_tests, false);
+
+// A regular expression that specifies the set of benchmarks to execute.  If
+// this flag is empty, or if this flag is the string "all", all benchmarks
+// linked into the binary are run.
+DEFINE_string(benchmark_filter, ".");
+
+// Minimum number of seconds we should run benchmark before results are
+// considered significant.  For cpu-time based tests, this is the lower bound
+// on the total cpu time used by all threads that make up the test.  For
+// real-time based tests, this is the lower bound on the elapsed time of the
+// benchmark execution, regardless of number of threads.
+DEFINE_double(benchmark_min_time, 0.5);
+
+// The number of runs of each benchmark. If greater than 1, the mean and
+// standard deviation of the runs will be reported.
+DEFINE_int32(benchmark_repetitions, 1);
+
+// If set, enable random interleaving of repetitions of all benchmarks.
+// See http://github.com/google/benchmark/issues/1051 for details.
+DEFINE_bool(benchmark_enable_random_interleaving, false);
+
+// Report the result of each benchmark repetitions. When 'true' is specified
+// only the mean, standard deviation, and other statistics are reported for
+// repeated benchmarks. Affects all reporters.
+DEFINE_bool(benchmark_report_aggregates_only, false);
+
+// Display the result of each benchmark repetitions. When 'true' is specified
+// only the mean, standard deviation, and other statistics are displayed for
+// repeated benchmarks. Unlike benchmark_report_aggregates_only, only affects
+// the display reporter, but *NOT* the file reporter, which will still contain
+// all the output.
+DEFINE_bool(benchmark_display_aggregates_only, false);
+
+// The format to use for console output.
+// Valid values are 'console', 'json', or 'csv'.
+DEFINE_string(benchmark_format, "console");
+
+// The format to use for file output.
+// Valid values are 'console', 'json', or 'csv'.
+DEFINE_string(benchmark_out_format, "json");
+
+// The file to write additional output to.
+DEFINE_string(benchmark_out, "");
+
+// Whether to use colors in the output.  Valid values:
+// 'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use colors if
+// the output is being sent to a terminal and the TERM environment variable is
+// set to a terminal type that supports colors.
+DEFINE_string(benchmark_color, "auto");
+
+// Whether to use tabular format when printing user counters to the console.
+// Valid values: 'true'/'yes'/1, 'false'/'no'/0.  Defaults to false.
+DEFINE_bool(benchmark_counters_tabular, false);
+
+// The level of verbose logging to output
+DEFINE_int32(v, 0);
+
+// List of additional perf counters to collect, in libpfm format. For more
+// information about libpfm: https://man7.org/linux/man-pages/man3/libpfm.3.html
+DEFINE_string(benchmark_perf_counters, "");
 
 namespace benchmark {
-
 namespace internal {
 
+// Extra context to include in the output formatted as comma-separated key-value
+// pairs. Kept internal as it's only used for parsing from env/command line.
+DEFINE_kvpairs(benchmark_context, {});
+
+std::map<std::string, std::string>* global_context = nullptr;
+
 // FIXME: wouldn't LTO mess this up?
 void UseCharPointer(char const volatile*) {}
 
 }  // namespace internal
 
-State::State(size_t max_iters, const std::vector<int64_t>& ranges, int thread_i,
-             int n_threads, internal::ThreadTimer* timer,
-             internal::ThreadManager* manager)
+State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
+             int thread_i, int n_threads, internal::ThreadTimer* timer,
+             internal::ThreadManager* manager,
+             internal::PerfCountersMeasurement* perf_counters_measurement)
     : total_iterations_(0),
       batch_leftover_(0),
       max_iterations(max_iters),
@@ -136,7 +149,8 @@ State::State(size_t max_iters, const std::vector<int64_t>& ranges, int thread_i,
       thread_index(thread_i),
       threads(n_threads),
       timer_(timer),
-      manager_(manager) {
+      manager_(manager),
+      perf_counters_measurement_(perf_counters_measurement) {
   CHECK(max_iterations != 0) << "At least one iteration must be run";
   CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
 
@@ -148,7 +162,7 @@ State::State(size_t max_iters, const std::vector<int64_t>& ranges, int thread_i,
   // which must be suppressed.
 #if defined(__INTEL_COMPILER)
 #pragma warning push
-#pragma warning(disable:1875)
+#pragma warning(disable : 1875)
 #elif defined(__GNUC__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Winvalid-offsetof"
@@ -169,11 +183,23 @@ void State::PauseTiming() {
   // Add in time accumulated so far
   CHECK(started_ && !finished_ && !error_occurred_);
   timer_->StopTimer();
+  if (perf_counters_measurement_) {
+    auto measurements = perf_counters_measurement_->StopAndGetMeasurements();
+    for (const auto& name_and_measurement : measurements) {
+      auto name = name_and_measurement.first;
+      auto measurement = name_and_measurement.second;
+      CHECK_EQ(counters[name], 0.0);
+      counters[name] = Counter(measurement, Counter::kAvgIterations);
+    }
+  }
 }
 
 void State::ResumeTiming() {
   CHECK(started_ && !finished_ && !error_occurred_);
   timer_->StartTimer();
+  if (perf_counters_measurement_) {
+    perf_counters_measurement_->Start();
+  }
 }
 
 void State::SkipWithError(const char* msg) {
@@ -221,6 +247,37 @@ void State::FinishKeepRunning() {
 namespace internal {
 namespace {
 
+// Flushes streams after invoking reporter methods that write to them. This
+// ensures users get timely updates even when streams are not line-buffered.
+void FlushStreams(BenchmarkReporter* reporter) {
+  if (!reporter) return;
+  std::flush(reporter->GetOutputStream());
+  std::flush(reporter->GetErrorStream());
+}
+
+// Reports in both display and file reporters.
+void Report(BenchmarkReporter* display_reporter,
+            BenchmarkReporter* file_reporter, const RunResults& run_results) {
+  auto report_one = [](BenchmarkReporter* reporter, bool aggregates_only,
+                       const RunResults& results) {
+    assert(reporter);
+    // If there are no aggregates, do output non-aggregates.
+    aggregates_only &= !results.aggregates_only.empty();
+    if (!aggregates_only) reporter->ReportRuns(results.non_aggregates);
+    if (!results.aggregates_only.empty())
+      reporter->ReportRuns(results.aggregates_only);
+  };
+
+  report_one(display_reporter, run_results.display_report_aggregates_only,
+             run_results);
+  if (file_reporter)
+    report_one(file_reporter, run_results.file_report_aggregates_only,
+               run_results);
+
+  FlushStreams(display_reporter);
+  FlushStreams(file_reporter);
+}
+
 void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
                    BenchmarkReporter* display_reporter,
                    BenchmarkReporter* file_reporter) {
@@ -233,10 +290,10 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
   size_t stat_field_width = 0;
   for (const BenchmarkInstance& benchmark : benchmarks) {
     name_field_width =
-        std::max<size_t>(name_field_width, benchmark.name.size());
-    might_have_aggregates |= benchmark.repetitions > 1;
+        std::max<size_t>(name_field_width, benchmark.name().str().size());
+    might_have_aggregates |= benchmark.repetitions() > 1;
 
-    for (const auto& Stat : *benchmark.statistics)
+    for (const auto& Stat : benchmark.statistics())
       stat_field_width = std::max<size_t>(stat_field_width, Stat.name_.size());
   }
   if (might_have_aggregates) name_field_width += 1 + stat_field_width;
@@ -245,50 +302,86 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
   BenchmarkReporter::Context context;
   context.name_field_width = name_field_width;
 
-  // Keep track of running times of all instances of current benchmark
-  std::vector<BenchmarkReporter::Run> complexity_reports;
-
-  // We flush streams after invoking reporter methods that write to them. This
-  // ensures users get timely updates even when streams are not line-buffered.
-  auto flushStreams = [](BenchmarkReporter* reporter) {
-    if (!reporter) return;
-    std::flush(reporter->GetOutputStream());
-    std::flush(reporter->GetErrorStream());
-  };
+  // Keep track of running times of all instances of each benchmark family.
+  std::map<int /*family_index*/, BenchmarkReporter::PerFamilyRunReports>
+      per_family_reports;
 
   if (display_reporter->ReportContext(context) &&
       (!file_reporter || file_reporter->ReportContext(context))) {
-    flushStreams(display_reporter);
-    flushStreams(file_reporter);
-
-    for (const auto& benchmark : benchmarks) {
-      RunResults run_results = RunBenchmark(benchmark, &complexity_reports);
-
-      auto report = [&run_results](BenchmarkReporter* reporter,
-                                   bool report_aggregates_only) {
-        assert(reporter);
-        // If there are no aggregates, do output non-aggregates.
-        report_aggregates_only &= !run_results.aggregates_only.empty();
-        if (!report_aggregates_only)
-          reporter->ReportRuns(run_results.non_aggregates);
-        if (!run_results.aggregates_only.empty())
-          reporter->ReportRuns(run_results.aggregates_only);
-      };
-
-      report(display_reporter, run_results.display_report_aggregates_only);
-      if (file_reporter)
-        report(file_reporter, run_results.file_report_aggregates_only);
-
-      flushStreams(display_reporter);
-      flushStreams(file_reporter);
+    FlushStreams(display_reporter);
+    FlushStreams(file_reporter);
+
+    size_t num_repetitions_total = 0;
+
+    std::vector<internal::BenchmarkRunner> runners;
+    runners.reserve(benchmarks.size());
+    for (const BenchmarkInstance& benchmark : benchmarks) {
+      BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
+      if (benchmark.complexity() != oNone)
+        reports_for_family = &per_family_reports[benchmark.family_index()];
+
+      runners.emplace_back(benchmark, reports_for_family);
+      int num_repeats_of_this_instance = runners.back().GetNumRepeats();
+      num_repetitions_total += num_repeats_of_this_instance;
+      if (reports_for_family)
+        reports_for_family->num_runs_total += num_repeats_of_this_instance;
+    }
+    assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
+
+    std::vector<int> repetition_indices;
+    repetition_indices.reserve(num_repetitions_total);
+    for (size_t runner_index = 0, num_runners = runners.size();
+         runner_index != num_runners; ++runner_index) {
+      const internal::BenchmarkRunner& runner = runners[runner_index];
+      std::fill_n(std::back_inserter(repetition_indices),
+                  runner.GetNumRepeats(), runner_index);
+    }
+    assert(repetition_indices.size() == num_repetitions_total &&
+           "Unexpected number of repetition indexes.");
+
+    if (FLAGS_benchmark_enable_random_interleaving) {
+      std::random_device rd;
+      std::mt19937 g(rd());
+      std::shuffle(repetition_indices.begin(), repetition_indices.end(), g);
+    }
+
+    for (size_t repetition_index : repetition_indices) {
+      internal::BenchmarkRunner& runner = runners[repetition_index];
+      runner.DoOneRepetition();
+      if (runner.HasRepeatsRemaining()) continue;
+      // FIXME: report each repetition separately, not all of them in bulk.
+
+      RunResults run_results = runner.GetResults();
+
+      // Maybe calculate complexity report
+      if (const auto* reports_for_family = runner.GetReportsForFamily()) {
+        if (reports_for_family->num_runs_done ==
+            reports_for_family->num_runs_total) {
+          auto additional_run_stats = ComputeBigO(reports_for_family->Runs);
+          run_results.aggregates_only.insert(run_results.aggregates_only.end(),
+                                             additional_run_stats.begin(),
+                                             additional_run_stats.end());
+          per_family_reports.erase(
+              (int)reports_for_family->Runs.front().family_index);
+        }
+      }
+
+      Report(display_reporter, file_reporter, run_results);
     }
   }
   display_reporter->Finalize();
   if (file_reporter) file_reporter->Finalize();
-  flushStreams(display_reporter);
-  flushStreams(file_reporter);
+  FlushStreams(display_reporter);
+  FlushStreams(file_reporter);
 }
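The random interleaving above reduces to one indexing trick: build a vector holding each runner's index once per repetition, then shuffle it. A standalone sketch of that technique (the names are illustrative, not library API):

    #include <algorithm>
    #include <iterator>
    #include <random>
    #include <vector>

    // Returns the order in which repetitions run: runner indices, optionally
    // shuffled. E.g. repeats {2, 3} yield {0,0,1,1,1} before shuffling.
    std::vector<size_t> MakeSchedule(const std::vector<int>& repeats,
                                     bool interleave) {
      std::vector<size_t> schedule;
      for (size_t runner = 0; runner != repeats.size(); ++runner)
        std::fill_n(std::back_inserter(schedule), repeats[runner], runner);
      if (interleave) {
        std::random_device rd;
        std::mt19937 g(rd());
        std::shuffle(schedule.begin(), schedule.end(), g);
      }
      return schedule;
    }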
 
+// Disable deprecated warnings temporarily because we need to reference
+// CSVReporter but don't want to trigger -Werror=-Wdeprecated-declarations
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
 std::unique_ptr<BenchmarkReporter> CreateReporter(
     std::string const& name, ConsoleReporter::OutputOptions output_opts) {
   typedef std::unique_ptr<BenchmarkReporter> PtrType;
@@ -304,6 +397,10 @@ std::unique_ptr<BenchmarkReporter> CreateReporter(
   }
 }
 
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
 }  // end namespace
 
 bool IsZero(double n) {
@@ -312,7 +409,7 @@ bool IsZero(double n) {
 
 ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) {
   int output_opts = ConsoleReporter::OO_Defaults;
-  auto is_benchmark_color = [force_no_color] () -> bool {
+  auto is_benchmark_color = [force_no_color]() -> bool {
     if (force_no_color) {
       return false;
     }
@@ -372,7 +469,7 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
   if (!fname.empty()) {
     output_file.open(fname);
     if (!output_file.is_open()) {
-      Err << "invalid file name: '" << fname << std::endl;
+      Err << "invalid file name: '" << fname << "'" << std::endl;
       std::exit(1);
     }
     if (!file_reporter) {
@@ -393,7 +490,8 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
   }
 
   if (FLAGS_benchmark_list_tests) {
-    for (auto const& benchmark : benchmarks) Out << benchmark.name << "\n";
+    for (auto const& benchmark : benchmarks)
+      Out << benchmark.name().str() << "\n";
   } else {
     internal::RunBenchmarks(benchmarks, display_reporter, file_reporter);
   }
@@ -405,6 +503,16 @@ void RegisterMemoryManager(MemoryManager* manager) {
   internal::memory_manager = manager;
 }
 
+void AddCustomContext(const std::string& key, const std::string& value) {
+  if (internal::global_context == nullptr) {
+    internal::global_context = new std::map<std::string, std::string>();
+  }
+  if (!internal::global_context->emplace(key, value).second) {
+    std::cerr << "Failed to add custom context \"" << key << "\" as it already "
+              << "exists with value \"" << value << "\"\n";
+  }
+}
+
 namespace internal {
 
 void PrintUsageAndExit() {
@@ -414,6 +522,7 @@ void PrintUsageAndExit() {
           "          [--benchmark_filter=<regex>]\n"
           "          [--benchmark_min_time=<min_time>]\n"
           "          [--benchmark_repetitions=<num_repetitions>]\n"
+          "          [--benchmark_enable_random_interleaving={true|false}]\n"
           "          [--benchmark_report_aggregates_only={true|false}]\n"
           "          [--benchmark_display_aggregates_only={true|false}]\n"
           "          [--benchmark_format=<console|json|csv>]\n"
@@ -421,6 +530,7 @@ void PrintUsageAndExit() {
           "          [--benchmark_out_format=<json|console|csv>]\n"
           "          [--benchmark_color={auto|true|false}]\n"
           "          [--benchmark_counters_tabular={true|false}]\n"
+          "          [--benchmark_context=<key>=<value>,...]\n"
           "          [--v=<verbosity>]\n");
   exit(0);
 }
@@ -429,7 +539,7 @@ void ParseCommandLineFlags(int* argc, char** argv) {
   using namespace benchmark;
   BenchmarkReporter::Context::executable_name =
       (argc && *argc > 0) ? argv[0] : "unknown";
-  for (int i = 1; i < *argc; ++i) {
+  for (int i = 1; argc && i < *argc; ++i) {
     if (ParseBoolFlag(argv[i], "benchmark_list_tests",
                       &FLAGS_benchmark_list_tests) ||
         ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
@@ -437,6 +547,8 @@ void ParseCommandLineFlags(int* argc, char** argv) {
                         &FLAGS_benchmark_min_time) ||
         ParseInt32Flag(argv[i], "benchmark_repetitions",
                        &FLAGS_benchmark_repetitions) ||
+        ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving",
+                      &FLAGS_benchmark_enable_random_interleaving) ||
         ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
                       &FLAGS_benchmark_report_aggregates_only) ||
         ParseBoolFlag(argv[i], "benchmark_display_aggregates_only",
@@ -451,6 +563,10 @@ void ParseCommandLineFlags(int* argc, char** argv) {
         ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) ||
         ParseBoolFlag(argv[i], "benchmark_counters_tabular",
                       &FLAGS_benchmark_counters_tabular) ||
+        ParseStringFlag(argv[i], "benchmark_perf_counters",
+                        &FLAGS_benchmark_perf_counters) ||
+        ParseKeyValueFlag(argv[i], "benchmark_context",
+                          &FLAGS_benchmark_context) ||
         ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
       for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1];
 
@@ -461,13 +577,17 @@ void ParseCommandLineFlags(int* argc, char** argv) {
     }
   }
   for (auto const* flag :
-       {&FLAGS_benchmark_format, &FLAGS_benchmark_out_format})
+       {&FLAGS_benchmark_format, &FLAGS_benchmark_out_format}) {
     if (*flag != "console" && *flag != "json" && *flag != "csv") {
       PrintUsageAndExit();
     }
+  }
   if (FLAGS_benchmark_color.empty()) {
     PrintUsageAndExit();
   }
+  for (const auto& kv : FLAGS_benchmark_context) {
+    AddCustomContext(kv.first, kv.second);
+  }
 }
 
 int InitializeStreams() {
@@ -482,6 +602,10 @@ void Initialize(int* argc, char** argv) {
   internal::LogLevel() = FLAGS_v;
 }
 
+void Shutdown() {
+  delete internal::global_context;
+}
+
 bool ReportUnrecognizedArguments(int argc, char** argv) {
   for (int i = 1; i < argc; ++i) {
     fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0],

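The user-visible features added above are random interleaving of repetitions and custom report context. A minimal sketch of both, assuming a hypothetical BM_memcpy benchmark and binary (not part of this patch):

    #include <benchmark/benchmark.h>

    #include <cstring>
    #include <vector>

    static void BM_memcpy(benchmark::State& state) {
      std::vector<char> src(static_cast<size_t>(state.range(0)));
      std::vector<char> dst(src.size());
      for (auto _ : state)
        std::memcpy(dst.data(), src.data(), src.size());
    }
    // With --benchmark_enable_random_interleaving=true, the repetitions of
    // each instance are shuffled in among all other scheduled repetitions
    // instead of running back-to-back.
    BENCHMARK(BM_memcpy)->Arg(8)->Arg(4096)->Repetitions(3);

    int main(int argc, char** argv) {
      benchmark::Initialize(&argc, argv);
      // Same effect as passing --benchmark_context=machine=test-box;
      // duplicate keys are reported to stderr and ignored.
      benchmark::AddCustomContext("machine", "test-box");
      benchmark::RunSpecifiedBenchmarks();
      benchmark::Shutdown();  // frees the global context map
      return 0;
    }
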
diff --git a/libcxx/utils/google-benchmark/src/benchmark_api_internal.cc b/libcxx/utils/google-benchmark/src/benchmark_api_internal.cc
index 8d3108363b8c1..89da519afc8cb 100644
--- a/libcxx/utils/google-benchmark/src/benchmark_api_internal.cc
+++ b/libcxx/utils/google-benchmark/src/benchmark_api_internal.cc
@@ -1,15 +1,94 @@
 #include "benchmark_api_internal.h"
 
+#include <cinttypes>
+
+#include "string_util.h"
+
 namespace benchmark {
 namespace internal {
 
+BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
+                                     int per_family_instance_idx,
+                                     const std::vector<int64_t>& args,
+                                     int thread_count)
+    : benchmark_(*benchmark),
+      family_index_(family_idx),
+      per_family_instance_index_(per_family_instance_idx),
+      aggregation_report_mode_(benchmark_.aggregation_report_mode_),
+      args_(args),
+      time_unit_(benchmark_.time_unit_),
+      measure_process_cpu_time_(benchmark_.measure_process_cpu_time_),
+      use_real_time_(benchmark_.use_real_time_),
+      use_manual_time_(benchmark_.use_manual_time_),
+      complexity_(benchmark_.complexity_),
+      complexity_lambda_(benchmark_.complexity_lambda_),
+      statistics_(benchmark_.statistics_),
+      repetitions_(benchmark_.repetitions_),
+      min_time_(benchmark_.min_time_),
+      iterations_(benchmark_.iterations_),
+      threads_(thread_count) {
+  name_.function_name = benchmark_.name_;
+
+  size_t arg_i = 0;
+  for (const auto& arg : args) {
+    if (!name_.args.empty()) {
+      name_.args += '/';
+    }
+
+    if (arg_i < benchmark_.arg_names_.size()) {
+      const auto& arg_name = benchmark_.arg_names_[arg_i];
+      if (!arg_name.empty()) {
+        name_.args += StrFormat("%s:", arg_name.c_str());
+      }
+    }
+
+    name_.args += StrFormat("%" PRId64, arg);
+    ++arg_i;
+  }
+
+  if (!IsZero(benchmark_.min_time_)) {
+    name_.min_time = StrFormat("min_time:%0.3f", benchmark_.min_time_);
+  }
+
+  if (benchmark_.iterations_ != 0) {
+    name_.iterations = StrFormat(
+        "iterations:%lu", static_cast<unsigned long>(benchmark_.iterations_));
+  }
+
+  if (benchmark_.repetitions_ != 0) {
+    name_.repetitions = StrFormat("repeats:%d", benchmark_.repetitions_);
+  }
+
+  if (benchmark_.measure_process_cpu_time_) {
+    name_.time_type = "process_time";
+  }
+
+  if (benchmark_.use_manual_time_) {
+    if (!name_.time_type.empty()) {
+      name_.time_type += '/';
+    }
+    name_.time_type += "manual_time";
+  } else if (benchmark_.use_real_time_) {
+    if (!name_.time_type.empty()) {
+      name_.time_type += '/';
+    }
+    name_.time_type += "real_time";
+  }
+
+  if (!benchmark_.thread_counts_.empty()) {
+    name_.threads = StrFormat("threads:%d", threads_);
+  }
+}
+
 State BenchmarkInstance::Run(
-    size_t iters, int thread_id, internal::ThreadTimer* timer,
-    internal::ThreadManager* manager) const {
-  State st(iters, arg, thread_id, threads, timer, manager);
-  benchmark->Run(st);
+    IterationCount iters, int thread_id, internal::ThreadTimer* timer,
+    internal::ThreadManager* manager,
+    internal::PerfCountersMeasurement* perf_counters_measurement) const {
+  State st(iters, args_, thread_id, threads_, timer, manager,
+           perf_counters_measurement);
+  benchmark_.Run(st);
   return st;
 }
 
-}  // internal
-}  // benchmark
+}  // namespace internal
+}  // namespace benchmark

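The constructor above centralizes the name-building that FindBenchmarks() used to do with ad-hoc string concatenation: each configuration knob lands in its own BenchmarkName field. A sketch of the mapping for a hypothetical registration:

    BENCHMARK(BM_set_insert)
        ->ArgName("n")->Arg(1024)
        ->MinTime(0.5)
        ->UseRealTime()
        ->Threads(2);
    // Resulting BenchmarkName fields:
    //   function_name = "BM_set_insert"
    //   args          = "n:1024"
    //   min_time      = "min_time:0.500"
    //   time_type     = "real_time"
    //   threads       = "threads:2"
    // name().str() joins the non-empty fields with '/':
    //   "BM_set_insert/n:1024/min_time:0.500/real_time/threads:2"
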
diff --git a/libcxx/utils/google-benchmark/src/benchmark_api_internal.h b/libcxx/utils/google-benchmark/src/benchmark_api_internal.h
index 0524a85c01d00..9296b7d2c8165 100644
--- a/libcxx/utils/google-benchmark/src/benchmark_api_internal.h
+++ b/libcxx/utils/google-benchmark/src/benchmark_api_internal.h
@@ -1,9 +1,6 @@
 #ifndef BENCHMARK_API_INTERNAL_H
 #define BENCHMARK_API_INTERNAL_H
 
-#include "benchmark/benchmark.h"
-#include "commandlineflags.h"
-
 #include <cmath>
 #include <iosfwd>
 #include <limits>
@@ -11,31 +8,60 @@
 #include <string>
 #include <vector>
 
+#include "benchmark/benchmark.h"
+#include "commandlineflags.h"
+
 namespace benchmark {
 namespace internal {
 
 // Information kept per benchmark we may want to run
-struct BenchmarkInstance {
-  std::string name;
-  Benchmark* benchmark;
-  AggregationReportMode aggregation_report_mode;
-  std::vector<int64_t> arg;
-  TimeUnit time_unit;
-  int range_multiplier;
-  bool use_real_time;
-  bool use_manual_time;
-  BigO complexity;
-  BigOFunc* complexity_lambda;
-  UserCounters counters;
-  const std::vector<Statistics>* statistics;
-  bool last_benchmark_instance;
-  int repetitions;
-  double min_time;
-  size_t iterations;
-  int threads;  // Number of concurrent threads to us
-
-  State Run(size_t iters, int thread_id, internal::ThreadTimer* timer,
-            internal::ThreadManager* manager) const;
+class BenchmarkInstance {
+ public:
+  BenchmarkInstance(Benchmark* benchmark, int family_index,
+                    int per_family_instance_index,
+                    const std::vector<int64_t>& args, int threads);
+
+  const BenchmarkName& name() const { return name_; }
+  int family_index() const { return family_index_; }
+  int per_family_instance_index() const { return per_family_instance_index_; }
+  AggregationReportMode aggregation_report_mode() const {
+    return aggregation_report_mode_;
+  }
+  TimeUnit time_unit() const { return time_unit_; }
+  bool measure_process_cpu_time() const { return measure_process_cpu_time_; }
+  bool use_real_time() const { return use_real_time_; }
+  bool use_manual_time() const { return use_manual_time_; }
+  BigO complexity() const { return complexity_; }
+  BigOFunc& complexity_lambda() const { return *complexity_lambda_; }
+  const std::vector<Statistics>& statistics() const { return statistics_; }
+  int repetitions() const { return repetitions_; }
+  double min_time() const { return min_time_; }
+  IterationCount iterations() const { return iterations_; }
+  int threads() const { return threads_; }
+
+  State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
+            internal::ThreadManager* manager,
+            internal::PerfCountersMeasurement* perf_counters_measurement) const;
+
+ private:
+  BenchmarkName name_;
+  Benchmark& benchmark_;
+  const int family_index_;
+  const int per_family_instance_index_;
+  AggregationReportMode aggregation_report_mode_;
+  const std::vector<int64_t>& args_;
+  TimeUnit time_unit_;
+  bool measure_process_cpu_time_;
+  bool use_real_time_;
+  bool use_manual_time_;
+  BigO complexity_;
+  BigOFunc* complexity_lambda_;
+  UserCounters counters_;
+  const std::vector<Statistics>& statistics_;
+  int repetitions_;
+  double min_time_;
+  IterationCount iterations_;
+  int threads_;  // Number of concurrent threads to use
 };
 
 bool FindBenchmarksInternal(const std::string& re,

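Since BenchmarkInstance is now a class whose fields are computed once in the constructor and exposed read-only, call sites switch from member access to accessors. Sketch of the change, with `b` standing in for any BenchmarkInstance pointer:

    // old:  if (b->complexity != oNone) ...
    // new:  if (b->complexity() != oNone) ...
    //
    // old:  State st = b->Run(iters, thread_id, &timer, manager);
    // new:  State st = b->Run(iters, thread_id, &timer, manager,
    //                         perf_counters_measurement);
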
diff --git a/libcxx/utils/google-benchmark/src/benchmark_name.cc b/libcxx/utils/google-benchmark/src/benchmark_name.cc
new file mode 100644
index 0000000000000..2a17ebce277f5
--- /dev/null
+++ b/libcxx/utils/google-benchmark/src/benchmark_name.cc
@@ -0,0 +1,58 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <benchmark/benchmark.h>
+
+namespace benchmark {
+
+namespace {
+
+// Compute the total size of a pack of std::strings
+size_t size_impl() { return 0; }
+
+template <typename Head, typename... Tail>
+size_t size_impl(const Head& head, const Tail&... tail) {
+  return head.size() + size_impl(tail...);
+}
+
+// Join a pack of std::strings using a delimiter
+// TODO: use absl::StrJoin
+void join_impl(std::string&, char) {}
+
+template <typename Head, typename... Tail>
+void join_impl(std::string& s, const char delimiter, const Head& head,
+               const Tail&... tail) {
+  if (!s.empty() && !head.empty()) {
+    s += delimiter;
+  }
+
+  s += head;
+
+  join_impl(s, delimiter, tail...);
+}
+
+template <typename... Ts>
+std::string join(char delimiter, const Ts&... ts) {
+  std::string s;
+  s.reserve(sizeof...(Ts) + size_impl(ts...));
+  join_impl(s, delimiter, ts...);
+  return s;
+}
+}  // namespace
+
+std::string BenchmarkName::str() const {
+  return join('/', function_name, args, min_time, iterations, repetitions,
+              time_type, threads);
+}
+}  // namespace benchmark

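join() only inserts the delimiter between non-empty components, so str() can pass every field unconditionally. An illustration of the behavior (values hypothetical):

    // join('/', "BM_foo", "", "min_time:0.500", "", "", "real_time", "")
    //   == "BM_foo/min_time:0.500/real_time"
    // Empty fields contribute neither text nor a '/', so a benchmark with
    // no extra configuration prints as just its function name.
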
diff --git a/libcxx/utils/google-benchmark/src/benchmark_register.cc b/libcxx/utils/google-benchmark/src/benchmark_register.cc
index f17f5b223cec6..574462220e7c2 100644
--- a/libcxx/utils/google-benchmark/src/benchmark_register.cc
+++ b/libcxx/utils/google-benchmark/src/benchmark_register.cc
@@ -24,6 +24,7 @@
 
 #include <algorithm>
 #include <atomic>
+#include <cinttypes>
 #include <condition_variable>
 #include <cstdio>
 #include <cstdlib>
@@ -31,6 +32,7 @@
 #include <fstream>
 #include <iostream>
 #include <memory>
+#include <numeric>
 #include <sstream>
 #include <thread>
 
@@ -127,8 +129,13 @@ bool BenchmarkFamilies::FindBenchmarks(
   // Special list of thread counts to use when none are specified
   const std::vector<int> one_thread = {1};
 
+  int next_family_index = 0;
+
   MutexLock l(mutex_);
   for (std::unique_ptr<Benchmark>& family : families_) {
+    int family_index = next_family_index;
+    int per_family_instance_index = 0;
+
     // Family was deleted or benchmark doesn't match
     if (!family) continue;
 
@@ -148,71 +155,24 @@ bool BenchmarkFamilies::FindBenchmarks(
     }
     // reserve in the special case the regex ".", since we know the final
     // family size.
-    if (spec == ".") benchmarks->reserve(family_size);
+    if (spec == ".") benchmarks->reserve(benchmarks->size() + family_size);
 
     for (auto const& args : family->args_) {
       for (int num_threads : *thread_counts) {
-        BenchmarkInstance instance;
-        instance.name = family->name_;
-        instance.benchmark = family.get();
-        instance.aggregation_report_mode = family->aggregation_report_mode_;
-        instance.arg = args;
-        instance.time_unit = family->time_unit_;
-        instance.range_multiplier = family->range_multiplier_;
-        instance.min_time = family->min_time_;
-        instance.iterations = family->iterations_;
-        instance.repetitions = family->repetitions_;
-        instance.use_real_time = family->use_real_time_;
-        instance.use_manual_time = family->use_manual_time_;
-        instance.complexity = family->complexity_;
-        instance.complexity_lambda = family->complexity_lambda_;
-        instance.statistics = &family->statistics_;
-        instance.threads = num_threads;
-
-        // Add arguments to instance name
-        size_t arg_i = 0;
-        for (auto const& arg : args) {
-          instance.name += "/";
-
-          if (arg_i < family->arg_names_.size()) {
-            const auto& arg_name = family->arg_names_[arg_i];
-            if (!arg_name.empty()) {
-              instance.name +=
-                  StrFormat("%s:", family->arg_names_[arg_i].c_str());
-            }
-          }
-
-          // we know that the args are always non-negative (see 'AddRange()'),
-          // thus print as 'unsigned'. BUT, do a cast due to the 32-bit builds.
-          instance.name += StrFormat("%lu", static_cast<unsigned long>(arg));
-          ++arg_i;
-        }
+        BenchmarkInstance instance(family.get(), family_index,
+                                   per_family_instance_index, args,
+                                   num_threads);
 
-        if (!IsZero(family->min_time_))
-          instance.name += StrFormat("/min_time:%0.3f", family->min_time_);
-        if (family->iterations_ != 0) {
-          instance.name +=
-              StrFormat("/iterations:%lu",
-                        static_cast<unsigned long>(family->iterations_));
-        }
-        if (family->repetitions_ != 0)
-          instance.name += StrFormat("/repeats:%d", family->repetitions_);
-
-        if (family->use_manual_time_) {
-          instance.name += "/manual_time";
-        } else if (family->use_real_time_) {
-          instance.name += "/real_time";
-        }
+        const auto full_name = instance.name().str();
+        if ((re.Match(full_name) && !isNegativeFilter) ||
+            (!re.Match(full_name) && isNegativeFilter)) {
+          benchmarks->push_back(std::move(instance));
 
-        // Add the number of threads used to the name
-        if (!family->thread_counts_.empty()) {
-          instance.name += StrFormat("/threads:%d", instance.threads);
-        }
+          ++per_family_instance_index;
 
-        if ((re.Match(instance.name) && !isNegativeFilter) ||
-            (!re.Match(instance.name) && isNegativeFilter)) {
-          instance.last_benchmark_instance = (&args == &family->args_.back());
-          benchmarks->push_back(std::move(instance));
+          // Only bump the next family index once we've established that
+          // at least one instance of this family will be run.
+          if (next_family_index == family_index) ++next_family_index;
         }
       }
     }
@@ -247,6 +207,7 @@ Benchmark::Benchmark(const char* name)
       min_time_(0),
       iterations_(0),
       repetitions_(0),
+      measure_process_cpu_time_(false),
       use_real_time_(false),
       use_manual_time_(false),
       complexity_(oNone),
@@ -258,6 +219,11 @@ Benchmark::Benchmark(const char* name)
 
 Benchmark::~Benchmark() {}
 
+Benchmark* Benchmark::Name(const std::string& name) {
+  SetName(name.c_str());
+  return this;
+}
+
 Benchmark* Benchmark::Arg(int64_t x) {
   CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
   args_.push_back({x});
@@ -284,33 +250,41 @@ Benchmark* Benchmark::Ranges(
     const std::vector<std::pair<int64_t, int64_t>>& ranges) {
   CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
   std::vector<std::vector<int64_t>> arglists(ranges.size());
-  std::size_t total = 1;
   for (std::size_t i = 0; i < ranges.size(); i++) {
     AddRange(&arglists[i], ranges[i].first, ranges[i].second,
              range_multiplier_);
-    total *= arglists[i].size();
   }
 
-  std::vector<std::size_t> ctr(arglists.size(), 0);
+  ArgsProduct(arglists);
 
-  for (std::size_t i = 0; i < total; i++) {
-    std::vector<int64_t> tmp;
-    tmp.reserve(arglists.size());
-
-    for (std::size_t j = 0; j < arglists.size(); j++) {
-      tmp.push_back(arglists[j].at(ctr[j]));
-    }
+  return this;
+}
 
-    args_.push_back(std::move(tmp));
+Benchmark* Benchmark::ArgsProduct(
+    const std::vector<std::vector<int64_t>>& arglists) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(arglists.size()));
 
-    for (std::size_t j = 0; j < arglists.size(); j++) {
-      if (ctr[j] + 1 < arglists[j].size()) {
-        ++ctr[j];
-        break;
-      }
-      ctr[j] = 0;
+  std::vector<std::size_t> indices(arglists.size());
+  const std::size_t total = std::accumulate(
+      std::begin(arglists), std::end(arglists), std::size_t{1},
+      [](const std::size_t res, const std::vector<int64_t>& arglist) {
+        return res * arglist.size();
+      });
+  std::vector<int64_t> args;
+  args.reserve(arglists.size());
+  for (std::size_t i = 0; i < total; i++) {
+    for (std::size_t arg = 0; arg < arglists.size(); arg++) {
+      args.push_back(arglists[arg][indices[arg]]);
     }
+    args_.push_back(args);
+    args.clear();
+
+    std::size_t arg = 0;
+    do {
+      indices[arg] = (indices[arg] + 1) % arglists[arg].size();
+    } while (indices[arg++] == 0 && arg < arglists.size());
   }
+
   return this;
 }
 
@@ -328,7 +302,6 @@ Benchmark* Benchmark::ArgNames(const std::vector<std::string>& names) {
 
 Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
   CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
-  CHECK_GE(start, 0);
   CHECK_LE(start, limit);
   for (int64_t arg = start; arg <= limit; arg += step) {
     args_.push_back({arg});
@@ -360,7 +333,7 @@ Benchmark* Benchmark::MinTime(double t) {
   return this;
 }
 
-Benchmark* Benchmark::Iterations(size_t n) {
+Benchmark* Benchmark::Iterations(IterationCount n) {
   CHECK(n > 0);
   CHECK(IsZero(min_time_));
   iterations_ = n;
@@ -394,6 +367,12 @@ Benchmark* Benchmark::DisplayAggregatesOnly(bool value) {
   return this;
 }
 
+Benchmark* Benchmark::MeasureProcessCPUTime() {
+  // Can be used together with UseRealTime() / UseManualTime().
+  measure_process_cpu_time_ = true;
+  return this;
+}
+
 Benchmark* Benchmark::UseRealTime() {
   CHECK(!use_manual_time_)
       << "Cannot set UseRealTime and UseManualTime simultaneously.";

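Ranges() is now a thin wrapper over the new ArgsProduct(), which walks the cartesian product with an odometer over per-list indices; the first argument list varies fastest. A usage sketch with a hypothetical BM_lookup:

    BENCHMARK(BM_lookup)->ArgsProduct({{1, 2, 4}, {10, 20}});
    // registers the same instances as:
    //   ->Args({1, 10})->Args({2, 10})->Args({4, 10})
    //   ->Args({1, 20})->Args({2, 20})->Args({4, 20})
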
diff --git a/libcxx/utils/google-benchmark/src/benchmark_register.h b/libcxx/utils/google-benchmark/src/benchmark_register.h
index 0705e219f2fa2..09496607f224d 100644
--- a/libcxx/utils/google-benchmark/src/benchmark_register.h
+++ b/libcxx/utils/google-benchmark/src/benchmark_register.h
@@ -1,33 +1,108 @@
 #ifndef BENCHMARK_REGISTER_H
 #define BENCHMARK_REGISTER_H
 
+#include <limits>
 #include <vector>
 
 #include "check.h"
 
+namespace benchmark {
+namespace internal {
+
+// Append the powers of 'mult' in the closed interval [lo, hi].
+// Returns iterator to the start of the inserted range.
 template <typename T>
-void AddRange(std::vector<T>* dst, T lo, T hi, int mult) {
+typename std::vector<T>::iterator
+AddPowers(std::vector<T>* dst, T lo, T hi, int mult) {
   CHECK_GE(lo, 0);
   CHECK_GE(hi, lo);
   CHECK_GE(mult, 2);
 
-  // Add "lo"
-  dst->push_back(lo);
+  const size_t start_offset = dst->size();
 
   static const T kmax = std::numeric_limits<T>::max();
 
-  // Now space out the benchmarks in multiples of "mult"
-  for (T i = 1; i < kmax / mult; i *= mult) {
-    if (i >= hi) break;
-    if (i > lo) {
+  // Space out the values in multiples of "mult"
+  for (T i = static_cast<T>(1); i <= hi; i *= mult) {
+    if (i >= lo) {
       dst->push_back(i);
     }
+    // Break the loop here since multiplying by
+    // 'mult' would move outside of the range of T
+    if (i > kmax / mult) break;
+  }
+
+  return dst->begin() + start_offset;
+}
+
+template <typename T>
+void AddNegatedPowers(std::vector<T>* dst, T lo, T hi, int mult) {
+  // We negate lo and hi so we require that they cannot be equal to 'min'.
+  CHECK_GT(lo, std::numeric_limits<T>::min());
+  CHECK_GT(hi, std::numeric_limits<T>::min());
+  CHECK_GE(hi, lo);
+  CHECK_LE(hi, 0);
+
+  // Add positive powers, then negate and reverse.
+  // Casts necessary since small integers get promoted
+  // to 'int' when negating.
+  const auto lo_complement = static_cast<T>(-lo);
+  const auto hi_complement = static_cast<T>(-hi);
+
+  const auto it = AddPowers(dst, hi_complement, lo_complement, mult);
+
+  std::for_each(it, dst->end(), [](T& t) { t *= -1; });
+  std::reverse(it, dst->end());
+}
+
+template <typename T>
+void AddRange(std::vector<T>* dst, T lo, T hi, int mult) {
+  static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
+                "Args type must be a signed integer");
+
+  CHECK_GE(hi, lo);
+  CHECK_GE(mult, 2);
+
+  // Add "lo"
+  dst->push_back(lo);
+
+  // Handle lo == hi as a special case, so we then know
+  // lo < hi and so it is safe to add 1 to lo and subtract 1
+  // from hi without falling outside of the range of T.
+  if (lo == hi) return;
+
+  // Ensure that lo_inner <= hi_inner below.
+  if (lo + 1 == hi) {
+    dst->push_back(hi);
+    return;
   }
 
-  // Add "hi" (if 
diff erent from "lo")
-  if (hi != lo) {
+  // Add all powers of 'mult' in the range [lo+1, hi-1] (inclusive).
+  const auto lo_inner = static_cast<T>(lo + 1);
+  const auto hi_inner = static_cast<T>(hi - 1);
+
+  // Insert negative values
+  if (lo_inner < 0) {
+    AddNegatedPowers(dst, lo_inner, std::min(hi_inner, T{-1}), mult);
+  }
+
+  // Treat 0 as a special case (see discussion on #762).
+  if (lo < 0 && hi >= 0) {
+    dst->push_back(0);
+  }
+
+  // Insert positive values
+  if (hi_inner > 0) {
+    AddPowers(dst, std::max(lo_inner, T{1}), hi_inner, mult);
+  }
+
+  // Add "hi" (if 
diff erent from last value).
+  if (hi != dst->back()) {
     dst->push_back(hi);
   }
 }
 
+}  // namespace internal
+}  // namespace benchmark
+
 #endif  // BENCHMARK_REGISTER_H

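With the old CHECK_GE(lo, 0) restriction gone, AddRange() now emits lo, the negated powers of 'mult' below zero, a special-cased 0, the positive powers, and finally hi. A worked example, assuming a hypothetical benchmark with RangeMultiplier(2):

    BENCHMARK(BM_foo)->RangeMultiplier(2)->Range(-8, 8);
    // AddRange(&dst, -8, 8, /*mult=*/2) produces:
    //   -8, -4, -2, -1, 0, 1, 2, 4, 8
    // i.e. lo, AddNegatedPowers() over [-7, -1], 0 (see the discussion on
    // #762), AddPowers() over [1, 7], and hi.
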
diff --git a/libcxx/utils/google-benchmark/src/benchmark_runner.cc b/libcxx/utils/google-benchmark/src/benchmark_runner.cc
index 38faeec8e3ee7..6742d42dbecd3 100644
--- a/libcxx/utils/google-benchmark/src/benchmark_runner.cc
+++ b/libcxx/utils/google-benchmark/src/benchmark_runner.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "benchmark_runner.h"
+
 #include "benchmark/benchmark.h"
 #include "benchmark_api_internal.h"
 #include "internal_macros.h"
@@ -45,6 +46,7 @@
 #include "internal_macros.h"
 #include "log.h"
 #include "mutex.h"
+#include "perf_counters.h"
 #include "re.h"
 #include "statistics.h"
 #include "string_util.h"
@@ -59,34 +61,41 @@ MemoryManager* memory_manager = nullptr;
 
 namespace {
 
-static const size_t kMaxIterations = 1000000000;
+static constexpr IterationCount kMaxIterations = 1000000000;
 
 BenchmarkReporter::Run CreateRunReport(
     const benchmark::internal::BenchmarkInstance& b,
-    const internal::ThreadManager::Result& results, size_t memory_iterations,
-    const MemoryManager::Result& memory_result, double seconds) {
+    const internal::ThreadManager::Result& results,
+    IterationCount memory_iterations,
+    const MemoryManager::Result& memory_result, double seconds,
+    int64_t repetition_index, int64_t repeats) {
   // Create report about this benchmark run.
   BenchmarkReporter::Run report;
 
-  report.run_name = b.name;
+  report.run_name = b.name();
+  report.family_index = b.family_index();
+  report.per_family_instance_index = b.per_family_instance_index();
   report.error_occurred = results.has_error_;
   report.error_message = results.error_message_;
   report.report_label = results.report_label_;
   // This is the total iterations across all threads.
   report.iterations = results.iterations;
-  report.time_unit = b.time_unit;
+  report.time_unit = b.time_unit();
+  report.threads = b.threads();
+  report.repetition_index = repetition_index;
+  report.repetitions = repeats;
 
   if (!report.error_occurred) {
-    if (b.use_manual_time) {
+    if (b.use_manual_time()) {
       report.real_accumulated_time = results.manual_time_used;
     } else {
       report.real_accumulated_time = results.real_time_used;
     }
     report.cpu_accumulated_time = results.cpu_time_used;
     report.complexity_n = results.complexity_n;
-    report.complexity = b.complexity;
-    report.complexity_lambda = b.complexity_lambda;
-    report.statistics = b.statistics;
+    report.complexity = b.complexity();
+    report.complexity_lambda = b.complexity_lambda();
+    report.statistics = &b.statistics();
     report.counters = results.counters;
 
     if (memory_iterations > 0) {
@@ -98,18 +107,24 @@ BenchmarkReporter::Run CreateRunReport(
       report.max_bytes_used = memory_result.max_bytes_used;
     }
 
-    internal::Finish(&report.counters, results.iterations, seconds, b.threads);
+    internal::Finish(&report.counters, results.iterations, seconds,
+                     b.threads());
   }
   return report;
 }
 
 // Execute one thread of benchmark b for the specified number of iterations.
-// Adds the stats collected for the thread into *total.
-void RunInThread(const BenchmarkInstance* b, size_t iters, int thread_id,
-                 ThreadManager* manager) {
-  internal::ThreadTimer timer;
-  State st = b->Run(iters, thread_id, &timer, manager);
-  CHECK(st.iterations() >= st.max_iterations)
+// Adds the stats collected for the thread into manager->results.
+void RunInThread(const BenchmarkInstance* b, IterationCount iters,
+                 int thread_id, ThreadManager* manager,
+                 PerfCountersMeasurement* perf_counters_measurement) {
+  internal::ThreadTimer timer(
+      b->measure_process_cpu_time()
+          ? internal::ThreadTimer::CreateProcessCpuTime()
+          : internal::ThreadTimer::Create());
+  State st =
+      b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
+  CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
       << "Benchmark returned before State::KeepRunning() returned false!";
   {
     MutexLock l(manager->GetBenchmarkMutex());
@@ -124,225 +139,209 @@ void RunInThread(const BenchmarkInstance* b, size_t iters, int thread_id,
   manager->NotifyThreadComplete();
 }
 
-class BenchmarkRunner {
- public:
-  BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
-                  std::vector<BenchmarkReporter::Run>* complexity_reports_)
-      : b(b_),
-        complexity_reports(*complexity_reports_),
-        min_time(!IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time),
-        repeats(b.repetitions != 0 ? b.repetitions
+}  // end namespace
+
+BenchmarkRunner::BenchmarkRunner(
+    const benchmark::internal::BenchmarkInstance& b_,
+    BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
+    : b(b_),
+      reports_for_family(reports_for_family_),
+      min_time(!IsZero(b.min_time()) ? b.min_time() : FLAGS_benchmark_min_time),
+      repeats(b.repetitions() != 0 ? b.repetitions()
                                    : FLAGS_benchmark_repetitions),
-        has_explicit_iteration_count(b.iterations != 0),
-        pool(b.threads - 1),
-        iters(has_explicit_iteration_count ? b.iterations : 1) {
+      has_explicit_iteration_count(b.iterations() != 0),
+      pool(b.threads() - 1),
+      iters(has_explicit_iteration_count ? b.iterations() : 1),
+      perf_counters_measurement(
+          PerfCounters::Create(StrSplit(FLAGS_benchmark_perf_counters, ','))),
+      perf_counters_measurement_ptr(perf_counters_measurement.IsValid()
+                                        ? &perf_counters_measurement
+                                        : nullptr) {
+  run_results.display_report_aggregates_only =
+      (FLAGS_benchmark_report_aggregates_only ||
+       FLAGS_benchmark_display_aggregates_only);
+  run_results.file_report_aggregates_only =
+      FLAGS_benchmark_report_aggregates_only;
+  if (b.aggregation_report_mode() != internal::ARM_Unspecified) {
     run_results.display_report_aggregates_only =
-        (FLAGS_benchmark_report_aggregates_only ||
-         FLAGS_benchmark_display_aggregates_only);
+        (b.aggregation_report_mode() &
+         internal::ARM_DisplayReportAggregatesOnly);
     run_results.file_report_aggregates_only =
-        FLAGS_benchmark_report_aggregates_only;
-    if (b.aggregation_report_mode != internal::ARM_Unspecified) {
-      run_results.display_report_aggregates_only =
-          (b.aggregation_report_mode &
-           internal::ARM_DisplayReportAggregatesOnly);
-      run_results.file_report_aggregates_only =
-          (b.aggregation_report_mode & internal::ARM_FileReportAggregatesOnly);
-    }
+        (b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
+    CHECK(FLAGS_benchmark_perf_counters.empty() ||
+          perf_counters_measurement.IsValid())
+        << "Perf counters were requested but could not be set up.";
+  }
+}
 
-    for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
-      const bool is_the_first_repetition = repetition_num == 0;
-      DoOneRepetition(is_the_first_repetition);
-    }
+BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
+  VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n";
 
-    // Calculate additional statistics
-    run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
+  std::unique_ptr<internal::ThreadManager> manager;
+  manager.reset(new internal::ThreadManager(b.threads()));
 
-    // Maybe calculate complexity report
-    if ((b.complexity != oNone) && b.last_benchmark_instance) {
-      auto additional_run_stats = ComputeBigO(complexity_reports);
-      run_results.aggregates_only.insert(run_results.aggregates_only.end(),
-                                         additional_run_stats.begin(),
-                                         additional_run_stats.end());
-      complexity_reports.clear();
-    }
+  // Run all but one thread in separate threads
+  for (std::size_t ti = 0; ti < pool.size(); ++ti) {
+    pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
+                           manager.get(), perf_counters_measurement_ptr);
   }
+  // And run one thread here directly.
+  // (If we were asked to run just one thread, we don't create new threads.)
+  // Yes, we need to do this here *after* we start the separate threads.
+  RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr);
 
-  RunResults&& get_results() { return std::move(run_results); }
+  // The main thread has finished. Now let's wait for the other threads.
+  manager->WaitForAllThreads();
+  for (std::thread& thread : pool) thread.join();
 
- private:
-  RunResults run_results;
+  IterationResults i;
+  // Acquire the measurements/counters from the manager, UNDER THE LOCK!
+  {
+    MutexLock l(manager->GetBenchmarkMutex());
+    i.results = manager->results;
+  }
 
-  const benchmark::internal::BenchmarkInstance& b;
-  std::vector<BenchmarkReporter::Run>& complexity_reports;
+  // And get rid of the manager.
+  manager.reset();
 
-  const double min_time;
-  const int repeats;
-  const bool has_explicit_iteration_count;
+  // Adjust real/manual time stats since they were reported per thread.
+  i.results.real_time_used /= b.threads();
+  i.results.manual_time_used /= b.threads();
+  // If we were measuring whole-process CPU usage, adjust the CPU time too.
+  if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();
 
-  std::vector<std::thread> pool;
+  VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
+          << i.results.real_time_used << "\n";
 
-  size_t iters;  // preserved between repetitions!
-  // So only the first repetition has to find/calculate it,
-  // the other repetitions will just use that precomputed iteration count.
+  // By using KeepRunningBatch a benchmark can iterate more times than
+  // requested, so take the iteration count from i.results.
+  i.iters = i.results.iterations / b.threads();
 
-  struct IterationResults {
-    internal::ThreadManager::Result results;
-    size_t iters;
-    double seconds;
-  };
-  IterationResults DoNIterations() {
-    VLOG(2) << "Running " << b.name << " for " << iters << "\n";
+  // Base decisions off of real time if requested by this benchmark.
+  i.seconds = i.results.cpu_time_used;
+  if (b.use_manual_time()) {
+    i.seconds = i.results.manual_time_used;
+  } else if (b.use_real_time()) {
+    i.seconds = i.results.real_time_used;
+  }
 
-    std::unique_ptr<internal::ThreadManager> manager;
-    manager.reset(new internal::ThreadManager(b.threads));
+  return i;
+}
 
-    // Run all but one thread in separate threads
-    for (std::size_t ti = 0; ti < pool.size(); ++ti) {
-      pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
-                             manager.get());
-    }
-    // And run one thread here directly.
-    // (If we were asked to run just one thread, we don't create new threads.)
-    // Yes, we need to do this here *after* we start the separate threads.
-    RunInThread(&b, iters, 0, manager.get());
+IterationCount BenchmarkRunner::PredictNumItersNeeded(
+    const IterationResults& i) const {
+  // See by how much the iteration count should be increased.
+  // Note: Avoid division by zero with max(seconds, 1ns).
+  double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
+  // If our last run was at least 10% of FLAGS_benchmark_min_time then we
+  // use the multiplier directly.
+  // Otherwise we use at most 10 times expansion.
+  // NOTE: When the last run was at least 10% of the min time the max
+  // expansion should be 14x.
+  bool is_significant = (i.seconds / min_time) > 0.1;
+  multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
+  if (multiplier <= 1.0) multiplier = 2.0;
+
+  // So what seems to be the sufficiently-large iteration count? Round up.
+  const IterationCount max_next_iters = static_cast<IterationCount>(
+      std::lround(std::max(multiplier * static_cast<double>(i.iters),
+                           static_cast<double>(i.iters) + 1.0)));
+  // But we do have *some* sanity limits, though.
+  const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
+
+  VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
+  return next_iters;  // rounding was already done in the computation above.
+}
 
-    // The main thread has finished. Now let's wait for the other threads.
-    manager->WaitForAllThreads();
-    for (std::thread& thread : pool) thread.join();
+bool BenchmarkRunner::ShouldReportIterationResults(
+    const IterationResults& i) const {
+  // Determine if this run should be reported;
+  // Either it has run for a sufficient amount of time
+  // or an error was reported.
+  return i.results.has_error_ ||
+         i.iters >= kMaxIterations ||  // Too many iterations already.
+         i.seconds >= min_time ||      // The elapsed time is large enough.
+         // CPU time is specified but the elapsed real time greatly exceeds
+         // the minimum time.
+         // Note that user-provided timers are exempt from this sanity check.
+         ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time());
+}
 
-    IterationResults i;
-    // Acquire the measurements/counters from the manager, UNDER THE LOCK!
-    {
-      MutexLock l(manager->GetBenchmarkMutex());
-      i.results = manager->results;
-    }
+void BenchmarkRunner::DoOneRepetition() {
+  assert(HasRepeatsRemaining() && "Already done all repetitions?");
+
+  const bool is_the_first_repetition = num_repetitions_done == 0;
+  IterationResults i;
+
+  // We *may* be gradually increasing the length (iteration count)
+  // of the benchmark until we decide the results are significant.
+  // And once we do, we report those last results and exit.
+  // Please do note that if there are repetitions, the iteration count
+  // is *only* calculated for the *first* repetition, and other repetitions
+  // simply use that precomputed iteration count.
+  for (;;) {
+    i = DoNIterations();
+
+    // Do we consider the results to be significant?
+    // If we are doing repetitions, and the first repetition was already done,
+    // it has calculated the correct iteration time, so we have run that very
+    // iteration count just now. No need to calculate anything. Just report.
+    // Else, the normal rules apply.
+    const bool results_are_significant = !is_the_first_repetition ||
+                                         has_explicit_iteration_count ||
+                                         ShouldReportIterationResults(i);
+
+    if (results_are_significant) break;  // Good, let's report them!
+
+    // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
+    // iteration count, and run the benchmark again...
+
+    iters = PredictNumItersNeeded(i);
+    assert(iters > i.iters &&
+           "if we did more iterations than we want to do the next time, "
+           "then we should have accepted the current iteration run.");
+  }
 
-    // And get rid of the manager.
+  // Oh, one last thing: we also need to produce the 'memory measurements'.
+  MemoryManager::Result memory_result;
+  IterationCount memory_iterations = 0;
+  if (memory_manager != nullptr) {
+    // Only run a few iterations to reduce the impact of one-time
+    // allocations in benchmarks that are not properly managed.
+    memory_iterations = std::min<IterationCount>(16, iters);
+    memory_manager->Start();
+    std::unique_ptr<internal::ThreadManager> manager;
+    manager.reset(new internal::ThreadManager(1));
+    RunInThread(&b, memory_iterations, 0, manager.get(),
+                perf_counters_measurement_ptr);
+    manager->WaitForAllThreads();
     manager.reset();
 
-    // Adjust real/manual time stats since they were reported per thread.
-    i.results.real_time_used /= b.threads;
-    i.results.manual_time_used /= b.threads;
-
-    VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
-            << i.results.real_time_used << "\n";
-
-    // So for how long were we running?
-    i.iters = iters;
-    // Base decisions off of real time if requested by this benchmark.
-    i.seconds = i.results.cpu_time_used;
-    if (b.use_manual_time) {
-      i.seconds = i.results.manual_time_used;
-    } else if (b.use_real_time) {
-      i.seconds = i.results.real_time_used;
-    }
-
-    return i;
+    memory_manager->Stop(&memory_result);
   }
 
-  size_t PredictNumItersNeeded(const IterationResults& i) const {
-    // See how much iterations should be increased by.
-    // Note: Avoid division by zero with max(seconds, 1ns).
-    double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
-    // If our last run was at least 10% of FLAGS_benchmark_min_time then we
-    // use the multiplier directly.
-    // Otherwise we use at most 10 times expansion.
-    // NOTE: When the last run was at least 10% of the min time the max
-    // expansion should be 14x.
-    bool is_significant = (i.seconds / min_time) > 0.1;
-    multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
-    if (multiplier <= 1.0) multiplier = 2.0;
-
-    // So what seems to be the sufficiently-large iteration count? Round up.
-    const size_t max_next_iters =
-        0.5 + std::max(multiplier * i.iters, i.iters + 1.0);
-    // But we do have *some* sanity limits though..
-    const size_t next_iters = std::min(max_next_iters, kMaxIterations);
-
-    VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
-    return next_iters;  // round up before conversion to integer.
-  }
+  // Ok, now actually report.
+  BenchmarkReporter::Run report =
+      CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
+                      num_repetitions_done, repeats);
 
-  bool ShouldReportIterationResults(const IterationResults& i) const {
-    // Determine if this run should be reported;
-    // Either it has run for a sufficient amount of time
-    // or because an error was reported.
-    return i.results.has_error_ ||
-           i.iters >= kMaxIterations ||  // Too many iterations already.
-           i.seconds >= min_time ||      // The elapsed time is large enough.
-           // CPU time is specified but the elapsed real time greatly exceeds
-           // the minimum time.
-           // Note that user provided timers are except from this sanity check.
-           ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time);
+  if (reports_for_family) {
+    ++reports_for_family->num_runs_done;
+    if (!report.error_occurred) reports_for_family->Runs.push_back(report);
   }
 
-  void DoOneRepetition(bool is_the_first_repetition) {
-    IterationResults i;
-
-    // We *may* be gradually increasing the length (iteration count)
-    // of the benchmark until we decide the results are significant.
-    // And once we do, we report those last results and exit.
-    // Please do note that the if there are repetitions, the iteration count
-    // is *only* calculated for the *first* repetition, and other repetitions
-    // simply use that precomputed iteration count.
-    for (;;) {
-      i = DoNIterations();
-
-      // Do we consider the results to be significant?
-      // If we are doing repetitions, and the first repetition was already done,
-      // it has calculated the correct iteration time, so we have run that very
-      // iteration count just now. No need to calculate anything. Just report.
-      // Else, the normal rules apply.
-      const bool results_are_significant = !is_the_first_repetition ||
-                                           has_explicit_iteration_count ||
-                                           ShouldReportIterationResults(i);
-
-      if (results_are_significant) break;  // Good, let's report them!
-
-      // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
-      // iteration count, and run the benchmark again...
-
-      iters = PredictNumItersNeeded(i);
-      assert(iters > i.iters &&
-             "if we did more iterations than we want to do the next time, "
-             "then we should have accepted the current iteration run.");
-    }
-
-    // Oh, one last thing, we need to also produce the 'memory measurements'..
-    MemoryManager::Result memory_result;
-    size_t memory_iterations = 0;
-    if (memory_manager != nullptr) {
-      // Only run a few iterations to reduce the impact of one-time
-      // allocations in benchmarks that are not properly managed.
-      memory_iterations = std::min<size_t>(16, iters);
-      memory_manager->Start();
-      std::unique_ptr<internal::ThreadManager> manager;
-      manager.reset(new internal::ThreadManager(1));
-      RunInThread(&b, memory_iterations, 0, manager.get());
-      manager->WaitForAllThreads();
-      manager.reset();
-
-      memory_manager->Stop(&memory_result);
-    }
-
-    // Ok, now actualy report.
-    BenchmarkReporter::Run report = CreateRunReport(
-        b, i.results, memory_iterations, memory_result, i.seconds);
+  run_results.non_aggregates.push_back(report);
 
-    if (!report.error_occurred && b.complexity != oNone)
-      complexity_reports.push_back(report);
+  ++num_repetitions_done;
+}
 
-    run_results.non_aggregates.push_back(report);
-  }
-};
+RunResults&& BenchmarkRunner::GetResults() {
+  assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?");
 
-}  // end namespace
+  // Calculate additional statistics over the repetitions of this instance.
+  run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
 
-RunResults RunBenchmark(
-    const benchmark::internal::BenchmarkInstance& b,
-    std::vector<BenchmarkReporter::Run>* complexity_reports) {
-  internal::BenchmarkRunner r(b, complexity_reports);
-  return r.get_results();
+  return std::move(run_results);
 }
 
 }  // end namespace internal

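Worked numbers through the growth heuristic, now housed in PredictNumItersNeeded() (timings hypothetical):

    // min_time = 0.5 s; last run: i.iters = 100, i.seconds = 0.01 s
    //   multiplier     = 0.5 * 1.4 / 0.01 = 70
    //   is_significant = (0.01 / 0.5) > 0.1  -> false, so cap at 10x
    //   next iters     = lround(max(10.0 * 100.0, 101.0)) = 1000
    // Once a run reaches 10% of min_time, the 1.4x-headroom multiplier is
    // used as-is: e.g. i.seconds = 0.2 s gives 0.7 / 0.2 = 3.5x growth.
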
diff --git a/libcxx/utils/google-benchmark/src/benchmark_runner.h b/libcxx/utils/google-benchmark/src/benchmark_runner.h
index 96e8282a11aa8..8a855236b2277 100644
--- a/libcxx/utils/google-benchmark/src/benchmark_runner.h
+++ b/libcxx/utils/google-benchmark/src/benchmark_runner.h
@@ -15,8 +15,13 @@
 #ifndef BENCHMARK_RUNNER_H_
 #define BENCHMARK_RUNNER_H_
 
+#include <thread>
+#include <vector>
+
 #include "benchmark_api_internal.h"
 #include "internal_macros.h"
+#include "perf_counters.h"
+#include "thread_manager.h"
 
 DECLARE_double(benchmark_min_time);
 
@@ -26,6 +31,8 @@ DECLARE_bool(benchmark_report_aggregates_only);
 
 DECLARE_bool(benchmark_display_aggregates_only);
 
+DECLARE_string(benchmark_perf_counters);
+
 namespace benchmark {
 
 namespace internal {
@@ -40,9 +47,57 @@ struct RunResults {
   bool file_report_aggregates_only = false;
 };
 
-RunResults RunBenchmark(
-    const benchmark::internal::BenchmarkInstance& b,
-    std::vector<BenchmarkReporter::Run>* complexity_reports);
+class BenchmarkRunner {
+ public:
+  BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
+                  BenchmarkReporter::PerFamilyRunReports* reports_for_family);
+
+  int GetNumRepeats() const { return repeats; }
+
+  bool HasRepeatsRemaining() const {
+    return GetNumRepeats() != num_repetitions_done;
+  }
+
+  void DoOneRepetition();
+
+  RunResults&& GetResults();
+
+  BenchmarkReporter::PerFamilyRunReports* GetReportsForFamily() const {
+    return reports_for_family;
+  };
+
+ private:
+  RunResults run_results;
+
+  const benchmark::internal::BenchmarkInstance& b;
+  BenchmarkReporter::PerFamilyRunReports* reports_for_family;
+
+  const double min_time;
+  const int repeats;
+  const bool has_explicit_iteration_count;
+
+  int num_repetitions_done = 0;
+
+  std::vector<std::thread> pool;
+
+  IterationCount iters;  // preserved between repetitions!
+  // So only the first repetition has to find/calculate it,
+  // the other repetitions will just use that precomputed iteration count.
+
+  PerfCountersMeasurement perf_counters_measurement;
+  PerfCountersMeasurement* const perf_counters_measurement_ptr;
+
+  struct IterationResults {
+    internal::ThreadManager::Result results;
+    IterationCount iters;
+    double seconds;
+  };
+  IterationResults DoNIterations();
+
+  IterationCount PredictNumItersNeeded(const IterationResults& i) const;
+
+  bool ShouldReportIterationResults(const IterationResults& i) const;
+};
 
 }  // namespace internal
 

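A sketch of how the refactored interface is driven; this mirrors the loop in RunBenchmarks(), with `instance` assumed to be a live BenchmarkInstance:

    internal::BenchmarkRunner runner(instance, /*reports_for_family=*/nullptr);
    while (runner.HasRepeatsRemaining())
      runner.DoOneRepetition();   // repetitions may interleave across runners
    RunResults results = runner.GetResults();  // computes aggregates, moves out
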
diff --git a/libcxx/utils/google-benchmark/src/commandlineflags.cc b/libcxx/utils/google-benchmark/src/commandlineflags.cc
index 734e88bbec686..5724aaa29402e 100644
--- a/libcxx/utils/google-benchmark/src/commandlineflags.cc
+++ b/libcxx/utils/google-benchmark/src/commandlineflags.cc
@@ -14,13 +14,20 @@
 
 #include "commandlineflags.h"
 
+#include <algorithm>
 #include <cctype>
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
 #include <limits>
+#include <map>
+#include <utility>
+
+#include "../src/string_util.h"
 
 namespace benchmark {
+namespace {
+
 // Parses 'str' for a 32-bit signed integer.  If successful, writes
 // the result to *value and returns true; otherwise leaves *value
 // unchanged and returns false.
@@ -75,6 +82,30 @@ bool ParseDouble(const std::string& src_text, const char* str, double* value) {
   return true;
 }
 
+// Parses 'str' into KV pairs. If successful, writes the result to *value and
+// returns true; otherwise leaves *value unchanged and returns false.
+bool ParseKvPairs(const std::string& src_text, const char* str,
+                  std::map<std::string, std::string>* value) {
+  std::map<std::string, std::string> kvs;
+  for (const auto& kvpair : StrSplit(str, ',')) {
+    const auto kv = StrSplit(kvpair, '=');
+    if (kv.size() != 2) {
+      std::cerr << src_text << " is expected to be a comma-separated list of "
+                << "<key>=<value> strings, but actually has value \"" << str
+                << "\".\n";
+      return false;
+    }
+    if (!kvs.emplace(kv[0], kv[1]).second) {
+      std::cerr << src_text << " is expected to contain unique keys but key \""
+                << kv[0] << "\" was repeated.\n";
+      return false;
+    }
+  }
+
+  *value = kvs;
+  return true;
+}
+
 // Returns the name of the environment variable corresponding to the
 // given flag.  For example, FlagToEnvVar("foo") will return
 // "BENCHMARK_FOO" in the open-source version.
@@ -85,47 +116,59 @@ static std::string FlagToEnvVar(const char* flag) {
   for (size_t i = 0; i != flag_str.length(); ++i)
     env_var += static_cast<char>(::toupper(flag_str.c_str()[i]));
 
-  return "BENCHMARK_" + env_var;
+  return env_var;
 }
 
-// Reads and returns the Boolean environment variable corresponding to
-// the given flag; if it's not set, returns default_value.
-//
-// The value is considered true iff it's not "0".
-bool BoolFromEnv(const char* flag, bool default_value) {
+}  // namespace
+
+bool BoolFromEnv(const char* flag, bool default_val) {
   const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = getenv(env_var.c_str());
-  return string_value == nullptr ? default_value
-                                 : strcmp(string_value, "0") != 0;
+  const char* const value_str = getenv(env_var.c_str());
+  return value_str == nullptr ? default_val : IsTruthyFlagValue(value_str);
 }
 
-// Reads and returns a 32-bit integer stored in the environment
-// variable corresponding to the given flag; if it isn't set or
-// doesn't represent a valid 32-bit integer, returns default_value.
-int32_t Int32FromEnv(const char* flag, int32_t default_value) {
+int32_t Int32FromEnv(const char* flag, int32_t default_val) {
   const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = getenv(env_var.c_str());
-  if (string_value == nullptr) {
-    // The environment variable is not set.
-    return default_value;
+  const char* const value_str = getenv(env_var.c_str());
+  int32_t value = default_val;
+  if (value_str == nullptr ||
+      !ParseInt32(std::string("Environment variable ") + env_var, value_str,
+                  &value)) {
+    return default_val;
   }
+  return value;
+}
 
-  int32_t result = default_value;
-  if (!ParseInt32(std::string("Environment variable ") + env_var, string_value,
-                  &result)) {
-    std::cout << "The default value " << default_value << " is used.\n";
-    return default_value;
+double DoubleFromEnv(const char* flag, double default_val) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value_str = getenv(env_var.c_str());
+  double value = default_val;
+  if (value_str == nullptr ||
+      !ParseDouble(std::string("Environment variable ") + env_var, value_str,
+                   &value)) {
+    return default_val;
   }
-
-  return result;
+  return value;
 }
 
-// Reads and returns the string environment variable corresponding to
-// the given flag; if it's not set, returns default_value.
-const char* StringFromEnv(const char* flag, const char* default_value) {
+const char* StringFromEnv(const char* flag, const char* default_val) {
   const std::string env_var = FlagToEnvVar(flag);
   const char* const value = getenv(env_var.c_str());
-  return value == nullptr ? default_value : value;
+  return value == nullptr ? default_val : value;
+}
+
+std::map<std::string, std::string> KvPairsFromEnv(
+    const char* flag, std::map<std::string, std::string> default_val) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value_str = getenv(env_var.c_str());
+
+  if (value_str == nullptr) return default_val;
+
+  std::map<std::string, std::string> value;
+  if (!ParseKvPairs("Environment variable " + env_var, value_str, &value)) {
+    return default_val;
+  }
+  return value;
 }
 
 // Parses a string as a command line flag.  The string should have
@@ -205,14 +248,39 @@ bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
   return true;
 }
 
+bool ParseKeyValueFlag(
+    const char* str, const char* flag,
+    std::map<std::string, std::string>* value) {
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  if (value_str == nullptr) return false;
+
+  for (const auto& kvpair : StrSplit(value_str, ',')) {
+    const auto kv = StrSplit(kvpair, '=');
+    if (kv.size() != 2) return false;
+    value->emplace(kv[0], kv[1]);
+  }
+
+  return true;
+}
+
 bool IsFlag(const char* str, const char* flag) {
   return (ParseFlagValue(str, flag, true) != nullptr);
 }
 
 bool IsTruthyFlagValue(const std::string& value) {
-  if (value.empty()) return true;
-  char ch = value[0];
-  return isalnum(ch) &&
-         !(ch == '0' || ch == 'f' || ch == 'F' || ch == 'n' || ch == 'N');
+  if (value.size() == 1) {
+    char v = value[0];
+    return isalnum(v) &&
+           !(v == '0' || v == 'f' || v == 'F' || v == 'n' || v == 'N');
+  } else if (!value.empty()) {
+    std::string value_lower(value);
+    std::transform(value_lower.begin(), value_lower.end(), value_lower.begin(),
+                   [](char c) { return static_cast<char>(::tolower(c)); });
+    return !(value_lower == "false" || value_lower == "no" ||
+             value_lower == "off");
+  } else
+    return true;
 }
+
 }  // end namespace benchmark
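
[Illustration, not part of the patch] For reference, the new truthiness rules above in one
self-contained sketch: single-character values behave as before, longer values are now also
rejected when they spell "false", "no" or "off" in any letter case, and the empty string
remains truthy.

#include <algorithm>
#include <cassert>
#include <cctype>
#include <string>

// Standalone restatement of the rules implemented in IsTruthyFlagValue().
bool IsTruthy(const std::string& value) {
  if (value.empty()) return true;  // special case: empty means "flag present"
  if (value.size() == 1) {
    const char v = value[0];
    return std::isalnum(static_cast<unsigned char>(v)) &&
           !(v == '0' || v == 'f' || v == 'F' || v == 'n' || v == 'N');
  }
  std::string lower(value);
  std::transform(lower.begin(), lower.end(), lower.begin(),
                 [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
  return !(lower == "false" || lower == "no" || lower == "off");
}

int main() {
  assert(IsTruthy("") && IsTruthy("1") && IsTruthy("yes"));
  assert(!IsTruthy("0") && !IsTruthy("n"));
  assert(!IsTruthy("FALSE") && !IsTruthy("Off"));  // new whole-word spellings
}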

diff  --git a/libcxx/utils/google-benchmark/src/commandlineflags.h b/libcxx/utils/google-benchmark/src/commandlineflags.h
index 945c9a9fc4af3..0c988cccb3ae1 100644
--- a/libcxx/utils/google-benchmark/src/commandlineflags.h
+++ b/libcxx/utils/google-benchmark/src/commandlineflags.h
@@ -2,6 +2,7 @@
 #define BENCHMARK_COMMANDLINEFLAGS_H_
 
 #include <cstdint>
+#include <map>
 #include <string>
 
 // Macro for referencing flags.
@@ -10,31 +11,61 @@
 // Macros for declaring flags.
 #define DECLARE_bool(name) extern bool FLAG(name)
 #define DECLARE_int32(name) extern int32_t FLAG(name)
-#define DECLARE_int64(name) extern int64_t FLAG(name)
 #define DECLARE_double(name) extern double FLAG(name)
 #define DECLARE_string(name) extern std::string FLAG(name)
+#define DECLARE_kvpairs(name) \
+  extern std::map<std::string, std::string> FLAG(name)
 
 // Macros for defining flags.
-#define DEFINE_bool(name, default_val, doc) bool FLAG(name) = (default_val)
-#define DEFINE_int32(name, default_val, doc) int32_t FLAG(name) = (default_val)
-#define DEFINE_int64(name, default_val, doc) int64_t FLAG(name) = (default_val)
-#define DEFINE_double(name, default_val, doc) double FLAG(name) = (default_val)
-#define DEFINE_string(name, default_val, doc) \
-  std::string FLAG(name) = (default_val)
+#define DEFINE_bool(name, default_val) \
+  bool FLAG(name) = benchmark::BoolFromEnv(#name, default_val)
+#define DEFINE_int32(name, default_val) \
+  int32_t FLAG(name) = benchmark::Int32FromEnv(#name, default_val)
+#define DEFINE_double(name, default_val) \
+  double FLAG(name) = benchmark::DoubleFromEnv(#name, default_val)
+#define DEFINE_string(name, default_val) \
+  std::string FLAG(name) = benchmark::StringFromEnv(#name, default_val)
+#define DEFINE_kvpairs(name, default_val)         \
+  std::map<std::string, std::string> FLAG(name) = \
+      benchmark::KvPairsFromEnv(#name, default_val)
 
 namespace benchmark {
-// Parses 'str' for a 32-bit signed integer.  If successful, writes the result
-// to *value and returns true; otherwise leaves *value unchanged and returns
-// false.
-bool ParseInt32(const std::string& src_text, const char* str, int32_t* value);
 
-// Parses a bool/Int32/string from the environment variable
-// corresponding to the given Google Test flag.
+// Parses a bool from the environment variable corresponding to the given flag.
+//
+// If the variable exists, returns IsTruthyFlagValue() applied to its value;
+// if not, returns the given default value.
 bool BoolFromEnv(const char* flag, bool default_val);
+
+// Parses an Int32 from the environment variable corresponding to the given
+// flag.
+//
+// If the variable exists, returns the ParseInt32() result;  if not, returns
+// the given default value.
 int32_t Int32FromEnv(const char* flag, int32_t default_val);
+
+// Parses a Double from the environment variable corresponding to the given
+// flag.
+//
+// If the variable exists, returns the ParseDouble() result;  if not, returns
+// the given default value.
 double DoubleFromEnv(const char* flag, double default_val);
+
+// Parses a string from the environment variable corresponding to the given
+// flag.
+//
+// If the variable exists, returns its value;  if not, returns
+// the given default value.
 const char* StringFromEnv(const char* flag, const char* default_val);
 
+// Parses a set of kvpairs from the environment variable corresponding to the
+// given flag.
+//
+// If the variable exists, returns its value;  if not, returns
+// the given default value.
+std::map<std::string, std::string> KvPairsFromEnv(
+    const char* flag, std::map<std::string, std::string> default_val);
+
 // Parses a string for a bool flag, in the form of either
 // "--flag=value" or "--flag".
 //
@@ -46,34 +77,40 @@ const char* StringFromEnv(const char* flag, const char* default_val);
 // true.  On failure, returns false without changing *value.
 bool ParseBoolFlag(const char* str, const char* flag, bool* value);
 
-// Parses a string for an Int32 flag, in the form of
-// "--flag=value".
+// Parses a string for an Int32 flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
 bool ParseInt32Flag(const char* str, const char* flag, int32_t* value);
 
-// Parses a string for a Double flag, in the form of
-// "--flag=value".
+// Parses a string for a Double flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
 bool ParseDoubleFlag(const char* str, const char* flag, double* value);
 
-// Parses a string for a string flag, in the form of
-// "--flag=value".
+// Parses a string for a string flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
 bool ParseStringFlag(const char* str, const char* flag, std::string* value);
 
+// Parses a string for a kvpairs flag in the form "--flag=key=value,key=value"
+//
+// On success, stores the value of the flag in *value and returns true. On
+// failure returns false, though *value may have been mutated.
+bool ParseKeyValueFlag(const char* str, const char* flag,
+                       std::map<std::string, std::string>* value);
+
 // Returns true if the string matches the flag.
 bool IsFlag(const char* str, const char* flag);
 
 // Returns true unless value starts with one of: '0', 'f', 'F', 'n' or 'N', or
-// some non-alphanumeric character. As a special case, also returns true if
-// value is the empty string.
+// some non-alphanumeric character. Also returns false if the value matches
+// one of 'no', 'false', 'off' (case-insensitive). As a special case, also
+// returns true if value is the empty string.
 bool IsTruthyFlagValue(const std::string& value);
+
 }  // end namespace benchmark
 
 #endif  // BENCHMARK_COMMANDLINEFLAGS_H_
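
[Illustration, not part of the patch] Two things change in this header: the DEFINE_* macros
now initialize each flag from its environment variable (via the *FromEnv helpers, with the
literal as fallback), and a kvpairs flag kind is added whose value is a comma-separated list
of key=value pairs. A hedged sketch of the parsing contract for "--flag=key=value,key=value",
using only the standard library (the library itself uses StrSplit, shown later in
string_util.cc):

#include <iostream>
#include <map>
#include <sstream>
#include <string>

// Split on ',' into pairs, then require exactly one '=' inside each pair,
// mirroring ParseKeyValueFlag()'s kv.size() != 2 rejection.
std::map<std::string, std::string> ParseKvPairsSketch(const std::string& s) {
  std::map<std::string, std::string> out;
  std::stringstream pairs(s);
  std::string kv;
  while (std::getline(pairs, kv, ',')) {
    const size_t eq = kv.find('=');
    if (eq == std::string::npos || kv.find('=', eq + 1) != std::string::npos)
      return {};  // malformed pair: reject the whole flag value
    out.emplace(kv.substr(0, eq), kv.substr(eq + 1));
  }
  return out;
}

int main() {
  for (const auto& kv : ParseKvPairsSketch("cpu=skylake,run=nightly"))
    std::cout << kv.first << " -> " << kv.second << "\n";
}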

diff  --git a/libcxx/utils/google-benchmark/src/complexity.cc b/libcxx/utils/google-benchmark/src/complexity.cc
index 6ef17660c9546..29f7c3b03155a 100644
--- a/libcxx/utils/google-benchmark/src/complexity.cc
+++ b/libcxx/utils/google-benchmark/src/complexity.cc
@@ -29,20 +29,23 @@ BigOFunc* FittingCurve(BigO complexity) {
   static const double kLog2E = 1.44269504088896340736;
   switch (complexity) {
     case oN:
-      return [](int64_t n) -> double { return static_cast<double>(n); };
+      return [](IterationCount n) -> double { return static_cast<double>(n); };
     case oNSquared:
-      return [](int64_t n) -> double { return std::pow(n, 2); };
+      return [](IterationCount n) -> double { return std::pow(n, 2); };
     case oNCubed:
-      return [](int64_t n) -> double { return std::pow(n, 3); };
+      return [](IterationCount n) -> double { return std::pow(n, 3); };
     case oLogN:
       /* Note: can't use log2 because Android's GNU STL lacks it */
-      return [](int64_t n) { return kLog2E * log(static_cast<double>(n)); };
+      return
+          [](IterationCount n) { return kLog2E * log(static_cast<double>(n)); };
     case oNLogN:
       /* Note: can't use log2 because Android's GNU STL lacks it */
-      return [](int64_t n) { return kLog2E * n * log(static_cast<double>(n)); };
+      return [](IterationCount n) {
+        return kLog2E * n * log(static_cast<double>(n));
+      };
     case o1:
     default:
-      return [](int64_t) { return 1.0; };
+      return [](IterationCount) { return 1.0; };
   }
 }
 
@@ -79,7 +82,6 @@ std::string GetBigOString(BigO complexity) {
 LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
                        const std::vector<double>& time,
                        BigOFunc* fitting_curve) {
-  double sigma_gn = 0.0;
   double sigma_gn_squared = 0.0;
   double sigma_time = 0.0;
   double sigma_time_gn = 0.0;
@@ -87,7 +89,6 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
   // Calculate least square fitting parameter
   for (size_t i = 0; i < n.size(); ++i) {
     double gn_i = fitting_curve(n[i]);
-    sigma_gn += gn_i;
     sigma_gn_squared += gn_i * gn_i;
     sigma_time += time[i];
     sigma_time_gn += time[i] * gn_i;
@@ -183,14 +184,21 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
     result_real = MinimalLeastSq(n, real_time, result_cpu.complexity);
   }
 
-  std::string run_name = reports[0].benchmark_name().substr(
-      0, reports[0].benchmark_name().find('/'));
+  // Drop the 'args' when reporting complexity.
+  auto run_name = reports[0].run_name;
+  run_name.args.clear();
 
   // Get the data from the accumulator to BenchmarkReporter::Run's.
   Run big_o;
   big_o.run_name = run_name;
+  big_o.family_index = reports[0].family_index;
+  big_o.per_family_instance_index = reports[0].per_family_instance_index;
   big_o.run_type = BenchmarkReporter::Run::RT_Aggregate;
+  big_o.repetitions = reports[0].repetitions;
+  big_o.repetition_index = Run::no_repetition_index;
+  big_o.threads = reports[0].threads;
   big_o.aggregate_name = "BigO";
+  big_o.report_label = reports[0].report_label;
   big_o.iterations = 0;
   big_o.real_accumulated_time = result_real.coef;
   big_o.cpu_accumulated_time = result_cpu.coef;
@@ -207,11 +215,15 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
   // Only add label to mean/stddev if it is same for all runs
   Run rms;
   rms.run_name = run_name;
-  big_o.report_label = reports[0].report_label;
+  rms.family_index = reports[0].family_index;
+  rms.per_family_instance_index = reports[0].per_family_instance_index;
   rms.run_type = BenchmarkReporter::Run::RT_Aggregate;
   rms.aggregate_name = "RMS";
   rms.report_label = big_o.report_label;
   rms.iterations = 0;
+  rms.repetition_index = Run::no_repetition_index;
+  rms.repetitions = reports[0].repetitions;
+  rms.threads = reports[0].threads;
   rms.real_accumulated_time = result_real.rms / multiplier;
   rms.cpu_accumulated_time = result_cpu.rms / multiplier;
   rms.report_rms = true;
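
[Illustration, not part of the patch] Why sigma_gn could be dropped above: MinimalLeastSq
fits time_i ~ coef * g(n_i) with no intercept term, and setting the derivative of
sum((time_i - coef * g(n_i))^2) to zero gives coef = sum(time_i * g(n_i)) / sum(g(n_i)^2),
which never uses sum(g(n_i)). A self-contained sketch of that one-parameter fit:

#include <cstdio>
#include <vector>

// One-parameter least squares: minimize sum((t[i] - coef * g[i])^2).
// d/dcoef = 0  =>  coef = sum(t[i] * g[i]) / sum(g[i]^2).
double FitCoef(const std::vector<double>& g, const std::vector<double>& t) {
  double sigma_gn_squared = 0.0, sigma_time_gn = 0.0;
  for (size_t i = 0; i < g.size(); ++i) {
    sigma_gn_squared += g[i] * g[i];
    sigma_time_gn += t[i] * g[i];
  }
  return sigma_time_gn / sigma_gn_squared;
}

int main() {
  // Times that grow roughly as 2.0 * n, for g(n) = n.
  std::vector<double> g = {1, 2, 4, 8}, t = {2.1, 3.9, 8.2, 15.8};
  std::printf("coef ~= %f\n", FitCoef(g, t));  // close to 2.0
}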

diff  --git a/libcxx/utils/google-benchmark/src/console_reporter.cc b/libcxx/utils/google-benchmark/src/console_reporter.cc
index ca364727cb46a..6fd764525e814 100644
--- a/libcxx/utils/google-benchmark/src/console_reporter.cc
+++ b/libcxx/utils/google-benchmark/src/console_reporter.cc
@@ -12,21 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "benchmark/benchmark.h"
-#include "complexity.h"
-#include "counter.h"
-
 #include <algorithm>
 #include <cstdint>
 #include <cstdio>
+#include <cstring>
 #include <iostream>
 #include <string>
 #include <tuple>
 #include <vector>
 
+#include "benchmark/benchmark.h"
 #include "check.h"
 #include "colorprint.h"
 #include "commandlineflags.h"
+#include "complexity.h"
+#include "counter.h"
 #include "internal_macros.h"
 #include "string_util.h"
 #include "timers.h"
@@ -64,9 +64,8 @@ void ConsoleReporter::PrintHeader(const Run& run) {
       str += " UserCounters...";
     }
   }
-  str += "\n";
   std::string line = std::string(str.length(), '-');
-  GetOutputStream() << line << "\n" << str << line << "\n";
+  GetOutputStream() << line << "\n" << str << "\n" << line << "\n";
 }
 
 void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
@@ -157,16 +156,14 @@ void ConsoleReporter::PrintRunData(const Run& result) {
     const std::size_t cNameLen = std::max(std::string::size_type(10),
                                           c.first.length());
     auto const& s = HumanReadableNumber(c.second.value, c.second.oneK);
+    const char* unit = "";
+    if (c.second.flags & Counter::kIsRate)
+      unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
     if (output_options_ & OO_Tabular) {
-      if (c.second.flags & Counter::kIsRate) {
-        printer(Out, COLOR_DEFAULT, " %*s/s", cNameLen - 2, s.c_str());
-      } else {
-        printer(Out, COLOR_DEFAULT, " %*s", cNameLen, s.c_str());
-      }
-    } else {
-      const char* unit = (c.second.flags & Counter::kIsRate) ? "/s" : "";
-      printer(Out, COLOR_DEFAULT, " %s=%s%s", c.first.c_str(), s.c_str(),
+      printer(Out, COLOR_DEFAULT, " %*s%s", cNameLen - strlen(unit), s.c_str(),
               unit);
+    } else {
+      printer(Out, COLOR_DEFAULT, " %s=%s%s", c.first.c_str(), s.c_str(), unit);
     }
   }
 

diff  --git a/libcxx/utils/google-benchmark/src/counter.cc b/libcxx/utils/google-benchmark/src/counter.cc
index cb604e060b623..cf5b78ee3ac6b 100644
--- a/libcxx/utils/google-benchmark/src/counter.cc
+++ b/libcxx/utils/google-benchmark/src/counter.cc
@@ -17,7 +17,7 @@
 namespace benchmark {
 namespace internal {
 
-double Finish(Counter const& c, int64_t iterations, double cpu_time,
+double Finish(Counter const& c, IterationCount iterations, double cpu_time,
               double num_threads) {
   double v = c.value;
   if (c.flags & Counter::kIsRate) {
@@ -32,10 +32,15 @@ double Finish(Counter const& c, int64_t iterations, double cpu_time,
   if (c.flags & Counter::kAvgIterations) {
     v /= iterations;
   }
+
+  if (c.flags & Counter::kInvert) {  // Invert is *always* last.
+    v = 1.0 / v;
+  }
   return v;
 }
 
-void Finish(UserCounters* l, int64_t iterations, double cpu_time, double num_threads) {
+void Finish(UserCounters* l, IterationCount iterations, double cpu_time,
+            double num_threads) {
   for (auto& c : *l) {
     c.second.value = Finish(c.second, iterations, cpu_time, num_threads);
   }
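
[Illustration, not part of the patch] A worked example of the ordering rule above (kInvert
applied last): a counter of 1000 items with flags kIsRate|kInvert and 2 s of CPU time first
becomes 1000/2 = 500 items/s, then inverts to 0.002 s per item, which the console reporter
prints with the new "s" unit. A minimal sketch:

#include <cstdio>

// Mirrors the order of operations in Finish(): rate first, invert last.
double FinishSketch(double value, double cpu_time, bool is_rate, bool invert) {
  double v = value;
  if (is_rate) v /= cpu_time;  // items -> items per second
  if (invert) v = 1.0 / v;     // items/s -> seconds per item, always last
  return v;
}

int main() {
  // 1000 items processed in 2 seconds of CPU time.
  std::printf("rate:          %g items/s\n", FinishSketch(1000, 2, true, false));
  std::printf("inverted rate: %g s/item\n", FinishSketch(1000, 2, true, true));
}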

diff  --git a/libcxx/utils/google-benchmark/src/counter.h b/libcxx/utils/google-benchmark/src/counter.h
index d884e50aa12b6..1f5a58e31f0ca 100644
--- a/libcxx/utils/google-benchmark/src/counter.h
+++ b/libcxx/utils/google-benchmark/src/counter.h
@@ -12,15 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifndef BENCHMARK_COUNTER_H_
+#define BENCHMARK_COUNTER_H_
+
 #include "benchmark/benchmark.h"
 
 namespace benchmark {
 
 // these counter-related functions are hidden to reduce API surface.
 namespace internal {
-void Finish(UserCounters* l, int64_t iterations, double time, double num_threads);
+void Finish(UserCounters* l, IterationCount iterations, double time,
+            double num_threads);
 void Increment(UserCounters* l, UserCounters const& r);
 bool SameNames(UserCounters const& l, UserCounters const& r);
 }  // end namespace internal
 
 }  // end namespace benchmark
+
+#endif  // BENCHMARK_COUNTER_H_

diff  --git a/libcxx/utils/google-benchmark/src/csv_reporter.cc b/libcxx/utils/google-benchmark/src/csv_reporter.cc
index d2f1d27eb62cb..af2c18fc8a6ee 100644
--- a/libcxx/utils/google-benchmark/src/csv_reporter.cc
+++ b/libcxx/utils/google-benchmark/src/csv_reporter.cc
@@ -37,6 +37,18 @@ std::vector<std::string> elements = {
     "error_occurred", "error_message"};
 }  // namespace
 
+std::string CsvEscape(const std::string & s) {
+  std::string tmp;
+  tmp.reserve(s.size() + 2);
+  for (char c : s) {
+    switch (c) {
+    case '"' : tmp += "\"\""; break;
+    default  : tmp += c; break;
+    }
+  }
+  return '"' + tmp + '"';
+}
+
 bool CSVReporter::ReportContext(const Context& context) {
   PrintBasicContext(&GetErrorStream(), context);
   return true;
@@ -89,18 +101,11 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
 
 void CSVReporter::PrintRunData(const Run& run) {
   std::ostream& Out = GetOutputStream();
-
-  // Field with embedded double-quote characters must be doubled and the field
-  // delimited with double-quotes.
-  std::string name = run.benchmark_name();
-  ReplaceAll(&name, "\"", "\"\"");
-  Out << '"' << name << "\",";
+  Out << CsvEscape(run.benchmark_name()) << ",";
   if (run.error_occurred) {
     Out << std::string(elements.size() - 3, ',');
     Out << "true,";
-    std::string msg = run.error_message;
-    ReplaceAll(&msg, "\"", "\"\"");
-    Out << '"' << msg << "\"\n";
+    Out << CsvEscape(run.error_message) << "\n";
     return;
   }
 
@@ -130,11 +135,7 @@ void CSVReporter::PrintRunData(const Run& run) {
   }
   Out << ",";
   if (!run.report_label.empty()) {
-    // Field with embedded double-quote characters must be doubled and the field
-    // delimited with double-quotes.
-    std::string label = run.report_label;
-    ReplaceAll(&label, "\"", "\"\"");
-    Out << "\"" << label << "\"";
+    Out << CsvEscape(run.report_label);
   }
   Out << ",,";  // for error_occurred and error_message
 

diff  --git a/libcxx/utils/google-benchmark/src/cycleclock.h b/libcxx/utils/google-benchmark/src/cycleclock.h
index 92f4a495f0a1f..f22ca9f7d2998 100644
--- a/libcxx/utils/google-benchmark/src/cycleclock.h
+++ b/libcxx/utils/google-benchmark/src/cycleclock.h
@@ -36,7 +36,7 @@
 // declarations of some other intrinsics, breaking compilation.
 // Therefore, we simply declare __rdtsc ourselves. See also
 // http://connect.microsoft.com/VisualStudio/feedback/details/262047
-#if defined(COMPILER_MSVC) && !defined(_M_IX86)
+#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64)
 extern "C" uint64_t __rdtsc();
 #pragma intrinsic(__rdtsc)
 #endif
@@ -114,6 +114,12 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   // when I know it will work.  Otherwise, I'll use __rdtsc and hope
   // the code is being compiled with a non-ancient compiler.
   _asm rdtsc
+#elif defined(COMPILER_MSVC) && defined(_M_ARM64)
+  // See https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics?view=vs-2019
+  // and https://reviews.llvm.org/D53115
+  int64_t virtual_timer_value;
+  virtual_timer_value = _ReadStatusReg(ARM64_CNTVCT);
+  return virtual_timer_value;
 #elif defined(COMPILER_MSVC)
   return __rdtsc();
 #elif defined(BENCHMARK_OS_NACL)
@@ -167,10 +173,19 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__loongarch__)
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
 #elif defined(__s390__)  // Covers both s390 and s390x.
   // Return the CPU clock.
   uint64_t tsc;
+#if defined(BENCHMARK_OS_ZOS) && defined(COMPILER_IBMXL)
+  // z/OS XL compiler HLASM syntax.
+  asm(" stck %0" : "=m"(tsc) : : "cc");
+#else
   asm("stck %0" : "=Q"(tsc) : : "cc");
+#endif
   return tsc;
 #elif defined(__riscv) // RISC-V
   // Use RDCYCLE (and RDCYCLEH on riscv32)
@@ -193,6 +208,10 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   asm volatile("rdcycle %0" : "=r"(cycles));
   return cycles;
 #endif
+#elif defined(__e2k__) || defined(__elbrus__)
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
 #else
 // The soft failover to a generic implementation is automatic only for ARM.
 // For other platforms the developer is expected to make an attempt to create

diff  --git a/libcxx/utils/google-benchmark/src/internal_macros.h b/libcxx/utils/google-benchmark/src/internal_macros.h
index 5dbf4fd27521b..91f367b894bcd 100644
--- a/libcxx/utils/google-benchmark/src/internal_macros.h
+++ b/libcxx/utils/google-benchmark/src/internal_macros.h
@@ -13,7 +13,11 @@
 #endif
 
 #if defined(__clang__)
-  #if !defined(COMPILER_CLANG)
+  #if defined(__ibmxl__)
+    #if !defined(COMPILER_IBMXL)
+      #define COMPILER_IBMXL
+    #endif
+  #elif !defined(COMPILER_CLANG)
     #define COMPILER_CLANG
   #endif
 #elif defined(_MSC_VER)
@@ -58,6 +62,8 @@
   #define BENCHMARK_OS_NETBSD 1
 #elif defined(__OpenBSD__)
   #define BENCHMARK_OS_OPENBSD 1
+#elif defined(__DragonFly__)
+  #define BENCHMARK_OS_DRAGONFLY 1
 #elif defined(__linux__)
   #define BENCHMARK_OS_LINUX 1
 #elif defined(__native_client__)
@@ -70,6 +76,10 @@
 #define BENCHMARK_OS_FUCHSIA 1
 #elif defined (__SVR4) && defined (__sun)
 #define BENCHMARK_OS_SOLARIS 1
+#elif defined(__QNX__)
+#define BENCHMARK_OS_QNX 1
+#elif defined(__MVS__)
+#define BENCHMARK_OS_ZOS 1
 #endif
 
 #if defined(__ANDROID__) && defined(__GLIBCXX__)

diff  --git a/libcxx/utils/google-benchmark/src/json_reporter.cc b/libcxx/utils/google-benchmark/src/json_reporter.cc
index 7d01e8e4e316c..26898456f8546 100644
--- a/libcxx/utils/google-benchmark/src/json_reporter.cc
+++ b/libcxx/utils/google-benchmark/src/json_reporter.cc
@@ -16,6 +16,7 @@
 #include "complexity.h"
 
 #include <algorithm>
+#include <cmath>
 #include <cstdint>
 #include <iomanip>  // for setprecision
 #include <iostream>
@@ -28,39 +29,73 @@
 #include "timers.h"
 
 namespace benchmark {
+namespace internal {
+extern std::map<std::string, std::string>* global_context;
+}
 
 namespace {
 
+std::string StrEscape(const std::string & s) {
+  std::string tmp;
+  tmp.reserve(s.size());
+  for (char c : s) {
+    switch (c) {
+    case '\b': tmp += "\\b"; break;
+    case '\f': tmp += "\\f"; break;
+    case '\n': tmp += "\\n"; break;
+    case '\r': tmp += "\\r"; break;
+    case '\t': tmp += "\\t"; break;
+    case '\\': tmp += "\\\\"; break;
+    case '"' : tmp += "\\\""; break;
+    default  : tmp += c; break;
+    }
+  }
+  return tmp;
+}
+
 std::string FormatKV(std::string const& key, std::string const& value) {
-  return StrFormat("\"%s\": \"%s\"", key.c_str(), value.c_str());
+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
 }
 
 std::string FormatKV(std::string const& key, const char* value) {
-  return StrFormat("\"%s\": \"%s\"", key.c_str(), value);
+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
 }
 
 std::string FormatKV(std::string const& key, bool value) {
-  return StrFormat("\"%s\": %s", key.c_str(), value ? "true" : "false");
+  return StrFormat("\"%s\": %s", StrEscape(key).c_str(), value ? "true" : "false");
 }
 
 std::string FormatKV(std::string const& key, int64_t value) {
   std::stringstream ss;
-  ss << '"' << key << "\": " << value;
+  ss << '"' << StrEscape(key) << "\": " << value;
   return ss.str();
 }
 
-std::string FormatKV(std::string const& key, double value) {
+std::string FormatKV(std::string const& key, IterationCount value) {
   std::stringstream ss;
-  ss << '"' << key << "\": ";
+  ss << '"' << StrEscape(key) << "\": " << value;
+  return ss.str();
+}
 
-  const auto max_digits10 = std::numeric_limits<decltype(value)>::max_digits10;
-  const auto max_fractional_digits10 = max_digits10 - 1;
+std::string FormatKV(std::string const& key, double value) {
+  std::stringstream ss;
+  ss << '"' << StrEscape(key) << "\": ";
 
-  ss << std::scientific << std::setprecision(max_fractional_digits10) << value;
+  if (std::isnan(value))
+    ss << (value < 0 ? "-" : "") << "NaN";
+  else if (std::isinf(value))
+    ss << (value < 0 ? "-" : "") << "Infinity";
+  else {
+    const auto max_digits10 =
+        std::numeric_limits<decltype(value)>::max_digits10;
+    const auto max_fractional_digits10 = max_digits10 - 1;
+    ss << std::scientific << std::setprecision(max_fractional_digits10)
+       << value;
+  }
   return ss.str();
 }
 
-int64_t RoundDouble(double v) { return static_cast<int64_t>(v + 0.5); }
+int64_t RoundDouble(double v) { return std::lround(v); }
 
 }  // end namespace
 
@@ -80,12 +115,7 @@ bool JSONReporter::ReportContext(const Context& context) {
   out << indent << FormatKV("host_name", context.sys_info.name) << ",\n";
 
   if (Context::executable_name) {
-    // windows uses backslash for its path separator,
-    // which must be escaped in JSON otherwise it blows up conforming JSON
-    // decoders
-    std::string executable_name = Context::executable_name;
-    ReplaceAll(&executable_name, "\\", "\\\\");
-    out << indent << FormatKV("executable", executable_name) << ",\n";
+    out << indent << FormatKV("executable", Context::executable_name) << ",\n";
   }
 
   CPUInfo const& info = context.cpu_info;
@@ -95,8 +125,10 @@ bool JSONReporter::ReportContext(const Context& context) {
       << FormatKV("mhz_per_cpu",
                   RoundDouble(info.cycles_per_second / 1000000.0))
       << ",\n";
-  out << indent << FormatKV("cpu_scaling_enabled", info.scaling_enabled)
-      << ",\n";
+  if (CPUInfo::Scaling::UNKNOWN != info.scaling) {
+    out << indent << FormatKV("cpu_scaling_enabled", info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
+        << ",\n";
+  }
 
   out << indent << "\"caches\": [\n";
   indent = std::string(6, ' ');
@@ -108,7 +140,7 @@ bool JSONReporter::ReportContext(const Context& context) {
     out << cache_indent << FormatKV("level", static_cast<int64_t>(CI.level))
         << ",\n";
     out << cache_indent
-        << FormatKV("size", static_cast<int64_t>(CI.size) * 1000u) << ",\n";
+        << FormatKV("size", static_cast<int64_t>(CI.size)) << ",\n";
     out << cache_indent
         << FormatKV("num_sharing", static_cast<int64_t>(CI.num_sharing))
         << "\n";
@@ -131,6 +163,13 @@ bool JSONReporter::ReportContext(const Context& context) {
   const char build_type[] = "debug";
 #endif
   out << indent << FormatKV("library_build_type", build_type) << "\n";
+
+  if (internal::global_context != nullptr) {
+    for (const auto& kv: *internal::global_context) {
+      out << indent << FormatKV(kv.first, kv.second) << "\n";
+    }
+  }
+
   // Close context block and open the list of benchmarks.
   out << inner_indent << "},\n";
   out << inner_indent << "\"benchmarks\": [\n";
@@ -168,7 +207,11 @@ void JSONReporter::PrintRunData(Run const& run) {
   std::string indent(6, ' ');
   std::ostream& out = GetOutputStream();
   out << indent << FormatKV("name", run.benchmark_name()) << ",\n";
-  out << indent << FormatKV("run_name", run.run_name) << ",\n";
+  out << indent << FormatKV("family_index", run.family_index) << ",\n";
+  out << indent
+      << FormatKV("per_family_instance_index", run.per_family_instance_index)
+      << ",\n";
+  out << indent << FormatKV("run_name", run.run_name.str()) << ",\n";
   out << indent << FormatKV("run_type", [&run]() -> const char* {
     switch (run.run_type) {
       case BenchmarkReporter::Run::RT_Iteration:
@@ -178,6 +221,12 @@ void JSONReporter::PrintRunData(Run const& run) {
     }
     BENCHMARK_UNREACHABLE();
   }()) << ",\n";
+  out << indent << FormatKV("repetitions", run.repetitions) << ",\n";
+  if (run.run_type != BenchmarkReporter::Run::RT_Aggregate) {
+    out << indent << FormatKV("repetition_index", run.repetition_index)
+        << ",\n";
+  }
+  out << indent << FormatKV("threads", run.threads) << ",\n";
   if (run.run_type == BenchmarkReporter::Run::RT_Aggregate) {
     out << indent << FormatKV("aggregate_name", run.aggregate_name) << ",\n";
   }
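
[Illustration, not part of the patch] Two behaviors worth noting here: StrEscape covers all
JSON string escapes (which is why the Windows-only backslash fix-up for executable_name could
be deleted), and FormatKV for doubles now emits NaN/Infinity tokens instead of malformed
scientific notation. A sketch of the double path under the same max_digits10 precision rule;
note that NaN and Infinity are not literals in strict JSON, so consumers must tolerate them:

#include <cmath>
#include <iomanip>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>

// Mirrors FormatKV(key, double): special-case NaN/Inf, otherwise print
// scientific notation with max_digits10 - 1 fractional digits.
std::string FormatDoubleSketch(double value) {
  std::stringstream ss;
  if (std::isnan(value))
    ss << (value < 0 ? "-" : "") << "NaN";
  else if (std::isinf(value))
    ss << (value < 0 ? "-" : "") << "Infinity";
  else
    ss << std::scientific
       << std::setprecision(std::numeric_limits<double>::max_digits10 - 1)
       << value;
  return ss.str();
}

int main() {
  std::cout << FormatDoubleSketch(1234.5) << "\n";        // 1.2345...e+03
  std::cout << FormatDoubleSketch(-INFINITY) << "\n";     // -Infinity
  std::cout << FormatDoubleSketch(std::nan("")) << "\n";  // NaN
}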

diff  --git a/libcxx/utils/google-benchmark/src/mutex.h b/libcxx/utils/google-benchmark/src/mutex.h
index 5f461d05a0c64..9cc414ec467e6 100644
--- a/libcxx/utils/google-benchmark/src/mutex.h
+++ b/libcxx/utils/google-benchmark/src/mutex.h
@@ -9,60 +9,60 @@
 // Enable thread safety attributes only with clang.
 // The attributes can be safely erased when compiling with other compilers.
 #if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
-#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
+#define THREAD_ANNOTATION_ATTRIBUTE_(x) __attribute__((x))
 #else
-#define THREAD_ANNOTATION_ATTRIBUTE__(x)  // no-op
+#define THREAD_ANNOTATION_ATTRIBUTE_(x)  // no-op
 #endif
 
-#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
+#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(capability(x))
 
-#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE_(scoped_lockable)
 
-#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
+#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(guarded_by(x))
 
-#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
+#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(pt_guarded_by(x))
 
 #define ACQUIRED_BEFORE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquired_before(__VA_ARGS__))
 
 #define ACQUIRED_AFTER(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquired_after(__VA_ARGS__))
 
 #define REQUIRES(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(requires_capability(__VA_ARGS__))
 
 #define REQUIRES_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(requires_shared_capability(__VA_ARGS__))
 
 #define ACQUIRE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquire_capability(__VA_ARGS__))
 
 #define ACQUIRE_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquire_shared_capability(__VA_ARGS__))
 
 #define RELEASE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(release_capability(__VA_ARGS__))
 
 #define RELEASE_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(release_shared_capability(__VA_ARGS__))
 
 #define TRY_ACQUIRE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_capability(__VA_ARGS__))
 
 #define TRY_ACQUIRE_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_shared_capability(__VA_ARGS__))
 
-#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
+#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE_(locks_excluded(__VA_ARGS__))
 
-#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
+#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(assert_capability(x))
 
 #define ASSERT_SHARED_CAPABILITY(x) \
-  THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x))
+  THREAD_ANNOTATION_ATTRIBUTE_(assert_shared_capability(x))
 
-#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
+#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(lock_returned(x))
 
 #define NO_THREAD_SAFETY_ANALYSIS \
-  THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
+  THREAD_ANNOTATION_ATTRIBUTE_(no_thread_safety_analysis)
 
 namespace benchmark {
 
@@ -71,7 +71,7 @@ typedef std::condition_variable Condition;
 // NOTE: Wrappers for std::mutex and std::unique_lock are provided so that
 // we can annotate them with thread safety attributes and use the
 // -Wthread-safety warning with clang. The standard library types cannot be
-// used directly because they do not provided the required annotations.
+// used directly because they do not provide the required annotations.
 class CAPABILITY("mutex") Mutex {
  public:
   Mutex() {}

diff  --git a/libcxx/utils/google-benchmark/src/perf_counters.cc b/libcxx/utils/google-benchmark/src/perf_counters.cc
new file mode 100644
index 0000000000000..4ddf0de2502c7
--- /dev/null
+++ b/libcxx/utils/google-benchmark/src/perf_counters.cc
@@ -0,0 +1,132 @@
+// Copyright 2021 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "perf_counters.h"
+
+#include <cstring>
+#include <vector>
+
+#if defined HAVE_LIBPFM
+#include "perfmon/pfmlib.h"
+#include "perfmon/pfmlib_perf_event.h"
+#endif
+
+namespace benchmark {
+namespace internal {
+
+constexpr size_t PerfCounterValues::kMaxCounters;
+
+#if defined HAVE_LIBPFM
+const bool PerfCounters::kSupported = true;
+
+bool PerfCounters::Initialize() { return pfm_initialize() == PFM_SUCCESS; }
+
+PerfCounters PerfCounters::Create(
+    const std::vector<std::string>& counter_names) {
+  if (counter_names.empty()) {
+    return NoCounters();
+  }
+  if (counter_names.size() > PerfCounterValues::kMaxCounters) {
+    GetErrorLogInstance()
+        << counter_names.size()
+        << " counters were requested. The minimum is 1, the maximum is "
+        << PerfCounterValues::kMaxCounters << "\n";
+    return NoCounters();
+  }
+  std::vector<int> counter_ids(counter_names.size());
+
+  const int mode = PFM_PLM3;  // user mode only
+  for (size_t i = 0; i < counter_names.size(); ++i) {
+    const bool is_first = i == 0;
+    struct perf_event_attr attr{};
+    attr.size = sizeof(attr);
+    const int group_id = !is_first ? counter_ids[0] : -1;
+    const auto& name = counter_names[i];
+    if (name.empty()) {
+      GetErrorLogInstance() << "A counter name was the empty string\n";
+      return NoCounters();
+    }
+    pfm_perf_encode_arg_t arg{};
+    arg.attr = &attr;
+
+    const int pfm_get =
+        pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT, &arg);
+    if (pfm_get != PFM_SUCCESS) {
+      GetErrorLogInstance() << "Unknown counter name: " << name << "\n";
+      return NoCounters();
+    }
+    attr.disabled = is_first;
+    // Note: the man page for perf_event_open suggests inherit = true and
+    // read_format = PERF_FORMAT_GROUP don't work together, but that's not the
+    // case.
+    attr.inherit = true;
+    attr.pinned = is_first;
+    attr.exclude_kernel = true;
+    attr.exclude_user = false;
+    attr.exclude_hv = true;
+    // Read all counters in one read.
+    attr.read_format = PERF_FORMAT_GROUP;
+
+    int id = -1;
+    static constexpr size_t kNrOfSyscallRetries = 5;
+    // Retry syscall as it was interrupted often (b/64774091).
+    for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
+         ++num_retries) {
+      id = perf_event_open(&attr, 0, -1, group_id, 0);
+      if (id >= 0 || errno != EINTR) {
+        break;
+      }
+    }
+    if (id < 0) {
+      GetErrorLogInstance()
+          << "Failed to get a file descriptor for " << name << "\n";
+      return NoCounters();
+    }
+
+    counter_ids[i] = id;
+  }
+  if (ioctl(counter_ids[0], PERF_EVENT_IOC_ENABLE) != 0) {
+    GetErrorLogInstance() << "Failed to start counters\n";
+    return NoCounters();
+  }
+
+  return PerfCounters(counter_names, std::move(counter_ids));
+}
+
+PerfCounters::~PerfCounters() {
+  if (counter_ids_.empty()) {
+    return;
+  }
+  ioctl(counter_ids_[0], PERF_EVENT_IOC_DISABLE);
+  for (int fd : counter_ids_) {
+    close(fd);
+  }
+}
+#else   // defined HAVE_LIBPFM
+const bool PerfCounters::kSupported = false;
+
+bool PerfCounters::Initialize() { return false; }
+
+PerfCounters PerfCounters::Create(
+    const std::vector<std::string>& counter_names) {
+  if (!counter_names.empty()) {
+    GetErrorLogInstance() << "Performance counters not supported.";
+  }
+  return NoCounters();
+}
+
+PerfCounters::~PerfCounters() = default;
+#endif  // defined HAVE_LIBPFM
+}  // namespace internal
+}  // namespace benchmark

diff  --git a/libcxx/utils/google-benchmark/src/perf_counters.h b/libcxx/utils/google-benchmark/src/perf_counters.h
new file mode 100644
index 0000000000000..b6629b99070b0
--- /dev/null
+++ b/libcxx/utils/google-benchmark/src/perf_counters.h
@@ -0,0 +1,172 @@
+// Copyright 2021 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BENCHMARK_PERF_COUNTERS_H
+#define BENCHMARK_PERF_COUNTERS_H
+
+#include <array>
+#include <cstdint>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+#include "check.h"
+#include "log.h"
+
+#ifndef BENCHMARK_OS_WINDOWS
+#include <unistd.h>
+#endif
+
+namespace benchmark {
+namespace internal {
+
+// Typically, we can only read a small number of counters. There is also
+// padding preceding the counter values when reading multiple counters with one
+// syscall (which is desirable). PerfCounterValues abstracts these details.
+// The implementation ensures the storage is inlined, and allows 0-based
+// indexing into the counter values.
+// The object is used in conjunction with a PerfCounters object, by passing it
+// to Snapshot(). The values are populated such that
+// perfCounters->names()[i]'s value is obtained at position i (as given by
+// operator[]) of this object.
+class PerfCounterValues {
+ public:
+  explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) {
+    CHECK_LE(nr_counters_, kMaxCounters);
+  }
+
+  uint64_t operator[](size_t pos) const { return values_[kPadding + pos]; }
+
+  static constexpr size_t kMaxCounters = 3;
+
+ private:
+  friend class PerfCounters;
+  // Get the byte buffer in which perf counters can be captured.
+  // This is used by PerfCounters::Read
+  std::pair<char*, size_t> get_data_buffer() {
+    return {reinterpret_cast<char*>(values_.data()),
+            sizeof(uint64_t) * (kPadding + nr_counters_)};
+  }
+
+  static constexpr size_t kPadding = 1;
+  std::array<uint64_t, kPadding + kMaxCounters> values_;
+  const size_t nr_counters_;
+};
+
+// Collect PMU counters. The object, once constructed, is ready to be used by
+// calling Snapshot(). PMU counter collection is enabled from the time
+// Create() is called to obtain the object until the object's destructor runs.
+class PerfCounters final {
+ public:
+  // True iff this platform supports performance counters.
+  static const bool kSupported;
+
+  bool IsValid() const { return is_valid_; }
+  static PerfCounters NoCounters() { return PerfCounters(); }
+
+  ~PerfCounters();
+  PerfCounters(PerfCounters&&) = default;
+  PerfCounters(const PerfCounters&) = delete;
+
+  // Platform-specific implementations may choose to do some library
+  // initialization here.
+  static bool Initialize();
+
+  // Return a PerfCounters object ready to read the counters with the names
+  // specified. The values are user-mode only. The counter name format is
+  // implementation and OS specific.
+  // TODO: once we move to C++17, this should be a std::optional, and then the
+  // IsValid() boolean can be dropped.
+  static PerfCounters Create(const std::vector<std::string>& counter_names);
+
+  // Take a snapshot of the current value of the counters into the provided
+  // valid PerfCounterValues storage. The values are populated such that:
+  // names()[i]'s value is (*values)[i]
+  BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) const {
+#ifndef BENCHMARK_OS_WINDOWS
+    assert(values != nullptr);
+    assert(IsValid());
+    auto buffer = values->get_data_buffer();
+    auto read_bytes = ::read(counter_ids_[0], buffer.first, buffer.second);
+    return static_cast<size_t>(read_bytes) == buffer.second;
+#else
+    (void)values;
+    return false;
+#endif
+  }
+
+  const std::vector<std::string>& names() const { return counter_names_; }
+  size_t num_counters() const { return counter_names_.size(); }
+
+ private:
+  PerfCounters(const std::vector<std::string>& counter_names,
+               std::vector<int>&& counter_ids)
+      : counter_ids_(std::move(counter_ids)),
+        counter_names_(counter_names),
+        is_valid_(true) {}
+  PerfCounters() : is_valid_(false) {}
+
+  std::vector<int> counter_ids_;
+  const std::vector<std::string> counter_names_;
+  const bool is_valid_;
+};
+
+// Typical usage of the above primitives.
+class PerfCountersMeasurement final {
+ public:
+  PerfCountersMeasurement(PerfCounters&& c)
+      : counters_(std::move(c)),
+        start_values_(counters_.IsValid() ? counters_.names().size() : 0),
+        end_values_(counters_.IsValid() ? counters_.names().size() : 0) {}
+
+  bool IsValid() const { return counters_.IsValid(); }
+
+  BENCHMARK_ALWAYS_INLINE void Start() {
+    assert(IsValid());
+    // Tell the compiler to not move instructions above/below where we take
+    // the snapshot.
+    ClobberMemory();
+    counters_.Snapshot(&start_values_);
+    ClobberMemory();
+  }
+
+  BENCHMARK_ALWAYS_INLINE std::vector<std::pair<std::string, double>>
+  StopAndGetMeasurements() {
+    assert(IsValid());
+    // Tell the compiler to not move instructions above/below where we take
+    // the snapshot.
+    ClobberMemory();
+    counters_.Snapshot(&end_values_);
+    ClobberMemory();
+
+    std::vector<std::pair<std::string, double>> ret;
+    for (size_t i = 0; i < counters_.names().size(); ++i) {
+      double measurement = static_cast<double>(end_values_[i]) -
+                           static_cast<double>(start_values_[i]);
+      ret.push_back({counters_.names()[i], measurement});
+    }
+    return ret;
+  }
+
+ private:
+  PerfCounters counters_;
+  PerfCounterValues start_values_;
+  PerfCounterValues end_values_;
+};
+
+BENCHMARK_UNUSED static bool perf_init_anchor = PerfCounters::Initialize();
+
+}  // namespace internal
+}  // namespace benchmark
+
+#endif  // BENCHMARK_PERF_COUNTERS_H
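
[Illustration, not part of the patch] A hedged sketch of how these primitives compose. The
counter names are hypothetical and platform-specific, the internal header is assumed to be on
the include path for an in-tree consumer, and in the library itself this wiring lives in the
benchmark runner rather than in user code:

#include <iostream>

#include "perf_counters.h"  // internal header; in-tree use only

void MeasureRegionOnce() {
  using benchmark::internal::PerfCounters;
  using benchmark::internal::PerfCountersMeasurement;

  // "CYCLES" and "INSTRUCTIONS" are hypothetical libpfm event names.
  PerfCountersMeasurement pcm(PerfCounters::Create({"CYCLES", "INSTRUCTIONS"}));
  if (!pcm.IsValid()) return;  // no libpfm, or an unknown counter name

  pcm.Start();
  // ... region of interest ...
  for (const auto& kv : pcm.StopAndGetMeasurements())
    std::cout << kv.first << ": " << kv.second << "\n";
}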

diff  --git a/libcxx/utils/google-benchmark/src/reporter.cc b/libcxx/utils/google-benchmark/src/reporter.cc
index 59bc5f7102319..14dd40dc72f46 100644
--- a/libcxx/utils/google-benchmark/src/reporter.cc
+++ b/libcxx/utils/google-benchmark/src/reporter.cc
@@ -18,6 +18,8 @@
 #include <cstdlib>
 
 #include <iostream>
+#include <map>
+#include <string>
 #include <tuple>
 #include <vector>
 
@@ -25,6 +27,9 @@
 #include "string_util.h"
 
 namespace benchmark {
+namespace internal {
+extern std::map<std::string, std::string>* global_context;
+}
 
 BenchmarkReporter::BenchmarkReporter()
     : output_stream_(&std::cout), error_stream_(&std::cerr) {}
@@ -49,7 +54,7 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
     Out << "CPU Caches:\n";
     for (auto &CInfo : info.caches) {
       Out << "  L" << CInfo.level << " " << CInfo.type << " "
-          << (CInfo.size / 1000) << "K";
+          << (CInfo.size / 1024) << " KiB";
       if (CInfo.num_sharing != 0)
         Out << " (x" << (info.num_cpus / CInfo.num_sharing) << ")";
       Out << "\n";
@@ -64,7 +69,13 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
     Out << "\n";
   }
 
-  if (info.scaling_enabled) {
+  if (internal::global_context != nullptr) {
+    for (const auto& kv: *internal::global_context) {
+      Out << kv.first << ": " << kv.second << "\n";
+    }
+  }
+
+  if (CPUInfo::Scaling::ENABLED == info.scaling) {
     Out << "***WARNING*** CPU scaling is enabled, the benchmark "
            "real time measurements may be noisy and will incur extra "
            "overhead.\n";
@@ -83,7 +94,7 @@ BenchmarkReporter::Context::Context()
     : cpu_info(CPUInfo::Get()), sys_info(SystemInfo::Get()) {}
 
 std::string BenchmarkReporter::Run::benchmark_name() const {
-  std::string name = run_name;
+  std::string name = run_name.str();
   if (run_type == RT_Aggregate) {
     name += "_" + aggregate_name;
   }

diff  --git a/libcxx/utils/google-benchmark/src/sleep.cc b/libcxx/utils/google-benchmark/src/sleep.cc
index 1512ac90f7ead..4609d540eade8 100644
--- a/libcxx/utils/google-benchmark/src/sleep.cc
+++ b/libcxx/utils/google-benchmark/src/sleep.cc
@@ -24,6 +24,10 @@
 #include <windows.h>
 #endif
 
+#ifdef BENCHMARK_OS_ZOS
+#include <unistd.h>
+#endif
+
 namespace benchmark {
 #ifdef BENCHMARK_OS_WINDOWS
 // Window's Sleep takes milliseconds argument.
@@ -33,11 +37,23 @@ void SleepForSeconds(double seconds) {
 }
 #else   // BENCHMARK_OS_WINDOWS
 void SleepForMicroseconds(int microseconds) {
+#ifdef BENCHMARK_OS_ZOS
+  // z/OS does not support nanosleep. Instead call sleep() and then usleep() to
+  // sleep for the remaining microseconds because usleep() will fail if its
+  // argument is greater than 1000000.
+  div_t sleepTime = div(microseconds, kNumMicrosPerSecond);
+  int seconds = sleepTime.quot;
+  while (seconds != 0)
+    seconds = sleep(seconds);
+  while (usleep(sleepTime.rem) == -1 && errno == EINTR)
+    ;
+#else
   struct timespec sleep_time;
   sleep_time.tv_sec = microseconds / kNumMicrosPerSecond;
   sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro;
   while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR)
     ;  // Ignore signals and wait for the full interval to elapse.
+#endif
 }
 
 void SleepForMilliseconds(int milliseconds) {
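
[Illustration, not part of the patch] The arithmetic behind the z/OS path above: div() splits
the request into whole seconds for sleep() and a remainder that stays below usleep()'s
1000000 limit. For example:

#include <cstdio>
#include <cstdlib>

int main() {
  const int kNumMicrosPerSecond = 1000 * 1000;
  const int microseconds = 2500000;
  const std::div_t t = std::div(microseconds, kNumMicrosPerSecond);
  // quot feeds sleep(), rem stays valid for usleep().
  std::printf("sleep(%d); usleep(%d)\n", t.quot, t.rem);  // sleep(2); usleep(500000)
}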

diff  --git a/libcxx/utils/google-benchmark/src/statistics.cc b/libcxx/utils/google-benchmark/src/statistics.cc
index e821aec18b7c3..57472b9ff99bd 100644
--- a/libcxx/utils/google-benchmark/src/statistics.cc
+++ b/libcxx/utils/google-benchmark/src/statistics.cc
@@ -97,7 +97,7 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
 
   // All repetitions should be run with the same number of iterations so we
   // can take this information from the first benchmark.
-  int64_t const run_iterations = reports.front().iterations;
+  const IterationCount run_iterations = reports.front().iterations;
   // create stats for user counters
   struct CounterStat {
     Counter c;
@@ -147,8 +147,13 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
   for (const auto& Stat : *reports[0].statistics) {
     // Get the data from the accumulator to BenchmarkReporter::Run's.
     Run data;
-    data.run_name = reports[0].benchmark_name();
+    data.run_name = reports[0].run_name;
+    data.family_index = reports[0].family_index;
+    data.per_family_instance_index = reports[0].per_family_instance_index;
     data.run_type = BenchmarkReporter::Run::RT_Aggregate;
+    data.threads = reports[0].threads;
+    data.repetitions = reports[0].repetitions;
+    data.repetition_index = Run::no_repetition_index;
     data.aggregate_name = Stat.name_;
     data.report_label = report_label;
 

diff  --git a/libcxx/utils/google-benchmark/src/string_util.cc b/libcxx/utils/google-benchmark/src/string_util.cc
index 05ac5b4ea367e..3551418174fd0 100644
--- a/libcxx/utils/google-benchmark/src/string_util.cc
+++ b/libcxx/utils/google-benchmark/src/string_util.cc
@@ -1,6 +1,9 @@
 #include "string_util.h"
 
 #include <array>
+#ifdef BENCHMARK_STL_ANDROID_GNUSTL
+#include <cerrno>
+#endif
 #include <cmath>
 #include <cstdarg>
 #include <cstdio>
@@ -160,13 +163,17 @@ std::string StrFormat(const char* format, ...) {
   return tmp;
 }
 
-void ReplaceAll(std::string* str, const std::string& from,
-                const std::string& to) {
-  std::size_t start = 0;
-  while ((start = str->find(from, start)) != std::string::npos) {
-    str->replace(start, from.length(), to);
-    start += to.length();
+std::vector<std::string> StrSplit(const std::string& str, char delim) {
+  if (str.empty()) return {};
+  std::vector<std::string> ret;
+  size_t first = 0;
+  size_t next = str.find(delim);
+  for (; next != std::string::npos;
+       first = next + 1, next = str.find(delim, first)) {
+    ret.push_back(str.substr(first, next - first));
   }
+  ret.push_back(str.substr(first));
+  return ret;
 }
 
 #ifdef BENCHMARK_STL_ANDROID_GNUSTL
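
[Illustration, not part of the patch] StrSplit arrives with the kvpairs flag support, and
ReplaceAll goes away because its former callers in the CSV and JSON reporters now use
dedicated escaping helpers. Its contract, restated in a standalone sketch: empty input yields
an empty vector, and leading or consecutive delimiters yield empty fields.

#include <cassert>
#include <string>
#include <vector>

// Behaviorally equivalent restatement of StrSplit() for quick reference.
std::vector<std::string> Split(const std::string& str, char delim) {
  if (str.empty()) return {};
  std::vector<std::string> ret;
  size_t first = 0;
  for (size_t next = str.find(delim); next != std::string::npos;
       first = next + 1, next = str.find(delim, first))
    ret.push_back(str.substr(first, next - first));
  ret.push_back(str.substr(first));
  return ret;
}

int main() {
  assert((Split("a=1,b=2", ',') == std::vector<std::string>{"a=1", "b=2"}));
  assert(Split("", ',').empty());
  assert((Split(",x", ',') == std::vector<std::string>{"", "x"}));
}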

diff  --git a/libcxx/utils/google-benchmark/src/string_util.h b/libcxx/utils/google-benchmark/src/string_util.h
index fc5f8b0304b04..6bc28b6912a84 100644
--- a/libcxx/utils/google-benchmark/src/string_util.h
+++ b/libcxx/utils/google-benchmark/src/string_util.h
@@ -12,7 +12,9 @@ void AppendHumanReadable(int n, std::string* str);
 
 std::string HumanReadableNumber(double n, double one_k = 1024.0);
 
-#ifdef __GNUC__
+#if defined(__MINGW32__)
+__attribute__((format(__MINGW_PRINTF_FORMAT, 1, 2)))
+#elif defined(__GNUC__)
 __attribute__((format(printf, 1, 2)))
 #endif
 std::string
@@ -35,8 +37,7 @@ inline std::string StrCat(Args&&... args) {
   return ss.str();
 }
 
-void ReplaceAll(std::string* str, const std::string& from,
-                const std::string& to);
+std::vector<std::string> StrSplit(const std::string& str, char delim);
 
 #ifdef BENCHMARK_STL_ANDROID_GNUSTL
 /*

diff  --git a/libcxx/utils/google-benchmark/src/sysinfo.cc b/libcxx/utils/google-benchmark/src/sysinfo.cc
index c0c07e5e62afe..c1969ea2d3fe8 100644
--- a/libcxx/utils/google-benchmark/src/sysinfo.cc
+++ b/libcxx/utils/google-benchmark/src/sysinfo.cc
@@ -29,7 +29,8 @@
 #include <sys/types.h>  // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
 #include <unistd.h>
 #if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX || \
-    defined BENCHMARK_OS_NETBSD || defined BENCHMARK_OS_OPENBSD
+    defined BENCHMARK_OS_NETBSD || defined BENCHMARK_OS_OPENBSD || \
+    defined BENCHMARK_OS_DRAGONFLY
 #define BENCHMARK_HAS_SYSCTL
 #include <sys/sysctl.h>
 #endif
@@ -37,6 +38,9 @@
 #if defined(BENCHMARK_OS_SOLARIS)
 #include <kstat.h>
 #endif
+#if defined(BENCHMARK_OS_QNX)
+#include <sys/syspage.h>
+#endif
 
 #include <algorithm>
 #include <array>
@@ -54,6 +58,7 @@
 #include <memory>
 #include <sstream>
 #include <locale>
+#include <utility>
 
 #include "check.h"
 #include "cycleclock.h"
@@ -206,9 +211,12 @@ bool ReadFromFile(std::string const& fname, ArgT* arg) {
   return f.good();
 }
 
-bool CpuScalingEnabled(int num_cpus) {
+CPUInfo::Scaling CpuScaling(int num_cpus) {
   // We don't have a valid CPU count, so don't even bother.
-  if (num_cpus <= 0) return false;
+  if (num_cpus <= 0) return CPUInfo::Scaling::UNKNOWN;
+#ifdef BENCHMARK_OS_QNX
+  return CPUInfo::Scaling::UNKNOWN;
+#endif
 #ifndef BENCHMARK_OS_WINDOWS
   // On Linux, the CPUfreq subsystem exposes CPU information as files on the
   // local file system. If reading the exported files fails, then we may not be
@@ -217,10 +225,11 @@ bool CpuScalingEnabled(int num_cpus) {
   for (int cpu = 0; cpu < num_cpus; ++cpu) {
     std::string governor_file =
         StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor");
-    if (ReadFromFile(governor_file, &res) && res != "performance") return true;
+    if (ReadFromFile(governor_file, &res) && res != "performance") return CPUInfo::Scaling::ENABLED;
   }
+  return CPUInfo::Scaling::DISABLED;
 #endif
-  return false;
+  return CPUInfo::Scaling::UNKNOWN;
 }
 
 int CountSetBitsInCPUMap(std::string Val) {
@@ -264,7 +273,7 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesFromKVFS() {
       else if (f && suffix != "K")
         PrintErrorAndDie("Invalid cache size format: Expected bytes ", suffix);
       else if (suffix == "K")
-        info.size *= 1000;
+        info.size *= 1024;
     }
     if (!ReadFromFile(StrCat(FPath, "type"), &info.type))
       PrintErrorAndDie("Failed to read from file ", FPath, "type");
@@ -356,6 +365,42 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
   }
   return res;
 }
+#elif BENCHMARK_OS_QNX
+std::vector<CPUInfo::CacheInfo> GetCacheSizesQNX() {
+  std::vector<CPUInfo::CacheInfo> res;
+  struct cacheattr_entry *cache = SYSPAGE_ENTRY(cacheattr);
+  uint32_t const elsize = SYSPAGE_ELEMENT_SIZE(cacheattr);
+  int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize ;
+  for(int i = 0; i < num; ++i ) {
+    CPUInfo::CacheInfo info;
+    switch (cache->flags){
+      case CACHE_FLAG_INSTR :
+        info.type = "Instruction";
+        info.level = 1;
+        break;
+      case CACHE_FLAG_DATA :
+        info.type = "Data";
+        info.level = 1;
+        break;
+      case CACHE_FLAG_UNIFIED :
+        info.type = "Unified";
+        info.level = 2;
+        break;
+      case CACHE_FLAG_SHARED :
+        info.type = "Shared";
+        info.level = 3;
+        break;
+      default :
+        continue;
+        break;
+    }
+    info.size = cache->line_size * cache->num_lines;
+    info.num_sharing = 0;
+    res.push_back(std::move(info));
+    cache = SYSPAGE_ARRAY_ADJ_OFFSET(cacheattr, cache, elsize);
+  }
+  return res;
+}
 #endif
 
 std::vector<CPUInfo::CacheInfo> GetCacheSizes() {
@@ -363,6 +408,8 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizes() {
   return GetCacheSizesMacOSX();
 #elif defined(BENCHMARK_OS_WINDOWS)
   return GetCacheSizesWindows();
+#elif defined(BENCHMARK_OS_QNX)
+  return GetCacheSizesQNX();
 #else
   return GetCacheSizesFromKVFS();
 #endif
@@ -387,9 +434,20 @@ std::string GetSystemName() {
 #endif
   return str;
 #else // defined(BENCHMARK_OS_WINDOWS)
-#ifdef BENCHMARK_OS_MACOSX //Mac Doesnt have HOST_NAME_MAX defined
+#ifndef HOST_NAME_MAX
+#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac don't have HOST_NAME_MAX defined
+#define HOST_NAME_MAX 64
+#elif defined(BENCHMARK_OS_NACL)
+#define HOST_NAME_MAX 64
+#elif defined(BENCHMARK_OS_QNX)
+#define HOST_NAME_MAX 154
+#elif defined(BENCHMARK_OS_RTEMS)
+#define HOST_NAME_MAX 256
+#else
+#warning "HOST_NAME_MAX not defined. using 64"
 #define HOST_NAME_MAX 64
 #endif
+#endif // def HOST_NAME_MAX
   char hostname[HOST_NAME_MAX];
   int retVal = gethostname(hostname, HOST_NAME_MAX);
   if (retVal != 0) return std::string("");
@@ -421,6 +479,8 @@ int GetNumCPUs() {
             strerror(errno));
   }
   return NumCPU;
+#elif defined(BENCHMARK_OS_QNX)
+  return static_cast<int>(_syspage_ptr->num_cpu);
 #else
   int NumCPUs = 0;
   int MaxID = -1;
@@ -470,7 +530,11 @@ int GetNumCPUs() {
   BENCHMARK_UNREACHABLE();
 }
 
-double GetCPUCyclesPerSecond() {
+double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
+  // Currently, scaling is only used on linux path here,
+  // suppress diagnostics about it being unused on other paths.
+  (void)scaling;
+
 #if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
   long freq;
 
@@ -481,8 +545,15 @@ double GetCPUCyclesPerSecond() {
   // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as
   // well.
   if (ReadFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)
-      // If CPU scaling is in effect, we want to use the *maximum* frequency,
-      // not whatever CPU speed some random processor happens to be using now.
+      // If CPU scaling is disabled, use the *current* frequency.
+      // Note that we specifically don't want to read cpuinfo_cur_freq,
+      // because it is only readable by root.
+      || (scaling == CPUInfo::Scaling::DISABLED &&
+          ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq",
+                       &freq))
+      // Otherwise, if CPU scaling may be in effect, we want to use
+      // the *maximum* frequency, not whatever CPU speed some random processor
+      // happens to be using now.
       || ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
                       &freq)) {
     // The value is in kHz (as the file name suggests).  For example, on a
@@ -548,6 +619,8 @@ double GetCPUCyclesPerSecond() {
       "machdep.tsc_freq";
 #elif defined BENCHMARK_OS_OPENBSD
       "hw.cpuspeed";
+#elif defined BENCHMARK_OS_DRAGONFLY
+      "hw.tsc_frequency";
 #else
       "hw.cpufrequency";
 #endif
@@ -600,6 +673,9 @@ double GetCPUCyclesPerSecond() {
   double clock_hz = knp->value.ui64;
   kstat_close(kc);
   return clock_hz;
+#elif defined (BENCHMARK_OS_QNX)
+  return static_cast<double>((int64_t)(SYSPAGE_ENTRY(cpuinfo)->speed) *
+                             (int64_t)(1000 * 1000));
 #endif
   // If we've fallen through, attempt to roughly estimate the CPU clock rate.
   const int estimate_time_ms = 1000;
@@ -609,9 +685,10 @@ double GetCPUCyclesPerSecond() {
 }
 
 std::vector<double> GetLoadAvg() {
-#if defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) || \
-    defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD ||  \
-    defined BENCHMARK_OS_OPENBSD
+#if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) ||     \
+     defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD ||      \
+     defined BENCHMARK_OS_OPENBSD || defined BENCHMARK_OS_DRAGONFLY) && \
+    !defined(__ANDROID__)
   constexpr int kMaxSamples = 3;
   std::vector<double> res(kMaxSamples, 0.0);
   const int nelem = getloadavg(res.data(), kMaxSamples);
@@ -635,12 +712,11 @@ const CPUInfo& CPUInfo::Get() {
 
 CPUInfo::CPUInfo()
     : num_cpus(GetNumCPUs()),
-      cycles_per_second(GetCPUCyclesPerSecond()),
+      scaling(CpuScaling(num_cpus)),
+      cycles_per_second(GetCPUCyclesPerSecond(scaling)),
       caches(GetCacheSizes()),
-      scaling_enabled(CpuScalingEnabled(num_cpus)),
       load_avg(GetLoadAvg()) {}
 
-
 const SystemInfo& SystemInfo::Get() {
   static const SystemInfo* info = new SystemInfo();
   return *info;

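The scaling-aware lookup above reduces to three steps: pick a cpufreq sysfs
file based on whether scaling is disabled, read a kHz value, and scale to Hz.
A minimal standalone sketch of that fallback, assuming the Linux sysfs paths
named in the patch (plain ifstream instead of the library's ReadFromFile
helper; CyclesPerSecondFromSysfs is an illustrative name, not part of the
patch):

    #include <fstream>
    #include <string>

    // Sketch only: mirrors the file selection above. With scaling disabled,
    // scaling_cur_freq is authoritative; otherwise cpuinfo_max_freq is the
    // stable choice (cpuinfo_cur_freq is readable only by root, so the
    // patch deliberately avoids it).
    static double CyclesPerSecondFromSysfs(bool scaling_disabled) {
      const std::string base = "/sys/devices/system/cpu/cpu0/cpufreq/";
      std::ifstream in(base + (scaling_disabled ? "scaling_cur_freq"
                                                : "cpuinfo_max_freq"));
      long khz = 0;
      if (in >> khz && khz > 0) return khz * 1000.0;  // file reports kHz
      return 0.0;  // caller should fall back to estimating the clock rate
    }
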
diff --git a/libcxx/utils/google-benchmark/src/thread_manager.h b/libcxx/utils/google-benchmark/src/thread_manager.h
index 6e274c7ea6bce..28e2dd53aff2c 100644
--- a/libcxx/utils/google-benchmark/src/thread_manager.h
+++ b/libcxx/utils/google-benchmark/src/thread_manager.h
@@ -11,7 +11,7 @@ namespace internal {
 
 class ThreadManager {
  public:
-  ThreadManager(int num_threads)
+  explicit ThreadManager(int num_threads)
       : alive_threads_(num_threads), start_stop_barrier_(num_threads) {}
 
   Mutex& GetBenchmarkMutex() const RETURN_CAPABILITY(benchmark_mutex_) {
@@ -38,7 +38,7 @@ class ThreadManager {
 
  public:
   struct Result {
-    int64_t iterations = 0;
+    IterationCount iterations = 0;
     double real_time_used = 0;
     double cpu_time_used = 0;
     double manual_time_used = 0;

diff --git a/libcxx/utils/google-benchmark/src/thread_timer.h b/libcxx/utils/google-benchmark/src/thread_timer.h
index eaf108e017dc5..1703ca0d6f877 100644
--- a/libcxx/utils/google-benchmark/src/thread_timer.h
+++ b/libcxx/utils/google-benchmark/src/thread_timer.h
@@ -8,14 +8,22 @@ namespace benchmark {
 namespace internal {
 
 class ThreadTimer {
+  explicit ThreadTimer(bool measure_process_cpu_time_)
+      : measure_process_cpu_time(measure_process_cpu_time_) {}
+
  public:
-  ThreadTimer() = default;
+  static ThreadTimer Create() {
+    return ThreadTimer(/*measure_process_cpu_time_=*/false);
+  }
+  static ThreadTimer CreateProcessCpuTime() {
+    return ThreadTimer(/*measure_process_cpu_time_=*/true);
+  }
 
   // Called by each thread
   void StartTimer() {
     running_ = true;
     start_real_time_ = ChronoClockNow();
-    start_cpu_time_ = ThreadCPUUsage();
+    start_cpu_time_ = ReadCpuTimerOfChoice();
   }
 
   // Called by each thread
@@ -25,7 +33,8 @@ class ThreadTimer {
     real_time_used_ += ChronoClockNow() - start_real_time_;
     // Floating point error can result in the subtraction producing a negative
     // time. Guard against that.
-    cpu_time_used_ += std::max<double>(ThreadCPUUsage() - start_cpu_time_, 0);
+    cpu_time_used_ +=
+        std::max<double>(ReadCpuTimerOfChoice() - start_cpu_time_, 0);
   }
 
   // Called by each thread
@@ -34,24 +43,32 @@ class ThreadTimer {
   bool running() const { return running_; }
 
   // REQUIRES: timer is not running
-  double real_time_used() {
+  double real_time_used() const {
     CHECK(!running_);
     return real_time_used_;
   }
 
   // REQUIRES: timer is not running
-  double cpu_time_used() {
+  double cpu_time_used() const {
     CHECK(!running_);
     return cpu_time_used_;
   }
 
   // REQUIRES: timer is not running
-  double manual_time_used() {
+  double manual_time_used() const {
     CHECK(!running_);
     return manual_time_used_;
   }
 
  private:
+  double ReadCpuTimerOfChoice() const {
+    if (measure_process_cpu_time) return ProcessCPUUsage();
+    return ThreadCPUUsage();
+  }
+
+  // Should the thread's or the whole process's CPU time be measured?
+  const bool measure_process_cpu_time;
+
   bool running_ = false;        // Is the timer running
   double start_real_time_ = 0;  // If running_
   double start_cpu_time_ = 0;   // If running_

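With the constructor private, the process-vs-thread choice can only be made
through the named factories, which keeps a bare boolean out of call sites. A
call-site sketch, assuming the internal header above is visible:

    // Sketch only: the factories replace an opaque ThreadTimer(true/false),
    // making the measurement mode readable where the timer is created.
    using benchmark::internal::ThreadTimer;
    ThreadTimer per_thread = ThreadTimer::Create();
    ThreadTimer whole_process = ThreadTimer::CreateProcessCpuTime();
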
diff --git a/libcxx/utils/google-benchmark/src/timers.cc b/libcxx/utils/google-benchmark/src/timers.cc
index 7613ff92c6ef0..af4767dff944f 100644
--- a/libcxx/utils/google-benchmark/src/timers.cc
+++ b/libcxx/utils/google-benchmark/src/timers.cc
@@ -28,7 +28,8 @@
 #include <sys/time.h>
 #include <sys/types.h>  // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
 #include <unistd.h>
-#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX
+#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_DRAGONFLY || \
+    defined BENCHMARK_OS_MACOSX
 #include <sys/sysctl.h>
 #endif
 #if defined(BENCHMARK_OS_MACOSX)
@@ -178,40 +179,75 @@ double ThreadCPUUsage() {
 #endif
 }
 
-namespace {
-
-std::string DateTimeString(bool local) {
+std::string LocalDateTimeString() {
+  // Write the local time in RFC3339 format yyyy-mm-ddTHH:MM:SS+/-HH:MM.
   typedef std::chrono::system_clock Clock;
   std::time_t now = Clock::to_time_t(Clock::now());
-  const std::size_t kStorageSize = 128;
-  char storage[kStorageSize];
-  std::size_t written;
+  const std::size_t kTzOffsetLen = 6;
+  const std::size_t kTimestampLen = 19;
+
+  std::size_t tz_len;
+  std::size_t timestamp_len;
+  long int offset_minutes;
+  char tz_offset_sign = '+';
+  // tz_offset is set in one of three ways:
+  // * strftime with %z - This either returns empty or the ISO 8601 offset.  The maximum
+  //   length an ISO 8601 offset string can be is 7 (e.g. "-03:30", plus the trailing zero).
+  // * snprintf with %c%02li:%02li - The maximum length is 41 (one for %c, up to 19 for the
+  //   first %02li, one for ':', up to 19 for the second %02li, plus the trailing zero).
+  // * A fixed string of "-00:00".  The maximum length is 7 (-00:00, plus trailing zero).
+  //
+  // Thus, the maximum size this needs to be is 41.
+  char tz_offset[41];
+  // Long enough buffer to avoid format-overflow warnings
+  char storage[128];
 
-  if (local) {
 #if defined(BENCHMARK_OS_WINDOWS)
-    written =
-        std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now));
+  std::tm *timeinfo_p = ::localtime(&now);
 #else
-    std::tm timeinfo;
-    ::localtime_r(&now, &timeinfo);
-    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
+  std::tm timeinfo;
+  std::tm *timeinfo_p = &timeinfo;
+  ::localtime_r(&now, &timeinfo);
 #endif
+
+  tz_len = std::strftime(tz_offset, sizeof(tz_offset), "%z", timeinfo_p);
+
+  if (tz_len < kTzOffsetLen && tz_len > 1) {
+    // Timezone offset was written. strftime writes offset as +HHMM or -HHMM,
+    // RFC3339 specifies an offset as +HH:MM or -HH:MM. To convert, we parse
+    // the offset as an integer, then reprint it to a string.
+
+    offset_minutes = ::strtol(tz_offset, NULL, 10);
+    if (offset_minutes < 0) {
+      offset_minutes *= -1;
+      tz_offset_sign = '-';
+    }
+
+    tz_len = ::snprintf(tz_offset, sizeof(tz_offset), "%c%02li:%02li",
+        tz_offset_sign, offset_minutes / 100, offset_minutes % 100);
+    CHECK(tz_len == kTzOffsetLen);
+    ((void)tz_len); // Prevent unused variable warning in optimized build.
   } else {
+    // Unknown offset. RFC3339 specifies that unknown local offsets should be
+    // written as UTC time with -00:00 timezone.
 #if defined(BENCHMARK_OS_WINDOWS)
-    written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now));
+    // Potential race condition if another thread calls localtime or gmtime.
+    timeinfo_p = ::gmtime(&now);
 #else
-    std::tm timeinfo;
     ::gmtime_r(&now, &timeinfo);
-    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
 #endif
+
+    strncpy(tz_offset, "-00:00", kTzOffsetLen + 1);
   }
-  CHECK(written < kStorageSize);
-  ((void)written);  // prevent unused variable in optimized mode.
-  return std::string(storage);
-}
 
-}  // end namespace
+  timestamp_len = std::strftime(storage, sizeof(storage), "%Y-%m-%dT%H:%M:%S",
+      timeinfo_p);
+  CHECK(timestamp_len == kTimestampLen);
+  // Prevent unused variable warning in optimized build.
+  ((void)kTimestampLen);
 
-std::string LocalDateTimeString() { return DateTimeString(true); }
+  std::strncat(storage, tz_offset, sizeof(storage) - timestamp_len - 1);
+  return std::string(storage);
+}
 
 }  // end namespace benchmark

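The rewritten LocalDateTimeString converts strftime's "+HHMM"/"-HHMM" offset
into RFC3339's "+HH:MM"/"-HH:MM" by parsing the offset as an integer and
reprinting it. The conversion in isolation, as a sketch (Rfc3339Offset is an
illustrative name, not a function in the patch):

    #include <cstdio>
    #include <cstdlib>

    // Sketch only: "-0330" parses to -330; strip the sign, then 330 / 100
    // gives 3 hours and 330 % 100 gives 30 minutes, printed as "-03:30".
    static void Rfc3339Offset(const char* strftime_z, char out[8]) {
      long v = std::strtol(strftime_z, nullptr, 10);
      char sign = '+';
      if (v < 0) {
        v = -v;
        sign = '-';
      }
      std::snprintf(out, 8, "%c%02li:%02li", sign, v / 100, v % 100);
    }
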
diff --git a/libcxx/utils/google-benchmark/test/AssemblyTests.cmake b/libcxx/utils/google-benchmark/test/AssemblyTests.cmake
index 8605221ff7107..3d078586f1de1 100644
--- a/libcxx/utils/google-benchmark/test/AssemblyTests.cmake
+++ b/libcxx/utils/google-benchmark/test/AssemblyTests.cmake
@@ -43,3 +43,4 @@ macro(add_filecheck_test name)
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   endforeach()
 endmacro()
+

diff --git a/libcxx/utils/google-benchmark/test/BUILD b/libcxx/utils/google-benchmark/test/BUILD
new file mode 100644
index 0000000000000..1f27f99ede9f3
--- /dev/null
+++ b/libcxx/utils/google-benchmark/test/BUILD
@@ -0,0 +1,74 @@
+TEST_COPTS = [
+    "-pedantic",
+    "-pedantic-errors",
+    "-std=c++11",
+    "-Wall",
+    "-Wextra",
+    "-Wshadow",
+    #    "-Wshorten-64-to-32",
+    "-Wfloat-equal",
+    "-fstrict-aliasing",
+]
+
+PER_SRC_COPTS = ({
+    "cxx03_test.cc": ["-std=c++03"],
+    # Some of the issues with DoNotOptimize only occur when optimization is enabled
+    "donotoptimize_test.cc": ["-O3"],
+})
+
+TEST_ARGS = ["--benchmark_min_time=0.01"]
+
+PER_SRC_TEST_ARGS = ({
+    "user_counters_tabular_test.cc": ["--benchmark_counters_tabular=true"],
+    "repetitions_test.cc": [" --benchmark_repetitions=3"],
+})
+
+load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
+
+cc_library(
+    name = "output_test_helper",
+    testonly = 1,
+    srcs = ["output_test_helper.cc"],
+    hdrs = ["output_test.h"],
+    copts = TEST_COPTS,
+    deps = [
+        "//:benchmark",
+        "//:benchmark_internal_headers",
+    ],
+)
+
+[
+    cc_test(
+        name = test_src[:-len(".cc")],
+        size = "small",
+        srcs = [test_src],
+        args = TEST_ARGS + PER_SRC_TEST_ARGS.get(test_src, []),
+        copts = TEST_COPTS + PER_SRC_COPTS.get(test_src, []),
+        deps = [
+            ":output_test_helper",
+            "//:benchmark",
+            "//:benchmark_internal_headers",
+            "@com_google_googletest//:gtest",
+        ] + (
+            ["@com_google_googletest//:gtest_main"] if (test_src[-len("gtest.cc"):] == "gtest.cc") else []
+        ),
+        # FIXME: Add support for assembly tests to bazel.
+        # See Issue #556
+        # https://github.com/google/benchmark/issues/556
+    )
+    for test_src in glob(
+        ["*test.cc"],
+        exclude = [
+            "*_assembly_test.cc",
+            "link_main_test.cc",
+        ],
+    )
+]
+
+cc_test(
+    name = "link_main_test",
+    size = "small",
+    srcs = ["link_main_test.cc"],
+    copts = TEST_COPTS,
+    deps = ["//:benchmark_main"],
+)

diff --git a/libcxx/utils/google-benchmark/test/CMakeLists.txt b/libcxx/utils/google-benchmark/test/CMakeLists.txt
index f15ce20818993..79cdf53b402c8 100644
--- a/libcxx/utils/google-benchmark/test/CMakeLists.txt
+++ b/libcxx/utils/google-benchmark/test/CMakeLists.txt
@@ -38,28 +38,28 @@ add_library(output_test_helper STATIC output_test_helper.cc output_test.h)
 
 macro(compile_benchmark_test name)
   add_executable(${name} "${name}.cc")
-  target_link_libraries(${name} benchmark ${CMAKE_THREAD_LIBS_INIT})
+  target_link_libraries(${name} benchmark::benchmark ${CMAKE_THREAD_LIBS_INIT})
 endmacro(compile_benchmark_test)
 
 macro(compile_benchmark_test_with_main name)
   add_executable(${name} "${name}.cc")
-  target_link_libraries(${name} benchmark_main)
+  target_link_libraries(${name} benchmark::benchmark_main)
 endmacro(compile_benchmark_test_with_main)
 
 macro(compile_output_test name)
   add_executable(${name} "${name}.cc" output_test.h)
-  target_link_libraries(${name} output_test_helper benchmark
+  target_link_libraries(${name} output_test_helper benchmark::benchmark
           ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endmacro(compile_output_test)
 
 # Demonstration executable
 compile_benchmark_test(benchmark_test)
-add_test(benchmark benchmark_test --benchmark_min_time=0.01)
+add_test(NAME benchmark COMMAND benchmark_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(filter_test)
 macro(add_filter_test name filter expect)
-  add_test(${name} filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect})
-  add_test(${name}_list_only filter_test --benchmark_list_tests --benchmark_filter=${filter} ${expect})
+  add_test(NAME ${name} COMMAND filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect})
+  add_test(NAME ${name}_list_only COMMAND filter_test --benchmark_list_tests --benchmark_filter=${filter} ${expect})
 endmacro(add_filter_test)
 
 add_filter_test(filter_simple "Foo" 3)
@@ -82,16 +82,19 @@ add_filter_test(filter_regex_end ".*Ba$" 1)
 add_filter_test(filter_regex_end_negative "-.*Ba$" 4)
 
 compile_benchmark_test(options_test)
-add_test(options_benchmarks options_test --benchmark_min_time=0.01)
+add_test(NAME options_benchmarks COMMAND options_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(basic_test)
-add_test(basic_benchmark basic_test --benchmark_min_time=0.01)
+add_test(NAME basic_benchmark COMMAND basic_test --benchmark_min_time=0.01)
+
+compile_output_test(repetitions_test)
+add_test(NAME repetitions_benchmark COMMAND repetitions_test --benchmark_min_time=0.01 --benchmark_repetitions=3)
 
 compile_benchmark_test(diagnostics_test)
-add_test(diagnostics_test diagnostics_test --benchmark_min_time=0.01)
+add_test(NAME diagnostics_test COMMAND diagnostics_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(skip_with_error_test)
-add_test(skip_with_error_test skip_with_error_test --benchmark_min_time=0.01)
+add_test(NAME skip_with_error_test COMMAND skip_with_error_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(donotoptimize_test)
 # Some of the issues with DoNotOptimize only occur when optimization is enabled
@@ -99,53 +102,63 @@ check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG)
 if (BENCHMARK_HAS_O3_FLAG)
   set_target_properties(donotoptimize_test PROPERTIES COMPILE_FLAGS "-O3")
 endif()
-add_test(donotoptimize_test donotoptimize_test --benchmark_min_time=0.01)
+add_test(NAME donotoptimize_test COMMAND donotoptimize_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(fixture_test)
-add_test(fixture_test fixture_test --benchmark_min_time=0.01)
+add_test(NAME fixture_test COMMAND fixture_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(register_benchmark_test)
-add_test(register_benchmark_test register_benchmark_test --benchmark_min_time=0.01)
+add_test(NAME register_benchmark_test COMMAND register_benchmark_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(map_test)
-add_test(map_test map_test --benchmark_min_time=0.01)
+add_test(NAME map_test COMMAND map_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(multiple_ranges_test)
-add_test(multiple_ranges_test multiple_ranges_test --benchmark_min_time=0.01)
+add_test(NAME multiple_ranges_test COMMAND multiple_ranges_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(args_product_test)
+add_test(NAME args_product_test COMMAND args_product_test --benchmark_min_time=0.01)
 
 compile_benchmark_test_with_main(link_main_test)
-add_test(link_main_test link_main_test --benchmark_min_time=0.01)
+add_test(NAME link_main_test COMMAND link_main_test --benchmark_min_time=0.01)
 
 compile_output_test(reporter_output_test)
-add_test(reporter_output_test reporter_output_test --benchmark_min_time=0.01)
+add_test(NAME reporter_output_test COMMAND reporter_output_test --benchmark_min_time=0.01)
 
 compile_output_test(templated_fixture_test)
-add_test(templated_fixture_test templated_fixture_test --benchmark_min_time=0.01)
+add_test(NAME templated_fixture_test COMMAND templated_fixture_test --benchmark_min_time=0.01)
 
 compile_output_test(user_counters_test)
-add_test(user_counters_test user_counters_test --benchmark_min_time=0.01)
+add_test(NAME user_counters_test COMMAND user_counters_test --benchmark_min_time=0.01)
+
+compile_output_test(perf_counters_test)
+add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time=0.01 --benchmark_perf_counters=CYCLES,BRANCHES)
+
+compile_output_test(internal_threading_test)
+add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01)
 
 compile_output_test(report_aggregates_only_test)
-add_test(report_aggregates_only_test report_aggregates_only_test --benchmark_min_time=0.01)
+add_test(NAME report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01)
 
 compile_output_test(display_aggregates_only_test)
-add_test(display_aggregates_only_test display_aggregates_only_test --benchmark_min_time=0.01)
+add_test(NAME display_aggregates_only_test COMMAND display_aggregates_only_test --benchmark_min_time=0.01)
 
 compile_output_test(user_counters_tabular_test)
-add_test(user_counters_tabular_test user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01)
+add_test(NAME user_counters_tabular_test COMMAND user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01)
 
 compile_output_test(user_counters_thousands_test)
-add_test(user_counters_thousands_test user_counters_thousands_test --benchmark_min_time=0.01)
+add_test(NAME user_counters_thousands_test COMMAND user_counters_thousands_test --benchmark_min_time=0.01)
 
 compile_output_test(memory_manager_test)
-add_test(memory_manager_test memory_manager_test --benchmark_min_time=0.01)
+add_test(NAME memory_manager_test COMMAND memory_manager_test --benchmark_min_time=0.01)
 
 check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG)
 if (BENCHMARK_HAS_CXX03_FLAG)
   compile_benchmark_test(cxx03_test)
   set_target_properties(cxx03_test
       PROPERTIES
-      COMPILE_FLAGS "-std=c++03")
+      CXX_STANDARD 98
+      CXX_STANDARD_REQUIRED YES)
   # libstdc++ provides different definitions within <map> between dialects. When
   # LTO is enabled and -Werror is specified GCC diagnoses this ODR violation
   # causing the test to fail to compile. To prevent this we explicitly disable
@@ -156,7 +169,7 @@ if (BENCHMARK_HAS_CXX03_FLAG)
         PROPERTIES
         LINK_FLAGS "-Wno-odr")
   endif()
-  add_test(cxx03 cxx03_test --benchmark_min_time=0.01)
+  add_test(NAME cxx03 COMMAND cxx03_test --benchmark_min_time=0.01)
 endif()
 
 # Attempt to work around flaky test failures when running on Appveyor servers.
@@ -166,7 +179,7 @@ else()
   set(COMPLEXITY_MIN_TIME "0.01")
 endif()
 compile_output_test(complexity_test)
-add_test(complexity_benchmark complexity_test --benchmark_min_time=${COMPLEXITY_MIN_TIME})
+add_test(NAME complexity_benchmark COMMAND complexity_test --benchmark_min_time=${COMPLEXITY_MIN_TIME})
 
 ###############################################################################
 # GoogleTest Unit Tests
@@ -175,24 +188,22 @@ add_test(complexity_benchmark complexity_test --benchmark_min_time=${COMPLEXITY_
 if (BENCHMARK_ENABLE_GTEST_TESTS)
   macro(compile_gtest name)
     add_executable(${name} "${name}.cc")
-    if (TARGET googletest)
-      add_dependencies(${name} googletest)
-    endif()
-    if (GTEST_INCLUDE_DIRS)
-      target_include_directories(${name} PRIVATE ${GTEST_INCLUDE_DIRS})
-    endif()
-    target_link_libraries(${name} benchmark
-        ${GTEST_BOTH_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+    target_link_libraries(${name} benchmark::benchmark
+        gmock_main ${CMAKE_THREAD_LIBS_INIT})
   endmacro(compile_gtest)
 
   macro(add_gtest name)
     compile_gtest(${name})
-    add_test(${name} ${name})
+    add_test(NAME ${name} COMMAND ${name})
   endmacro()
 
   add_gtest(benchmark_gtest)
+  add_gtest(benchmark_name_gtest)
+  add_gtest(benchmark_random_interleaving_gtest)
+  add_gtest(commandlineflags_gtest)
   add_gtest(statistics_gtest)
   add_gtest(string_util_gtest)
+  add_gtest(perf_counters_gtest)
 endif(BENCHMARK_ENABLE_GTEST_TESTS)
 
 ###############################################################################

diff --git a/libcxx/utils/google-benchmark/test/args_product_test.cc b/libcxx/utils/google-benchmark/test/args_product_test.cc
new file mode 100644
index 0000000000000..32a75d50dd9e2
--- /dev/null
+++ b/libcxx/utils/google-benchmark/test/args_product_test.cc
@@ -0,0 +1,77 @@
+#include "benchmark/benchmark.h"
+
+#include <cassert>
+#include <iostream>
+#include <set>
+#include <vector>
+
+class ArgsProductFixture : public ::benchmark::Fixture {
+ public:
+  ArgsProductFixture()
+      : expectedValues({{0, 100, 2000, 30000},
+                        {1, 15, 3, 8},
+                        {1, 15, 3, 9},
+                        {1, 15, 7, 8},
+                        {1, 15, 7, 9},
+                        {1, 15, 10, 8},
+                        {1, 15, 10, 9},
+                        {2, 15, 3, 8},
+                        {2, 15, 3, 9},
+                        {2, 15, 7, 8},
+                        {2, 15, 7, 9},
+                        {2, 15, 10, 8},
+                        {2, 15, 10, 9},
+                        {4, 5, 6, 11}}) {}
+
+  void SetUp(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
+    std::vector<int64_t> ranges = {state.range(0), state.range(1),
+                                   state.range(2), state.range(3)};
+
+    assert(expectedValues.find(ranges) != expectedValues.end());
+
+    actualValues.insert(ranges);
+  }
+
+  // NOTE: This is not TearDown as we want to check after _all_ runs are
+  // complete.
+  virtual ~ArgsProductFixture() {
+    if (actualValues != expectedValues) {
+      std::cout << "EXPECTED\n";
+      for (auto v : expectedValues) {
+        std::cout << "{";
+        for (int64_t iv : v) {
+          std::cout << iv << ", ";
+        }
+        std::cout << "}\n";
+      }
+      std::cout << "ACTUAL\n";
+      for (auto v : actualValues) {
+        std::cout << "{";
+        for (int64_t iv : v) {
+          std::cout << iv << ", ";
+        }
+        std::cout << "}\n";
+      }
+    }
+  }
+
+  std::set<std::vector<int64_t>> expectedValues;
+  std::set<std::vector<int64_t>> actualValues;
+};
+
+BENCHMARK_DEFINE_F(ArgsProductFixture, Empty)(benchmark::State& state) {
+  for (auto _ : state) {
+    int64_t product =
+        state.range(0) * state.range(1) * state.range(2) * state.range(3);
+    for (int64_t x = 0; x < product; x++) {
+      benchmark::DoNotOptimize(x);
+    }
+  }
+}
+
+BENCHMARK_REGISTER_F(ArgsProductFixture, Empty)
+    ->Args({0, 100, 2000, 30000})
+    ->ArgsProduct({{1, 2}, {15}, {3, 7, 10}, {8, 9}})
+    ->Args({4, 5, 6, 11});
+
+BENCHMARK_MAIN();

diff --git a/libcxx/utils/google-benchmark/test/basic_test.cc b/libcxx/utils/google-benchmark/test/basic_test.cc
index d07fbc00b1516..33642211e2058 100644
--- a/libcxx/utils/google-benchmark/test/basic_test.cc
+++ b/libcxx/utils/google-benchmark/test/basic_test.cc
@@ -98,7 +98,7 @@ BENCHMARK(BM_empty_stop_start)->ThreadPerCpu();
 
 
 void BM_KeepRunning(benchmark::State& state) {
-  size_t iter_count = 0;
+  benchmark::IterationCount iter_count = 0;
   assert(iter_count == state.iterations());
   while (state.KeepRunning()) {
     ++iter_count;
@@ -108,18 +108,33 @@ void BM_KeepRunning(benchmark::State& state) {
 BENCHMARK(BM_KeepRunning);
 
 void BM_KeepRunningBatch(benchmark::State& state) {
-  // Choose a prime batch size to avoid evenly dividing max_iterations.
-  const size_t batch_size = 101;
-  size_t iter_count = 0;
+  // Choose a batch size >1000 to skip the typical runs with iteration
+  // targets of 10, 100 and 1000.  If these are not actually skipped, the
+  // bug would be detectable as consecutive runs with the same iteration
+  // count.  Below we assert that this does not happen.
+  const benchmark::IterationCount batch_size = 1009;
+
+  static benchmark::IterationCount prior_iter_count = 0;
+  benchmark::IterationCount iter_count = 0;
   while (state.KeepRunningBatch(batch_size)) {
     iter_count += batch_size;
   }
   assert(state.iterations() == iter_count);
+
+  // Verify that the iteration count always increases across runs (see
+  // comment above).
+  assert(iter_count == batch_size            // max_iterations == 1
+         || iter_count > prior_iter_count);  // max_iterations > batch_size
+  prior_iter_count = iter_count;
 }
-BENCHMARK(BM_KeepRunningBatch);
+// Register with a fixed repetition count to establish the invariant that
+// the iteration count should always change across runs.  This overrides
+// the --benchmark_repetitions command line flag, which would otherwise
+// cause this test to fail if set > 1.
+BENCHMARK(BM_KeepRunningBatch)->Repetitions(1);
 
 void BM_RangedFor(benchmark::State& state) {
-  size_t iter_count = 0;
+  benchmark::IterationCount iter_count = 0;
   for (auto _ : state) {
     ++iter_count;
   }

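The batch test relies on KeepRunningBatch rounding each run's iteration
budget up to a multiple of the batch size, so with a batch larger than the
10/100/1000 warm-up targets no two consecutive runs can end on the same
count. Usage in its simplest form (BM_Batched is a hypothetical benchmark,
not part of the patch):

    // Sketch only: iterations are consumed batch_size at a time, and
    // state.iterations() ends up equal to the sum of the accepted batches.
    static void BM_Batched(benchmark::State& state) {
      while (state.KeepRunningBatch(1009)) {
        for (int i = 0; i < 1009; ++i) benchmark::DoNotOptimize(i);
      }
    }
    BENCHMARK(BM_Batched);
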
diff --git a/libcxx/utils/google-benchmark/test/benchmark_gtest.cc b/libcxx/utils/google-benchmark/test/benchmark_gtest.cc
index 10683b433ab54..14a885ba46da4 100644
--- a/libcxx/utils/google-benchmark/test/benchmark_gtest.cc
+++ b/libcxx/utils/google-benchmark/test/benchmark_gtest.cc
@@ -1,9 +1,15 @@
+#include <map>
+#include <string>
 #include <vector>
 
 #include "../src/benchmark_register.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace benchmark {
+namespace internal {
+extern std::map<std::string, std::string>* global_context;
+
 namespace {
 
 TEST(AddRangeTest, Simple) {
@@ -30,4 +36,130 @@ TEST(AddRangeTest, Advanced64) {
   EXPECT_THAT(dst, testing::ElementsAre(5, 8, 15));
 }
 
-}  // end namespace
+TEST(AddRangeTest, FullRange8) {
+  std::vector<int8_t> dst;
+  AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), 8);
+  EXPECT_THAT(dst, testing::ElementsAre(1, 8, 64, 127));
+}
+
+TEST(AddRangeTest, FullRange64) {
+  std::vector<int64_t> dst;
+  AddRange(&dst, int64_t{1}, std::numeric_limits<int64_t>::max(), 1024);
+  EXPECT_THAT(
+      dst, testing::ElementsAre(1LL, 1024LL, 1048576LL, 1073741824LL,
+                                1099511627776LL, 1125899906842624LL,
+                                1152921504606846976LL, 9223372036854775807LL));
+}
+
+TEST(AddRangeTest, NegativeRanges) {
+  std::vector<int> dst;
+  AddRange(&dst, -8, 0, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-8, -4, -2, -1, 0));
+}
+
+TEST(AddRangeTest, StrictlyNegative) {
+  std::vector<int> dst;
+  AddRange(&dst, -8, -1, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-8, -4, -2, -1));
+}
+
+TEST(AddRangeTest, SymmetricNegativeRanges) {
+  std::vector<int> dst;
+  AddRange(&dst, -8, 8, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-8, -4, -2, -1, 0, 1, 2, 4, 8));
+}
+
+TEST(AddRangeTest, SymmetricNegativeRangesOddMult) {
+  std::vector<int> dst;
+  AddRange(&dst, -30, 32, 5);
+  EXPECT_THAT(dst, testing::ElementsAre(-30, -25, -5, -1, 0, 1, 5, 25, 32));
+}
+
+TEST(AddRangeTest, NegativeRangesAsymmetric) {
+  std::vector<int> dst;
+  AddRange(&dst, -3, 5, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-3, -2, -1, 0, 1, 2, 4, 5));
+}
+
+TEST(AddRangeTest, NegativeRangesLargeStep) {
+  // Always include -1, 0, 1 when crossing zero.
+  std::vector<int> dst;
+  AddRange(&dst, -8, 8, 10);
+  EXPECT_THAT(dst, testing::ElementsAre(-8, -1, 0, 1, 8));
+}
+
+TEST(AddRangeTest, ZeroOnlyRange) {
+  std::vector<int> dst;
+  AddRange(&dst, 0, 0, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(0));
+}
+
+TEST(AddRangeTest, ZeroStartingRange) {
+  std::vector<int> dst;
+  AddRange(&dst, 0, 2, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(0, 1, 2));
+}
+
+TEST(AddRangeTest, NegativeRange64) {
+  std::vector<int64_t> dst;
+  AddRange<int64_t>(&dst, -4, 4, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-4, -2, -1, 0, 1, 2, 4));
+}
+
+TEST(AddRangeTest, NegativeRangePreservesExistingOrder) {
+  // If elements already exist in the range, ensure we don't change
+  // their ordering by adding negative values.
+  std::vector<int64_t> dst = {1, 2, 3};
+  AddRange<int64_t>(&dst, -2, 2, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(1, 2, 3, -2, -1, 0, 1, 2));
+}
+
+TEST(AddRangeTest, FullNegativeRange64) {
+  std::vector<int64_t> dst;
+  const auto min = std::numeric_limits<int64_t>::min();
+  const auto max = std::numeric_limits<int64_t>::max();
+  AddRange(&dst, min, max, 1024);
+  EXPECT_THAT(
+      dst, testing::ElementsAreArray(std::vector<int64_t>{
+               min, -1152921504606846976LL, -1125899906842624LL,
+               -1099511627776LL, -1073741824LL, -1048576LL, -1024LL, -1LL, 0LL,
+               1LL, 1024LL, 1048576LL, 1073741824LL, 1099511627776LL,
+               1125899906842624LL, 1152921504606846976LL, max}));
+}
+
+TEST(AddRangeTest, Simple8) {
+  std::vector<int8_t> dst;
+  AddRange<int8_t>(&dst, 1, 8, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(1, 2, 4, 8));
+}
+
+TEST(AddCustomContext, Simple) {
+  EXPECT_THAT(global_context, nullptr);
+
+  AddCustomContext("foo", "bar");
+  AddCustomContext("baz", "qux");
+
+  EXPECT_THAT(*global_context,
+              testing::UnorderedElementsAre(testing::Pair("foo", "bar"),
+                                            testing::Pair("baz", "qux")));
+
+  delete global_context;
+  global_context = nullptr;
+}
+
+TEST(AddCustomContext, DuplicateKey) {
+  EXPECT_THAT(global_context, nullptr);
+
+  AddCustomContext("foo", "bar");
+  AddCustomContext("foo", "qux");
+
+  EXPECT_THAT(*global_context,
+              testing::UnorderedElementsAre(testing::Pair("foo", "bar")));
+
+  delete global_context;
+  global_context = nullptr;
+}
+
+}  // namespace
+}  // namespace internal
+}  // namespace benchmark

diff --git a/libcxx/utils/google-benchmark/test/benchmark_name_gtest.cc b/libcxx/utils/google-benchmark/test/benchmark_name_gtest.cc
new file mode 100644
index 0000000000000..afb401c1f5328
--- /dev/null
+++ b/libcxx/utils/google-benchmark/test/benchmark_name_gtest.cc
@@ -0,0 +1,74 @@
+#include "benchmark/benchmark.h"
+#include "gtest/gtest.h"
+
+namespace {
+
+using namespace benchmark;
+using namespace benchmark::internal;
+
+TEST(BenchmarkNameTest, Empty) {
+  const auto name = BenchmarkName();
+  EXPECT_EQ(name.str(), std::string());
+}
+
+TEST(BenchmarkNameTest, FunctionName) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  EXPECT_EQ(name.str(), "function_name");
+}
+
+TEST(BenchmarkNameTest, FunctionNameAndArgs) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.args = "some_args:3/4/5";
+  EXPECT_EQ(name.str(), "function_name/some_args:3/4/5");
+}
+
+TEST(BenchmarkNameTest, MinTime) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.args = "some_args:3/4";
+  name.min_time = "min_time:3.4s";
+  EXPECT_EQ(name.str(), "function_name/some_args:3/4/min_time:3.4s");
+}
+
+TEST(BenchmarkNameTest, Iterations) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.min_time = "min_time:3.4s";
+  name.iterations = "iterations:42";
+  EXPECT_EQ(name.str(), "function_name/min_time:3.4s/iterations:42");
+}
+
+TEST(BenchmarkNameTest, Repetitions) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.min_time = "min_time:3.4s";
+  name.repetitions = "repetitions:24";
+  EXPECT_EQ(name.str(), "function_name/min_time:3.4s/repetitions:24");
+}
+
+TEST(BenchmarkNameTest, TimeType) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.min_time = "min_time:3.4s";
+  name.time_type = "hammer_time";
+  EXPECT_EQ(name.str(), "function_name/min_time:3.4s/hammer_time");
+}
+
+TEST(BenchmarkNameTest, Threads) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.min_time = "min_time:3.4s";
+  name.threads = "threads:256";
+  EXPECT_EQ(name.str(), "function_name/min_time:3.4s/threads:256");
+}
+
+TEST(BenchmarkNameTest, TestEmptyFunctionName) {
+  auto name = BenchmarkName();
+  name.args = "first:3/second:4";
+  name.threads = "threads:22";
+  EXPECT_EQ(name.str(), "first:3/second:4/threads:22");
+}
+
+}  // end namespace

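The expected strings above pin BenchmarkName::str() down as a '/'-join that
skips empty components. A sketch of that rule (JoinNameParts is illustrative,
not the library's helper):

    #include <string>
    #include <vector>

    // Sketch only: concatenate the non-empty parts with '/'.
    static std::string JoinNameParts(const std::vector<std::string>& parts) {
      std::string out;
      for (const std::string& p : parts) {
        if (p.empty()) continue;
        if (!out.empty()) out += '/';
        out += p;
      }
      return out;
    }
    // JoinNameParts({"", "first:3/second:4", "", "threads:22"}) yields
    // "first:3/second:4/threads:22", matching TestEmptyFunctionName above.
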
diff --git a/libcxx/utils/google-benchmark/test/benchmark_random_interleaving_gtest.cc b/libcxx/utils/google-benchmark/test/benchmark_random_interleaving_gtest.cc
new file mode 100644
index 0000000000000..8e28dab3f41d3
--- /dev/null
+++ b/libcxx/utils/google-benchmark/test/benchmark_random_interleaving_gtest.cc
@@ -0,0 +1,126 @@
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "../src/commandlineflags.h"
+#include "../src/string_util.h"
+#include "benchmark/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+DECLARE_bool(benchmark_enable_random_interleaving);
+DECLARE_string(benchmark_filter);
+DECLARE_int32(benchmark_repetitions);
+
+namespace benchmark {
+namespace internal {
+namespace {
+
+class EventQueue : public std::queue<std::string> {
+ public:
+  void Put(const std::string& event) { push(event); }
+
+  void Clear() {
+    while (!empty()) {
+      pop();
+    }
+  }
+
+  std::string Get() {
+    std::string event = front();
+    pop();
+    return event;
+  }
+};
+
+static EventQueue* queue = new EventQueue;
+
+class NullReporter : public BenchmarkReporter {
+ public:
+  bool ReportContext(const Context& /*context*/) override { return true; }
+  void ReportRuns(const std::vector<Run>& /* report */) override {}
+};
+
+class BenchmarkTest : public testing::Test {
+ public:
+  static void SetupHook(int /* num_threads */) { queue->push("Setup"); }
+
+  static void TeardownHook(int /* num_threads */) { queue->push("Teardown"); }
+
+  void Execute(const std::string& pattern) {
+    queue->Clear();
+
+    BenchmarkReporter* reporter = new NullReporter;
+    FLAGS_benchmark_filter = pattern;
+    RunSpecifiedBenchmarks(reporter);
+    delete reporter;
+
+    queue->Put("DONE");  // End marker
+  }
+};
+
+static void BM_Match1(benchmark::State& state) {
+  const int64_t arg = state.range(0);
+
+  for (auto _ : state) {
+  }
+  queue->Put(StrFormat("BM_Match1/%d", static_cast<int>(arg)));
+}
+BENCHMARK(BM_Match1)
+    ->Iterations(100)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(3)
+    ->Range(10, 80)
+    ->Args({90})
+    ->Args({100});
+
+TEST_F(BenchmarkTest, Match1) {
+  Execute("BM_Match1");
+  ASSERT_EQ("BM_Match1/1", queue->Get());
+  ASSERT_EQ("BM_Match1/2", queue->Get());
+  ASSERT_EQ("BM_Match1/3", queue->Get());
+  ASSERT_EQ("BM_Match1/10", queue->Get());
+  ASSERT_EQ("BM_Match1/64", queue->Get());
+  ASSERT_EQ("BM_Match1/80", queue->Get());
+  ASSERT_EQ("BM_Match1/90", queue->Get());
+  ASSERT_EQ("BM_Match1/100", queue->Get());
+  ASSERT_EQ("DONE", queue->Get());
+}
+
+TEST_F(BenchmarkTest, Match1WithRepetition) {
+  FLAGS_benchmark_repetitions = 2;
+
+  Execute("BM_Match1/(64|80)");
+  ASSERT_EQ("BM_Match1/64", queue->Get());
+  ASSERT_EQ("BM_Match1/64", queue->Get());
+  ASSERT_EQ("BM_Match1/80", queue->Get());
+  ASSERT_EQ("BM_Match1/80", queue->Get());
+  ASSERT_EQ("DONE", queue->Get());
+}
+
+TEST_F(BenchmarkTest, Match1WithRandomInterleaving) {
+  FLAGS_benchmark_enable_random_interleaving = true;
+  FLAGS_benchmark_repetitions = 100;
+
+  std::map<std::string, int> element_count;
+  std::map<std::string, int> interleaving_count;
+  Execute("BM_Match1/(64|80)");
+  for (int i = 0; i < 100; ++i) {
+    std::vector<std::string> interleaving;
+    interleaving.push_back(queue->Get());
+    interleaving.push_back(queue->Get());
+    element_count[interleaving[0].c_str()]++;
+    element_count[interleaving[1].c_str()]++;
+    interleaving_count[StrFormat("%s,%s", interleaving[0].c_str(),
+                                 interleaving[1].c_str())]++;
+  }
+  EXPECT_EQ(element_count["BM_Match1/64"], 100) << "Unexpected repetitions.";
+  EXPECT_EQ(element_count["BM_Match1/80"], 100) << "Unexpected repetitions.";
+  EXPECT_GE(interleaving_count.size(), 2) << "Interleaving was not randomized.";
+  ASSERT_EQ("DONE", queue->Get());
+}
+
+}  // namespace
+}  // namespace internal
+}  // namespace benchmark

diff --git a/libcxx/utils/google-benchmark/test/commandlineflags_gtest.cc b/libcxx/utils/google-benchmark/test/commandlineflags_gtest.cc
new file mode 100644
index 0000000000000..8412008ffe359
--- /dev/null
+++ b/libcxx/utils/google-benchmark/test/commandlineflags_gtest.cc
@@ -0,0 +1,228 @@
+#include <cstdlib>
+
+#include "../src/commandlineflags.h"
+#include "../src/internal_macros.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace benchmark {
+namespace {
+
+#if defined(BENCHMARK_OS_WINDOWS)
+int setenv(const char* name, const char* value, int overwrite) {
+  if (!overwrite) {
+    // NOTE: getenv_s is far superior but not available under mingw.
+    char* env_value = getenv(name);
+    if (env_value == nullptr) {
+      return -1;
+    }
+  }
+  return _putenv_s(name, value);
+}
+
+int unsetenv(const char* name) { return _putenv_s(name, ""); }
+
+#endif  // BENCHMARK_OS_WINDOWS
+
+TEST(BoolFromEnv, Default) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_EQ(BoolFromEnv("not_in_env", true), true);
+}
+
+TEST(BoolFromEnv, False) {
+  ASSERT_EQ(setenv("IN_ENV", "0", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "N", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "n", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "NO", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "No", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "no", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "F", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "f", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "FALSE", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "False", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "false", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "OFF", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "Off", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "off", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+}
+
+TEST(BoolFromEnv, True) {
+  ASSERT_EQ(setenv("IN_ENV", "1", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "Y", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "y", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "YES", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "Yes", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "yes", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "T", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "t", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "TRUE", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "True", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "true", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "ON", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "On", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "on", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+#ifndef BENCHMARK_OS_WINDOWS
+  ASSERT_EQ(setenv("IN_ENV", "", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+#endif
+}
+
+TEST(Int32FromEnv, NotInEnv) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_EQ(Int32FromEnv("not_in_env", 42), 42);
+}
+
+TEST(Int32FromEnv, InvalidInteger) {
+  ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+  EXPECT_EQ(Int32FromEnv("in_env", 42), 42);
+  unsetenv("IN_ENV");
+}
+
+TEST(Int32FromEnv, ValidInteger) {
+  ASSERT_EQ(setenv("IN_ENV", "42", 1), 0);
+  EXPECT_EQ(Int32FromEnv("in_env", 64), 42);
+  unsetenv("IN_ENV");
+}
+
+TEST(DoubleFromEnv, NotInEnv) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_EQ(DoubleFromEnv("not_in_env", 0.51), 0.51);
+}
+
+TEST(DoubleFromEnv, InvalidReal) {
+  ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+  EXPECT_EQ(DoubleFromEnv("in_env", 0.51), 0.51);
+  unsetenv("IN_ENV");
+}
+
+TEST(DoubleFromEnv, ValidReal) {
+  ASSERT_EQ(setenv("IN_ENV", "0.51", 1), 0);
+  EXPECT_EQ(DoubleFromEnv("in_env", 0.71), 0.51);
+  unsetenv("IN_ENV");
+}
+
+TEST(StringFromEnv, Default) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_STREQ(StringFromEnv("not_in_env", "foo"), "foo");
+}
+
+TEST(StringFromEnv, Valid) {
+  ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+  EXPECT_STREQ(StringFromEnv("in_env", "bar"), "foo");
+  unsetenv("IN_ENV");
+}
+
+TEST(KvPairsFromEnv, Default) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_THAT(KvPairsFromEnv("not_in_env", {{"foo", "bar"}}),
+              testing::ElementsAre(testing::Pair("foo", "bar")));
+}
+
+TEST(KvPairsFromEnv, MalformedReturnsDefault) {
+  ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+  EXPECT_THAT(KvPairsFromEnv("in_env", {{"foo", "bar"}}),
+              testing::ElementsAre(testing::Pair("foo", "bar")));
+  unsetenv("IN_ENV");
+}
+
+TEST(KvPairsFromEnv, Single) {
+  ASSERT_EQ(setenv("IN_ENV", "foo=bar", 1), 0);
+  EXPECT_THAT(KvPairsFromEnv("in_env", {}),
+              testing::ElementsAre(testing::Pair("foo", "bar")));
+  unsetenv("IN_ENV");
+}
+
+TEST(KvPairsFromEnv, Multiple) {
+  ASSERT_EQ(setenv("IN_ENV", "foo=bar,baz=qux", 1), 0);
+  EXPECT_THAT(KvPairsFromEnv("in_env", {}),
+              testing::UnorderedElementsAre(testing::Pair("foo", "bar"),
+                                            testing::Pair("baz", "qux")));
+  unsetenv("IN_ENV");
+}
+
+}  // namespace
+}  // namespace benchmark

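Taken together, the Bool tests pin down an acceptance rule: an unset variable
yields the default, "0" and the case-insensitive spellings of n/no/f/false/off
yield false, and anything else (even the empty string, on POSIX) yields true.
A sketch of that rule as inferred from the tests (not the library's code, and
ignoring the prefixing the real flags layer applies to variable names):

    #include <algorithm>
    #include <cctype>
    #include <cstdlib>
    #include <string>

    // Sketch only: reproduces the truth table the tests above establish.
    static bool BoolFromEnvSketch(const char* var, bool default_value) {
      const char* v = std::getenv(var);
      if (v == nullptr) return default_value;
      std::string s(v);
      std::transform(s.begin(), s.end(), s.begin(),
                     [](unsigned char c) { return char(std::tolower(c)); });
      return !(s == "0" || s == "n" || s == "no" || s == "f" ||
               s == "false" || s == "off");
    }
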
diff --git a/libcxx/utils/google-benchmark/test/complexity_test.cc b/libcxx/utils/google-benchmark/test/complexity_test.cc
index 323ddfe7ac595..0de73c5722b51 100644
--- a/libcxx/utils/google-benchmark/test/complexity_test.cc
+++ b/libcxx/utils/google-benchmark/test/complexity_test.cc
@@ -13,7 +13,8 @@ namespace {
   int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
 
 int AddComplexityTest(std::string test_name, std::string big_o_test_name,
-                      std::string rms_test_name, std::string big_o) {
+                      std::string rms_test_name, std::string big_o,
+                      int family_index) {
   SetSubstitutions({{"%name", test_name},
                     {"%bigo_name", big_o_test_name},
                     {"%rms_name", rms_test_name},
@@ -25,21 +26,31 @@ int AddComplexityTest(std::string test_name, std::string big_o_test_name,
       {{"^%bigo_name %bigo_str %bigo_str[ ]*$"},
        {"^%bigo_name", MR_Not},  // Assert we we didn't only matched a name.
        {"^%rms_name %rms %rms[ ]*$", MR_Next}});
-  AddCases(TC_JSONOut, {{"\"name\": \"%bigo_name\",$"},
-                        {"\"run_name\": \"%name\",$", MR_Next},
-                        {"\"run_type\": \"aggregate\",$", MR_Next},
-                        {"\"aggregate_name\": \"BigO\",$", MR_Next},
-                        {"\"cpu_coefficient\": %float,$", MR_Next},
-                        {"\"real_coefficient\": %float,$", MR_Next},
-                        {"\"big_o\": \"%bigo\",$", MR_Next},
-                        {"\"time_unit\": \"ns\"$", MR_Next},
-                        {"}", MR_Next},
-                        {"\"name\": \"%rms_name\",$"},
-                        {"\"run_name\": \"%name\",$", MR_Next},
-                        {"\"run_type\": \"aggregate\",$", MR_Next},
-                        {"\"aggregate_name\": \"RMS\",$", MR_Next},
-                        {"\"rms\": %float$", MR_Next},
-                        {"}", MR_Next}});
+  AddCases(
+      TC_JSONOut,
+      {{"\"name\": \"%bigo_name\",$"},
+       {"\"family_index\": " + std::to_string(family_index) + ",$", MR_Next},
+       {"\"per_family_instance_index\": 0,$", MR_Next},
+       {"\"run_name\": \"%name\",$", MR_Next},
+       {"\"run_type\": \"aggregate\",$", MR_Next},
+       {"\"repetitions\": %int,$", MR_Next},
+       {"\"threads\": 1,$", MR_Next},
+       {"\"aggregate_name\": \"BigO\",$", MR_Next},
+       {"\"cpu_coefficient\": %float,$", MR_Next},
+       {"\"real_coefficient\": %float,$", MR_Next},
+       {"\"big_o\": \"%bigo\",$", MR_Next},
+       {"\"time_unit\": \"ns\"$", MR_Next},
+       {"}", MR_Next},
+       {"\"name\": \"%rms_name\",$"},
+       {"\"family_index\": " + std::to_string(family_index) + ",$", MR_Next},
+       {"\"per_family_instance_index\": 0,$", MR_Next},
+       {"\"run_name\": \"%name\",$", MR_Next},
+       {"\"run_type\": \"aggregate\",$", MR_Next},
+       {"\"repetitions\": %int,$", MR_Next},
+       {"\"threads\": 1,$", MR_Next},
+       {"\"aggregate_name\": \"RMS\",$", MR_Next},
+       {"\"rms\": %float$", MR_Next},
+       {"}", MR_Next}});
   AddCases(TC_CSVOut, {{"^\"%bigo_name\",,%float,%float,%bigo,,,,,$"},
                        {"^\"%bigo_name\"", MR_Not},
                        {"^\"%rms_name\",,%float,%float,,,,,,$", MR_Next}});
@@ -62,9 +73,9 @@ void BM_Complexity_O1(benchmark::State& state) {
 }
 BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1);
 BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity();
-BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity([](int64_t) {
-  return 1.0;
-});
+BENCHMARK(BM_Complexity_O1)
+    ->Range(1, 1 << 18)
+    ->Complexity([](benchmark::IterationCount) { return 1.0; });
 
 const char *one_test_name = "BM_Complexity_O1";
 const char *big_o_1_test_name = "BM_Complexity_O1_BigO";
@@ -78,15 +89,15 @@ const char *lambda_big_o_1 = "f\\(N\\)";
 
 // Add enum tests
 ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
-                     enum_big_o_1);
+                     enum_big_o_1, /*family_index=*/0);
 
 // Add auto enum tests
 ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
-                     auto_big_o_1);
+                     auto_big_o_1, /*family_index=*/1);
 
 // Add lambda tests
 ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
-                     lambda_big_o_1);
+                     lambda_big_o_1, /*family_index=*/2);
 
 // ========================================================================= //
 // --------------------------- Testing BigO O(N) --------------------------- //
@@ -117,7 +128,9 @@ BENCHMARK(BM_Complexity_O_N)
 BENCHMARK(BM_Complexity_O_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
-    ->Complexity([](int64_t n) -> double { return static_cast<double>(n); });
+    ->Complexity([](benchmark::IterationCount n) -> double {
+      return static_cast<double>(n);
+    });
 BENCHMARK(BM_Complexity_O_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
@@ -131,11 +144,11 @@ const char *lambda_big_o_n = "f\\(N\\)";
 
 // Add enum tests
 ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
-                     enum_auto_big_o_n);
+                     enum_auto_big_o_n, /*family_index=*/3);
 
 // Add lambda tests
 ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
-                     lambda_big_o_n);
+                     lambda_big_o_n, /*family_index=*/4);
 
 // ========================================================================= //
 // ------------------------- Testing BigO O(N*lgN) ------------------------- //
@@ -156,7 +169,9 @@ BENCHMARK(BM_Complexity_O_N_log_N)
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
-    ->Complexity([](int64_t n) { return kLog2E * n * log(static_cast<double>(n)); });
+    ->Complexity([](benchmark::IterationCount n) {
+      return kLog2E * n * log(static_cast<double>(n));
+    });
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
@@ -170,11 +185,35 @@ const char *lambda_big_o_n_lg_n = "f\\(N\\)";
 
 // Add enum tests
 ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
-                     rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n);
+                     rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n,
+                     /*family_index=*/6);
 
 // Add lambda tests
 ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
-                     rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n);
+                     rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n,
+                     /*family_index=*/7);
+
+// ========================================================================= //
+// -------- Testing formatting of Complexity with captured args ------------ //
+// ========================================================================= //
+
+void BM_ComplexityCaptureArgs(benchmark::State& state, int n) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  state.SetComplexityN(n);
+}
+
+BENCHMARK_CAPTURE(BM_ComplexityCaptureArgs, capture_test, 100)
+    ->Complexity(benchmark::oN)
+    ->Ranges({{1, 2}, {3, 4}});
+
+const std::string complexity_capture_name =
+    "BM_ComplexityCaptureArgs/capture_test";
+
+ADD_COMPLEXITY_CASES(complexity_capture_name, complexity_capture_name + "_BigO",
+                     complexity_capture_name + "_RMS", "N", /*family_index=*/9);
 
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //

diff --git a/libcxx/utils/google-benchmark/test/cxx03_test.cc b/libcxx/utils/google-benchmark/test/cxx03_test.cc
index baa9ed9262baa..c4c9a52273e3a 100644
--- a/libcxx/utils/google-benchmark/test/cxx03_test.cc
+++ b/libcxx/utils/google-benchmark/test/cxx03_test.cc
@@ -14,7 +14,7 @@
 
 void BM_empty(benchmark::State& state) {
   while (state.KeepRunning()) {
-    volatile std::size_t x = state.iterations();
+    volatile benchmark::IterationCount x = state.iterations();
     ((void)x);
   }
 }

diff --git a/libcxx/utils/google-benchmark/test/filter_test.cc b/libcxx/utils/google-benchmark/test/filter_test.cc
index 0e27065c1558e..1c198913b36a6 100644
--- a/libcxx/utils/google-benchmark/test/filter_test.cc
+++ b/libcxx/utils/google-benchmark/test/filter_test.cc
@@ -1,36 +1,41 @@
-#include "benchmark/benchmark.h"
-
+#include <algorithm>
 #include <cassert>
 #include <cmath>
 #include <cstdint>
 #include <cstdlib>
-
 #include <iostream>
 #include <limits>
 #include <sstream>
 #include <string>
 
+#include "benchmark/benchmark.h"
+
 namespace {
 
 class TestReporter : public benchmark::ConsoleReporter {
  public:
-  virtual bool ReportContext(const Context& context) {
+  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
     return ConsoleReporter::ReportContext(context);
   };
 
-  virtual void ReportRuns(const std::vector<Run>& report) {
+  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
     ++count_;
+    max_family_index_ =
+        std::max<size_t>(max_family_index_, report[0].family_index);
     ConsoleReporter::ReportRuns(report);
   };
 
-  TestReporter() : count_(0) {}
+  TestReporter() : count_(0), max_family_index_(0) {}
 
   virtual ~TestReporter() {}
 
   size_t GetCount() const { return count_; }
 
+  size_t GetMaxFamilyIndex() const { return max_family_index_; }
+
  private:
   mutable size_t count_;
+  mutable size_t max_family_index_;
 };
 
 }  // end namespace
@@ -98,6 +103,15 @@ int main(int argc, char **argv) {
                 << std::endl;
       return -1;
     }
+
+    const size_t max_family_index = test_reporter.GetMaxFamilyIndex();
+    const size_t num_families = reports_count == 0 ? 0 : 1 + max_family_index;
+    if (num_families != expected_reports) {
+      std::cerr << "ERROR: Expected " << expected_reports
+                << " test families to be run but num_families = "
+                << num_families << std::endl;
+      return -1;
+    }
   }
 
   return 0;

diff --git a/libcxx/utils/google-benchmark/test/fixture_test.cc b/libcxx/utils/google-benchmark/test/fixture_test.cc
index 1462b10f02f96..eba0a42d9cb04 100644
--- a/libcxx/utils/google-benchmark/test/fixture_test.cc
+++ b/libcxx/utils/google-benchmark/test/fixture_test.cc
@@ -4,35 +4,37 @@
 #include <cassert>
 #include <memory>
 
-class MyFixture : public ::benchmark::Fixture {
+#define FIXTURE_BENCHMARK_NAME MyFixture
+
+class FIXTURE_BENCHMARK_NAME : public ::benchmark::Fixture {
  public:
-  void SetUp(const ::benchmark::State& state) {
+  void SetUp(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
     if (state.thread_index == 0) {
       assert(data.get() == nullptr);
       data.reset(new int(42));
     }
   }
 
-  void TearDown(const ::benchmark::State& state) {
+  void TearDown(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
     if (state.thread_index == 0) {
       assert(data.get() != nullptr);
       data.reset();
     }
   }
 
-  ~MyFixture() { assert(data == nullptr); }
+  ~FIXTURE_BENCHMARK_NAME() { assert(data == nullptr); }
 
   std::unique_ptr<int> data;
 };
 
-BENCHMARK_F(MyFixture, Foo)(benchmark::State &st) {
+BENCHMARK_F(FIXTURE_BENCHMARK_NAME, Foo)(benchmark::State &st) {
   assert(data.get() != nullptr);
   assert(*data == 42);
   for (auto _ : st) {
   }
 }
 
-BENCHMARK_DEFINE_F(MyFixture, Bar)(benchmark::State& st) {
+BENCHMARK_DEFINE_F(FIXTURE_BENCHMARK_NAME, Bar)(benchmark::State& st) {
   if (st.thread_index == 0) {
     assert(data.get() != nullptr);
     assert(*data == 42);
@@ -43,7 +45,7 @@ BENCHMARK_DEFINE_F(MyFixture, Bar)(benchmark::State& st) {
   }
   st.SetItemsProcessed(st.range(0));
 }
-BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42);
-BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42)->ThreadPerCpu();
+BENCHMARK_REGISTER_F(FIXTURE_BENCHMARK_NAME, Bar)->Arg(42);
+BENCHMARK_REGISTER_F(FIXTURE_BENCHMARK_NAME, Bar)->Arg(42)->ThreadPerCpu();
 
 BENCHMARK_MAIN();

diff --git a/libcxx/utils/google-benchmark/test/internal_threading_test.cc b/libcxx/utils/google-benchmark/test/internal_threading_test.cc
new file mode 100644
index 0000000000000..039d7c14a8c48
--- /dev/null
+++ b/libcxx/utils/google-benchmark/test/internal_threading_test.cc
@@ -0,0 +1,184 @@
+
+#undef NDEBUG
+
+#include <chrono>
+#include <thread>
+#include "../src/timers.h"
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+static const std::chrono::duration<double, std::milli> time_frame(50);
+static const double time_frame_in_sec(
+    std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(
+        time_frame)
+        .count());
+
+void MyBusySpinwait() {
+  const auto start = benchmark::ChronoClockNow();
+
+  while (true) {
+    const auto now = benchmark::ChronoClockNow();
+    const auto elapsed = now - start;
+
+    if (std::chrono::duration<double, std::chrono::seconds::period>(elapsed) >=
+        time_frame)
+      return;
+  }
+}
+
+// ========================================================================= //
+// --------------------------- TEST CASES BEGIN ---------------------------- //
+// ========================================================================= //
+
+// ========================================================================= //
+// BM_MainThread
+
+void BM_MainThread(benchmark::State& state) {
+  for (auto _ : state) {
+    MyBusySpinwait();
+    state.SetIterationTime(time_frame_in_sec);
+  }
+  state.counters["invtime"] =
+      benchmark::Counter{1, benchmark::Counter::kIsRate};
+}
+
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1);
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->UseRealTime();
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->UseManualTime();
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->MeasureProcessCPUTime();
+BENCHMARK(BM_MainThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_MainThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2);
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->UseRealTime();
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->UseManualTime();
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->MeasureProcessCPUTime();
+BENCHMARK(BM_MainThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_MainThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+// ========================================================================= //
+// BM_WorkerThread
+
+void BM_WorkerThread(benchmark::State& state) {
+  for (auto _ : state) {
+    std::thread Worker(&MyBusySpinwait);
+    Worker.join();
+    state.SetIterationTime(time_frame_in_sec);
+  }
+  state.counters["invtime"] =
+      benchmark::Counter{1, benchmark::Counter::kIsRate};
+}
+
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1);
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->UseRealTime();
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->UseManualTime();
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->MeasureProcessCPUTime();
+BENCHMARK(BM_WorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_WorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2);
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->UseRealTime();
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->UseManualTime();
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->MeasureProcessCPUTime();
+BENCHMARK(BM_WorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_WorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+// ========================================================================= //
+// BM_MainThreadAndWorkerThread
+
+void BM_MainThreadAndWorkerThread(benchmark::State& state) {
+  for (auto _ : state) {
+    std::thread Worker(&MyBusySpinwait);
+    MyBusySpinwait();
+    Worker.join();
+    state.SetIterationTime(time_frame_in_sec);
+  }
+  state.counters["invtime"] =
+      benchmark::Counter{1, benchmark::Counter::kIsRate};
+}
+
+BENCHMARK(BM_MainThreadAndWorkerThread)->Iterations(1)->Threads(1);
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->UseRealTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->UseManualTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+BENCHMARK(BM_MainThreadAndWorkerThread)->Iterations(1)->Threads(2);
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->UseRealTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->UseManualTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+// ========================================================================= //
+// ---------------------------- TEST CASES END ----------------------------- //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
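
This new test pins down how the timing modes interact (real time, process CPU
time, manual time) when the work runs on the main thread, a spawned worker
thread, or both. The manual-time contract it relies on: with UseManualTime(),
the reported per-iteration time is exactly the value, in seconds, that the
benchmark passes to SetIterationTime(). A minimal sketch (illustrative, not
part of this diff):

    #include <chrono>
    #include "benchmark/benchmark.h"

    static void BM_ManualTimed(benchmark::State& state) {
      for (auto _ : state) {
        auto start = std::chrono::high_resolution_clock::now();
        // ... timed work would go here ...
        auto end = std::chrono::high_resolution_clock::now();
        state.SetIterationTime(
            std::chrono::duration<double>(end - start).count());
      }
    }
    BENCHMARK(BM_ManualTimed)->UseManualTime();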

diff --git a/libcxx/utils/google-benchmark/test/map_test.cc b/libcxx/utils/google-benchmark/test/map_test.cc
index dbf7982a3686e..86391b36016fd 100644
--- a/libcxx/utils/google-benchmark/test/map_test.cc
+++ b/libcxx/utils/google-benchmark/test/map_test.cc
@@ -34,11 +34,11 @@ BENCHMARK(BM_MapLookup)->Range(1 << 3, 1 << 12);
 // Using fixtures.
 class MapFixture : public ::benchmark::Fixture {
  public:
-  void SetUp(const ::benchmark::State& st) {
+  void SetUp(const ::benchmark::State& st) BENCHMARK_OVERRIDE {
     m = ConstructRandomMap(static_cast<int>(st.range(0)));
   }
 
-  void TearDown(const ::benchmark::State&) { m.clear(); }
+  void TearDown(const ::benchmark::State&) BENCHMARK_OVERRIDE { m.clear(); }
 
   std::map<int, int> m;
 };

diff --git a/libcxx/utils/google-benchmark/test/memory_manager_test.cc b/libcxx/utils/google-benchmark/test/memory_manager_test.cc
index 94be6083795e6..f0c192fcbd00d 100644
--- a/libcxx/utils/google-benchmark/test/memory_manager_test.cc
+++ b/libcxx/utils/google-benchmark/test/memory_manager_test.cc
@@ -5,8 +5,8 @@
 #include "output_test.h"
 
 class TestMemoryManager : public benchmark::MemoryManager {
-  void Start() {}
-  void Stop(Result* result) {
+  void Start() BENCHMARK_OVERRIDE {}
+  void Stop(Result* result) BENCHMARK_OVERRIDE {
     result->num_allocs = 42;
     result->max_bytes_used = 42000;
   }
@@ -21,8 +21,13 @@ BENCHMARK(BM_empty);
 
 ADD_CASES(TC_ConsoleOut, {{"^BM_empty %console_report$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_empty\",$"},
+                       {"\"family_index\": 0,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_empty\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -32,8 +37,7 @@ ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_empty\",$"},
                        {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_empty\",%csv_report$"}});
 
-
-int main(int argc, char *argv[]) {
+int main(int argc, char* argv[]) {
   std::unique_ptr<benchmark::MemoryManager> mm(new TestMemoryManager());
 
   benchmark::RegisterMemoryManager(mm.get());
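
The MemoryManager hooks updated here (Start() before a benchmark run,
Stop(Result*) after it) are the v1.5.5 interface; a real manager would fill
Result from its own allocation tracking. A hedged sketch of the wiring, with
a hypothetical counter that nothing actually updates (real malloc
interception is out of scope):

    #include <cstdint>
    #include "benchmark/benchmark.h"

    class CountingMemoryManager : public benchmark::MemoryManager {
     public:
      void Start() BENCHMARK_OVERRIDE { allocs_ = 0; }
      void Stop(Result* result) BENCHMARK_OVERRIDE {
        result->num_allocs = allocs_;
        result->max_bytes_used = peak_bytes_;
      }
      int64_t allocs_ = 0;      // would be updated by an allocator hook
      int64_t peak_bytes_ = 0;  // likewise
    };

    int main(int argc, char** argv) {
      static CountingMemoryManager mm;  // must outlive the benchmark run
      benchmark::RegisterMemoryManager(&mm);
      benchmark::Initialize(&argc, argv);
      benchmark::RunSpecifiedBenchmarks();
      benchmark::RegisterMemoryManager(nullptr);  // detach before exit
    }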

diff --git a/libcxx/utils/google-benchmark/test/multiple_ranges_test.cc b/libcxx/utils/google-benchmark/test/multiple_ranges_test.cc
index c64acabc25c98..6b61f3af47bb9 100644
--- a/libcxx/utils/google-benchmark/test/multiple_ranges_test.cc
+++ b/libcxx/utils/google-benchmark/test/multiple_ranges_test.cc
@@ -28,7 +28,7 @@ class MultipleRangesFixture : public ::benchmark::Fixture {
                         {2, 7, 15},
                         {7, 6, 3}}) {}
 
-  void SetUp(const ::benchmark::State& state) {
+  void SetUp(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
     std::vector<int64_t> ranges = {state.range(0), state.range(1),
                                    state.range(2)};
 
@@ -40,8 +40,7 @@ class MultipleRangesFixture : public ::benchmark::Fixture {
   // NOTE: This is not TearDown as we want to check after _all_ runs are
   // complete.
   virtual ~MultipleRangesFixture() {
-    assert(actualValues.size() == expectedValues.size());
-    if (actualValues.size() != expectedValues.size()) {
+    if (actualValues != expectedValues) {
       std::cout << "EXPECTED\n";
       for (auto v : expectedValues) {
         std::cout << "{";

diff --git a/libcxx/utils/google-benchmark/test/options_test.cc b/libcxx/utils/google-benchmark/test/options_test.cc
index fdec69174eec0..9f9a78667c9ee 100644
--- a/libcxx/utils/google-benchmark/test/options_test.cc
+++ b/libcxx/utils/google-benchmark/test/options_test.cc
@@ -25,6 +25,7 @@ BENCHMARK(BM_basic)->Arg(42);
 BENCHMARK(BM_basic_slow)->Arg(10)->Unit(benchmark::kNanosecond);
 BENCHMARK(BM_basic_slow)->Arg(100)->Unit(benchmark::kMicrosecond);
 BENCHMARK(BM_basic_slow)->Arg(1000)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_basic_slow)->Arg(1000)->Unit(benchmark::kSecond);
 BENCHMARK(BM_basic)->Range(1, 8);
 BENCHMARK(BM_basic)->RangeMultiplier(2)->Range(1, 8);
 BENCHMARK(BM_basic)->DenseRange(10, 15);
@@ -35,6 +36,16 @@ BENCHMARK(BM_basic)->UseRealTime();
 BENCHMARK(BM_basic)->ThreadRange(2, 4);
 BENCHMARK(BM_basic)->ThreadPerCpu();
 BENCHMARK(BM_basic)->Repetitions(3);
+BENCHMARK(BM_basic)
+    ->RangeMultiplier(std::numeric_limits<int>::max())
+    ->Range(std::numeric_limits<int64_t>::min(),
+            std::numeric_limits<int64_t>::max());
+
+// Negative ranges
+BENCHMARK(BM_basic)->Range(-64, -1);
+BENCHMARK(BM_basic)->RangeMultiplier(4)->Range(-8, 8);
+BENCHMARK(BM_basic)->DenseRange(-2, 2, 1);
+BENCHMARK(BM_basic)->Ranges({{-64, 1}, {-8, -1}});
 
 void CustomArgs(benchmark::internal::Benchmark* b) {
   for (int i = 0; i < 10; ++i) {

diff --git a/libcxx/utils/google-benchmark/test/output_test.h b/libcxx/utils/google-benchmark/test/output_test.h
index 9385761b214c7..15368f9b68309 100644
--- a/libcxx/utils/google-benchmark/test/output_test.h
+++ b/libcxx/utils/google-benchmark/test/output_test.h
@@ -158,7 +158,7 @@ T Results::GetAs(const char* entry_name) const {
 
 // clang-format off
 
-#define _CHECK_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value) \
+#define CHECK_RESULT_VALUE_IMPL(entry, getfn, var_type, var_name, relationship, value) \
     CONCAT(CHECK_, relationship)                                        \
     (entry.getfn< var_type >(var_name), (value)) << "\n"                \
     << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n"     \
@@ -169,7 +169,7 @@ T Results::GetAs(const char* entry_name) const {
 
 // check with tolerance. eps_factor is the tolerance window, which is
 // interpreted relative to value (eg, 0.1 means 10% of value).
-#define _CHECK_FLOAT_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value, eps_factor) \
+#define CHECK_FLOAT_RESULT_VALUE_IMPL(entry, getfn, var_type, var_name, relationship, value, eps_factor) \
     CONCAT(CHECK_FLOAT_, relationship)                                  \
     (entry.getfn< var_type >(var_name), (value), (eps_factor) * (value)) << "\n" \
     << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n"     \
@@ -187,16 +187,16 @@ T Results::GetAs(const char* entry_name) const {
     << "%)"
 
 #define CHECK_RESULT_VALUE(entry, var_type, var_name, relationship, value) \
-    _CHECK_RESULT_VALUE(entry, GetAs, var_type, var_name, relationship, value)
+    CHECK_RESULT_VALUE_IMPL(entry, GetAs, var_type, var_name, relationship, value)
 
 #define CHECK_COUNTER_VALUE(entry, var_type, var_name, relationship, value) \
-    _CHECK_RESULT_VALUE(entry, GetCounterAs, var_type, var_name, relationship, value)
+    CHECK_RESULT_VALUE_IMPL(entry, GetCounterAs, var_type, var_name, relationship, value)
 
 #define CHECK_FLOAT_RESULT_VALUE(entry, var_name, relationship, value, eps_factor) \
-    _CHECK_FLOAT_RESULT_VALUE(entry, GetAs, double, var_name, relationship, value, eps_factor)
+    CHECK_FLOAT_RESULT_VALUE_IMPL(entry, GetAs, double, var_name, relationship, value, eps_factor)
 
 #define CHECK_FLOAT_COUNTER_VALUE(entry, var_name, relationship, value, eps_factor) \
-    _CHECK_FLOAT_RESULT_VALUE(entry, GetCounterAs, double, var_name, relationship, value, eps_factor)
+    CHECK_FLOAT_RESULT_VALUE_IMPL(entry, GetCounterAs, double, var_name, relationship, value, eps_factor)
 
 // clang-format on
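
The renames in this hunk are about reserved identifiers, not behavior: in
C++, an identifier that begins with an underscore followed by an uppercase
letter (such as _CHECK_RESULT_VALUE) is reserved to the implementation, so
defining it as a macro is formally undefined behavior. Moving the underscore
out of the leading position sidesteps that:

    #define _CHECK_VALUE(x)     (x)  // reserved: leading '_' + uppercase letter
    #define CHECK_VALUE_IMPL(x) (x)  // fine: no reserved pattern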
 

diff --git a/libcxx/utils/google-benchmark/test/output_test_helper.cc b/libcxx/utils/google-benchmark/test/output_test_helper.cc
index 5dc951d2bca87..b8ef1205744ac 100644
--- a/libcxx/utils/google-benchmark/test/output_test_helper.cc
+++ b/libcxx/utils/google-benchmark/test/output_test_helper.cc
@@ -48,6 +48,9 @@ SubMap& GetSubstitutions() {
       {" %s ", "[ ]+"},
       {"%time", "[ ]*" + time_re + "[ ]+ns"},
       {"%console_report", "[ ]*" + time_re + "[ ]+ns [ ]*" + time_re + "[ ]+ns [ ]*[0-9]+"},
+      {"%console_us_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us [ ]*[0-9]+"},
+      {"%console_ms_report", "[ ]*" + time_re + "[ ]+ms [ ]*" + time_re + "[ ]+ms [ ]*[0-9]+"},
+      {"%console_s_report", "[ ]*" + time_re + "[ ]+s [ ]*" + time_re + "[ ]+s [ ]*[0-9]+"},
       {"%console_time_only_report", "[ ]*" + time_re + "[ ]+ns [ ]*" + time_re + "[ ]+ns"},
       {"%console_us_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us [ ]*[0-9]+"},
       {"%console_us_time_only_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us"},
@@ -56,6 +59,8 @@ SubMap& GetSubstitutions() {
        "items_per_second,label,error_occurred,error_message"},
       {"%csv_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,,,"},
       {"%csv_us_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",us,,,,,"},
+      {"%csv_ms_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ms,,,,,"},
+      {"%csv_s_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",s,,,,,"},
       {"%csv_bytes_report",
        "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re + ",,,,"},
       {"%csv_items_report",
@@ -134,7 +139,7 @@ class TestReporter : public benchmark::BenchmarkReporter {
   TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
       : reporters_(reps) {}
 
-  virtual bool ReportContext(const Context& context) {
+  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
     bool last_ret = false;
     bool first = true;
     for (auto rep : reporters_) {
@@ -148,10 +153,10 @@ class TestReporter : public benchmark::BenchmarkReporter {
     return last_ret;
   }
 
-  void ReportRuns(const std::vector<Run>& report) {
+  void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
     for (auto rep : reporters_) rep->ReportRuns(report);
   }
-  void Finalize() {
+  void Finalize() BENCHMARK_OVERRIDE {
     for (auto rep : reporters_) rep->Finalize();
   }
 
@@ -373,6 +378,12 @@ int SetSubstitutions(
   return 0;
 }
 
+// Disable deprecated warnings temporarily because we need to reference
+// CSVReporter but don't want to trigger -Werror=-Wdeprecated-declarations
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
 void RunOutputTests(int argc, char* argv[]) {
   using internal::GetTestCaseList;
   benchmark::Initialize(&argc, argv);
@@ -431,6 +442,10 @@ void RunOutputTests(int argc, char* argv[]) {
   internal::GetResultsChecker().CheckResults(csv.out_stream);
 }
 
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
 int SubstrCnt(const std::string& haystack, const std::string& pat) {
   if (pat.length() == 0) return 0;
   int count = 0;
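
The pragma bracket added above is the usual push/ignore/pop idiom: it narrows
the suppression of -Wdeprecated-declarations to just the code that must still
mention the deprecated CSVReporter, then restores the prior warning state.
Both GCC and Clang define __GNUC__ and honor these pragmas. In isolation
(illustrative only):

    #ifdef __GNUC__
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
    #endif
    // ... code referencing a [[deprecated]] entity ...
    #ifdef __GNUC__
    #pragma GCC diagnostic pop
    #endif

Note that the pop lands after the closing brace of RunOutputTests, so the
entire function definition sits inside the suppressed region.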

diff --git a/libcxx/utils/google-benchmark/test/perf_counters_gtest.cc b/libcxx/utils/google-benchmark/test/perf_counters_gtest.cc
new file mode 100644
index 0000000000000..2a2868a715362
--- /dev/null
+++ b/libcxx/utils/google-benchmark/test/perf_counters_gtest.cc
@@ -0,0 +1,145 @@
+#include <thread>
+
+#include "../src/perf_counters.h"
+#include "gtest/gtest.h"
+
+#ifndef GTEST_SKIP
+struct MsgHandler {
+  void operator=(std::ostream&){}
+};
+#define GTEST_SKIP() return MsgHandler() = std::cout
+#endif
+
+using benchmark::internal::PerfCounters;
+using benchmark::internal::PerfCounterValues;
+
+namespace {
+const char kGenericPerfEvent1[] = "CYCLES";
+const char kGenericPerfEvent2[] = "BRANCHES";
+const char kGenericPerfEvent3[] = "INSTRUCTIONS";
+
+TEST(PerfCountersTest, Init) {
+  EXPECT_EQ(PerfCounters::Initialize(), PerfCounters::kSupported);
+}
+
+TEST(PerfCountersTest, OneCounter) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Performance counters not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1}).IsValid());
+}
+
+TEST(PerfCountersTest, NegativeTest) {
+  if (!PerfCounters::kSupported) {
+    EXPECT_FALSE(PerfCounters::Initialize());
+    return;
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  EXPECT_FALSE(PerfCounters::Create({}).IsValid());
+  EXPECT_FALSE(PerfCounters::Create({""}).IsValid());
+  EXPECT_FALSE(PerfCounters::Create({"not a counter name"}).IsValid());
+  {
+    EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
+                                      kGenericPerfEvent3})
+                    .IsValid());
+  }
+  EXPECT_FALSE(
+      PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1})
+          .IsValid());
+  EXPECT_FALSE(PerfCounters::Create({kGenericPerfEvent3, "not a counter name",
+                                     kGenericPerfEvent1})
+                   .IsValid());
+  {
+    EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
+                                      kGenericPerfEvent3})
+                    .IsValid());
+  }
+  EXPECT_FALSE(
+      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
+                            kGenericPerfEvent3, "MISPREDICTED_BRANCH_RETIRED"})
+          .IsValid());
+}
+
+TEST(PerfCountersTest, Read1Counter) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  auto counters = PerfCounters::Create({kGenericPerfEvent1});
+  EXPECT_TRUE(counters.IsValid());
+  PerfCounterValues values1(1);
+  EXPECT_TRUE(counters.Snapshot(&values1));
+  EXPECT_GT(values1[0], 0);
+  PerfCounterValues values2(1);
+  EXPECT_TRUE(counters.Snapshot(&values2));
+  EXPECT_GT(values2[0], 0);
+  EXPECT_GT(values2[0], values1[0]);
+}
+
+TEST(PerfCountersTest, Read2Counters) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  auto counters =
+      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
+  EXPECT_TRUE(counters.IsValid());
+  PerfCounterValues values1(2);
+  EXPECT_TRUE(counters.Snapshot(&values1));
+  EXPECT_GT(values1[0], 0);
+  EXPECT_GT(values1[1], 0);
+  PerfCounterValues values2(2);
+  EXPECT_TRUE(counters.Snapshot(&values2));
+  EXPECT_GT(values2[0], 0);
+  EXPECT_GT(values2[1], 0);
+}
+
+size_t do_work() {
+  size_t res = 0;
+  for (size_t i = 0; i < 100000000; ++i) res += i * i;
+  return res;
+}
+
+void measure(size_t threadcount, PerfCounterValues* values1,
+             PerfCounterValues* values2) {
+  CHECK_NE(values1, nullptr);
+  CHECK_NE(values2, nullptr);
+  std::vector<std::thread> threads(threadcount);
+  auto work = [&]() { CHECK(do_work() > 1000); };
+
+  // We need to first set up the counters, then start the threads, so the
+  // threads would inherit the counters. But later, we need to first destroy the
+  // thread pool (so all the work finishes), then measure the counters. So the
+  // scopes overlap, and we need to explicitly control the scope of the
+  // threadpool.
+  auto counters =
+      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent3});
+  for (auto& t : threads) t = std::thread(work);
+  counters.Snapshot(values1);
+  for (auto& t : threads) t.join();
+  counters.Snapshot(values2);
+}
+
+TEST(PerfCountersTest, MultiThreaded) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  PerfCounterValues values1(2);
+  PerfCounterValues values2(2);
+
+  measure(2, &values1, &values2);
+  std::vector<double> D1{static_cast<double>(values2[0] - values1[0]),
+                         static_cast<double>(values2[1] - values1[1])};
+
+  measure(4, &values1, &values2);
+  std::vector<double> D2{static_cast<double>(values2[0] - values1[0]),
+                         static_cast<double>(values2[1] - values1[1])};
+
+  // Some extra work will happen on the main thread - like joining the threads
+  // - so the ratio won't be quite 2.0, but very close.
+  EXPECT_GE(D2[0], 1.9 * D1[0]);
+  EXPECT_GE(D2[1], 1.9 * D1[1]);
+}
+}  // namespace
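
These gtests exercise the new libpfm-based perf-counter support added by this
update (src/perf_counters.{h,cc}); they skip themselves when the library was
built without it. The internal API they use, reduced to a minimal sketch
(internal and subject to change; assumes a build with libpfm support):

    #include "../src/perf_counters.h"  // internal header, as in the test

    using benchmark::internal::PerfCounters;
    using benchmark::internal::PerfCounterValues;

    void SnapshotDelta() {
      if (!PerfCounters::Initialize()) return;  // no libpfm support
      auto counters = PerfCounters::Create({"CYCLES", "INSTRUCTIONS"});
      if (!counters.IsValid()) return;
      PerfCounterValues before(2), after(2);
      counters.Snapshot(&before);
      // ... region of interest ...
      counters.Snapshot(&after);
      // after[i] - before[i] is the delta for the i-th requested event.
    }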

diff --git a/libcxx/utils/google-benchmark/test/perf_counters_test.cc b/libcxx/utils/google-benchmark/test/perf_counters_test.cc
new file mode 100644
index 0000000000000..d6e0284d4d4b5
--- /dev/null
+++ b/libcxx/utils/google-benchmark/test/perf_counters_test.cc
@@ -0,0 +1,27 @@
+#undef NDEBUG
+
+#include "../src/perf_counters.h"
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+void BM_Simple(benchmark::State& state) {
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+}
+BENCHMARK(BM_Simple);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Simple\",$"}});
+
+void CheckSimple(Results const& e) {
+  CHECK_COUNTER_VALUE(e, double, "CYCLES", GT, 0);
+  CHECK_COUNTER_VALUE(e, double, "BRANCHES", GT, 0.0);
+}
+CHECK_BENCHMARK_RESULTS("BM_Simple", &CheckSimple);
+
+int main(int argc, char* argv[]) {
+  if (!benchmark::internal::PerfCounters::kSupported) {
+    return 0;
+  }
+  RunOutputTests(argc, argv);
+}

diff --git a/libcxx/utils/google-benchmark/test/register_benchmark_test.cc b/libcxx/utils/google-benchmark/test/register_benchmark_test.cc
index 3ac5b21fb348b..c027eabacae07 100644
--- a/libcxx/utils/google-benchmark/test/register_benchmark_test.cc
+++ b/libcxx/utils/google-benchmark/test/register_benchmark_test.cc
@@ -10,7 +10,7 @@ namespace {
 
 class TestReporter : public benchmark::ConsoleReporter {
  public:
-  virtual void ReportRuns(const std::vector<Run>& report) {
+  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
     all_runs_.insert(all_runs_.end(), begin(report), end(report));
     ConsoleReporter::ReportRuns(report);
   }

diff --git a/libcxx/utils/google-benchmark/test/repetitions_test.cc b/libcxx/utils/google-benchmark/test/repetitions_test.cc
new file mode 100644
index 0000000000000..f93de502a35ab
--- /dev/null
+++ b/libcxx/utils/google-benchmark/test/repetitions_test.cc
@@ -0,0 +1,208 @@
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// ========================================================================= //
+// ------------------------ Testing Basic Output --------------------------- //
+// ========================================================================= //
+
+void BM_ExplicitRepetitions(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_ExplicitRepetitions)->Repetitions(2);
+
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_ExplicitRepetitions/repeats:2 %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_ExplicitRepetitions/repeats:2 %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_ExplicitRepetitions/repeats:2_mean %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_ExplicitRepetitions/repeats:2_median %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_ExplicitRepetitions/repeats:2_stddev %console_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ExplicitRepetitions/repeats:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ExplicitRepetitions/repeats:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 1,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ExplicitRepetitions/repeats:2_mean\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ExplicitRepetitions/repeats:2_median\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ExplicitRepetitions/repeats:2_stddev\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ExplicitRepetitions/repeats:2\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ExplicitRepetitions/repeats:2\",%csv_report$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_ExplicitRepetitions/repeats:2_mean\",%csv_report$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_ExplicitRepetitions/repeats:2_median\",%csv_report$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_ExplicitRepetitions/repeats:2_stddev\",%csv_report$"}});
+
+// ========================================================================= //
+// ------------------------ Testing Basic Output --------------------------- //
+// ========================================================================= //
+
+void BM_ImplicitRepetitions(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_ImplicitRepetitions);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions_mean %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions_median %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions_stddev %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 1,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions_mean\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions_median\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions_stddev\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions_mean\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions_median\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions_stddev\",%csv_report$"}});
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
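
This new test locks down the per-run JSON fields the reporters now emit
(family_index, per_family_instance_index, repetitions, repetition_index,
threads), for both an explicitly requested repetition count and the
flag-driven case. For reference (illustrative; the implicit case above
presumably runs the test binary with --benchmark_repetitions=3):

    static void BM_Work(benchmark::State& state) {
      for (auto _ : state) benchmark::DoNotOptimize(state.iterations());
    }
    // Two "iteration" runs, plus mean/median/stddev "aggregate" runs:
    BENCHMARK(BM_Work)->Repetitions(2);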

diff --git a/libcxx/utils/google-benchmark/test/reporter_output_test.cc b/libcxx/utils/google-benchmark/test/reporter_output_test.cc
index ec6d51b35917c..989eb48ecc81c 100644
--- a/libcxx/utils/google-benchmark/test/reporter_output_test.cc
+++ b/libcxx/utils/google-benchmark/test/reporter_output_test.cc
@@ -15,7 +15,7 @@ ADD_CASES(TC_ConsoleOut, {{"^[-]+$", MR_Next},
 static int AddContextCases() {
   AddCases(TC_ConsoleErr,
            {
-               {"%int[-/]%int[-/]%int %int:%int:%int$", MR_Default},
+               {"^%int-%int-%intT%int:%int:%int[-+]%int:%int$", MR_Default},
                {"Running .*/reporter_output_test(\\.exe)?$", MR_Next},
                {"Run on \\(%int X %float MHz CPU s?\\)", MR_Next},
            });
@@ -28,8 +28,7 @@ static int AddContextCases() {
              MR_Next},
             {"\"num_cpus\": %int,$", MR_Next},
             {"\"mhz_per_cpu\": %float,$", MR_Next},
-            {"\"cpu_scaling_enabled\": ", MR_Next},
-            {"\"caches\": \\[$", MR_Next}});
+            {"\"caches\": \\[$", MR_Default}});
   auto const& Info = benchmark::CPUInfo::Get();
   auto const& Caches = Info.caches;
   if (!Caches.empty()) {
@@ -38,9 +37,9 @@ static int AddContextCases() {
   for (size_t I = 0; I < Caches.size(); ++I) {
     std::string num_caches_str =
         Caches[I].num_sharing != 0 ? " \\(x%int\\)$" : "$";
-    AddCases(
-        TC_ConsoleErr,
-        {{"L%int (Data|Instruction|Unified) %intK" + num_caches_str, MR_Next}});
+    AddCases(TC_ConsoleErr,
+             {{"L%int (Data|Instruction|Unified) %int KiB" + num_caches_str,
+               MR_Next}});
     AddCases(TC_JSONOut, {{"\\{$", MR_Next},
                           {"\"type\": \"", MR_Next},
                           {"\"level\": %int,$", MR_Next},
@@ -72,8 +71,13 @@ BENCHMARK(BM_basic);
 
 ADD_CASES(TC_ConsoleOut, {{"^BM_basic %console_report$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_basic\",$"},
+                       {"\"family_index\": 0,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_basic\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -87,6 +91,8 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_basic\",%csv_report$"}});
 
 void BM_bytes_per_second(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   state.SetBytesProcessed(1);
 }
@@ -95,8 +101,13 @@ BENCHMARK(BM_bytes_per_second);
 ADD_CASES(TC_ConsoleOut, {{"^BM_bytes_per_second %console_report "
                            "bytes_per_second=%float[kM]{0,1}/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_bytes_per_second\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_bytes_per_second\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -111,6 +122,8 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_bytes_per_second\",%csv_bytes_report$"}});
 
 void BM_items_per_second(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   state.SetItemsProcessed(1);
 }
@@ -119,8 +132,13 @@ BENCHMARK(BM_items_per_second);
 ADD_CASES(TC_ConsoleOut, {{"^BM_items_per_second %console_report "
                            "items_per_second=%float[kM]{0,1}/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_items_per_second\",$"},
+                       {"\"family_index\": 2,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_items_per_second\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -142,8 +160,13 @@ BENCHMARK(BM_label);
 
 ADD_CASES(TC_ConsoleOut, {{"^BM_label %console_report some label$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_label\",$"},
+                       {"\"family_index\": 3,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_label\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -153,6 +176,101 @@ ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_label\",$"},
 ADD_CASES(TC_CSVOut, {{"^\"BM_label\",%csv_label_report_begin\"some "
                        "label\"%csv_label_report_end$"}});
 
+// ========================================================================= //
+// ------------------------ Testing Time Label Output ---------------------- //
+// ========================================================================= //
+
+void BM_time_label_nanosecond(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_time_label_nanosecond)->Unit(benchmark::kNanosecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_nanosecond %console_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_time_label_nanosecond\",$"},
+           {"\"family_index\": 4,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_time_label_nanosecond\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_nanosecond\",%csv_report$"}});
+
+void BM_time_label_microsecond(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_time_label_microsecond)->Unit(benchmark::kMicrosecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_microsecond %console_us_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_time_label_microsecond\",$"},
+           {"\"family_index\": 5,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_time_label_microsecond\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"us\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_microsecond\",%csv_us_report$"}});
+
+void BM_time_label_millisecond(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_time_label_millisecond)->Unit(benchmark::kMillisecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_millisecond %console_ms_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_time_label_millisecond\",$"},
+           {"\"family_index\": 6,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_time_label_millisecond\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ms\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_millisecond\",%csv_ms_report$"}});
+
+void BM_time_label_second(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_time_label_second)->Unit(benchmark::kSecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_second %console_s_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_time_label_second\",$"},
+                       {"\"family_index\": 7,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_time_label_second\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"s\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_second\",%csv_s_report$"}});
+
 // ========================================================================= //
 // ------------------------ Testing Error Output --------------------------- //
 // ========================================================================= //
@@ -165,8 +283,13 @@ void BM_error(benchmark::State& state) {
 BENCHMARK(BM_error);
 ADD_CASES(TC_ConsoleOut, {{"^BM_error[ ]+ERROR OCCURRED: 'message'$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_error\",$"},
+                       {"\"family_index\": 8,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_error\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"error_occurred\": true,$", MR_Next},
                        {"\"error_message\": \"message\",$", MR_Next}});
 
@@ -184,8 +307,13 @@ void BM_no_arg_name(benchmark::State& state) {
 BENCHMARK(BM_no_arg_name)->Arg(3);
 ADD_CASES(TC_ConsoleOut, {{"^BM_no_arg_name/3 %console_report$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_no_arg_name/3\",$"},
+                       {"\"family_index\": 9,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_no_arg_name/3\",$", MR_Next},
-                       {"\"run_type\": \"iteration\",$", MR_Next}});
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_no_arg_name/3\",%csv_report$"}});
 
 // ========================================================================= //
@@ -199,8 +327,13 @@ void BM_arg_name(benchmark::State& state) {
 BENCHMARK(BM_arg_name)->ArgName("first")->Arg(3);
 ADD_CASES(TC_ConsoleOut, {{"^BM_arg_name/first:3 %console_report$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_name/first:3\",$"},
+                       {"\"family_index\": 10,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_arg_name/first:3\",$", MR_Next},
-                       {"\"run_type\": \"iteration\",$", MR_Next}});
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_arg_name/first:3\",%csv_report$"}});
 
 // ========================================================================= //
@@ -216,10 +349,41 @@ ADD_CASES(TC_ConsoleOut,
           {{"^BM_arg_names/first:2/5/third:4 %console_report$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_arg_names/first:2/5/third:4\",$"},
+           {"\"family_index\": 11,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_arg_names/first:2/5/third:4\",$", MR_Next},
-           {"\"run_type\": \"iteration\",$", MR_Next}});
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_arg_names/first:2/5/third:4\",%csv_report$"}});
 
+// ========================================================================= //
+// ------------------------ Testing Name Output ---------------------------- //
+// ========================================================================= //
+
+void BM_name(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_name)->Name("BM_custom_name");
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_custom_name %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_custom_name\",$"},
+                       {"\"family_index\": 12,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_custom_name\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_custom_name\",%csv_report$"}});
+
 // ========================================================================= //
 // ------------------------ Testing Big Args Output ------------------------ //
 // ========================================================================= //
@@ -238,6 +402,8 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_BigArgs/1073741824 %console_report$"},
 
 void BM_Complexity_O1(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   state.SetComplexityN(state.range(0));
 }
@@ -265,24 +431,46 @@ ADD_CASES(TC_ConsoleOut,
            {"^BM_Repeat/repeats:2_median %console_time_only_report [ ]*2$"},
            {"^BM_Repeat/repeats:2_stddev %console_time_only_report [ ]*2$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:2\",$"},
+                       {"\"family_index\": 15,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:2\"", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 2,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2\",$"},
+                       {"\"family_index\": 15,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 2,$", MR_Next},
+                       {"\"repetition_index\": 1,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2_mean\",$"},
+                       {"\"family_index\": 15,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"mean\",$", MR_Next},
                        {"\"iterations\": 2,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2_median\",$"},
+                       {"\"family_index\": 15,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"median\",$", MR_Next},
                        {"\"iterations\": 2,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2_stddev\",$"},
+                       {"\"family_index\": 15,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"stddev\",$", MR_Next},
                        {"\"iterations\": 2,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:2\",%csv_report$"},
@@ -300,27 +488,54 @@ ADD_CASES(TC_ConsoleOut,
            {"^BM_Repeat/repeats:3_median %console_time_only_report [ ]*3$"},
            {"^BM_Repeat/repeats:3_stddev %console_time_only_report [ ]*3$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 1,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3_mean\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"mean\",$", MR_Next},
                        {"\"iterations\": 3,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3_median\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"median\",$", MR_Next},
                        {"\"iterations\": 3,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3_stddev\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"stddev\",$", MR_Next},
                        {"\"iterations\": 3,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"},
@@ -340,30 +555,62 @@ ADD_CASES(TC_ConsoleOut,
            {"^BM_Repeat/repeats:4_median %console_time_only_report [ ]*4$"},
            {"^BM_Repeat/repeats:4_stddev %console_time_only_report [ ]*4$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"repetition_index\": 1,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"repetition_index\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"repetition_index\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4_mean\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"mean\",$", MR_Next},
                        {"\"iterations\": 4,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4_median\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"median\",$", MR_Next},
                        {"\"iterations\": 4,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4_stddev\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"stddev\",$", MR_Next},
                        {"\"iterations\": 4,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:4\",%csv_report$"},
@@ -383,8 +630,13 @@ void BM_RepeatOnce(benchmark::State& state) {
 BENCHMARK(BM_RepeatOnce)->Repetitions(1)->ReportAggregatesOnly();
 ADD_CASES(TC_ConsoleOut, {{"^BM_RepeatOnce/repeats:1 %console_report$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_RepeatOnce/repeats:1\",$"},
+                       {"\"family_index\": 18,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_RepeatOnce/repeats:1\",$", MR_Next},
-                       {"\"run_type\": \"iteration\",$", MR_Next}});
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_RepeatOnce/repeats:1\",%csv_report$"}});
 
 // Test that non-aggregate data is not reported
@@ -402,18 +654,30 @@ ADD_CASES(
 ADD_CASES(TC_JSONOut,
           {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
            {"\"name\": \"BM_SummaryRepeat/repeats:3_mean\",$"},
+           {"\"family_index\": 19,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"mean\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next},
            {"\"name\": \"BM_SummaryRepeat/repeats:3_median\",$"},
+           {"\"family_index\": 19,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"median\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next},
            {"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"},
+           {"\"family_index\": 19,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"stddev\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
@@ -438,18 +702,30 @@ ADD_CASES(
 ADD_CASES(TC_JSONOut,
           {{".*BM_SummaryDisplay/repeats:2 ", MR_Not},
            {"\"name\": \"BM_SummaryDisplay/repeats:2_mean\",$"},
+           {"\"family_index\": 20,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"mean\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next},
            {"\"name\": \"BM_SummaryDisplay/repeats:2_median\",$"},
+           {"\"family_index\": 20,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"median\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next},
            {"\"name\": \"BM_SummaryDisplay/repeats:2_stddev\",$"},
+           {"\"family_index\": 20,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"stddev\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next}});
 ADD_CASES(TC_CSVOut,
@@ -478,20 +754,32 @@ ADD_CASES(
 ADD_CASES(TC_JSONOut,
           {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
            {"\"name\": \"BM_RepeatTimeUnit/repeats:3_mean\",$"},
+           {"\"family_index\": 21,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"mean\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next},
            {"\"time_unit\": \"us\",?$"},
            {"\"name\": \"BM_RepeatTimeUnit/repeats:3_median\",$"},
+           {"\"family_index\": 21,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"median\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next},
            {"\"time_unit\": \"us\",?$"},
            {"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"},
+           {"\"family_index\": 21,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"stddev\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next},
            {"\"time_unit\": \"us\",?$"}});
@@ -540,48 +828,79 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_UserStats/iterations:5/repeats:3/manual_time [ "
 ADD_CASES(
     TC_JSONOut,
     {{"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
      {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
       MR_Next},
      {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"repetition_index\": 0,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
      {"\"iterations\": 5,$", MR_Next},
      {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
      {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
      {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
       MR_Next},
      {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"repetition_index\": 1,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
      {"\"iterations\": 5,$", MR_Next},
      {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
      {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
      {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
       MR_Next},
      {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"repetition_index\": 2,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
      {"\"iterations\": 5,$", MR_Next},
      {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
      {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_mean\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
      {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
       MR_Next},
      {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
      {"\"aggregate_name\": \"mean\",$", MR_Next},
      {"\"iterations\": 3,$", MR_Next},
      {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
      {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_median\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
      {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
       MR_Next},
      {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
      {"\"aggregate_name\": \"median\",$", MR_Next},
      {"\"iterations\": 3,$", MR_Next},
      {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
      {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_stddev\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
      {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
       MR_Next},
      {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
      {"\"aggregate_name\": \"stddev\",$", MR_Next},
      {"\"iterations\": 3,$", MR_Next},
      {"\"real_time\": %float,$", MR_Next},
      {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
      {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
       MR_Next},
      {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
      {"\"aggregate_name\": \"\",$", MR_Next},
      {"\"iterations\": 3,$", MR_Next},
      {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}});
@@ -597,6 +916,39 @@ ADD_CASES(
       "manual_time_stddev\",%csv_report$"},
      {"^\"BM_UserStats/iterations:5/repeats:3/manual_time_\",%csv_report$"}});
 
+// ========================================================================= //
+// ------------------------- Testing StrEscape JSON ------------------------ //
+// ========================================================================= //
+#if 0  // enable when csv testing code correctly handles multi-line fields
+void BM_JSON_Format(benchmark::State& state) {
+  state.SkipWithError("val\b\f\n\r\t\\\"with\"es,capes");
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_JSON_Format);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_JSON_Format\",$"},
+                       {"\"family_index\": 23,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_JSON_Format\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"error_occurred\": true,$", MR_Next},
+                       {R"("error_message": "val\\b\\f\\n\\r\\t\\\\\\"with\\"es,capes",$)", MR_Next}});
+#endif
+// ========================================================================= //
+// -------------------------- Testing CsvEscape ---------------------------- //
+// ========================================================================= //
+
+void BM_CSV_Format(benchmark::State& state) {
+  state.SkipWithError("\"freedom\"");
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_CSV_Format);
+ADD_CASES(TC_CSVOut, {{"^\"BM_CSV_Format\",,,,,,,,true,\"\"\"freedom\"\"\"$"}});
+
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
 // ========================================================================= //

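The BM_CSV_Format expectation above depends on standard CSV quoting: the whole field is wrapped in double quotes and every embedded double quote is doubled, which turns the error message "freedom" into """freedom""". A minimal sketch of that escaping rule (CsvQuote is an illustrative name, not the library's internal helper):

    #include <string>

    // Quote a CSV field, doubling any embedded double quotes.
    std::string CsvQuote(const std::string& field) {
      std::string out = "\"";
      for (char c : field) {
        out += c;
        if (c == '"') out += '"';  // " becomes "" inside a quoted field
      }
      out += '"';
      return out;
    }

    // CsvQuote("\"freedom\"") yields "\"\"\"freedom\"\"\"", matching the
    // expected CSV row above.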
diff --git a/libcxx/utils/google-benchmark/test/skip_with_error_test.cc b/libcxx/utils/google-benchmark/test/skip_with_error_test.cc
index 06579772ff773..827966e9dfe37 100644
--- a/libcxx/utils/google-benchmark/test/skip_with_error_test.cc
+++ b/libcxx/utils/google-benchmark/test/skip_with_error_test.cc
@@ -10,11 +10,11 @@ namespace {
 
 class TestReporter : public benchmark::ConsoleReporter {
  public:
-  virtual bool ReportContext(const Context& context) {
+  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
     return ConsoleReporter::ReportContext(context);
   };
 
-  virtual void ReportRuns(const std::vector<Run>& report) {
+  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
     all_runs_.insert(all_runs_.end(), begin(report), end(report));
     ConsoleReporter::ReportRuns(report);
   }
@@ -61,6 +61,12 @@ int AddCases(const char* base_name, std::initializer_list<TestCase> const& v) {
 
 }  // end namespace
 
+void BM_error_no_running(benchmark::State& state) {
+  state.SkipWithError("error message");
+}
+BENCHMARK(BM_error_no_running);
+ADD_CASES("BM_error_no_running", {{"", true, "error message"}});
+
 void BM_error_before_running(benchmark::State& state) {
   state.SkipWithError("error message");
   while (state.KeepRunning()) {

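The added BM_error_no_running case covers calling SkipWithError() without ever entering the measurement loop. In user code the pattern looks roughly like this (the setup check is hypothetical; only SkipWithError and the loop come from the library):

    #include <benchmark/benchmark.h>

    static void BM_MaybeSkipped(benchmark::State& state) {
      // Hypothetical precondition; SkipWithError marks the run as errored
      // whether or not the loop below is ever entered.
      bool setup_ok = false;
      if (!setup_ok) {
        state.SkipWithError("setup failed");
        return;  // skipping before the loop is now reported correctly
      }
      for (auto _ : state) {
        // measured work would go here
      }
    }
    BENCHMARK(BM_MaybeSkipped);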
diff --git a/libcxx/utils/google-benchmark/test/state_assembly_test.cc b/libcxx/utils/google-benchmark/test/state_assembly_test.cc
index abe9a4ddb56dd..7ddbb3b2a92c8 100644
--- a/libcxx/utils/google-benchmark/test/state_assembly_test.cc
+++ b/libcxx/utils/google-benchmark/test/state_assembly_test.cc
@@ -25,7 +25,7 @@ extern "C" int test_for_auto_loop() {
   for (auto _ : S) {
     // CHECK: .L[[LOOP_HEAD:[a-zA-Z0-9_]+]]:
     // CHECK-GNU-NEXT: subq $1, %rbx
-    // CHECK-CLANG-NEXT: {{(addq \$1,|incq)}} %rax
+    // CHECK-CLANG-NEXT: {{(addq \$1, %rax|incq %rax|addq \$-1, %rbx)}}
     // CHECK-NEXT: jne .L[[LOOP_HEAD]]
     benchmark::DoNotOptimize(x);
   }

diff --git a/libcxx/utils/google-benchmark/test/statistics_gtest.cc b/libcxx/utils/google-benchmark/test/statistics_gtest.cc
index 99e314920c55c..3ddc72dd7ac62 100644
--- a/libcxx/utils/google-benchmark/test/statistics_gtest.cc
+++ b/libcxx/utils/google-benchmark/test/statistics_gtest.cc
@@ -21,8 +21,8 @@ TEST(StatisticsTest, Median) {
 TEST(StatisticsTest, StdDev) {
   EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({101, 101, 101, 101}), 0.0);
   EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({1, 2, 3}), 1.0);
-  EXPECT_FLOAT_EQ(benchmark::StatisticsStdDev({1.5, 2.4, 3.3, 4.2, 5.1}),
-                  1.42302495);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({2.5, 2.4, 3.3, 4.2, 5.1}),
+                   1.151086443322134);
 }
 
 }  // end namespace

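The updated expectation is the sample standard deviation (n-1 denominator) of {2.5, 2.4, 3.3, 4.2, 5.1}: the mean is 3.5, the squared deviations sum to 5.3, and sqrt(5.3 / 4) = 1.151086443322134... A standalone check of that arithmetic:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Recompute the new expected value under the sample (n-1) convention.
    int main() {
      const std::vector<double> v = {2.5, 2.4, 3.3, 4.2, 5.1};
      double mean = 0.0;
      for (double x : v) mean += x;
      mean /= v.size();  // 17.5 / 5 = 3.5
      double ss = 0.0;
      for (double x : v) ss += (x - mean) * (x - mean);  // sums to 5.3
      std::printf("%.15f\n", std::sqrt(ss / (v.size() - 1)));
      // prints 1.151086443322134
      return 0;
    }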
diff --git a/libcxx/utils/google-benchmark/test/string_util_gtest.cc b/libcxx/utils/google-benchmark/test/string_util_gtest.cc
index 2c5d073f613b7..c7061b409e91c 100644
--- a/libcxx/utils/google-benchmark/test/string_util_gtest.cc
+++ b/libcxx/utils/google-benchmark/test/string_util_gtest.cc
@@ -3,6 +3,7 @@
 //===---------------------------------------------------------------------===//
 
 #include "../src/string_util.h"
+#include "../src/internal_macros.h"
 #include "gtest/gtest.h"
 
 namespace {
@@ -60,9 +61,11 @@ TEST(StringUtilTest, stoul) {
     EXPECT_EQ(0xBEEFul, benchmark::stoul("BEEF", &pos, 16));
     EXPECT_EQ(4ul, pos);
   }
+#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
   {
     ASSERT_THROW(benchmark::stoul("this is a test"), std::invalid_argument);
   }
+#endif
 }
 
 TEST(StringUtilTest, stoi) {
@@ -106,9 +109,11 @@ TEST(StringUtilTest, stoi) {
     EXPECT_EQ(0xBEEF, benchmark::stoi("BEEF", &pos, 16));
     EXPECT_EQ(4ul, pos);
   }
+#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
   {
     ASSERT_THROW(benchmark::stoi("this is a test"), std::invalid_argument);
   }
+#endif
 }
 
 TEST(StringUtilTest, stod) {
@@ -138,9 +143,19 @@ TEST(StringUtilTest, stod) {
     EXPECT_EQ(-1.25e+9, benchmark::stod("-1.25e+9", &pos));
     EXPECT_EQ(8ul, pos);
   }
+#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
   {
     ASSERT_THROW(benchmark::stod("this is a test"), std::invalid_argument);
   }
+#endif
+}
+
+TEST(StringUtilTest, StrSplit) {
+  EXPECT_EQ(benchmark::StrSplit("", ','), std::vector<std::string>{});
+  EXPECT_EQ(benchmark::StrSplit("hello", ','),
+            std::vector<std::string>({"hello"}));
+  EXPECT_EQ(benchmark::StrSplit("hello,there,is,more", ','),
+            std::vector<std::string>({"hello", "there", "is", "more"}));
 }
 
 }  // end namespace

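The new StrSplit test pins down the edge cases: an empty string yields an empty vector and an undelimited string yields a single element. A minimal implementation consistent with those expectations (a sketch, not necessarily the library's actual code):

    #include <string>
    #include <vector>

    std::vector<std::string> StrSplitSketch(const std::string& str,
                                            char delim) {
      std::vector<std::string> parts;
      if (str.empty()) return parts;  // "" -> {}
      std::string::size_type start = 0, pos;
      while ((pos = str.find(delim, start)) != std::string::npos) {
        parts.push_back(str.substr(start, pos - start));
        start = pos + 1;
      }
      parts.push_back(str.substr(start));  // "hello" -> {"hello"}
      return parts;
    }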
diff --git a/libcxx/utils/google-benchmark/test/user_counters_tabular_test.cc b/libcxx/utils/google-benchmark/test/user_counters_tabular_test.cc
index 030e98916c3d3..421f27b5cb8b0 100644
--- a/libcxx/utils/google-benchmark/test/user_counters_tabular_test.cc
+++ b/libcxx/utils/google-benchmark/test/user_counters_tabular_test.cc
@@ -7,19 +7,23 @@
 // @todo: <jpmag> this checks the full output at once; the rule for
 // CounterSet1 was failing because it was not matching "^[-]+$".
 // @todo: <jpmag> check that the counters are vertically aligned.
-ADD_CASES(
-    TC_ConsoleOut,
-    {
-        // keeping these lines long improves readability, so:
-        // clang-format off
+ADD_CASES(TC_ConsoleOut,
+          {
+              // keeping these lines long improves readability, so:
+              // clang-format off
     {"^[-]+$", MR_Next},
     {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Bat %s Baz %s Foo %s Frob %s Lob$", MR_Next},
     {"^[-]+$", MR_Next},
-    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:1 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:1 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:1_mean %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:1_median %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:1_stddev %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:2 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:2 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:2_mean %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:2_median %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:2_stddev %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
     {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
     {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
     {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
@@ -46,8 +50,8 @@ ADD_CASES(
     {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
     {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
     {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$"},
-        // clang-format on
-    });
+              // clang-format on
+          });
 ADD_CASES(TC_CSVOut, {{"%csv_header,"
                        "\"Bar\",\"Bat\",\"Baz\",\"Foo\",\"Frob\",\"Lob\""}});
 
@@ -68,11 +72,17 @@ void BM_Counters_Tabular(benchmark::State& state) {
       {"Lob", {32, bm::Counter::kAvgThreads}},
   });
 }
-BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 16);
+BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 2)->Repetitions(2);
 ADD_CASES(TC_JSONOut,
-          {{"\"name\": \"BM_Counters_Tabular/threads:%int\",$"},
-           {"\"run_name\": \"BM_Counters_Tabular/threads:%int\",$", MR_Next},
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -84,8 +94,205 @@ ADD_CASES(TC_JSONOut,
            {"\"Frob\": %float,$", MR_Next},
            {"\"Lob\": %float$", MR_Next},
            {"}", MR_Next}});
-ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Tabular/threads:%int\",%csv_report,"
-                       "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 1,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1_mean\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1_median\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1_stddev\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 1,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 2,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 1,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 1,$", MR_Next},
+           {"\"threads\": 2,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2_median\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 1,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 2,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2_stddev\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 1,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 2,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1_mean\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1_median\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1_stddev\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2_mean\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2_median\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2_stddev\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckTabular(Results const& e) {
@@ -96,7 +303,10 @@ void CheckTabular(Results const& e) {
   CHECK_COUNTER_VALUE(e, int, "Frob", EQ, 16);
   CHECK_COUNTER_VALUE(e, int, "Lob", EQ, 32);
 }
-CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/threads:%int", &CheckTabular);
+CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/repeats:2/threads:1$",
+                        &CheckTabular);
+CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/repeats:2/threads:2$",
+                        &CheckTabular);
 
 // ========================================================================= //
 // -------------------- Tabular+Rate Counters Output ----------------------- //
@@ -104,6 +314,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/threads:%int", &CheckTabular);
 
 void BM_CounterRates_Tabular(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   namespace bm = benchmark;
   state.counters.insert({
@@ -118,9 +330,14 @@ void BM_CounterRates_Tabular(benchmark::State& state) {
 BENCHMARK(BM_CounterRates_Tabular)->ThreadRange(1, 16);
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"},
+           {"\"family_index\": 1,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_CounterRates_Tabular/threads:%int\",$",
             MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -166,8 +383,13 @@ void BM_CounterSet0_Tabular(benchmark::State& state) {
 BENCHMARK(BM_CounterSet0_Tabular)->ThreadRange(1, 16);
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"},
+           {"\"family_index\": 2,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_CounterSet0_Tabular/threads:%int\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -201,8 +423,13 @@ void BM_CounterSet1_Tabular(benchmark::State& state) {
 BENCHMARK(BM_CounterSet1_Tabular)->ThreadRange(1, 16);
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"},
+           {"\"family_index\": 3,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_CounterSet1_Tabular/threads:%int\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -240,8 +467,13 @@ void BM_CounterSet2_Tabular(benchmark::State& state) {
 BENCHMARK(BM_CounterSet2_Tabular)->ThreadRange(1, 16);
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"},
+           {"\"family_index\": 4,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_CounterSet2_Tabular/threads:%int\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},

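The expectations above illustrate the new identity fields: each BENCHMARK() registration forms one family, family_index counts families in registration order, and per_family_instance_index counts the concrete instances within a family, which is why the threads:1 and threads:2 variants of BM_Counters_Tabular share family_index 0 but get instance indices 0 and 1. The DoNotOptimize(state.iterations()) calls keep the measured CPU time non-zero so rate counters never divide by zero. A condensed sketch of the shape being tested:

    #include <benchmark/benchmark.h>

    static void BM_Family(benchmark::State& state) {
      for (auto _ : state) {
        // Keep the loop from being optimized away so measured CPU time is
        // non-zero; rate counters divide by it.
        benchmark::DoNotOptimize(state.iterations());
      }
    }
    // One family (one family_index), two thread-count instances
    // (per_family_instance_index 0 and 1), each run twice
    // (repetition_index 0 and 1).
    BENCHMARK(BM_Family)->ThreadRange(1, 2)->Repetitions(2);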
diff --git a/libcxx/utils/google-benchmark/test/user_counters_test.cc b/libcxx/utils/google-benchmark/test/user_counters_test.cc
index bb0d6b4c5a91c..377bb32ca948c 100644
--- a/libcxx/utils/google-benchmark/test/user_counters_test.cc
+++ b/libcxx/utils/google-benchmark/test/user_counters_test.cc
@@ -32,8 +32,13 @@ BENCHMARK(BM_Counters_Simple);
 ADD_CASES(TC_ConsoleOut,
           {{"^BM_Counters_Simple %console_report bar=%hrfloat foo=%hrfloat$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Simple\",$"},
+                       {"\"family_index\": 0,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Counters_Simple\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -61,6 +66,8 @@ int num_calls1 = 0;
 }
 void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   state.counters["foo"] = 1;
   state.counters["bar"] = ++num_calls1;
@@ -73,8 +80,13 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_WithBytesAndItemsPSec %console_report "
                            "foo=%hrfloat items_per_second=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"},
+           {"\"family_index\": 1,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_WithBytesAndItemsPSec\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -105,6 +117,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_WithBytesAndItemsPSec",
 
 void BM_Counters_Rate(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{1, bm::Counter::kIsRate};
@@ -115,8 +129,13 @@ ADD_CASES(
     TC_ConsoleOut,
     {{"^BM_Counters_Rate %console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Rate\",$"},
+                       {"\"family_index\": 2,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Counters_Rate\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -135,6 +154,93 @@ void CheckRate(Results const& e) {
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_Rate", &CheckRate);
 
+// ========================================================================= //
+// ----------------------- Inverted Counters Output ------------------------ //
+// ========================================================================= //
+
+void BM_Invert(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{0.0001, bm::Counter::kInvert};
+  state.counters["bar"] = bm::Counter{10000, bm::Counter::kInvert};
+}
+BENCHMARK(BM_Invert);
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Invert %console_report bar=%hrfloatu foo=%hrfloatk$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Invert\",$"},
+                       {"\"family_index\": 3,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Invert\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Invert\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckInvert(Results const& e) {
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 10000, 0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 0.0001, 0.0001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Invert", &CheckInvert);
+
+// ========================================================================= //
+// --------------------- InvertedRate Counters Output ---------------------- //
+// ========================================================================= //
+
+void BM_Counters_InvertedRate(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] =
+      bm::Counter{1, bm::Counter::kIsRate | bm::Counter::kInvert};
+  state.counters["bar"] =
+      bm::Counter{8192, bm::Counter::kIsRate | bm::Counter::kInvert};
+}
+BENCHMARK(BM_Counters_InvertedRate);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_InvertedRate %console_report "
+                           "bar=%hrfloats foo=%hrfloats$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_InvertedRate\",$"},
+           {"\"family_index\": 4,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_InvertedRate\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_InvertedRate\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckInvertedRate(Results const& e) {
+  double t = e.DurationCPUTime();  // this (and not real time) is the time used
+  // check that the values are within 0.1% of the expected values
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, t / 8192.0, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_InvertedRate", &CheckInvertedRate);
+
 // ========================================================================= //
 // ------------------------- Thread Counters Output ------------------------ //
 // ========================================================================= //
@@ -150,8 +256,13 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Threads/threads:%int %console_report "
                            "bar=%hrfloat foo=%hrfloat$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"},
+           {"\"family_index\": 5,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_Threads/threads:%int\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -186,8 +297,13 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreads/threads:%int "
                            "%console_report bar=%hrfloat foo=%hrfloat$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"},
+           {"\"family_index\": 6,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_AvgThreads/threads:%int\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -213,6 +329,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreads/threads:%int",
 
 void BM_Counters_AvgThreadsRate(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreadsRate};
@@ -223,9 +341,14 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreadsRate/threads:%int "
                            "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"},
+           {"\"family_index\": 7,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$",
             MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -260,8 +383,13 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_IterationInvariant %console_report "
                            "bar=%hrfloat foo=%hrfloat$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_IterationInvariant\",$"},
+           {"\"family_index\": 8,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_IterationInvariant\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -288,6 +416,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_IterationInvariant",
 
 void BM_Counters_kIsIterationInvariantRate(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   namespace bm = benchmark;
   state.counters["foo"] =
@@ -300,9 +430,14 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_kIsIterationInvariantRate "
                            "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_kIsIterationInvariantRate\",$"},
+           {"\"family_index\": 9,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_kIsIterationInvariantRate\",$",
             MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -340,8 +475,13 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgIterations %console_report "
                            "bar=%hrfloat foo=%hrfloat$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_AvgIterations\",$"},
+           {"\"family_index\": 10,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_AvgIterations\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -367,6 +507,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_AvgIterations", &CheckAvgIterations);
 
 void BM_Counters_kAvgIterationsRate(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgIterationsRate};
@@ -378,8 +520,13 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_kAvgIterationsRate "
                            "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_kAvgIterationsRate\",$"},
+           {"\"family_index\": 11,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_kAvgIterationsRate\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},

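The new kInvert flag reports 1/value, and combined with kIsRate it flips items-per-second into seconds-per-item; that is why CheckInvert expects foo == 1/0.0001 == 10000 and bar == 1/10000 == 0.0001, and why CheckInvertedRate compares against the measured CPU time t. A usage sketch with illustrative counter names:

    #include <benchmark/benchmark.h>

    static void BM_InvertedCounters(benchmark::State& state) {
      for (auto _ : state) {
        benchmark::DoNotOptimize(state.iterations());  // non-zero CPU time
      }
      namespace bm = benchmark;
      // Reported as 1/0.25 = 4.
      state.counters["inv"] = bm::Counter{0.25, bm::Counter::kInvert};
      // kIsRate alone reports items/sec; adding kInvert flips it to sec/item.
      state.counters["sec_per_item"] =
          bm::Counter{1024, bm::Counter::kIsRate | bm::Counter::kInvert};
    }
    BENCHMARK(BM_InvertedCounters);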
diff --git a/libcxx/utils/google-benchmark/test/user_counters_thousands_test.cc b/libcxx/utils/google-benchmark/test/user_counters_thousands_test.cc
index fa0ef97204704..bbe194264ed45 100644
--- a/libcxx/utils/google-benchmark/test/user_counters_thousands_test.cc
+++ b/libcxx/utils/google-benchmark/test/user_counters_thousands_test.cc
@@ -51,8 +51,13 @@ ADD_CASES(
     });
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -65,8 +70,13 @@ ADD_CASES(TC_JSONOut,
            {"}", MR_Next}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 1,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -79,8 +89,12 @@ ADD_CASES(TC_JSONOut,
            {"}", MR_Next}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_Thousands/repeats:2_mean\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"mean\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
@@ -94,8 +108,12 @@ ADD_CASES(TC_JSONOut,
            {"}", MR_Next}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_Thousands/repeats:2_median\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"median\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
@@ -109,8 +127,12 @@ ADD_CASES(TC_JSONOut,
            {"}", MR_Next}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_Thousands/repeats:2_stddev\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"stddev\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},

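A note on reading the expectations above: the output test harness treats each string as a regular expression, MR_Next requires the match to land on the line immediately following the previous match, and placeholders such as %int and %float are substituted with numeric regexes before matching. A minimal Python sketch of the substitution step; the placeholder regexes below are assumptions for illustration, not copied from the harness:

import re

# Hypothetical stand-ins for the harness' numeric placeholders.
SUBSTITUTIONS = {
    '%int': r'[0-9]+',
    '%float': r'[-+]?[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?',
}

def expand(pattern):
    # Replace each placeholder; the trailing '$' in the expectations
    # anchors the pattern to the end of the output line.
    for placeholder, regex in SUBSTITUTIONS.items():
        pattern = pattern.replace(placeholder, regex)
    return re.compile(pattern)

assert expand('"iterations": %int,$').search('  "iterations": 1000,')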
diff --git a/libcxx/utils/google-benchmark/tools/BUILD.bazel b/libcxx/utils/google-benchmark/tools/BUILD.bazel
new file mode 100644
index 0000000000000..5895883a2eb3c
--- /dev/null
+++ b/libcxx/utils/google-benchmark/tools/BUILD.bazel
@@ -0,0 +1,19 @@
+load("@py_deps//:requirements.bzl", "requirement")
+
+py_library(
+    name = "gbench",
+    srcs = glob(["gbench/*.py"]),
+    deps = [
+      requirement("numpy"),
+      requirement("scipy"),
+    ],
+)
+
+py_binary(
+    name = "compare",
+    srcs = ["compare.py"],
+    python_version = "PY2",
+    deps = [
+        ":gbench",
+    ],
+)

diff --git a/libcxx/utils/google-benchmark/tools/compare.py b/libcxx/utils/google-benchmark/tools/compare.py
index 539ace6fb163b..01d2c89f50fbb 100755
--- a/libcxx/utils/google-benchmark/tools/compare.py
+++ b/libcxx/utils/google-benchmark/tools/compare.py
@@ -7,6 +7,7 @@
 
 import argparse
 from argparse import ArgumentParser
+import json
 import sys
 import gbench
 from gbench import util, report
@@ -48,6 +49,20 @@ def create_parser():
              "of repetitions. Do note that only the display is affected. "
              "Internally, all the actual runs are still used, e.g. for U test.")
 
+    parser.add_argument(
+        '--no-color',
+        dest='color',
+        default=True,
+        action="store_false",
+        help="Do not use colors in the terminal output"
+    )
+
+    parser.add_argument(
+        '-d',
+        '--dump_to_json',
+        dest='dump_to_json',
+        help="Additionally, dump benchmark comparison output to this file in JSON format.")
+
     utest = parser.add_argument_group()
     utest.add_argument(
         '--no-utest',
@@ -223,10 +238,10 @@ def main():
         options_contender = ['--benchmark_filter=%s' % filter_contender]
 
     # Run the benchmarks and report the results
-    json1 = json1_orig = gbench.util.run_or_load_benchmark(
-        test_baseline, benchmark_options + options_baseline)
-    json2 = json2_orig = gbench.util.run_or_load_benchmark(
-        test_contender, benchmark_options + options_contender)
+    json1 = json1_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark(
+        test_baseline, benchmark_options + options_baseline))
+    json2 = json2_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark(
+        test_contender, benchmark_options + options_contender))
 
     # Now, filter the benchmarks so that the difference report can work
     if filter_baseline and filter_contender:
@@ -236,14 +251,20 @@ def main():
         json2 = gbench.report.filter_benchmark(
             json2_orig, filter_contender, replacement)
 
-    # Diff and output
-    output_lines = gbench.report.generate_difference_report(
-        json1, json2, args.display_aggregates_only,
-        args.utest, args.utest_alpha)
+    diff_report = gbench.report.get_difference_report(
+        json1, json2, args.utest)
+    output_lines = gbench.report.print_difference_report(
+        diff_report,
+        args.display_aggregates_only,
+        args.utest, args.utest_alpha, args.color)
     print(description)
     for ln in output_lines:
         print(ln)
 
+    # Optionally, diff and output to JSON
+    if args.dump_to_json is not None:
+        with open(args.dump_to_json, 'w') as f_json:
+            json.dump(diff_report, f_json)
 
 class TestParser(unittest.TestCase):
     def setUp(self):

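As a usage sketch (the file names here are illustrative, not part of the patch): the comparison can now be rendered without colors and additionally dumped for later post-processing, e.g. tools/compare.py benchmarks base.json contender.json --no-color -d diff.json. The dumped file is simply the diff_report list serialized with json.dump, so it can be consumed along these lines:

import json

# 'diff.json' is the hypothetical file passed via -d/--dump_to_json above.
with open('diff.json') as f:
    diff_report = json.load(f)

for bench in diff_report:
    for m in bench['measurements']:
        # 'time' and 'cpu' hold the relative change between the two runs.
        print(bench['name'], m['time'], m['cpu'])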
diff --git a/libcxx/utils/google-benchmark/tools/gbench/Inputs/test1_run1.json b/libcxx/utils/google-benchmark/tools/gbench/Inputs/test1_run1.json
index d7ec6a9c8f61a..601e327aefb59 100644
--- a/libcxx/utils/google-benchmark/tools/gbench/Inputs/test1_run1.json
+++ b/libcxx/utils/google-benchmark/tools/gbench/Inputs/test1_run1.json
@@ -85,7 +85,24 @@
       "time_unit": "ns"
     },
     {
-      "name": "BM_BadTimeUnit",
+      "name": "MyComplexityTest_BigO",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "BigO",
+      "cpu_coefficient": 4.2749856294592886e+00,
+      "real_coefficient": 6.4789275289789780e+00,
+      "big_o": "N",
+      "time_unit": "ns"
+    },
+    {
+      "name": "MyComplexityTest_RMS",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "RMS",
+      "rms": 4.5097802512472874e-03
+    },
+    {
+      "name": "BM_NotBadTimeUnit",
       "iterations": 1000,
       "real_time": 0.4,
       "cpu_time": 0.5,

diff --git a/libcxx/utils/google-benchmark/tools/gbench/Inputs/test1_run2.json b/libcxx/utils/google-benchmark/tools/gbench/Inputs/test1_run2.json
index 59a5ffaca4d4d..3cbcf39b0c938 100644
--- a/libcxx/utils/google-benchmark/tools/gbench/Inputs/test1_run2.json
+++ b/libcxx/utils/google-benchmark/tools/gbench/Inputs/test1_run2.json
@@ -85,7 +85,24 @@
       "time_unit": "ns"
     },
     {
-      "name": "BM_BadTimeUnit",
+      "name": "MyComplexityTest_BigO",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "BigO",
+      "cpu_coefficient": 5.6215779594361486e+00,
+      "real_coefficient": 5.6288314793554610e+00,
+      "big_o": "N",
+      "time_unit": "ns"
+    },
+    {
+      "name": "MyComplexityTest_RMS",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "RMS",
+      "rms": 3.3128901852342174e-03
+    },
+    {
+      "name": "BM_NotBadTimeUnit",
       "iterations": 1000,
       "real_time": 0.04,
       "cpu_time": 0.6,

diff --git a/libcxx/utils/google-benchmark/tools/gbench/Inputs/test4_run.json b/libcxx/utils/google-benchmark/tools/gbench/Inputs/test4_run.json
new file mode 100644
index 0000000000000..eaa005f3a9f47
--- /dev/null
+++ b/libcxx/utils/google-benchmark/tools/gbench/Inputs/test4_run.json
@@ -0,0 +1,96 @@
+{
+  "benchmarks": [
+    {
+      "name": "99 family 0 instance 0 repetition 0",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "repetition_index": 0
+    },
+    {
+      "name": "98 family 0 instance 0 repetition 1",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "repetition_index": 1
+    },
+    {
+      "name": "97 family 0 instance 0 aggregate",
+      "run_type": "aggregate",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "aggregate_name": "9 aggregate"
+    },
+
+
+    {
+      "name": "96 family 0 instance 1 repetition 0",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 1,
+      "repetition_index": 0
+    },
+    {
+      "name": "95 family 0 instance 1 repetition 1",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 1,
+      "repetition_index": 1
+    },
+    {
+      "name": "94 family 0 instance 1 aggregate",
+      "run_type": "aggregate",
+      "family_index": 0,
+      "per_family_instance_index": 1,
+      "aggregate_name": "9 aggregate"
+    },
+
+
+
+
+    {
+      "name": "93 family 1 instance 0 repetition 0",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 0,
+      "repetition_index": 0
+    },
+    {
+      "name": "92 family 1 instance 0 repetition 1",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 0,
+      "repetition_index": 1
+    },
+    {
+      "name": "91 family 1 instance 0 aggregate",
+      "run_type": "aggregate",
+      "family_index": 1,
+      "per_family_instance_index": 0,
+      "aggregate_name": "9 aggregate"
+    },
+
+
+    {
+      "name": "90 family 1 instance 1 repetition 0",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 1,
+      "repetition_index": 0
+    },
+    {
+      "name": "89 family 1 instance 1 repetition 1",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 1,
+      "repetition_index": 1
+    },
+    {
+      "name": "88 family 1 instance 1 aggregate",
+      "run_type": "aggregate",
+      "family_index": 1,
+      "per_family_instance_index": 1,
+      "aggregate_name": "9 aggregate"
+    }
+  ]
+}

diff --git a/libcxx/utils/google-benchmark/tools/gbench/report.py b/libcxx/utils/google-benchmark/tools/gbench/report.py
index 5085b9319475b..6bea82f6bf7b4 100644
--- a/libcxx/utils/google-benchmark/tools/gbench/report.py
+++ b/libcxx/utils/google-benchmark/tools/gbench/report.py
@@ -1,9 +1,11 @@
-import unittest
 """report.py - Utilities for reporting statistics about benchmark results
 """
+
+import unittest
 import os
 import re
 import copy
+import random
 
 from scipy.stats import mannwhitneyu
 
@@ -114,6 +116,10 @@ def intersect(list1, list2):
     return [x for x in list1 if x in list2]
 
 
+def is_potentially_comparable_benchmark(x):
+    return ('time_unit' in x and 'real_time' in x and 'cpu_time' in x)
+
+
 def partition_benchmarks(json1, json2):
     """
     While preserving the ordering, find benchmarks with the same names in
@@ -125,10 +131,17 @@ def partition_benchmarks(json1, json2):
     names = intersect(json1_unique_names, json2_unique_names)
     partitions = []
     for name in names:
+        time_unit = None
         # Pick the time unit from the first entry of the lhs benchmark.
-        time_unit = (x['time_unit']
-                     for x in json1['benchmarks'] if x['name'] == name).next()
+        # We should be careful not to crash with unexpected input.
+        for x in json1['benchmarks']:
+            if (x['name'] == name and is_potentially_comparable_benchmark(x)):
+                time_unit = x['time_unit']
+                break
+        if time_unit is None:
+            continue
         # Filter by name and time unit.
+        # All the repetitions are assumed to be comparable.
         lhs = [x for x in json1['benchmarks'] if x['name'] == name and
                x['time_unit'] == time_unit]
         rhs = [x for x in json2['benchmarks'] if x['name'] == name and
@@ -144,10 +157,7 @@ def extract_field(partition, field_name):
     return [lhs, rhs]
 
 
-def print_utest(partition, utest_alpha, first_col_width, use_color=True):
-    timings_time = extract_field(partition, 'real_time')
-    timings_cpu = extract_field(partition, 'cpu_time')
-
+def calc_utest(timings_cpu, timings_time):
     min_rep_cnt = min(len(timings_time[0]),
                       len(timings_time[1]),
                       len(timings_cpu[0]),
@@ -155,43 +165,115 @@ def print_utest(partition, utest_alpha, first_col_width, use_color=True):
 
     # Does *everything* have at least UTEST_MIN_REPETITIONS repetitions?
     if min_rep_cnt < UTEST_MIN_REPETITIONS:
-        return []
-
-    def get_utest_color(pval):
-        return BC_FAIL if pval >= utest_alpha else BC_OKGREEN
+        return False, None, None
 
     time_pvalue = mannwhitneyu(
         timings_time[0], timings_time[1], alternative='two-sided').pvalue
     cpu_pvalue = mannwhitneyu(
         timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue
 
+    return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue
+
+def print_utest(bc_name, utest, utest_alpha, first_col_width, use_color=True):
+    def get_utest_color(pval):
+        return BC_FAIL if pval >= utest_alpha else BC_OKGREEN
+
+    # Check if we failed miserably with minimum required repetitions for utest
+    if not utest['have_optimal_repetitions'] and utest['cpu_pvalue'] is None and utest['time_pvalue'] is None:
+        return []
+
     dsc = "U Test, Repetitions: {} vs {}".format(
-        len(timings_cpu[0]), len(timings_cpu[1]))
+        utest['nr_of_repetitions'], utest['nr_of_repetitions_other'])
     dsc_color = BC_OKGREEN
 
-    if min_rep_cnt < UTEST_OPTIMAL_REPETITIONS:
+    # We still got some results to show but issue a warning about it.
+    if not utest['have_optimal_repetitions']:
         dsc_color = BC_WARNING
         dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
             UTEST_OPTIMAL_REPETITIONS)
 
     special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{}      {}"
 
-    last_name = partition[0][0]['name']
     return [color_format(use_color,
                          special_str,
                          BC_HEADER,
-                         "{}{}".format(last_name, UTEST_COL_NAME),
+                         "{}{}".format(bc_name, UTEST_COL_NAME),
                          first_col_width,
-                         get_utest_color(time_pvalue), time_pvalue,
-                         get_utest_color(cpu_pvalue), cpu_pvalue,
+                         get_utest_color(
+                             utest['time_pvalue']), utest['time_pvalue'],
+                         get_utest_color(
+                             utest['cpu_pvalue']), utest['cpu_pvalue'],
                          dsc_color, dsc,
                          endc=BC_ENDC)]
 
 
-def generate_difference_report(
+def get_difference_report(
         json1,
         json2,
-        display_aggregates_only=False,
+        utest=False):
+    """
+    Calculate and report the difference between each test of two benchmarks
+    runs specified as 'json1' and 'json2'. Output is another json containing
+    relevant details for each test run.
+    """
+    assert utest is True or utest is False
+
+    diff_report = []
+    partitions = partition_benchmarks(json1, json2)
+    for partition in partitions:
+        benchmark_name = partition[0][0]['name']
+        time_unit = partition[0][0]['time_unit']
+        measurements = []
+        utest_results = {}
+        # Careful, we may have different repetition count.
+        for i in range(min(len(partition[0]), len(partition[1]))):
+            bn = partition[0][i]
+            other_bench = partition[1][i]
+            measurements.append({
+                'real_time': bn['real_time'],
+                'cpu_time': bn['cpu_time'],
+                'real_time_other': other_bench['real_time'],
+                'cpu_time_other': other_bench['cpu_time'],
+                'time': calculate_change(bn['real_time'], other_bench['real_time']),
+                'cpu': calculate_change(bn['cpu_time'], other_bench['cpu_time'])
+            })
+
+        # After processing the whole partition, if requested, do the U test.
+        if utest:
+            timings_cpu = extract_field(partition, 'cpu_time')
+            timings_time = extract_field(partition, 'real_time')
+            have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(timings_cpu, timings_time)
+            if cpu_pvalue and time_pvalue:
+                utest_results = {
+                    'have_optimal_repetitions': have_optimal_repetitions,
+                    'cpu_pvalue': cpu_pvalue,
+                    'time_pvalue': time_pvalue,
+                    'nr_of_repetitions': len(timings_cpu[0]),
+                    'nr_of_repetitions_other': len(timings_cpu[1])
+                }
+
+        # Store only if we had any measurements for given benchmark.
+        # E.g. partition_benchmarks will filter out the benchmarks having
+        # time units which are not compatible with other time units in the
+        # benchmark suite.
+        if measurements:
+            run_type = partition[0][0]['run_type'] if 'run_type' in partition[0][0] else ''
+            aggregate_name = partition[0][0]['aggregate_name'] if run_type == 'aggregate' and 'aggregate_name' in partition[0][0] else ''
+            diff_report.append({
+                'name': benchmark_name,
+                'measurements': measurements,
+                'time_unit': time_unit,
+                'run_type': run_type,
+                'aggregate_name': aggregate_name,
+                'utest': utest_results
+            })
+
+    return diff_report
+
+
+def print_difference_report(
+        json_diff_report,
+        include_aggregates_only=False,
         utest=False,
         utest_alpha=0.05,
         use_color=True):
@@ -200,14 +282,16 @@ def generate_difference_report(
     runs specified as 'json1' and 'json2'.
     """
     assert utest is True or utest is False
-    first_col_width = find_longest_name(json1['benchmarks'])
 
-    def find_test(name):
-        for b in json2['benchmarks']:
-            if b['name'] == name:
-                return b
-        return None
+    def get_color(res):
+        if res > 0.05:
+            return BC_FAIL
+        elif res > -0.07:
+            return BC_WHITE
+        else:
+            return BC_CYAN
 
+    first_col_width = find_longest_name(json_diff_report)
     first_col_width = max(
         first_col_width,
         len('Benchmark'))
@@ -216,50 +300,33 @@ def find_test(name):
         'Benchmark', 12 + first_col_width)
     output_strs = [first_line, '-' * len(first_line)]
 
-    partitions = partition_benchmarks(json1, json2)
-    for partition in partitions:
-        # Careful, we may have different repetition count.
-        for i in range(min(len(partition[0]), len(partition[1]))):
-            bn = partition[0][i]
-            other_bench = partition[1][i]
-
-            # *If* we were asked to only display aggregates,
-            # and if it is non-aggregate, then skip it.
-            if display_aggregates_only and 'run_type' in bn and 'run_type' in other_bench:
-                assert bn['run_type'] == other_bench['run_type']
-                if bn['run_type'] != 'aggregate':
-                    continue
-
-            fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
-
-            def get_color(res):
-                if res > 0.05:
-                    return BC_FAIL
-                elif res > -0.07:
-                    return BC_WHITE
-                else:
-                    return BC_CYAN
-
-            tres = calculate_change(bn['real_time'], other_bench['real_time'])
-            cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
-            output_strs += [color_format(use_color,
-                                         fmt_str,
-                                         BC_HEADER,
-                                         bn['name'],
-                                         first_col_width,
-                                         get_color(tres),
-                                         tres,
-                                         get_color(cpures),
-                                         cpures,
-                                         bn['real_time'],
-                                         other_bench['real_time'],
-                                         bn['cpu_time'],
-                                         other_bench['cpu_time'],
-                                         endc=BC_ENDC)]
-
-        # After processing the whole partition, if requested, do the U test.
-        if utest:
-            output_strs += print_utest(partition,
+    fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
+    for benchmark in json_diff_report:
+        # *If* we were asked to only include aggregates,
+        # and if it is non-aggregate, then don't print it.
+        if not include_aggregates_only or not 'run_type' in benchmark or benchmark['run_type'] == 'aggregate':
+            for measurement in benchmark['measurements']:
+                output_strs += [color_format(use_color,
+                                            fmt_str,
+                                            BC_HEADER,
+                                            benchmark['name'],
+                                            first_col_width,
+                                            get_color(measurement['time']),
+                                            measurement['time'],
+                                            get_color(measurement['cpu']),
+                                            measurement['cpu'],
+                                            measurement['real_time'],
+                                            measurement['real_time_other'],
+                                            measurement['cpu_time'],
+                                            measurement['cpu_time_other'],
+                                            endc=BC_ENDC)]
+
+        # After processing the measurements, if requested and
+        # if applicable (e.g. u-test exists for given benchmark),
+        # print the U test.
+        if utest and benchmark['utest']:
+            output_strs += print_utest(benchmark['name'],
+                                       benchmark['utest'],
                                        utest_alpha=utest_alpha,
                                        first_col_width=first_col_width,
                                        use_color=use_color)
@@ -300,21 +367,26 @@ def test_basic(self):
 
 
 class TestReportDifference(unittest.TestCase):
-    def load_results(self):
-        import json
-        testInputs = os.path.join(
-            os.path.dirname(
-                os.path.realpath(__file__)),
-            'Inputs')
-        testOutput1 = os.path.join(testInputs, 'test1_run1.json')
-        testOutput2 = os.path.join(testInputs, 'test1_run2.json')
-        with open(testOutput1, 'r') as f:
-            json1 = json.load(f)
-        with open(testOutput2, 'r') as f:
-            json2 = json.load(f)
-        return json1, json2
-
-    def test_basic(self):
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput1 = os.path.join(testInputs, 'test1_run1.json')
+            testOutput2 = os.path.join(testInputs, 'test1_run2.json')
+            with open(testOutput1, 'r') as f:
+                json1 = json.load(f)
+            with open(testOutput2, 'r') as f:
+                json2 = json.load(f)
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(json1, json2)
+
+    def test_json_diff_report_pretty_printing(self):
         expect_lines = [
             ['BM_SameTimes', '+0.0000', '+0.0000', '10', '10', '10', '10'],
             ['BM_2xFaster', '-0.5000', '-0.5000', '50', '25', '50', '25'],
@@ -330,11 +402,10 @@ def test_basic(self):
             ['BM_10PercentCPUToTime', '+0.1000',
                 '-0.1000', '100', '110', '100', '90'],
             ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
-            ['BM_BadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
+            ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
         ]
-        json1, json2 = self.load_results()
-        output_lines_with_header = generate_difference_report(
-            json1, json2, use_color=False)
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, use_color=False)
         output_lines = output_lines_with_header[2:]
         print("\n")
         print("\n".join(output_lines_with_header))
@@ -344,31 +415,118 @@ def test_basic(self):
             self.assertEqual(len(parts), 7)
             self.assertEqual(expect_lines[i], parts)
 
+    def test_json_diff_report_output(self):
+        expected_output = [
+            {
+                'name': 'BM_SameTimes',
+                'measurements': [{'time': 0.0000, 'cpu': 0.0000, 'real_time': 10, 'real_time_other': 10, 'cpu_time': 10, 'cpu_time_other': 10}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_2xFaster',
+                'measurements': [{'time': -0.5000, 'cpu': -0.5000, 'real_time': 50, 'real_time_other': 25, 'cpu_time': 50, 'cpu_time_other': 25}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_2xSlower',
+                'measurements': [{'time': 1.0000, 'cpu': 1.0000, 'real_time': 50, 'real_time_other': 100, 'cpu_time': 50, 'cpu_time_other': 100}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_1PercentFaster',
+                'measurements': [{'time': -0.0100, 'cpu': -0.0100, 'real_time': 100, 'real_time_other': 98.9999999, 'cpu_time': 100, 'cpu_time_other': 98.9999999}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_1PercentSlower',
+                'measurements': [{'time': 0.0100, 'cpu': 0.0100, 'real_time': 100, 'real_time_other': 101, 'cpu_time': 100, 'cpu_time_other': 101}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_10PercentFaster',
+                'measurements': [{'time': -0.1000, 'cpu': -0.1000, 'real_time': 100, 'real_time_other': 90, 'cpu_time': 100, 'cpu_time_other': 90}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_10PercentSlower',
+                'measurements': [{'time': 0.1000, 'cpu': 0.1000, 'real_time': 100, 'real_time_other': 110, 'cpu_time': 100, 'cpu_time_other': 110}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_100xSlower',
+                'measurements': [{'time': 99.0000, 'cpu': 99.0000, 'real_time': 100, 'real_time_other': 10000, 'cpu_time': 100, 'cpu_time_other': 10000}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_100xFaster',
+                'measurements': [{'time': -0.9900, 'cpu': -0.9900, 'real_time': 10000, 'real_time_other': 100, 'cpu_time': 10000, 'cpu_time_other': 100}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_10PercentCPUToTime',
+                'measurements': [{'time': 0.1000, 'cpu': -0.1000, 'real_time': 100, 'real_time_other': 110, 'cpu_time': 100, 'cpu_time_other': 90}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_ThirdFaster',
+                'measurements': [{'time': -0.3333, 'cpu': -0.3334, 'real_time': 100, 'real_time_other': 67, 'cpu_time': 100, 'cpu_time_other': 67}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_NotBadTimeUnit',
+                'measurements': [{'time': -0.9000, 'cpu': 0.2000, 'real_time': 0.4, 'real_time_other': 0.04, 'cpu_time': 0.5, 'cpu_time_other': 0.6}],
+                'time_unit': 's',
+                'utest': {}
+            },
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
 
 class TestReportDifferenceBetweenFamilies(unittest.TestCase):
-    def load_result(self):
-        import json
-        testInputs = os.path.join(
-            os.path.dirname(
-                os.path.realpath(__file__)),
-            'Inputs')
-        testOutput = os.path.join(testInputs, 'test2_run.json')
-        with open(testOutput, 'r') as f:
-            json = json.load(f)
-        return json
+    @classmethod
+    def setUpClass(cls):
+        def load_result():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput = os.path.join(testInputs, 'test2_run.json')
+            with open(testOutput, 'r') as f:
+                json = json.load(f)
+            return json
+
+        json = load_result()
+        json1 = filter_benchmark(json, "BM_Z.ro", ".")
+        json2 = filter_benchmark(json, "BM_O.e", ".")
+        cls.json_diff_report = get_difference_report(json1, json2)
 
-    def test_basic(self):
+    def test_json_diff_report_pretty_printing(self):
         expect_lines = [
             ['.', '-0.5000', '-0.5000', '10', '5', '10', '5'],
             ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
             ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
             ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
         ]
-        json = self.load_result()
-        json1 = filter_benchmark(json, "BM_Z.ro", ".")
-        json2 = filter_benchmark(json, "BM_O.e", ".")
-        output_lines_with_header = generate_difference_report(
-            json1, json2, use_color=False)
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, use_color=False)
         output_lines = output_lines_with_header[2:]
         print("\n")
         print("\n".join(output_lines_with_header))
@@ -378,24 +536,64 @@ def test_basic(self):
             self.assertEqual(len(parts), 7)
             self.assertEqual(expect_lines[i], parts)
 
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                'name': u'.',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 10, 'real_time_other': 5, 'cpu_time': 10, 'cpu_time_other': 5}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'./4',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 40, 'real_time_other': 20, 'cpu_time': 40, 'cpu_time_other': 20}],
+                'time_unit': 'ns',
+                'utest': {},
+            },
+            {
+                'name': u'Prefix/.',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 20, 'real_time_other': 10, 'cpu_time': 20, 'cpu_time_other': 10}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'Prefix/./3',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 30, 'real_time_other': 15, 'cpu_time': 30, 'cpu_time_other': 15}],
+                'time_unit': 'ns',
+                'utest': {}
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
 
 class TestReportDifferenceWithUTest(unittest.TestCase):
-    def load_results(self):
-        import json
-        testInputs = os.path.join(
-            os.path.dirname(
-                os.path.realpath(__file__)),
-            'Inputs')
-        testOutput1 = os.path.join(testInputs, 'test3_run0.json')
-        testOutput2 = os.path.join(testInputs, 'test3_run1.json')
-        with open(testOutput1, 'r') as f:
-            json1 = json.load(f)
-        with open(testOutput2, 'r') as f:
-            json2 = json.load(f)
-        return json1, json2
-
-    def test_utest(self):
-        expect_lines = []
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput1 = os.path.join(testInputs, 'test3_run0.json')
+            testOutput2 = os.path.join(testInputs, 'test3_run1.json')
+            with open(testOutput1, 'r') as f:
+                json1 = json.load(f)
+            with open(testOutput2, 'r') as f:
+                json2 = json.load(f)
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(
+            json1, json2, utest=True)
+
+    def test_json_diff_report_pretty_printing(self):
         expect_lines = [
             ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
             ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
@@ -434,9 +632,8 @@ def test_utest(self):
              'recommended.'],
             ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
         ]
-        json1, json2 = self.load_results()
-        output_lines_with_header = generate_difference_report(
-            json1, json2, utest=True, utest_alpha=0.05, use_color=False)
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False)
         output_lines = output_lines_with_header[2:]
         print("\n")
         print("\n".join(output_lines_with_header))
@@ -445,25 +642,151 @@ def test_utest(self):
             parts = [x for x in output_lines[i].split(' ') if x]
             self.assertEqual(expect_lines[i], parts)
 
+    def test_json_diff_report_pretty_printing_aggregates_only(self):
+        expect_lines = [
+            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
+            ['BM_Two_pvalue',
+             '0.6985',
+             '0.6985',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '2.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
+            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
+            ['short_pvalue',
+             '0.7671',
+             '0.1489',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '3.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, include_aggregates_only=True, utest=True, utest_alpha=0.05, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                'name': u'BM_One',
+                'measurements': [
+                    {'time': -0.1,
+                     'cpu': 0.1,
+                     'real_time': 10,
+                     'real_time_other': 9,
+                     'cpu_time': 100,
+                     'cpu_time_other': 110}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'BM_Two',
+                'measurements': [
+                    {'time': 0.1111111111111111,
+                     'cpu': -0.011111111111111112,
+                     'real_time': 9,
+                     'real_time_other': 10,
+                     'cpu_time': 90,
+                     'cpu_time_other': 89},
+                    {'time': -0.125, 'cpu': -0.16279069767441862, 'real_time': 8,
+                        'real_time_other': 7, 'cpu_time': 86, 'cpu_time_other': 72}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6985353583033387, 'time_pvalue': 0.6985353583033387
+                }
+            },
+            {
+                'name': u'short',
+                'measurements': [
+                    {'time': -0.125,
+                     'cpu': -0.0625,
+                     'real_time': 8,
+                     'real_time_other': 7,
+                     'cpu_time': 80,
+                     'cpu_time_other': 75},
+                    {'time': -0.4325,
+                     'cpu': -0.13506493506493514,
+                     'real_time': 8,
+                     'real_time_other': 4.54,
+                     'cpu_time': 77,
+                     'cpu_time_other': 66.6}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.14891467317876572, 'time_pvalue': 0.7670968684102772
+                }
+            },
+            {
+                'name': u'medium',
+                'measurements': [
+                    {'time': -0.375,
+                     'cpu': -0.3375,
+                     'real_time': 8,
+                     'real_time_other': 5,
+                     'cpu_time': 80,
+                     'cpu_time_other': 53}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
 
 class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
         unittest.TestCase):
-    def load_results(self):
-        import json
-        testInputs = os.path.join(
-            os.path.dirname(
-                os.path.realpath(__file__)),
-            'Inputs')
-        testOutput1 = os.path.join(testInputs, 'test3_run0.json')
-        testOutput2 = os.path.join(testInputs, 'test3_run1.json')
-        with open(testOutput1, 'r') as f:
-            json1 = json.load(f)
-        with open(testOutput2, 'r') as f:
-            json2 = json.load(f)
-        return json1, json2
-
-    def test_utest(self):
-        expect_lines = []
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput1 = os.path.join(testInputs, 'test3_run0.json')
+            testOutput2 = os.path.join(testInputs, 'test3_run1.json')
+            with open(testOutput1, 'r') as f:
+                json1 = json.load(f)
+            with open(testOutput2, 'r') as f:
+                json2 = json.load(f)
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(
+            json1, json2, utest=True)
+
+    def test_json_diff_report_pretty_printing(self):
         expect_lines = [
             ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
             ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
@@ -500,10 +823,10 @@ def test_utest(self):
              '9+',
              'repetitions',
              'recommended.'],
+             ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53']
         ]
-        json1, json2 = self.load_results()
-        output_lines_with_header = generate_difference_report(
-            json1, json2, display_aggregates_only=True,
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report,
             utest=True, utest_alpha=0.05, use_color=False)
         output_lines = output_lines_with_header[2:]
         print("\n")
@@ -513,6 +836,152 @@ def test_utest(self):
             parts = [x for x in output_lines[i].split(' ') if x]
             self.assertEqual(expect_lines[i], parts)
 
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                'name': u'BM_One',
+                'measurements': [
+                    {'time': -0.1,
+                     'cpu': 0.1,
+                     'real_time': 10,
+                     'real_time_other': 9,
+                     'cpu_time': 100,
+                     'cpu_time_other': 110}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'BM_Two',
+                'measurements': [
+                    {'time': 0.1111111111111111,
+                     'cpu': -0.011111111111111112,
+                     'real_time': 9,
+                     'real_time_other': 10,
+                     'cpu_time': 90,
+                     'cpu_time_other': 89},
+                    {'time': -0.125, 'cpu': -0.16279069767441862, 'real_time': 8,
+                        'real_time_other': 7, 'cpu_time': 86, 'cpu_time_other': 72}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6985353583033387, 'time_pvalue': 0.6985353583033387
+                }
+            },
+            {
+                'name': u'short',
+                'measurements': [
+                    {'time': -0.125,
+                     'cpu': -0.0625,
+                     'real_time': 8,
+                     'real_time_other': 7,
+                     'cpu_time': 80,
+                     'cpu_time_other': 75},
+                    {'time': -0.4325,
+                     'cpu': -0.13506493506493514,
+                     'real_time': 8,
+                     'real_time_other': 4.54,
+                     'cpu_time': 77,
+                     'cpu_time_other': 66.6}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.14891467317876572, 'time_pvalue': 0.7670968684102772
+                }
+            },
+            {
+                'name': u'medium',
+                'measurements': [
+                    {'real_time_other': 5,
+                     'cpu_time': 80,
+                     'time': -0.375,
+                     'real_time': 8,
+                     'cpu_time_other': 53,
+                     'cpu': -0.3375
+                    }
+                ],
+                'utest': {},
+                'time_unit': u'ns',
+                'aggregate_name': ''
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
+
+class TestReportSorting(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_result():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput = os.path.join(testInputs, 'test4_run.json')
+            with open(testOutput, 'r') as f:
+                json = json.load(f)
+            return json
+
+        cls.json = load_result()
+
+    def test_json_diff_report_pretty_printing(self):
+        import util
+
+        expected_names = [
+            "99 family 0 instance 0 repetition 0",
+            "98 family 0 instance 0 repetition 1",
+            "97 family 0 instance 0 aggregate",
+            "96 family 0 instance 1 repetition 0",
+            "95 family 0 instance 1 repetition 1",
+            "94 family 0 instance 1 aggregate",
+            "93 family 1 instance 0 repetition 0",
+            "92 family 1 instance 0 repetition 1",
+            "91 family 1 instance 0 aggregate",
+            "90 family 1 instance 1 repetition 0",
+            "89 family 1 instance 1 repetition 1",
+            "88 family 1 instance 1 aggregate"
+        ]
+
+        for n in range(len(self.json['benchmarks']) ** 2):
+            random.shuffle(self.json['benchmarks'])
+            sorted_benchmarks = util.sort_benchmark_results(self.json)[
+                'benchmarks']
+            self.assertEqual(len(expected_names), len(sorted_benchmarks))
+            for out, expected in zip(sorted_benchmarks, expected_names):
+                self.assertEqual(out['name'], expected)
+
+
+def assert_utest(unittest_instance, lhs, rhs):
+    if lhs['utest']:
+        unittest_instance.assertAlmostEqual(
+            lhs['utest']['cpu_pvalue'],
+            rhs['utest']['cpu_pvalue'])
+        unittest_instance.assertAlmostEqual(
+            lhs['utest']['time_pvalue'],
+            rhs['utest']['time_pvalue'])
+        unittest_instance.assertEqual(
+            lhs['utest']['have_optimal_repetitions'],
+            rhs['utest']['have_optimal_repetitions'])
+    else:
+        # lhs is empty. assert if rhs is not.
+        unittest_instance.assertEqual(lhs['utest'], rhs['utest'])
+
+
+def assert_measurements(unittest_instance, lhs, rhs):
+    for m1, m2 in zip(lhs['measurements'], rhs['measurements']):
+        unittest_instance.assertEqual(m1['real_time'], m2['real_time'])
+        unittest_instance.assertEqual(m1['cpu_time'], m2['cpu_time'])
+        # m1['time'] and m1['cpu'] hold values which are being calculated,
+        # and therefore we must use almost-equal pattern.
+        unittest_instance.assertAlmostEqual(m1['time'], m2['time'], places=4)
+        unittest_instance.assertAlmostEqual(m1['cpu'], m2['cpu'], places=4)
+
 
 if __name__ == '__main__':
     unittest.main()

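For reference, each element that get_difference_report appends to the report has a fixed shape; a minimal sketch of one entry, with invented values:

entry = {
    'name': 'BM_Example',         # benchmark name common to both runs
    'time_unit': 'ns',            # taken from the first lhs benchmark entry
    'run_type': 'iteration',      # '' when the input carries no run_type
    'aggregate_name': '',         # set only for aggregate runs
    'measurements': [{            # one dict per paired repetition
        'real_time': 10.0, 'real_time_other': 5.0,
        'cpu_time': 10.0, 'cpu_time_other': 5.0,
        'time': -0.5, 'cpu': -0.5,  # relative changes via calculate_change
    }],
    'utest': {},                  # filled only when utest=True and both
                                  # p-values could be computed
}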
diff --git a/libcxx/utils/google-benchmark/tools/gbench/util.py b/libcxx/utils/google-benchmark/tools/gbench/util.py
index 1f8e8e2c47968..5d0012c0cb1c3 100644
--- a/libcxx/utils/google-benchmark/tools/gbench/util.py
+++ b/libcxx/utils/google-benchmark/tools/gbench/util.py
@@ -5,6 +5,7 @@
 import tempfile
 import subprocess
 import sys
+import functools
 
 # Input file type enumeration
 IT_Invalid = 0
@@ -119,6 +120,23 @@ def load_benchmark_results(fname):
         return json.load(f)
 
 
+def sort_benchmark_results(result):
+    benchmarks = result['benchmarks']
+
+    # From inner key to the outer key!
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: benchmark['repetition_index'] if 'repetition_index' in benchmark else -1)
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: 1 if 'run_type' in benchmark and benchmark['run_type'] == "aggregate" else 0)
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: benchmark['per_family_instance_index'] if 'per_family_instance_index' in benchmark else -1)
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: benchmark['family_index'] if 'family_index' in benchmark else -1)
+
+    result['benchmarks'] = benchmarks
+    return result
+
+
 def run_benchmark(exe_name, benchmark_flags):
     """
     Run a benchmark specified by 'exe_name' with the specified
@@ -158,7 +176,6 @@ def run_or_load_benchmark(filename, benchmark_flags):
     ftype = check_input_file(filename)
     if ftype == IT_JSON:
         return load_benchmark_results(filename)
-    elif ftype == IT_Executable:
+    if ftype == IT_Executable:
         return run_benchmark(filename, benchmark_flags)
-    else:
-        assert False  # This branch is unreachable
+    raise ValueError('Unknown file type %s' % ftype)

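The inner-to-outer key order in sort_benchmark_results relies on sorted() being stable: each later pass reorders only by its own key and preserves ties in the order the previous pass produced. A standalone illustration with invented entries:

from gbench.util import sort_benchmark_results

runs = [
    {'name': 'b/agg', 'family_index': 0,
     'per_family_instance_index': 0, 'run_type': 'aggregate'},
    {'name': 'b/rep1', 'family_index': 0, 'per_family_instance_index': 0,
     'run_type': 'iteration', 'repetition_index': 1},
    {'name': 'a/rep0', 'family_index': 1, 'per_family_instance_index': 0,
     'run_type': 'iteration', 'repetition_index': 0},
    {'name': 'b/rep0', 'family_index': 0, 'per_family_instance_index': 0,
     'run_type': 'iteration', 'repetition_index': 0},
]
ordered = sort_benchmark_results({'benchmarks': runs})['benchmarks']
# Repetitions come first in index order, aggregates after them,
# grouped per family and per family instance:
assert [r['name'] for r in ordered] == ['b/rep0', 'b/rep1', 'b/agg', 'a/rep0']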
diff --git a/libcxx/utils/google-benchmark/tools/requirements.txt b/libcxx/utils/google-benchmark/tools/requirements.txt
new file mode 100644
index 0000000000000..3b3331b5af127
--- /dev/null
+++ b/libcxx/utils/google-benchmark/tools/requirements.txt
@@ -0,0 +1 @@
+scipy>=1.5.0
\ No newline at end of file

More information about the libcxx-commits mailing list