update google benchmark library to 1.9.4 (#9441)

* benchmark: update README and add update script * update google benchmark library to 1.9.4 * update tnt CMakeLists to match the library new version
2025-11-19 11:50:34 -08:00
parent 3127632f96
commit 311104da97
180 changed files with 16394 additions and 4747 deletions
--- a/third_party/benchmark/.clang-format
+++ b/third_party/benchmark/.clang-format
@@ -1,5 +1,5 @@
 ---
 Language:        Cpp
 BasedOnStyle:  Google
+PointerAlignment: Left
 ...
-
--- a/third_party/benchmark/.clang-tidy
+++ b/third_party/benchmark/.clang-tidy
@@ -0,0 +1,37 @@
+---
+Checks: >
+  abseil-*,
+  bugprone-*,
+  clang-analyzer-*,
+  cppcoreguidelines-*,
+  google-*,
+  misc-*,
+  performance-*,
+  readability-*,
+  -clang-analyzer-deadcode*,
+  -clang-analyzer-optin*,
+  -readability-identifier-length
+WarningsAsErrors: ''
+HeaderFilterRegex: ''
+FormatStyle:     none
+CheckOptions:
+  llvm-else-after-return.WarnOnConditionVariables: 'false'
+  modernize-loop-convert.MinConfidence: reasonable
+  modernize-replace-auto-ptr.IncludeStyle: llvm
+  cert-str34-c.DiagnoseSignedUnsignedCharComparisons: 'false'
+  google-readability-namespace-comments.ShortNamespaceLines: '10'
+  cert-err33-c.CheckedFunctions: '::aligned_alloc;::asctime_s;::at_quick_exit;::atexit;::bsearch;::bsearch_s;::btowc;::c16rtomb;::c32rtomb;::calloc;::clock;::cnd_broadcast;::cnd_init;::cnd_signal;::cnd_timedwait;::cnd_wait;::ctime_s;::fclose;::fflush;::fgetc;::fgetpos;::fgets;::fgetwc;::fopen;::fopen_s;::fprintf;::fprintf_s;::fputc;::fputs;::fputwc;::fputws;::fread;::freopen;::freopen_s;::fscanf;::fscanf_s;::fseek;::fsetpos;::ftell;::fwprintf;::fwprintf_s;::fwrite;::fwscanf;::fwscanf_s;::getc;::getchar;::getenv;::getenv_s;::gets_s;::getwc;::getwchar;::gmtime;::gmtime_s;::localtime;::localtime_s;::malloc;::mbrtoc16;::mbrtoc32;::mbsrtowcs;::mbsrtowcs_s;::mbstowcs;::mbstowcs_s;::memchr;::mktime;::mtx_init;::mtx_lock;::mtx_timedlock;::mtx_trylock;::mtx_unlock;::printf_s;::putc;::putwc;::raise;::realloc;::remove;::rename;::scanf;::scanf_s;::setlocale;::setvbuf;::signal;::snprintf;::snprintf_s;::sprintf;::sprintf_s;::sscanf;::sscanf_s;::strchr;::strerror_s;::strftime;::strpbrk;::strrchr;::strstr;::strtod;::strtof;::strtoimax;::strtok;::strtok_s;::strtol;::strtold;::strtoll;::strtoul;::strtoull;::strtoumax;::strxfrm;::swprintf;::swprintf_s;::swscanf;::swscanf_s;::thrd_create;::thrd_detach;::thrd_join;::thrd_sleep;::time;::timespec_get;::tmpfile;::tmpfile_s;::tmpnam;::tmpnam_s;::tss_create;::tss_get;::tss_set;::ungetc;::ungetwc;::vfprintf;::vfprintf_s;::vfscanf;::vfscanf_s;::vfwprintf;::vfwprintf_s;::vfwscanf;::vfwscanf_s;::vprintf_s;::vscanf;::vscanf_s;::vsnprintf;::vsnprintf_s;::vsprintf;::vsprintf_s;::vsscanf;::vsscanf_s;::vswprintf;::vswprintf_s;::vswscanf;::vswscanf_s;::vwprintf_s;::vwscanf;::vwscanf_s;::wcrtomb;::wcschr;::wcsftime;::wcspbrk;::wcsrchr;::wcsrtombs;::wcsrtombs_s;::wcsstr;::wcstod;::wcstof;::wcstoimax;::wcstok;::wcstok_s;::wcstol;::wcstold;::wcstoll;::wcstombs;::wcstombs_s;::wcstoul;::wcstoull;::wcstoumax;::wcsxfrm;::wctob;::wctrans;::wctype;::wmemchr;::wprintf_s;::wscanf;::wscanf_s;'
+  cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField: 'false'
+  cert-dcl16-c.NewSuffixes: 'L;LL;LU;LLU'
+  google-readability-braces-around-statements.ShortStatementLines: '1'
+  cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic: 'true'
+  google-readability-namespace-comments.SpacesBeforeComments: '2'
+  modernize-loop-convert.MaxCopySize: '16'
+  modernize-pass-by-value.IncludeStyle: llvm
+  modernize-use-nullptr.NullMacros: 'NULL'
+  llvm-qualified-auto.AddConstToQualified: 'false'
+  modernize-loop-convert.NamingStyle: CamelCase
+  llvm-else-after-return.WarnOnUnfixable: 'false'
+  google-readability-function-size.StatementThreshold: '800'
+...
+
--- a/third_party/benchmark/.clang-tidy.ignore
+++ b/third_party/benchmark/.clang-tidy.ignore
@@ -0,0 +1 @@
+.*third_party/.*
--- a/third_party/benchmark/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/third_party/benchmark/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,32 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: "[BUG]"
+labels: ''
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**System**
+Which OS, compiler, and compiler version are you using:
+  - OS: 
+  - Compiler and version: 
+
+**To reproduce**
+Steps to reproduce the behavior:
+1. sync to commit ...
+2. cmake/bazel...
+3. make ...
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Additional context**
+Add any other context about the problem here.
--- a/third_party/benchmark/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/third_party/benchmark/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: "[FR]"
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
--- a/third_party/benchmark/.github/install_bazel.sh
+++ b/third_party/benchmark/.github/install_bazel.sh
@@ -0,0 +1,12 @@
+if ! bazel version; then
+  arch=$(uname -m)
+  if [ "$arch" == "aarch64" ]; then
+    arch="arm64"
+  fi
+  echo "Downloading $arch Bazel binary from GitHub releases."
+  curl -L -o $HOME/bin/bazel --create-dirs "https://github.com/bazelbuild/bazel/releases/download/8.2.0/bazel-8.2.0-linux-$arch"
+  chmod +x $HOME/bin/bazel
+else
+  # Bazel is installed for the correct architecture
+  exit 0
+fi
--- a/third_party/benchmark/.github/libcxx-setup.sh
+++ b/third_party/benchmark/.github/libcxx-setup.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+set -e
+
+# Checkout LLVM sources
+git clone --filter=blob:none --depth=1 --branch llvmorg-19.1.6 --no-checkout https://github.com/llvm/llvm-project.git llvm-project
+cd llvm-project
+git sparse-checkout set --cone
+git checkout llvmorg-19.1.6
+git sparse-checkout set cmake llvm/cmake runtimes libcxx libcxxabi
+cd ..
+
+## Setup libc++ options
+if [ -z "$BUILD_32_BITS" ]; then
+  export BUILD_32_BITS=OFF && echo disabling 32 bit build
+fi
+
+## Build and install libc++ (Use unstable ABI for better sanitizer coverage)
+mkdir llvm-build && cd llvm-build
+cmake -GNinja                                   \
+      -DCMAKE_C_COMPILER=${CC}                  \
+      -DCMAKE_CXX_COMPILER=${CXX}               \
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo         \
+      -DCMAKE_INSTALL_PREFIX=/usr               \
+      -DLIBCXX_ABI_UNSTABLE=OFF                 \
+      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER}  \
+      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS}     \
+      -DLIBCXXABI_USE_LLVM_UNWINDER=OFF         \
+      -DLLVM_INCLUDE_TESTS=OFF                  \
+      -DLIBCXX_INCLUDE_TESTS=OFF                \
+      -DLIBCXX_INCLUDE_BENCHMARKS=OFF           \
+      -DLLVM_ENABLE_RUNTIMES='libcxx;libcxxabi' \
+      ../llvm-project/runtimes/
+cmake --build . -- cxx cxxabi
+cd ..
--- a/third_party/benchmark/.github/workflows/bazel.yml
+++ b/third_party/benchmark/.github/workflows/bazel.yml
@@ -0,0 +1,37 @@
+name: bazel
+
+on:
+  push: {}
+  pull_request: {}
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  build_and_test_default:
+    name: bazel.${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: mount bazel cache
+      uses: actions/cache@v4
+      env:
+        cache-name: bazel-cache
+      with:
+        path: "~/.cache/bazel"
+        key: ${{ env.cache-name }}-${{ matrix.os }}-${{ github.ref }}
+        restore-keys: |
+          ${{ env.cache-name }}-${{ matrix.os }}-main
+
+    - name: build
+      run: |
+        bazel build //:benchmark //:benchmark_main //test/...
+
+    - name: test
+      run: |
+        bazel test --test_output=all //test/...
--- a/third_party/benchmark/.github/workflows/build-and-test-min-cmake.yml
+++ b/third_party/benchmark/.github/workflows/build-and-test-min-cmake.yml
@@ -0,0 +1,49 @@
+name: build-and-test-min-cmake
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  job:
+    name: ${{ matrix.os }}.min-cmake
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: lukka/get-cmake@latest
+        with:
+          cmakeVersion: 3.13.0
+
+      - name: create build environment
+        run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+      - name: setup cmake initial cache
+        run: touch compiler-cache.cmake
+
+      - name: configure cmake
+        env:
+          CXX: ${{ matrix.compiler }}
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: >
+          cmake -C ${{ github.workspace }}/compiler-cache.cmake
+          $GITHUB_WORKSPACE
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DCMAKE_CXX_VISIBILITY_PRESET=hidden
+          -DCMAKE_VISIBILITY_INLINES_HIDDEN=ON
+
+      - name: build
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: cmake --build .
--- a/third_party/benchmark/.github/workflows/build-and-test-perfcounters.yml
+++ b/third_party/benchmark/.github/workflows/build-and-test-perfcounters.yml
@@ -0,0 +1,54 @@
+name: build-and-test-perfcounters
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  job:
+    # TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
+    name: ${{ matrix.os }}.${{ matrix.build_type }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        build_type: ['Release', 'Debug']
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: install libpfm
+      run: |
+        sudo apt update
+        sudo apt -y install libpfm4-dev
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: >
+        cmake $GITHUB_WORKSPACE
+        -DBENCHMARK_ENABLE_LIBPFM=1
+        -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+    - name: build
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake --build . --config ${{ matrix.build_type }}
+
+    # Skip testing, for now. It seems perf_event_open does not succeed on the
+    # hosting machine, very likely a permissions issue.
+    # TODO(mtrofin): Enable test.
+    # - name: test
+    #   shell: bash
+    #   working-directory: ${{ runner.workspace }}/_build
+    #   run: ctest -C ${{ matrix.build_type }} --rerun-failed --output-on-failure
+
--- a/third_party/benchmark/.github/workflows/build-and-test.yml
+++ b/third_party/benchmark/.github/workflows/build-and-test.yml
@@ -0,0 +1,151 @@
+name: build-and-test
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  # TODO: add 32-bit builds (g++ and clang++) for ubuntu
+  #   (requires g++-multilib and libc6:i386)
+  # TODO: add coverage build (requires lcov)
+  # TODO: add clang + libc++ builds for ubuntu
+  job:
+    name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.compiler }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-24.04, ubuntu-22.04, ubuntu-24.04-arm, macos-latest]
+        build_type: ['Release', 'Debug']
+        compiler: ['g++', 'clang++']
+        lib: ['shared', 'static']
+
+    steps:
+      - name: Install dependencies (macos)
+        if: runner.os == 'macOS'
+        run: brew install ninja
+
+      - uses: actions/checkout@v4
+
+      - name: build
+        uses: threeal/cmake-action@v2.1.0
+        with:
+          build-dir: ${{ runner.workspace }}/_build
+          cxx-compiler: ${{ matrix.compiler }}
+          options: |
+            BENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+            BUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+            CMAKE_BUILD_TYPE=${{ matrix.build_type }}
+            CMAKE_CXX_COMPILER=${{ matrix.compiler }}
+            CMAKE_CXX_VISIBILITY_PRESET=hidden
+            CMAKE_VISIBILITY_INLINES_HIDDEN=ON
+
+      - name: test
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: ctest -C ${{ matrix.build_type }} -VV
+
+  msvc:
+    name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.msvc }}
+    runs-on: ${{ matrix.os }}
+    defaults:
+        run:
+            shell: powershell
+    strategy:
+      fail-fast: false
+      matrix:
+        msvc:
+          - VS-16-2019
+          - VS-17-2022
+        build_type:
+          - Debug
+          - Release
+        lib:
+          - shared
+          - static
+        include:
+          - msvc: VS-16-2019
+            os: windows-2019
+            generator: 'Visual Studio 16 2019'
+          - msvc: VS-17-2022
+            os: windows-2022
+            generator: 'Visual Studio 17 2022'
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: lukka/get-cmake@latest
+
+      - name: configure cmake
+        run: >
+          cmake -S . -B ${{ runner.workspace }}/_build/
+          -G "${{ matrix.generator }}"
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+
+      - name: build
+        run: cmake --build ${{ runner.workspace }}/_build/ --config ${{ matrix.build_type }}
+
+      - name: test
+        run: ctest --test-dir ${{ runner.workspace }}/_build/ -C ${{ matrix.build_type }} -VV
+
+  msys2:
+    name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.msys2.msystem }}
+    runs-on: ${{ matrix.os }}
+    defaults:
+        run:
+            shell: msys2 {0}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ windows-latest ]
+        msys2:
+          - { msystem: MINGW64,    arch: x86_64,  family: GNU,  compiler: g++ }
+          - { msystem: CLANG64,    arch: x86_64,  family: LLVM, compiler: clang++ }
+          - { msystem: UCRT64,     arch: x86_64,  family: GNU,  compiler: g++ }
+        build_type:
+          - Debug
+          - Release
+        lib:
+          - shared
+          - static
+
+    steps:
+      - name: setup msys2
+        uses: msys2/setup-msys2@v2
+        with:
+          cache: false
+          msystem: ${{ matrix.msys2.msystem }}
+          update: true
+          install: >-
+            git
+            base-devel
+          pacboy: >-
+            gcc:p
+            clang:p
+            cmake:p
+            ninja:p
+
+      - uses: actions/checkout@v4
+
+      # NOTE: we can't use cmake actions here as we need to do everything in msys2 shell.
+      - name: configure cmake
+        env:
+          CXX: ${{ matrix.msys2.compiler }}
+        run: >
+          cmake -S . -B _build/
+          -GNinja
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+
+      - name: build
+        run: cmake --build _build/ --config ${{ matrix.build_type }}
+
+      - name: test
+        working-directory: _build
+        run: ctest -C ${{ matrix.build_type }} -VV
--- a/third_party/benchmark/.github/workflows/clang-format-lint.yml
+++ b/third_party/benchmark/.github/workflows/clang-format-lint.yml
@@ -0,0 +1,19 @@
+name: clang-format-lint
+on:
+  push: {}
+  pull_request: {}
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  job:
+    name: check-clang-format
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - uses: DoozyX/clang-format-lint-action@v0.18.2
+      with:
+        source: './include/benchmark ./src ./test'
+        clangFormatVersion: 18
--- a/third_party/benchmark/.github/workflows/clang-tidy-lint.yml
+++ b/third_party/benchmark/.github/workflows/clang-tidy-lint.yml
@@ -0,0 +1,41 @@
+name: clang-tidy
+
+on:
+  push: {}
+  pull_request: {}
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  job:
+    name: run-clang-tidy
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: install clang-tidy
+      run: sudo apt update && sudo apt -y install clang-tidy
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ github.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ github.workspace }}/_build
+      run: >
+        cmake $GITHUB_WORKSPACE
+        -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
+        -DBENCHMARK_ENABLE_LIBPFM=OFF
+        -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+        -DCMAKE_C_COMPILER=clang
+        -DCMAKE_CXX_COMPILER=clang++
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+        -DGTEST_COMPILE_COMMANDS=OFF
+
+    - name: run
+      shell: bash
+      working-directory: ${{ github.workspace }}/_build
+      run: run-clang-tidy -config-file=$GITHUB_WORKSPACE/.clang-tidy
--- a/third_party/benchmark/.github/workflows/doxygen.yml
+++ b/third_party/benchmark/.github/workflows/doxygen.yml
@@ -0,0 +1,31 @@
+name: doxygen
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  build-and-deploy:
+    name: Build HTML documentation
+    runs-on: ubuntu-latest
+    steps:
+    - name: Fetching sources
+      uses: actions/checkout@v4
+
+    - name: Installing build dependencies
+      run: |
+        sudo apt update
+        sudo apt install doxygen gcc git
+
+    - name: Creating build directory
+      run: mkdir build
+
+    - name: Building HTML documentation with Doxygen
+      run: |
+        cmake -S . -B build -DBENCHMARK_ENABLE_TESTING:BOOL=OFF -DBENCHMARK_ENABLE_DOXYGEN:BOOL=ON -DBENCHMARK_INSTALL_DOCS:BOOL=ON
+        cmake --build build --target benchmark_doxygen
--- a/third_party/benchmark/.github/workflows/pre-commit.yml
+++ b/third_party/benchmark/.github/workflows/pre-commit.yml
@@ -0,0 +1,41 @@
+name: python + Bazel pre-commit checks
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    env:
+      MYPY_CACHE_DIR: "${{ github.workspace }}/.cache/mypy"
+      RUFF_CACHE_DIR: "${{ github.workspace }}/.cache/ruff"
+      PRE_COMMIT_HOME: "${{ github.workspace }}/.cache/pre-commit"
+
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: 3.11
+        cache: pip
+        cache-dependency-path: pyproject.toml
+    - name: Install dependencies
+      run: python -m pip install ".[dev]"
+    - name: Cache pre-commit tools
+      uses: actions/cache@v4
+      with:
+        path: |
+          ${{ env.MYPY_CACHE_DIR }}
+          ${{ env.RUFF_CACHE_DIR }}
+          ${{ env.PRE_COMMIT_HOME }}
+        key: ${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }}-linter-cache
+    - name: Run pre-commit checks
+      run: pre-commit run --all-files --verbose --show-diff-on-failure
--- a/third_party/benchmark/.github/workflows/sanitizer.yml
+++ b/third_party/benchmark/.github/workflows/sanitizer.yml
@@ -0,0 +1,97 @@
+name: sanitizer
+
+on:
+  push: {}
+  pull_request: {}
+
+env:
+  CMAKE_GENERATOR: Ninja
+  UBSAN_OPTIONS: "print_stacktrace=1"
+
+jobs:
+  job:
+    name: ${{ matrix.sanitizer }}.${{ matrix.build_type }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: ['Debug', 'RelWithDebInfo']
+        sanitizer: ['asan', 'ubsan', 'tsan', 'msan']
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: configure msan env
+      if: matrix.sanitizer == 'msan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=MemoryWithOrigins" >> $GITHUB_ENV
+
+    - name: configure ubsan env
+      if: matrix.sanitizer == 'ubsan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Undefined" >> $GITHUB_ENV
+
+    - name: configure asan env
+      if: matrix.sanitizer == 'asan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=address -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Address" >> $GITHUB_ENV
+
+    - name: configure tsan env
+      if: matrix.sanitizer == 'tsan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Thread" >> $GITHUB_ENV
+
+    - name: fine-tune asan options
+      # in asan we get an error from std::regex. ignore it.
+      if: matrix.sanitizer == 'asan'
+      run: |
+        echo "ASAN_OPTIONS=alloc_dealloc_mismatch=0" >> $GITHUB_ENV
+
+    - name: setup clang
+      uses: egor-tensin/setup-clang@v1
+      with:
+        version: latest
+        platform: x64
+
+    - name: configure clang
+      run: |
+        echo "CC=cc" >> $GITHUB_ENV
+        echo "CXX=c++" >> $GITHUB_ENV
+
+    - name: build libc++ (non-asan)
+      if: matrix.sanitizer != 'asan'
+      run: |
+        "${GITHUB_WORKSPACE}/.github/libcxx-setup.sh"
+        echo "EXTRA_CXX_FLAGS=-stdlib=libc++ -L${GITHUB_WORKSPACE}/llvm-build/lib -lc++abi -I${GITHUB_WORKSPACE}/llvm-build/include/c++/v1 -Isystem${GITHUB_WORKSPACE}/llvm-build/include/c++/v1 -Wl,-rpath,${GITHUB_WORKSPACE}/llvm-build/lib" >> $GITHUB_ENV
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: >
+        VERBOSE=1
+        cmake -GNinja $GITHUB_WORKSPACE
+        -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
+        -DBENCHMARK_ENABLE_LIBPFM=OFF
+        -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+        -DCMAKE_C_COMPILER=${{ env.CC }}
+        -DCMAKE_CXX_COMPILER=${{ env.CXX }}
+        -DCMAKE_C_FLAGS="${{ env.EXTRA_FLAGS }}"
+        -DCMAKE_CXX_FLAGS="${{ env.EXTRA_FLAGS }} ${{ env.EXTRA_CXX_FLAGS }}"
+        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+    - name: build
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake --build . --config ${{ matrix.build_type }}
+
+    - name: test
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: ctest -C ${{ matrix.build_type }} -VV
--- a/third_party/benchmark/.github/workflows/test_bindings.yml
+++ b/third_party/benchmark/.github/workflows/test_bindings.yml
@@ -0,0 +1,33 @@
+name: test-bindings
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  python_bindings:
+    name: Test GBM Python ${{ matrix.python-version }} bindings on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ ubuntu-latest, macos-latest, windows-latest ]
+        python-version: [ "3.10", "3.11", "3.12", "3.13" ]
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install GBM Python bindings on ${{ matrix.os }}
+        run: python -m pip install .
+      - name: Run example on ${{ matrix.os }} under Python ${{ matrix.python-version }}
+        run: python bindings/python/google_benchmark/example.py
--- a/third_party/benchmark/.github/workflows/wheels.yml
+++ b/third_party/benchmark/.github/workflows/wheels.yml
@@ -0,0 +1,83 @@
+name: Build and upload Python wheels
+
+on:
+  workflow_dispatch:
+  release:
+    types:
+      - published
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  build_sdist:
+    name: Build source distribution
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Install Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - run: python -m pip install build
+      - name: Build sdist
+        run: python -m build --sdist
+      - uses: actions/upload-artifact@v4
+        with:
+          name: dist-sdist
+          path: dist/*.tar.gz
+
+  build_wheels:
+    name: Build Google Benchmark wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, ubuntu-24.04-arm, macos-13, macos-14, windows-latest]
+    steps:
+      - name: Check out Google Benchmark
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - uses: actions/setup-python@v5
+        name: Install Python 3.12
+        with:
+          python-version: "3.12"
+      - run: pip install --upgrade pip uv
+
+      - name: Build wheels on ${{ matrix.os }} using cibuildwheel
+        uses: pypa/cibuildwheel@v2.23.2
+        env:
+          CIBW_BUILD: "cp310-* cp311-* cp312-*"
+          CIBW_BUILD_FRONTEND: "build[uv]"
+          CIBW_SKIP: "*-musllinux_*"
+          CIBW_ARCHS: auto64
+          CIBW_BEFORE_ALL_LINUX: bash .github/install_bazel.sh
+          # Grab the rootless Bazel installation inside the container.
+          CIBW_ENVIRONMENT_LINUX: PATH=$PATH:$HOME/bin
+          CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
+          # unused by Bazel, but needed explicitly by delocate on MacOS.
+          MACOSX_DEPLOYMENT_TARGET: "10.14"
+
+      - name: Upload Google Benchmark ${{ matrix.os }} wheels
+        uses: actions/upload-artifact@v4
+        with:
+          name: dist-${{ matrix.os }}
+          path: wheelhouse/*.whl
+
+  pypi_upload:
+    name: Publish google-benchmark wheels to PyPI
+    needs: [build_sdist, build_wheels]
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          path: dist
+          pattern: dist-*
+          merge-multiple: true
+      - uses: pypa/gh-action-pypi-publish@release/v1
--- a/third_party/benchmark/.gitignore
+++ b/third_party/benchmark/.gitignore
@@ -8,8 +8,10 @@
 !/cmake/*.cmake
 !/test/AssemblyTests.cmake
 *~
+*.swp
 *.pyc
 __pycache__
+.DS_Store

 # lcov
 *.lcov
@@ -44,6 +46,7 @@ rules.ninja

 # bazel output symlinks.
 bazel-*
+MODULE.bazel.lock

 # out-of-source build top-level folders.
 build/
@@ -56,3 +59,10 @@ build*/
 # Visual Studio 2015/2017 cache/options directory
 .vs/
 CMakeSettings.json
+
+# Visual Studio Code cache/options directory
+.vscode/
+
+# Python build stuff
+dist/
+*.egg-info*
--- a/third_party/benchmark/.pre-commit-config.yaml
+++ b/third_party/benchmark/.pre-commit-config.yaml
@@ -0,0 +1,18 @@
+repos:
+  -   repo: https://github.com/keith/pre-commit-buildifier
+      rev: 8.0.3
+      hooks:
+      -   id: buildifier
+      -   id: buildifier-lint
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.15.0
+    hooks:
+      - id: mypy
+        types_or: [ python, pyi ]
+        args: [ "--ignore-missing-imports", "--scripts-are-modules" ]
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.8
+    hooks:
+      - id: ruff
+        args: [ --fix, --exit-non-zero-on-fix ]
+      - id: ruff-format
--- a/third_party/benchmark/.travis-libcxx-setup.sh
+++ b/third_party/benchmark/.travis-libcxx-setup.sh
@@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-
-# Install a newer CMake version
-curl -sSL https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh -o install-cmake.sh
-chmod +x install-cmake.sh
-sudo ./install-cmake.sh --prefix=/usr/local --skip-license
-
-# Checkout LLVM sources
-git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source
-git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx
-git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi
-
-# Setup libc++ options
-if [ -z "$BUILD_32_BITS" ]; then
-  export BUILD_32_BITS=OFF && echo disabling 32 bit build
-fi
-
-# Build and install libc++ (Use unstable ABI for better sanitizer coverage)
-mkdir llvm-build && cd llvm-build
-cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \
-      -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \
-      -DLIBCXX_ABI_UNSTABLE=ON \
-      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
-      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
-      ../llvm-source
-make cxx -j2
-sudo make install-cxxabi install-cxx
-cd ../
--- a/third_party/benchmark/.travis.yml
+++ b/third_party/benchmark/.travis.yml
@@ -1,199 +0,0 @@
-sudo: required
-dist: trusty
-language: cpp
-
-env:
-  global:
-    - /usr/local/bin:$PATH
-
-matrix:
-  include:
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - lcov
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage
-    - compiler: gcc
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug
-    - compiler: gcc
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - g++-multilib
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug BUILD_32_BITS=ON
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - g++-multilib
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release BUILD_32_BITS=ON
-    - compiler: gcc
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=g++-6 C_COMPILER=gcc-6  BUILD_TYPE=Debug
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
-    - compiler: clang
-      env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug
-    - compiler: clang
-      env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release
-    # Clang w/ libc++
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1
-        - EXTRA_FLAGS="-stdlib=libc++"
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
-        - LIBCXX_BUILD=1
-        - EXTRA_FLAGS="-stdlib=libc++"
-    # Clang w/ 32bit libc++
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            - clang-3.8
-            - g++-multilib
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-stdlib=libc++ -m32"
-    # Clang w/ 32bit libc++
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            - clang-3.8
-            - g++-multilib
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
-        - LIBCXX_BUILD=1
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-stdlib=libc++ -m32"
-    # Clang w/ libc++, ASAN, UBSAN
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address"
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all"
-        - UBSAN_OPTIONS=print_stacktrace=1
-    # Clang w/ libc++ and MSAN
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins"
-    # Clang w/ libc++ and MSAN
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
-        - COMPILER=clang++ BUILD_TYPE=Debug
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
-        - COMPILER=clang++ BUILD_TYPE=Release
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
-        - COMPILER=clang++ BUILD_TYPE=Release BUILD_32_BITS=ON
-    - os: osx
-      osx_image: xcode8.3
-      compiler: gcc
-      env:
-        - COMPILER=g++-7 C_COMPILER=gcc-7  BUILD_TYPE=Debug
-
-before_script:
-  - if [ -n "${LIBCXX_BUILD}" ]; then
-      source .travis-libcxx-setup.sh;
-    fi
-  - if [ -n "${ENABLE_SANITIZER}" ]; then
-      export EXTRA_OPTIONS="-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF";
-    else
-      export EXTRA_OPTIONS="";
-    fi
-  - mkdir -p build && cd build
-
-before_install:
-  - if [ -z "$BUILD_32_BITS" ]; then
-      export BUILD_32_BITS=OFF && echo disabling 32 bit build;
-    fi
-  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
-      sudo add-apt-repository -y "ppa:ubuntu-toolchain-r/test";
-      sudo apt-get update --option Acquire::Retries=100 --option Acquire::http::Timeout="60";
-    fi
-
-install:
-  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
-      travis_wait sudo -E apt-get -yq --no-install-suggests --no-install-recommends install g++-6;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "linux" -a "${BUILD_32_BITS}" == "OFF" ]; then
-      travis_wait sudo -E apt-get -y --no-install-suggests --no-install-recommends install llvm-3.9-tools;
-      sudo cp /usr/lib/llvm-3.9/bin/FileCheck /usr/local/bin/;
-    fi
-  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
-      PATH=~/.local/bin:${PATH};
-      pip install --user --upgrade pip;
-      travis_wait pip install --user cpp-coveralls;
-    fi
-  - if [ "${C_COMPILER}" == "gcc-7" -a "${TRAVIS_OS_NAME}" == "osx" ]; then
-      rm -f /usr/local/include/c++;
-      brew update;
-      travis_wait brew install gcc@7;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "linux" ]; then
-      sudo apt-get update -qq;
-      sudo apt-get install -qq unzip;
-      wget https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-linux-x86_64.sh --output-document bazel-installer.sh;
-      travis_wait sudo bash bazel-installer.sh;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
-      curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-darwin-x86_64.sh;
-      travis_wait sudo bash bazel-installer.sh;
-    fi
-
-script:
-  - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS}" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ${EXTRA_OPTIONS} ..
-  - make
-  - ctest -C ${BUILD_TYPE} --output-on-failure
-  - bazel test -c dbg --define google_benchmark.have_regex=posix --announce_rc --verbose_failures --test_output=errors --keep_going //test/...
-
-after_success:
-  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
-      coveralls --include src --include include --gcov-options '\-lp' --root .. --build-root .;
-    fi
--- a/third_party/benchmark/.ycm_extra_conf.py
+++ b/third_party/benchmark/.ycm_extra_conf.py
@@ -1,25 +1,30 @@
 import os
+
 import ycm_core

 # These are the compilation flags that will be used in case there's no
 # compilation database set (by default, one is not set).
 # CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR.
 flags = [
-'-Wall',
-'-Werror',
-'-pedantic-errors',
-'-std=c++0x',
-'-fno-strict-aliasing',
-'-O3',
-'-DNDEBUG',
-# ...and the same thing goes for the magic -x option which specifies the
-# language that the files to be compiled are written in. This is mostly
-# relevant for c++ headers.
-# For a C project, you would set this to 'c' instead of 'c++'.
-'-x', 'c++',
-'-I', 'include',
-'-isystem', '/usr/include',
-'-isystem', '/usr/local/include',
+    "-Wall",
+    "-Werror",
+    "-pedantic-errors",
+    "-std=c++0x",
+    "-fno-strict-aliasing",
+    "-O3",
+    "-DNDEBUG",
+    # ...and the same thing goes for the magic -x option which specifies the
+    # language that the files to be compiled are written in. This is mostly
+    # relevant for c++ headers.
+    # For a C project, you would set this to 'c' instead of 'c++'.
+    "-x",
+    "c++",
+    "-I",
+    "include",
+    "-isystem",
+    "/usr/include",
+    "-isystem",
+    "/usr/local/include",
 ]


@@ -29,87 +34,87 @@ flags = [
 #
 # Most projects will NOT need to set this to anything; you can just change the
 # 'flags' list of compilation flags. Notice that YCM itself uses that approach.
-compilation_database_folder = ''
+compilation_database_folder = ""

-if os.path.exists( compilation_database_folder ):
-  database = ycm_core.CompilationDatabase( compilation_database_folder )
+if os.path.exists(compilation_database_folder):
+    database = ycm_core.CompilationDatabase(compilation_database_folder)
 else:
-  database = None
+    database = None
+
+SOURCE_EXTENSIONS = [".cc"]

-SOURCE_EXTENSIONS = [ '.cc' ]

 def DirectoryOfThisScript():
-  return os.path.dirname( os.path.abspath( __file__ ) )
+    return os.path.dirname(os.path.abspath(__file__))


-def MakeRelativePathsInFlagsAbsolute( flags, working_directory ):
-  if not working_directory:
-    return list( flags )
-  new_flags = []
-  make_next_absolute = False
-  path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ]
-  for flag in flags:
-    new_flag = flag
+def MakeRelativePathsInFlagsAbsolute(flags, working_directory):
+    if not working_directory:
+        return list(flags)
+    new_flags = []
+    make_next_absolute = False
+    path_flags = ["-isystem", "-I", "-iquote", "--sysroot="]
+    for flag in flags:
+        new_flag = flag

-    if make_next_absolute:
-      make_next_absolute = False
-      if not flag.startswith( '/' ):
-        new_flag = os.path.join( working_directory, flag )
+        if make_next_absolute:
+            make_next_absolute = False
+            if not flag.startswith("/"):
+                new_flag = os.path.join(working_directory, flag)

-    for path_flag in path_flags:
-      if flag == path_flag:
-        make_next_absolute = True
-        break
+        for path_flag in path_flags:
+            if flag == path_flag:
+                make_next_absolute = True
+                break

-      if flag.startswith( path_flag ):
-        path = flag[ len( path_flag ): ]
-        new_flag = path_flag + os.path.join( working_directory, path )
-        break
+            if flag.startswith(path_flag):
+                path = flag[len(path_flag) :]
+                new_flag = path_flag + os.path.join(working_directory, path)
+                break

-    if new_flag:
-      new_flags.append( new_flag )
-  return new_flags
+        if new_flag:
+            new_flags.append(new_flag)
+    return new_flags


-def IsHeaderFile( filename ):
-  extension = os.path.splitext( filename )[ 1 ]
-  return extension in [ '.h', '.hxx', '.hpp', '.hh' ]
+def IsHeaderFile(filename):
+    extension = os.path.splitext(filename)[1]
+    return extension in [".h", ".hxx", ".hpp", ".hh"]


-def GetCompilationInfoForFile( filename ):
-  # The compilation_commands.json file generated by CMake does not have entries
-  # for header files. So we do our best by asking the db for flags for a
-  # corresponding source file, if any. If one exists, the flags for that file
-  # should be good enough.
-  if IsHeaderFile( filename ):
-    basename = os.path.splitext( filename )[ 0 ]
-    for extension in SOURCE_EXTENSIONS:
-      replacement_file = basename + extension
-      if os.path.exists( replacement_file ):
-        compilation_info = database.GetCompilationInfoForFile(
-          replacement_file )
-        if compilation_info.compiler_flags_:
-          return compilation_info
-    return None
-  return database.GetCompilationInfoForFile( filename )
+def GetCompilationInfoForFile(filename):
+    # The compilation_commands.json file generated by CMake does not have
+    # entries for header files. So we do our best by asking the db for flags for
+    # a corresponding source file, if any. If one exists, the flags for that
+    # file should be good enough.
+    if IsHeaderFile(filename):
+        basename = os.path.splitext(filename)[0]
+        for extension in SOURCE_EXTENSIONS:
+            replacement_file = basename + extension
+            if os.path.exists(replacement_file):
+                compilation_info = database.GetCompilationInfoForFile(
+                    replacement_file
+                )
+                if compilation_info.compiler_flags_:
+                    return compilation_info
+        return None
+    return database.GetCompilationInfoForFile(filename)


-def FlagsForFile( filename, **kwargs ):
-  if database:
-    # Bear in mind that compilation_info.compiler_flags_ does NOT return a
-    # python list, but a "list-like" StringVec object
-    compilation_info = GetCompilationInfoForFile( filename )
-    if not compilation_info:
-      return None
+def FlagsForFile(filename, **kwargs):
+    if database:
+        # Bear in mind that compilation_info.compiler_flags_ does NOT return a
+        # python list, but a "list-like" StringVec object
+        compilation_info = GetCompilationInfoForFile(filename)
+        if not compilation_info:
+            return None

-    final_flags = MakeRelativePathsInFlagsAbsolute(
-      compilation_info.compiler_flags_,
-      compilation_info.compiler_working_dir_ )
-  else:
-    relative_to = DirectoryOfThisScript()
-    final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to )
+        final_flags = MakeRelativePathsInFlagsAbsolute(
+            compilation_info.compiler_flags_,
+            compilation_info.compiler_working_dir_,
+        )
+    else:
+        relative_to = DirectoryOfThisScript()
+        final_flags = MakeRelativePathsInFlagsAbsolute(flags, relative_to)

-  return {
-    'flags': final_flags,
-    'do_cache': True
-  }
+    return {"flags": final_flags, "do_cache": True}
--- a/third_party/benchmark/AUTHORS
+++ b/third_party/benchmark/AUTHORS
@@ -9,40 +9,64 @@
 # Please keep the list sorted.

 Albert Pretorius <pretoalb@gmail.com>
+Alex Steele <steeleal123@gmail.com>
+Andriy Berestovskyy <berestovskyy@gmail.com>
 Arne Beer <arne@twobeer.de>
 Carto
+Cezary Skrzyński <czars1988@gmail.com>
+Christian Wassermann <christian_wassermann@web.de>
 Christopher Seymour <chris.j.seymour@hotmail.com>
+Colin Braley <braley.colin@gmail.com>
+Daniel Harvey <danielharvey458@gmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
 Deniz Evrenci <denizevrenci@gmail.com>
 Dirac Research 
 Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Dominik Korman <kormandominik@gmail.com>
+Donald Aingworth <donalds_junk_mail@yahoo.com>
+Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
+Fabien Pichot <pichot.fabien@gmail.com>
 Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Gergely Meszaros <maetveis@gmail.com>
+Gergő Szitár <szitar.gergo@gmail.com>
 Google Inc.
+Henrique Bucher <hbucher@gmail.com>
 International Business Machines Corporation
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
 Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
+Jordan Williams <jwillikers@protonmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kaito Udagawa <umireon@gmail.com>
 Kishan Kumar <kumar.kishan@outlook.com>
 Lei Xu <eddyxu@gmail.com>
+Marcel Jacobse <mjacobse@uni-bremen.de>
 Matt Clarkson <mattyclarkson@gmail.com>
 Maxim Vafin <maxvafin@gmail.com>
+Mike Apodaca <gatorfax@gmail.com>
+Min-Yih Hsu <yihshyng223@gmail.com>
 MongoDB Inc.
 Nick Hutchinson <nshutchinson@gmail.com>
+Norman Heino <norman.heino@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Ori Livneh <ori.livneh@gmail.com>
 Paul Redmond <paul.redmond@gmail.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
+Raghu Raja <raghu@enfabrica.net>
+Rainer Orth <ro@cebitec.uni-bielefeld.de>
 Roman Lebedev <lebedev.ri@gmail.com>
+Sayan Bhattacharjee <aero.sayan@gmail.com>
+Shapr3D <google-contributors@shapr3d.com>
 Shuo Chen <chenshuo@chenshuo.com>
+Staffan Tjernstrom <staffantj@gmail.com>
 Steinar H. Gunderson <sgunderson@bigfoot.com>
 Stripe, Inc.
+Tobias Schmidt <tobias.schmidt@in.tum.de>
 Yixuan Qiu <yixuanq@gmail.com>
 Yusuke Suzuki <utatane.tea@gmail.com>
 Zbigniew Skowron <zbychs@gmail.com>
--- a/third_party/benchmark/BUILD.bazel
+++ b/third_party/benchmark/BUILD.bazel
@@ -1,9 +1,38 @@
+load("@rules_cc//cc:defs.bzl", "cc_library")
+
 licenses(["notice"])

+COPTS = [
+    "-pedantic",
+    "-pedantic-errors",
+    "-std=c++17",
+    "-Wall",
+    "-Wconversion",
+    "-Wextra",
+    "-Wshadow",
+    #    "-Wshorten-64-to-32",
+    "-Wfloat-equal",
+    "-fstrict-aliasing",
+    ## assert() are used a lot in tests upstream, which may be optimised out leading to
+    ## unused-variable warning.
+    "-Wno-unused-variable",
+    "-Werror=old-style-cast",
+]
+
+MSVC_COPTS = [
+    "/std:c++17",
+]
+
 config_setting(
    name = "windows",
-    values = {
-        "cpu": "x64_windows",
+    constraint_values = ["@platforms//os:windows"],
+    visibility = [":__subpackages__"],
+)
+
+config_setting(
+    name = "perfcounters",
+    define_values = {
+        "pfm": "1",
    },
    visibility = [":__subpackages__"],
 )
@@ -17,20 +46,51 @@ cc_library(
        ],
        exclude = ["src/benchmark_main.cc"],
    ),
-    hdrs = ["include/benchmark/benchmark.h"],
+    hdrs = [
+        "include/benchmark/benchmark.h",
+        "include/benchmark/export.h",
+    ],
+    copts = select({
+        ":windows": MSVC_COPTS,
+        "//conditions:default": COPTS,
+    }),
+    defines = [
+        "BENCHMARK_STATIC_DEFINE",
+        "BENCHMARK_VERSION=\\\"" + (module_version() if module_version() != None else "") + "\\\"",
+    ] + select({
+        ":perfcounters": ["HAVE_LIBPFM"],
+        "//conditions:default": [],
+    }),
+    includes = ["include"],
    linkopts = select({
        ":windows": ["-DEFAULTLIB:shlwapi.lib"],
        "//conditions:default": ["-pthread"],
    }),
-    strip_include_prefix = "include",
+    # Only static linking is allowed; no .so will be produced.
+    # Using `defines` (i.e. not `local_defines`) means that no
+    # dependent rules need to bother about defining the macro.
+    linkstatic = True,
+    local_defines = [
+        # Turn on Large-file Support
+        "_FILE_OFFSET_BITS=64",
+        "_LARGEFILE64_SOURCE",
+        "_LARGEFILE_SOURCE",
+    ],
    visibility = ["//visibility:public"],
+    deps = select({
+        ":perfcounters": ["@libpfm"],
+        "//conditions:default": [],
+    }),
 )

 cc_library(
    name = "benchmark_main",
    srcs = ["src/benchmark_main.cc"],
-    hdrs = ["include/benchmark/benchmark.h"],
-    strip_include_prefix = "include",
+    hdrs = [
+        "include/benchmark/benchmark.h",
+        "include/benchmark/export.h",
+    ],
+    includes = ["include"],
    visibility = ["//visibility:public"],
    deps = [":benchmark"],
 )
--- a/third_party/benchmark/CMakeLists.txt
+++ b/third_party/benchmark/CMakeLists.txt
@@ -1,27 +1,34 @@
-cmake_minimum_required (VERSION 2.8.12)
+# Require CMake 3.10. If available, use the policies up to CMake 3.22.
+cmake_minimum_required (VERSION 3.13...3.22)

-project (benchmark)
-
-foreach(p
-    CMP0054 # CMake 3.1
-    CMP0056 # export EXE_LINKER_FLAGS to try_run
-    CMP0057 # Support no if() IN_LIST operator
-    )
-  if(POLICY ${p})
-    cmake_policy(SET ${p} NEW)
-  endif()
-endforeach()
+project (benchmark VERSION 1.9.4 LANGUAGES CXX)

 option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
 option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
 option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
 option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
-if(NOT MSVC)
+option(BENCHMARK_ENABLE_WERROR "Build Release candidates with -Werror." ON)
+option(BENCHMARK_FORCE_WERROR "Build Release candidates with -Werror regardless of compiler issues." OFF)
+
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")
+  # PGC++ maybe reporting false positives.
+  set(BENCHMARK_ENABLE_WERROR OFF)
+endif()
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "NVHPC")
+  set(BENCHMARK_ENABLE_WERROR OFF)
+endif()
+if(BENCHMARK_FORCE_WERROR)
+  set(BENCHMARK_ENABLE_WERROR ON)
+endif(BENCHMARK_FORCE_WERROR)
+
+if(NOT (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC"))
  option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
 else()
  set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE)
 endif()
 option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON)
+option(BENCHMARK_ENABLE_DOXYGEN "Build documentation with Doxygen." OFF)
+option(BENCHMARK_INSTALL_DOCS "Enable installation of documentation." ON)

 # Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which
 # may require downloading the source code.
@@ -30,6 +37,24 @@ option(BENCHMARK_DOWNLOAD_DEPENDENCIES "Allow the downloading and in-tree buildi
 # This option can be used to disable building and running unit tests which depend on gtest
 # in cases where it is not possible to build or find a valid version of gtest.
 option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON)
+option(BENCHMARK_USE_BUNDLED_GTEST "Use bundled GoogleTest. If disabled, the find_package(GTest) will be used." ON)
+
+option(BENCHMARK_ENABLE_LIBPFM "Enable performance counters provided by libpfm" OFF)
+
+# Export only public symbols
+set(CMAKE_CXX_VISIBILITY_PRESET hidden)
+set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
+
+if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    # As of CMake 3.18, CMAKE_SYSTEM_PROCESSOR is not set properly for MSVC and
+    # cross-compilation (e.g. Host=x86_64, target=aarch64) requires using the
+    # undocumented, but working variable.
+    # See https://gitlab.kitware.com/cmake/cmake/-/issues/15170
+    set(CMAKE_SYSTEM_PROCESSOR ${MSVC_CXX_ARCHITECTURE_ID})
+    if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ARM")
+      set(CMAKE_CROSSCOMPILING TRUE)
+    endif()
+endif()

 set(ENABLE_ASSEMBLY_TESTS_DEFAULT OFF)
 function(should_enable_assembly_tests)
@@ -41,7 +66,7 @@ function(should_enable_assembly_tests)
      return()
    endif()
  endif()
-  if (MSVC)
+  if (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC")
    return()
  elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
    return()
@@ -77,29 +102,63 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 include(GetGitVersion)
 get_git_version(GIT_VERSION)

+# If no git version can be determined, use the version
+# from the project() command
+if ("${GIT_VERSION}" STREQUAL "v0.0.0")
+  set(VERSION "v${benchmark_VERSION}")
+else()
+  set(VERSION "${GIT_VERSION}")
+endif()
+
+# Normalize version: drop "v" prefix, replace first "-" with ".",
+# drop everything after second "-" (including said "-").
+string(STRIP ${VERSION} VERSION)
+if(VERSION MATCHES v[^-]*-)
+   string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2"  NORMALIZED_VERSION ${VERSION})
+else()
+   string(REGEX REPLACE "v(.*)" "\\1" NORMALIZED_VERSION ${VERSION})
+endif()
+
 # Tell the user what versions we are using
-string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION})
-message(STATUS "Version: ${VERSION}")
+message(STATUS "Google Benchmark version: ${VERSION}, normalized to ${NORMALIZED_VERSION}")

 # The version of the libraries
-set(GENERIC_LIB_VERSION ${VERSION})
-string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION)
+set(GENERIC_LIB_VERSION ${NORMALIZED_VERSION})
+string(SUBSTRING ${NORMALIZED_VERSION} 0 1 GENERIC_LIB_SOVERSION)

 # Import our CMake modules
-include(CheckCXXCompilerFlag)
 include(AddCXXCompilerFlag)
+include(CheckCXXCompilerFlag)
+include(CheckLibraryExists)
 include(CXXFeatureCheck)

+check_library_exists(rt shm_open "" HAVE_LIB_RT)
+
 if (BENCHMARK_BUILD_32_BITS)
  add_required_cxx_compiler_flag(-m32)
 endif()

+set(BENCHMARK_CXX_STANDARD 17)
+
+set(CMAKE_CXX_STANDARD ${BENCHMARK_CXX_STANDARD})
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
 if (MSVC)
  # Turn compiler warnings up to 11
  string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
+
+  # MP flag only applies to cl, not cl frontends to other compilers (e.g. clang-cl, icx-cl etc)
+  if(CMAKE_CXX_COMPILER_ID MATCHES MSVC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+  endif()
  add_definitions(-D_CRT_SECURE_NO_WARNINGS)

+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-WX)
+  endif()
+
  if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
    add_cxx_compiler_flag(-EHs-)
    add_cxx_compiler_flag(-EHa-)
@@ -126,45 +185,48 @@ if (MSVC)
    set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG")
  endif()
 else()
-  # Try and enable C++11. Don't use C++14 because it doesn't work in some
-  # configurations.
-  add_cxx_compiler_flag(-std=c++11)
-  if (NOT HAVE_CXX_FLAG_STD_CXX11)
-    add_cxx_compiler_flag(-std=c++0x)
-  endif()
-
+  # Turn on Large-file Support
+  add_definitions(-D_FILE_OFFSET_BITS=64)
+  add_definitions(-D_LARGEFILE64_SOURCE)
+  add_definitions(-D_LARGEFILE_SOURCE)
  # Turn compiler warnings up to 11
-  if (NOT MSVC)
-    add_cxx_compiler_flag(-Wall)
-    add_cxx_compiler_flag(-Wextra)
-    add_cxx_compiler_flag(-Wshadow)
-    add_cxx_compiler_flag(-Werror RELEASE)
-    add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
-    add_cxx_compiler_flag(-Werror MINSIZEREL)
-    add_cxx_compiler_flag(-pedantic)
-    add_cxx_compiler_flag(-pedantic-errors)
-    add_cxx_compiler_flag(-Wshorten-64-to-32)
-    add_cxx_compiler_flag(-fstrict-aliasing)
-    # Disable warnings regarding deprecated parts of the library while building
-    # and testing those parts of the library.
-    add_cxx_compiler_flag(-Wno-deprecated-declarations)
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-      # Intel silently ignores '-Wno-deprecated-declarations',
-      # warning no. 1786 must be explicitly disabled.
-      # See #631 for rationale.
-      add_cxx_compiler_flag(-wd1786)
-    endif()
-    # Disable deprecation warnings for release builds (when -Werror is enabled).
-    add_cxx_compiler_flag(-Wno-deprecated RELEASE)
-    add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO)
-    add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL)
-    if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
-      add_cxx_compiler_flag(-fno-exceptions)
-    endif()
+  add_cxx_compiler_flag(-Wall)
+  add_cxx_compiler_flag(-Wextra)
+  add_cxx_compiler_flag(-Wshadow)
+  add_cxx_compiler_flag(-Wfloat-equal)
+  add_cxx_compiler_flag(-Wold-style-cast)
+  add_cxx_compiler_flag(-Wconversion)
+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-Werror)
+  endif()
+  if (NOT BENCHMARK_ENABLE_TESTING)
+    # Disable warning when compiling tests as gtest does not use 'override'.
+    add_cxx_compiler_flag(-Wsuggest-override)
+  endif()
+  add_cxx_compiler_flag(-pedantic)
+  add_cxx_compiler_flag(-pedantic-errors)
+  add_cxx_compiler_flag(-Wshorten-64-to-32)
+  add_cxx_compiler_flag(-fstrict-aliasing)
+  # Disable warnings regarding deprecated parts of the library while building
+  # and testing those parts of the library.
+  add_cxx_compiler_flag(-Wno-deprecated-declarations)
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
+    # Intel silently ignores '-Wno-deprecated-declarations',
+    # warning no. 1786 must be explicitly disabled.
+    # See #631 for rationale.
+    add_cxx_compiler_flag(-wd1786)
+    add_cxx_compiler_flag(-fno-finite-math-only)
+  endif()
+  # Disable deprecation warnings for release builds (when -Werror is enabled).
+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-Wno-deprecated)
+  endif()
+  if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
+    add_cxx_compiler_flag(-fno-exceptions)
  endif()

  if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
-    if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing
+    if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") #ICC17u2: Many false positives for Wstrict-aliasing
      add_cxx_compiler_flag(-Wstrict-aliasing)
    endif()
  endif()
@@ -173,21 +235,26 @@ else()
  add_cxx_compiler_flag(-wd654)
  add_cxx_compiler_flag(-Wthread-safety)
  if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
-    cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
+    cxx_feature_check(THREAD_SAFETY_ATTRIBUTES "-DINCLUDE_DIRECTORIES=${PROJECT_SOURCE_DIR}/include")
  endif()

  # On most UNIX like platforms g++ and clang++ define _GNU_SOURCE as a
  # predefined macro, which turns on all of the wonderful libc extensions.
-  # However g++ doesn't do this in Cygwin so we have to define it ourselfs
+  # However g++ doesn't do this in Cygwin so we have to define it ourselves
  # since we depend on GNU/POSIX/BSD extensions.
  if (CYGWIN)
    add_definitions(-D_GNU_SOURCE=1)
  endif()

+  if (QNXNTO)
+    add_definitions(-D_QNX_SOURCE)
+  endif()
+
  # Link time optimisation
  if (BENCHMARK_ENABLE_LTO)
    add_cxx_compiler_flag(-flto)
-    if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
+    add_cxx_compiler_flag(-Wno-lto-type-mismatch)
+    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
      find_program(GCC_AR gcc-ar)
      if (GCC_AR)
        set(CMAKE_AR ${GCC_AR})
@@ -196,7 +263,7 @@ else()
      if (GCC_RANLIB)
        set(CMAKE_RANLIB ${GCC_RANLIB})
      endif()
-    elseif("${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
+    elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
      include(llvm-toolchain)
    endif()
  endif()
@@ -224,7 +291,8 @@ if (BENCHMARK_USE_LIBCXX)
  if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
    add_cxx_compiler_flag(-stdlib=libc++)
  elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR
-          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
+          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel" OR
+          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM")
    add_cxx_compiler_flag(-nostdinc++)
    message(WARNING "libc++ header path must be manually specified using CMAKE_CXX_FLAGS")
    # Adding -nodefaultlibs directly to CMAKE_<TYPE>_LINKER_FLAGS will break
@@ -250,9 +318,16 @@ if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX
        AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
  message(WARNING "Using std::regex with exceptions disabled is not fully supported")
 endif()
+
 cxx_feature_check(STEADY_CLOCK)
 # Ensure we have pthreads
+set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
+cxx_feature_check(PTHREAD_AFFINITY)
+
+if (BENCHMARK_ENABLE_LIBPFM)
+  find_package(PFM REQUIRED)
+endif()

 # Set up directories
 include_directories(${PROJECT_SOURCE_DIR}/include)
@@ -262,8 +337,18 @@ add_subdirectory(src)

 if (BENCHMARK_ENABLE_TESTING)
  enable_testing()
-  if (BENCHMARK_ENABLE_GTEST_TESTS)
-    include(HandleGTest)
+  if (BENCHMARK_ENABLE_GTEST_TESTS AND
+      NOT (TARGET gtest AND TARGET gtest_main AND
+           TARGET gmock AND TARGET gmock_main))
+    if (BENCHMARK_USE_BUNDLED_GTEST)
+      include(GoogleTest)
+    else()
+      find_package(GTest CONFIG REQUIRED)
+      add_library(gtest ALIAS GTest::gtest)
+      add_library(gtest_main ALIAS GTest::gtest_main)
+      add_library(gmock ALIAS GTest::gmock)
+      add_library(gmock_main ALIAS GTest::gmock_main)
+    endif()
  endif()
  add_subdirectory(test)
 endif()
--- a/third_party/benchmark/CONTRIBUTORS
+++ b/third_party/benchmark/CONTRIBUTORS
@@ -22,44 +22,75 @@
 #
 # Please keep the list sorted.

+Abhina Sreeskantharajan <abhina.sreeskantharajan@ibm.com>
 Albert Pretorius <pretoalb@gmail.com>
+Alex Steele <steelal123@gmail.com>
+Andriy Berestovskyy <berestovskyy@gmail.com>
 Arne Beer <arne@twobeer.de>
+Bátor Tallér <bator.taller@shapr3d.com>
 Billy Robert O'Neal III <billy.oneal@gmail.com> <bion@microsoft.com>
+Cezary Skrzyński <czars1988@gmail.com>
 Chris Kennelly <ckennelly@google.com> <ckennelly@ckennelly.com>
+Christian Wassermann <christian_wassermann@web.de>
 Christopher Seymour <chris.j.seymour@hotmail.com>
+Colin Braley <braley.colin@gmail.com>
+Cyrille Faucheux <cyrille.faucheux@gmail.com>
+Daniel Harvey <danielharvey458@gmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
 Deniz Evrenci <denizevrenci@gmail.com>
 Dominic Hamon <dma@stripysock.com> <dominic@google.com>
 Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Dominik Korman <kormandominik@gmail.com>
+Donald Aingworth <donalds_junk_mail@yahoo.com>
+Doug Evans <xdje42@gmail.com>
+Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
+Fabien Pichot <pichot.fabien@gmail.com>
+Fanbo Meng <fanbo.meng@ibm.com>
 Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Geoffrey Martin-Noble <gcmn@google.com> <gmngeoffrey@gmail.com>
+Gergely Meszaros <maetveis@gmail.com>
+Gergő Szitár <szitar.gergo@gmail.com>
+Hannes Hauswedell <h2@fsfe.org>
+Henrique Bucher <hbucher@gmail.com>
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
+Iakov Sergeev <yahontu@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
 Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
 John Millikin <jmillikin@stripe.com>
+Jordan Williams <jwillikers@protonmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
+Kaito Udagawa <umireon@gmail.com>
 Kai Wolf <kai.wolf@gmail.com>
 Kishan Kumar <kumar.kishan@outlook.com>
-Kaito Udagawa <umireon@gmail.com>
 Lei Xu <eddyxu@gmail.com>
+Marcel Jacobse <mjacobse@uni-bremen.de>
 Matt Clarkson <mattyclarkson@gmail.com>
 Maxim Vafin <maxvafin@gmail.com>
+Mike Apodaca <gatorfax@gmail.com>
+Min-Yih Hsu <yihshyng223@gmail.com>
 Nick Hutchinson <nshutchinson@gmail.com>
+Norman Heino <norman.heino@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Ori Livneh <ori.livneh@gmail.com>
 Pascal Leroy <phl@google.com>
 Paul Redmond <paul.redmond@gmail.com>
 Pierre Phaneuf <pphaneuf@google.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
+Raghu Raja <raghu@enfabrica.net>
+Rainer Orth <ro@cebitec.uni-bielefeld.de>
 Raul Marin <rmrodriguez@cartodb.com>
 Ray Glover <ray.glover@uk.ibm.com>
 Robert Guo <robert.guo@mongodb.com>
 Roman Lebedev <lebedev.ri@gmail.com>
+Sayan Bhattacharjee <aero.sayan@gmail.com>
 Shuo Chen <chenshuo@chenshuo.com>
+Steven Wan <wan.yu@ibm.com>
+Tobias Schmidt <tobias.schmidt@in.tum.de>
 Tobias Ulvgård <tobias.ulvgard@dirac.se>
 Tom Madams <tom.ej.madams@gmail.com> <tmadams@google.com>
 Yixuan Qiu <yixuanq@gmail.com>
--- a/third_party/benchmark/MODULE.bazel
+++ b/third_party/benchmark/MODULE.bazel
@@ -0,0 +1,41 @@
+module(
+    name = "google_benchmark",
+    version = "1.9.4",
+)
+
+bazel_dep(name = "bazel_skylib", version = "1.7.1")
+bazel_dep(name = "platforms", version = "0.0.10")
+bazel_dep(name = "rules_cc", version = "0.0.9")
+
+bazel_dep(name = "rules_python", version = "1.0.0", dev_dependency = True)
+bazel_dep(name = "googletest", version = "1.14.0", dev_dependency = True, repo_name = "com_google_googletest")
+
+bazel_dep(name = "libpfm", version = "4.11.0.bcr.1")
+
+# Register a toolchain for Python 3.9 to be able to build numpy. Python
+# versions >=3.10 are problematic.
+# A second reason for this is to be able to build Python hermetically instead
+# of relying on the changing default version from rules_python.
+
+python = use_extension("@rules_python//python/extensions:python.bzl", "python", dev_dependency = True)
+python.toolchain(python_version = "3.8")
+python.toolchain(python_version = "3.9")
+python.toolchain(python_version = "3.10")
+python.toolchain(python_version = "3.11")
+python.toolchain(
+    is_default = True,
+    python_version = "3.12",
+)
+python.toolchain(python_version = "3.13")
+
+pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip", dev_dependency = True)
+pip.parse(
+    hub_name = "tools_pip_deps",
+    python_version = "3.9",
+    requirements_lock = "//tools:requirements.txt",
+)
+use_repo(pip, "tools_pip_deps")
+
+# -- bazel_dep definitions -- #
+
+bazel_dep(name = "nanobind_bazel", version = "2.7.0", dev_dependency = True)
--- a/third_party/benchmark/README.md
+++ b/third_party/benchmark/README.md
--- a/third_party/benchmark/WORKSPACE
+++ b/third_party/benchmark/WORKSPACE
@@ -1,7 +1,20 @@
 workspace(name = "com_github_google_benchmark")

-http_archive(
-     name = "com_google_googletest",
-     urls = ["https://github.com/google/googletest/archive/3f0cf6b62ad1eb50d8736538363d3580dd640c3e.zip"],
-     strip_prefix = "googletest-3f0cf6b62ad1eb50d8736538363d3580dd640c3e",
+load("//:bazel/benchmark_deps.bzl", "benchmark_deps")
+
+benchmark_deps()
+
+load("@rules_python//python:repositories.bzl", "py_repositories")
+
+py_repositories()
+
+load("@rules_python//python:pip.bzl", "pip_parse")
+
+pip_parse(
+    name = "tools_pip_deps",
+    requirements_lock = "//tools:requirements.txt",
 )
+
+load("@tools_pip_deps//:requirements.bzl", "install_deps")
+
+install_deps()
--- a/third_party/benchmark/WORKSPACE.bzlmod
+++ b/third_party/benchmark/WORKSPACE.bzlmod
@@ -0,0 +1,2 @@
+# This file marks the root of the Bazel workspace.
+# See MODULE.bazel for dependencies and setup.
--- a/third_party/benchmark/_config.yml
+++ b/third_party/benchmark/_config.yml
@@ -0,0 +1,2 @@
+theme: jekyll-theme-midnight
+markdown: GFM
--- a/third_party/benchmark/appveyor.yml
+++ b/third_party/benchmark/appveyor.yml
@@ -41,7 +41,7 @@ build_script:
  - cmake --build . --config %configuration%

 test_script:
-  - ctest -c %configuration% --timeout 300 --output-on-failure
+  - ctest --build-config %configuration% --timeout 300 --output-on-failure

 artifacts:
  - path: '_build/CMakeFiles/*.log'
--- a/third_party/benchmark/bazel/benchmark_deps.bzl
+++ b/third_party/benchmark/bazel/benchmark_deps.bzl
@@ -0,0 +1,54 @@
+"""
+This file contains the Bazel build dependencies for Google Benchmark (both C++ source and Python bindings).
+"""
+
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "new_git_repository")
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+def benchmark_deps():
+    """Loads dependencies required to build Google Benchmark."""
+
+    if "bazel_skylib" not in native.existing_rules():
+        http_archive(
+            name = "bazel_skylib",
+            sha256 = "cd55a062e763b9349921f0f5db8c3933288dc8ba4f76dd9416aac68acee3cb94",
+            urls = [
+                "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
+                "https://github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
+            ],
+        )
+
+    if "rules_python" not in native.existing_rules():
+        http_archive(
+            name = "rules_python",
+            sha256 = "e85ae30de33625a63eca7fc40a94fea845e641888e52f32b6beea91e8b1b2793",
+            strip_prefix = "rules_python-0.27.1",
+            url = "https://github.com/bazelbuild/rules_python/releases/download/0.27.1/rules_python-0.27.1.tar.gz",
+        )
+
+    if "com_google_googletest" not in native.existing_rules():
+        new_git_repository(
+            name = "com_google_googletest",
+            remote = "https://github.com/google/googletest.git",
+            tag = "release-1.12.1",
+        )
+
+    if "nanobind" not in native.existing_rules():
+        new_git_repository(
+            name = "nanobind",
+            remote = "https://github.com/wjakob/nanobind.git",
+            tag = "v1.9.2",
+            build_file = "@//bindings/python:nanobind.BUILD",
+            recursive_init_submodules = True,
+        )
+
+    if "libpfm" not in native.existing_rules():
+        # Downloaded from v4.9.0 tag at https://sourceforge.net/p/perfmon2/libpfm4/ref/master/tags/
+        http_archive(
+            name = "libpfm",
+            build_file = str(Label("//tools:libpfm.BUILD.bazel")),
+            sha256 = "5da5f8872bde14b3634c9688d980f68bda28b510268723cc12973eedbab9fecc",
+            type = "tar.gz",
+            strip_prefix = "libpfm-4.11.0",
+            urls = ["https://sourceforge.net/projects/perfmon2/files/libpfm4/libpfm-4.11.0.tar.gz/download"],
+        )
--- a/third_party/benchmark/bindings/python/google_benchmark/BUILD
+++ b/third_party/benchmark/bindings/python/google_benchmark/BUILD
@@ -0,0 +1,34 @@
+load("@nanobind_bazel//:build_defs.bzl", "nanobind_extension", "nanobind_stubgen")
+load("@rules_python//python:defs.bzl", "py_library", "py_test")
+
+py_library(
+    name = "google_benchmark",
+    srcs = ["__init__.py"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":_benchmark",
+    ],
+)
+
+nanobind_extension(
+    name = "_benchmark",
+    srcs = ["benchmark.cc"],
+    deps = ["//:benchmark"],
+)
+
+nanobind_stubgen(
+    name = "benchmark_stubgen",
+    marker_file = "bindings/python/google_benchmark/py.typed",
+    module = ":_benchmark",
+)
+
+py_test(
+    name = "example",
+    srcs = ["example.py"],
+    python_version = "PY3",
+    srcs_version = "PY3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":google_benchmark",
+    ],
+)
--- a/third_party/benchmark/bindings/python/google_benchmark/init.py
+++ b/third_party/benchmark/bindings/python/google_benchmark/init.py
@@ -0,0 +1,145 @@
+# Copyright 2020 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Python benchmarking utilities.
+
+Example usage:
+  import google_benchmark as benchmark
+
+  @benchmark.register
+  def my_benchmark(state):
+      ...  # Code executed outside `while` loop is not timed.
+
+      while state:
+        ...  # Code executed within `while` loop is timed.
+
+  if __name__ == '__main__':
+    benchmark.main()
+"""
+
+import atexit
+
+from absl import app
+
+from google_benchmark import _benchmark
+from google_benchmark._benchmark import (
+    Counter as Counter,
+    State as State,
+    kMicrosecond as kMicrosecond,
+    kMillisecond as kMillisecond,
+    kNanosecond as kNanosecond,
+    kSecond as kSecond,
+    o1 as o1,
+    oAuto as oAuto,
+    oLambda as oLambda,
+    oLogN as oLogN,
+    oN as oN,
+    oNCubed as oNCubed,
+    oNLogN as oNLogN,
+    oNone as oNone,
+    oNSquared as oNSquared,
+)
+
+__version__ = "1.9.4"
+
+
+class __OptionMaker:
+    """A stateless class to collect benchmark options.
+
+    Collect all decorator calls like @option.range(start=0, limit=1<<5).
+    """
+
+    class Options:
+        """Pure data class to store options calls, along with the benchmarked
+        function."""
+
+        def __init__(self, func):
+            self.func = func
+            self.builder_calls = []
+
+    @classmethod
+    def make(cls, func_or_options):
+        """Make Options from Options or the benchmarked function."""
+        if isinstance(func_or_options, cls.Options):
+            return func_or_options
+        return cls.Options(func_or_options)
+
+    def __getattr__(self, builder_name):
+        """Append option call in the Options."""
+
+        # The function that get returned on @option.range(start=0, limit=1<<5).
+        def __builder_method(*args, **kwargs):
+            # The decorator that get called, either with the benchmared function
+            # or the previous Options
+            def __decorator(func_or_options):
+                options = self.make(func_or_options)
+                options.builder_calls.append((builder_name, args, kwargs))
+                # The decorator returns Options so it is not technically a
+                # decorator and needs a final call to @register
+                return options
+
+            return __decorator
+
+        return __builder_method
+
+
+# Alias for nicer API.
+# We have to instantiate an object, even if stateless, to be able to use
+# __getattr__ on option.range
+option = __OptionMaker()
+
+
+def register(undefined=None, *, name=None):
+    """Register function for benchmarking."""
+    if undefined is None:
+        # Decorator is called without parenthesis so we return a decorator
+        return lambda f: register(f, name=name)
+
+    # We have either the function to benchmark (simple case) or an instance of
+    # Options (@option._ case).
+    options = __OptionMaker.make(undefined)
+
+    if name is None:
+        name = options.func.__name__
+
+    # We register the benchmark and reproduce all the @option._ calls onto the
+    # benchmark builder pattern
+    benchmark = _benchmark.RegisterBenchmark(name, options.func)
+    for name, args, kwargs in options.builder_calls[::-1]:
+        getattr(benchmark, name)(*args, **kwargs)
+
+    # return the benchmarked function because the decorator does not modify it
+    return options.func
+
+
+def _flags_parser(argv):
+    argv = _benchmark.Initialize(argv)
+    return app.parse_flags_with_usage(argv)
+
+
+def _run_benchmarks(argv):
+    if len(argv) > 1:
+        raise app.UsageError("Too many command-line arguments.")
+    return _benchmark.RunSpecifiedBenchmarks()
+
+
+def main(argv=None):
+    return app.run(_run_benchmarks, argv=argv, flags_parser=_flags_parser)
+
+
+# FIXME: can we rerun with disabled ASLR?
+
+# Methods for use with custom main function.
+initialize = _benchmark.Initialize
+run_benchmarks = _benchmark.RunSpecifiedBenchmarks
+atexit.register(_benchmark.ClearRegisteredBenchmarks)
--- a/third_party/benchmark/bindings/python/google_benchmark/benchmark.cc
+++ b/third_party/benchmark/bindings/python/google_benchmark/benchmark.cc
@@ -0,0 +1,184 @@
+// Benchmark for Python.
+
+#include "benchmark/benchmark.h"
+
+#include "nanobind/nanobind.h"
+#include "nanobind/operators.h"
+#include "nanobind/stl/bind_map.h"
+#include "nanobind/stl/string.h"
+#include "nanobind/stl/vector.h"
+
+NB_MAKE_OPAQUE(benchmark::UserCounters);
+
+namespace {
+namespace nb = nanobind;
+
+std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
+  // The `argv` pointers here become invalid when this function returns, but
+  // benchmark holds the pointer to `argv[0]`. We create a static copy of it
+  // so it persists, and replace the pointer below.
+  static std::string executable_name(argv[0]);
+  std::vector<char*> ptrs;
+  ptrs.reserve(argv.size());
+  for (auto& arg : argv) {
+    ptrs.push_back(const_cast<char*>(arg.c_str()));
+  }
+  ptrs[0] = const_cast<char*>(executable_name.c_str());
+  int argc = static_cast<int>(argv.size());
+  benchmark::Initialize(&argc, ptrs.data());
+  std::vector<std::string> remaining_argv;
+  remaining_argv.reserve(argc);
+  for (int i = 0; i < argc; ++i) {
+    remaining_argv.emplace_back(ptrs[i]);
+  }
+  return remaining_argv;
+}
+
+benchmark::internal::Benchmark* RegisterBenchmark(const std::string& name,
+                                                  nb::callable f) {
+  return benchmark::RegisterBenchmark(
+      name, [f](benchmark::State& state) { f(&state); });
+}
+
+NB_MODULE(_benchmark, m) {
+
+  using benchmark::TimeUnit;
+  nb::enum_<TimeUnit>(m, "TimeUnit")
+      .value("kNanosecond", TimeUnit::kNanosecond)
+      .value("kMicrosecond", TimeUnit::kMicrosecond)
+      .value("kMillisecond", TimeUnit::kMillisecond)
+      .value("kSecond", TimeUnit::kSecond)
+      .export_values();
+
+  using benchmark::BigO;
+  nb::enum_<BigO>(m, "BigO")
+      .value("oNone", BigO::oNone)
+      .value("o1", BigO::o1)
+      .value("oN", BigO::oN)
+      .value("oNSquared", BigO::oNSquared)
+      .value("oNCubed", BigO::oNCubed)
+      .value("oLogN", BigO::oLogN)
+      .value("oNLogN", BigO::oNLogN)
+      .value("oAuto", BigO::oAuto)
+      .value("oLambda", BigO::oLambda)
+      .export_values();
+
+  using benchmark::internal::Benchmark;
+  nb::class_<Benchmark>(m, "Benchmark")
+      // For methods returning a pointer to the current object, reference
+      // return policy is used to ask nanobind not to take ownership of the
+      // returned object and avoid calling delete on it.
+      // https://pybind11.readthedocs.io/en/stable/advanced/functions.html#return-value-policies
+      //
+      // For methods taking a const std::vector<...>&, a copy is created
+      // because a it is bound to a Python list.
+      // https://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html
+      .def("unit", &Benchmark::Unit, nb::rv_policy::reference)
+      .def("arg", &Benchmark::Arg, nb::rv_policy::reference)
+      .def("args", &Benchmark::Args, nb::rv_policy::reference)
+      .def("range", &Benchmark::Range, nb::rv_policy::reference,
+           nb::arg("start"), nb::arg("limit"))
+      .def("dense_range", &Benchmark::DenseRange,
+           nb::rv_policy::reference, nb::arg("start"),
+           nb::arg("limit"), nb::arg("step") = 1)
+      .def("ranges", &Benchmark::Ranges, nb::rv_policy::reference)
+      .def("args_product", &Benchmark::ArgsProduct,
+           nb::rv_policy::reference)
+      .def("arg_name", &Benchmark::ArgName, nb::rv_policy::reference)
+      .def("arg_names", &Benchmark::ArgNames,
+           nb::rv_policy::reference)
+      .def("range_pair", &Benchmark::RangePair,
+           nb::rv_policy::reference, nb::arg("lo1"), nb::arg("hi1"),
+           nb::arg("lo2"), nb::arg("hi2"))
+      .def("range_multiplier", &Benchmark::RangeMultiplier,
+           nb::rv_policy::reference)
+      .def("min_time", &Benchmark::MinTime, nb::rv_policy::reference)
+      .def("min_warmup_time", &Benchmark::MinWarmUpTime,
+           nb::rv_policy::reference)
+      .def("iterations", &Benchmark::Iterations,
+           nb::rv_policy::reference)
+      .def("repetitions", &Benchmark::Repetitions,
+           nb::rv_policy::reference)
+      .def("report_aggregates_only", &Benchmark::ReportAggregatesOnly,
+           nb::rv_policy::reference, nb::arg("value") = true)
+      .def("display_aggregates_only", &Benchmark::DisplayAggregatesOnly,
+           nb::rv_policy::reference, nb::arg("value") = true)
+      .def("measure_process_cpu_time", &Benchmark::MeasureProcessCPUTime,
+           nb::rv_policy::reference)
+      .def("use_real_time", &Benchmark::UseRealTime,
+           nb::rv_policy::reference)
+      .def("use_manual_time", &Benchmark::UseManualTime,
+           nb::rv_policy::reference)
+      .def(
+          "complexity",
+          (Benchmark * (Benchmark::*)(benchmark::BigO)) & Benchmark::Complexity,
+          nb::rv_policy::reference,
+          nb::arg("complexity") = benchmark::oAuto);
+
+  using benchmark::Counter;
+  nb::class_<Counter> py_counter(m, "Counter");
+
+  nb::enum_<Counter::Flags>(py_counter, "Flags", nb::is_arithmetic(), nb::is_flag())
+      .value("kDefaults", Counter::Flags::kDefaults)
+      .value("kIsRate", Counter::Flags::kIsRate)
+      .value("kAvgThreads", Counter::Flags::kAvgThreads)
+      .value("kAvgThreadsRate", Counter::Flags::kAvgThreadsRate)
+      .value("kIsIterationInvariant", Counter::Flags::kIsIterationInvariant)
+      .value("kIsIterationInvariantRate",
+             Counter::Flags::kIsIterationInvariantRate)
+      .value("kAvgIterations", Counter::Flags::kAvgIterations)
+      .value("kAvgIterationsRate", Counter::Flags::kAvgIterationsRate)
+      .value("kInvert", Counter::Flags::kInvert)
+      .export_values();
+
+  nb::enum_<Counter::OneK>(py_counter, "OneK")
+      .value("kIs1000", Counter::OneK::kIs1000)
+      .value("kIs1024", Counter::OneK::kIs1024)
+      .export_values();
+
+  py_counter
+      .def(nb::init<double, Counter::Flags, Counter::OneK>(),
+           nb::arg("value") = 0., nb::arg("flags") = Counter::kDefaults,
+           nb::arg("k") = Counter::kIs1000)
+      .def("__init__",
+           ([](Counter* c, double value) { new (c) Counter(value); }))
+      .def_rw("value", &Counter::value)
+      .def_rw("flags", &Counter::flags)
+      .def_rw("oneK", &Counter::oneK)
+      .def(nb::init_implicit<double>());
+
+  nb::implicitly_convertible<nb::int_, Counter>();
+
+  nb::bind_map<benchmark::UserCounters>(m, "UserCounters");
+
+  using benchmark::State;
+  nb::class_<State>(m, "State")
+      .def("__bool__", &State::KeepRunning)
+      .def_prop_ro("keep_running", &State::KeepRunning)
+      .def("pause_timing", &State::PauseTiming)
+      .def("resume_timing", &State::ResumeTiming)
+      .def("skip_with_error", &State::SkipWithError)
+      .def_prop_ro("error_occurred", &State::error_occurred)
+      .def("set_iteration_time", &State::SetIterationTime)
+      .def_prop_rw("bytes_processed", &State::bytes_processed,
+                    &State::SetBytesProcessed)
+      .def_prop_rw("complexity_n", &State::complexity_length_n,
+                    &State::SetComplexityN)
+      .def_prop_rw("items_processed", &State::items_processed,
+                   &State::SetItemsProcessed)
+      .def("set_label", &State::SetLabel)
+      .def("range", &State::range, nb::arg("pos") = 0)
+      .def_prop_ro("iterations", &State::iterations)
+      .def_prop_ro("name", &State::name)
+      .def_rw("counters", &State::counters)
+      .def_prop_ro("thread_index", &State::thread_index)
+      .def_prop_ro("threads", &State::threads);
+
+  m.def("Initialize", Initialize);
+  m.def("RegisterBenchmark", RegisterBenchmark,
+        nb::rv_policy::reference);
+  m.def("RunSpecifiedBenchmarks",
+        []() { benchmark::RunSpecifiedBenchmarks(); });
+  m.def("ClearRegisteredBenchmarks", benchmark::ClearRegisteredBenchmarks);
+};
+}  // namespace
--- a/third_party/benchmark/bindings/python/google_benchmark/example.py
+++ b/third_party/benchmark/bindings/python/google_benchmark/example.py
@@ -0,0 +1,140 @@
+# Copyright 2020 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Example of Python using C++ benchmark framework.
+
+To run this example, you must first install the `google_benchmark` Python
+package.
+
+To install using `setup.py`, download and extract the `google_benchmark` source.
+In the extracted directory, execute:
+  python setup.py install
+"""
+
+import random
+import time
+
+import google_benchmark as benchmark
+from google_benchmark import Counter
+
+
+@benchmark.register
+def empty(state):
+    while state:
+        pass
+
+
+@benchmark.register
+def sum_million(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register
+def pause_timing(state):
+    """Pause timing every iteration."""
+    while state:
+        # Construct a list of random ints every iteration without timing it
+        state.pause_timing()
+        random_list = [random.randint(0, 100) for _ in range(100)]
+        state.resume_timing()
+        # Time the in place sorting algorithm
+        random_list.sort()
+
+
+@benchmark.register
+def skipped(state):
+    if True:  # Test some predicate here.
+        state.skip_with_error("some error")
+        return  # NOTE: You must explicitly return, or benchmark will continue.
+
+    # Benchmark code would be here.
+
+
+@benchmark.register
+@benchmark.option.use_manual_time()
+def manual_timing(state):
+    while state:
+        # Manually count Python CPU time
+        start = time.perf_counter()  # perf_counter_ns() in Python 3.7+
+        # Something to benchmark
+        time.sleep(0.01)
+        end = time.perf_counter()
+        state.set_iteration_time(end - start)
+
+
+@benchmark.register
+def custom_counters(state):
+    """Collect custom metric using benchmark.Counter."""
+    num_foo = 0.0
+    while state:
+        # Benchmark some code here
+        # Collect some custom metric named foo
+        num_foo += 0.13
+
+    # Automatic Counter from numbers.
+    state.counters["foo"] = num_foo
+    # Set a counter as a rate.
+    state.counters["foo_rate"] = Counter(num_foo, Counter.kIsRate)
+    #  Set a counter as an inverse of rate.
+    state.counters["foo_inv_rate"] = Counter(
+        num_foo, Counter.kIsRate | Counter.kInvert
+    )
+    # Set a counter as a thread-average quantity.
+    state.counters["foo_avg"] = Counter(num_foo, Counter.kAvgThreads)
+    # There's also a combined flag:
+    state.counters["foo_avg_rate"] = Counter(num_foo, Counter.kAvgThreadsRate)
+
+
+@benchmark.register
+@benchmark.option.measure_process_cpu_time()
+@benchmark.option.use_real_time()
+def with_options(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register(name="sum_million_microseconds")
+@benchmark.option.unit(benchmark.kMicrosecond)
+def with_options2(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register
+@benchmark.option.arg(100)
+@benchmark.option.arg(1000)
+def passing_argument(state):
+    while state:
+        sum(range(state.range(0)))
+
+
+@benchmark.register
+@benchmark.option.range(8, limit=8 << 10)
+def using_range(state):
+    while state:
+        sum(range(state.range(0)))
+
+
+@benchmark.register
+@benchmark.option.range_multiplier(2)
+@benchmark.option.range(1 << 10, 1 << 18)
+@benchmark.option.complexity(benchmark.oN)
+def computing_complexity(state):
+    while state:
+        sum(range(state.range(0)))
+    state.complexity_n = state.range(0)
+
+
+if __name__ == "__main__":
+    benchmark.main()
--- a/third_party/benchmark/cmake/AddCXXCompilerFlag.cmake
+++ b/third_party/benchmark/cmake/AddCXXCompilerFlag.cmake
@@ -34,9 +34,11 @@ function(add_cxx_compiler_flag FLAG)
  check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
  if(${MANGLED_FLAG})
-    set(VARIANT ${ARGV1})
-    if(ARGV1)
+    if(ARGC GREATER 1)
+      set(VARIANT ${ARGV1})
      string(TOUPPER "_${VARIANT}" VARIANT)
+    else()
+      set(VARIANT "")
    endif()
    set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${BENCHMARK_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
  endif()
@@ -49,9 +51,11 @@ function(add_required_cxx_compiler_flag FLAG)
  check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
  if(${MANGLED_FLAG})
-    set(VARIANT ${ARGV1})
-    if(ARGV1)
+    if(ARGC GREATER 1)
+      set(VARIANT ${ARGV1})
      string(TOUPPER "_${VARIANT}" VARIANT)
+    else()
+      set(VARIANT "")
    endif()
    set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
--- a/third_party/benchmark/cmake/CXXFeatureCheck.cmake
+++ b/third_party/benchmark/cmake/CXXFeatureCheck.cmake
@@ -17,6 +17,8 @@ if(__cxx_feature_check)
 endif()
 set(__cxx_feature_check INCLUDED)

+option(CXXFEATURECHECK_DEBUG OFF)
+
 function(cxx_feature_check FILE)
  string(TOLOWER ${FILE} FILE)
  string(TOUPPER ${FILE} VAR)
@@ -27,26 +29,38 @@ function(cxx_feature_check FILE)
    return()
  endif()

+  set(FEATURE_CHECK_CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
+  if (ARGC GREATER 1)
+    message(STATUS "Enabling additional flags: ${ARGV1}")
+    list(APPEND FEATURE_CHECK_CMAKE_FLAGS ${ARGV1})
+  endif()
+
  if (NOT DEFINED COMPILE_${FEATURE})
-    message(STATUS "Performing Test ${FEATURE}")
    if(CMAKE_CROSSCOMPILING)
+      message(STATUS "Cross-compiling to test ${FEATURE}")
      try_compile(COMPILE_${FEATURE}
              ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-              CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
-              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+              CXX_STANDARD 17
+              CXX_STANDARD_REQUIRED ON
+              CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
+              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
+              OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
      if(COMPILE_${FEATURE})
        message(WARNING
              "If you see build failures due to cross compilation, try setting HAVE_${VAR} to 0")
-        set(RUN_${FEATURE} 0)
+        set(RUN_${FEATURE} 0 CACHE INTERNAL "")
      else()
-        set(RUN_${FEATURE} 1)
+        set(RUN_${FEATURE} 1 CACHE INTERNAL "")
      endif()
    else()
-      message(STATUS "Performing Test ${FEATURE}")
+      message(STATUS "Compiling and running to test ${FEATURE}")
      try_run(RUN_${FEATURE} COMPILE_${FEATURE}
              ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-              CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
-              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+              CXX_STANDARD 17
+              CXX_STANDARD_REQUIRED ON
+              CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
+              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
+              COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
    endif()
  endif()

@@ -56,7 +70,11 @@ function(cxx_feature_check FILE)
    add_definitions(-DHAVE_${VAR})
  else()
    if(NOT COMPILE_${FEATURE})
-      message(STATUS "Performing Test ${FEATURE} -- failed to compile")
+      if(CXXFEATURECHECK_DEBUG)
+        message(STATUS "Performing Test ${FEATURE} -- failed to compile: ${COMPILE_OUTPUT_VAR}")
+      else()
+        message(STATUS "Performing Test ${FEATURE} -- failed to compile")
+      endif()
    else()
      message(STATUS "Performing Test ${FEATURE} -- compiled but failed to run")
    endif()
--- a/third_party/benchmark/cmake/Config.cmake.in
+++ b/third_party/benchmark/cmake/Config.cmake.in
@@ -1 +1,12 @@
+@PACKAGE_INIT@
+
+include (CMakeFindDependencyMacro)
+
+find_dependency (Threads)
+
+if (@BENCHMARK_ENABLE_LIBPFM@)
+    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
+    find_dependency (PFM)
+endif()
+
 include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
--- a/third_party/benchmark/cmake/GetGitVersion.cmake
+++ b/third_party/benchmark/cmake/GetGitVersion.cmake
@@ -20,35 +20,17 @@ set(__get_git_version INCLUDED)

 function(get_git_version var)
  if(GIT_EXECUTABLE)
-      execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
+      execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8 --dirty
          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
          RESULT_VARIABLE status
          OUTPUT_VARIABLE GIT_VERSION
          ERROR_QUIET)
-      if(${status})
+      if(status)
          set(GIT_VERSION "v0.0.0")
-      else()
-          string(STRIP ${GIT_VERSION} GIT_VERSION)
-          string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION ${GIT_VERSION})
-      endif()
-
-      # Work out if the repository is dirty
-      execute_process(COMMAND ${GIT_EXECUTABLE} update-index -q --refresh
-          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-          OUTPUT_QUIET
-          ERROR_QUIET)
-      execute_process(COMMAND ${GIT_EXECUTABLE} diff-index --name-only HEAD --
-          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-          OUTPUT_VARIABLE GIT_DIFF_INDEX
-          ERROR_QUIET)
-      string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
-      if (${GIT_DIRTY})
-          set(GIT_VERSION "${GIT_VERSION}-dirty")
      endif()
  else()
      set(GIT_VERSION "v0.0.0")
  endif()

-  message(STATUS "git Version: ${GIT_VERSION}")
  set(${var} ${GIT_VERSION} PARENT_SCOPE)
 endfunction()
--- a/third_party/benchmark/cmake/GoogleTest.cmake
+++ b/third_party/benchmark/cmake/GoogleTest.cmake
@@ -0,0 +1,58 @@
+# Download and unpack googletest at configure time
+set(GOOGLETEST_PREFIX "${benchmark_BINARY_DIR}/third_party/googletest")
+configure_file(${benchmark_SOURCE_DIR}/cmake/GoogleTest.cmake.in ${GOOGLETEST_PREFIX}/CMakeLists.txt @ONLY)
+
+set(GOOGLETEST_PATH "${CMAKE_CURRENT_SOURCE_DIR}/googletest" CACHE PATH "") # Mind the quotes
+execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}"
+  -DALLOW_DOWNLOADING_GOOGLETEST=${BENCHMARK_DOWNLOAD_DEPENDENCIES} -DGOOGLETEST_PATH:PATH=${GOOGLETEST_PATH} .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${GOOGLETEST_PREFIX}
+)
+
+if(result)
+  message(FATAL_ERROR "CMake step for googletest failed: ${result}")
+endif()
+
+execute_process(
+  COMMAND ${CMAKE_COMMAND} --build .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${GOOGLETEST_PREFIX}
+)
+
+if(result)
+  message(FATAL_ERROR "Build step for googletest failed: ${result}")
+endif()
+
+# Prevent overriding the parent project's compiler/linker
+# settings on Windows
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+
+include(${GOOGLETEST_PREFIX}/googletest-paths.cmake)
+
+# Add googletest directly to our build. This defines
+# the gtest and gtest_main targets.
+add_subdirectory(${GOOGLETEST_SOURCE_DIR}
+                 ${GOOGLETEST_BINARY_DIR}
+                 EXCLUDE_FROM_ALL)
+
+# googletest doesn't seem to want to stay build warning clean so let's not hurt ourselves.
+if (MSVC)
+  target_compile_options(gtest PRIVATE "/wd4244" "/wd4722")
+  target_compile_options(gtest_main PRIVATE "/wd4244" "/wd4722")
+  target_compile_options(gmock PRIVATE "/wd4244" "/wd4722")
+  target_compile_options(gmock_main PRIVATE "/wd4244" "/wd4722")
+else()
+  target_compile_options(gtest PRIVATE "-w")
+  target_compile_options(gtest_main PRIVATE "-w")
+  target_compile_options(gmock PRIVATE "-w")
+  target_compile_options(gmock_main PRIVATE "-w")
+endif()
+
+if(NOT DEFINED GTEST_COMPILE_COMMANDS)
+    set(GTEST_COMPILE_COMMANDS ON)
+endif()
+
+set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
--- a/third_party/benchmark/cmake/GoogleTest.cmake.in
+++ b/third_party/benchmark/cmake/GoogleTest.cmake.in
@@ -0,0 +1,60 @@
+cmake_minimum_required (VERSION 3.13...3.22)
+
+project(googletest-download NONE)
+
+# Enable ExternalProject CMake module
+include(ExternalProject)
+
+option(ALLOW_DOWNLOADING_GOOGLETEST "If googletest src tree is not found in location specified by GOOGLETEST_PATH, do fetch the archive from internet" OFF)
+set(GOOGLETEST_PATH "/usr/src/googletest" CACHE PATH
+                    "Path to the googletest root tree. Should contain googletest and googlemock subdirs. And CMakeLists.txt in root, and in both of these subdirs")
+
+# Download and install GoogleTest
+
+message(STATUS "Looking for Google Test sources")
+message(STATUS "Looking for Google Test sources in ${GOOGLETEST_PATH}")
+if(EXISTS "${GOOGLETEST_PATH}"            AND IS_DIRECTORY "${GOOGLETEST_PATH}"            AND EXISTS "${GOOGLETEST_PATH}/CMakeLists.txt" AND
+   EXISTS "${GOOGLETEST_PATH}/googletest" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googletest" AND EXISTS "${GOOGLETEST_PATH}/googletest/CMakeLists.txt" AND
+   EXISTS "${GOOGLETEST_PATH}/googlemock" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googlemock" AND EXISTS "${GOOGLETEST_PATH}/googlemock/CMakeLists.txt")
+  message(STATUS "Found Google Test in ${GOOGLETEST_PATH}")
+
+  ExternalProject_Add(
+    googletest
+    PREFIX            "${CMAKE_BINARY_DIR}"
+    DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
+    SOURCE_DIR        "${GOOGLETEST_PATH}" # use existing src dir.
+    BINARY_DIR        "${CMAKE_BINARY_DIR}/build"
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    INSTALL_COMMAND   ""
+    TEST_COMMAND      ""
+  )
+else()
+  if(NOT ALLOW_DOWNLOADING_GOOGLETEST)
+    message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_USE_BUNDLED_GTEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
+    return()
+  else()
+    message(STATUS "Did not find Google Test sources! Fetching from web...")
+    ExternalProject_Add(
+      googletest
+      GIT_REPOSITORY    https://github.com/google/googletest.git
+      GIT_TAG           "v1.15.2"
+      GIT_SHALLOW       "ON"
+      PREFIX            "${CMAKE_BINARY_DIR}"
+      STAMP_DIR         "${CMAKE_BINARY_DIR}/stamp"
+      DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
+      SOURCE_DIR        "${CMAKE_BINARY_DIR}/src"
+      BINARY_DIR        "${CMAKE_BINARY_DIR}/build"
+      CONFIGURE_COMMAND ""
+      BUILD_COMMAND     ""
+      INSTALL_COMMAND   ""
+      TEST_COMMAND      ""
+    )
+  endif()
+endif()
+
+ExternalProject_Get_Property(googletest SOURCE_DIR BINARY_DIR)
+file(WRITE googletest-paths.cmake
+"set(GOOGLETEST_SOURCE_DIR \"${SOURCE_DIR}\")
+set(GOOGLETEST_BINARY_DIR \"${BINARY_DIR}\")
+")
--- a/third_party/benchmark/cmake/HandleGTest.cmake
+++ b/third_party/benchmark/cmake/HandleGTest.cmake
@@ -1,113 +0,0 @@
-
-include(split_list)
-
-macro(build_external_gtest)
-  include(ExternalProject)
-  set(GTEST_FLAGS "")
-  if (BENCHMARK_USE_LIBCXX)
-    if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-      list(APPEND GTEST_FLAGS -stdlib=libc++)
-    else()
-      message(WARNING "Unsupported compiler (${CMAKE_CXX_COMPILER}) when using libc++")
-    endif()
-  endif()
-  if (BENCHMARK_BUILD_32_BITS)
-    list(APPEND GTEST_FLAGS -m32)
-  endif()
-  if (NOT "${CMAKE_CXX_FLAGS}" STREQUAL "")
-    list(APPEND GTEST_FLAGS ${CMAKE_CXX_FLAGS})
-  endif()
-  string(TOUPPER "${CMAKE_BUILD_TYPE}" GTEST_BUILD_TYPE)
-  if ("${GTEST_BUILD_TYPE}" STREQUAL "COVERAGE")
-    set(GTEST_BUILD_TYPE "DEBUG")
-  endif()
-  # FIXME: Since 10/Feb/2017 the googletest trunk has had a bug where
-  # -Werror=unused-function fires during the build on OS X. This is a temporary
-  # workaround to keep our travis bots from failing. It should be removed
-  # once gtest is fixed.
-  if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-    list(APPEND GTEST_FLAGS "-Wno-unused-function")
-  endif()
-  split_list(GTEST_FLAGS)
-  set(EXCLUDE_FROM_ALL_OPT "")
-  set(EXCLUDE_FROM_ALL_VALUE "")
-  if (${CMAKE_VERSION} VERSION_GREATER "3.0.99")
-      set(EXCLUDE_FROM_ALL_OPT "EXCLUDE_FROM_ALL")
-      set(EXCLUDE_FROM_ALL_VALUE "ON")
-  endif()
-  ExternalProject_Add(googletest
-      ${EXCLUDE_FROM_ALL_OPT} ${EXCLUDE_FROM_ALL_VALUE}
-      GIT_REPOSITORY https://github.com/google/googletest.git
-      GIT_TAG master
-      PREFIX "${CMAKE_BINARY_DIR}/googletest"
-      INSTALL_DIR "${CMAKE_BINARY_DIR}/googletest"
-      CMAKE_CACHE_ARGS
-        -DCMAKE_BUILD_TYPE:STRING=${GTEST_BUILD_TYPE}
-        -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-        -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
-        -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
-        -DCMAKE_INSTALL_LIBDIR:PATH=<INSTALL_DIR>/lib
-        -DCMAKE_CXX_FLAGS:STRING=${GTEST_FLAGS}
-        -Dgtest_force_shared_crt:BOOL=ON
-      )
-
-  ExternalProject_Get_Property(googletest install_dir)
-  set(GTEST_INCLUDE_DIRS ${install_dir}/include)
-  file(MAKE_DIRECTORY ${GTEST_INCLUDE_DIRS})
-
-  set(LIB_SUFFIX "${CMAKE_STATIC_LIBRARY_SUFFIX}")
-  set(LIB_PREFIX "${CMAKE_STATIC_LIBRARY_PREFIX}")
-  if("${GTEST_BUILD_TYPE}" STREQUAL "DEBUG")
-    set(LIB_SUFFIX "d${CMAKE_STATIC_LIBRARY_SUFFIX}")
-  endif()
-
-  # Use gmock_main instead of gtest_main because it initializes gtest as well.
-  # Note: The libraries are listed in reverse order of their dependancies.
-  foreach(LIB gtest gmock gmock_main)
-    add_library(${LIB} UNKNOWN IMPORTED)
-    set_target_properties(${LIB} PROPERTIES
-      IMPORTED_LOCATION ${install_dir}/lib/${LIB_PREFIX}${LIB}${LIB_SUFFIX}
-      INTERFACE_INCLUDE_DIRECTORIES ${GTEST_INCLUDE_DIRS}
-      INTERFACE_LINK_LIBRARIES "${GTEST_BOTH_LIBRARIES}"
-    )
-    add_dependencies(${LIB} googletest)
-    list(APPEND GTEST_BOTH_LIBRARIES ${LIB})
-  endforeach()
-endmacro(build_external_gtest)
-
-if (BENCHMARK_ENABLE_GTEST_TESTS)
-  if (IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/googletest)
-    set(GTEST_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/googletest")
-    set(INSTALL_GTEST OFF CACHE INTERNAL "")
-    set(INSTALL_GMOCK OFF CACHE INTERNAL "")
-    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/googletest)
-    set(GTEST_BOTH_LIBRARIES gtest gmock gmock_main)
-    foreach(HEADER test mock)
-      # CMake 2.8 and older don't respect INTERFACE_INCLUDE_DIRECTORIES, so we
-      # have to add the paths ourselves.
-      set(HFILE g${HEADER}/g${HEADER}.h)
-      set(HPATH ${GTEST_ROOT}/google${HEADER}/include)
-      find_path(HEADER_PATH_${HEADER} ${HFILE}
-          NO_DEFAULT_PATHS
-          HINTS ${HPATH}
-      )
-      if (NOT HEADER_PATH_${HEADER})
-        message(FATAL_ERROR "Failed to find header ${HFILE} in ${HPATH}")
-      endif()
-      list(APPEND GTEST_INCLUDE_DIRS ${HEADER_PATH_${HEADER}})
-    endforeach()
-  elseif(BENCHMARK_DOWNLOAD_DEPENDENCIES)
-    build_external_gtest()
-  else()
-    find_package(GTest REQUIRED)
-    find_path(GMOCK_INCLUDE_DIRS gmock/gmock.h
-        HINTS ${GTEST_INCLUDE_DIRS})
-    if (NOT GMOCK_INCLUDE_DIRS)
-      message(FATAL_ERROR "Failed to find header gmock/gmock.h with hint ${GTEST_INCLUDE_DIRS}")
-    endif()
-    set(GTEST_INCLUDE_DIRS ${GTEST_INCLUDE_DIRS} ${GMOCK_INCLUDE_DIRS})
-    # FIXME: We don't currently require the gmock library to build the tests,
-    # and it's likely we won't find it, so we don't try. As long as we've
-    # found the gmock/gmock.h header and gtest_main that should be good enough.
-  endif()
-endif()
--- a/third_party/benchmark/cmake/benchmark.pc.in
+++ b/third_party/benchmark/cmake/benchmark.pc.in
@@ -1,11 +1,12 @@
 prefix=@CMAKE_INSTALL_PREFIX@
 exec_prefix=${prefix}
-libdir=${prefix}/lib
-includedir=${prefix}/include
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@

 Name: @PROJECT_NAME@
 Description: Google microbenchmark framework
-Version: @VERSION@
+Version: @NORMALIZED_VERSION@

 Libs: -L${libdir} -lbenchmark
+Libs.private: -lpthread @BENCHMARK_PRIVATE_LINK_LIBRARIES@
 Cflags: -I${includedir}
--- a/third_party/benchmark/cmake/benchmark_main.pc.in
+++ b/third_party/benchmark/cmake/benchmark_main.pc.in
@@ -0,0 +1,7 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+
+Name: @PROJECT_NAME@
+Description: Google microbenchmark framework (with main() function)
+Version: @NORMALIZED_VERSION@
+Requires: benchmark
+Libs: -L${libdir} -lbenchmark_main
--- a/third_party/benchmark/cmake/pthread_affinity.cpp
+++ b/third_party/benchmark/cmake/pthread_affinity.cpp
@@ -0,0 +1,16 @@
+#include <pthread.h>
+int main() {
+  cpu_set_t set;
+  CPU_ZERO(&set);
+  for (int i = 0; i < CPU_SETSIZE; ++i) {
+    CPU_SET(i, &set);
+    CPU_CLR(i, &set);
+  }
+  pthread_t self = pthread_self();
+  int ret;
+  ret = pthread_getaffinity_np(self, sizeof(set), &set);
+  if (ret != 0) return ret;
+  ret = pthread_setaffinity_np(self, sizeof(set), &set);
+  if (ret != 0) return ret;
+  return 0;
+}
--- a/third_party/benchmark/docs/AssemblyTests.md
+++ b/third_party/benchmark/docs/AssemblyTests.md
@@ -111,6 +111,7 @@ between compilers or compiler versions. A common example of this
 is matching stack frame addresses. In this case regular expressions
 can be used to match the differing bits of output. For example:

+<!-- {% raw %} -->
 ```c++
 int ExternInt;
 struct Point { int x, y, z; };
@@ -127,6 +128,7 @@ extern "C" void test_store_point() {
    // CHECK: ret
 }
 ```
+<!-- {% endraw %} -->

 ## Current Requirements and Limitations

--- a/third_party/benchmark/docs/_config.yml
+++ b/third_party/benchmark/docs/_config.yml
@@ -0,0 +1,3 @@
+theme: jekyll-theme-minimal
+logo: /assets/images/icon_black.png
+show_downloads: true
--- a/third_party/benchmark/docs/assets/images/icon.png
+++ b/third_party/benchmark/docs/assets/images/icon.png
--- a/third_party/benchmark/docs/assets/images/icon.xcf
+++ b/third_party/benchmark/docs/assets/images/icon.xcf
--- a/third_party/benchmark/docs/assets/images/icon_black.png
+++ b/third_party/benchmark/docs/assets/images/icon_black.png
--- a/third_party/benchmark/docs/assets/images/icon_black.xcf
+++ b/third_party/benchmark/docs/assets/images/icon_black.xcf
--- a/third_party/benchmark/docs/dependencies.md
+++ b/third_party/benchmark/docs/dependencies.md
@@ -0,0 +1,19 @@
+# Build tool dependency policy
+
+We follow the [Foundational C++ support policy](https://opensource.google/documentation/policies/cplusplus-support) for our build tools. In
+particular the ["Build Systems" section](https://opensource.google/documentation/policies/cplusplus-support#build-systems).
+
+## CMake
+
+The current supported version is CMake 3.10 as of 2023-08-10. Most modern
+distributions include newer versions, for example:
+
+* Ubuntu 20.04 provides CMake 3.16.3
+* Debian 11.4 provides CMake 3.18.4
+* Ubuntu 22.04 provides CMake 3.22.1
+
+## Python
+
+The Python bindings require Python 3.10+ as of v1.9.0 (2024-08-16) for installation from PyPI.
+Building from source for older versions probably still works, though. See the [user guide](python_bindings.md) for details on how to build from source.
+The minimum theoretically supported version is Python 3.8, since the used bindings generator (nanobind) only supports Python 3.8+.
--- a/third_party/benchmark/docs/index.md
+++ b/third_party/benchmark/docs/index.md
@@ -0,0 +1,12 @@
+# Benchmark
+
+* [Assembly Tests](AssemblyTests.md)
+* [Dependencies](dependencies.md)
+* [Perf Counters](perf_counters.md)
+* [Platform Specific Build Instructions](platform_specific_build_instructions.md)
+* [Python Bindings](python_bindings.md)
+* [Random Interleaving](random_interleaving.md)
+* [Reducing Variance](reducing_variance.md)
+* [Releasing](releasing.md)
+* [Tools](tools.md)
+* [User Guide](user_guide.md)
--- a/third_party/benchmark/docs/perf_counters.md
+++ b/third_party/benchmark/docs/perf_counters.md
@@ -0,0 +1,35 @@
+<a name="perf-counters" />
+
+# User-Requested Performance Counters
+
+When running benchmarks, the user may choose to request collection of
+performance counters. This may be useful in investigation scenarios - narrowing
+down the cause of a regression; or verifying that the underlying cause of a
+performance improvement matches expectations.
+
+This feature is available if:
+
+* The benchmark is run on an architecture featuring a Performance Monitoring
+  Unit (PMU),
+* The benchmark is compiled with support for collecting counters. Currently,
+  this requires [libpfm](http://perfmon2.sourceforge.net/), which is built as a
+  dependency via Bazel.
+
+The feature does not require modifying benchmark code. Counter collection is
+handled at the boundaries where timer collection is also handled. 
+
+To opt-in:
+* If using a Bazel build, add `--define pfm=1` to your build flags
+* If using CMake:
+  * Install `libpfm4-dev`, e.g. `apt-get install libpfm4-dev`.
+  * Enable the CMake flag `BENCHMARK_ENABLE_LIBPFM` in `CMakeLists.txt`.
+
+To use, pass a comma-separated list of counter names through the
+`--benchmark_perf_counters` flag. The names are decoded through libpfm - meaning,
+they are platform specific, but some (e.g. `CYCLES` or `INSTRUCTIONS`) are
+mapped by libpfm to platform-specifics - see libpfm
+[documentation](http://perfmon2.sourceforge.net/docs.html) for more details.
+
+The counter values are reported back through the [User Counters](../README.md#custom-counters)
+mechanism, meaning, they are available in all the formats (e.g. JSON) supported
+by User Counters.
--- a/third_party/benchmark/docs/platform_specific_build_instructions.md
+++ b/third_party/benchmark/docs/platform_specific_build_instructions.md
@@ -0,0 +1,52 @@
+# Platform Specific Build Instructions
+
+## Building with GCC
+
+When the library is built using GCC it is necessary to link with the pthread
+library due to how GCC implements `std::thread`. Failing to link to pthread will
+lead to runtime exceptions (unless you're using libc++), not linker errors. See
+[issue #67](https://github.com/google/benchmark/issues/67) for more details. You
+can link to pthread by adding `-pthread` to your linker command. Note, you can
+also use `-lpthread`, but there are potential issues with ordering of command
+line parameters if you use that.
+
+On QNX, the pthread library is part of libc and usually included automatically
+(see
+[`pthread_create()`](https://www.qnx.com/developers/docs/7.1/index.html#com.qnx.doc.neutrino.lib_ref/topic/p/pthread_create.html)).
+There's no separate pthread library to link.
+
+## Building with Visual Studio 2015, 2017 or 2022
+
+The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following:
+
+```
+// Alternatively, can add libraries using linker options.
+
+// First, Add the path to the generated library files (directory containing the `benchmark.lib`) in `[Configuration Properties > Linker > General > Additional Library Directories]`. Then do the following:
+#ifdef _WIN32
+#pragma comment ( lib, "Shlwapi.lib" )
+#ifdef _DEBUG
+#pragma comment ( lib, "benchmark.lib" )
+#else
+#pragma comment ( lib, "benchmark.lib" )
+#endif
+#endif
+```
+
+When using the static library, make sure to add `BENCHMARK_STATIC_DEFINE` under `[Configuration Properties > C/C++ > Preprocessor > Preprocessor Definitions]`
+
+Can also use the graphical version of CMake:
+* Open `CMake GUI`.
+* Under `Where to build the binaries`, same path as source plus `build`.
+* Under `CMAKE_INSTALL_PREFIX`, same path as source plus `install`.
+* Click `Configure`, `Generate`, `Open Project`.
+* If build fails, try deleting entire directory and starting again, or unticking options to build less.
+
+## Building with Intel 2015 Update 1 or Intel System Studio Update 4
+
+See instructions for building with Visual Studio. Once built, right click on the solution and change the build to Intel.
+
+## Building on Solaris
+
+If you're running benchmarks on solaris, you'll want the kstat library linked in
+too (`-lkstat`).
--- a/third_party/benchmark/docs/python_bindings.md
+++ b/third_party/benchmark/docs/python_bindings.md
@@ -0,0 +1,34 @@
+# Building and installing Python bindings
+
+Python bindings are available as wheels on [PyPI](https://pypi.org/project/google-benchmark/) for importing and 
+using Google Benchmark directly in Python. 
+Currently, pre-built wheels exist for macOS (both ARM64 and Intel x86), Linux x86-64 and 64-bit Windows.
+Supported Python versions are Python 3.8 - 3.12.
+
+To install Google Benchmark's Python bindings, run:
+
+```bash
+python -m pip install --upgrade pip  # for manylinux2014 support
+python -m pip install google-benchmark
+```
+
+In order to keep your system Python interpreter clean, it is advisable to run these commands in a virtual
+environment. See the [official Python documentation](https://docs.python.org/3/library/venv.html) 
+on how to create virtual environments.
+
+To build a wheel directly from source, you can follow these steps:
+```bash
+git clone https://github.com/google/benchmark.git
+cd benchmark
+# create a virtual environment and activate it
+python3 -m venv venv --system-site-packages
+source venv/bin/activate  # .\venv\Scripts\Activate.ps1 on Windows
+
+# upgrade Python's system-wide packages
+python -m pip install --upgrade pip build
+# builds the wheel and stores it in the directory "dist".
+python -m build
+```
+
+NB: Building wheels from source requires Bazel. For platform-specific instructions on how to install Bazel,
+refer to the [Bazel installation docs](https://bazel.build/install).
--- a/third_party/benchmark/docs/random_interleaving.md
+++ b/third_party/benchmark/docs/random_interleaving.md
@@ -0,0 +1,13 @@
+<a name="interleaving" />
+
+# Random Interleaving
+
+[Random Interleaving](https://github.com/google/benchmark/issues/1051) is a
+technique to lower run-to-run variance. It randomly interleaves repetitions of a
+microbenchmark with repetitions from other microbenchmarks in the same benchmark
+test. Data shows it is able to lower run-to-run variance by
+[40%](https://github.com/google/benchmark/issues/1051) on average.
+
+To use, you mainly need to set `--benchmark_enable_random_interleaving=true`,
+and optionally specify non-zero repetition count `--benchmark_repetitions=9`
+and optionally decrease the per-repetition time `--benchmark_min_time=0.1`.
--- a/third_party/benchmark/docs/reducing_variance.md
+++ b/third_party/benchmark/docs/reducing_variance.md
@@ -0,0 +1,133 @@
+# Reducing Variance
+
+<a name="disabling-cpu-frequency-scaling" />
+
+## Disabling CPU Frequency Scaling
+
+If you see this error:
+
+```
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+```
+
+you might want to disable the CPU frequency scaling while running the
+benchmark, as well as consider other ways to stabilize the performance of
+your system while benchmarking.
+
+Exactly how to do this depends on the Linux distribution,
+desktop environment, and installed programs.  Specific details are a moving
+target, so we will not attempt to exhaustively document them here.
+
+One simple option is to use the `cpupower` program to change the
+performance governor to "performance".  This tool is maintained along with
+the Linux kernel and provided by your distribution.
+
+It must be run as root, like this:
+
+```bash
+sudo cpupower frequency-set --governor performance
+```
+
+After this you can verify that all CPUs are using the performance governor
+by running this command:
+
+```bash
+cpupower frequency-info -o proc
+```
+
+The benchmarks you subsequently run will have less variance.
+
+<a name="reducing-variance" />
+
+## Disabling ASLR
+
+If you see this error:
+
+```
+***WARNING*** ASLR is enabled, the results may have unreproducible noise in them.
+```
+
+you might want to disable the ASLR security hardening feature while running the
+benchmark.
+
+The simplest way is to add
+```
+benchmark::MaybeReenterWithoutASLR(argc, argv);
+```
+as the first line of your `main()` function. It will try to disable ASLR
+for the current processor, and, if successful, re-execute the binary.
+Note that `personality(2)` may be forbidden by e.g. seccomp (which happens
+by default if you are running in a Docker container).
+
+Note that if you link to `benchmark_main` already does that for you.
+
+To globally disable ASLR on Linux, run
+```
+echo 0 > /proc/sys/kernel/randomize_va_space
+```
+
+To run a single benchmark with ASLR disabled on Linux, do:
+```
+setarch `uname -m` -R ./a_benchmark
+```
+
+Note that for the information on how to disable ASLR on other operating systems,
+please refer to their documentation.
+
+## Reducing Variance in Benchmarks
+
+The Linux CPU frequency governor [discussed
+above](user_guide#disabling-cpu-frequency-scaling) is not the only source
+of noise in benchmarks.  Some, but not all, of the sources of variance
+include:
+
+1. On multi-core machines not all CPUs/CPU cores/CPU threads run the same
+   speed, so running a benchmark one time and then again may give a
+   different result depending on which CPU it ran on.
+2. CPU scaling features that run on the CPU, like Intel's Turbo Boost and
+   AMD Turbo Core and Precision Boost, can temporarily change the CPU
+   frequency even when the using the "performance" governor on Linux.
+3. Context switching between CPUs, or scheduling competition on the CPU the
+   benchmark is running on.
+4. Intel Hyperthreading or AMD SMT causing the same issue as above.
+5. Cache effects caused by code running on other CPUs.
+6. Non-uniform memory architectures (NUMA).
+
+These can cause variance in benchmarks results within a single run
+(`--benchmark_repetitions=N`) or across multiple runs of the benchmark
+program.
+
+Reducing sources of variance is OS and architecture dependent, which is one
+reason some companies maintain machines dedicated to performance testing.
+
+Some of the easier and effective ways of reducing variance on a typical
+Linux workstation are:
+
+1. Use the performance governor as [discussed
+above](user_guide#disabling-cpu-frequency-scaling).
+1. Disable processor boosting by:
+   ```sh
+   echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost
+   ```
+   See the Linux kernel's
+   [boost.txt](https://www.kernel.org/doc/Documentation/cpu-freq/boost.txt)
+   for more information.
+2. Set the benchmark program's task affinity to a fixed cpu.  For example:
+   ```sh
+   taskset -c 0 ./mybenchmark
+   ```
+3. Disabling Hyperthreading/SMT.  This can be done in the Bios or using the
+   `/sys` file system (see the LLVM project's [Benchmarking
+   tips](https://llvm.org/docs/Benchmarking.html)).
+4. Close other programs that do non-trivial things based on timers, such as
+   your web browser, desktop environment, etc.
+5. Reduce the working set of your benchmark to fit within the L1 cache, but
+   do be aware that this may lead you to optimize for an unrealistic
+   situation.
+
+Further resources on this topic:
+
+1. The LLVM project's [Benchmarking
+   tips](https://llvm.org/docs/Benchmarking.html).
+1. The Arch Wiki [Cpu frequency
+scaling](https://wiki.archlinux.org/title/CPU_frequency_scaling) page.
--- a/third_party/benchmark/docs/releasing.md
+++ b/third_party/benchmark/docs/releasing.md
@@ -0,0 +1,38 @@
+# How to release
+
+* Make sure you're on main and synced to HEAD
+* Ensure the project builds and tests run
+    * `parallel -j0 exec ::: test/*_test` can help ensure everything at least
+      passes
+* Prepare release notes
+    * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
+      commits between the last annotated tag and HEAD
+    * Pick the most interesting.
+* Create one last commit that updates the version saved in `CMakeLists.txt`, `MODULE.bazel`,
+  and `bindings/python/google_benchmark/__init__.py` to the release version you're creating.
+  (This version will be used if benchmark is installed from the archive you'll be creating
+  in the next step.)
+
+```
+# CMakeLists.txt
+project (benchmark VERSION 1.9.0 LANGUAGES CXX)
+```
+
+```
+# MODULE.bazel
+module(name = "com_github_google_benchmark", version="1.9.0")
+```
+
+```
+# google_benchmark/__init__.py
+__version__ = "1.9.0"
+```
+
+* Create a release through github's interface
+    * Note this will create a lightweight tag.
+    * Update this to an annotated tag:
+      * `git pull --tags`
+      * `git tag -a -f <tag> <tag>`
+      * `git push --force --tags origin`
+* Confirm that the "Build and upload Python wheels" action runs to completion
+    * Run it manually if it hasn't run.
--- a/third_party/benchmark/docs/tools.md
+++ b/third_party/benchmark/docs/tools.md
@@ -4,7 +4,11 @@

 The `compare.py` can be used to compare the result of benchmarks.

-**NOTE**: the utility relies on the scipy package which can be installed using [these instructions](https://www.scipy.org/install.html).
+### Dependencies
+The utility relies on the [scipy](https://www.scipy.org) package which can be installed using pip:
+```bash
+pip3 install -r requirements.txt
+```

 ### Displaying aggregates only

@@ -182,6 +186,146 @@ Benchmark                               Time             CPU      Time Old
 This is a mix of the previous two modes, two (potentially different) benchmark binaries are run, and a different filter is applied to each one.
 As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.

+### Note: Interpreting the output
+
+Performance measurements are an art, and performance comparisons are doubly so.
+Results are often noisy and don't necessarily have large absolute differences to
+them, so just by visual inspection, it is not at all apparent if two
+measurements are actually showing a performance change or not. It is even more
+confusing with multiple benchmark repetitions.
+
+Thankfully, what we can do, is use statistical tests on the results to determine
+whether the performance has statistically-significantly changed. `compare.py`
+uses [Mann–Whitney U
+test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test), with a null
+hypothesis being that there's no difference in performance.
+ 
+**The below output is a summary of a benchmark comparison with statistics
+provided for a multi-threaded process.**
+```
+Benchmark                                               Time        CPU    Time Old      Time New       CPU Old       CPU New
+-----------------------------------------------------------------------------------------------------------------------------
+benchmark/threads:1/process_time/real_time_pvalue     0.0000     0.0000    U Test, Repetitions: 27 vs 27
+benchmark/threads:1/process_time/real_time_mean      -0.1442    -0.1442          90            77            90            77
+benchmark/threads:1/process_time/real_time_median    -0.1444    -0.1444          90            77            90            77
+benchmark/threads:1/process_time/real_time_stddev    +0.3974    +0.3933           0             0             0             0
+benchmark/threads:1/process_time/real_time_cv        +0.6329    +0.6280           0             0             0             0
+OVERALL_GEOMEAN                                      -0.1442    -0.1442           0             0             0             0
+```
+--------------------------------------------
+Here's a breakdown of each row:
+
+**benchmark/threads:1/process_time/real_time_pvalue**: This shows the _p-value_ for
+the statistical test comparing the performance of the process running with one
+thread. A value of 0.0000 suggests a statistically significant difference in
+performance. The comparison was conducted using the U Test (Mann-Whitney
+U Test) with 27 repetitions for each case.
+
+**benchmark/threads:1/process_time/real_time_mean**: This shows the relative
+difference in mean execution time between two different cases. The negative
+value (-0.1442) implies that the new process is faster by about 14.42%. The old
+time was 90 units, while the new time is 77 units.
+
+**benchmark/threads:1/process_time/real_time_median**: Similarly, this shows the
+relative difference in the median execution time. Again, the new process is
+faster by 14.44%.
+
+**benchmark/threads:1/process_time/real_time_stddev**: This is the relative
+difference in the standard deviation of the execution time, which is a measure
+of how much variation or dispersion there is from the mean. A positive value
+(+0.3974) implies there is more variance in the execution time in the new
+process.
+
+**benchmark/threads:1/process_time/real_time_cv**: CV stands for Coefficient of
+Variation. It is the ratio of the standard deviation to the mean. It provides a
+standardized measure of dispersion. An increase (+0.6329) indicates more
+relative variability in the new process.
+
+**OVERALL_GEOMEAN**: Geomean stands for geometric mean, a type of average that is
+less influenced by outliers. The negative value indicates a general improvement
+in the new process. However, given the values are all zero for the old and new
+times, this seems to be a mistake or placeholder in the output.
+
+-----------------------------------------
+
+
+
+Let's first try to see what the different columns represent in the above
+`compare.py` benchmarking output:
+
+  1. **Benchmark:** The name of the function being benchmarked, along with the
+     size of the input (after the slash).
+
+  2. **Time:** The average time per operation, across all iterations.
+
+  3. **CPU:** The average CPU time per operation, across all iterations.
+
+  4. **Iterations:** The number of iterations the benchmark was run to get a
+     stable estimate.
+
+  5. **Time Old and Time New:** These represent the average time it takes for a
+     function to run in two different scenarios or versions. For example, you
+     might be comparing how fast a function runs before and after you make some
+     changes to it.
+
+  6. **CPU Old and CPU New:** These show the average amount of CPU time that the
+     function uses in two different scenarios or versions. This is similar to
+     Time Old and Time New, but focuses on CPU usage instead of overall time.
+
+In the comparison section, the relative differences in both time and CPU time
+are displayed for each input size.
+
+
+A statistically-significant difference is determined by a **p-value**, which is
+a measure of the probability that the observed difference could have occurred
+just by random chance. A smaller p-value indicates stronger evidence against the
+null hypothesis. 
+
+**Therefore:**
+  1. If the p-value is less than the chosen significance level (alpha), we
+     reject the null hypothesis and conclude the benchmarks are significantly
+     different.
+  2. If the p-value is greater than or equal to alpha, we fail to reject the
+     null hypothesis and treat the two benchmarks as similar.
+
+
+
+The result of said the statistical test is additionally communicated through color coding:
+```diff
+ Green:
+```
+  The benchmarks are _**statistically different**_. This could mean the
+  performance has either **significantly improved** or **significantly
+  deteriorated**. You should look at the actual performance numbers to see which
+  is the case.
+```diff
+- Red:
+```
+  The benchmarks are _**statistically similar**_. This means the performance
+  **hasn't significantly changed**.
+
+In statistical terms, **'green'** means we reject the null hypothesis that
+there's no difference in performance, and **'red'** means we fail to reject the
+null hypothesis. This might seem counter-intuitive if you're expecting 'green'
+to mean 'improved performance' and 'red' to mean 'worsened performance'. 
+```bash
+  But remember, in this context:
+
+    'Success' means 'successfully finding a difference'.
+    'Failure' means 'failing to find a difference'.
+```
+
+
+Also, please note that **even if** we determine that there **is** a
+statistically-significant difference between the two measurements, it does not
+_necessarily_ mean that the actual benchmarks that were measured **are**
+different, or vice versa, even if we determine that there is **no**
+statistically-significant difference between the two measurements, it does not
+necessarily mean that the actual benchmarks that were measured **are not**
+different.
+
+
+
 ### U test

 If there is a sufficient repetition count of the benchmarks, the tool can do
--- a/third_party/benchmark/docs/user_guide.md
+++ b/third_party/benchmark/docs/user_guide.md
--- a/third_party/benchmark/include/benchmark/benchmark.h
+++ b/third_party/benchmark/include/benchmark/benchmark.h
--- a/third_party/benchmark/include/benchmark/export.h
+++ b/third_party/benchmark/include/benchmark/export.h
@@ -0,0 +1,47 @@
+#ifndef BENCHMARK_EXPORT_H
+#define BENCHMARK_EXPORT_H
+
+#if defined(_WIN32)
+#define EXPORT_ATTR __declspec(dllexport)
+#define IMPORT_ATTR __declspec(dllimport)
+#define NO_EXPORT_ATTR
+#define DEPRECATED_ATTR __declspec(deprecated)
+#else  // _WIN32
+#define EXPORT_ATTR __attribute__((visibility("default")))
+#define IMPORT_ATTR __attribute__((visibility("default")))
+#define NO_EXPORT_ATTR __attribute__((visibility("hidden")))
+#define DEPRECATE_ATTR __attribute__((__deprecated__))
+#endif  // _WIN32
+
+#ifdef BENCHMARK_STATIC_DEFINE
+#define BENCHMARK_EXPORT
+#define BENCHMARK_NO_EXPORT
+#else  // BENCHMARK_STATIC_DEFINE
+#ifndef BENCHMARK_EXPORT
+#ifdef benchmark_EXPORTS
+/* We are building this library */
+#define BENCHMARK_EXPORT EXPORT_ATTR
+#else  // benchmark_EXPORTS
+/* We are using this library */
+#define BENCHMARK_EXPORT IMPORT_ATTR
+#endif  // benchmark_EXPORTS
+#endif  // !BENCHMARK_EXPORT
+
+#ifndef BENCHMARK_NO_EXPORT
+#define BENCHMARK_NO_EXPORT NO_EXPORT_ATTR
+#endif  // !BENCHMARK_NO_EXPORT
+#endif  // BENCHMARK_STATIC_DEFINE
+
+#ifndef BENCHMARK_DEPRECATED
+#define BENCHMARK_DEPRECATED DEPRECATE_ATTR
+#endif  // BENCHMARK_DEPRECATED
+
+#ifndef BENCHMARK_DEPRECATED_EXPORT
+#define BENCHMARK_DEPRECATED_EXPORT BENCHMARK_EXPORT BENCHMARK_DEPRECATED
+#endif  // BENCHMARK_DEPRECATED_EXPORT
+
+#ifndef BENCHMARK_DEPRECATED_NO_EXPORT
+#define BENCHMARK_DEPRECATED_NO_EXPORT BENCHMARK_NO_EXPORT BENCHMARK_DEPRECATED
+#endif  // BENCHMARK_DEPRECATED_EXPORT
+
+#endif /* BENCHMARK_EXPORT_H */
--- a/third_party/benchmark/mingw.py
+++ b/third_party/benchmark/mingw.py
@@ -1,320 +0,0 @@
-#! /usr/bin/env python
-# encoding: utf-8
-
-import argparse
-import errno
-import logging
-import os
-import platform
-import re
-import sys
-import subprocess
-import tempfile
-
-try:
-    import winreg
-except ImportError:
-    import _winreg as winreg
-try:
-    import urllib.request as request
-except ImportError:
-    import urllib as request
-try:
-    import urllib.parse as parse
-except ImportError:
-    import urlparse as parse
-
-class EmptyLogger(object):
-    '''
-    Provides an implementation that performs no logging
-    '''
-    def debug(self, *k, **kw):
-        pass
-    def info(self, *k, **kw):
-        pass
-    def warn(self, *k, **kw):
-        pass
-    def error(self, *k, **kw):
-        pass
-    def critical(self, *k, **kw):
-        pass
-    def setLevel(self, *k, **kw):
-        pass
-
-urls = (
-    'http://downloads.sourceforge.net/project/mingw-w64/Toolchains%20'
-        'targetting%20Win32/Personal%20Builds/mingw-builds/installer/'
-        'repository.txt',
-    'http://downloads.sourceforge.net/project/mingwbuilds/host-windows/'
-        'repository.txt'
-)
-'''
-A list of mingw-build repositories
-'''
-
-def repository(urls = urls, log = EmptyLogger()):
-    '''
-    Downloads and parse mingw-build repository files and parses them
-    '''
-    log.info('getting mingw-builds repository')
-    versions = {}
-    re_sourceforge = re.compile(r'http://sourceforge.net/projects/([^/]+)/files')
-    re_sub = r'http://downloads.sourceforge.net/project/\1'
-    for url in urls:
-        log.debug(' - requesting: %s', url)
-        socket = request.urlopen(url)
-        repo = socket.read()
-        if not isinstance(repo, str):
-            repo = repo.decode();
-        socket.close()
-        for entry in repo.split('\n')[:-1]:
-            value = entry.split('|')
-            version = tuple([int(n) for n in value[0].strip().split('.')])
-            version = versions.setdefault(version, {})
-            arch = value[1].strip()
-            if arch == 'x32':
-                arch = 'i686'
-            elif arch == 'x64':
-                arch = 'x86_64'
-            arch = version.setdefault(arch, {})
-            threading = arch.setdefault(value[2].strip(), {})
-            exceptions = threading.setdefault(value[3].strip(), {})
-            revision = exceptions.setdefault(int(value[4].strip()[3:]),
-                re_sourceforge.sub(re_sub, value[5].strip()))
-    return versions
-
-def find_in_path(file, path=None):
-    '''
-    Attempts to find an executable in the path
-    '''
-    if platform.system() == 'Windows':
-        file += '.exe'
-    if path is None:
-        path = os.environ.get('PATH', '')
-    if type(path) is type(''):
-        path = path.split(os.pathsep)
-    return list(filter(os.path.exists,
-        map(lambda dir, file=file: os.path.join(dir, file), path)))
-
-def find_7zip(log = EmptyLogger()):
-    '''
-    Attempts to find 7zip for unpacking the mingw-build archives
-    '''
-    log.info('finding 7zip')
-    path = find_in_path('7z')
-    if not path:
-        key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\7-Zip')
-        path, _ = winreg.QueryValueEx(key, 'Path')
-        path = [os.path.join(path, '7z.exe')]
-    log.debug('found \'%s\'', path[0])
-    return path[0]
-
-find_7zip()
-
-def unpack(archive, location, log = EmptyLogger()):
-    '''
-    Unpacks a mingw-builds archive
-    '''
-    sevenzip = find_7zip(log)
-    log.info('unpacking %s', os.path.basename(archive))
-    cmd = [sevenzip, 'x', archive, '-o' + location, '-y']
-    log.debug(' - %r', cmd)
-    with open(os.devnull, 'w') as devnull:
-        subprocess.check_call(cmd, stdout = devnull)
-
-def download(url, location, log = EmptyLogger()):
-    '''
-    Downloads and unpacks a mingw-builds archive
-    '''
-    log.info('downloading MinGW')
-    log.debug(' - url: %s', url)
-    log.debug(' - location: %s', location)
-
-    re_content = re.compile(r'attachment;[ \t]*filename=(")?([^"]*)(")?[\r\n]*')
-
-    stream = request.urlopen(url)
-    try:
-        content = stream.getheader('Content-Disposition') or ''
-    except AttributeError:
-        content = stream.headers.getheader('Content-Disposition') or ''
-    matches = re_content.match(content)
-    if matches:
-        filename = matches.group(2)
-    else:
-        parsed = parse.urlparse(stream.geturl())
-        filename = os.path.basename(parsed.path)
-
-    try:
-        os.makedirs(location)
-    except OSError as e:
-        if e.errno == errno.EEXIST and os.path.isdir(location):
-            pass
-        else:
-            raise
-
-    archive = os.path.join(location, filename)
-    with open(archive, 'wb') as out:
-        while True:
-            buf = stream.read(1024)
-            if not buf:
-                break
-            out.write(buf)
-    unpack(archive, location, log = log)
-    os.remove(archive)
-
-    possible = os.path.join(location, 'mingw64')
-    if not os.path.exists(possible):
-        possible = os.path.join(location, 'mingw32')
-        if not os.path.exists(possible):
-            raise ValueError('Failed to find unpacked MinGW: ' + possible)
-    return possible
-
-def root(location = None, arch = None, version = None, threading = None,
-        exceptions = None, revision = None, log = EmptyLogger()):
-    '''
-    Returns the root folder of a specific version of the mingw-builds variant
-    of gcc. Will download the compiler if needed
-    '''
-
-    # Get the repository if we don't have all the information
-    if not (arch and version and threading and exceptions and revision):
-        versions = repository(log = log)
-
-    # Determine some defaults
-    version = version or max(versions.keys())
-    if not arch:
-        arch = platform.machine().lower()
-        if arch == 'x86':
-            arch = 'i686'
-        elif arch == 'amd64':
-            arch = 'x86_64'
-    if not threading:
-        keys = versions[version][arch].keys()
-        if 'posix' in keys:
-            threading = 'posix'
-        elif 'win32' in keys:
-            threading = 'win32'
-        else:
-            threading = keys[0]
-    if not exceptions:
-        keys = versions[version][arch][threading].keys()
-        if 'seh' in keys:
-            exceptions = 'seh'
-        elif 'sjlj' in keys:
-            exceptions = 'sjlj'
-        else:
-            exceptions = keys[0]
-    if revision == None:
-        revision = max(versions[version][arch][threading][exceptions].keys())
-    if not location:
-        location = os.path.join(tempfile.gettempdir(), 'mingw-builds')
-
-    # Get the download url
-    url = versions[version][arch][threading][exceptions][revision]
-
-    # Tell the user whatzzup
-    log.info('finding MinGW %s', '.'.join(str(v) for v in version))
-    log.debug(' - arch: %s', arch)
-    log.debug(' - threading: %s', threading)
-    log.debug(' - exceptions: %s', exceptions)
-    log.debug(' - revision: %s', revision)
-    log.debug(' - url: %s', url)
-
-    # Store each specific revision differently
-    slug = '{version}-{arch}-{threading}-{exceptions}-rev{revision}'
-    slug = slug.format(
-        version = '.'.join(str(v) for v in version),
-        arch = arch,
-        threading = threading,
-        exceptions = exceptions,
-        revision = revision
-    )
-    if arch == 'x86_64':
-        root_dir = os.path.join(location, slug, 'mingw64')
-    elif arch == 'i686':
-        root_dir = os.path.join(location, slug, 'mingw32')
-    else:
-        raise ValueError('Unknown MinGW arch: ' + arch)
-
-    # Download if needed
-    if not os.path.exists(root_dir):
-        downloaded = download(url, os.path.join(location, slug), log = log)
-        if downloaded != root_dir:
-            raise ValueError('The location of mingw did not match\n%s\n%s'
-                % (downloaded, root_dir))
-
-    return root_dir
-
-def str2ver(string):
-    '''
-    Converts a version string into a tuple
-    '''
-    try:
-        version = tuple(int(v) for v in string.split('.'))
-        if len(version) is not 3:
-            raise ValueError()
-    except ValueError:
-        raise argparse.ArgumentTypeError(
-            'please provide a three digit version string')
-    return version
-
-def main():
-    '''
-    Invoked when the script is run directly by the python interpreter
-    '''
-    parser = argparse.ArgumentParser(
-        description = 'Downloads a specific version of MinGW',
-        formatter_class = argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument('--location',
-        help = 'the location to download the compiler to',
-        default = os.path.join(tempfile.gettempdir(), 'mingw-builds'))
-    parser.add_argument('--arch', required = True, choices = ['i686', 'x86_64'],
-        help = 'the target MinGW architecture string')
-    parser.add_argument('--version', type = str2ver,
-        help = 'the version of GCC to download')
-    parser.add_argument('--threading', choices = ['posix', 'win32'],
-        help = 'the threading type of the compiler')
-    parser.add_argument('--exceptions', choices = ['sjlj', 'seh', 'dwarf'],
-        help = 'the method to throw exceptions')
-    parser.add_argument('--revision', type=int,
-        help = 'the revision of the MinGW release')
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument('-v', '--verbose', action='store_true',
-        help='increase the script output verbosity')
-    group.add_argument('-q', '--quiet', action='store_true',
-        help='only print errors and warning')
-    args = parser.parse_args()
-
-    # Create the logger
-    logger = logging.getLogger('mingw')
-    handler = logging.StreamHandler()
-    formatter = logging.Formatter('%(message)s')
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-    logger.setLevel(logging.INFO)
-    if args.quiet:
-        logger.setLevel(logging.WARN)
-    if args.verbose:
-        logger.setLevel(logging.DEBUG)
-
-    # Get MinGW
-    root_dir = root(location = args.location, arch = args.arch,
-        version = args.version, threading = args.threading,
-        exceptions = args.exceptions, revision = args.revision,
-        log = logger)
-
-    sys.stdout.write('%s\n' % os.path.join(root_dir, 'bin'))
-
-if __name__ == '__main__':
-    try:
-        main()
-    except IOError as e:
-        sys.stderr.write('IO error: %s\n' % e)
-        sys.exit(1)
-    except OSError as e:
-        sys.stderr.write('OS error: %s\n' % e)
-        sys.exit(1)
-    except KeyboardInterrupt as e:
-        sys.stderr.write('Killed\n')
-        sys.exit(1)
--- a/third_party/benchmark/pyproject.toml
+++ b/third_party/benchmark/pyproject.toml
@@ -0,0 +1,78 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "google_benchmark"
+description = "A library to benchmark code snippets."
+requires-python = ">=3.10"
+license = { file = "LICENSE" }
+keywords = ["benchmark"]
+
+authors = [{ name = "Google", email = "benchmark-discuss@googlegroups.com" }]
+
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Software Development :: Testing",
+    "Topic :: System :: Benchmark",
+]
+
+dynamic = ["readme", "version"]
+
+dependencies = ["absl-py>=0.7.1"]
+
+[project.optional-dependencies]
+dev = ["pre-commit>=3.3.3"]
+
+[project.urls]
+Homepage = "https://github.com/google/benchmark"
+Documentation = "https://github.com/google/benchmark/tree/main/docs"
+Repository = "https://github.com/google/benchmark.git"
+Discord = "https://discord.gg/cz7UX7wKC2"
+
+[tool.setuptools]
+package-dir = { "" = "bindings/python" }
+zip-safe = false
+
+[tool.setuptools.packages.find]
+where = ["bindings/python"]
+
+[tool.setuptools.dynamic]
+readme = { file = "README.md", content-type = "text/markdown" }
+version = { attr = "google_benchmark.__version__" }
+
+[tool.mypy]
+check_untyped_defs = true
+disallow_incomplete_defs = true
+pretty = true
+python_version = "3.11"
+strict_optional = false
+warn_unreachable = true
+
+[[tool.mypy.overrides]]
+module = ["yaml"]
+ignore_missing_imports = true
+
+[tool.ruff]
+# explicitly tell ruff the source directory to correctly identify first-party package.
+src = ["bindings/python"]
+
+line-length = 80
+target-version = "py311"
+
+[tool.ruff.lint]
+# Enable pycodestyle (`E`, `W`), Pyflakes (`F`), and isort (`I`) codes by default.
+select = ["ASYNC", "B", "C4", "C90", "E", "F", "I", "PERF", "PIE", "PT018", "RUF", "SIM", "UP", "W"]
+ignore = [
+    "PLW2901",  # redefined-loop-name
+    "UP031",    # printf-string-formatting
+]
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
--- a/third_party/benchmark/releasing.md
+++ b/third_party/benchmark/releasing.md
@@ -1,16 +0,0 @@
-# How to release
-
-* Make sure you're on master and synced to HEAD
-* Ensure the project builds and tests run (sanity check only, obviously)
-    * `parallel -j0 exec ::: test/*_test` can help ensure everything at least
-      passes
-* Prepare release notes
-    * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
-      commits between the last annotated tag and HEAD
-    * Pick the most interesting.
-* Create a release through github's interface
-    * Note this will create a lightweight tag.
-    * Update this to an annotated tag:
-      * `git pull --tags`
-      * `git tag -a -f <tag> <tag>`
-      * `git push --force origin`
--- a/third_party/benchmark/setup.py
+++ b/third_party/benchmark/setup.py
@@ -0,0 +1,166 @@
+import contextlib
+import os
+import platform
+import re
+import shutil
+import sys
+from collections.abc import Generator
+from pathlib import Path
+from typing import Any
+
+import setuptools
+from setuptools.command import build_ext
+
+IS_WINDOWS = platform.system() == "Windows"
+IS_MAC = platform.system() == "Darwin"
+IS_LINUX = platform.system() == "Linux"
+
+# hardcoded SABI-related options. Requires that each Python interpreter
+# (hermetic or not) participating is of the same major-minor version.
+py_limited_api = sys.version_info >= (3, 12)
+options = {"bdist_wheel": {"py_limited_api": "cp312"}} if py_limited_api else {}
+
+
+def is_cibuildwheel() -> bool:
+    return os.getenv("CIBUILDWHEEL") is not None
+
+
+@contextlib.contextmanager
+def _maybe_patch_toolchains() -> Generator[None, None, None]:
+    """
+    Patch rules_python toolchains to ignore root user error
+    when run in a Docker container on Linux in cibuildwheel.
+    """
+
+    def fmt_toolchain_args(matchobj):
+        suffix = "ignore_root_user_error = True"
+        callargs = matchobj.group(1)
+        # toolchain def is broken over multiple lines
+        if callargs.endswith("\n"):
+            callargs = callargs + "    " + suffix + ",\n"
+        # toolchain def is on one line.
+        else:
+            callargs = callargs + ", " + suffix
+        return "python.toolchain(" + callargs + ")"
+
+    CIBW_LINUX = is_cibuildwheel() and IS_LINUX
+    module_bazel = Path("MODULE.bazel")
+    content: str = module_bazel.read_text()
+    try:
+        if CIBW_LINUX:
+            module_bazel.write_text(
+                re.sub(
+                    r"python.toolchain\(([\w\"\s,.=]*)\)",
+                    fmt_toolchain_args,
+                    content,
+                )
+            )
+        yield
+    finally:
+        if CIBW_LINUX:
+            module_bazel.write_text(content)
+
+
+class BazelExtension(setuptools.Extension):
+    """A C/C++ extension that is defined as a Bazel BUILD target."""
+
+    def __init__(self, name: str, bazel_target: str, **kwargs: Any):
+        super().__init__(name=name, sources=[], **kwargs)
+
+        self.bazel_target = bazel_target
+        stripped_target = bazel_target.split("//")[-1]
+        self.relpath, self.target_name = stripped_target.split(":")
+
+
+class BuildBazelExtension(build_ext.build_ext):
+    """A command that runs Bazel to build a C/C++ extension."""
+
+    def run(self):
+        for ext in self.extensions:
+            self.bazel_build(ext)
+        # explicitly call `bazel shutdown` for graceful exit
+        self.spawn(["bazel", "shutdown"])
+
+    def copy_extensions_to_source(self):
+        """
+        Copy generated extensions into the source tree.
+        This is done in the ``bazel_build`` method, so it's not necessary to
+        do again in the `build_ext` base class.
+        """
+
+    def bazel_build(self, ext: BazelExtension) -> None:  # noqa: C901
+        """Runs the bazel build to create the package."""
+        temp_path = Path(self.build_temp)
+
+        # We round to the minor version, which makes rules_python
+        # look up the latest available patch version internally.
+        python_version = "{}.{}".format(*sys.version_info[:2])
+
+        bazel_argv = [
+            "bazel",
+            "run",
+            ext.bazel_target,
+            f"--symlink_prefix={temp_path / 'bazel-'}",
+            f"--compilation_mode={'dbg' if self.debug else 'opt'}",
+            # C++17 is required by nanobind
+            f"--cxxopt={'/std:c++17' if IS_WINDOWS else '-std=c++17'}",
+            f"--@rules_python//python/config_settings:python_version={python_version}",
+        ]
+
+        if ext.py_limited_api:
+            bazel_argv += ["--@nanobind_bazel//:py-limited-api=cp312"]
+
+        if IS_WINDOWS:
+            # Link with python*.lib.
+            for library_dir in self.library_dirs:
+                bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
+        elif IS_MAC:
+            # C++17 needs macOS 10.14 at minimum
+            bazel_argv.append("--macos_minimum_os=10.14")
+
+        with _maybe_patch_toolchains():
+            self.spawn(bazel_argv)
+
+        if IS_WINDOWS:
+            suffix = ".pyd"
+        else:
+            suffix = ".abi3.so" if ext.py_limited_api else ".so"
+
+        # copy the Bazel build artifacts into setuptools' libdir,
+        # from where the wheel is built.
+        pkgname = "google_benchmark"
+        pythonroot = Path("bindings") / "python" / "google_benchmark"
+        srcdir = temp_path / "bazel-bin" / pythonroot
+        libdir = Path(self.build_lib) / pkgname
+        for root, dirs, files in os.walk(srcdir, topdown=True):
+            # exclude runfiles directories and children.
+            dirs[:] = [d for d in dirs if "runfiles" not in d]
+
+            for f in files:
+                fp = Path(f)
+                should_copy = False
+                # we do not want the bare .so file included
+                # when building for ABI3, so we require a
+                # full and exact match on the file extension.
+                if "".join(fp.suffixes) == suffix or fp.suffix == ".pyi":
+                    should_copy = True
+                elif Path(root) == srcdir and f == "py.typed":
+                    # copy py.typed, but only at the package root.
+                    should_copy = True
+
+                if should_copy:
+                    shutil.copyfile(root / fp, libdir / fp)
+
+
+setuptools.setup(
+    cmdclass={"build_ext": BuildBazelExtension},
+    package_data={"google_benchmark": ["py.typed", "*.pyi"]},
+    ext_modules=[
+        BazelExtension(
+            name="google_benchmark._benchmark",
+            bazel_target="//bindings/python/google_benchmark:benchmark_stubgen",
+            py_limited_api=py_limited_api,
+        )
+    ],
+    options=options,
+)
--- a/third_party/benchmark/src/CMakeLists.txt
+++ b/third_party/benchmark/src/CMakeLists.txt
@@ -1,4 +1,5 @@
-# Allow the source files to find headers in src/
+#Allow the source files to find headers in src /
+include(GNUInstallDirs)
 include_directories(${PROJECT_SOURCE_DIR}/src)

 if (DEFINED BENCHMARK_CXX_LINKER_FLAGS)
@@ -17,92 +18,166 @@ foreach(item ${BENCHMARK_MAIN})
 endforeach()

 add_library(benchmark ${SOURCE_FILES})
+add_library(benchmark::benchmark ALIAS benchmark)
 set_target_properties(benchmark PROPERTIES
  OUTPUT_NAME "benchmark"
  VERSION ${GENERIC_LIB_VERSION}
  SOVERSION ${GENERIC_LIB_SOVERSION}
 )
 target_include_directories(benchmark PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
-    )
+  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
+)
+
+set_property(
+  SOURCE benchmark.cc
+  APPEND
+  PROPERTY COMPILE_DEFINITIONS
+  BENCHMARK_VERSION="${VERSION}"
+)
+
+# libpfm, if available
+if (PFM_FOUND)
+  target_link_libraries(benchmark PRIVATE PFM::libpfm)
+  target_compile_definitions(benchmark PRIVATE -DHAVE_LIBPFM)
+  install(
+      FILES "${PROJECT_SOURCE_DIR}/cmake/Modules/FindPFM.cmake"
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
+endif()
+
+# pthread affinity, if available
+if(HAVE_PTHREAD_AFFINITY)
+  target_compile_definitions(benchmark PRIVATE -DBENCHMARK_HAS_PTHREAD_AFFINITY)
+endif()

 # Link threads.
-target_link_libraries(benchmark  ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-find_library(LIBRT rt)
-if(LIBRT)
-  target_link_libraries(benchmark ${LIBRT})
-endif()
+target_link_libraries(benchmark PRIVATE Threads::Threads)
+
+target_link_libraries(benchmark PRIVATE ${BENCHMARK_CXX_LIBRARIES})
+
+if(HAVE_LIB_RT)
+  target_link_libraries(benchmark PRIVATE rt)
+endif(HAVE_LIB_RT)
+

 # We need extra libraries on Windows
 if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
-  target_link_libraries(benchmark Shlwapi)
+  target_link_libraries(benchmark PRIVATE shlwapi)
 endif()

 # We need extra libraries on Solaris
 if(${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
-  target_link_libraries(benchmark kstat)
+  target_link_libraries(benchmark PRIVATE kstat)
+  set(BENCHMARK_PRIVATE_LINK_LIBRARIES -lkstat)
+endif()
+
+if (NOT BUILD_SHARED_LIBS)
+  target_compile_definitions(benchmark PUBLIC -DBENCHMARK_STATIC_DEFINE)
 endif()

 # Benchmark main library
 add_library(benchmark_main "benchmark_main.cc")
+add_library(benchmark::benchmark_main ALIAS benchmark_main)
 set_target_properties(benchmark_main PROPERTIES
  OUTPUT_NAME "benchmark_main"
  VERSION ${GENERIC_LIB_VERSION}
  SOVERSION ${GENERIC_LIB_SOVERSION}
+  DEFINE_SYMBOL benchmark_EXPORTS
 )
-target_include_directories(benchmark PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
-    )
-target_link_libraries(benchmark_main benchmark)
+target_link_libraries(benchmark_main PUBLIC benchmark::benchmark)

-set(include_install_dir "include")
-set(lib_install_dir "lib/")
-set(bin_install_dir "bin/")
-set(config_install_dir "lib/cmake/${PROJECT_NAME}")
-set(pkgconfig_install_dir "lib/pkgconfig")
-
-set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
+set(generated_dir "${PROJECT_BINARY_DIR}")

 set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
 set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
 set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc")
+set(pkg_config_main "${generated_dir}/${PROJECT_NAME}_main.pc")
+set(targets_to_export benchmark benchmark_main)
 set(targets_export_name "${PROJECT_NAME}Targets")

 set(namespace "${PROJECT_NAME}::")

 include(CMakePackageConfigHelpers)
+
+configure_package_config_file (
+  ${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in
+  ${project_config}
+  INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
+  NO_SET_AND_CHECK_MACRO
+  NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
 write_basic_package_version_file(
  "${version_config}" VERSION ${GENERIC_LIB_VERSION} COMPATIBILITY SameMajorVersion
 )

-configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY)
 configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY)
+configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark_main.pc.in" "${pkg_config_main}" @ONLY)
+
+export (
+  TARGETS ${targets_to_export}
+  NAMESPACE "${namespace}"
+  FILE ${generated_dir}/${targets_export_name}.cmake
+)

 if (BENCHMARK_ENABLE_INSTALL)
  # Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
  install(
-    TARGETS benchmark benchmark_main
+    TARGETS ${targets_to_export}
    EXPORT ${targets_export_name}
-    ARCHIVE DESTINATION ${lib_install_dir}
-    LIBRARY DESTINATION ${lib_install_dir}
-    RUNTIME DESTINATION ${bin_install_dir}
-    INCLUDES DESTINATION ${include_install_dir})
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

  install(
    DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
-    DESTINATION ${include_install_dir}
+              "${PROJECT_BINARY_DIR}/include/benchmark"
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
    FILES_MATCHING PATTERN "*.*h")

  install(
      FILES "${project_config}" "${version_config}"
-      DESTINATION "${config_install_dir}")
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")

  install(
-      FILES "${pkg_config}"
-      DESTINATION "${pkgconfig_install_dir}")
+      FILES "${pkg_config}" "${pkg_config_main}"
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")

  install(
      EXPORT "${targets_export_name}"
      NAMESPACE "${namespace}"
-      DESTINATION "${config_install_dir}")
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
+endif()
+
+if (BENCHMARK_ENABLE_DOXYGEN)
+  find_package(Doxygen REQUIRED)
+  set(DOXYGEN_QUIET YES)
+  set(DOXYGEN_RECURSIVE YES)
+  set(DOXYGEN_GENERATE_HTML YES)
+  set(DOXYGEN_GENERATE_MAN NO)
+  set(DOXYGEN_MARKDOWN_SUPPORT YES)
+  set(DOXYGEN_BUILTIN_STL_SUPPORT YES)
+  set(DOXYGEN_EXTRACT_PACKAGE YES)
+  set(DOXYGEN_EXTRACT_STATIC YES)
+  set(DOXYGEN_SHOW_INCLUDE_FILES YES)
+  set(DOXYGEN_BINARY_TOC YES)
+  set(DOXYGEN_TOC_EXPAND YES)
+  set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "index.md")
+  doxygen_add_docs(benchmark_doxygen
+    docs
+    include
+    src
+    ALL
+    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+    COMMENT "Building documentation with Doxygen.")
+  if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS)
+    install(
+      DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/html/"
+      DESTINATION ${CMAKE_INSTALL_DOCDIR})
+  endif()
+else()
+  if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS)
+    install(
+      DIRECTORY "${PROJECT_SOURCE_DIR}/docs/"
+      DESTINATION ${CMAKE_INSTALL_DOCDIR})
+  endif()
 endif()
--- a/third_party/benchmark/src/benchmark.cc
+++ b/third_party/benchmark/src/benchmark.cc
--- a/third_party/benchmark/src/benchmark_api_internal.cc
+++ b/third_party/benchmark/src/benchmark_api_internal.cc
@@ -1,15 +1,118 @@
 #include "benchmark_api_internal.h"

+#include <cinttypes>
+
+#include "string_util.h"
+
 namespace benchmark {
 namespace internal {

+BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
+                                     int per_family_instance_idx,
+                                     const std::vector<int64_t>& args,
+                                     int thread_count)
+    : benchmark_(*benchmark),
+      family_index_(family_idx),
+      per_family_instance_index_(per_family_instance_idx),
+      aggregation_report_mode_(benchmark_.aggregation_report_mode_),
+      args_(args),
+      time_unit_(benchmark_.GetTimeUnit()),
+      measure_process_cpu_time_(benchmark_.measure_process_cpu_time_),
+      use_real_time_(benchmark_.use_real_time_),
+      use_manual_time_(benchmark_.use_manual_time_),
+      complexity_(benchmark_.complexity_),
+      complexity_lambda_(benchmark_.complexity_lambda_),
+      statistics_(benchmark_.statistics_),
+      repetitions_(benchmark_.repetitions_),
+      min_time_(benchmark_.min_time_),
+      min_warmup_time_(benchmark_.min_warmup_time_),
+      iterations_(benchmark_.iterations_),
+      threads_(thread_count),
+      setup_(benchmark_.setup_),
+      teardown_(benchmark_.teardown_) {
+  name_.function_name = benchmark_.name_;
+
+  size_t arg_i = 0;
+  for (const auto& arg : args) {
+    if (!name_.args.empty()) {
+      name_.args += '/';
+    }
+
+    if (arg_i < benchmark->arg_names_.size()) {
+      const auto& arg_name = benchmark_.arg_names_[arg_i];
+      if (!arg_name.empty()) {
+        name_.args += StrFormat("%s:", arg_name.c_str());
+      }
+    }
+
+    name_.args += StrFormat("%" PRId64, arg);
+    ++arg_i;
+  }
+
+  if (!IsZero(benchmark->min_time_)) {
+    name_.min_time = StrFormat("min_time:%0.3f", benchmark_.min_time_);
+  }
+
+  if (!IsZero(benchmark->min_warmup_time_)) {
+    name_.min_warmup_time =
+        StrFormat("min_warmup_time:%0.3f", benchmark_.min_warmup_time_);
+  }
+
+  if (benchmark_.iterations_ != 0) {
+    name_.iterations = StrFormat(
+        "iterations:%lu", static_cast<unsigned long>(benchmark_.iterations_));
+  }
+
+  if (benchmark_.repetitions_ != 0) {
+    name_.repetitions = StrFormat("repeats:%d", benchmark_.repetitions_);
+  }
+
+  if (benchmark_.measure_process_cpu_time_) {
+    name_.time_type = "process_time";
+  }
+
+  if (benchmark_.use_manual_time_) {
+    if (!name_.time_type.empty()) {
+      name_.time_type += '/';
+    }
+    name_.time_type += "manual_time";
+  } else if (benchmark_.use_real_time_) {
+    if (!name_.time_type.empty()) {
+      name_.time_type += '/';
+    }
+    name_.time_type += "real_time";
+  }
+
+  if (!benchmark_.thread_counts_.empty()) {
+    name_.threads = StrFormat("threads:%d", threads_);
+  }
+}
+
 State BenchmarkInstance::Run(
-    size_t iters, int thread_id, internal::ThreadTimer* timer,
-    internal::ThreadManager* manager) const {
-  State st(iters, arg, thread_id, threads, timer, manager);
-  benchmark->Run(st);
+    IterationCount iters, int thread_id, internal::ThreadTimer* timer,
+    internal::ThreadManager* manager,
+    internal::PerfCountersMeasurement* perf_counters_measurement,
+    ProfilerManager* profiler_manager) const {
+  State st(name_.function_name, iters, args_, thread_id, threads_, timer,
+           manager, perf_counters_measurement, profiler_manager);
+  benchmark_.Run(st);
  return st;
 }

-}  // internal
-}  // benchmark
+void BenchmarkInstance::Setup() const {
+  if (setup_ != nullptr) {
+    State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
+             nullptr, nullptr, nullptr, nullptr);
+    setup_(st);
+  }
+}
+
+void BenchmarkInstance::Teardown() const {
+  if (teardown_ != nullptr) {
+    State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
+             nullptr, nullptr, nullptr, nullptr);
+    teardown_(st);
+  }
+}
+}  // namespace internal
+}  // namespace benchmark
--- a/third_party/benchmark/src/benchmark_api_internal.h
+++ b/third_party/benchmark/src/benchmark_api_internal.h
@@ -1,9 +1,6 @@
 #ifndef BENCHMARK_API_INTERNAL_H
 #define BENCHMARK_API_INTERNAL_H

-#include "benchmark/benchmark.h"
-#include "commandlineflags.h"
-
 #include <cmath>
 #include <iosfwd>
 #include <limits>
@@ -11,31 +8,71 @@
 #include <string>
 #include <vector>

+#include "benchmark/benchmark.h"
+#include "commandlineflags.h"
+
 namespace benchmark {
 namespace internal {

 // Information kept per benchmark we may want to run
-struct BenchmarkInstance {
-  std::string name;
-  Benchmark* benchmark;
-  AggregationReportMode aggregation_report_mode;
-  std::vector<int64_t> arg;
-  TimeUnit time_unit;
-  int range_multiplier;
-  bool use_real_time;
-  bool use_manual_time;
-  BigO complexity;
-  BigOFunc* complexity_lambda;
-  UserCounters counters;
-  const std::vector<Statistics>* statistics;
-  bool last_benchmark_instance;
-  int repetitions;
-  double min_time;
-  size_t iterations;
-  int threads;  // Number of concurrent threads to us
+class BenchmarkInstance {
+ public:
+  BenchmarkInstance(Benchmark* benchmark, int family_idx,
+                    int per_family_instance_idx,
+                    const std::vector<int64_t>& args, int thread_count);

-  State Run(size_t iters, int thread_id, internal::ThreadTimer* timer,
-            internal::ThreadManager* manager) const;
+  const BenchmarkName& name() const { return name_; }
+  int family_index() const { return family_index_; }
+  int per_family_instance_index() const { return per_family_instance_index_; }
+  AggregationReportMode aggregation_report_mode() const {
+    return aggregation_report_mode_;
+  }
+  TimeUnit time_unit() const { return time_unit_; }
+  bool measure_process_cpu_time() const { return measure_process_cpu_time_; }
+  bool use_real_time() const { return use_real_time_; }
+  bool use_manual_time() const { return use_manual_time_; }
+  BigO complexity() const { return complexity_; }
+  BigOFunc* complexity_lambda() const { return complexity_lambda_; }
+  const std::vector<Statistics>& statistics() const { return statistics_; }
+  int repetitions() const { return repetitions_; }
+  double min_time() const { return min_time_; }
+  double min_warmup_time() const { return min_warmup_time_; }
+  IterationCount iterations() const { return iterations_; }
+  int threads() const { return threads_; }
+  void Setup() const;
+  void Teardown() const;
+  const auto& GetUserThreadRunnerFactory() const {
+    return benchmark_.threadrunner_;
+  }
+
+  State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
+            internal::ThreadManager* manager,
+            internal::PerfCountersMeasurement* perf_counters_measurement,
+            ProfilerManager* profiler_manager) const;
+
+ private:
+  BenchmarkName name_;
+  Benchmark& benchmark_;
+  const int family_index_;
+  const int per_family_instance_index_;
+  AggregationReportMode aggregation_report_mode_;
+  const std::vector<int64_t>& args_;
+  TimeUnit time_unit_;
+  bool measure_process_cpu_time_;
+  bool use_real_time_;
+  bool use_manual_time_;
+  BigO complexity_;
+  BigOFunc* complexity_lambda_;
+  UserCounters counters_;
+  const std::vector<Statistics>& statistics_;
+  int repetitions_;
+  double min_time_;
+  double min_warmup_time_;
+  IterationCount iterations_;
+  int threads_;  // Number of concurrent threads to us
+
+  callback_function setup_;
+  callback_function teardown_;
 };

 bool FindBenchmarksInternal(const std::string& re,
@@ -44,6 +81,7 @@ bool FindBenchmarksInternal(const std::string& re,

 bool IsZero(double n);

+BENCHMARK_EXPORT
 ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false);

 }  // end namespace internal
--- a/third_party/benchmark/src/benchmark_main.cc
+++ b/third_party/benchmark/src/benchmark_main.cc
@@ -14,4 +14,5 @@

 #include "benchmark/benchmark.h"

+BENCHMARK_EXPORT int main(int /*argc*/, char** /*argv*/);
 BENCHMARK_MAIN();
--- a/third_party/benchmark/src/benchmark_name.cc
+++ b/third_party/benchmark/src/benchmark_name.cc
@@ -0,0 +1,59 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <benchmark/benchmark.h>
+
+namespace benchmark {
+
+namespace {
+
+// Compute the total size of a pack of std::strings
+size_t size_impl() { return 0; }
+
+template <typename Head, typename... Tail>
+size_t size_impl(const Head& head, const Tail&... tail) {
+  return head.size() + size_impl(tail...);
+}
+
+// Join a pack of std::strings using a delimiter
+// TODO(dominic): use absl::StrJoin
+void join_impl(std::string& /*unused*/, char /*unused*/) {}
+
+template <typename Head, typename... Tail>
+void join_impl(std::string& s, const char delimiter, const Head& head,
+               const Tail&... tail) {
+  if (!s.empty() && !head.empty()) {
+    s += delimiter;
+  }
+
+  s += head;
+
+  join_impl(s, delimiter, tail...);
+}
+
+template <typename... Ts>
+std::string join(char delimiter, const Ts&... ts) {
+  std::string s;
+  s.reserve(sizeof...(Ts) + size_impl(ts...));
+  join_impl(s, delimiter, ts...);
+  return s;
+}
+}  // namespace
+
+BENCHMARK_EXPORT
+std::string BenchmarkName::str() const {
+  return join('/', function_name, args, min_time, min_warmup_time, iterations,
+              repetitions, time_type, threads);
+}
+}  // namespace benchmark
--- a/third_party/benchmark/src/benchmark_register.cc
+++ b/third_party/benchmark/src/benchmark_register.cc
@@ -15,7 +15,7 @@
 #include "benchmark_register.h"

 #ifndef BENCHMARK_OS_WINDOWS
-#ifndef BENCHMARK_OS_FUCHSIA
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
 #include <sys/resource.h>
 #endif
 #include <sys/time.h>
@@ -24,6 +24,7 @@

 #include <algorithm>
 #include <atomic>
+#include <cinttypes>
 #include <condition_variable>
 #include <cstdio>
 #include <cstdlib>
@@ -31,6 +32,7 @@
 #include <fstream>
 #include <iostream>
 #include <memory>
+#include <numeric>
 #include <sstream>
 #include <thread>

@@ -51,10 +53,13 @@ namespace benchmark {

 namespace {
 // For non-dense Range, intermediate values are powers of kRangeMultiplier.
-static const int kRangeMultiplier = 8;
+constexpr int kRangeMultiplier = 8;
+
 // The size of a benchmark family determines is the number of inputs to repeat
 // the benchmark on. If this is "large" then warn the user during configuration.
-static const size_t kMaxFamilySize = 100;
+constexpr size_t kMaxFamilySize = 100;
+
+constexpr char kDisabledPrefix[] = "DISABLED_";
 }  // end namespace

 namespace internal {
@@ -77,7 +82,7 @@ class BenchmarkFamilies {

  // Extract the list of benchmark instances that match the specified
  // regular expression.
-  bool FindBenchmarks(std::string re,
+  bool FindBenchmarks(std::string spec,
                      std::vector<BenchmarkInstance>* benchmarks,
                      std::ostream* Err);

@@ -109,28 +114,35 @@ void BenchmarkFamilies::ClearBenchmarks() {
 bool BenchmarkFamilies::FindBenchmarks(
    std::string spec, std::vector<BenchmarkInstance>* benchmarks,
    std::ostream* ErrStream) {
-  CHECK(ErrStream);
+  BM_CHECK(ErrStream);
  auto& Err = *ErrStream;
  // Make regular expression out of command-line flag
  std::string error_msg;
  Regex re;
-  bool isNegativeFilter = false;
+  bool is_negative_filter = false;
  if (spec[0] == '-') {
    spec.replace(0, 1, "");
-    isNegativeFilter = true;
+    is_negative_filter = true;
  }
  if (!re.Init(spec, &error_msg)) {
-    Err << "Could not compile benchmark re: " << error_msg << std::endl;
+    Err << "Could not compile benchmark re: " << error_msg << '\n';
    return false;
  }

  // Special list of thread counts to use when none are specified
  const std::vector<int> one_thread = {1};

+  int next_family_index = 0;
+
  MutexLock l(mutex_);
  for (std::unique_ptr<Benchmark>& family : families_) {
+    int family_index = next_family_index;
+    int per_family_instance_index = 0;
+
    // Family was deleted or benchmark doesn't match
-    if (!family) continue;
+    if (!family) {
+      continue;
+    }

    if (family->ArgsCnt() == -1) {
      family->Args({});
@@ -147,67 +159,31 @@ bool BenchmarkFamilies::FindBenchmarks(
          << " will be repeated at least " << family_size << " times.\n";
    }
    // reserve in the special case the regex ".", since we know the final
-    // family size.
-    if (spec == ".") benchmarks->reserve(family_size);
+    // family size.  this doesn't take into account any disabled benchmarks
+    // so worst case we reserve more than we need.
+    if (spec == ".") {
+      benchmarks->reserve(benchmarks->size() + family_size);
+    }

    for (auto const& args : family->args_) {
      for (int num_threads : *thread_counts) {
-        BenchmarkInstance instance;
-        instance.name = family->name_;
-        instance.benchmark = family.get();
-        instance.aggregation_report_mode = family->aggregation_report_mode_;
-        instance.arg = args;
-        instance.time_unit = family->time_unit_;
-        instance.range_multiplier = family->range_multiplier_;
-        instance.min_time = family->min_time_;
-        instance.iterations = family->iterations_;
-        instance.repetitions = family->repetitions_;
-        instance.use_real_time = family->use_real_time_;
-        instance.use_manual_time = family->use_manual_time_;
-        instance.complexity = family->complexity_;
-        instance.complexity_lambda = family->complexity_lambda_;
-        instance.statistics = &family->statistics_;
-        instance.threads = num_threads;
+        BenchmarkInstance instance(family.get(), family_index,
+                                   per_family_instance_index, args,
+                                   num_threads);

-        // Add arguments to instance name
-        size_t arg_i = 0;
-        for (auto const& arg : args) {
-          instance.name += "/";
-
-          if (arg_i < family->arg_names_.size()) {
-            const auto& arg_name = family->arg_names_[arg_i];
-            if (!arg_name.empty()) {
-              instance.name +=
-                  StrFormat("%s:", family->arg_names_[arg_i].c_str());
-            }
-          }
-
-          instance.name += StrFormat("%d", arg);
-          ++arg_i;
-        }
-
-        if (!IsZero(family->min_time_))
-          instance.name += StrFormat("/min_time:%0.3f", family->min_time_);
-        if (family->iterations_ != 0)
-          instance.name += StrFormat("/iterations:%d", family->iterations_);
-        if (family->repetitions_ != 0)
-          instance.name += StrFormat("/repeats:%d", family->repetitions_);
-
-        if (family->use_manual_time_) {
-          instance.name += "/manual_time";
-        } else if (family->use_real_time_) {
-          instance.name += "/real_time";
-        }
-
-        // Add the number of threads used to the name
-        if (!family->thread_counts_.empty()) {
-          instance.name += StrFormat("/threads:%d", instance.threads);
-        }
-
-        if ((re.Match(instance.name) && !isNegativeFilter) ||
-            (!re.Match(instance.name) && isNegativeFilter)) {
-          instance.last_benchmark_instance = (&args == &family->args_.back());
+        const auto full_name = instance.name().str();
+        if (full_name.rfind(kDisabledPrefix, 0) != 0 &&
+            ((re.Match(full_name) && !is_negative_filter) ||
+             (!re.Match(full_name) && is_negative_filter))) {
          benchmarks->push_back(std::move(instance));
+
+          ++per_family_instance_index;
+
+          // Only bump the next family index once we've estabilished that
+          // at least one instance of this family will be run.
+          if (next_family_index == family_index) {
+            ++next_family_index;
+          }
        }
      }
    }
@@ -215,11 +191,11 @@ bool BenchmarkFamilies::FindBenchmarks(
  return true;
 }

-Benchmark* RegisterBenchmarkInternal(Benchmark* bench) {
-  std::unique_ptr<Benchmark> bench_ptr(bench);
+Benchmark* RegisterBenchmarkInternal(std::unique_ptr<Benchmark> bench) {
+  Benchmark* bench_ptr = bench.get();
  BenchmarkFamilies* families = BenchmarkFamilies::GetInstance();
-  families->AddBenchmark(std::move(bench_ptr));
-  return bench;
+  families->AddBenchmark(std::move(bench));
+  return bench_ptr;
 }

 // FIXME: This function is a hack so that benchmark.cc can access
@@ -234,14 +210,17 @@ bool FindBenchmarksInternal(const std::string& re,
 //                               Benchmark
 //=============================================================================//

-Benchmark::Benchmark(const char* name)
+Benchmark::Benchmark(const std::string& name)
    : name_(name),
      aggregation_report_mode_(ARM_Unspecified),
-      time_unit_(kNanosecond),
+      time_unit_(GetDefaultTimeUnit()),
+      use_default_time_unit_(true),
      range_multiplier_(kRangeMultiplier),
      min_time_(0),
+      min_warmup_time_(0),
      iterations_(0),
      repetitions_(0),
+      measure_process_cpu_time_(false),
      use_real_time_(false),
      use_manual_time_(false),
      complexity_(oNone),
@@ -249,23 +228,30 @@ Benchmark::Benchmark(const char* name)
  ComputeStatistics("mean", StatisticsMean);
  ComputeStatistics("median", StatisticsMedian);
  ComputeStatistics("stddev", StatisticsStdDev);
+  ComputeStatistics("cv", StatisticsCV, kPercentage);
 }

 Benchmark::~Benchmark() {}

+Benchmark* Benchmark::Name(const std::string& name) {
+  SetName(name);
+  return this;
+}
+
 Benchmark* Benchmark::Arg(int64_t x) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
  args_.push_back({x});
  return this;
 }

 Benchmark* Benchmark::Unit(TimeUnit unit) {
  time_unit_ = unit;
+  use_default_time_unit_ = false;
  return this;
 }

 Benchmark* Benchmark::Range(int64_t start, int64_t limit) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
  std::vector<int64_t> arglist;
  AddRange(&arglist, start, limit, range_multiplier_);

@@ -277,54 +263,61 @@ Benchmark* Benchmark::Range(int64_t start, int64_t limit) {

 Benchmark* Benchmark::Ranges(
    const std::vector<std::pair<int64_t, int64_t>>& ranges) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
  std::vector<std::vector<int64_t>> arglists(ranges.size());
-  std::size_t total = 1;
  for (std::size_t i = 0; i < ranges.size(); i++) {
    AddRange(&arglists[i], ranges[i].first, ranges[i].second,
             range_multiplier_);
-    total *= arglists[i].size();
  }

-  std::vector<std::size_t> ctr(arglists.size(), 0);
+  ArgsProduct(arglists);

+  return this;
+}
+
+Benchmark* Benchmark::ArgsProduct(
+    const std::vector<std::vector<int64_t>>& arglists) {
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(arglists.size()));
+
+  std::vector<std::size_t> indices(arglists.size());
+  const std::size_t total = std::accumulate(
+      std::begin(arglists), std::end(arglists), std::size_t{1},
+      [](const std::size_t res, const std::vector<int64_t>& arglist) {
+        return res * arglist.size();
+      });
+  std::vector<int64_t> args;
+  args.reserve(arglists.size());
  for (std::size_t i = 0; i < total; i++) {
-    std::vector<int64_t> tmp;
-    tmp.reserve(arglists.size());
-
-    for (std::size_t j = 0; j < arglists.size(); j++) {
-      tmp.push_back(arglists[j].at(ctr[j]));
+    for (std::size_t arg = 0; arg < arglists.size(); arg++) {
+      args.push_back(arglists[arg][indices[arg]]);
    }
+    args_.push_back(args);
+    args.clear();

-    args_.push_back(std::move(tmp));
-
-    for (std::size_t j = 0; j < arglists.size(); j++) {
-      if (ctr[j] + 1 < arglists[j].size()) {
-        ++ctr[j];
-        break;
-      }
-      ctr[j] = 0;
-    }
+    std::size_t arg = 0;
+    do {
+      indices[arg] = (indices[arg] + 1) % arglists[arg].size();
+    } while (indices[arg++] == 0 && arg < arglists.size());
  }
+
  return this;
 }

 Benchmark* Benchmark::ArgName(const std::string& name) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
  arg_names_ = {name};
  return this;
 }

 Benchmark* Benchmark::ArgNames(const std::vector<std::string>& names) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
  arg_names_ = names;
  return this;
 }

 Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
-  CHECK_GE(start, 0);
-  CHECK_LE(start, limit);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK_LE(start, limit);
  for (int64_t arg = start; arg <= limit; arg += step) {
    args_.push_back({arg});
  }
@@ -332,7 +325,7 @@ Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
 }

 Benchmark* Benchmark::Args(const std::vector<int64_t>& args) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
  args_.push_back(args);
  return this;
 }
@@ -342,28 +335,60 @@ Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
  return this;
 }

+Benchmark* Benchmark::Setup(callback_function&& setup) {
+  BM_CHECK(setup != nullptr);
+  setup_ = std::forward<callback_function>(setup);
+  return this;
+}
+
+Benchmark* Benchmark::Setup(const callback_function& setup) {
+  BM_CHECK(setup != nullptr);
+  setup_ = setup;
+  return this;
+}
+
+Benchmark* Benchmark::Teardown(callback_function&& teardown) {
+  BM_CHECK(teardown != nullptr);
+  teardown_ = std::forward<callback_function>(teardown);
+  return this;
+}
+
+Benchmark* Benchmark::Teardown(const callback_function& teardown) {
+  BM_CHECK(teardown != nullptr);
+  teardown_ = teardown;
+  return this;
+}
+
 Benchmark* Benchmark::RangeMultiplier(int multiplier) {
-  CHECK(multiplier > 1);
+  BM_CHECK(multiplier > 1);
  range_multiplier_ = multiplier;
  return this;
 }

 Benchmark* Benchmark::MinTime(double t) {
-  CHECK(t > 0.0);
-  CHECK(iterations_ == 0);
+  BM_CHECK(t > 0.0);
+  BM_CHECK(iterations_ == 0);
  min_time_ = t;
  return this;
 }

-Benchmark* Benchmark::Iterations(size_t n) {
-  CHECK(n > 0);
-  CHECK(IsZero(min_time_));
+Benchmark* Benchmark::MinWarmUpTime(double t) {
+  BM_CHECK(t >= 0.0);
+  BM_CHECK(iterations_ == 0);
+  min_warmup_time_ = t;
+  return this;
+}
+
+Benchmark* Benchmark::Iterations(IterationCount n) {
+  BM_CHECK(n > 0);
+  BM_CHECK(IsZero(min_time_));
+  BM_CHECK(IsZero(min_warmup_time_));
  iterations_ = n;
  return this;
 }

 Benchmark* Benchmark::Repetitions(int n) {
-  CHECK(n > 0);
+  BM_CHECK(n > 0);
  repetitions_ = n;
  return this;
 }
@@ -389,15 +414,21 @@ Benchmark* Benchmark::DisplayAggregatesOnly(bool value) {
  return this;
 }

+Benchmark* Benchmark::MeasureProcessCPUTime() {
+  // Can be used together with UseRealTime() / UseManualTime().
+  measure_process_cpu_time_ = true;
+  return this;
+}
+
 Benchmark* Benchmark::UseRealTime() {
-  CHECK(!use_manual_time_)
+  BM_CHECK(!use_manual_time_)
      << "Cannot set UseRealTime and UseManualTime simultaneously.";
  use_real_time_ = true;
  return this;
 }

 Benchmark* Benchmark::UseManualTime() {
-  CHECK(!use_real_time_)
+  BM_CHECK(!use_real_time_)
      << "Cannot set UseRealTime and UseManualTime simultaneously.";
  use_manual_time_ = true;
  return this;
@@ -414,21 +445,22 @@ Benchmark* Benchmark::Complexity(BigOFunc* complexity) {
  return this;
 }

-Benchmark* Benchmark::ComputeStatistics(std::string name,
-                                        StatisticsFunc* statistics) {
-  statistics_.emplace_back(name, statistics);
+Benchmark* Benchmark::ComputeStatistics(const std::string& name,
+                                        StatisticsFunc* statistics,
+                                        StatisticUnit unit) {
+  statistics_.emplace_back(name, statistics, unit);
  return this;
 }

 Benchmark* Benchmark::Threads(int t) {
-  CHECK_GT(t, 0);
+  BM_CHECK_GT(t, 0);
  thread_counts_.push_back(t);
  return this;
 }

 Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
-  CHECK_GT(min_threads, 0);
-  CHECK_GE(max_threads, min_threads);
+  BM_CHECK_GT(min_threads, 0);
+  BM_CHECK_GE(max_threads, min_threads);

  AddRange(&thread_counts_, min_threads, max_threads, 2);
  return this;
@@ -436,9 +468,9 @@ Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {

 Benchmark* Benchmark::DenseThreadRange(int min_threads, int max_threads,
                                       int stride) {
-  CHECK_GT(min_threads, 0);
-  CHECK_GE(max_threads, min_threads);
-  CHECK_GE(stride, 1);
+  BM_CHECK_GT(min_threads, 0);
+  BM_CHECK_GE(max_threads, min_threads);
+  BM_CHECK_GE(stride, 1);

  for (auto i = min_threads; i < max_threads; i += stride) {
    thread_counts_.push_back(i);
@@ -452,16 +484,36 @@ Benchmark* Benchmark::ThreadPerCpu() {
  return this;
 }

-void Benchmark::SetName(const char* name) { name_ = name; }
+Benchmark* Benchmark::ThreadRunner(threadrunner_factory&& factory) {
+  threadrunner_ = std::move(factory);
+  return this;
+}
+
+void Benchmark::SetName(const std::string& name) { name_ = name; }
+
+const char* Benchmark::GetName() const { return name_.c_str(); }

 int Benchmark::ArgsCnt() const {
  if (args_.empty()) {
-    if (arg_names_.empty()) return -1;
+    if (arg_names_.empty()) {
+      return -1;
+    }
    return static_cast<int>(arg_names_.size());
  }
  return static_cast<int>(args_.front().size());
 }

+const char* Benchmark::GetArgName(int arg) const {
+  BM_CHECK_GE(arg, 0);
+  size_t uarg = static_cast<size_t>(arg);
+  BM_CHECK_LT(uarg, arg_names_.size());
+  return arg_names_[uarg].c_str();
+}
+
+TimeUnit Benchmark::GetTimeUnit() const {
+  return use_default_time_unit_ ? GetDefaultTimeUnit() : time_unit_;
+}
+
 //=============================================================================//
 //                            FunctionBenchmark
 //=============================================================================//
@@ -474,4 +526,19 @@ void ClearRegisteredBenchmarks() {
  internal::BenchmarkFamilies::GetInstance()->ClearBenchmarks();
 }

+std::vector<int64_t> CreateRange(int64_t lo, int64_t hi, int multi) {
+  std::vector<int64_t> args;
+  internal::AddRange(&args, lo, hi, multi);
+  return args;
+}
+
+std::vector<int64_t> CreateDenseRange(int64_t start, int64_t limit, int step) {
+  BM_CHECK_LE(start, limit);
+  std::vector<int64_t> args;
+  for (int64_t arg = start; arg <= limit; arg += step) {
+    args.push_back(arg);
+  }
+  return args;
+}
+
 }  // end namespace benchmark
--- a/third_party/benchmark/src/benchmark_register.h
+++ b/third_party/benchmark/src/benchmark_register.h
@@ -1,33 +1,109 @@
 #ifndef BENCHMARK_REGISTER_H
 #define BENCHMARK_REGISTER_H

+#include <algorithm>
+#include <limits>
 #include <vector>

 #include "check.h"

+namespace benchmark {
+namespace internal {
+
+// Append the powers of 'mult' in the closed interval [lo, hi].
+// Returns iterator to the start of the inserted range.
+template <typename T>
+typename std::vector<T>::iterator AddPowers(std::vector<T>* dst, T lo, T hi,
+                                            int mult) {
+  BM_CHECK_GE(lo, 0);
+  BM_CHECK_GE(hi, lo);
+  BM_CHECK_GE(mult, 2);
+
+  const size_t start_offset = dst->size();
+
+  static const T kmax = std::numeric_limits<T>::max();
+
+  // Space out the values in multiples of "mult"
+  for (T i = static_cast<T>(1); i <= hi; i = static_cast<T>(i * mult)) {
+    if (i >= lo) {
+      dst->push_back(i);
+    }
+    // Break the loop here since multiplying by
+    // 'mult' would move outside of the range of T
+    if (i > kmax / mult) break;
+  }
+
+  return dst->begin() + static_cast<int>(start_offset);
+}
+
+template <typename T>
+void AddNegatedPowers(std::vector<T>* dst, T lo, T hi, int mult) {
+  // We negate lo and hi so we require that they cannot be equal to 'min'.
+  BM_CHECK_GT(lo, std::numeric_limits<T>::min());
+  BM_CHECK_GT(hi, std::numeric_limits<T>::min());
+  BM_CHECK_GE(hi, lo);
+  BM_CHECK_LE(hi, 0);
+
+  // Add positive powers, then negate and reverse.
+  // Casts necessary since small integers get promoted
+  // to 'int' when negating.
+  const auto lo_complement = static_cast<T>(-lo);
+  const auto hi_complement = static_cast<T>(-hi);
+
+  const auto it = AddPowers(dst, hi_complement, lo_complement, mult);
+
+  std::for_each(it, dst->end(), [](T& t) { t = static_cast<T>(t * -1); });
+  std::reverse(it, dst->end());
+}
+
 template <typename T>
 void AddRange(std::vector<T>* dst, T lo, T hi, int mult) {
-  CHECK_GE(lo, 0);
-  CHECK_GE(hi, lo);
-  CHECK_GE(mult, 2);
+  static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
+                "Args type must be a signed integer");
+
+  BM_CHECK_GE(hi, lo);
+  BM_CHECK_GE(mult, 2);

  // Add "lo"
  dst->push_back(lo);

-  static const T kmax = std::numeric_limits<T>::max();
+  // Handle lo == hi as a special case, so we then know
+  // lo < hi and so it is safe to add 1 to lo and subtract 1
+  // from hi without falling outside of the range of T.
+  if (lo == hi) return;

-  // Now space out the benchmarks in multiples of "mult"
-  for (T i = 1; i < kmax / mult; i *= mult) {
-    if (i >= hi) break;
-    if (i > lo) {
-      dst->push_back(i);
-    }
+  // Ensure that lo_inner <= hi_inner below.
+  if (lo + 1 == hi) {
+    dst->push_back(hi);
+    return;
  }

-  // Add "hi" (if different from "lo")
-  if (hi != lo) {
+  // Add all powers of 'mult' in the range [lo+1, hi-1] (inclusive).
+  const auto lo_inner = static_cast<T>(lo + 1);
+  const auto hi_inner = static_cast<T>(hi - 1);
+
+  // Insert negative values
+  if (lo_inner < 0) {
+    AddNegatedPowers(dst, lo_inner, std::min(hi_inner, T{-1}), mult);
+  }
+
+  // Treat 0 as a special case (see discussion on #762).
+  if (lo < 0 && hi >= 0) {
+    dst->push_back(0);
+  }
+
+  // Insert positive values
+  if (hi_inner > 0) {
+    AddPowers(dst, std::max(lo_inner, T{1}), hi_inner, mult);
+  }
+
+  // Add "hi" (if different from last value).
+  if (hi != dst->back()) {
    dst->push_back(hi);
  }
 }

+}  // namespace internal
+}  // namespace benchmark
+
 #endif  // BENCHMARK_REGISTER_H
--- a/third_party/benchmark/src/benchmark_runner.cc
+++ b/third_party/benchmark/src/benchmark_runner.cc
@@ -13,12 +13,13 @@
 // limitations under the License.

 #include "benchmark_runner.h"
+
 #include "benchmark/benchmark.h"
 #include "benchmark_api_internal.h"
 #include "internal_macros.h"

 #ifndef BENCHMARK_OS_WINDOWS
-#ifndef BENCHMARK_OS_FUCHSIA
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
 #include <sys/resource.h>
 #endif
 #include <sys/time.h>
@@ -27,11 +28,15 @@

 #include <algorithm>
 #include <atomic>
+#include <climits>
+#include <cmath>
 #include <condition_variable>
 #include <cstdio>
 #include <cstdlib>
 #include <fstream>
+#include <functional>
 #include <iostream>
+#include <limits>
 #include <memory>
 #include <string>
 #include <thread>
@@ -42,9 +47,9 @@
 #include "commandlineflags.h"
 #include "complexity.h"
 #include "counter.h"
-#include "internal_macros.h"
 #include "log.h"
 #include "mutex.h"
+#include "perf_counters.h"
 #include "re.h"
 #include "statistics.h"
 #include "string_util.h"
@@ -53,64 +58,95 @@

 namespace benchmark {

+BM_DECLARE_bool(benchmark_dry_run);
+BM_DECLARE_string(benchmark_min_time);
+BM_DECLARE_double(benchmark_min_warmup_time);
+BM_DECLARE_int32(benchmark_repetitions);
+BM_DECLARE_bool(benchmark_report_aggregates_only);
+BM_DECLARE_bool(benchmark_display_aggregates_only);
+BM_DECLARE_string(benchmark_perf_counters);
+
 namespace internal {

 MemoryManager* memory_manager = nullptr;

+ProfilerManager* profiler_manager = nullptr;
+
 namespace {

-static const size_t kMaxIterations = 1000000000;
+constexpr IterationCount kMaxIterations = 1000000000000;
+const double kDefaultMinTime =
+    std::strtod(::benchmark::kDefaultMinTimeStr, /*p_end*/ nullptr);

 BenchmarkReporter::Run CreateRunReport(
    const benchmark::internal::BenchmarkInstance& b,
-    const internal::ThreadManager::Result& results, size_t memory_iterations,
-    const MemoryManager::Result& memory_result, double seconds) {
+    const internal::ThreadManager::Result& results,
+    IterationCount memory_iterations,
+    const MemoryManager::Result& memory_result, double seconds,
+    int64_t repetition_index, int64_t repeats) {
  // Create report about this benchmark run.
  BenchmarkReporter::Run report;

-  report.run_name = b.name;
-  report.error_occurred = results.has_error_;
-  report.error_message = results.error_message_;
+  report.run_name = b.name();
+  report.family_index = b.family_index();
+  report.per_family_instance_index = b.per_family_instance_index();
+  report.skipped = results.skipped_;
+  report.skip_message = results.skip_message_;
  report.report_label = results.report_label_;
  // This is the total iterations across all threads.
  report.iterations = results.iterations;
-  report.time_unit = b.time_unit;
+  report.time_unit = b.time_unit();
+  report.threads = b.threads();
+  report.repetition_index = repetition_index;
+  report.repetitions = repeats;

-  if (!report.error_occurred) {
-    if (b.use_manual_time) {
+  if (report.skipped == 0u) {
+    if (b.use_manual_time()) {
      report.real_accumulated_time = results.manual_time_used;
    } else {
      report.real_accumulated_time = results.real_time_used;
    }
+    report.use_real_time_for_initial_big_o = b.use_manual_time();
    report.cpu_accumulated_time = results.cpu_time_used;
    report.complexity_n = results.complexity_n;
-    report.complexity = b.complexity;
-    report.complexity_lambda = b.complexity_lambda;
-    report.statistics = b.statistics;
+    report.complexity = b.complexity();
+    report.complexity_lambda = b.complexity_lambda();
+    report.statistics = &b.statistics();
    report.counters = results.counters;

    if (memory_iterations > 0) {
-      report.has_memory_result = true;
+      report.memory_result = memory_result;
      report.allocs_per_iter =
-          memory_iterations ? static_cast<double>(memory_result.num_allocs) /
-                                  memory_iterations
-                            : 0;
-      report.max_bytes_used = memory_result.max_bytes_used;
+          memory_iterations != 0
+              ? static_cast<double>(memory_result.num_allocs) /
+                    static_cast<double>(memory_iterations)
+              : 0;
    }

-    internal::Finish(&report.counters, results.iterations, seconds, b.threads);
+    internal::Finish(&report.counters, results.iterations, seconds,
+                     b.threads());
  }
  return report;
 }

 // Execute one thread of benchmark b for the specified number of iterations.
-// Adds the stats collected for the thread into *total.
-void RunInThread(const BenchmarkInstance* b, size_t iters, int thread_id,
-                 ThreadManager* manager) {
-  internal::ThreadTimer timer;
-  State st = b->Run(iters, thread_id, &timer, manager);
-  CHECK(st.iterations() >= st.max_iterations)
-      << "Benchmark returned before State::KeepRunning() returned false!";
+// Adds the stats collected for the thread into manager->results.
+void RunInThread(const BenchmarkInstance* b, IterationCount iters,
+                 int thread_id, ThreadManager* manager,
+                 PerfCountersMeasurement* perf_counters_measurement,
+                 ProfilerManager* profiler_manager_) {
+  internal::ThreadTimer timer(
+      b->measure_process_cpu_time()
+          ? internal::ThreadTimer::CreateProcessCpuTime()
+          : internal::ThreadTimer::Create());
+
+  State st = b->Run(iters, thread_id, &timer, manager,
+                    perf_counters_measurement, profiler_manager_);
+  if (!(st.skipped() || st.iterations() >= st.max_iterations)) {
+    st.SkipWithError(
+        "The benchmark didn't run, nor was it explicitly skipped. Please call "
+        "'SkipWithXXX` in your benchmark as appropriate.");
+  }
  {
    MutexLock l(manager->GetBenchmarkMutex());
    internal::ThreadManager::Result& results = manager->results;
@@ -124,225 +160,412 @@ void RunInThread(const BenchmarkInstance* b, size_t iters, int thread_id,
  manager->NotifyThreadComplete();
 }

-class BenchmarkRunner {
- public:
-  BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
-                  std::vector<BenchmarkReporter::Run>* complexity_reports_)
-      : b(b_),
-        complexity_reports(*complexity_reports_),
-        min_time(!IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time),
-        repeats(b.repetitions != 0 ? b.repetitions
-                                   : FLAGS_benchmark_repetitions),
-        has_explicit_iteration_count(b.iterations != 0),
-        pool(b.threads - 1),
-        iters(has_explicit_iteration_count ? b.iterations : 1) {
-    run_results.display_report_aggregates_only =
-        (FLAGS_benchmark_report_aggregates_only ||
-         FLAGS_benchmark_display_aggregates_only);
-    run_results.file_report_aggregates_only =
-        FLAGS_benchmark_report_aggregates_only;
-    if (b.aggregation_report_mode != internal::ARM_Unspecified) {
-      run_results.display_report_aggregates_only =
-          (b.aggregation_report_mode &
-           internal::ARM_DisplayReportAggregatesOnly);
-      run_results.file_report_aggregates_only =
-          (b.aggregation_report_mode & internal::ARM_FileReportAggregatesOnly);
-    }
-
-    for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
-      const bool is_the_first_repetition = repetition_num == 0;
-      DoOneRepetition(is_the_first_repetition);
-    }
-
-    // Calculate additional statistics
-    run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
-
-    // Maybe calculate complexity report
-    if ((b.complexity != oNone) && b.last_benchmark_instance) {
-      auto additional_run_stats = ComputeBigO(complexity_reports);
-      run_results.aggregates_only.insert(run_results.aggregates_only.end(),
-                                         additional_run_stats.begin(),
-                                         additional_run_stats.end());
-      complexity_reports.clear();
-    }
+double ComputeMinTime(const benchmark::internal::BenchmarkInstance& b,
+                      const BenchTimeType& iters_or_time) {
+  if (!IsZero(b.min_time())) {
+    return b.min_time();
+  }
+  // If the flag was used to specify number of iters, then return the default
+  // min_time.
+  if (iters_or_time.tag == BenchTimeType::ITERS) {
+    return kDefaultMinTime;
  }

-  RunResults&& get_results() { return std::move(run_results); }
+  return iters_or_time.time;
+}

- private:
-  RunResults run_results;
+IterationCount ComputeIters(const benchmark::internal::BenchmarkInstance& b,
+                            const BenchTimeType& iters_or_time) {
+  if (b.iterations() != 0) {
+    return b.iterations();
+  }

-  const benchmark::internal::BenchmarkInstance& b;
-  std::vector<BenchmarkReporter::Run>& complexity_reports;
+  // We've already concluded that this flag is currently used to pass
+  // iters but do a check here again anyway.
+  BM_CHECK(iters_or_time.tag == BenchTimeType::ITERS);
+  return iters_or_time.iters;
+}

-  const double min_time;
-  const int repeats;
-  const bool has_explicit_iteration_count;
-
-  std::vector<std::thread> pool;
-
-  size_t iters;  // preserved between repetitions!
-  // So only the first repetition has to find/calculate it,
-  // the other repetitions will just use that precomputed iteration count.
-
-  struct IterationResults {
-    internal::ThreadManager::Result results;
-    size_t iters;
-    double seconds;
-  };
-  IterationResults DoNIterations() {
-    VLOG(2) << "Running " << b.name << " for " << iters << "\n";
-
-    std::unique_ptr<internal::ThreadManager> manager;
-    manager.reset(new internal::ThreadManager(b.threads));
+class ThreadRunnerDefault : public ThreadRunnerBase {
+ public:
+  explicit ThreadRunnerDefault(int num_threads)
+      : pool(static_cast<size_t>(num_threads - 1)) {}

+  void RunThreads(const std::function<void(int)>& fn) final {
    // Run all but one thread in separate threads
    for (std::size_t ti = 0; ti < pool.size(); ++ti) {
-      pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
-                             manager.get());
+      pool[ti] = std::thread(fn, static_cast<int>(ti + 1));
    }
    // And run one thread here directly.
    // (If we were asked to run just one thread, we don't create new threads.)
    // Yes, we need to do this here *after* we start the separate threads.
-    RunInThread(&b, iters, 0, manager.get());
+    fn(0);

    // The main thread has finished. Now let's wait for the other threads.
-    manager->WaitForAllThreads();
-    for (std::thread& thread : pool) thread.join();
-
-    IterationResults i;
-    // Acquire the measurements/counters from the manager, UNDER THE LOCK!
-    {
-      MutexLock l(manager->GetBenchmarkMutex());
-      i.results = manager->results;
+    for (std::thread& thread : pool) {
+      thread.join();
    }
-
-    // And get rid of the manager.
-    manager.reset();
-
-    // Adjust real/manual time stats since they were reported per thread.
-    i.results.real_time_used /= b.threads;
-    i.results.manual_time_used /= b.threads;
-
-    VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
-            << i.results.real_time_used << "\n";
-
-    // So for how long were we running?
-    i.iters = iters;
-    // Base decisions off of real time if requested by this benchmark.
-    i.seconds = i.results.cpu_time_used;
-    if (b.use_manual_time) {
-      i.seconds = i.results.manual_time_used;
-    } else if (b.use_real_time) {
-      i.seconds = i.results.real_time_used;
-    }
-
-    return i;
  }

-  size_t PredictNumItersNeeded(const IterationResults& i) const {
-    // See how much iterations should be increased by.
-    // Note: Avoid division by zero with max(seconds, 1ns).
-    double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
-    // If our last run was at least 10% of FLAGS_benchmark_min_time then we
-    // use the multiplier directly.
-    // Otherwise we use at most 10 times expansion.
-    // NOTE: When the last run was at least 10% of the min time the max
-    // expansion should be 14x.
-    bool is_significant = (i.seconds / min_time) > 0.1;
-    multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
-    if (multiplier <= 1.0) multiplier = 2.0;
-
-    // So what seems to be the sufficiently-large iteration count? Round up.
-    const size_t max_next_iters =
-        0.5 + std::max(multiplier * i.iters, i.iters + 1.0);
-    // But we do have *some* sanity limits though..
-    const size_t next_iters = std::min(max_next_iters, kMaxIterations);
-
-    VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
-    return next_iters;  // round up before conversion to integer.
-  }
-
-  bool ShouldReportIterationResults(const IterationResults& i) const {
-    // Determine if this run should be reported;
-    // Either it has run for a sufficient amount of time
-    // or because an error was reported.
-    return i.results.has_error_ ||
-           i.iters >= kMaxIterations ||  // Too many iterations already.
-           i.seconds >= min_time ||      // The elapsed time is large enough.
-           // CPU time is specified but the elapsed real time greatly exceeds
-           // the minimum time.
-           // Note that user provided timers are except from this sanity check.
-           ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time);
-  }
-
-  void DoOneRepetition(bool is_the_first_repetition) {
-    IterationResults i;
-
-    // We *may* be gradually increasing the length (iteration count)
-    // of the benchmark until we decide the results are significant.
-    // And once we do, we report those last results and exit.
-    // Please do note that the if there are repetitions, the iteration count
-    // is *only* calculated for the *first* repetition, and other repetitions
-    // simply use that precomputed iteration count.
-    for (;;) {
-      i = DoNIterations();
-
-      // Do we consider the results to be significant?
-      // If we are doing repetitions, and the first repetition was already done,
-      // it has calculated the correct iteration time, so we have run that very
-      // iteration count just now. No need to calculate anything. Just report.
-      // Else, the normal rules apply.
-      const bool results_are_significant = !is_the_first_repetition ||
-                                           has_explicit_iteration_count ||
-                                           ShouldReportIterationResults(i);
-
-      if (results_are_significant) break;  // Good, let's report them!
-
-      // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
-      // iteration count, and run the benchmark again...
-
-      iters = PredictNumItersNeeded(i);
-      assert(iters > i.iters &&
-             "if we did more iterations than we want to do the next time, "
-             "then we should have accepted the current iteration run.");
-    }
-
-    // Oh, one last thing, we need to also produce the 'memory measurements'..
-    MemoryManager::Result memory_result;
-    size_t memory_iterations = 0;
-    if (memory_manager != nullptr) {
-      // Only run a few iterations to reduce the impact of one-time
-      // allocations in benchmarks that are not properly managed.
-      memory_iterations = std::min<size_t>(16, iters);
-      memory_manager->Start();
-      std::unique_ptr<internal::ThreadManager> manager;
-      manager.reset(new internal::ThreadManager(1));
-      RunInThread(&b, memory_iterations, 0, manager.get());
-      manager->WaitForAllThreads();
-      manager.reset();
-
-      memory_manager->Stop(&memory_result);
-    }
-
-    // Ok, now actualy report.
-    BenchmarkReporter::Run report = CreateRunReport(
-        b, i.results, memory_iterations, memory_result, i.seconds);
-
-    if (!report.error_occurred && b.complexity != oNone)
-      complexity_reports.push_back(report);
-
-    run_results.non_aggregates.push_back(report);
-  }
+ private:
+  std::vector<std::thread> pool;
 };

+std::unique_ptr<ThreadRunnerBase> GetThreadRunner(
+    const threadrunner_factory& userThreadRunnerFactory, int num_threads) {
+  return userThreadRunnerFactory
+             ? userThreadRunnerFactory(num_threads)
+             : std::make_unique<ThreadRunnerDefault>(num_threads);
+}
+
 }  // end namespace

-RunResults RunBenchmark(
-    const benchmark::internal::BenchmarkInstance& b,
-    std::vector<BenchmarkReporter::Run>* complexity_reports) {
-  internal::BenchmarkRunner r(b, complexity_reports);
-  return r.get_results();
+BenchTimeType ParseBenchMinTime(const std::string& value) {
+  BenchTimeType ret = {};
+
+  if (value.empty()) {
+    ret.tag = BenchTimeType::TIME;
+    ret.time = 0.0;
+    return ret;
+  }
+
+  if (value.back() == 'x') {
+    char* p_end = nullptr;
+    // Reset errno before it's changed by strtol.
+    errno = 0;
+    IterationCount num_iters = std::strtol(value.c_str(), &p_end, 10);
+
+    // After a valid parse, p_end should have been set to
+    // point to the 'x' suffix.
+    BM_CHECK(errno == 0 && p_end != nullptr && *p_end == 'x')
+        << "Malformed iters value passed to --benchmark_min_time: `" << value
+        << "`. Expected --benchmark_min_time=<integer>x.";
+
+    ret.tag = BenchTimeType::ITERS;
+    ret.iters = num_iters;
+    return ret;
+  }
+
+  bool has_suffix = value.back() == 's';
+  if (!has_suffix) {
+    BM_VLOG(0) << "Value passed to --benchmark_min_time should have a suffix. "
+                  "Eg., `30s` for 30-seconds.";
+  }
+
+  char* p_end = nullptr;
+  // Reset errno before it's changed by strtod.
+  errno = 0;
+  double min_time = std::strtod(value.c_str(), &p_end);
+
+  // After a successful parse, p_end should point to the suffix 's',
+  // or the end of the string if the suffix was omitted.
+  BM_CHECK(errno == 0 && p_end != nullptr &&
+           ((has_suffix && *p_end == 's') || *p_end == '\0'))
+      << "Malformed seconds value passed to --benchmark_min_time: `" << value
+      << "`. Expected --benchmark_min_time=<float>x.";
+
+  ret.tag = BenchTimeType::TIME;
+  ret.time = min_time;
+
+  return ret;
+}
+
+BenchmarkRunner::BenchmarkRunner(
+    const benchmark::internal::BenchmarkInstance& b_,
+    PerfCountersMeasurement* pcm_,
+    BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
+    : b(b_),
+      reports_for_family(reports_for_family_),
+      parsed_benchtime_flag(ParseBenchMinTime(FLAGS_benchmark_min_time)),
+      min_time(FLAGS_benchmark_dry_run
+                   ? 0
+                   : ComputeMinTime(b_, parsed_benchtime_flag)),
+      min_warmup_time(
+          FLAGS_benchmark_dry_run
+              ? 0
+              : ((!IsZero(b.min_time()) && b.min_warmup_time() > 0.0)
+                     ? b.min_warmup_time()
+                     : FLAGS_benchmark_min_warmup_time)),
+      warmup_done(FLAGS_benchmark_dry_run ? true : !(min_warmup_time > 0.0)),
+      repeats(FLAGS_benchmark_dry_run
+                  ? 1
+                  : (b.repetitions() != 0 ? b.repetitions()
+                                          : FLAGS_benchmark_repetitions)),
+      has_explicit_iteration_count(b.iterations() != 0 ||
+                                   parsed_benchtime_flag.tag ==
+                                       BenchTimeType::ITERS),
+      thread_runner(
+          GetThreadRunner(b.GetUserThreadRunnerFactory(), b.threads())),
+      iters(FLAGS_benchmark_dry_run
+                ? 1
+                : (has_explicit_iteration_count
+                       ? ComputeIters(b_, parsed_benchtime_flag)
+                       : 1)),
+      perf_counters_measurement_ptr(pcm_) {
+  run_results.display_report_aggregates_only =
+      (FLAGS_benchmark_report_aggregates_only ||
+       FLAGS_benchmark_display_aggregates_only);
+  run_results.file_report_aggregates_only =
+      FLAGS_benchmark_report_aggregates_only;
+  if (b.aggregation_report_mode() != internal::ARM_Unspecified) {
+    run_results.display_report_aggregates_only =
+        ((b.aggregation_report_mode() &
+          internal::ARM_DisplayReportAggregatesOnly) != 0u);
+    run_results.file_report_aggregates_only =
+        ((b.aggregation_report_mode() &
+          internal::ARM_FileReportAggregatesOnly) != 0u);
+    BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
+             (perf_counters_measurement_ptr->num_counters() == 0))
+        << "Perf counters were requested but could not be set up.";
+  }
+}
+
+BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
+  BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n";
+
+  std::unique_ptr<internal::ThreadManager> manager;
+  manager.reset(new internal::ThreadManager(b.threads()));
+
+  thread_runner->RunThreads([&](int thread_idx) {
+    RunInThread(&b, iters, thread_idx, manager.get(),
+                perf_counters_measurement_ptr, /*profiler_manager=*/nullptr);
+  });
+
+  IterationResults i;
+  // Acquire the measurements/counters from the manager, UNDER THE LOCK!
+  {
+    MutexLock l(manager->GetBenchmarkMutex());
+    i.results = manager->results;
+  }
+
+  // And get rid of the manager.
+  manager.reset();
+
+  BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
+             << i.results.real_time_used << "\n";
+
+  // By using KeepRunningBatch a benchmark can iterate more times than
+  // requested, so take the iteration count from i.results.
+  i.iters = i.results.iterations / b.threads();
+
+  // Base decisions off of real time if requested by this benchmark.
+  i.seconds = i.results.cpu_time_used;
+  if (b.use_manual_time()) {
+    i.seconds = i.results.manual_time_used;
+  } else if (b.use_real_time()) {
+    i.seconds = i.results.real_time_used;
+  }
+
+  return i;
+}
+
+IterationCount BenchmarkRunner::PredictNumItersNeeded(
+    const IterationResults& i) const {
+  // See how much iterations should be increased by.
+  // Note: Avoid division by zero with max(seconds, 1ns).
+  double multiplier = GetMinTimeToApply() * 1.4 / std::max(i.seconds, 1e-9);
+  // If our last run was at least 10% of FLAGS_benchmark_min_time then we
+  // use the multiplier directly.
+  // Otherwise we use at most 10 times expansion.
+  // NOTE: When the last run was at least 10% of the min time the max
+  // expansion should be 14x.
+  const bool is_significant = (i.seconds / GetMinTimeToApply()) > 0.1;
+  multiplier = is_significant ? multiplier : 10.0;
+
+  // So what seems to be the sufficiently-large iteration count? Round up.
+  const IterationCount max_next_iters = static_cast<IterationCount>(
+      std::llround(std::max(multiplier * static_cast<double>(i.iters),
+                            static_cast<double>(i.iters) + 1.0)));
+  // But we do have *some* limits though..
+  const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
+
+  BM_VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
+  return next_iters;  // round up before conversion to integer.
+}
+
+bool BenchmarkRunner::ShouldReportIterationResults(
+    const IterationResults& i) const {
+  // Determine if this run should be reported;
+  // Either it has run for a sufficient amount of time
+  // or because an error was reported.
+  return (i.results.skipped_ != 0u) || FLAGS_benchmark_dry_run ||
+         i.iters >= kMaxIterations ||  // Too many iterations already.
+         i.seconds >=
+             GetMinTimeToApply() ||  // The elapsed time is large enough.
+         // CPU time is specified but the elapsed real time greatly exceeds
+         // the minimum time.
+         // Note that user provided timers are except from this test.
+         ((i.results.real_time_used >= 5 * GetMinTimeToApply()) &&
+          !b.use_manual_time());
+}
+
+double BenchmarkRunner::GetMinTimeToApply() const {
+  // In order to re-use functionality to run and measure benchmarks for running
+  // a warmup phase of the benchmark, we need a way of telling whether to apply
+  // min_time or min_warmup_time. This function will figure out if we are in the
+  // warmup phase and therefore need to apply min_warmup_time or if we already
+  // in the benchmarking phase and min_time needs to be applied.
+  return warmup_done ? min_time : min_warmup_time;
+}
+
+void BenchmarkRunner::FinishWarmUp(const IterationCount& i) {
+  warmup_done = true;
+  iters = i;
+}
+
+void BenchmarkRunner::RunWarmUp() {
+  // Use the same mechanisms for warming up the benchmark as used for actually
+  // running and measuring the benchmark.
+  IterationResults i_warmup;
+  // Dont use the iterations determined in the warmup phase for the actual
+  // measured benchmark phase. While this may be a good starting point for the
+  // benchmark and it would therefore get rid of the need to figure out how many
+  // iterations are needed if min_time is set again, this may also be a complete
+  // wrong guess since the warmup loops might be considerably slower (e.g
+  // because of caching effects).
+  const IterationCount i_backup = iters;
+
+  for (;;) {
+    b.Setup();
+    i_warmup = DoNIterations();
+    b.Teardown();
+
+    const bool finish = ShouldReportIterationResults(i_warmup);
+
+    if (finish) {
+      FinishWarmUp(i_backup);
+      break;
+    }
+
+    // Although we are running "only" a warmup phase where running enough
+    // iterations at once without measuring time isn't as important as it is for
+    // the benchmarking phase, we still do it the same way as otherwise it is
+    // very confusing for the user to know how to choose a proper value for
+    // min_warmup_time if a different approach on running it is used.
+    iters = PredictNumItersNeeded(i_warmup);
+    assert(iters > i_warmup.iters &&
+           "if we did more iterations than we want to do the next time, "
+           "then we should have accepted the current iteration run.");
+  }
+}
+
+MemoryManager::Result BenchmarkRunner::RunMemoryManager(
+    IterationCount memory_iterations) {
+  memory_manager->Start();
+  std::unique_ptr<internal::ThreadManager> manager;
+  manager.reset(new internal::ThreadManager(1));
+  b.Setup();
+  RunInThread(&b, memory_iterations, 0, manager.get(),
+              perf_counters_measurement_ptr,
+              /*profiler_manager=*/nullptr);
+  manager.reset();
+  b.Teardown();
+  MemoryManager::Result memory_result;
+  memory_manager->Stop(memory_result);
+  memory_result.memory_iterations = memory_iterations;
+  return memory_result;
+}
+
+void BenchmarkRunner::RunProfilerManager(IterationCount profile_iterations) {
+  std::unique_ptr<internal::ThreadManager> manager;
+  manager.reset(new internal::ThreadManager(1));
+  b.Setup();
+  RunInThread(&b, profile_iterations, 0, manager.get(),
+              /*perf_counters_measurement_ptr=*/nullptr,
+              /*profiler_manager=*/profiler_manager);
+  manager.reset();
+  b.Teardown();
+}
+
+void BenchmarkRunner::DoOneRepetition() {
+  assert(HasRepeatsRemaining() && "Already done all repetitions?");
+
+  const bool is_the_first_repetition = num_repetitions_done == 0;
+
+  // In case a warmup phase is requested by the benchmark, run it now.
+  // After running the warmup phase the BenchmarkRunner should be in a state as
+  // this warmup never happened except the fact that warmup_done is set. Every
+  // other manipulation of the BenchmarkRunner instance would be a bug! Please
+  // fix it.
+  if (!warmup_done) {
+    RunWarmUp();
+  }
+
+  IterationResults i;
+  // We *may* be gradually increasing the length (iteration count)
+  // of the benchmark until we decide the results are significant.
+  // And once we do, we report those last results and exit.
+  // Please do note that the if there are repetitions, the iteration count
+  // is *only* calculated for the *first* repetition, and other repetitions
+  // simply use that precomputed iteration count.
+  for (;;) {
+    b.Setup();
+    i = DoNIterations();
+    b.Teardown();
+
+    // Do we consider the results to be significant?
+    // If we are doing repetitions, and the first repetition was already done,
+    // it has calculated the correct iteration time, so we have run that very
+    // iteration count just now. No need to calculate anything. Just report.
+    // Else, the normal rules apply.
+    const bool results_are_significant = !is_the_first_repetition ||
+                                         has_explicit_iteration_count ||
+                                         ShouldReportIterationResults(i);
+    // Good, let's report them!
+    if (results_are_significant) {
+      break;
+    }
+
+    // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
+    // iteration count, and run the benchmark again...
+
+    iters = PredictNumItersNeeded(i);
+    assert(iters > i.iters &&
+           "if we did more iterations than we want to do the next time, "
+           "then we should have accepted the current iteration run.");
+  }
+
+  // Produce memory measurements if requested.
+  MemoryManager::Result memory_result;
+  IterationCount memory_iterations = 0;
+  if (memory_manager != nullptr) {
+    // Only run a few iterations to reduce the impact of one-time
+    // allocations in benchmarks that are not properly managed.
+    memory_iterations = std::min<IterationCount>(16, iters);
+    memory_result = RunMemoryManager(memory_iterations);
+  }
+
+  if (profiler_manager != nullptr) {
+    // We want to externally profile the benchmark for the same number of
+    // iterations because, for example, if we're tracing the benchmark then we
+    // want trace data to reasonably match PMU data.
+    RunProfilerManager(iters);
+  }
+
+  // Ok, now actually report.
+  BenchmarkReporter::Run report =
+      CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
+                      num_repetitions_done, repeats);
+
+  if (reports_for_family != nullptr) {
+    ++reports_for_family->num_runs_done;
+    if (report.skipped == 0u) {
+      reports_for_family->Runs.push_back(report);
+    }
+  }
+
+  run_results.non_aggregates.push_back(report);
+
+  ++num_repetitions_done;
+}
+
+RunResults&& BenchmarkRunner::GetResults() {
+  assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?");
+
+  // Calculate additional statistics over the repetitions of this instance.
+  run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
+
+  return std::move(run_results);
 }

 }  // end namespace internal
--- a/third_party/benchmark/src/benchmark_runner.h
+++ b/third_party/benchmark/src/benchmark_runner.h
@@ -15,22 +15,20 @@
 #ifndef BENCHMARK_RUNNER_H_
 #define BENCHMARK_RUNNER_H_

+#include <memory>
+#include <thread>
+#include <vector>
+
 #include "benchmark_api_internal.h"
-#include "internal_macros.h"
-
-DECLARE_double(benchmark_min_time);
-
-DECLARE_int32(benchmark_repetitions);
-
-DECLARE_bool(benchmark_report_aggregates_only);
-
-DECLARE_bool(benchmark_display_aggregates_only);
+#include "perf_counters.h"
+#include "thread_manager.h"

 namespace benchmark {

 namespace internal {

 extern MemoryManager* memory_manager;
+extern ProfilerManager* profiler_manager;

 struct RunResults {
  std::vector<BenchmarkReporter::Run> non_aggregates;
@@ -40,9 +38,87 @@ struct RunResults {
  bool file_report_aggregates_only = false;
 };

-RunResults RunBenchmark(
-    const benchmark::internal::BenchmarkInstance& b,
-    std::vector<BenchmarkReporter::Run>* complexity_reports);
+struct BENCHMARK_EXPORT BenchTimeType {
+  enum { UNSPECIFIED, ITERS, TIME } tag;
+  union {
+    IterationCount iters;
+    double time;
+  };
+};
+
+BENCHMARK_EXPORT
+BenchTimeType ParseBenchMinTime(const std::string& value);
+
+class BenchmarkRunner {
+ public:
+  BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
+                  benchmark::internal::PerfCountersMeasurement* pcm_,
+                  BenchmarkReporter::PerFamilyRunReports* reports_for_family);
+
+  int GetNumRepeats() const { return repeats; }
+
+  bool HasRepeatsRemaining() const {
+    return GetNumRepeats() != num_repetitions_done;
+  }
+
+  void DoOneRepetition();
+
+  RunResults&& GetResults();
+
+  BenchmarkReporter::PerFamilyRunReports* GetReportsForFamily() const {
+    return reports_for_family;
+  }
+
+  double GetMinTime() const { return min_time; }
+
+  bool HasExplicitIters() const { return has_explicit_iteration_count; }
+
+  IterationCount GetIters() const { return iters; }
+
+ private:
+  RunResults run_results;
+
+  const benchmark::internal::BenchmarkInstance& b;
+  BenchmarkReporter::PerFamilyRunReports* reports_for_family;
+
+  BenchTimeType parsed_benchtime_flag;
+  const double min_time;
+  const double min_warmup_time;
+  bool warmup_done;
+  const int repeats;
+  const bool has_explicit_iteration_count;
+
+  int num_repetitions_done = 0;
+
+  std::unique_ptr<ThreadRunnerBase> thread_runner;
+
+  IterationCount iters;  // preserved between repetitions!
+  // So only the first repetition has to find/calculate it,
+  // the other repetitions will just use that precomputed iteration count.
+
+  PerfCountersMeasurement* const perf_counters_measurement_ptr = nullptr;
+
+  struct IterationResults {
+    internal::ThreadManager::Result results;
+    IterationCount iters;
+    double seconds;
+  };
+  IterationResults DoNIterations();
+
+  MemoryManager::Result RunMemoryManager(IterationCount memory_iterations);
+
+  void RunProfilerManager(IterationCount profile_iterations);
+
+  IterationCount PredictNumItersNeeded(const IterationResults& i) const;
+
+  bool ShouldReportIterationResults(const IterationResults& i) const;
+
+  double GetMinTimeToApply() const;
+
+  void FinishWarmUp(const IterationCount& i);
+
+  void RunWarmUp();
+};

 }  // namespace internal

--- a/third_party/benchmark/src/check.cc
+++ b/third_party/benchmark/src/check.cc
@@ -0,0 +1,14 @@
+#include "check.h"
+
+namespace benchmark {
+namespace internal {
+
+namespace {
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+AbortHandlerT* handler = &std::abort;
+}  // namespace
+
+BENCHMARK_EXPORT AbortHandlerT*& GetAbortHandler() { return handler; }
+
+}  // namespace internal
+}  // namespace benchmark
--- a/third_party/benchmark/src/check.h
+++ b/third_party/benchmark/src/check.h
@@ -4,30 +4,51 @@
 #include <cmath>
 #include <cstdlib>
 #include <ostream>
+#include <string_view>

+#include "benchmark/export.h"
 #include "internal_macros.h"
 #include "log.h"

+#if defined(__GNUC__) || defined(__clang__)
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#elif defined(_MSC_VER) && !defined(__clang__)
+#if _MSC_VER >= 1900
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#else
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+#define __func__ __FUNCTION__
+#else
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+
 namespace benchmark {
 namespace internal {

 typedef void(AbortHandlerT)();

-inline AbortHandlerT*& GetAbortHandler() {
-  static AbortHandlerT* handler = &std::abort;
-  return handler;
-}
+BENCHMARK_EXPORT
+AbortHandlerT*& GetAbortHandler();

 BENCHMARK_NORETURN inline void CallAbortHandler() {
  GetAbortHandler()();
+  std::flush(std::cout);
+  std::flush(std::cerr);
  std::abort();  // fallback to enforce noreturn
 }

-// CheckHandler is the class constructed by failing CHECK macros. CheckHandler
-// will log information about the failures and abort when it is destructed.
+// CheckHandler is the class constructed by failing BM_CHECK macros.
+// CheckHandler will log information about the failures and abort when it is
+// destructed.
 class CheckHandler {
 public:
-  CheckHandler(const char* check, const char* file, const char* func, int line)
+  CheckHandler(std::string_view check, std::string_view file,
+               std::string_view func, int line)
      : log_(GetErrorLogInstance()) {
    log_ << file << ":" << line << ": " << func << ": Check `" << check
         << "' failed. ";
@@ -35,10 +56,17 @@ class CheckHandler {

  LogType& GetLog() { return log_; }

+#if defined(COMPILER_MSVC)
+#pragma warning(push)
+#pragma warning(disable : 4722)
+#endif
  BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
-    log_ << std::endl;
+    log_ << '\n';
    CallAbortHandler();
  }
+#if defined(COMPILER_MSVC)
+#pragma warning(pop)
+#endif

  CheckHandler& operator=(const CheckHandler&) = delete;
  CheckHandler(const CheckHandler&) = delete;
@@ -51,32 +79,34 @@ class CheckHandler {
 }  // end namespace internal
 }  // end namespace benchmark

-// The CHECK macro returns a std::ostream object that can have extra information
-// written to it.
+// The BM_CHECK macro returns a std::ostream object that can have extra
+// information written to it.
 #ifndef NDEBUG
-#define CHECK(b)                                                             \
-  (b ? ::benchmark::internal::GetNullLogInstance()                           \
-     : ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \
+#define BM_CHECK(b)                                          \
+  (b ? ::benchmark::internal::GetNullLogInstance()           \
+     : ::benchmark::internal::CheckHandler(                  \
+           std::string_view(#b), std::string_view(__FILE__), \
+           std::string_view(__func__), __LINE__)             \
           .GetLog())
 #else
-#define CHECK(b) ::benchmark::internal::GetNullLogInstance()
+#define BM_CHECK(b) ::benchmark::internal::GetNullLogInstance()
 #endif

 // clang-format off
 // preserve whitespacing between operators for alignment
-#define CHECK_EQ(a, b) CHECK((a) == (b))
-#define CHECK_NE(a, b) CHECK((a) != (b))
-#define CHECK_GE(a, b) CHECK((a) >= (b))
-#define CHECK_LE(a, b) CHECK((a) <= (b))
-#define CHECK_GT(a, b) CHECK((a) > (b))
-#define CHECK_LT(a, b) CHECK((a) < (b))
+#define BM_CHECK_EQ(a, b) BM_CHECK((a) == (b))
+#define BM_CHECK_NE(a, b) BM_CHECK((a) != (b))
+#define BM_CHECK_GE(a, b) BM_CHECK((a) >= (b))
+#define BM_CHECK_LE(a, b) BM_CHECK((a) <= (b))
+#define BM_CHECK_GT(a, b) BM_CHECK((a) > (b))
+#define BM_CHECK_LT(a, b) BM_CHECK((a) < (b))

-#define CHECK_FLOAT_EQ(a, b, eps) CHECK(std::fabs((a) - (b)) <  (eps))
-#define CHECK_FLOAT_NE(a, b, eps) CHECK(std::fabs((a) - (b)) >= (eps))
-#define CHECK_FLOAT_GE(a, b, eps) CHECK((a) - (b) > -(eps))
-#define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps))
-#define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) >  (eps))
-#define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) >  (eps))
+#define BM_CHECK_FLOAT_EQ(a, b, eps) BM_CHECK(std::fabs((a) - (b)) <  (eps))
+#define BM_CHECK_FLOAT_NE(a, b, eps) BM_CHECK(std::fabs((a) - (b)) >= (eps))
+#define BM_CHECK_FLOAT_GE(a, b, eps) BM_CHECK((a) - (b) > -(eps))
+#define BM_CHECK_FLOAT_LE(a, b, eps) BM_CHECK((b) - (a) > -(eps))
+#define BM_CHECK_FLOAT_GT(a, b, eps) BM_CHECK((a) - (b) >  (eps))
+#define BM_CHECK_FLOAT_LT(a, b, eps) BM_CHECK((b) - (a) >  (eps))
 //clang-format on

 #endif  // CHECK_H_
--- a/third_party/benchmark/src/colorprint.cc
+++ b/third_party/benchmark/src/colorprint.cc
@@ -25,8 +25,8 @@
 #include "internal_macros.h"

 #ifdef BENCHMARK_OS_WINDOWS
-#include <windows.h>
 #include <io.h>
+#include <windows.h>
 #else
 #include <unistd.h>
 #endif  // BENCHMARK_OS_WINDOWS
@@ -94,20 +94,20 @@ std::string FormatString(const char* msg, va_list args) {
  va_end(args_cp);

  // currently there is no error handling for failure, so this is hack.
-  CHECK(ret >= 0);
+  BM_CHECK(ret >= 0);

-  if (ret == 0)  // handle empty expansion
+  if (ret == 0) {  // handle empty expansion
    return {};
-  else if (static_cast<size_t>(ret) < size)
-    return local_buff;
-  else {
-    // we did not provide a long enough buffer on our first attempt.
-    size = (size_t)ret + 1;  // + 1 for the null byte
-    std::unique_ptr<char[]> buff(new char[size]);
-    ret = vsnprintf(buff.get(), size, msg, args);
-    CHECK(ret > 0 && ((size_t)ret) < size);
-    return buff.get();
  }
+  if (static_cast<size_t>(ret) < size) {
+    return local_buff;
+  }
+  // we did not provide a long enough buffer on our first attempt.
+  size = static_cast<size_t>(ret) + 1;  // + 1 for the null byte
+  std::unique_ptr<char[]> buff(new char[size]);
+  ret = vsnprintf(buff.get(), size, msg, args);
+  BM_CHECK(ret > 0 && (static_cast<size_t>(ret)) < size);
+  return buff.get();
 }

 std::string FormatString(const char* msg, ...) {
@@ -135,22 +135,30 @@ void ColorPrintf(std::ostream& out, LogColor color, const char* fmt,
  // Gets the current text color.
  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
-  const WORD old_color_attrs = buffer_info.wAttributes;
+  const WORD original_color_attrs = buffer_info.wAttributes;

  // We need to flush the stream buffers into the console before each
  // SetConsoleTextAttribute call lest it affect the text that is already
  // printed but has not yet reached the console.
-  fflush(stdout);
-  SetConsoleTextAttribute(stdout_handle,
-                          GetPlatformColorCode(color) | FOREGROUND_INTENSITY);
-  vprintf(fmt, args);
+  out.flush();

-  fflush(stdout);
-  // Restores the text color.
-  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+  const WORD original_background_attrs =
+      original_color_attrs & (BACKGROUND_RED | BACKGROUND_GREEN |
+                              BACKGROUND_BLUE | BACKGROUND_INTENSITY);
+
+  SetConsoleTextAttribute(stdout_handle, GetPlatformColorCode(color) |
+                                             FOREGROUND_INTENSITY |
+                                             original_background_attrs);
+  out << FormatString(fmt, args);
+
+  out.flush();
+  // Restores the text and background color.
+  SetConsoleTextAttribute(stdout_handle, original_color_attrs);
 #else
  const char* color_code = GetPlatformColorCode(color);
-  if (color_code) out << FormatString("\033[0;3%sm", color_code);
+  if (color_code != nullptr) {
+    out << FormatString("\033[0;3%sm", color_code);
+  }
  out << FormatString(fmt, args) << "\033[m";
 #endif
 }
@@ -163,19 +171,31 @@ bool IsColorTerminal() {
 #else
  // On non-Windows platforms, we rely on the TERM variable. This list of
  // supported TERM values is copied from Google Test:
-  // <https://github.com/google/googletest/blob/master/googletest/src/gtest.cc#L2925>.
+  // <https://github.com/google/googletest/blob/v1.13.0/googletest/src/gtest.cc#L3225-L3259>.
  const char* const SUPPORTED_TERM_VALUES[] = {
-      "xterm",         "xterm-color",     "xterm-256color",
-      "screen",        "screen-256color", "tmux",
-      "tmux-256color", "rxvt-unicode",    "rxvt-unicode-256color",
-      "linux",         "cygwin",
+      "xterm",
+      "xterm-color",
+      "xterm-256color",
+      "screen",
+      "screen-256color",
+      "tmux",
+      "tmux-256color",
+      "rxvt-unicode",
+      "rxvt-unicode-256color",
+      "linux",
+      "cygwin",
+      "xterm-kitty",
+      "alacritty",
+      "foot",
+      "foot-extra",
+      "wezterm",
  };

  const char* const term = getenv("TERM");

  bool term_supports_color = false;
  for (const char* candidate : SUPPORTED_TERM_VALUES) {
-    if (term && 0 == strcmp(term, candidate)) {
+    if ((term != nullptr) && 0 == strcmp(term, candidate)) {
      term_supports_color = true;
      break;
    }
--- a/third_party/benchmark/src/commandlineflags.cc
+++ b/third_party/benchmark/src/commandlineflags.cc
@@ -14,13 +14,20 @@

 #include "commandlineflags.h"

+#include <algorithm>
 #include <cctype>
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
 #include <limits>
+#include <map>
+#include <utility>
+
+#include "../src/string_util.h"

 namespace benchmark {
+namespace {
+
 // Parses 'str' for a 32-bit signed integer.  If successful, writes
 // the result to *value and returns true; otherwise leaves *value
 // unchanged and returns false.
@@ -75,57 +82,101 @@ bool ParseDouble(const std::string& src_text, const char* str, double* value) {
  return true;
 }

+// Parses 'str' into KV pairs. If successful, writes the result to *value and
+// returns true; otherwise leaves *value unchanged and returns false.
+bool ParseKvPairs(const std::string& src_text, const char* str,
+                  std::map<std::string, std::string>* value) {
+  std::map<std::string, std::string> kvs;
+  for (const auto& kvpair : StrSplit(str, ',')) {
+    const auto kv = StrSplit(kvpair, '=');
+    if (kv.size() != 2) {
+      std::cerr << src_text << " is expected to be a comma-separated list of "
+                << "<key>=<value> strings, but actually has value \"" << str
+                << "\".\n";
+      return false;
+    }
+    if (!kvs.emplace(kv[0], kv[1]).second) {
+      std::cerr << src_text << " is expected to contain unique keys but key \""
+                << kv[0] << "\" was repeated.\n";
+      return false;
+    }
+  }
+
+  *value = kvs;
+  return true;
+}
+
 // Returns the name of the environment variable corresponding to the
 // given flag.  For example, FlagToEnvVar("foo") will return
 // "BENCHMARK_FOO" in the open-source version.
-static std::string FlagToEnvVar(const char* flag) {
+std::string FlagToEnvVar(const char* flag) {
  const std::string flag_str(flag);

  std::string env_var;
-  for (size_t i = 0; i != flag_str.length(); ++i)
+  for (size_t i = 0; i != flag_str.length(); ++i) {
    env_var += static_cast<char>(::toupper(flag_str.c_str()[i]));
-
-  return "BENCHMARK_" + env_var;
-}
-
-// Reads and returns the Boolean environment variable corresponding to
-// the given flag; if it's not set, returns default_value.
-//
-// The value is considered true iff it's not "0".
-bool BoolFromEnv(const char* flag, bool default_value) {
-  const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = getenv(env_var.c_str());
-  return string_value == nullptr ? default_value
-                                 : strcmp(string_value, "0") != 0;
-}
-
-// Reads and returns a 32-bit integer stored in the environment
-// variable corresponding to the given flag; if it isn't set or
-// doesn't represent a valid 32-bit integer, returns default_value.
-int32_t Int32FromEnv(const char* flag, int32_t default_value) {
-  const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = getenv(env_var.c_str());
-  if (string_value == nullptr) {
-    // The environment variable is not set.
-    return default_value;
  }

-  int32_t result = default_value;
-  if (!ParseInt32(std::string("Environment variable ") + env_var, string_value,
-                  &result)) {
-    std::cout << "The default value " << default_value << " is used.\n";
-    return default_value;
-  }
-
-  return result;
+  return env_var;
 }

-// Reads and returns the string environment variable corresponding to
-// the given flag; if it's not set, returns default_value.
-const char* StringFromEnv(const char* flag, const char* default_value) {
+}  // namespace
+
+BENCHMARK_EXPORT
+bool BoolFromEnv(const char* flag, bool default_val) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value_str = getenv(env_var.c_str());
+  return value_str == nullptr ? default_val : IsTruthyFlagValue(value_str);
+}
+
+BENCHMARK_EXPORT
+int32_t Int32FromEnv(const char* flag, int32_t default_val) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value_str = getenv(env_var.c_str());
+  int32_t value = default_val;
+  if (value_str == nullptr ||
+      !ParseInt32(std::string("Environment variable ") + env_var, value_str,
+                  &value)) {
+    return default_val;
+  }
+  return value;
+}
+
+BENCHMARK_EXPORT
+double DoubleFromEnv(const char* flag, double default_val) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value_str = getenv(env_var.c_str());
+  double value = default_val;
+  if (value_str == nullptr ||
+      !ParseDouble(std::string("Environment variable ") + env_var, value_str,
+                   &value)) {
+    return default_val;
+  }
+  return value;
+}
+
+BENCHMARK_EXPORT
+const char* StringFromEnv(const char* flag, const char* default_val) {
  const std::string env_var = FlagToEnvVar(flag);
  const char* const value = getenv(env_var.c_str());
-  return value == nullptr ? default_value : value;
+  return value == nullptr ? default_val : value;
+}
+
+BENCHMARK_EXPORT
+std::map<std::string, std::string> KvPairsFromEnv(
+    const char* flag, std::map<std::string, std::string> default_val) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value_str = getenv(env_var.c_str());
+
+  if (value_str == nullptr) {
+    return default_val;
+  }
+
+  std::map<std::string, std::string> value;
+  if (!ParseKvPairs("Environment variable " + env_var, value_str, &value)) {
+    return default_val;
+  }
+  return value;
 }

 // Parses a string as a command line flag.  The string should have
@@ -136,83 +187,135 @@ const char* StringFromEnv(const char* flag, const char* default_value) {
 const char* ParseFlagValue(const char* str, const char* flag,
                           bool def_optional) {
  // str and flag must not be nullptr.
-  if (str == nullptr || flag == nullptr) return nullptr;
+  if (str == nullptr || flag == nullptr) {
+    return nullptr;
+  }

  // The flag must start with "--".
  const std::string flag_str = std::string("--") + std::string(flag);
  const size_t flag_len = flag_str.length();
-  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
+  if (strncmp(str, flag_str.c_str(), flag_len) != 0) {
+    return nullptr;
+  }

  // Skips the flag name.
  const char* flag_end = str + flag_len;

  // When def_optional is true, it's OK to not have a "=value" part.
-  if (def_optional && (flag_end[0] == '\0')) return flag_end;
+  if (def_optional && (flag_end[0] == '\0')) {
+    return flag_end;
+  }

  // If def_optional is true and there are more characters after the
  // flag name, or if def_optional is false, there must be a '=' after
  // the flag name.
-  if (flag_end[0] != '=') return nullptr;
+  if (flag_end[0] != '=') {
+    return nullptr;
+  }

  // Returns the string after "=".
  return flag_end + 1;
 }

+BENCHMARK_EXPORT
 bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
  // Gets the value of the flag as a string.
  const char* const value_str = ParseFlagValue(str, flag, true);

  // Aborts if the parsing failed.
-  if (value_str == nullptr) return false;
+  if (value_str == nullptr) {
+    return false;
+  }

  // Converts the string value to a bool.
  *value = IsTruthyFlagValue(value_str);
  return true;
 }

+BENCHMARK_EXPORT
 bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
  // Gets the value of the flag as a string.
  const char* const value_str = ParseFlagValue(str, flag, false);

  // Aborts if the parsing failed.
-  if (value_str == nullptr) return false;
+  if (value_str == nullptr) {
+    return false;
+  }

  // Sets *value to the value of the flag.
  return ParseInt32(std::string("The value of flag --") + flag, value_str,
                    value);
 }

+BENCHMARK_EXPORT
 bool ParseDoubleFlag(const char* str, const char* flag, double* value) {
  // Gets the value of the flag as a string.
  const char* const value_str = ParseFlagValue(str, flag, false);

  // Aborts if the parsing failed.
-  if (value_str == nullptr) return false;
+  if (value_str == nullptr) {
+    return false;
+  }

  // Sets *value to the value of the flag.
  return ParseDouble(std::string("The value of flag --") + flag, value_str,
                     value);
 }

+BENCHMARK_EXPORT
 bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
  // Gets the value of the flag as a string.
  const char* const value_str = ParseFlagValue(str, flag, false);

  // Aborts if the parsing failed.
-  if (value_str == nullptr) return false;
+  if (value_str == nullptr) {
+    return false;
+  }

  *value = value_str;
  return true;
 }

+BENCHMARK_EXPORT
+bool ParseKeyValueFlag(const char* str, const char* flag,
+                       std::map<std::string, std::string>* value) {
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  if (value_str == nullptr) {
+    return false;
+  }
+
+  for (const auto& kvpair : StrSplit(value_str, ',')) {
+    const auto kv = StrSplit(kvpair, '=');
+    if (kv.size() != 2) {
+      return false;
+    }
+    value->emplace(kv[0], kv[1]);
+  }
+
+  return true;
+}
+
+BENCHMARK_EXPORT
 bool IsFlag(const char* str, const char* flag) {
  return (ParseFlagValue(str, flag, true) != nullptr);
 }

+BENCHMARK_EXPORT
 bool IsTruthyFlagValue(const std::string& value) {
-  if (value.empty()) return true;
-  char ch = value[0];
-  return isalnum(ch) &&
-         !(ch == '0' || ch == 'f' || ch == 'F' || ch == 'n' || ch == 'N');
+  if (value.size() == 1) {
+    char v = value[0];
+    return isalnum(v) &&
+           !(v == '0' || v == 'f' || v == 'F' || v == 'n' || v == 'N');
+  }
+  if (!value.empty()) {
+    std::string value_lower(value);
+    std::transform(value_lower.begin(), value_lower.end(), value_lower.begin(),
+                   [](char c) { return static_cast<char>(::tolower(c)); });
+    return !(value_lower == "false" || value_lower == "no" ||
+             value_lower == "off");
+  }
+  return true;
 }
+
 }  // end namespace benchmark
--- a/third_party/benchmark/src/commandlineflags.h
+++ b/third_party/benchmark/src/commandlineflags.h
@@ -2,39 +2,84 @@
 #define BENCHMARK_COMMANDLINEFLAGS_H_

 #include <cstdint>
+#include <map>
 #include <string>

+#include "benchmark/export.h"
+
 // Macro for referencing flags.
 #define FLAG(name) FLAGS_##name

 // Macros for declaring flags.
-#define DECLARE_bool(name) extern bool FLAG(name)
-#define DECLARE_int32(name) extern int32_t FLAG(name)
-#define DECLARE_int64(name) extern int64_t FLAG(name)
-#define DECLARE_double(name) extern double FLAG(name)
-#define DECLARE_string(name) extern std::string FLAG(name)
+// NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables)
+#define BM_DECLARE_bool(name) BENCHMARK_EXPORT extern bool FLAG(name)
+#define BM_DECLARE_int32(name) BENCHMARK_EXPORT extern int32_t FLAG(name)
+#define BM_DECLARE_double(name) BENCHMARK_EXPORT extern double FLAG(name)
+#define BM_DECLARE_string(name) BENCHMARK_EXPORT extern std::string FLAG(name)
+#define BM_DECLARE_kvpairs(name) \
+  BENCHMARK_EXPORT extern std::map<std::string, std::string> FLAG(name)
+// NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables)

 // Macros for defining flags.
-#define DEFINE_bool(name, default_val, doc) bool FLAG(name) = (default_val)
-#define DEFINE_int32(name, default_val, doc) int32_t FLAG(name) = (default_val)
-#define DEFINE_int64(name, default_val, doc) int64_t FLAG(name) = (default_val)
-#define DEFINE_double(name, default_val, doc) double FLAG(name) = (default_val)
-#define DEFINE_string(name, default_val, doc) \
-  std::string FLAG(name) = (default_val)
+// NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables)
+#define BM_DEFINE_bool(name, default_val) \
+  BENCHMARK_EXPORT bool FLAG(name) = benchmark::BoolFromEnv(#name, default_val)
+#define BM_DEFINE_int32(name, default_val) \
+  BENCHMARK_EXPORT int32_t FLAG(name) =    \
+      benchmark::Int32FromEnv(#name, default_val)
+#define BM_DEFINE_double(name, default_val) \
+  BENCHMARK_EXPORT double FLAG(name) =      \
+      benchmark::DoubleFromEnv(#name, default_val)
+#define BM_DEFINE_string(name, default_val) \
+  BENCHMARK_EXPORT std::string FLAG(name) = \
+      benchmark::StringFromEnv(#name, default_val)
+#define BM_DEFINE_kvpairs(name, default_val)                       \
+  BENCHMARK_EXPORT std::map<std::string, std::string> FLAG(name) = \
+      benchmark::KvPairsFromEnv(#name, default_val)
+// NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables)

 namespace benchmark {
-// Parses 'str' for a 32-bit signed integer.  If successful, writes the result
-// to *value and returns true; otherwise leaves *value unchanged and returns
-// false.
-bool ParseInt32(const std::string& src_text, const char* str, int32_t* value);

-// Parses a bool/Int32/string from the environment variable
-// corresponding to the given Google Test flag.
+// Parses a bool from the environment variable corresponding to the given flag.
+//
+// If the variable exists, returns IsTruthyFlagValue() value;  if not,
+// returns the given default value.
+BENCHMARK_EXPORT
 bool BoolFromEnv(const char* flag, bool default_val);
+
+// Parses an Int32 from the environment variable corresponding to the given
+// flag.
+//
+// If the variable exists, returns ParseInt32() value;  if not, returns
+// the given default value.
+BENCHMARK_EXPORT
 int32_t Int32FromEnv(const char* flag, int32_t default_val);
+
+// Parses an Double from the environment variable corresponding to the given
+// flag.
+//
+// If the variable exists, returns ParseDouble();  if not, returns
+// the given default value.
+BENCHMARK_EXPORT
 double DoubleFromEnv(const char* flag, double default_val);
+
+// Parses a string from the environment variable corresponding to the given
+// flag.
+//
+// If variable exists, returns its value;  if not, returns
+// the given default value.
+BENCHMARK_EXPORT
 const char* StringFromEnv(const char* flag, const char* default_val);

+// Parses a set of kvpairs from the environment variable corresponding to the
+// given flag.
+//
+// If variable exists, returns its value;  if not, returns
+// the given default value.
+BENCHMARK_EXPORT
+std::map<std::string, std::string> KvPairsFromEnv(
+    const char* flag, std::map<std::string, std::string> default_val);
+
 // Parses a string for a bool flag, in the form of either
 // "--flag=value" or "--flag".
 //
@@ -44,36 +89,49 @@ const char* StringFromEnv(const char* flag, const char* default_val);
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
+BENCHMARK_EXPORT
 bool ParseBoolFlag(const char* str, const char* flag, bool* value);

-// Parses a string for an Int32 flag, in the form of
-// "--flag=value".
+// Parses a string for an Int32 flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
+BENCHMARK_EXPORT
 bool ParseInt32Flag(const char* str, const char* flag, int32_t* value);

-// Parses a string for a Double flag, in the form of
-// "--flag=value".
+// Parses a string for a Double flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
+BENCHMARK_EXPORT
 bool ParseDoubleFlag(const char* str, const char* flag, double* value);

-// Parses a string for a string flag, in the form of
-// "--flag=value".
+// Parses a string for a string flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
+BENCHMARK_EXPORT
 bool ParseStringFlag(const char* str, const char* flag, std::string* value);

+// Parses a string for a kvpairs flag in the form "--flag=key=value,key=value"
+//
+// On success, stores the value of the flag in *value and returns true. On
+// failure returns false, though *value may have been mutated.
+BENCHMARK_EXPORT
+bool ParseKeyValueFlag(const char* str, const char* flag,
+                       std::map<std::string, std::string>* value);
+
 // Returns true if the string matches the flag.
+BENCHMARK_EXPORT
 bool IsFlag(const char* str, const char* flag);

 // Returns true unless value starts with one of: '0', 'f', 'F', 'n' or 'N', or
-// some non-alphanumeric character. As a special case, also returns true if
-// value is the empty string.
+// some non-alphanumeric character. Also returns false if the value matches
+// one of 'no', 'false', 'off' (case-insensitive). As a special case, also
+// returns true if value is the empty string.
+BENCHMARK_EXPORT
 bool IsTruthyFlagValue(const std::string& value);
+
 }  // end namespace benchmark

 #endif  // BENCHMARK_COMMANDLINEFLAGS_H_
--- a/third_party/benchmark/src/complexity.cc
+++ b/third_party/benchmark/src/complexity.cc
@@ -15,34 +15,36 @@
 // Source project : https://github.com/ismaelJimenez/cpp.leastsq
 // Adapted to be used with google benchmark

-#include "benchmark/benchmark.h"
+#include "complexity.h"

 #include <algorithm>
 #include <cmath>
+
+#include "benchmark/benchmark.h"
 #include "check.h"
-#include "complexity.h"

 namespace benchmark {

 // Internal function to calculate the different scalability forms
 BigOFunc* FittingCurve(BigO complexity) {
-  static const double kLog2E = 1.44269504088896340736;
  switch (complexity) {
    case oN:
-      return [](int64_t n) -> double { return static_cast<double>(n); };
+      return [](IterationCount n) -> double { return static_cast<double>(n); };
    case oNSquared:
-      return [](int64_t n) -> double { return std::pow(n, 2); };
+      return [](IterationCount n) -> double { return std::pow(n, 2); };
    case oNCubed:
-      return [](int64_t n) -> double { return std::pow(n, 3); };
+      return [](IterationCount n) -> double { return std::pow(n, 3); };
    case oLogN:
-      /* Note: can't use log2 because Android's GNU STL lacks it */
-      return [](int64_t n) { return kLog2E * log(static_cast<double>(n)); };
+      return [](IterationCount n) -> double {
+        return std::log2(static_cast<double>(n));
+      };
    case oNLogN:
-      /* Note: can't use log2 because Android's GNU STL lacks it */
-      return [](int64_t n) { return kLog2E * n * log(static_cast<double>(n)); };
+      return [](IterationCount n) -> double {
+        return static_cast<double>(n) * std::log2(static_cast<double>(n));
+      };
    case o1:
    default:
-      return [](int64_t) { return 1.0; };
+      return [](IterationCount) { return 1.0; };
  }
 }

@@ -71,15 +73,14 @@ std::string GetBigOString(BigO complexity) {
 // given by the lambda expression.
 //   - n             : Vector containing the size of the benchmark tests.
 //   - time          : Vector containing the times for the benchmark tests.
-//   - fitting_curve : lambda expression (e.g. [](int64_t n) {return n; };).
+//   - fitting_curve : lambda expression (e.g. [](ComplexityN n) {return n; };).

-// For a deeper explanation on the algorithm logic, look the README file at
-// http://github.com/ismaelJimenez/Minimal-Cpp-Least-Squared-Fit
+// For a deeper explanation on the algorithm logic, please refer to
+// https://en.wikipedia.org/wiki/Least_squares#Least_squares,_regression_analysis_and_statistics

-LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
+LeastSq MinimalLeastSq(const std::vector<ComplexityN>& n,
                       const std::vector<double>& time,
                       BigOFunc* fitting_curve) {
-  double sigma_gn = 0.0;
  double sigma_gn_squared = 0.0;
  double sigma_time = 0.0;
  double sigma_time_gn = 0.0;
@@ -87,7 +88,6 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
  // Calculate least square fitting parameter
  for (size_t i = 0; i < n.size(); ++i) {
    double gn_i = fitting_curve(n[i]);
-    sigma_gn += gn_i;
    sigma_gn_squared += gn_i * gn_i;
    sigma_time += time[i];
    sigma_time_gn += time[i] * gn_i;
@@ -103,12 +103,12 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
  double rms = 0.0;
  for (size_t i = 0; i < n.size(); ++i) {
    double fit = result.coef * fitting_curve(n[i]);
-    rms += pow((time[i] - fit), 2);
+    rms += std::pow((time[i] - fit), 2);
  }

  // Normalized RMS by the mean of the observed values
-  double mean = sigma_time / n.size();
-  result.rms = sqrt(rms / n.size()) / mean;
+  double mean = sigma_time / static_cast<double>(n.size());
+  result.rms = std::sqrt(rms / static_cast<double>(n.size())) / mean;

  return result;
 }
@@ -120,12 +120,12 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
 //   - complexity : If different than oAuto, the fitting curve will stick to
 //                  this one. If it is oAuto, it will be calculated the best
 //                  fitting curve.
-LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
+LeastSq MinimalLeastSq(const std::vector<ComplexityN>& n,
                       const std::vector<double>& time, const BigO complexity) {
-  CHECK_EQ(n.size(), time.size());
-  CHECK_GE(n.size(), 2);  // Do not compute fitting curve is less than two
-                          // benchmark runs are given
-  CHECK_NE(complexity, oNone);
+  BM_CHECK_EQ(n.size(), time.size());
+  BM_CHECK_GE(n.size(), 2);  // Do not compute fitting curve is less than two
+                             // benchmark runs are given
+  BM_CHECK_NE(complexity, oNone);

  LeastSq best_fit;

@@ -157,19 +157,24 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
  typedef BenchmarkReporter::Run Run;
  std::vector<Run> results;

-  if (reports.size() < 2) return results;
+  if (reports.size() < 2) {
+    return results;
+  }

  // Accumulators.
-  std::vector<int64_t> n;
+  std::vector<ComplexityN> n;
  std::vector<double> real_time;
  std::vector<double> cpu_time;

  // Populate the accumulators.
  for (const Run& run : reports) {
-    CHECK_GT(run.complexity_n, 0) << "Did you forget to call SetComplexityN?";
+    BM_CHECK_GT(run.complexity_n, 0)
+        << "Did you forget to call SetComplexityN?";
    n.push_back(run.complexity_n);
-    real_time.push_back(run.real_accumulated_time / run.iterations);
-    cpu_time.push_back(run.cpu_accumulated_time / run.iterations);
+    real_time.push_back(run.real_accumulated_time /
+                        static_cast<double>(run.iterations));
+    cpu_time.push_back(run.cpu_accumulated_time /
+                       static_cast<double>(run.iterations));
  }

  LeastSq result_cpu;
@@ -179,18 +184,37 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
    result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity_lambda);
    result_real = MinimalLeastSq(n, real_time, reports[0].complexity_lambda);
  } else {
-    result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity);
-    result_real = MinimalLeastSq(n, real_time, result_cpu.complexity);
+    const BigO* InitialBigO = &reports[0].complexity;
+    const bool use_real_time_for_initial_big_o =
+        reports[0].use_real_time_for_initial_big_o;
+    if (use_real_time_for_initial_big_o) {
+      result_real = MinimalLeastSq(n, real_time, *InitialBigO);
+      InitialBigO = &result_real.complexity;
+      // The Big-O complexity for CPU time must have the same Big-O function!
+    }
+    result_cpu = MinimalLeastSq(n, cpu_time, *InitialBigO);
+    InitialBigO = &result_cpu.complexity;
+    if (!use_real_time_for_initial_big_o) {
+      result_real = MinimalLeastSq(n, real_time, *InitialBigO);
+    }
  }

-  std::string run_name = reports[0].benchmark_name().substr(
-      0, reports[0].benchmark_name().find('/'));
+  // Drop the 'args' when reporting complexity.
+  auto run_name = reports[0].run_name;
+  run_name.args.clear();

  // Get the data from the accumulator to BenchmarkReporter::Run's.
  Run big_o;
  big_o.run_name = run_name;
+  big_o.family_index = reports[0].family_index;
+  big_o.per_family_instance_index = reports[0].per_family_instance_index;
  big_o.run_type = BenchmarkReporter::Run::RT_Aggregate;
+  big_o.repetitions = reports[0].repetitions;
+  big_o.repetition_index = Run::no_repetition_index;
+  big_o.threads = reports[0].threads;
  big_o.aggregate_name = "BigO";
+  big_o.aggregate_unit = StatisticUnit::kTime;
+  big_o.report_label = reports[0].report_label;
  big_o.iterations = 0;
  big_o.real_accumulated_time = result_real.coef;
  big_o.cpu_accumulated_time = result_cpu.coef;
@@ -207,11 +231,16 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
  // Only add label to mean/stddev if it is same for all runs
  Run rms;
  rms.run_name = run_name;
-  big_o.report_label = reports[0].report_label;
+  rms.family_index = reports[0].family_index;
+  rms.per_family_instance_index = reports[0].per_family_instance_index;
  rms.run_type = BenchmarkReporter::Run::RT_Aggregate;
  rms.aggregate_name = "RMS";
+  rms.aggregate_unit = StatisticUnit::kPercentage;
  rms.report_label = big_o.report_label;
  rms.iterations = 0;
+  rms.repetition_index = Run::no_repetition_index;
+  rms.repetitions = reports[0].repetitions;
+  rms.threads = reports[0].threads;
  rms.real_accumulated_time = result_real.rms / multiplier;
  rms.cpu_accumulated_time = result_cpu.rms / multiplier;
  rms.report_rms = true;
--- a/third_party/benchmark/src/complexity.h
+++ b/third_party/benchmark/src/complexity.h
@@ -31,7 +31,7 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
    const std::vector<BenchmarkReporter::Run>& reports);

 // This data structure will contain the result returned by MinimalLeastSq
-//   - coef        : Estimated coeficient for the high-order term as
+//   - coef        : Estimated coefficient for the high-order term as
 //                   interpolated from data.
 //   - rms         : Normalized Root Mean Squared Error.
 //   - complexity  : Scalability form (e.g. oN, oNLogN). In case a scalability
--- a/third_party/benchmark/src/console_reporter.cc
+++ b/third_party/benchmark/src/console_reporter.cc
@@ -12,27 +12,28 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "benchmark/benchmark.h"
-#include "complexity.h"
-#include "counter.h"
-
 #include <algorithm>
 #include <cstdint>
 #include <cstdio>
+#include <cstring>
 #include <iostream>
 #include <string>
 #include <tuple>
 #include <vector>

+#include "benchmark/benchmark.h"
 #include "check.h"
 #include "colorprint.h"
 #include "commandlineflags.h"
+#include "complexity.h"
+#include "counter.h"
 #include "internal_macros.h"
 #include "string_util.h"
 #include "timers.h"

 namespace benchmark {

+BENCHMARK_EXPORT
 bool ConsoleReporter::ReportContext(const Context& context) {
  name_field_width_ = context.name_field_width;
  printed_header_ = false;
@@ -41,34 +42,40 @@ bool ConsoleReporter::ReportContext(const Context& context) {
  PrintBasicContext(&GetErrorStream(), context);

 #ifdef BENCHMARK_OS_WINDOWS
-  if ((output_options_ & OO_Color) && &std::cout != &GetOutputStream()) {
-    GetErrorStream()
-        << "Color printing is only supported for stdout on windows."
-           " Disabling color printing\n";
-    output_options_ = static_cast< OutputOptions >(output_options_ & ~OO_Color);
+  if ((output_options_ & OO_Color)) {
+    auto stdOutBuf = std::cout.rdbuf();
+    auto outStreamBuf = GetOutputStream().rdbuf();
+    if (stdOutBuf != outStreamBuf) {
+      GetErrorStream()
+          << "Color printing is only supported for stdout on windows."
+             " Disabling color printing\n";
+      output_options_ = static_cast<OutputOptions>(output_options_ & ~OO_Color);
+    }
  }
 #endif

  return true;
 }

+BENCHMARK_EXPORT
 void ConsoleReporter::PrintHeader(const Run& run) {
-  std::string str = FormatString("%-*s %13s %13s %10s", static_cast<int>(name_field_width_),
-                                 "Benchmark", "Time", "CPU", "Iterations");
-  if(!run.counters.empty()) {
-    if(output_options_ & OO_Tabular) {
-      for(auto const& c : run.counters) {
+  std::string str =
+      FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
+                   "Benchmark", "Time", "CPU", "Iterations");
+  if (!run.counters.empty()) {
+    if ((output_options_ & OO_Tabular) != 0) {
+      for (auto const& c : run.counters) {
        str += FormatString(" %10s", c.first.c_str());
      }
    } else {
      str += " UserCounters...";
    }
  }
-  str += "\n";
  std::string line = std::string(str.length(), '-');
-  GetOutputStream() << line << "\n" << str << line << "\n";
+  GetOutputStream() << line << "\n" << str << "\n" << line << "\n";
 }

+BENCHMARK_EXPORT
 void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
  for (const auto& run : reports) {
    // print the header:
@@ -76,7 +83,7 @@ void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
    bool print_header = !printed_header_;
    // --- or if the format is tabular and this run
    //     has different fields from the prev header
-    print_header |= (output_options_ & OO_Tabular) &&
+    print_header |= ((output_options_ & OO_Tabular) != 0) &&
                    (!internal::SameNames(run.counters, prev_counters_));
    if (print_header) {
      printed_header_ = true;
@@ -90,65 +97,108 @@ void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
  }
 }

-static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt,
-                             ...) {
+static void IgnoreColorPrint(std::ostream& out, LogColor /*unused*/,
+                             const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  out << FormatString(fmt, args);
  va_end(args);
 }

+static std::string FormatTime(double time) {
+  // For the time columns of the console printer 13 digits are reserved. One of
+  // them is a space and max two of them are the time unit (e.g ns). That puts
+  // us at 10 digits usable for the number.
+  // Align decimal places...
+  if (time < 1.0) {
+    return FormatString("%10.3f", time);
+  }
+  if (time < 10.0) {
+    return FormatString("%10.2f", time);
+  }
+  if (time < 100.0) {
+    return FormatString("%10.1f", time);
+  }
+  // Assuming the time is at max 9.9999e+99 and we have 10 digits for the
+  // number, we get 10-1(.)-1(e)-1(sign)-2(exponent) = 5 digits to print.
+  if (time > 9999999999 /*max 10 digit number*/) {
+    return FormatString("%1.4e", time);
+  }
+  return FormatString("%10.0f", time);
+}
+
+BENCHMARK_EXPORT
 void ConsoleReporter::PrintRunData(const Run& result) {
  typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...);
  auto& Out = GetOutputStream();
-  PrinterFn* printer = (output_options_ & OO_Color) ?
-                         (PrinterFn*)ColorPrintf : IgnoreColorPrint;
+  PrinterFn* printer = (output_options_ & OO_Color) != 0
+                           ? static_cast<PrinterFn*>(ColorPrintf)
+                           : IgnoreColorPrint;
  auto name_color =
      (result.report_big_o || result.report_rms) ? COLOR_BLUE : COLOR_GREEN;
  printer(Out, name_color, "%-*s ", name_field_width_,
          result.benchmark_name().c_str());

-  if (result.error_occurred) {
+  if (internal::SkippedWithError == result.skipped) {
    printer(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'",
-            result.error_message.c_str());
+            result.skip_message.c_str());
+    printer(Out, COLOR_DEFAULT, "\n");
+    return;
+  }
+  if (internal::SkippedWithMessage == result.skipped) {
+    printer(Out, COLOR_WHITE, "SKIPPED: \'%s\'", result.skip_message.c_str());
    printer(Out, COLOR_DEFAULT, "\n");
    return;
  }

  const double real_time = result.GetAdjustedRealTime();
  const double cpu_time = result.GetAdjustedCPUTime();
+  const std::string real_time_str = FormatTime(real_time);
+  const std::string cpu_time_str = FormatTime(cpu_time);

  if (result.report_big_o) {
    std::string big_o = GetBigOString(result.complexity);
-    printer(Out, COLOR_YELLOW, "%10.2f %s %10.2f %s ", real_time, big_o.c_str(),
-            cpu_time, big_o.c_str());
+    printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time,
+            big_o.c_str(), cpu_time, big_o.c_str());
  } else if (result.report_rms) {
-    printer(Out, COLOR_YELLOW, "%10.0f %% %10.0f %% ", real_time * 100,
-            cpu_time * 100);
-  } else {
+    printer(Out, COLOR_YELLOW, "%10.0f %-4s %10.0f %-4s ", real_time * 100, "%",
+            cpu_time * 100, "%");
+  } else if (result.run_type != Run::RT_Aggregate ||
+             result.aggregate_unit == StatisticUnit::kTime) {
    const char* timeLabel = GetTimeUnitString(result.time_unit);
-    printer(Out, COLOR_YELLOW, "%10.0f %s %10.0f %s ", real_time, timeLabel,
-            cpu_time, timeLabel);
+    printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(),
+            timeLabel, cpu_time_str.c_str(), timeLabel);
+  } else {
+    assert(result.aggregate_unit == StatisticUnit::kPercentage);
+    printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ",
+            (100. * result.real_accumulated_time), "%",
+            (100. * result.cpu_accumulated_time), "%");
  }

  if (!result.report_big_o && !result.report_rms) {
    printer(Out, COLOR_CYAN, "%10lld", result.iterations);
  }

-  for (auto& c : result.counters) {
-    const std::size_t cNameLen = std::max(std::string::size_type(10),
-                                          c.first.length());
-    auto const& s = HumanReadableNumber(c.second.value, c.second.oneK);
-    if (output_options_ & OO_Tabular) {
-      if (c.second.flags & Counter::kIsRate) {
-        printer(Out, COLOR_DEFAULT, " %*s/s", cNameLen - 2, s.c_str());
-      } else {
-        printer(Out, COLOR_DEFAULT, " %*s", cNameLen, s.c_str());
-      }
+  for (const auto& c : result.counters) {
+    const std::size_t cNameLen =
+        std::max(static_cast<std::size_t>(10), c.first.length());
+    std::string s;
+    const char* unit = "";
+    if (result.run_type == Run::RT_Aggregate &&
+        result.aggregate_unit == StatisticUnit::kPercentage) {
+      s = StrFormat("%.2f", 100. * c.second.value);
+      unit = "%";
    } else {
-      const char* unit = (c.second.flags & Counter::kIsRate) ? "/s" : "";
-      printer(Out, COLOR_DEFAULT, " %s=%s%s", c.first.c_str(), s.c_str(),
+      s = HumanReadableNumber(c.second.value, c.second.oneK);
+      if ((c.second.flags & Counter::kIsRate) != 0) {
+        unit = (c.second.flags & Counter::kInvert) != 0 ? "s" : "/s";
+      }
+    }
+    if ((output_options_ & OO_Tabular) != 0) {
+      printer(Out, COLOR_DEFAULT, " %*s%s", cNameLen - strlen(unit), s.c_str(),
              unit);
+    } else {
+      printer(Out, COLOR_DEFAULT, " %s=%s%s", c.first.c_str(), s.c_str(), unit);
    }
  }

--- a/third_party/benchmark/src/counter.cc
+++ b/third_party/benchmark/src/counter.cc
@@ -17,25 +17,30 @@
 namespace benchmark {
 namespace internal {

-double Finish(Counter const& c, int64_t iterations, double cpu_time,
+double Finish(Counter const& c, IterationCount iterations, double cpu_time,
              double num_threads) {
  double v = c.value;
-  if (c.flags & Counter::kIsRate) {
+  if ((c.flags & Counter::kIsRate) != 0) {
    v /= cpu_time;
  }
-  if (c.flags & Counter::kAvgThreads) {
+  if ((c.flags & Counter::kAvgThreads) != 0) {
    v /= num_threads;
  }
-  if (c.flags & Counter::kIsIterationInvariant) {
-    v *= iterations;
+  if ((c.flags & Counter::kIsIterationInvariant) != 0) {
+    v *= static_cast<double>(iterations);
  }
-  if (c.flags & Counter::kAvgIterations) {
-    v /= iterations;
+  if ((c.flags & Counter::kAvgIterations) != 0) {
+    v /= static_cast<double>(iterations);
+  }
+
+  if ((c.flags & Counter::kInvert) != 0) {  // Invert is *always* last.
+    v = 1.0 / v;
  }
  return v;
 }

-void Finish(UserCounters* l, int64_t iterations, double cpu_time, double num_threads) {
+void Finish(UserCounters* l, IterationCount iterations, double cpu_time,
+            double num_threads) {
  for (auto& c : *l) {
    c.second.value = Finish(c.second, iterations, cpu_time, num_threads);
  }
@@ -59,7 +64,9 @@ void Increment(UserCounters* l, UserCounters const& r) {
 }

 bool SameNames(UserCounters const& l, UserCounters const& r) {
-  if (&l == &r) return true;
+  if (&l == &r) {
+    return true;
+  }
  if (l.size() != r.size()) {
    return false;
  }
--- a/third_party/benchmark/src/counter.h
+++ b/third_party/benchmark/src/counter.h
@@ -12,15 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#ifndef BENCHMARK_COUNTER_H_
+#define BENCHMARK_COUNTER_H_
+
 #include "benchmark/benchmark.h"

 namespace benchmark {

 // these counter-related functions are hidden to reduce API surface.
 namespace internal {
-void Finish(UserCounters* l, int64_t iterations, double time, double num_threads);
+void Finish(UserCounters* l, IterationCount iterations, double time,
+            double num_threads);
 void Increment(UserCounters* l, UserCounters const& r);
 bool SameNames(UserCounters const& l, UserCounters const& r);
 }  // end namespace internal

 }  // end namespace benchmark
+
+#endif  // BENCHMARK_COUNTER_H_
--- a/third_party/benchmark/src/csv_reporter.cc
+++ b/third_party/benchmark/src/csv_reporter.cc
@@ -12,9 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "benchmark/benchmark.h"
-#include "complexity.h"
-
 #include <algorithm>
 #include <cstdint>
 #include <iostream>
@@ -22,7 +19,9 @@
 #include <tuple>
 #include <vector>

+#include "benchmark/benchmark.h"
 #include "check.h"
+#include "complexity.h"
 #include "string_util.h"
 #include "timers.h"

@@ -37,11 +36,29 @@ std::vector<std::string> elements = {
    "error_occurred", "error_message"};
 }  // namespace

+std::string CsvEscape(const std::string& s) {
+  std::string tmp;
+  tmp.reserve(s.size() + 2);
+  for (char c : s) {
+    switch (c) {
+      case '"':
+        tmp += "\"\"";
+        break;
+      default:
+        tmp += c;
+        break;
+    }
+  }
+  return '"' + tmp + '"';
+}
+
+BENCHMARK_EXPORT
 bool CSVReporter::ReportContext(const Context& context) {
  PrintBasicContext(&GetErrorStream(), context);
  return true;
 }

+BENCHMARK_EXPORT
 void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
  std::ostream& Out = GetOutputStream();

@@ -49,8 +66,10 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
    // save the names of all the user counters
    for (const auto& run : reports) {
      for (const auto& cnt : run.counters) {
-        if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
+        if (cnt.first == "bytes_per_second" ||
+            cnt.first == "items_per_second") {
          continue;
+        }
        user_counter_names_.insert(cnt.first);
      }
    }
@@ -58,7 +77,9 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
    // print the header
    for (auto B = elements.begin(); B != elements.end();) {
      Out << *B++;
-      if (B != elements.end()) Out << ",";
+      if (B != elements.end()) {
+        Out << ",";
+      }
    }
    for (auto B = user_counter_names_.begin();
         B != user_counter_names_.end();) {
@@ -71,9 +92,12 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
    // check that all the current counters are saved in the name set
    for (const auto& run : reports) {
      for (const auto& cnt : run.counters) {
-        if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
+        if (cnt.first == "bytes_per_second" ||
+            cnt.first == "items_per_second") {
          continue;
-        CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end())
+        }
+        BM_CHECK(user_counter_names_.find(cnt.first) !=
+                 user_counter_names_.end())
            << "All counters must be present in each run. "
            << "Counter named \"" << cnt.first
            << "\" was not in a run after being added to the header";
@@ -87,20 +111,14 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
  }
 }

+BENCHMARK_EXPORT
 void CSVReporter::PrintRunData(const Run& run) {
  std::ostream& Out = GetOutputStream();
-
-  // Field with embedded double-quote characters must be doubled and the field
-  // delimited with double-quotes.
-  std::string name = run.benchmark_name();
-  ReplaceAll(&name, "\"", "\"\"");
-  Out << '"' << name << "\",";
-  if (run.error_occurred) {
+  Out << CsvEscape(run.benchmark_name()) << ",";
+  if (run.skipped != 0u) {
    Out << std::string(elements.size() - 3, ',');
-    Out << "true,";
-    std::string msg = run.error_message;
-    ReplaceAll(&msg, "\"", "\"\"");
-    Out << '"' << msg << "\"\n";
+    Out << std::boolalpha << (internal::SkippedWithError == run.skipped) << ",";
+    Out << CsvEscape(run.skip_message) << "\n";
    return;
  }

@@ -110,13 +128,21 @@ void CSVReporter::PrintRunData(const Run& run) {
  }
  Out << ",";

-  Out << run.GetAdjustedRealTime() << ",";
-  Out << run.GetAdjustedCPUTime() << ",";
+  if (run.run_type != Run::RT_Aggregate ||
+      run.aggregate_unit == StatisticUnit::kTime) {
+    Out << run.GetAdjustedRealTime() << ",";
+    Out << run.GetAdjustedCPUTime() << ",";
+  } else {
+    assert(run.aggregate_unit == StatisticUnit::kPercentage);
+    Out << run.real_accumulated_time << ",";
+    Out << run.cpu_accumulated_time << ",";
+  }

  // Do not print timeLabel on bigO and RMS report
  if (run.report_big_o) {
    Out << GetBigOString(run.complexity);
-  } else if (!run.report_rms) {
+  } else if (!run.report_rms &&
+             run.aggregate_unit != StatisticUnit::kPercentage) {
    Out << GetTimeUnitString(run.time_unit);
  }
  Out << ",";
@@ -130,11 +156,7 @@ void CSVReporter::PrintRunData(const Run& run) {
  }
  Out << ",";
  if (!run.report_label.empty()) {
-    // Field with embedded double-quote characters must be doubled and the field
-    // delimited with double-quotes.
-    std::string label = run.report_label;
-    ReplaceAll(&label, "\"", "\"\"");
-    Out << "\"" << label << "\"";
+    Out << CsvEscape(run.report_label);
  }
  Out << ",,";  // for error_occurred and error_message

--- a/third_party/benchmark/src/cycleclock.h
+++ b/third_party/benchmark/src/cycleclock.h
@@ -36,7 +36,8 @@
 // declarations of some other intrinsics, breaking compilation.
 // Therefore, we simply declare __rdtsc ourselves. See also
 // http://connect.microsoft.com/VisualStudio/feedback/details/262047
-#if defined(COMPILER_MSVC) && !defined(_M_IX86)
+#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64) && \
+    !defined(_M_ARM64EC)
 extern "C" uint64_t __rdtsc();
 #pragma intrinsic(__rdtsc)
 #endif
@@ -69,7 +70,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
  // frequency scaling).  Also note that when the Mac sleeps, this
  // counter pauses; it does not continue counting, nor does it
  // reset to zero.
-  return mach_absolute_time();
+  return static_cast<int64_t>(mach_absolute_time());
 #elif defined(BENCHMARK_OS_EMSCRIPTEN)
  // this goes above x86-specific code because old versions of Emscripten
  // define __x86_64__, although they have nothing to do with it.
@@ -81,16 +82,24 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
 #elif defined(__x86_64__) || defined(__amd64__)
  uint64_t low, high;
  __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
-  return (high << 32) | low;
+  return static_cast<int64_t>((high << 32) | low);
 #elif defined(__powerpc__) || defined(__ppc__)
  // This returns a time-base, which is not always precisely a cycle-count.
-  int64_t tbl, tbu0, tbu1;
-  asm("mftbu %0" : "=r"(tbu0));
-  asm("mftb  %0" : "=r"(tbl));
-  asm("mftbu %0" : "=r"(tbu1));
-  tbl &= -static_cast<int64_t>(tbu0 == tbu1);
-  // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is garbage)
-  return (tbu1 << 32) | tbl;
+#if defined(__powerpc64__) || defined(__ppc64__)
+  int64_t tb;
+  asm volatile("mfspr %0, 268" : "=r"(tb));
+  return tb;
+#else
+  uint32_t tbl, tbu0, tbu1;
+  asm volatile(
+      "mftbu %0\n"
+      "mftb %1\n"
+      "mftbu %2"
+      : "=r"(tbu0), "=r"(tbl), "=r"(tbu1));
+  tbl &= -static_cast<int32_t>(tbu0 == tbu1);
+  // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is no longer needed)
+  return (static_cast<uint64_t>(tbu1) << 32) | tbl;
+#endif
 #elif defined(__sparc__)
  int64_t tick;
  asm(".byte 0x83, 0x41, 0x00, 0x00");
@@ -106,6 +115,12 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
  // when I know it will work.  Otherwise, I'll use __rdtsc and hope
  // the code is being compiled with a non-ancient compiler.
  _asm rdtsc
+#elif defined(COMPILER_MSVC) && (defined(_M_ARM64) || defined(_M_ARM64EC))
+  // See // https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics
+  // and https://reviews.llvm.org/D53115
+  int64_t virtual_timer_value;
+  virtual_timer_value = _ReadStatusReg(ARM64_CNTVCT);
+  return virtual_timer_value;
 #elif defined(COMPILER_MSVC)
  return __rdtsc();
 #elif defined(BENCHMARK_OS_NACL)
@@ -118,7 +133,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {

  // Native Client does not provide any API to access cycle counter.
  // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
-  // because is provides nanosecond resolution (which is noticable at
+  // because is provides nanosecond resolution (which is noticeable at
  // least for PNaCl modules running on x86 Mac & Linux).
  // Initialize to always return 0 if clock_gettime fails.
  struct timespec ts = {0, 0};
@@ -153,21 +168,84 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
-#elif defined(__mips__)
+#elif defined(__mips__) || defined(__m68k__)
  // mips apparently only allows rdtsc for superusers, so we fall
  // back to gettimeofday.  It's possible clock_gettime would be better.
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__loongarch__) || defined(__csky__)
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
 #elif defined(__s390__)  // Covers both s390 and s390x.
  // Return the CPU clock.
  uint64_t tsc;
-  asm("stck %0" : "=Q"(tsc) : : "cc");
-  return tsc;
+#if defined(BENCHMARK_OS_ZOS)
+  // z/OS HLASM syntax.
+  asm(" stck %0" : "=m"(tsc) : : "cc");
 #else
-// The soft failover to a generic implementation is automatic only for ARM.
-// For other platforms the developer is expected to make an attempt to create
-// a fast implementation and use generic version if nothing better is available.
+  // Linux on Z syntax.
+  asm("stck %0" : "=Q"(tsc) : : "cc");
+#endif
+  return tsc;
+#elif defined(__riscv)  // RISC-V
+  // Use RDTIME (and RDTIMEH on riscv32).
+  // RDCYCLE is a privileged instruction since Linux 6.6.
+#if __riscv_xlen == 32
+  uint32_t cycles_lo, cycles_hi0, cycles_hi1;
+  // This asm also includes the PowerPC overflow handling strategy, as above.
+  // Implemented in assembly because Clang insisted on branching.
+  asm volatile(
+      "rdtimeh %0\n"
+      "rdtime %1\n"
+      "rdtimeh %2\n"
+      "sub %0, %0, %2\n"
+      "seqz %0, %0\n"
+      "sub %0, zero, %0\n"
+      "and %1, %1, %0\n"
+      : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1));
+  return static_cast<int64_t>((static_cast<uint64_t>(cycles_hi1) << 32) |
+                              cycles_lo);
+#else
+  uint64_t cycles;
+  asm volatile("rdtime %0" : "=r"(cycles));
+  return static_cast<int64_t>(cycles);
+#endif
+#elif defined(__e2k__) || defined(__elbrus__)
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__hexagon__)
+  uint64_t pcycle;
+  asm volatile("%0 = C15:14" : "=r"(pcycle));
+  return static_cast<int64_t>(pcycle);
+#elif defined(__alpha__)
+  // Alpha has a cycle counter, the PCC register, but it is an unsigned 32-bit
+  // integer and thus wraps every ~4s, making using it for tick counts
+  // unreliable beyond this time range.  The real-time clock is low-precision,
+  // roughtly ~1ms, but it is the only option that can reasonable count
+  // indefinitely.
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__hppa__) || defined(__linux__)
+  // Fallback for all other architectures with a recent Linux kernel, e.g.:
+  // HP PA-RISC provides a user-readable clock counter (cr16), but
+  // it's not syncronized across CPUs and only 32-bit wide when programs
+  // are built as 32-bit binaries.
+  // Same for SH-4 and possibly others.
+  // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
+  // because is provides nanosecond resolution.
+  // Initialize to always return 0 if clock_gettime fails.
+  struct timespec ts = {0, 0};
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return static_cast<int64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+#else
+  // The soft failover to a generic implementation is automatic only for ARM.
+  // For other platforms the developer is expected to make an attempt to create
+  // a fast implementation and use generic version if nothing better is
+  // available.
 #error You need to define CycleTimer for your OS and CPU
 #endif
 }
--- a/third_party/benchmark/src/internal_macros.h
+++ b/third_party/benchmark/src/internal_macros.h
@@ -1,8 +1,6 @@
 #ifndef BENCHMARK_INTERNAL_MACROS_H_
 #define BENCHMARK_INTERNAL_MACROS_H_

-#include "benchmark/benchmark.h"
-
 /* Needed to detect STL */
 #include <cstdlib>

@@ -40,6 +38,19 @@
  #define BENCHMARK_OS_CYGWIN 1
 #elif defined(_WIN32)
  #define BENCHMARK_OS_WINDOWS 1
+  // WINAPI_FAMILY_PARTITION is defined in winapifamily.h.
+  // We include windows.h which implicitly includes winapifamily.h for compatibility.
+  #ifndef NOMINMAX
+    #define NOMINMAX
+  #endif
+  #include <windows.h>
+  #if defined(WINAPI_FAMILY_PARTITION)
+    #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+      #define BENCHMARK_OS_WINDOWS_WIN32 1
+    #elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+      #define BENCHMARK_OS_WINDOWS_RT 1
+    #endif
+  #endif
  #if defined(__MINGW32__)
    #define BENCHMARK_OS_MINGW 1
  #endif
@@ -58,6 +69,8 @@
  #define BENCHMARK_OS_NETBSD 1
 #elif defined(__OpenBSD__)
  #define BENCHMARK_OS_OPENBSD 1
+#elif defined(__DragonFly__)
+  #define BENCHMARK_OS_DRAGONFLY 1
 #elif defined(__linux__)
  #define BENCHMARK_OS_LINUX 1
 #elif defined(__native_client__)
@@ -70,6 +83,12 @@
 #define BENCHMARK_OS_FUCHSIA 1
 #elif defined (__SVR4) && defined (__sun)
 #define BENCHMARK_OS_SOLARIS 1
+#elif defined(__QNX__)
+#define BENCHMARK_OS_QNX 1
+#elif defined(__MVS__)
+#define BENCHMARK_OS_ZOS 1
+#elif defined(__hexagon__)
+#define BENCHMARK_OS_QURT 1
 #endif

 #if defined(__ANDROID__) && defined(__GLIBCXX__)
--- a/third_party/benchmark/src/json_reporter.cc
+++ b/third_party/benchmark/src/json_reporter.cc
@@ -12,10 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "benchmark/benchmark.h"
-#include "complexity.h"
-
 #include <algorithm>
+#include <cmath>
 #include <cstdint>
 #include <iomanip>  // for setprecision
 #include <iostream>
@@ -24,43 +22,92 @@
 #include <tuple>
 #include <vector>

+#include "benchmark/benchmark.h"
+#include "complexity.h"
 #include "string_util.h"
 #include "timers.h"

 namespace benchmark {
-
 namespace {

+std::string StrEscape(const std::string& s) {
+  std::string tmp;
+  tmp.reserve(s.size());
+  for (char c : s) {
+    switch (c) {
+      case '\b':
+        tmp += "\\b";
+        break;
+      case '\f':
+        tmp += "\\f";
+        break;
+      case '\n':
+        tmp += "\\n";
+        break;
+      case '\r':
+        tmp += "\\r";
+        break;
+      case '\t':
+        tmp += "\\t";
+        break;
+      case '\\':
+        tmp += "\\\\";
+        break;
+      case '"':
+        tmp += "\\\"";
+        break;
+      default:
+        tmp += c;
+        break;
+    }
+  }
+  return tmp;
+}
+
 std::string FormatKV(std::string const& key, std::string const& value) {
-  return StrFormat("\"%s\": \"%s\"", key.c_str(), value.c_str());
+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
+                   StrEscape(value).c_str());
 }

 std::string FormatKV(std::string const& key, const char* value) {
-  return StrFormat("\"%s\": \"%s\"", key.c_str(), value);
+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
+                   StrEscape(value).c_str());
 }

 std::string FormatKV(std::string const& key, bool value) {
-  return StrFormat("\"%s\": %s", key.c_str(), value ? "true" : "false");
+  return StrFormat("\"%s\": %s", StrEscape(key).c_str(),
+                   value ? "true" : "false");
 }

 std::string FormatKV(std::string const& key, int64_t value) {
  std::stringstream ss;
-  ss << '"' << key << "\": " << value;
+  ss << '"' << StrEscape(key) << "\": " << value;
  return ss.str();
 }

+std::string FormatKV(std::string const& key, int value) {
+  return FormatKV(key, static_cast<int64_t>(value));
+}
+
 std::string FormatKV(std::string const& key, double value) {
  std::stringstream ss;
-  ss << '"' << key << "\": ";
+  ss << '"' << StrEscape(key) << "\": ";

-  const auto max_digits10 = std::numeric_limits<decltype(value)>::max_digits10;
-  const auto max_fractional_digits10 = max_digits10 - 1;
-
-  ss << std::scientific << std::setprecision(max_fractional_digits10) << value;
+  if (std::isnan(value)) {
+    ss << (value < 0 ? "-" : "") << "NaN";
+  } else if (std::isinf(value)) {
+    ss << (value < 0 ? "-" : "") << "Infinity";
+  } else {
+    const auto max_digits10 =
+        std::numeric_limits<decltype(value)>::max_digits10;
+    const auto max_fractional_digits10 = max_digits10 - 1;
+    ss << std::scientific << std::setprecision(max_fractional_digits10)
+       << value;
+  }
  return ss.str();
 }

-int64_t RoundDouble(double v) { return static_cast<int64_t>(v + 0.5); }
+int64_t RoundDouble(double v) { return std::lround(v); }

 }  // end namespace

@@ -77,13 +124,10 @@ bool JSONReporter::ReportContext(const Context& context) {
  std::string walltime_value = LocalDateTimeString();
  out << indent << FormatKV("date", walltime_value) << ",\n";

-  if (Context::executable_name) {
-    // windows uses backslash for its path separator,
-    // which must be escaped in JSON otherwise it blows up conforming JSON
-    // decoders
-    std::string executable_name = Context::executable_name;
-    ReplaceAll(&executable_name, "\\", "\\\\");
-    out << indent << FormatKV("executable", executable_name) << ",\n";
+  out << indent << FormatKV("host_name", context.sys_info.name) << ",\n";
+
+  if (Context::executable_name != nullptr) {
+    out << indent << FormatKV("executable", Context::executable_name) << ",\n";
  }

  CPUInfo const& info = context.cpu_info;
@@ -93,25 +137,39 @@ bool JSONReporter::ReportContext(const Context& context) {
      << FormatKV("mhz_per_cpu",
                  RoundDouble(info.cycles_per_second / 1000000.0))
      << ",\n";
-  out << indent << FormatKV("cpu_scaling_enabled", info.scaling_enabled)
-      << ",\n";
+  if (CPUInfo::Scaling::UNKNOWN != info.scaling) {
+    out << indent
+        << FormatKV("cpu_scaling_enabled",
+                    info.scaling == CPUInfo::Scaling::ENABLED)
+        << ",\n";
+  }
+
+  const SystemInfo& sysinfo = context.sys_info;
+  if (SystemInfo::ASLR::UNKNOWN != sysinfo.ASLRStatus) {
+    out << indent
+        << FormatKV("aslr_enabled",
+                    sysinfo.ASLRStatus == SystemInfo::ASLR::ENABLED)
+        << ",\n";
+  }

  out << indent << "\"caches\": [\n";
  indent = std::string(6, ' ');
  std::string cache_indent(8, ' ');
  for (size_t i = 0; i < info.caches.size(); ++i) {
-    auto& CI = info.caches[i];
+    const auto& CI = info.caches[i];
    out << indent << "{\n";
    out << cache_indent << FormatKV("type", CI.type) << ",\n";
    out << cache_indent << FormatKV("level", static_cast<int64_t>(CI.level))
        << ",\n";
-    out << cache_indent
-        << FormatKV("size", static_cast<int64_t>(CI.size) * 1000u) << ",\n";
+    out << cache_indent << FormatKV("size", static_cast<int64_t>(CI.size))
+        << ",\n";
    out << cache_indent
        << FormatKV("num_sharing", static_cast<int64_t>(CI.num_sharing))
        << "\n";
    out << indent << "}";
-    if (i != info.caches.size() - 1) out << ",";
+    if (i != info.caches.size() - 1) {
+      out << ",";
+    }
    out << "\n";
  }
  indent = std::string(4, ' ');
@@ -119,16 +177,37 @@ bool JSONReporter::ReportContext(const Context& context) {
  out << indent << "\"load_avg\": [";
  for (auto it = info.load_avg.begin(); it != info.load_avg.end();) {
    out << *it++;
-    if (it != info.load_avg.end()) out << ",";
+    if (it != info.load_avg.end()) {
+      out << ",";
+    }
  }
  out << "],\n";

+  out << indent << FormatKV("library_version", GetBenchmarkVersion());
+  out << ",\n";
+
 #if defined(NDEBUG)
  const char build_type[] = "release";
 #else
  const char build_type[] = "debug";
 #endif
-  out << indent << FormatKV("library_build_type", build_type) << "\n";
+  out << indent << FormatKV("library_build_type", build_type);
+  out << ",\n";
+
+  // NOTE: our json schema is not strictly tied to the library version!
+  out << indent << FormatKV("json_schema_version", 1);
+
+  std::map<std::string, std::string>* global_context =
+      internal::GetGlobalContext();
+
+  if (global_context != nullptr) {
+    for (const auto& kv : *global_context) {
+      out << ",\n";
+      out << indent << FormatKV(kv.first, kv.second);
+    }
+  }
+  out << "\n";
+
  // Close context block and open the list of benchmarks.
  out << inner_indent << "},\n";
  out << inner_indent << "\"benchmarks\": [\n";
@@ -166,7 +245,11 @@ void JSONReporter::PrintRunData(Run const& run) {
  std::string indent(6, ' ');
  std::ostream& out = GetOutputStream();
  out << indent << FormatKV("name", run.benchmark_name()) << ",\n";
-  out << indent << FormatKV("run_name", run.run_name) << ",\n";
+  out << indent << FormatKV("family_index", run.family_index) << ",\n";
+  out << indent
+      << FormatKV("per_family_instance_index", run.per_family_instance_index)
+      << ",\n";
+  out << indent << FormatKV("run_name", run.run_name.str()) << ",\n";
  out << indent << FormatKV("run_type", [&run]() -> const char* {
    switch (run.run_type) {
      case BenchmarkReporter::Run::RT_Iteration:
@@ -176,17 +259,44 @@ void JSONReporter::PrintRunData(Run const& run) {
    }
    BENCHMARK_UNREACHABLE();
  }()) << ",\n";
+  out << indent << FormatKV("repetitions", run.repetitions) << ",\n";
+  if (run.run_type != BenchmarkReporter::Run::RT_Aggregate) {
+    out << indent << FormatKV("repetition_index", run.repetition_index)
+        << ",\n";
+  }
+  out << indent << FormatKV("threads", run.threads) << ",\n";
  if (run.run_type == BenchmarkReporter::Run::RT_Aggregate) {
    out << indent << FormatKV("aggregate_name", run.aggregate_name) << ",\n";
+    out << indent << FormatKV("aggregate_unit", [&run]() -> const char* {
+      switch (run.aggregate_unit) {
+        case StatisticUnit::kTime:
+          return "time";
+        case StatisticUnit::kPercentage:
+          return "percentage";
+      }
+      BENCHMARK_UNREACHABLE();
+    }()) << ",\n";
  }
-  if (run.error_occurred) {
-    out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n";
-    out << indent << FormatKV("error_message", run.error_message) << ",\n";
+  if (internal::SkippedWithError == run.skipped) {
+    out << indent << FormatKV("error_occurred", true) << ",\n";
+    out << indent << FormatKV("error_message", run.skip_message) << ",\n";
+  } else if (internal::SkippedWithMessage == run.skipped) {
+    out << indent << FormatKV("skipped", true) << ",\n";
+    out << indent << FormatKV("skip_message", run.skip_message) << ",\n";
  }
  if (!run.report_big_o && !run.report_rms) {
    out << indent << FormatKV("iterations", run.iterations) << ",\n";
-    out << indent << FormatKV("real_time", run.GetAdjustedRealTime()) << ",\n";
-    out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
+    if (run.run_type != Run::RT_Aggregate ||
+        run.aggregate_unit == StatisticUnit::kTime) {
+      out << indent << FormatKV("real_time", run.GetAdjustedRealTime())
+          << ",\n";
+      out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
+    } else {
+      assert(run.aggregate_unit == StatisticUnit::kPercentage);
+      out << indent << FormatKV("real_time", run.real_accumulated_time)
+          << ",\n";
+      out << indent << FormatKV("cpu_time", run.cpu_accumulated_time);
+    }
    out << ",\n"
        << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
  } else if (run.report_big_o) {
@@ -200,13 +310,26 @@ void JSONReporter::PrintRunData(Run const& run) {
    out << indent << FormatKV("rms", run.GetAdjustedCPUTime());
  }

-  for (auto& c : run.counters) {
+  for (const auto& c : run.counters) {
    out << ",\n" << indent << FormatKV(c.first, c.second);
  }

-  if (run.has_memory_result) {
+  if (run.memory_result.memory_iterations > 0) {
+    const auto& memory_result = run.memory_result;
    out << ",\n" << indent << FormatKV("allocs_per_iter", run.allocs_per_iter);
-    out << ",\n" << indent << FormatKV("max_bytes_used", run.max_bytes_used);
+    out << ",\n"
+        << indent << FormatKV("max_bytes_used", memory_result.max_bytes_used);
+
+    auto report_if_present = [&out, &indent](const std::string& label,
+                                             int64_t val) {
+      if (val != MemoryManager::TombstoneValue) {
+        out << ",\n" << indent << FormatKV(label, val);
+      }
+    };
+
+    report_if_present("total_allocated_bytes",
+                      memory_result.total_allocated_bytes);
+    report_if_present("net_heap_growth", memory_result.net_heap_growth);
  }

  if (!run.report_label.empty()) {
--- a/third_party/benchmark/src/log.h
+++ b/third_party/benchmark/src/log.h
@@ -4,8 +4,6 @@
 #include <iostream>
 #include <ostream>

-#include "benchmark/benchmark.h"
-
 namespace benchmark {
 namespace internal {

@@ -23,7 +21,11 @@ class LogType {
 private:
  LogType(std::ostream* out) : out_(out) {}
  std::ostream* out_;
-  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(LogType);
+
+  // NOTE: we could use BENCHMARK_DISALLOW_COPY_AND_ASSIGN but we shouldn't have
+  // a dependency on benchmark.h from here.
+  LogType(const LogType&) = delete;
+  LogType& operator=(const LogType&) = delete;
 };

 template <class Tp>
@@ -47,13 +49,13 @@ inline int& LogLevel() {
 }

 inline LogType& GetNullLogInstance() {
-  static LogType log(nullptr);
-  return log;
+  static LogType null_log(static_cast<std::ostream*>(nullptr));
+  return null_log;
 }

 inline LogType& GetErrorLogInstance() {
-  static LogType log(&std::clog);
-  return log;
+  static LogType error_log(&std::clog);
+  return error_log;
 }

 inline LogType& GetLogInstanceForLevel(int level) {
@@ -67,7 +69,7 @@ inline LogType& GetLogInstanceForLevel(int level) {
 }  // end namespace benchmark

 // clang-format off
-#define VLOG(x)                                                               \
+#define BM_VLOG(x)                                                               \
  (::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "):" \
                                                                         " ")
 // clang-format on
--- a/third_party/benchmark/src/mutex.h
+++ b/third_party/benchmark/src/mutex.h
@@ -9,60 +9,60 @@
 // Enable thread safety attributes only with clang.
 // The attributes can be safely erased when compiling with other compilers.
 #if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
-#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
+#define THREAD_ANNOTATION_ATTRIBUTE_(x) __attribute__((x))
 #else
-#define THREAD_ANNOTATION_ATTRIBUTE__(x)  // no-op
+#define THREAD_ANNOTATION_ATTRIBUTE_(x)  // no-op
 #endif

-#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
+#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(capability(x))

-#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE_(scoped_lockable)

-#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
+#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(guarded_by(x))

-#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
+#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(pt_guarded_by(x))

 #define ACQUIRED_BEFORE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquired_before(__VA_ARGS__))

 #define ACQUIRED_AFTER(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquired_after(__VA_ARGS__))

 #define REQUIRES(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(requires_capability(__VA_ARGS__))

 #define REQUIRES_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(requires_shared_capability(__VA_ARGS__))

 #define ACQUIRE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquire_capability(__VA_ARGS__))

 #define ACQUIRE_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquire_shared_capability(__VA_ARGS__))

 #define RELEASE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(release_capability(__VA_ARGS__))

 #define RELEASE_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(release_shared_capability(__VA_ARGS__))

 #define TRY_ACQUIRE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_capability(__VA_ARGS__))

 #define TRY_ACQUIRE_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_shared_capability(__VA_ARGS__))

-#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
+#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE_(locks_excluded(__VA_ARGS__))

-#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
+#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(assert_capability(x))

 #define ASSERT_SHARED_CAPABILITY(x) \
-  THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x))
+  THREAD_ANNOTATION_ATTRIBUTE_(assert_shared_capability(x))

-#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
+#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(lock_returned(x))

 #define NO_THREAD_SAFETY_ANALYSIS \
-  THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
+  THREAD_ANNOTATION_ATTRIBUTE_(no_thread_safety_analysis)

 namespace benchmark {

@@ -71,7 +71,7 @@ typedef std::condition_variable Condition;
 // NOTE: Wrappers for std::mutex and std::unique_lock are provided so that
 // we can annotate them with thread safety attributes and use the
 // -Wthread-safety warning with clang. The standard library types cannot be
-// used directly because they do not provided the required annotations.
+// used directly because they do not provide the required annotations.
 class CAPABILITY("mutex") Mutex {
 public:
  Mutex() {}
@@ -130,7 +130,7 @@ class Barrier {
  // entered the barrier.  Returns iff this is the last thread to
  // enter the barrier.
  bool createBarrier(MutexLock& ml) REQUIRES(lock_) {
-    CHECK_LT(entered_, running_threads_);
+    BM_CHECK_LT(entered_, running_threads_);
    entered_++;
    if (entered_ < running_threads_) {
      // Wait for all threads to enter
--- a/third_party/benchmark/src/perf_counters.cc
+++ b/third_party/benchmark/src/perf_counters.cc
@@ -0,0 +1,282 @@
+// Copyright 2021 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "perf_counters.h"
+
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#if defined HAVE_LIBPFM
+#include "perfmon/pfmlib.h"
+#include "perfmon/pfmlib_perf_event.h"
+#endif
+
+namespace benchmark {
+namespace internal {
+
+#if defined HAVE_LIBPFM
+
+size_t PerfCounterValues::Read(const std::vector<int>& leaders) {
+  // Create a pointer for multiple reads
+  const size_t bufsize = values_.size() * sizeof(values_[0]);
+  char* ptr = reinterpret_cast<char*>(values_.data());
+  size_t size = bufsize;
+  for (int lead : leaders) {
+    auto read_bytes = ::read(lead, ptr, size);
+    if (read_bytes >= ssize_t(sizeof(uint64_t))) {
+      // Actual data bytes are all bytes minus initial padding
+      std::size_t data_bytes =
+          static_cast<std::size_t>(read_bytes) - sizeof(uint64_t);
+      // This should be very cheap since it's in hot cache
+      std::memmove(ptr, ptr + sizeof(uint64_t), data_bytes);
+      // Increment our counters
+      ptr += data_bytes;
+      size -= data_bytes;
+    } else {
+      int err = errno;
+      GetErrorLogInstance() << "Error reading lead " << lead << " errno:" << err
+                            << " " << ::strerror(err) << "\n";
+      return 0;
+    }
+  }
+  return (bufsize - size) / sizeof(uint64_t);
+}
+
+const bool PerfCounters::kSupported = true;
+
+// Initializes libpfm only on the first call.  Returns whether that single
+// initialization was successful.
+bool PerfCounters::Initialize() {
+  // Function-scope static gets initialized only once on first call.
+  static const bool success = []() {
+    return pfm_initialize() == PFM_SUCCESS;
+  }();
+  return success;
+}
+
+bool PerfCounters::IsCounterSupported(const std::string& name) {
+  Initialize();
+  perf_event_attr_t attr;
+  std::memset(&attr, 0, sizeof(attr));
+  pfm_perf_encode_arg_t arg;
+  std::memset(&arg, 0, sizeof(arg));
+  arg.attr = &attr;
+  const int mode = PFM_PLM3;  // user mode only
+  int ret = pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT_EXT,
+                                      &arg);
+  return (ret == PFM_SUCCESS);
+}
+
+PerfCounters PerfCounters::Create(
+    const std::vector<std::string>& counter_names) {
+  if (!counter_names.empty()) {
+    Initialize();
+  }
+
+  // Valid counters will populate these arrays but we start empty
+  std::vector<std::string> valid_names;
+  std::vector<int> counter_ids;
+  std::vector<int> leader_ids;
+
+  // Resize to the maximum possible
+  valid_names.reserve(counter_names.size());
+  counter_ids.reserve(counter_names.size());
+
+  const int kCounterMode = PFM_PLM3;  // user mode only
+
+  // Group leads will be assigned on demand. The idea is that once we cannot
+  // create a counter descriptor, the reason is that this group has maxed out
+  // so we set the group_id again to -1 and retry - giving the algorithm a
+  // chance to create a new group leader to hold the next set of counters.
+  int group_id = -1;
+
+  // Loop through all performance counters
+  for (size_t i = 0; i < counter_names.size(); ++i) {
+    // we are about to push into the valid names vector
+    // check if we did not reach the maximum
+    if (valid_names.size() == PerfCounterValues::kMaxCounters) {
+      // Log a message if we maxed out and stop adding
+      GetErrorLogInstance()
+          << counter_names.size() << " counters were requested. The maximum is "
+          << PerfCounterValues::kMaxCounters << " and " << valid_names.size()
+          << " were already added. All remaining counters will be ignored\n";
+      // stop the loop and return what we have already
+      break;
+    }
+
+    // Check if this name is empty
+    const auto& name = counter_names[i];
+    if (name.empty()) {
+      GetErrorLogInstance()
+          << "A performance counter name was the empty string\n";
+      continue;
+    }
+
+    // Here first means first in group, ie the group leader
+    const bool is_first = (group_id < 0);
+
+    // This struct will be populated by libpfm from the counter string
+    // and then fed into the syscall perf_event_open
+    struct perf_event_attr attr {};
+    attr.size = sizeof(attr);
+
+    // This is the input struct to libpfm.
+    pfm_perf_encode_arg_t arg{};
+    arg.attr = &attr;
+    const int pfm_get = pfm_get_os_event_encoding(name.c_str(), kCounterMode,
+                                                  PFM_OS_PERF_EVENT, &arg);
+    if (pfm_get != PFM_SUCCESS) {
+      GetErrorLogInstance()
+          << "Unknown performance counter name: " << name << "\n";
+      continue;
+    }
+
+    // We then proceed to populate the remaining fields in our attribute struct
+    // Note: the man page for perf_event_create suggests inherit = true and
+    // read_format = PERF_FORMAT_GROUP don't work together, but that's not the
+    // case.
+    attr.disabled = is_first;
+    attr.inherit = true;
+    attr.pinned = is_first;
+    attr.exclude_kernel = true;
+    attr.exclude_user = false;
+    attr.exclude_hv = true;
+
+    // Read all counters in a group in one read.
+    attr.read_format = PERF_FORMAT_GROUP;  //| PERF_FORMAT_TOTAL_TIME_ENABLED |
+                                           // PERF_FORMAT_TOTAL_TIME_RUNNING;
+
+    int id = -1;
+    while (id < 0) {
+      static constexpr size_t kNrOfSyscallRetries = 5;
+      // Retry syscall as it was interrupted often (b/64774091).
+      for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
+           ++num_retries) {
+        id = perf_event_open(&attr, 0, -1, group_id, 0);
+        if (id >= 0 || errno != EINTR) {
+          break;
+        }
+      }
+      if (id < 0) {
+        // If the file descriptor is negative we might have reached a limit
+        // in the current group. Set the group_id to -1 and retry
+        if (group_id >= 0) {
+          // Create a new group
+          group_id = -1;
+        } else {
+          // At this point we have already retried to set a new group id and
+          // failed. We then give up.
+          break;
+        }
+      }
+    }
+
+    // We failed to get a new file descriptor. We might have reached a hard
+    // hardware limit that cannot be resolved even with group multiplexing
+    if (id < 0) {
+      GetErrorLogInstance() << "***WARNING** Failed to get a file descriptor "
+                               "for performance counter "
+                            << name << ". Ignoring\n";
+
+      // We give up on this counter but try to keep going
+      // as the others would be fine
+      continue;
+    }
+    if (group_id < 0) {
+      // This is a leader, store and assign it to the current file descriptor
+      leader_ids.push_back(id);
+      group_id = id;
+    }
+    // This is a valid counter, add it to our descriptor's list
+    counter_ids.push_back(id);
+    valid_names.push_back(name);
+  }
+
+  // Loop through all group leaders activating them
+  // There is another option of starting ALL counters in a process but
+  // that would be far reaching an intrusion. If the user is using PMCs
+  // by themselves then this would have a side effect on them. It is
+  // friendlier to loop through all groups individually.
+  for (int lead : leader_ids) {
+    if (ioctl(lead, PERF_EVENT_IOC_ENABLE) != 0) {
+      // This should never happen but if it does, we give up on the
+      // entire batch as recovery would be a mess.
+      GetErrorLogInstance() << "***WARNING*** Failed to start counters. "
+                               "Claring out all counters.\n";
+
+      // Close all performance counters
+      for (int id : counter_ids) {
+        ::close(id);
+      }
+
+      // Return an empty object so our internal state is still good and
+      // the process can continue normally without impact
+      return NoCounters();
+    }
+  }
+
+  return PerfCounters(std::move(valid_names), std::move(counter_ids),
+                      std::move(leader_ids));
+}
+
+void PerfCounters::CloseCounters() const {
+  if (counter_ids_.empty()) {
+    return;
+  }
+  for (int lead : leader_ids_) {
+    ioctl(lead, PERF_EVENT_IOC_DISABLE);
+  }
+  for (int fd : counter_ids_) {
+    close(fd);
+  }
+}
+#else   // defined HAVE_LIBPFM
+size_t PerfCounterValues::Read(const std::vector<int>&) { return 0; }
+
+const bool PerfCounters::kSupported = false;
+
+bool PerfCounters::Initialize() { return false; }
+
+bool PerfCounters::IsCounterSupported(const std::string&) { return false; }
+
+PerfCounters PerfCounters::Create(
+    const std::vector<std::string>& counter_names) {
+  if (!counter_names.empty()) {
+    GetErrorLogInstance() << "Performance counters not supported.\n";
+  }
+  return NoCounters();
+}
+
+void PerfCounters::CloseCounters() const {}
+#endif  // defined HAVE_LIBPFM
+
+PerfCountersMeasurement::PerfCountersMeasurement(
+    const std::vector<std::string>& counter_names)
+    : start_values_(counter_names.size()), end_values_(counter_names.size()) {
+  counters_ = PerfCounters::Create(counter_names);
+}
+
+PerfCounters& PerfCounters::operator=(PerfCounters&& other) noexcept {
+  if (this != &other) {
+    CloseCounters();
+
+    counter_ids_ = std::move(other.counter_ids_);
+    leader_ids_ = std::move(other.leader_ids_);
+    counter_names_ = std::move(other.counter_names_);
+  }
+  return *this;
+}
+}  // namespace internal
+}  // namespace benchmark
--- a/third_party/benchmark/src/perf_counters.h
+++ b/third_party/benchmark/src/perf_counters.h
@@ -0,0 +1,200 @@
+// Copyright 2021 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BENCHMARK_PERF_COUNTERS_H
+#define BENCHMARK_PERF_COUNTERS_H
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+#include "check.h"
+#include "log.h"
+#include "mutex.h"
+
+#ifndef BENCHMARK_OS_WINDOWS
+#include <unistd.h>
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// C4251: <symbol> needs to have dll-interface to be used by clients of class
+#pragma warning(disable : 4251)
+#endif
+
+namespace benchmark {
+namespace internal {
+
+// Typically, we can only read a small number of counters. There is also a
+// padding preceding counter values, when reading multiple counters with one
+// syscall (which is desirable). PerfCounterValues abstracts these details.
+// The implementation ensures the storage is inlined, and allows 0-based
+// indexing into the counter values.
+// The object is used in conjunction with a PerfCounters object, by passing it
+// to Snapshot(). The Read() method relocates individual reads, discarding
+// the initial padding from each group leader in the values buffer such that
+// all user accesses through the [] operator are correct.
+class BENCHMARK_EXPORT PerfCounterValues {
+ public:
+  explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) {
+    BM_CHECK_LE(nr_counters_, kMaxCounters);
+  }
+
+  // We are reading correctly now so the values don't need to skip padding
+  uint64_t operator[](size_t pos) const { return values_[pos]; }
+
+  // Increased the maximum to 32 only since the buffer
+  // is std::array<> backed
+  static constexpr size_t kMaxCounters = 32;
+
+ private:
+  friend class PerfCounters;
+  // Get the byte buffer in which perf counters can be captured.
+  // This is used by PerfCounters::Read
+  std::pair<char*, size_t> get_data_buffer() {
+    return {reinterpret_cast<char*>(values_.data()),
+            sizeof(uint64_t) * (kPadding + nr_counters_)};
+  }
+
+  // This reading is complex and as the goal of this class is to
+  // abstract away the intrincacies of the reading process, this is
+  // a better place for it
+  size_t Read(const std::vector<int>& leaders);
+
+  // Move the padding to 2 due to the reading algorithm (1st padding plus a
+  // current read padding)
+  static constexpr size_t kPadding = 2;
+  std::array<uint64_t, kPadding + kMaxCounters> values_;
+  const size_t nr_counters_;
+};
+
+// Collect PMU counters. The object, once constructed, is ready to be used by
+// calling read(). PMU counter collection is enabled from the time create() is
+// called, to obtain the object, until the object's destructor is called.
+class BENCHMARK_EXPORT PerfCounters final {
+ public:
+  // True iff this platform supports performance counters.
+  static const bool kSupported;
+
+  // Returns an empty object
+  static PerfCounters NoCounters() { return PerfCounters(); }
+
+  ~PerfCounters() { CloseCounters(); }
+  PerfCounters() = default;
+  PerfCounters(PerfCounters&&) = default;
+  PerfCounters(const PerfCounters&) = delete;
+  PerfCounters& operator=(PerfCounters&&) noexcept;
+  PerfCounters& operator=(const PerfCounters&) = delete;
+
+  // Platform-specific implementations may choose to do some library
+  // initialization here.
+  static bool Initialize();
+
+  // Check if the given counter is supported, if the app wants to
+  // check before passing
+  static bool IsCounterSupported(const std::string& name);
+
+  // Return a PerfCounters object ready to read the counters with the names
+  // specified. The values are user-mode only. The counter name format is
+  // implementation and OS specific.
+  // In case of failure, this method will in the worst case return an
+  // empty object whose state will still be valid.
+  static PerfCounters Create(const std::vector<std::string>& counter_names);
+
+  // Take a snapshot of the current value of the counters into the provided
+  // valid PerfCounterValues storage. The values are populated such that:
+  // names()[i]'s value is (*values)[i]
+  BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) const {
+#ifndef BENCHMARK_OS_WINDOWS
+    assert(values != nullptr);
+    return values->Read(leader_ids_) == counter_ids_.size();
+#else
+    (void)values;
+    return false;
+#endif
+  }
+
+  const std::vector<std::string>& names() const { return counter_names_; }
+  size_t num_counters() const { return counter_names_.size(); }
+
+ private:
+  PerfCounters(const std::vector<std::string>& counter_names,
+               std::vector<int>&& counter_ids, std::vector<int>&& leader_ids)
+      : counter_ids_(std::move(counter_ids)),
+        leader_ids_(std::move(leader_ids)),
+        counter_names_(counter_names) {}
+
+  void CloseCounters() const;
+
+  std::vector<int> counter_ids_;
+  std::vector<int> leader_ids_;
+  std::vector<std::string> counter_names_;
+};
+
+// Typical usage of the above primitives.
+class BENCHMARK_EXPORT PerfCountersMeasurement final {
+ public:
+  PerfCountersMeasurement(const std::vector<std::string>& counter_names);
+
+  size_t num_counters() const { return counters_.num_counters(); }
+
+  std::vector<std::string> names() const { return counters_.names(); }
+
+  BENCHMARK_ALWAYS_INLINE bool Start() {
+    if (num_counters() == 0) return true;
+    // Tell the compiler to not move instructions above/below where we take
+    // the snapshot.
+    ClobberMemory();
+    valid_read_ &= counters_.Snapshot(&start_values_);
+    ClobberMemory();
+
+    return valid_read_;
+  }
+
+  BENCHMARK_ALWAYS_INLINE bool Stop(
+      std::vector<std::pair<std::string, double>>& measurements) {
+    if (num_counters() == 0) return true;
+    // Tell the compiler to not move instructions above/below where we take
+    // the snapshot.
+    ClobberMemory();
+    valid_read_ &= counters_.Snapshot(&end_values_);
+    ClobberMemory();
+
+    for (size_t i = 0; i < counters_.names().size(); ++i) {
+      double measurement = static_cast<double>(end_values_[i]) -
+                           static_cast<double>(start_values_[i]);
+      measurements.push_back({counters_.names()[i], measurement});
+    }
+
+    return valid_read_;
+  }
+
+ private:
+  PerfCounters counters_;
+  bool valid_read_ = true;
+  PerfCounterValues start_values_;
+  PerfCounterValues end_values_;
+};
+
+}  // namespace internal
+}  // namespace benchmark
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+#endif  // BENCHMARK_PERF_COUNTERS_H
--- a/third_party/benchmark/src/re.h
+++ b/third_party/benchmark/src/re.h
@@ -33,7 +33,7 @@
 // Prefer C regex libraries when compiling w/o exceptions so that we can
 // correctly report errors.
 #if defined(BENCHMARK_HAS_NO_EXCEPTIONS) && \
-    defined(BENCHMARK_HAVE_STD_REGEX) && \
+    defined(HAVE_STD_REGEX) && \
    (defined(HAVE_GNU_POSIX_REGEX) || defined(HAVE_POSIX_REGEX))
  #undef HAVE_STD_REGEX
 #endif
@@ -121,15 +121,13 @@ inline bool Regex::Init(const std::string& spec, std::string* error) {
  if (ec != 0) {
    if (error) {
      size_t needed = regerror(ec, &re_, nullptr, 0);
-      char* errbuf = new char[needed];
-      regerror(ec, &re_, errbuf, needed);
+      std::vector<char> errbuf(needed);
+      regerror(ec, &re_, errbuf.data(), needed);

      // regerror returns the number of bytes necessary to null terminate
      // the string, so we move that when assigning to error.
-      CHECK_NE(needed, 0);
-      error->assign(errbuf, needed - 1);
-
-      delete[] errbuf;
+      BM_CHECK_NE(needed, 0);
+      error->assign(errbuf.data(), needed - 1);
    }

    return false;
--- a/third_party/benchmark/src/reporter.cc
+++ b/third_party/benchmark/src/reporter.cc
@@ -12,17 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "benchmark/benchmark.h"
-#include "timers.h"
-
 #include <cstdlib>
-
 #include <iostream>
+#include <map>
+#include <string>
 #include <tuple>
 #include <vector>

+#include "benchmark/benchmark.h"
 #include "check.h"
 #include "string_util.h"
+#include "timers.h"

 namespace benchmark {

@@ -33,25 +33,32 @@ BenchmarkReporter::~BenchmarkReporter() {}

 void BenchmarkReporter::PrintBasicContext(std::ostream *out,
                                          Context const &context) {
-  CHECK(out) << "cannot be null";
+  BM_CHECK(out) << "cannot be null";
  auto &Out = *out;

+#ifndef BENCHMARK_OS_QURT
+  // Date/time information is not available on QuRT.
+  // Attempting to get it via this call cause the binary to crash.
  Out << LocalDateTimeString() << "\n";
+#endif

-  if (context.executable_name)
-    Out << "Running " << context.executable_name << "\n";
+  if (benchmark::BenchmarkReporter::Context::executable_name != nullptr) {
+    Out << "Running " << benchmark::BenchmarkReporter::Context::executable_name
+        << "\n";
+  }

  const CPUInfo &info = context.cpu_info;
  Out << "Run on (" << info.num_cpus << " X "
      << (info.cycles_per_second / 1000000.0) << " MHz CPU "
      << ((info.num_cpus > 1) ? "s" : "") << ")\n";
-  if (info.caches.size() != 0) {
+  if (!info.caches.empty()) {
    Out << "CPU Caches:\n";
-    for (auto &CInfo : info.caches) {
+    for (const auto &CInfo : info.caches) {
      Out << "  L" << CInfo.level << " " << CInfo.type << " "
-          << (CInfo.size / 1000) << "K";
-      if (CInfo.num_sharing != 0)
+          << (CInfo.size / 1024) << " KiB";
+      if (CInfo.num_sharing != 0) {
        Out << " (x" << (info.num_cpus / CInfo.num_sharing) << ")";
+      }
      Out << "\n";
    }
  }
@@ -59,17 +66,34 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
    Out << "Load Average: ";
    for (auto It = info.load_avg.begin(); It != info.load_avg.end();) {
      Out << StrFormat("%.2f", *It++);
-      if (It != info.load_avg.end()) Out << ", ";
+      if (It != info.load_avg.end()) {
+        Out << ", ";
+      }
    }
    Out << "\n";
  }

-  if (info.scaling_enabled) {
+  std::map<std::string, std::string> *global_context =
+      internal::GetGlobalContext();
+
+  if (global_context != nullptr) {
+    for (const auto &kv : *global_context) {
+      Out << kv.first << ": " << kv.second << "\n";
+    }
+  }
+
+  if (CPUInfo::Scaling::ENABLED == info.scaling) {
    Out << "***WARNING*** CPU scaling is enabled, the benchmark "
           "real time measurements may be noisy and will incur extra "
           "overhead.\n";
  }

+  const SystemInfo &sysinfo = context.sys_info;
+  if (SystemInfo::ASLR::ENABLED == sysinfo.ASLRStatus) {
+    Out << "***WARNING*** ASLR is enabled, the results may have unreproducible "
+           "noise in them.\n";
+  }
+
 #ifndef NDEBUG
  Out << "***WARNING*** Library was built as DEBUG. Timings may be "
         "affected.\n";
@@ -79,10 +103,11 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
 // No initializer because it's already initialized to NULL.
 const char *BenchmarkReporter::Context::executable_name;

-BenchmarkReporter::Context::Context() : cpu_info(CPUInfo::Get()) {}
+BenchmarkReporter::Context::Context()
+    : cpu_info(CPUInfo::Get()), sys_info(SystemInfo::Get()) {}

 std::string BenchmarkReporter::Run::benchmark_name() const {
-  std::string name = run_name;
+  std::string name = run_name.str();
  if (run_type == RT_Aggregate) {
    name += "_" + aggregate_name;
  }
@@ -91,13 +116,17 @@ std::string BenchmarkReporter::Run::benchmark_name() const {

 double BenchmarkReporter::Run::GetAdjustedRealTime() const {
  double new_time = real_accumulated_time * GetTimeUnitMultiplier(time_unit);
-  if (iterations != 0) new_time /= static_cast<double>(iterations);
+  if (iterations != 0) {
+    new_time /= static_cast<double>(iterations);
+  }
  return new_time;
 }

 double BenchmarkReporter::Run::GetAdjustedCPUTime() const {
  double new_time = cpu_accumulated_time * GetTimeUnitMultiplier(time_unit);
-  if (iterations != 0) new_time /= static_cast<double>(iterations);
+  if (iterations != 0) {
+    new_time /= static_cast<double>(iterations);
+  }
  return new_time;
 }

--- a/Show More
+++ b/Show More