Calling GetMappedRange immediately after MapAsync still returns a null pointer

Use a stage pool for WebGPU; however, GetMappedRange sometimes returns null, which breaks the code.
Try to fix test_UboBatching (#9448)
2025-11-21 16:53:03 -05:00 · 2025-11-21 10:57:02 -05:00 · 2025-11-20 15:14:19 +08:00 · 2025-11-20 14:28:13 +08:00 · 2025-11-20 00:52:04 +00:00 · 2025-11-19 22:03:05 +00:00
202 changed files with 20125 additions and 7899 deletions
--- a/docs/dup/intro.html
+++ b/docs/dup/intro.html
@@ -181,7 +181,7 @@ important for <code>matc</code> (material compiler).</p>
 }

 dependencies {
-    implementation 'com.google.android.filament:filament-android:1.66.2'
+    implementation 'com.google.android.filament:filament-android:1.67.1'
 }
 </code></pre>
 <p>Here are all the libraries available in the group <code>com.google.android.filament</code>:</p>
@@ -196,7 +196,7 @@ dependencies {
 </div>
 <h3 id="ios"><a class="header" href="#ios">iOS</a></h3>
 <p>iOS projects can use CocoaPods to install the latest release:</p>
-<pre><code class="language-shell">pod 'Filament', '~&gt; 1.66.2'
+<pre><code class="language-shell">pod 'Filament', '~&gt; 1.67.1'
 </code></pre>
 <h2 id="documentation"><a class="header" href="#documentation">Documentation</a></h2>
 <ul>
--- a/docs/main/filament.html
+++ b/docs/main/filament.html
--- a/docs/main/materials.html
+++ b/docs/main/materials.html
--- a/docs/searchindex.js
+++ b/docs/searchindex.js
--- a/docs/searchindex.json
+++ b/docs/searchindex.json
--- a/filament/backend/CMakeLists.txt
+++ b/filament/backend/CMakeLists.txt
@@ -316,6 +316,8 @@ if (FILAMENT_SUPPORTS_WEBGPU)
            src/webgpu/WebGPURenderPrimitive.h
            src/webgpu/WebGPURenderTarget.cpp
            src/webgpu/WebGPURenderTarget.h
+            src/webgpu/WebGPUStagePool.cpp
+            src/webgpu/WebGPUStagePool.h
            src/webgpu/WebGPUStrings.h
            src/webgpu/WebGPUSwapChain.cpp
            src/webgpu/WebGPUSwapChain.h
--- a/filament/backend/src/webgpu/WebGPUBufferBase.cpp
+++ b/filament/backend/src/webgpu/WebGPUBufferBase.cpp
@@ -18,6 +18,7 @@

 #include "WebGPUConstants.h"
 #include "WebGPUQueueManager.h"
+#include "WebGPUStagePool.h"

 #include "DriverBase.h"
 #include <backend/BufferDescriptor.h>
@@ -29,6 +30,7 @@

 #include <cstdint>
 #include <cstring>
+#include <iostream>

 namespace filament::backend {

@@ -65,7 +67,7 @@ WebGPUBufferBase::WebGPUBufferBase(wgpu::Device const& device, const wgpu::Buffe
 // of 4 by padding with zeros.
 void WebGPUBufferBase::updateGPUBuffer(BufferDescriptor const& bufferDescriptor,
        const uint32_t byteOffset, wgpu::Device const& device,
-        WebGPUQueueManager* const webGPUQueueManager) {
+        WebGPUQueueManager* const webGPUQueueManager, WebGPUStagePool* const webGPUStagePool) {
    FILAMENT_CHECK_PRECONDITION(bufferDescriptor.buffer)
            << "updateGPUBuffer called with a null buffer";
    FILAMENT_CHECK_PRECONDITION(bufferDescriptor.size + byteOffset <= mBuffer.GetSize())
@@ -79,34 +81,54 @@ void WebGPUBufferBase::updateGPUBuffer(BufferDescriptor const& bufferDescriptor,
    // This may have some performance implications. That should be investigated later.
    assert_invariant(mBuffer.GetUsage() & wgpu::BufferUsage::CopyDst);

-    // Calculate some alignment related sizes
+    // // Calculate some alignment related sizes
    const size_t remainder = bufferDescriptor.size % FILAMENT_WEBGPU_BUFFER_SIZE_MODULUS;
    const size_t mainBulk = bufferDescriptor.size - remainder;
    const size_t stagingBufferSize =
            remainder == 0 ? bufferDescriptor.size : mainBulk + FILAMENT_WEBGPU_BUFFER_SIZE_MODULUS;

-    // create a staging buffer
-    wgpu::BufferDescriptor descriptor{
-        .label = "Filament WebGPU Staging Buffer",
-        .usage = wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc,
-        .size = stagingBufferSize,
-        .mappedAtCreation = true };
-    wgpu::Buffer stagingBuffer = device.CreateBuffer(&descriptor);
+    Stage stage = webGPUStagePool->acquireBuffer(stagingBufferSize);

-    void* mappedRange = stagingBuffer.GetMappedRange();
-    memcpy(mappedRange, bufferDescriptor.buffer, bufferDescriptor.size);
+    std::string mappedRangeIsNull = stage.mappedRange
+            ? "no"
+            : "yes";
+    std::cout << "Run Yu: got mapped range on the staging buffer with size "
+              << stage.buffer.GetSize() << " and it is null? " <<  mappedRangeIsNull << std::endl;
+    memcpy(stage.mappedRange, bufferDescriptor.buffer, bufferDescriptor.size);

-    // Make sure the padded memory is set to 0 to have deterministic behaviors
-    if (remainder != 0) {
-        uint8_t* paddingStart = static_cast<uint8_t*>(mappedRange) + bufferDescriptor.size;
-        memset(paddingStart, 0, FILAMENT_WEBGPU_BUFFER_SIZE_MODULUS - remainder);
-    }
-
-    stagingBuffer.Unmap();
+    stage.buffer.Unmap();

+    std::cout << "Run Yu: about to issue copy command with actual staging buffer of size "
+              << stage.buffer.GetSize() << ", and computed size of " << stagingBufferSize
+              << ". The mBuffer size is " << mBuffer.GetSize() << std::endl;
    // Copy the staging buffer contents to the destination buffer.
-    webGPUQueueManager->getCommandEncoder().CopyBufferToBuffer(stagingBuffer, 0, mBuffer,
-            byteOffset, stagingBufferSize);
+    webGPUQueueManager->getCommandEncoder().CopyBufferToBuffer(stage.buffer, 0, mBuffer,
+            byteOffset,
+            remainder == 0 ? bufferDescriptor.size
+                           : mainBulk + FILAMENT_WEBGPU_BUFFER_SIZE_MODULUS);
+    webGPUQueueManager->flush();
+
+    struct UserData final {
+        wgpu::Buffer stagingBuffer;
+        WebGPUStagePool* webGPUStagePool;
+    };
+    auto userData = std::make_unique<UserData>(
+            UserData{ .stagingBuffer = stage.buffer, .webGPUStagePool = webGPUStagePool });
+    stage.buffer.MapAsync(wgpu::MapMode::Write, 0, stagingBufferSize,
+            wgpu::CallbackMode::AllowSpontaneous,
+            [data = std::move(userData)](wgpu::MapAsyncStatus status, const char* message) {
+                if (UTILS_LIKELY(status == wgpu::MapAsyncStatus::Success)) {
+                    std::cout << "Run Yu: successfully mapped a buffer with size "
+                              << data->stagingBuffer.GetSize() << std::endl;
+                    void* mappedRange = data->stagingBuffer.GetMappedRange();
+                    if (!mappedRange) {
+                        std::cout << "Run Yu: MAPPED RANGE IS NULL RIGHT AWAY!!\n";
+                    }
+                    data->webGPUStagePool->addBufferToPool(data->stagingBuffer, mappedRange);
+                } else {
+                    std::cout << "Run Yu: MAPPING UNSUCCESSFUL!!\n";
+                }
+            });
 }

 } // namespace filament::backend
--- a/filament/backend/src/webgpu/WebGPUBufferBase.h
+++ b/filament/backend/src/webgpu/WebGPUBufferBase.h
@@ -25,6 +25,7 @@ namespace filament::backend {

 class BufferDescriptor;
 class WebGPUQueueManager;
+class WebGPUStagePool;

 /**
  * A base class for WebGPU buffer objects, providing common functionality for creating and
@@ -40,7 +41,7 @@ public:
     * ensures the calls happen in the expected sequence.
     */
    void updateGPUBuffer(BufferDescriptor const&, uint32_t byteOffset, wgpu::Device const& device,
-            WebGPUQueueManager* const webGPUQueueManager);
+            WebGPUQueueManager* const webGPUQueueManager, WebGPUStagePool* const webGPUStagePool);

    [[nodiscard]] wgpu::Buffer const& getBuffer() const { return mBuffer; }

--- a/filament/backend/src/webgpu/WebGPUDriver.cpp
+++ b/filament/backend/src/webgpu/WebGPUDriver.cpp
@@ -107,6 +107,7 @@ WebGPUDriver::WebGPUDriver(WebGPUPlatform& platform,
      mAdapter{ mPlatform.requestAdapter(nullptr) },
      mDevice{ mPlatform.requestDevice(mAdapter) },
      mQueueManager{ mDevice },
+      mStagePool{ mDevice },
      mPipelineLayoutCache{ mDevice },
      mPipelineCache{ mDevice },
      mRenderPassMipmapGenerator{ mDevice, &mQueueManager },
@@ -856,7 +857,7 @@ void WebGPUDriver::updateIndexBuffer(Handle<HwIndexBuffer> indexBufferHandle,
    // draw calls are made.
    flush();
    handleCast<WebGPUIndexBuffer>(indexBufferHandle)
-            ->updateGPUBuffer(bufferDescriptor, byteOffset, mDevice, &mQueueManager);
+            ->updateGPUBuffer(bufferDescriptor, byteOffset, mDevice, &mQueueManager, &mStagePool);
    scheduleDestroy(std::move(bufferDescriptor));
 }

@@ -867,14 +868,14 @@ void WebGPUDriver::updateBufferObject(Handle<HwBufferObject> bufferObjectHandle,
    // draw calls are made.
    flush();
    handleCast<WebGPUBufferObject>(bufferObjectHandle)
-            ->updateGPUBuffer(bufferDescriptor, byteOffset, mDevice, &mQueueManager);
+            ->updateGPUBuffer(bufferDescriptor, byteOffset, mDevice, &mQueueManager, &mStagePool);
    scheduleDestroy(std::move(bufferDescriptor));
 }

 void WebGPUDriver::updateBufferObjectUnsynchronized(Handle<HwBufferObject> bufferObjectHandle,
        BufferDescriptor&& bufferDescriptor, const uint32_t byteOffset) {
    handleCast<WebGPUBufferObject>(bufferObjectHandle)
-            ->updateGPUBuffer(bufferDescriptor, byteOffset, mDevice, &mQueueManager);
+            ->updateGPUBuffer(bufferDescriptor, byteOffset, mDevice, &mQueueManager, &mStagePool);
    scheduleDestroy(std::move(bufferDescriptor));
 }

--- a/filament/backend/src/webgpu/WebGPUDriver.h
+++ b/filament/backend/src/webgpu/WebGPUDriver.h
@@ -25,6 +25,7 @@
 #include "webgpu/WebGPUPipelineLayoutCache.h"
 #include "webgpu/WebGPURenderPassMipmapGenerator.h"
 #include "webgpu/WebGPUQueueManager.h"
+#include "webgpu/WebGPUStagePool.h"
 #include "webgpu/utils/AsyncTaskCounter.h"
 #include <backend/platforms/WebGPUPlatform.h>

@@ -81,6 +82,7 @@ private:
    wgpu::Device mDevice = nullptr;
    wgpu::Limits mDeviceLimits = {};
    WebGPUQueueManager mQueueManager;
+    WebGPUStagePool mStagePool;
    void* mNativeWindow = nullptr;
    WebGPUSwapChain* mSwapChain = nullptr;
    uint64_t mNextFakeHandle = 1;
--- a/filament/backend/src/webgpu/WebGPUStagePool.cpp
+++ b/filament/backend/src/webgpu/WebGPUStagePool.cpp
@@ -0,0 +1,86 @@
+/*
+* Copyright (C) 2025 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "WebGPUStagePool.h"
+
+#include "WebGPUConstants.h"
+
+#include <iostream>
+
+namespace filament::backend {
+
+WebGPUStagePool::WebGPUStagePool(wgpu::Device const& device) : mDevice(device) {}
+
+WebGPUStagePool::~WebGPUStagePool() = default;
+
+Stage WebGPUStagePool::acquireBuffer(size_t requiredSize) {
+    std::cout << "Run Yu: required size in acquireBuffer: " << requiredSize << std::endl;
+    std::cout << "Run Yu: the pool size is " << mBuffers.size() << std::endl;
+    {
+        std::lock_guard<std::mutex> lock(mMutex);
+        auto iter = mBuffers.lower_bound(requiredSize);
+        if (iter != mBuffers.end()) {
+            const Stage& fromPool = iter->second;
+            std::cout << "Run Yu: found buffer in the pool with size " << fromPool.buffer.GetSize()
+                      << std::endl;
+            if (fromPool.buffer.GetMapState() != wgpu::BufferMapState::Mapped) {
+                std::cout << "Run Yu: buffer from pool is not mapped!!" << std::endl;
+            }
+
+            Stage result{ .buffer = fromPool.buffer, .mappedRange = fromPool.mappedRange };
+            mBuffers.erase(iter);
+            return result;
+        }
+    }
+    wgpu::Buffer newBuffer = createNewBuffer(requiredSize);
+    return { .buffer = newBuffer, .mappedRange = newBuffer.GetMappedRange() };
+}
+
+void WebGPUStagePool::addBufferToPool(wgpu::Buffer buffer, void* mappedRange) {
+    std::lock_guard<std::mutex> lock(mMutex);
+    std::cout << "Run Yu: adding buffer to the pool with size " << buffer.GetSize() << std::endl;
+    Stage stage {.buffer = buffer, .mappedRange = mappedRange};
+    mBuffers.emplace(buffer.GetSize(), stage);
+    std::cout << "Run Yu: added buffer to the pool with size " << buffer.GetSize() << std::endl;
+
+    bool allMapped = true;
+    for (const auto& pair : mBuffers) {
+        auto state = pair.second.buffer.GetMapState();
+        if (state != wgpu::BufferMapState::Mapped) {
+            allMapped = false;
+            std::cout << "Run Yu: the buffer with size " << pair.second.buffer.GetSize()
+                      << " is not mapped but somehow was added to the pool, its state is "
+                      << static_cast<int>(state) << std::endl;
+        }
+    }
+    if (!allMapped) {
+        std::cout << "Run Yu: found buffers that are not mapped\n";
+    } else {
+        std::cout << "Run Yu: all buffers are mapped\n";
+    }
+}
+
+wgpu::Buffer WebGPUStagePool::createNewBuffer(size_t bufferSize) {
+    std::cout << "Run Yu: creating new buffer with size " << bufferSize << std::endl;
+    wgpu::BufferDescriptor descriptor{
+        .label = "Filament WebGPU Staging Buffer",
+        .usage = wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc,
+        .size = bufferSize,
+        .mappedAtCreation = true };
+    return mDevice.CreateBuffer(&descriptor);
+}
+
+} // namespace filament::backend
--- a/filament/backend/src/webgpu/WebGPUStagePool.h
+++ b/filament/backend/src/webgpu/WebGPUStagePool.h
@@ -0,0 +1,49 @@
+/*
+* Copyright (C) 2025 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TNT_FILAMENT_BACKEND_WEBGPUSTAGEPOOL_H
+#define TNT_FILAMENT_BACKEND_WEBGPUSTAGEPOOL_H
+
+#include <webgpu/webgpu_cpp.h>
+
+#include <map>
+#include <mutex>
+
+namespace filament::backend {
+
+struct Stage {
+    wgpu::Buffer buffer;
+    void* mappedRange;
+};
+
+class WebGPUStagePool {
+public:
+    WebGPUStagePool(wgpu::Device const& device);
+    ~WebGPUStagePool();
+
+    Stage acquireBuffer(size_t requiredSize);
+    void addBufferToPool(wgpu::Buffer buffer, void* mappedRange);
+private:
+    wgpu::Buffer createNewBuffer(size_t bufferSize);
+    std::multimap<uint32_t, Stage> mBuffers;
+    mutable std::mutex mMutex;
+
+    wgpu::Device mDevice;
+};
+
+}
+
+#endif // TNT_FILAMENT_BACKEND_WEBGPUSTAGEPOOL_H
--- a/filament/src/details/BufferAllocator.cpp
+++ b/filament/src/details/BufferAllocator.cpp
@@ -16,6 +16,7 @@

 #include "details/BufferAllocator.h"

+#include <private/utils/Tracing.h>
 #include <utils/Panic.h>
 #include <utils/debug.h>

@@ -169,6 +170,7 @@ void BufferAllocator::releaseGpu(AllocationId id) {
 }

 void BufferAllocator::releaseFreeSlots() {
+    FILAMENT_TRACING_CALL(FILAMENT_TRACING_CATEGORY_FILAMENT);
    if (!mHasPendingFrees) {
        return;
    }
--- a/filament/src/details/Engine.cpp
+++ b/filament/src/details/Engine.cpp
@@ -718,8 +718,7 @@ void FEngine::prepare() {

    if (useUboBatching) {
        assert_invariant(mUboManager != nullptr);
-
-        mUboManager->beginFrame(driver, mMaterialInstances);
+        mUboManager->beginFrame(driver);
    }

    UboManager* uboManager = mUboManager;
@@ -758,7 +757,7 @@ void FEngine::gc() {
 void FEngine::submitFrame() {
    if (isUboBatchingEnabled()) {
        DriverApi& driver = getDriverApi();
-        getUboManager()->endFrame(driver, getMaterialInstanceResourceList());
+        getUboManager()->endFrame(driver);
    }
 }

@@ -1285,11 +1284,6 @@ UTILS_NOINLINE
 bool FEngine::destroy(const FMaterialInstance* p) {
    if (p == nullptr) return true;

-    if (p->isUsingUboBatching()) {
-        assert_invariant(isUboBatchingEnabled());
-        mUboManager->retireSlot(p->getAllocationId());
-    }
-
    // Check that the material instance we're destroying is not in use in the RenderableManager
    // To do this, we currently need to inspect all render primitives in the RenderableManager
    EntityManager const& em = mEntityManager;
--- a/filament/src/details/MaterialInstance.cpp
+++ b/filament/src/details/MaterialInstance.cpp
@@ -95,6 +95,7 @@ FMaterialInstance::FMaterialInstance(FEngine& engine, FMaterial const* material,

    if (mUseUboBatching) {
        mUboData = BufferAllocator::UNALLOCATED;
+        engine.getUboManager()->manageMaterialInstance(this);
    } else {
        mUboData = driver.createBufferObject(mUniforms.getSize(), BufferObjectBinding::UNIFORM,
                BufferUsage::STATIC, ImmutableCString{ material->getName().c_str_safe() });
@@ -167,6 +168,7 @@ FMaterialInstance::FMaterialInstance(FEngine& engine,

    if (mUseUboBatching) {
        mUboData = BufferAllocator::UNALLOCATED;
+        engine.getUboManager()->manageMaterialInstance(this);
    } else {
        mUboData = driver.createBufferObject(mUniforms.getSize(), BufferObjectBinding::UNIFORM,
                BufferUsage::DYNAMIC, ImmutableCString{ material->getName().c_str_safe() });
@@ -211,6 +213,10 @@ FMaterialInstance::~FMaterialInstance() noexcept = default;
 void FMaterialInstance::terminate(FEngine& engine) {
    FEngine::DriverApi& driver = engine.getDriverApi();
    mDescriptorSet.terminate(driver);
+    if (mUseUboBatching) {
+        engine.getUboManager()->unmanageMaterialInstance(this);
+    }
+
    auto* ubHandle = std::get_if<Handle<HwBufferObject>>(&mUboData);
    if (ubHandle){
        driver.destroyBufferObject(*ubHandle);
--- a/filament/src/details/UboManager.cpp
+++ b/filament/src/details/UboManager.cpp
@@ -48,6 +48,7 @@ void UboManager::FenceManager::track(DriverApi& driver, std::unordered_set<Alloc

 void UboManager::FenceManager::reclaimCompletedResources(DriverApi& driver,
        std::function<void(AllocationId)> const& onReclaimed) {
+    FILAMENT_TRACING_CALL(FILAMENT_TRACING_CATEGORY_FILAMENT);
    uint32_t signaledCount = 0;
    bool seenSignaledFence = false;

@@ -115,8 +116,7 @@ UboManager::UboManager(DriverApi& driver, allocation_size_t defaultSlotSizeInByt
    reallocate(driver, defaultTotalSizeInBytes);
 }

-void UboManager::beginFrame(DriverApi& driver,
-        const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>& materialInstances) {
+void UboManager::beginFrame(DriverApi& driver) {
    FILAMENT_TRACING_CALL(FILAMENT_TRACING_CATEGORY_FILAMENT);
    // Check finished frames and decrement GPU count accordingly.
    mFenceManager.reclaimCompletedResources(driver,
@@ -126,7 +126,7 @@ void UboManager::beginFrame(DriverApi& driver,
    mAllocator.releaseFreeSlots();

    // Traverse all MIs and see which of them need slot allocation.
-    if (allocateOnDemand(materialInstances) == SUCCESS) {
+    if (allocateOnDemand() == SUCCESS) {
        // No need to grow the buffer, so we can just map the buffer for writing and return.
        mMemoryMappedBufferHandle = driver.mapBuffer(mUbHandle, 0, mUboSize, MapBufferAccessFlags::WRITE_BIT,
                "UboManager");
@@ -135,25 +135,19 @@ void UboManager::beginFrame(DriverApi& driver,
    }

    // Calculate the required size and grow the Ubo.
-    const allocation_size_t requiredSize = calculateRequiredSize(materialInstances);
+    const allocation_size_t requiredSize = calculateRequiredSize();
    reallocate(driver, requiredSize);

    // Allocate slots for each MI on the new Ubo.
-    allocateAllInstances(materialInstances);
+    allocateAllInstances();

    // Map the buffer so that we can write to it
    mMemoryMappedBufferHandle =
            driver.mapBuffer(mUbHandle, 0, mUboSize, MapBufferAccessFlags::WRITE_BIT, "UboManager");

    // Invalidate the migrated MIs, so that next commit() call must be triggered.
-    for (const auto& materialInstance : materialInstances) {
-        materialInstance.second.forEach([](const FMaterialInstance* mi) {
-            if (!mi->isUsingUboBatching()) {
-                return;
-            }
-
-            mi->getUniformBuffer().invalidate();
-        });
+    for (const auto* mi : mManagedInstances) {
+        mi->getUniformBuffer().invalidate();
    }
 }

@@ -164,24 +158,16 @@ void UboManager::finishBeginFrame(DriverApi& driver) {
    }
 }

-void UboManager::endFrame(DriverApi& driver,
-        const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>& materialInstances) {
-    BufferAllocator& allocator = mAllocator;
+void UboManager::endFrame(DriverApi& driver) {
    std::unordered_set<AllocationId> allocationIds;
-    for (const auto& materialInstance : materialInstances) {
-        materialInstance.second.forEach([&allocator, &allocationIds](const FMaterialInstance* mi) {
-            if (!mi->isUsingUboBatching()) {
-                return;
-            }
+    for (const auto* mi : mManagedInstances) {
+        const AllocationId id = mi->getAllocationId();
+        if (UTILS_UNLIKELY(!BufferAllocator::isValid(id))) {
+            continue;
+        }

-            const AllocationId id = mi->getAllocationId();
-            if (!BufferAllocator::isValid(id)) {
-                return;
-            }
-
-            allocator.acquireGpu(id);
-            allocationIds.insert(id);
-        });
+        mAllocator.acquireGpu(id);
+        allocationIds.insert(id);
    }

    mFenceManager.track(driver, std::move(allocationIds));
@@ -194,76 +180,90 @@ void UboManager::terminate(DriverApi& driver) {

 void UboManager::updateSlot(DriverApi& driver, AllocationId id,
        BufferDescriptor bufferDescriptor) const {
-    if (!mMemoryMappedBufferHandle)
+    if (!mMemoryMappedBufferHandle) {
        return;
+    }

    const allocation_size_t offset = mAllocator.getAllocationOffset(id);
    driver.copyToMemoryMappedBuffer(mMemoryMappedBufferHandle, offset, std::move(bufferDescriptor));
 }

-void UboManager::retireSlot(BufferAllocator::AllocationId id) {
-    if (!BufferAllocator::isValid(id))
-        return;
-    mAllocator.retire(id);
+void UboManager::manageMaterialInstance(FMaterialInstance* instance) {
+    mPendingInstances.insert(instance);
 }

-UboManager::AllocationResult UboManager::allocateOnDemand(
-        const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                materialInstances) {
-    // Collect all MIs that need allocation into two groups.
-    std::vector<FMaterialInstance*> newInstances;
-    std::vector<FMaterialInstance*> existingInstances;
-    for (const auto& [_, miList] : materialInstances) {
-        miList.forEach([&](FMaterialInstance* mi) {
-            if (!mi->isUsingUboBatching()) {
-                return;
-            }
-            if (BufferAllocator::isValid(mi->getAllocationId())) {
-                existingInstances.push_back(mi);
-            } else {
-                newInstances.push_back(mi);
-            }
-        });
+void UboManager::unmanageMaterialInstance(FMaterialInstance* materialInstance) {
+    AllocationId id = materialInstance->getAllocationId();
+    mPendingInstances.erase(materialInstance);
+    mManagedInstances.erase(materialInstance);
+
+    if (!BufferAllocator::isValid(id)) {
+        return;
    }

+    mAllocator.retire(id);
+    materialInstance->assignUboAllocation(mUbHandle, BufferAllocator::UNALLOCATED, 0);
+}
+
+UboManager::AllocationResult UboManager::allocateOnDemand() {
+    FILAMENT_TRACING_CALL(FILAMENT_TRACING_CATEGORY_FILAMENT);
    bool reallocationNeeded = false;

    // Pass 1: Allocate slots for new material instances (that don't have a slot yet).
-    for (FMaterialInstance* mi : newInstances) {
+    for (auto* mi : mPendingInstances) {
+        mManagedInstances.insert(mi);
        auto [newId, newOffset] = mAllocator.allocate(mi->getUniformBuffer().getSize());
+
+        // Even if the newId is not valid, we assign it to the MI so that the following process knows
+        // this material instance was not allocated successfully. Then we can calculate the new
+        // required UBO size properly.
        mi->assignUboAllocation(mUbHandle, newId, newOffset);
+
        if (!BufferAllocator::isValid(newId)) {
            reallocationNeeded = true;
        }
    }
+    mPendingInstances.clear();

    // Pass 2: Allocate slots for existing material instances that need to be orphaned.
-    for (FMaterialInstance* mi : existingInstances) {
-        if (mi->getUniformBuffer().isDirty() && mAllocator.isLockedByGpu(mi->getAllocationId())) {
-            mAllocator.retire(mi->getAllocationId());
-            auto [newId, newOffset] = mAllocator.allocate(mi->getUniformBuffer().getSize());
-            mi->assignUboAllocation(mUbHandle, newId, newOffset);
-            if (!BufferAllocator::isValid(newId)) {
-                reallocationNeeded = true;
-            }
+    for (auto* mi: mManagedInstances) {
+        if (!BufferAllocator::isValid(mi->getAllocationId())) {
+            continue;
+        }
+
+        // This instance doesn't need orphaning.
+        if (!mi->getUniformBuffer().isDirty() || !mAllocator.isLockedByGpu(mi->getAllocationId())) {
+            continue;
+        }
+
+        mAllocator.retire(mi->getAllocationId());
+
+        // If the space is already not sufficient, we don't need to give another try on allocation.
+        if (reallocationNeeded) {
+            mi->assignUboAllocation(mUbHandle, REALLOCATION_REQUIRED, 0);
+            continue;
+        }
+
+        auto [newId, newOffset] = mAllocator.allocate(mi->getUniformBuffer().getSize());
+
+        // Even if the newId is not valid, we assign it to the MI so that the following process knows
+        // this material instance was not allocated successfully. Then we can calculate the new
+        // required UBO size properly.
+        mi->assignUboAllocation(mUbHandle, newId, newOffset);
+
+        if (!BufferAllocator::isValid(newId)) {
+            reallocationNeeded = true;
        }
    }

    return reallocationNeeded ? REALLOCATION_REQUIRED : SUCCESS;
 }

-void UboManager::allocateAllInstances(
-        const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                materialInstances) {
-    for (const auto& [_, miList] : materialInstances) {
-        miList.forEach([this](FMaterialInstance* mi) {
-            if (!mi->isUsingUboBatching()) {
-                return;
-            }
-            auto [newId, newOffset] = mAllocator.allocate(mi->getUniformBuffer().getSize());
-            assert_invariant(BufferAllocator::isValid(newId));
-            mi->assignUboAllocation(mUbHandle, newId, newOffset);
-        });
+void UboManager::allocateAllInstances() {
+    for (auto* mi: mManagedInstances) {
+        auto [newId, newOffset] = mAllocator.allocate(mi->getUniformBuffer().getSize());
+        assert_invariant(BufferAllocator::isValid(newId));
+        mi->assignUboAllocation(mUbHandle, newId, newOffset);
    }
 }

@@ -288,28 +288,19 @@ void UboManager::reallocate(DriverApi& driver, allocation_size_t requiredSize) {
            BufferUsage::DYNAMIC | BufferUsage::SHARED_WRITE_BIT);
 }

-allocation_size_t UboManager::calculateRequiredSize(
-        const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                materialInstances) {
-    BufferAllocator& allocator = mAllocator;
+allocation_size_t UboManager::calculateRequiredSize() {
    allocation_size_t newBufferSize = 0;
-    for (const auto& materialInstance: materialInstances) {
-        materialInstance.second.forEach([&newBufferSize, &allocator](const FMaterialInstance* mi) {
-            if (!mi->isUsingUboBatching()) {
-                return;
-            }
-
-            const AllocationId allocationId = mi->getAllocationId();
-            if (allocationId == BufferAllocator::REALLOCATION_REQUIRED) {
-                // For MIs whose parameters have been updated, aside from the slot it is being
-                // occupied by the GPU, we need to preserve an additional slot for it.
-                newBufferSize += 2 * allocator.alignUp(mi->getUniformBuffer().getSize());
-            } else {
-                newBufferSize += allocator.alignUp(mi->getUniformBuffer().getSize());
-            }
-        });
+    for (const auto* mi: mManagedInstances) {
+        const AllocationId allocationId = mi->getAllocationId();
+        if (allocationId == BufferAllocator::REALLOCATION_REQUIRED) {
+            // For MIs whose parameters have been updated, aside from the slot it is being
+            // occupied by the GPU, we need to preserve an additional slot for it.
+            newBufferSize += 2 * mAllocator.alignUp(mi->getUniformBuffer().getSize());
+        } else {
+            newBufferSize += mAllocator.alignUp(mi->getUniformBuffer().getSize());
+        }
    }
-    return allocator.alignUp(newBufferSize * BUFFER_SIZE_GROWTH_MULTIPLIER);
+    return mAllocator.alignUp(newBufferSize * BUFFER_SIZE_GROWTH_MULTIPLIER);
 }

 } // namespace filament
--- a/filament/src/details/UboManager.h
+++ b/filament/src/details/UboManager.h
@@ -17,7 +17,6 @@
 #ifndef TNT_FILAMENT_DETAILS_UBOMANAGER_H
 #define TNT_FILAMENT_DETAILS_UBOMANAGER_H

-#include "ResourceList.h"
 #include "backend/DriverApiForward.h"

 #include "details/BufferAllocator.h"
@@ -29,6 +28,8 @@
 #include <unordered_set>
 #include <vector>

+class UboManagerTest;
+
 namespace filament {

 class FMaterial;
@@ -96,9 +97,8 @@ public:
    //    instances with modified uniforms).
    // 3. Reallocating a larger shared UBO if the current one is insufficient.
    // 4. Mapping the shared UBO into CPU-accessible memory to prepare for uniform data writes.
-    void beginFrame(backend::DriverApi& driver,
-            const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                    materialInstances);
+    // Note that it must happen before committing all MIs.
+    void beginFrame(backend::DriverApi& driver);

    // Unmap the buffer here
    void finishBeginFrame(backend::DriverApi& driver);
@@ -106,23 +106,31 @@ public:
    // Create a fence and associate it with a set of allocation ids.
    // The gpuUseCount of these allocations will be incremented, and they will be decremented
    // After the corresponding frame has been done.
-    void endFrame(backend::DriverApi& driver,
-            const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                    materialInstances);
+    void endFrame(backend::DriverApi& driver);

    void terminate(backend::DriverApi& driver);

    void updateSlot(backend::DriverApi& driver, BufferAllocator::AllocationId id,
            backend::BufferDescriptor bufferDescriptor) const;

-    // Call this when a material instance is no longer holding a slot. e.g. it is destroyed.
-    void retireSlot(BufferAllocator::AllocationId id);
+    // Call this to register a new material instance to UboManager.
+    void manageMaterialInstance(FMaterialInstance* instance);
+
+    // Call this when a material instance is destroyed.
+    void unmanageMaterialInstance(FMaterialInstance* materialInstance);

    // Returns the size of the actual UBO. Note that when there's allocation failed, it will be
    // reallocated to a bigger size at the next frame.
    [[nodiscard]] BufferAllocator::allocation_size_t getTotalSize() const noexcept;

+    // For testing
+    [[nodiscard]] backend::MemoryMappedBufferHandle getMemoryMappedBufferHandle() const noexcept {
+        return mMemoryMappedBufferHandle;
+    }
+
 private:
+    friend class ::UboManagerTest;
+
    constexpr static float BUFFER_SIZE_GROWTH_MULTIPLIER = 1.5f;

    enum AllocationResult {
@@ -134,23 +142,19 @@ private:
    [[nodiscard]] BufferAllocator::allocation_size_t getAllocationOffset(
            BufferAllocator::AllocationId id) const;

-    AllocationResult allocateOnDemand(
-            const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                    materialInstances);
+    AllocationResult allocateOnDemand();

-    void allocateAllInstances(
-            const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                    materialInstances);
+    void allocateAllInstances();

    void reallocate(backend::DriverApi& driver, BufferAllocator::allocation_size_t requiredSize);

-    BufferAllocator::allocation_size_t calculateRequiredSize(
-            const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                    materialInstances);
+    BufferAllocator::allocation_size_t calculateRequiredSize();

    backend::Handle<backend::HwBufferObject> mUbHandle;
    backend::MemoryMappedBufferHandle mMemoryMappedBufferHandle;
    BufferAllocator::allocation_size_t mUboSize{};
+    std::unordered_set<FMaterialInstance*> mPendingInstances;
+    std::unordered_set<FMaterialInstance*> mManagedInstances;

    FenceManager mFenceManager;
    BufferAllocator mAllocator;
--- a/filament/src/details/VertexBuffer.cpp
+++ b/filament/src/details/VertexBuffer.cpp
@@ -285,53 +285,45 @@ FVertexBuffer::FVertexBuffer(FEngine& engine, const Builder& builder)

    // calculate buffer sizes
    size_t bufferSizes[MAX_VERTEX_BUFFER_COUNT] = {};
-    #pragma nounroll
-    for (size_t i = 0, n = mAttributes.size(); i < n; ++i) {
-        if (mDeclaredAttributes[i]) {
-            const uint32_t offset = mAttributes[i].offset;
-            const uint8_t stride = mAttributes[i].stride;
-            const uint8_t slot = mAttributes[i].buffer;
-            const size_t end = offset + mVertexCount * stride;
-            if (slot != Attribute::BUFFER_UNUSED) {
-                assert_invariant(slot < MAX_VERTEX_BUFFER_COUNT);
-                bufferSizes[slot] = std::max(bufferSizes[slot], end);
-            }
-        }
-    }
+
+    auto shouldCreateBuffer = [this](size_t attributeIndex) {
+        const uint8_t slot = mAttributes[attributeIndex].buffer;
+        return mDeclaredAttributes[attributeIndex] && slot != Attribute::BUFFER_UNUSED &&
+                !mBufferObjects[slot];
+    };
+    auto updateBufferSize = [&bufferSizes, this](size_t attributeIndex) {
+        const uint32_t offset = mAttributes[attributeIndex].offset;
+        const uint8_t stride = mAttributes[attributeIndex].stride;
+        const uint8_t slot = mAttributes[attributeIndex].buffer;
+        const size_t end = offset + mVertexCount * stride;
+        assert_invariant(slot < MAX_VERTEX_BUFFER_COUNT);
+        bufferSizes[slot] = std::max(bufferSizes[slot], end);
+    };

    if (!mBufferObjectsEnabled) {
-        // If buffer objects are not enabled at the API level, then we create them internally.
        #pragma nounroll
-        for (size_t index = 0; index < MAX_VERTEX_ATTRIBUTE_COUNT; ++index) {
-            size_t const i = mAttributes[index].buffer;
-            if (i != Attribute::BUFFER_UNUSED) {
-                assert_invariant(bufferSizes[i] > 0);
-                if (!mBufferObjects[i]) {
-                    BufferObjectHandle const bo = driver.createBufferObject(bufferSizes[i],
-                            BufferObjectBinding::VERTEX, BufferUsage::STATIC,
-                            utils::ImmutableCString{ builder.getName() });
-                    driver.setVertexBufferObject(mHandle, i, bo);
-                    mBufferObjects[i] = bo;
-                }
+        for (size_t i = 0, n = mAttributes.size(); i < n; ++i) {
+            if (shouldCreateBuffer(i)) {
+                updateBufferSize(i);
            }
        }
-    } else {
-        // in advanced skinning mode, we manage the BONE_INDICES and BONE_WEIGHTS arrays ourselves,
-        // so we have to set the corresponding buffer objects.
-        if (mAdvancedSkinningEnabled) {
-            for (auto const index : { BONE_INDICES, BONE_WEIGHTS }) {
-                size_t const i = mAttributes[index].buffer;
-                assert_invariant(i != Attribute::BUFFER_UNUSED);
-                assert_invariant(bufferSizes[i] > 0);
-                if (!mBufferObjects[i]) {
-                    BufferObjectHandle const bo = driver.createBufferObject(bufferSizes[i],
-                            BufferObjectBinding::VERTEX, BufferUsage::STATIC,
-                            utils::ImmutableCString{ builder.getName() });
-                    driver.setVertexBufferObject(mHandle, i, bo);
-                    mBufferObjects[i] = bo;
-                }
-            }
+    } else if (mAdvancedSkinningEnabled) {
+        // For advanced skinning mode, only relevant buffers (BONE_INDICES & BONE_WEIGHTS) are
+        // created. We manually populated the relevant attributes for those buffers above.
+        updateBufferSize(BONE_INDICES);
+        updateBufferSize(BONE_WEIGHTS);
+    }
+
+    // create buffers
+    for (size_t i = 0; i < MAX_VERTEX_BUFFER_COUNT; ++i) {
+        if (bufferSizes[i] == 0 || mBufferObjects[i]) {
+            continue;
        }
+        BufferObjectHandle const bo = driver.createBufferObject(bufferSizes[i],
+                BufferObjectBinding::VERTEX, BufferUsage::STATIC,
+                utils::ImmutableCString{ builder.getName() });
+        driver.setVertexBufferObject(mHandle, i, bo);
+        mBufferObjects[i] = bo;
    }
 }

--- a/filament/test/CMakeLists.txt
+++ b/filament/test/CMakeLists.txt
@@ -50,14 +50,17 @@ if (TNT_DEV)
            test_BufferAllocatorStress.cpp
            test_CircularQueue.cpp
            test_FenceManager.cpp
+            test_UboManager.cpp
            filament_test_exposure.cpp
            filament_rendering_test.cpp
            filament_bimap_test.cpp
            filament_framegraph_test.cpp
-            filament_test.cpp)
+            filament_test.cpp
+            ${RESGEN_SOURCE})

    target_link_libraries(test_${TARGET} PRIVATE filament gtest)
    target_compile_options(test_${TARGET} PRIVATE ${COMPILER_FLAGS})
+    target_include_directories(test_${TARGET} PRIVATE ${RESOURCE_DIR})
    set_target_properties(test_${TARGET} PROPERTIES FOLDER Tests)

    add_executable(test_depth depth_test.cpp)
--- a/filament/test/test_UboManager.cpp
+++ b/filament/test/test_UboManager.cpp
@@ -0,0 +1,366 @@
+/*
+ * Copyright (C) 2025 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "MockDriver.h"
+#include "details/MaterialInstance.h"
+#include "details/UboManager.h"
+
+#include <private/backend/CommandBufferQueue.h>
+#include <private/backend/CommandStream.h>
+#include <private/backend/Driver.h>
+
+#include "filament_test_resources.h"
+
+namespace {
+using namespace filament;
+using namespace backend;
+using ::testing::_;      // wildcard matcher in the getFenceStatus(_) expectations
+using ::testing::NiceMock;
+using ::testing::Return; // action in the WillOnce(Return(...)) expectations
+using AllocationId = BufferAllocator::AllocationId;
+using allocation_size_t = BufferAllocator::allocation_size_t;
+} // anonymous namespace
+
+class UboManagerTest : public ::testing::Test { // Drives a UboManager through a mocked driver.
+protected:
+    static constexpr size_t MIN_COMMAND_BUFFERS_SIZE = 1 * 1024 * 1024;
+    static constexpr size_t COMMAND_BUFFERS_SIZE = 3 * MIN_COMMAND_BUFFERS_SIZE;
+    static constexpr BufferAllocator::allocation_size_t DEFAULT_SLOT_SIZE = 64;
+    static constexpr BufferAllocator::allocation_size_t DEFAULT_TOTAL_SIZE = 1024; // 16 x slot
+
+    UboManagerTest()
+            : mCommandBufferQueue(MIN_COMMAND_BUFFERS_SIZE, COMMAND_BUFFERS_SIZE, false),
+              mCommandStream(mMockDriver, mCommandBufferQueue.getCircularBuffer()),
+
+              mDriverApi(mCommandStream),
+              // The UboManager constructor will call reallocate, which calls createBufferObject.
+              // MockDriver's default ...S() implementation returns an incrementing handle.
+              // So, the first handle will be 1.
+              mUboManager(mDriverApi, DEFAULT_SLOT_SIZE, DEFAULT_TOTAL_SIZE),
+              mPendingInstances(mUboManager.mPendingInstances),
+              mManagedInstances(mUboManager.mManagedInstances),
+              mUbHandle(mUboManager.mUbHandle),
+              mAllocator(mUboManager.mAllocator) {
+        mEngine = Engine::Builder()
+                          .feature("material.enable_material_instance_uniform_batching", true)
+                          .backend(Backend::NOOP)
+                          .build();
+
+        mMaterial = Material::Builder()
+                            .package(FILAMENT_TEST_RESOURCES_TEST_MATERIAL_DATA,
+                                    FILAMENT_TEST_RESOURCES_TEST_MATERIAL_SIZE)
+                            .build(*mEngine);
+    }
+
+    void TearDown() override { // Material first, then the engine it was built with.
+        mEngine->destroy(mMaterial);
+        Engine::destroy(&mEngine);
+    }
+
+    // The engine is only used to create materials and material instances; the UboManager under
+    // test is mUboManager, not the one inside the engine.
+    Engine* mEngine = nullptr;
+    NiceMock<MockDriver> mMockDriver;
+    CommandBufferQueue mCommandBufferQueue;
+    CommandStream mCommandStream;
+    DriverApi& mDriverApi;
+    UboManager mUboManager;
+    Material const* mMaterial;
+    std::unordered_set<FMaterialInstance*>& mPendingInstances;
+    std::unordered_set<FMaterialInstance*>& mManagedInstances;
+    Handle<HwBufferObject>& mUbHandle;
+    BufferAllocator& mAllocator; // The four references alias mUboManager's private members.
+};
+
+TEST_F(UboManagerTest, InitialState) { // Checks the state left by the fixture constructor.
+    EXPECT_EQ(mUboManager.getTotalSize(), DEFAULT_TOTAL_SIZE);
+    EXPECT_EQ(mMockDriver.nextFakeHandle, 2); // handle 1 is already taken (see fixture)
+    EXPECT_NE(mUbHandle.getId(), HandleBase::nullid);
+}
+
+TEST_F(UboManagerTest, BeginFrameWithoutReallocate) { // One instance, one frame, no growth.
+    BufferAllocator::allocation_size_t originalBufferSize = mUboManager.getTotalSize();
+    auto mi1 = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+    EXPECT_EQ(mi1->getAllocationId(), BufferAllocator::UNALLOCATED);
+    ASSERT_TRUE(mi1->isUsingUboBatching());
+
+    // mi1 should be put in the pending list.
+    mUboManager.manageMaterialInstance(mi1);
+    EXPECT_TRUE(mPendingInstances.contains(mi1));
+    EXPECT_FALSE(mManagedInstances.contains(mi1));
+
+    mUboManager.beginFrame(mDriverApi);
+
+    // mi1 should be moved to the managed list after beginFrame.
+    EXPECT_FALSE(mPendingInstances.contains(mi1));
+    EXPECT_TRUE(mManagedInstances.contains(mi1));
+    // It should have an allocation after beginFrame.
+    EXPECT_TRUE(BufferAllocator::isValid(mi1->getAllocationId()));
+
+    // Reallocation is not triggered in this case.
+    EXPECT_EQ(mUboManager.getTotalSize(), originalBufferSize);
+    EXPECT_NE(mUboManager.getMemoryMappedBufferHandle().getId(), HandleBase::nullid);
+
+    mUboManager.finishBeginFrame(mDriverApi);
+    EXPECT_EQ(mUboManager.getMemoryMappedBufferHandle().getId(), HandleBase::nullid);
+
+    mUboManager.endFrame(mDriverApi);
+    EXPECT_TRUE(mAllocator.isLockedByGpu(mi1->getAllocationId()));
+
+    // We're not using the UboManager inside mEngine, so we need to unmanage the material instance
+    // ourselves.
+    mUboManager.unmanageMaterialInstance(mi1);
+    EXPECT_FALSE(mPendingInstances.contains(mi1));
+    EXPECT_FALSE(mManagedInstances.contains(mi1));
+
+    mUboManager.terminate(mDriverApi);
+    mEngine->destroy(mi1);
+}
+
+TEST_F(UboManagerTest, BeginFrameWithReallocate) { // One instance too many forces growth.
+    const allocation_size_t originalBufferSize = mUboManager.getTotalSize();
+    const Handle<HwBufferObject> originalBufferHandle = mUbHandle;
+
+    // Create enough material instances to trigger a reallocation.
+    constexpr size_t numInstances = (DEFAULT_TOTAL_SIZE / DEFAULT_SLOT_SIZE) + 1;
+    std::vector<FMaterialInstance*> instances;
+    instances.reserve(numInstances);
+
+    for (size_t i = 0; i < numInstances; ++i) {
+        auto mi = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+        instances.push_back(mi);
+        mUboManager.manageMaterialInstance(mi);
+    }
+
+    // Before beginFrame, all instances should be pending.
+    EXPECT_EQ(mPendingInstances.size(), numInstances);
+    EXPECT_TRUE(mManagedInstances.empty());
+
+    mUboManager.beginFrame(mDriverApi);
+
+    // After beginFrame, reallocation should have occurred: new buffer handle, larger size.
+    EXPECT_NE(mUbHandle.getId(), originalBufferHandle.getId());
+    EXPECT_GT(mUboManager.getTotalSize(), originalBufferSize);
+
+    // All instances should now be managed and have valid allocations.
+    EXPECT_TRUE(mPendingInstances.empty());
+    EXPECT_EQ(mManagedInstances.size(), numInstances);
+    for (const auto* mi: instances) {
+        EXPECT_TRUE(mManagedInstances.contains(const_cast<FMaterialInstance*>(mi)));
+        EXPECT_TRUE(BufferAllocator::isValid(mi->getAllocationId()));
+    }
+
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.terminate(mDriverApi);
+
+    for (auto* mi: instances) {
+        // We're not using the UboManager inside mEngine, so we need to unmanage the material
+        // instance ourselves.
+        mUboManager.unmanageMaterialInstance(mi);
+        mEngine->destroy(mi);
+    }
+}
+
+TEST_F(UboManagerTest, RecycleSlot) { // A retired slot is reused once its fence signals.
+    auto mi1 = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+    mUboManager.manageMaterialInstance(mi1);
+
+    // Frame 1: mi1 gets an allocation.
+    mUboManager.beginFrame(mDriverApi);
+    const AllocationId mi1AllocationId = mi1->getAllocationId();
+    const allocation_size_t mi1AllocationOffset =
+            mAllocator.getAllocationOffset(mi1AllocationId);
+    EXPECT_TRUE(BufferAllocator::isValid(mi1AllocationId));
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.endFrame(mDriverApi); // Locks mi1's allocation.
+
+    // Now, unmanage mi1. The slot should be retired but not yet released.
+    mUboManager.unmanageMaterialInstance(mi1);
+    EXPECT_TRUE(mAllocator.isLockedByGpu(mi1AllocationId));
+
+    // Frame 2: The slot for mi1 is still locked by the GPU.
+    // We expect getFenceStatus to be called for the fence from frame 1.
+    // We mock it to return TIMEOUT_EXPIRED, so the resource is not reclaimed.
+    EXPECT_CALL(mMockDriver, getFenceStatus(_)).WillOnce(Return(FenceStatus::TIMEOUT_EXPIRED));
+    mUboManager.beginFrame(mDriverApi);
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.endFrame(mDriverApi);
+
+    // Frame 3: Now we simulate that the fence from frame 1 has signaled.
+    // The resource for mi1 should be reclaimed.
+    EXPECT_CALL(mMockDriver, getFenceStatus(_)).WillOnce(Return(FenceStatus::CONDITION_SATISFIED));
+
+    auto mi2 = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+    mUboManager.manageMaterialInstance(mi2);
+
+    mUboManager.beginFrame(mDriverApi);
+
+    // mi2 should now have a valid allocation, and it should reuse the slot from mi1.
+    EXPECT_TRUE(BufferAllocator::isValid(mi2->getAllocationId()));
+    EXPECT_EQ(mAllocator.getAllocationOffset(mi2->getAllocationId()), mi1AllocationOffset);
+
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.unmanageMaterialInstance(mi2);
+    mUboManager.terminate(mDriverApi);
+
+    mEngine->destroy(mi1);
+    mEngine->destroy(mi2);
+}
+
+TEST_F(UboManagerTest, OrphanSlot) { // A dirty instance on a GPU-locked slot moves to a new one.
+    auto mi1 = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+    mUboManager.manageMaterialInstance(mi1);
+
+    // Frame 1: mi1 gets an allocation.
+    mUboManager.beginFrame(mDriverApi);
+    const AllocationId alloc1 = mi1->getAllocationId();
+    EXPECT_TRUE(BufferAllocator::isValid(alloc1));
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.endFrame(mDriverApi); // Locks alloc1.
+
+    // Frame 2: Mark the instance as dirty and begin a new frame while alloc1 is still locked.
+    // This should trigger orphaning.
+    mi1->getUniformBuffer().invalidate();
+    EXPECT_CALL(mMockDriver, getFenceStatus(_)).WillOnce(Return(FenceStatus::TIMEOUT_EXPIRED));
+    mUboManager.beginFrame(mDriverApi);
+
+    const AllocationId alloc2 = mi1->getAllocationId();
+    EXPECT_TRUE(BufferAllocator::isValid(alloc2));
+    EXPECT_NE(alloc1, alloc2); // Should have a new allocation.
+
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.endFrame(mDriverApi); // Locks alloc2.
+
+    // Frame 3: The fence for alloc1 should now be signaled.
+    EXPECT_CALL(mMockDriver, getFenceStatus(_))
+            .WillOnce(Return(FenceStatus::TIMEOUT_EXPIRED))      // For alloc2's fence
+            .WillOnce(Return(FenceStatus::CONDITION_SATISFIED)); // For alloc1's fence
+    mUboManager.beginFrame(mDriverApi);
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.unmanageMaterialInstance(mi1);
+    mUboManager.terminate(mDriverApi);
+
+    mEngine->destroy(mi1);
+}
+
+TEST_F(UboManagerTest, DoubleManage) { // Managing twice registers the instance only once.
+    auto mi1 = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+    mUboManager.manageMaterialInstance(mi1);
+    EXPECT_EQ(mPendingInstances.size(), 1);
+
+    // Managing the same instance again should be a no-op: the pending set must not grow.
+    mUboManager.manageMaterialInstance(mi1);
+    EXPECT_EQ(mPendingInstances.size(), 1);
+
+    mUboManager.terminate(mDriverApi);
+    mEngine->destroy(mi1);
+}
+
+TEST_F(UboManagerTest, ManageAndUnmanageBeforeBeginFrame) { // Such an instance gets no slot.
+    auto mi1 = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+    mUboManager.manageMaterialInstance(mi1);
+    EXPECT_TRUE(mPendingInstances.contains(mi1));
+
+    mUboManager.unmanageMaterialInstance(mi1);
+    EXPECT_FALSE(mPendingInstances.contains(mi1));
+
+    // After beginFrame, the instance should not be in any list and should remain unallocated.
+    mUboManager.beginFrame(mDriverApi);
+    EXPECT_FALSE(mPendingInstances.contains(mi1));
+    EXPECT_FALSE(mManagedInstances.contains(mi1));
+    EXPECT_EQ(mi1->getAllocationId(), BufferAllocator::UNALLOCATED);
+
+    mUboManager.terminate(mDriverApi);
+    mEngine->destroy(mi1);
+}
+
+TEST_F(UboManagerTest, UnmanageUnmanaged) { // Unmanage without a prior manage.
+    auto mi1 = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+
+    // Unmanaging an instance that was never managed should not cause any issues.
+    mUboManager.unmanageMaterialInstance(mi1);
+    EXPECT_FALSE(mPendingInstances.contains(mi1));
+    EXPECT_FALSE(mManagedInstances.contains(mi1));
+
+    mUboManager.terminate(mDriverApi);
+    mEngine->destroy(mi1);
+}
+
+TEST_F(UboManagerTest, AllAllocationsLockedAfterEndFrame) { // endFrame locks every slot.
+    constexpr size_t numInstances = 5;
+    std::vector<FMaterialInstance*> instances;
+    instances.reserve(numInstances);
+
+    for (size_t i = 0; i < numInstances; ++i) {
+        auto mi = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+        instances.push_back(mi);
+        mUboManager.manageMaterialInstance(mi);
+    }
+
+    mUboManager.beginFrame(mDriverApi);
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.endFrame(mDriverApi);
+
+    for (const auto* mi: instances) {
+        EXPECT_TRUE(mAllocator.isLockedByGpu(mi->getAllocationId()));
+    }
+
+    mUboManager.terminate(mDriverApi);
+    for (auto* mi: instances) {
+        // We're not using the UboManager inside mEngine, so we need to unmanage the material
+        // instance ourselves.
+        mUboManager.unmanageMaterialInstance(mi);
+        mEngine->destroy(mi);
+    }
+}
+
+TEST_F(UboManagerTest, AllAllocationsLockedAfterEndFrameWithInvalidIdInBetween) {
+    constexpr size_t numInstances = 5;
+    std::vector<FMaterialInstance*> instances;
+    instances.reserve(numInstances);
+
+    for (size_t i = 0; i < numInstances; ++i) {
+        auto mi = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+        instances.push_back(mi);
+        mUboManager.manageMaterialInstance(mi);
+    }
+
+    mUboManager.beginFrame(mDriverApi);
+    mUboManager.finishBeginFrame(mDriverApi);
+
+    // An invalid id at endFrame should be rare; every other instance must still get locked.
+    instances[2]->assignUboAllocation(mUbHandle, BufferAllocator::REALLOCATION_REQUIRED, 0);
+    mUboManager.endFrame(mDriverApi);
+
+    for (const auto* mi: instances) {
+        if (BufferAllocator::isValid(mi->getAllocationId())) {
+            EXPECT_TRUE(mAllocator.isLockedByGpu(mi->getAllocationId()));
+        }
+    }
+
+    mUboManager.terminate(mDriverApi);
+    for (auto* mi: instances) {
+        // We're not using the UboManager inside mEngine, so we need to unmanage the material
+        // instance ourselves.
+        mUboManager.unmanageMaterialInstance(mi);
+        mEngine->destroy(mi);
+    }
+}
+
+// TODO: Add more tests for the beginFrame flow
--- a/filament/test/test_material.filamat
+++ b/filament/test/test_material.filamat
--- a/filament/test/test_material_transformname.filamat
+++ b/filament/test/test_material_transformname.filamat
--- a/third_party/benchmark/.clang-format
+++ b/third_party/benchmark/.clang-format
@@ -1,5 +1,5 @@
 ---
 Language:        Cpp
 BasedOnStyle:  Google
+PointerAlignment: Left
 ...
-
--- a/third_party/benchmark/.clang-tidy
+++ b/third_party/benchmark/.clang-tidy
@@ -0,0 +1,37 @@
+---
+Checks: >
+  abseil-*,
+  bugprone-*,
+  clang-analyzer-*,
+  cppcoreguidelines-*,
+  google-*,
+  misc-*,
+  performance-*,
+  readability-*,
+  -clang-analyzer-deadcode*,
+  -clang-analyzer-optin*,
+  -readability-identifier-length
+WarningsAsErrors: ''
+HeaderFilterRegex: ''
+FormatStyle:     none
+CheckOptions:
+  llvm-else-after-return.WarnOnConditionVariables: 'false'
+  modernize-loop-convert.MinConfidence: reasonable
+  modernize-replace-auto-ptr.IncludeStyle: llvm
+  cert-str34-c.DiagnoseSignedUnsignedCharComparisons: 'false'
+  google-readability-namespace-comments.ShortNamespaceLines: '10'
+  cert-err33-c.CheckedFunctions: '::aligned_alloc;::asctime_s;::at_quick_exit;::atexit;::bsearch;::bsearch_s;::btowc;::c16rtomb;::c32rtomb;::calloc;::clock;::cnd_broadcast;::cnd_init;::cnd_signal;::cnd_timedwait;::cnd_wait;::ctime_s;::fclose;::fflush;::fgetc;::fgetpos;::fgets;::fgetwc;::fopen;::fopen_s;::fprintf;::fprintf_s;::fputc;::fputs;::fputwc;::fputws;::fread;::freopen;::freopen_s;::fscanf;::fscanf_s;::fseek;::fsetpos;::ftell;::fwprintf;::fwprintf_s;::fwrite;::fwscanf;::fwscanf_s;::getc;::getchar;::getenv;::getenv_s;::gets_s;::getwc;::getwchar;::gmtime;::gmtime_s;::localtime;::localtime_s;::malloc;::mbrtoc16;::mbrtoc32;::mbsrtowcs;::mbsrtowcs_s;::mbstowcs;::mbstowcs_s;::memchr;::mktime;::mtx_init;::mtx_lock;::mtx_timedlock;::mtx_trylock;::mtx_unlock;::printf_s;::putc;::putwc;::raise;::realloc;::remove;::rename;::scanf;::scanf_s;::setlocale;::setvbuf;::signal;::snprintf;::snprintf_s;::sprintf;::sprintf_s;::sscanf;::sscanf_s;::strchr;::strerror_s;::strftime;::strpbrk;::strrchr;::strstr;::strtod;::strtof;::strtoimax;::strtok;::strtok_s;::strtol;::strtold;::strtoll;::strtoul;::strtoull;::strtoumax;::strxfrm;::swprintf;::swprintf_s;::swscanf;::swscanf_s;::thrd_create;::thrd_detach;::thrd_join;::thrd_sleep;::time;::timespec_get;::tmpfile;::tmpfile_s;::tmpnam;::tmpnam_s;::tss_create;::tss_get;::tss_set;::ungetc;::ungetwc;::vfprintf;::vfprintf_s;::vfscanf;::vfscanf_s;::vfwprintf;::vfwprintf_s;::vfwscanf;::vfwscanf_s;::vprintf_s;::vscanf;::vscanf_s;::vsnprintf;::vsnprintf_s;::vsprintf;::vsprintf_s;::vsscanf;::vsscanf_s;::vswprintf;::vswprintf_s;::vswscanf;::vswscanf_s;::vwprintf_s;::vwscanf;::vwscanf_s;::wcrtomb;::wcschr;::wcsftime;::wcspbrk;::wcsrchr;::wcsrtombs;::wcsrtombs_s;::wcsstr;::wcstod;::wcstof;::wcstoimax;::wcstok;::wcstok_s;::wcstol;::wcstold;::wcstoll;::wcstombs;::wcstombs_s;::wcstoul;::wcstoull;::wcstoumax;::wcsxfrm;::wctob;::wctrans;::wctype;::wmemchr;::wprintf_s;::wscanf;::wscanf_s;'
+  cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField: 'false'
+  cert-dcl16-c.NewSuffixes: 'L;LL;LU;LLU'
+  google-readability-braces-around-statements.ShortStatementLines: '1'
+  cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic: 'true'
+  google-readability-namespace-comments.SpacesBeforeComments: '2'
+  modernize-loop-convert.MaxCopySize: '16'
+  modernize-pass-by-value.IncludeStyle: llvm
+  modernize-use-nullptr.NullMacros: 'NULL'
+  llvm-qualified-auto.AddConstToQualified: 'false'
+  modernize-loop-convert.NamingStyle: CamelCase
+  llvm-else-after-return.WarnOnUnfixable: 'false'
+  google-readability-function-size.StatementThreshold: '800'
+...
+
--- a/third_party/benchmark/.clang-tidy.ignore
+++ b/third_party/benchmark/.clang-tidy.ignore
@@ -0,0 +1 @@
+.*third_party/.*
--- a/third_party/benchmark/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/third_party/benchmark/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,32 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: "[BUG]"
+labels: ''
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**System**
+Which OS, compiler, and compiler version are you using:
+  - OS: 
+  - Compiler and version: 
+
+**To reproduce**
+Steps to reproduce the behavior:
+1. sync to commit ...
+2. cmake/bazel...
+3. make ...
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Additional context**
+Add any other context about the problem here.
--- a/third_party/benchmark/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/third_party/benchmark/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: "[FR]"
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
--- a/third_party/benchmark/.github/install_bazel.sh
+++ b/third_party/benchmark/.github/install_bazel.sh
@@ -0,0 +1,12 @@
+if ! bazel version; then
+  arch=$(uname -m)
+  if [ "$arch" == "aarch64" ]; then
+    arch="arm64"
+  fi
+  echo "Downloading $arch Bazel binary from GitHub releases."
+  curl -L -o $HOME/bin/bazel --create-dirs "https://github.com/bazelbuild/bazel/releases/download/8.2.0/bazel-8.2.0-linux-$arch"
+  chmod +x $HOME/bin/bazel
+else
+  # Bazel is installed for the correct architecture
+  exit 0
+fi
--- a/third_party/benchmark/.github/libcxx-setup.sh
+++ b/third_party/benchmark/.github/libcxx-setup.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+set -e
+
+# Checkout LLVM sources
+git clone --filter=blob:none --depth=1 --branch llvmorg-19.1.6 --no-checkout https://github.com/llvm/llvm-project.git llvm-project
+cd llvm-project
+git sparse-checkout set --cone
+git checkout llvmorg-19.1.6
+git sparse-checkout set cmake llvm/cmake runtimes libcxx libcxxabi
+cd ..
+
+## Setup libc++ options
+if [ -z "$BUILD_32_BITS" ]; then
+  export BUILD_32_BITS=OFF && echo disabling 32 bit build
+fi
+
+## Build and install libc++ (Use unstable ABI for better sanitizer coverage)
+mkdir llvm-build && cd llvm-build
+cmake -GNinja                                   \
+      -DCMAKE_C_COMPILER=${CC}                  \
+      -DCMAKE_CXX_COMPILER=${CXX}               \
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo         \
+      -DCMAKE_INSTALL_PREFIX=/usr               \
+      -DLIBCXX_ABI_UNSTABLE=OFF                 \
+      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER}  \
+      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS}     \
+      -DLIBCXXABI_USE_LLVM_UNWINDER=OFF         \
+      -DLLVM_INCLUDE_TESTS=OFF                  \
+      -DLIBCXX_INCLUDE_TESTS=OFF                \
+      -DLIBCXX_INCLUDE_BENCHMARKS=OFF           \
+      -DLLVM_ENABLE_RUNTIMES='libcxx;libcxxabi' \
+      ../llvm-project/runtimes/
+cmake --build . -- cxx cxxabi
+cd ..
--- a/third_party/benchmark/.github/workflows/bazel.yml
+++ b/third_party/benchmark/.github/workflows/bazel.yml
@@ -0,0 +1,37 @@
+name: bazel
+
+on:
+  push: {}
+  pull_request: {}
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  build_and_test_default:
+    name: bazel.${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: mount bazel cache
+      uses: actions/cache@v4
+      env:
+        cache-name: bazel-cache
+      with:
+        path: "~/.cache/bazel"
+        key: ${{ env.cache-name }}-${{ matrix.os }}-${{ github.ref }}
+        restore-keys: |
+          ${{ env.cache-name }}-${{ matrix.os }}-main
+
+    - name: build
+      run: |
+        bazel build //:benchmark //:benchmark_main //test/...
+
+    - name: test
+      run: |
+        bazel test --test_output=all //test/...
--- a/third_party/benchmark/.github/workflows/build-and-test-min-cmake.yml
+++ b/third_party/benchmark/.github/workflows/build-and-test-min-cmake.yml
@@ -0,0 +1,49 @@
+name: build-and-test-min-cmake
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  job:
+    name: ${{ matrix.os }}.min-cmake
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: lukka/get-cmake@latest
+        with:
+          cmakeVersion: 3.13.0
+
+      - name: create build environment
+        run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+      - name: setup cmake initial cache
+        run: touch compiler-cache.cmake
+
+      - name: configure cmake
+        env:
+          CXX: ${{ matrix.compiler }}
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: >
+          cmake -C ${{ github.workspace }}/compiler-cache.cmake
+          $GITHUB_WORKSPACE
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DCMAKE_CXX_VISIBILITY_PRESET=hidden
+          -DCMAKE_VISIBILITY_INLINES_HIDDEN=ON
+
+      - name: build
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: cmake --build .
--- a/third_party/benchmark/.github/workflows/build-and-test-perfcounters.yml
+++ b/third_party/benchmark/.github/workflows/build-and-test-perfcounters.yml
@@ -0,0 +1,54 @@
+name: build-and-test-perfcounters
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  job:
+    # TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
+    name: ${{ matrix.os }}.${{ matrix.build_type }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        build_type: ['Release', 'Debug']
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: install libpfm
+      run: |
+        sudo apt update
+        sudo apt -y install libpfm4-dev
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: >
+        cmake $GITHUB_WORKSPACE
+        -DBENCHMARK_ENABLE_LIBPFM=1
+        -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+    - name: build
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake --build . --config ${{ matrix.build_type }}
+
+    # Skip testing, for now. It seems perf_event_open does not succeed on the
+    # hosting machine, very likely a permissions issue.
+    # TODO(mtrofin): Enable test.
+    # - name: test
+    #   shell: bash
+    #   working-directory: ${{ runner.workspace }}/_build
+    #   run: ctest -C ${{ matrix.build_type }} --rerun-failed --output-on-failure
+
--- a/third_party/benchmark/.github/workflows/build-and-test.yml
+++ b/third_party/benchmark/.github/workflows/build-and-test.yml
@@ -0,0 +1,151 @@
+name: build-and-test
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  # TODO: add 32-bit builds (g++ and clang++) for ubuntu
+  #   (requires g++-multilib and libc6:i386)
+  # TODO: add coverage build (requires lcov)
+  # TODO: add clang + libc++ builds for ubuntu
+  job:
+    name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.compiler }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-24.04, ubuntu-22.04, ubuntu-24.04-arm, macos-latest]
+        build_type: ['Release', 'Debug']
+        compiler: ['g++', 'clang++']
+        lib: ['shared', 'static']
+
+    steps:
+      - name: Install dependencies (macos)
+        if: runner.os == 'macOS'
+        run: brew install ninja
+
+      - uses: actions/checkout@v4
+
+      - name: build
+        uses: threeal/cmake-action@v2.1.0
+        with:
+          build-dir: ${{ runner.workspace }}/_build
+          cxx-compiler: ${{ matrix.compiler }}
+          options: |
+            BENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+            BUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+            CMAKE_BUILD_TYPE=${{ matrix.build_type }}
+            CMAKE_CXX_COMPILER=${{ matrix.compiler }}
+            CMAKE_CXX_VISIBILITY_PRESET=hidden
+            CMAKE_VISIBILITY_INLINES_HIDDEN=ON
+
+      - name: test
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: ctest -C ${{ matrix.build_type }} -VV
+
+  msvc:
+    name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.msvc }}
+    runs-on: ${{ matrix.os }}
+    defaults:
+        run:
+            shell: powershell
+    strategy:
+      fail-fast: false
+      matrix:
+        msvc:
+          - VS-16-2019
+          - VS-17-2022
+        build_type:
+          - Debug
+          - Release
+        lib:
+          - shared
+          - static
+        include:
+          - msvc: VS-16-2019
+            os: windows-2019
+            generator: 'Visual Studio 16 2019'
+          - msvc: VS-17-2022
+            os: windows-2022
+            generator: 'Visual Studio 17 2022'
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: lukka/get-cmake@latest
+
+      - name: configure cmake
+        run: >
+          cmake -S . -B ${{ runner.workspace }}/_build/
+          -G "${{ matrix.generator }}"
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+
+      - name: build
+        run: cmake --build ${{ runner.workspace }}/_build/ --config ${{ matrix.build_type }}
+
+      - name: test
+        run: ctest --test-dir ${{ runner.workspace }}/_build/ -C ${{ matrix.build_type }} -VV
+
+  msys2:
+    name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.msys2.msystem }}
+    runs-on: ${{ matrix.os }}
+    defaults:
+        run:
+            shell: msys2 {0}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ windows-latest ]
+        msys2:
+          - { msystem: MINGW64,    arch: x86_64,  family: GNU,  compiler: g++ }
+          - { msystem: CLANG64,    arch: x86_64,  family: LLVM, compiler: clang++ }
+          - { msystem: UCRT64,     arch: x86_64,  family: GNU,  compiler: g++ }
+        build_type:
+          - Debug
+          - Release
+        lib:
+          - shared
+          - static
+
+    steps:
+      - name: setup msys2
+        uses: msys2/setup-msys2@v2
+        with:
+          cache: false
+          msystem: ${{ matrix.msys2.msystem }}
+          update: true
+          install: >-
+            git
+            base-devel
+          pacboy: >-
+            gcc:p
+            clang:p
+            cmake:p
+            ninja:p
+
+      - uses: actions/checkout@v4
+
+      # NOTE: we can't use cmake actions here as we need to do everything in msys2 shell.
+      - name: configure cmake
+        env:
+          CXX: ${{ matrix.msys2.compiler }}
+        run: >
+          cmake -S . -B _build/
+          -GNinja
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+
+      - name: build
+        run: cmake --build _build/ --config ${{ matrix.build_type }}
+
+      - name: test
+        working-directory: _build
+        run: ctest -C ${{ matrix.build_type }} -VV
--- a/third_party/benchmark/.github/workflows/clang-format-lint.yml
+++ b/third_party/benchmark/.github/workflows/clang-format-lint.yml
@@ -0,0 +1,19 @@
+name: clang-format-lint
+on:
+  push: {}
+  pull_request: {}
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  job:
+    name: check-clang-format
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - uses: DoozyX/clang-format-lint-action@v0.18.2
+      with:
+        source: './include/benchmark ./src ./test'
+        clangFormatVersion: 18
--- a/third_party/benchmark/.github/workflows/clang-tidy-lint.yml
+++ b/third_party/benchmark/.github/workflows/clang-tidy-lint.yml
@@ -0,0 +1,41 @@
+name: clang-tidy
+
+on:
+  push: {}
+  pull_request: {}
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  job:
+    name: run-clang-tidy
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: install clang-tidy
+      run: sudo apt update && sudo apt -y install clang-tidy
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ github.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ github.workspace }}/_build
+      run: >
+        cmake $GITHUB_WORKSPACE
+        -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
+        -DBENCHMARK_ENABLE_LIBPFM=OFF
+        -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+        -DCMAKE_C_COMPILER=clang
+        -DCMAKE_CXX_COMPILER=clang++
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+        -DGTEST_COMPILE_COMMANDS=OFF
+
+    - name: run
+      shell: bash
+      working-directory: ${{ github.workspace }}/_build
+      run: run-clang-tidy -config-file=$GITHUB_WORKSPACE/.clang-tidy
--- a/third_party/benchmark/.github/workflows/doxygen.yml
+++ b/third_party/benchmark/.github/workflows/doxygen.yml
@@ -0,0 +1,31 @@
+name: doxygen
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  build-and-deploy:
+    name: Build HTML documentation
+    runs-on: ubuntu-latest
+    steps:
+    - name: Fetching sources
+      uses: actions/checkout@v4
+
+    - name: Installing build dependencies
+      run: |
+        sudo apt update
+        sudo apt install doxygen gcc git
+
+    - name: Creating build directory
+      run: mkdir build
+
+    - name: Building HTML documentation with Doxygen
+      run: |
+        cmake -S . -B build -DBENCHMARK_ENABLE_TESTING:BOOL=OFF -DBENCHMARK_ENABLE_DOXYGEN:BOOL=ON -DBENCHMARK_INSTALL_DOCS:BOOL=ON
+        cmake --build build --target benchmark_doxygen
--- a/third_party/benchmark/.github/workflows/pre-commit.yml
+++ b/third_party/benchmark/.github/workflows/pre-commit.yml
@@ -0,0 +1,41 @@
+name: python + Bazel pre-commit checks
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    env:
+      MYPY_CACHE_DIR: "${{ github.workspace }}/.cache/mypy"
+      RUFF_CACHE_DIR: "${{ github.workspace }}/.cache/ruff"
+      PRE_COMMIT_HOME: "${{ github.workspace }}/.cache/pre-commit"
+
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: 3.11
+        cache: pip
+        cache-dependency-path: pyproject.toml
+    - name: Install dependencies
+      run: python -m pip install ".[dev]"
+    - name: Cache pre-commit tools
+      uses: actions/cache@v4
+      with:
+        path: |
+          ${{ env.MYPY_CACHE_DIR }}
+          ${{ env.RUFF_CACHE_DIR }}
+          ${{ env.PRE_COMMIT_HOME }}
+        key: ${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }}-linter-cache
+    - name: Run pre-commit checks
+      run: pre-commit run --all-files --verbose --show-diff-on-failure
--- a/third_party/benchmark/.github/workflows/sanitizer.yml
+++ b/third_party/benchmark/.github/workflows/sanitizer.yml
@@ -0,0 +1,97 @@
+name: sanitizer
+
+on:
+  push: {}
+  pull_request: {}
+
+env:
+  CMAKE_GENERATOR: Ninja
+  UBSAN_OPTIONS: "print_stacktrace=1"
+
+jobs:
+  job:
+    name: ${{ matrix.sanitizer }}.${{ matrix.build_type }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: ['Debug', 'RelWithDebInfo']
+        sanitizer: ['asan', 'ubsan', 'tsan', 'msan']
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: configure msan env
+      if: matrix.sanitizer == 'msan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=MemoryWithOrigins" >> $GITHUB_ENV
+
+    - name: configure ubsan env
+      if: matrix.sanitizer == 'ubsan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Undefined" >> $GITHUB_ENV
+
+    - name: configure asan env
+      if: matrix.sanitizer == 'asan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=address -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Address" >> $GITHUB_ENV
+
+    - name: configure tsan env
+      if: matrix.sanitizer == 'tsan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Thread" >> $GITHUB_ENV
+
+    - name: fine-tune asan options
+      # in asan we get an error from std::regex. ignore it.
+      if: matrix.sanitizer == 'asan'
+      run: |
+        echo "ASAN_OPTIONS=alloc_dealloc_mismatch=0" >> $GITHUB_ENV
+
+    - name: setup clang
+      uses: egor-tensin/setup-clang@v1
+      with:
+        version: latest
+        platform: x64
+
+    - name: configure clang
+      run: |
+        echo "CC=cc" >> $GITHUB_ENV
+        echo "CXX=c++" >> $GITHUB_ENV
+
+    - name: build libc++ (non-asan)
+      if: matrix.sanitizer != 'asan'
+      run: |
+        "${GITHUB_WORKSPACE}/.github/libcxx-setup.sh"
+        echo "EXTRA_CXX_FLAGS=-stdlib=libc++ -L${GITHUB_WORKSPACE}/llvm-build/lib -lc++abi -I${GITHUB_WORKSPACE}/llvm-build/include/c++/v1 -Isystem${GITHUB_WORKSPACE}/llvm-build/include/c++/v1 -Wl,-rpath,${GITHUB_WORKSPACE}/llvm-build/lib" >> $GITHUB_ENV
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: >
+        VERBOSE=1
+        cmake -GNinja $GITHUB_WORKSPACE
+        -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
+        -DBENCHMARK_ENABLE_LIBPFM=OFF
+        -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+        -DCMAKE_C_COMPILER=${{ env.CC }}
+        -DCMAKE_CXX_COMPILER=${{ env.CXX }}
+        -DCMAKE_C_FLAGS="${{ env.EXTRA_FLAGS }}"
+        -DCMAKE_CXX_FLAGS="${{ env.EXTRA_FLAGS }} ${{ env.EXTRA_CXX_FLAGS }}"
+        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+    - name: build
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake --build . --config ${{ matrix.build_type }}
+
+    - name: test
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: ctest -C ${{ matrix.build_type }} -VV
--- a/third_party/benchmark/.github/workflows/test_bindings.yml
+++ b/third_party/benchmark/.github/workflows/test_bindings.yml
@@ -0,0 +1,33 @@
+name: test-bindings
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  python_bindings:
+    name: Test GBM Python ${{ matrix.python-version }} bindings on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ ubuntu-latest, macos-latest, windows-latest ]
+        python-version: [ "3.10", "3.11", "3.12", "3.13" ]
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install GBM Python bindings on ${{ matrix.os }}
+        run: python -m pip install .
+      - name: Run example on ${{ matrix.os }} under Python ${{ matrix.python-version }}
+        run: python bindings/python/google_benchmark/example.py
--- a/third_party/benchmark/.github/workflows/wheels.yml
+++ b/third_party/benchmark/.github/workflows/wheels.yml
@@ -0,0 +1,83 @@
+name: Build and upload Python wheels
+
+on:
+  workflow_dispatch:
+  release:
+    types:
+      - published
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  build_sdist:
+    name: Build source distribution
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Install Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - run: python -m pip install build
+      - name: Build sdist
+        run: python -m build --sdist
+      - uses: actions/upload-artifact@v4
+        with:
+          name: dist-sdist
+          path: dist/*.tar.gz
+
+  build_wheels:
+    name: Build Google Benchmark wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, ubuntu-24.04-arm, macos-13, macos-14, windows-latest]
+    steps:
+      - name: Check out Google Benchmark
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - uses: actions/setup-python@v5
+        name: Install Python 3.12
+        with:
+          python-version: "3.12"
+      - run: pip install --upgrade pip uv
+
+      - name: Build wheels on ${{ matrix.os }} using cibuildwheel
+        uses: pypa/cibuildwheel@v2.23.2
+        env:
+          CIBW_BUILD: "cp310-* cp311-* cp312-*"
+          CIBW_BUILD_FRONTEND: "build[uv]"
+          CIBW_SKIP: "*-musllinux_*"
+          CIBW_ARCHS: auto64
+          CIBW_BEFORE_ALL_LINUX: bash .github/install_bazel.sh
+          # Grab the rootless Bazel installation inside the container.
+          CIBW_ENVIRONMENT_LINUX: PATH=$PATH:$HOME/bin
+          CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
+          # unused by Bazel, but needed explicitly by delocate on MacOS.
+          MACOSX_DEPLOYMENT_TARGET: "10.14"
+
+      - name: Upload Google Benchmark ${{ matrix.os }} wheels
+        uses: actions/upload-artifact@v4
+        with:
+          name: dist-${{ matrix.os }}
+          path: wheelhouse/*.whl
+
+  pypi_upload:
+    name: Publish google-benchmark wheels to PyPI
+    needs: [build_sdist, build_wheels]
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          path: dist
+          pattern: dist-*
+          merge-multiple: true
+      - uses: pypa/gh-action-pypi-publish@release/v1
--- a/third_party/benchmark/.gitignore
+++ b/third_party/benchmark/.gitignore
@@ -8,8 +8,10 @@
 !/cmake/*.cmake
 !/test/AssemblyTests.cmake
 *~
+*.swp
 *.pyc
 __pycache__
+.DS_Store

 # lcov
 *.lcov
@@ -44,6 +46,7 @@ rules.ninja

 # bazel output symlinks.
 bazel-*
+MODULE.bazel.lock

 # out-of-source build top-level folders.
 build/
@@ -56,3 +59,10 @@ build*/
 # Visual Studio 2015/2017 cache/options directory
 .vs/
 CMakeSettings.json
+
+# Visual Studio Code cache/options directory
+.vscode/
+
+# Python build stuff
+dist/
+*.egg-info*
--- a/third_party/benchmark/.pre-commit-config.yaml
+++ b/third_party/benchmark/.pre-commit-config.yaml
@@ -0,0 +1,18 @@
+repos:
+  -   repo: https://github.com/keith/pre-commit-buildifier
+      rev: 8.0.3
+      hooks:
+      -   id: buildifier
+      -   id: buildifier-lint
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.15.0
+    hooks:
+      - id: mypy
+        types_or: [ python, pyi ]
+        args: [ "--ignore-missing-imports", "--scripts-are-modules" ]
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.8
+    hooks:
+      - id: ruff
+        args: [ --fix, --exit-non-zero-on-fix ]
+      - id: ruff-format
--- a/third_party/benchmark/.travis-libcxx-setup.sh
+++ b/third_party/benchmark/.travis-libcxx-setup.sh
@@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-
-# Install a newer CMake version
-curl -sSL https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh -o install-cmake.sh
-chmod +x install-cmake.sh
-sudo ./install-cmake.sh --prefix=/usr/local --skip-license
-
-# Checkout LLVM sources
-git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source
-git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx
-git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi
-
-# Setup libc++ options
-if [ -z "$BUILD_32_BITS" ]; then
-  export BUILD_32_BITS=OFF && echo disabling 32 bit build
-fi
-
-# Build and install libc++ (Use unstable ABI for better sanitizer coverage)
-mkdir llvm-build && cd llvm-build
-cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \
-      -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \
-      -DLIBCXX_ABI_UNSTABLE=ON \
-      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
-      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
-      ../llvm-source
-make cxx -j2
-sudo make install-cxxabi install-cxx
-cd ../
--- a/third_party/benchmark/.travis.yml
+++ b/third_party/benchmark/.travis.yml
@@ -1,199 +0,0 @@
-sudo: required
-dist: trusty
-language: cpp
-
-env:
-  global:
-    - /usr/local/bin:$PATH
-
-matrix:
-  include:
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - lcov
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage
-    - compiler: gcc
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug
-    - compiler: gcc
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - g++-multilib
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug BUILD_32_BITS=ON
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - g++-multilib
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release BUILD_32_BITS=ON
-    - compiler: gcc
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=g++-6 C_COMPILER=gcc-6  BUILD_TYPE=Debug
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
-    - compiler: clang
-      env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug
-    - compiler: clang
-      env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release
-    # Clang w/ libc++
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1
-        - EXTRA_FLAGS="-stdlib=libc++"
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
-        - LIBCXX_BUILD=1
-        - EXTRA_FLAGS="-stdlib=libc++"
-    # Clang w/ 32bit libc++
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            - clang-3.8
-            - g++-multilib
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-stdlib=libc++ -m32"
-    # Clang w/ 32bit libc++
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            - clang-3.8
-            - g++-multilib
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
-        - LIBCXX_BUILD=1
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-stdlib=libc++ -m32"
-    # Clang w/ libc++, ASAN, UBSAN
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address"
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all"
-        - UBSAN_OPTIONS=print_stacktrace=1
-    # Clang w/ libc++ and MSAN
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins"
-    # Clang w/ libc++ and MSAN
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
-        - COMPILER=clang++ BUILD_TYPE=Debug
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
-        - COMPILER=clang++ BUILD_TYPE=Release
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
-        - COMPILER=clang++ BUILD_TYPE=Release BUILD_32_BITS=ON
-    - os: osx
-      osx_image: xcode8.3
-      compiler: gcc
-      env:
-        - COMPILER=g++-7 C_COMPILER=gcc-7  BUILD_TYPE=Debug
-
-before_script:
-  - if [ -n "${LIBCXX_BUILD}" ]; then
-      source .travis-libcxx-setup.sh;
-    fi
-  - if [ -n "${ENABLE_SANITIZER}" ]; then
-      export EXTRA_OPTIONS="-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF";
-    else
-      export EXTRA_OPTIONS="";
-    fi
-  - mkdir -p build && cd build
-
-before_install:
-  - if [ -z "$BUILD_32_BITS" ]; then
-      export BUILD_32_BITS=OFF && echo disabling 32 bit build;
-    fi
-  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
-      sudo add-apt-repository -y "ppa:ubuntu-toolchain-r/test";
-      sudo apt-get update --option Acquire::Retries=100 --option Acquire::http::Timeout="60";
-    fi
-
-install:
-  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
-      travis_wait sudo -E apt-get -yq --no-install-suggests --no-install-recommends install g++-6;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "linux" -a "${BUILD_32_BITS}" == "OFF" ]; then
-      travis_wait sudo -E apt-get -y --no-install-suggests --no-install-recommends install llvm-3.9-tools;
-      sudo cp /usr/lib/llvm-3.9/bin/FileCheck /usr/local/bin/;
-    fi
-  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
-      PATH=~/.local/bin:${PATH};
-      pip install --user --upgrade pip;
-      travis_wait pip install --user cpp-coveralls;
-    fi
-  - if [ "${C_COMPILER}" == "gcc-7" -a "${TRAVIS_OS_NAME}" == "osx" ]; then
-      rm -f /usr/local/include/c++;
-      brew update;
-      travis_wait brew install gcc@7;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "linux" ]; then
-      sudo apt-get update -qq;
-      sudo apt-get install -qq unzip;
-      wget https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-linux-x86_64.sh --output-document bazel-installer.sh;
-      travis_wait sudo bash bazel-installer.sh;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
-      curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-darwin-x86_64.sh;
-      travis_wait sudo bash bazel-installer.sh;
-    fi
-
-script:
-  - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS}" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ${EXTRA_OPTIONS} ..
-  - make
-  - ctest -C ${BUILD_TYPE} --output-on-failure
-  - bazel test -c dbg --define google_benchmark.have_regex=posix --announce_rc --verbose_failures --test_output=errors --keep_going //test/...
-
-after_success:
-  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
-      coveralls --include src --include include --gcov-options '\-lp' --root .. --build-root .;
-    fi
--- a/third_party/benchmark/.ycm_extra_conf.py
+++ b/third_party/benchmark/.ycm_extra_conf.py
@@ -1,25 +1,30 @@
 import os
+
 import ycm_core

 # These are the compilation flags that will be used in case there's no
 # compilation database set (by default, one is not set).
 # CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR.
 flags = [
-'-Wall',
-'-Werror',
-'-pedantic-errors',
-'-std=c++0x',
-'-fno-strict-aliasing',
-'-O3',
-'-DNDEBUG',
-# ...and the same thing goes for the magic -x option which specifies the
-# language that the files to be compiled are written in. This is mostly
-# relevant for c++ headers.
-# For a C project, you would set this to 'c' instead of 'c++'.
-'-x', 'c++',
-'-I', 'include',
-'-isystem', '/usr/include',
-'-isystem', '/usr/local/include',
+    "-Wall",
+    "-Werror",
+    "-pedantic-errors",
+    "-std=c++0x",
+    "-fno-strict-aliasing",
+    "-O3",
+    "-DNDEBUG",
+    # ...and the same thing goes for the magic -x option which specifies the
+    # language that the files to be compiled are written in. This is mostly
+    # relevant for c++ headers.
+    # For a C project, you would set this to 'c' instead of 'c++'.
+    "-x",
+    "c++",
+    "-I",
+    "include",
+    "-isystem",
+    "/usr/include",
+    "-isystem",
+    "/usr/local/include",
 ]


@@ -29,87 +34,87 @@ flags = [
 #
 # Most projects will NOT need to set this to anything; you can just change the
 # 'flags' list of compilation flags. Notice that YCM itself uses that approach.
-compilation_database_folder = ''
+compilation_database_folder = ""

-if os.path.exists( compilation_database_folder ):
-  database = ycm_core.CompilationDatabase( compilation_database_folder )
+if os.path.exists(compilation_database_folder):
+    database = ycm_core.CompilationDatabase(compilation_database_folder)
 else:
-  database = None
+    database = None
+
+SOURCE_EXTENSIONS = [".cc"]

-SOURCE_EXTENSIONS = [ '.cc' ]

 def DirectoryOfThisScript():
-  return os.path.dirname( os.path.abspath( __file__ ) )
+    return os.path.dirname(os.path.abspath(__file__))


-def MakeRelativePathsInFlagsAbsolute( flags, working_directory ):
-  if not working_directory:
-    return list( flags )
-  new_flags = []
-  make_next_absolute = False
-  path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ]
-  for flag in flags:
-    new_flag = flag
+def MakeRelativePathsInFlagsAbsolute(flags, working_directory):
+    if not working_directory:
+        return list(flags)
+    new_flags = []
+    make_next_absolute = False
+    path_flags = ["-isystem", "-I", "-iquote", "--sysroot="]
+    for flag in flags:
+        new_flag = flag

-    if make_next_absolute:
-      make_next_absolute = False
-      if not flag.startswith( '/' ):
-        new_flag = os.path.join( working_directory, flag )
+        if make_next_absolute:
+            make_next_absolute = False
+            if not flag.startswith("/"):
+                new_flag = os.path.join(working_directory, flag)

-    for path_flag in path_flags:
-      if flag == path_flag:
-        make_next_absolute = True
-        break
+        for path_flag in path_flags:
+            if flag == path_flag:
+                make_next_absolute = True
+                break

-      if flag.startswith( path_flag ):
-        path = flag[ len( path_flag ): ]
-        new_flag = path_flag + os.path.join( working_directory, path )
-        break
+            if flag.startswith(path_flag):
+                path = flag[len(path_flag) :]
+                new_flag = path_flag + os.path.join(working_directory, path)
+                break

-    if new_flag:
-      new_flags.append( new_flag )
-  return new_flags
+        if new_flag:
+            new_flags.append(new_flag)
+    return new_flags


-def IsHeaderFile( filename ):
-  extension = os.path.splitext( filename )[ 1 ]
-  return extension in [ '.h', '.hxx', '.hpp', '.hh' ]
+def IsHeaderFile(filename):
+    extension = os.path.splitext(filename)[1]
+    return extension in [".h", ".hxx", ".hpp", ".hh"]


-def GetCompilationInfoForFile( filename ):
-  # The compilation_commands.json file generated by CMake does not have entries
-  # for header files. So we do our best by asking the db for flags for a
-  # corresponding source file, if any. If one exists, the flags for that file
-  # should be good enough.
-  if IsHeaderFile( filename ):
-    basename = os.path.splitext( filename )[ 0 ]
-    for extension in SOURCE_EXTENSIONS:
-      replacement_file = basename + extension
-      if os.path.exists( replacement_file ):
-        compilation_info = database.GetCompilationInfoForFile(
-          replacement_file )
-        if compilation_info.compiler_flags_:
-          return compilation_info
-    return None
-  return database.GetCompilationInfoForFile( filename )
+def GetCompilationInfoForFile(filename):
+    # The compilation_commands.json file generated by CMake does not have
+    # entries for header files. So we do our best by asking the db for flags for
+    # a corresponding source file, if any. If one exists, the flags for that
+    # file should be good enough.
+    if IsHeaderFile(filename):
+        basename = os.path.splitext(filename)[0]
+        for extension in SOURCE_EXTENSIONS:
+            replacement_file = basename + extension
+            if os.path.exists(replacement_file):
+                compilation_info = database.GetCompilationInfoForFile(
+                    replacement_file
+                )
+                if compilation_info.compiler_flags_:
+                    return compilation_info
+        return None
+    return database.GetCompilationInfoForFile(filename)


-def FlagsForFile( filename, **kwargs ):
-  if database:
-    # Bear in mind that compilation_info.compiler_flags_ does NOT return a
-    # python list, but a "list-like" StringVec object
-    compilation_info = GetCompilationInfoForFile( filename )
-    if not compilation_info:
-      return None
+def FlagsForFile(filename, **kwargs):
+    if database:
+        # Bear in mind that compilation_info.compiler_flags_ does NOT return a
+        # python list, but a "list-like" StringVec object
+        compilation_info = GetCompilationInfoForFile(filename)
+        if not compilation_info:
+            return None

-    final_flags = MakeRelativePathsInFlagsAbsolute(
-      compilation_info.compiler_flags_,
-      compilation_info.compiler_working_dir_ )
-  else:
-    relative_to = DirectoryOfThisScript()
-    final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to )
+        final_flags = MakeRelativePathsInFlagsAbsolute(
+            compilation_info.compiler_flags_,
+            compilation_info.compiler_working_dir_,
+        )
+    else:
+        relative_to = DirectoryOfThisScript()
+        final_flags = MakeRelativePathsInFlagsAbsolute(flags, relative_to)

-  return {
-    'flags': final_flags,
-    'do_cache': True
-  }
+    return {"flags": final_flags, "do_cache": True}
--- a/third_party/benchmark/AUTHORS
+++ b/third_party/benchmark/AUTHORS
@@ -9,40 +9,64 @@
 # Please keep the list sorted.

 Albert Pretorius <pretoalb@gmail.com>
+Alex Steele <steeleal123@gmail.com>
+Andriy Berestovskyy <berestovskyy@gmail.com>
 Arne Beer <arne@twobeer.de>
 Carto
+Cezary Skrzyński <czars1988@gmail.com>
+Christian Wassermann <christian_wassermann@web.de>
 Christopher Seymour <chris.j.seymour@hotmail.com>
+Colin Braley <braley.colin@gmail.com>
+Daniel Harvey <danielharvey458@gmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
 Deniz Evrenci <denizevrenci@gmail.com>
 Dirac Research 
 Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Dominik Korman <kormandominik@gmail.com>
+Donald Aingworth <donalds_junk_mail@yahoo.com>
+Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
+Fabien Pichot <pichot.fabien@gmail.com>
 Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Gergely Meszaros <maetveis@gmail.com>
+Gergő Szitár <szitar.gergo@gmail.com>
 Google Inc.
+Henrique Bucher <hbucher@gmail.com>
 International Business Machines Corporation
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
 Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
+Jordan Williams <jwillikers@protonmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kaito Udagawa <umireon@gmail.com>
 Kishan Kumar <kumar.kishan@outlook.com>
 Lei Xu <eddyxu@gmail.com>
+Marcel Jacobse <mjacobse@uni-bremen.de>
 Matt Clarkson <mattyclarkson@gmail.com>
 Maxim Vafin <maxvafin@gmail.com>
+Mike Apodaca <gatorfax@gmail.com>
+Min-Yih Hsu <yihshyng223@gmail.com>
 MongoDB Inc.
 Nick Hutchinson <nshutchinson@gmail.com>
+Norman Heino <norman.heino@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Ori Livneh <ori.livneh@gmail.com>
 Paul Redmond <paul.redmond@gmail.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
+Raghu Raja <raghu@enfabrica.net>
+Rainer Orth <ro@cebitec.uni-bielefeld.de>
 Roman Lebedev <lebedev.ri@gmail.com>
+Sayan Bhattacharjee <aero.sayan@gmail.com>
+Shapr3D <google-contributors@shapr3d.com>
 Shuo Chen <chenshuo@chenshuo.com>
+Staffan Tjernstrom <staffantj@gmail.com>
 Steinar H. Gunderson <sgunderson@bigfoot.com>
 Stripe, Inc.
+Tobias Schmidt <tobias.schmidt@in.tum.de>
 Yixuan Qiu <yixuanq@gmail.com>
 Yusuke Suzuki <utatane.tea@gmail.com>
 Zbigniew Skowron <zbychs@gmail.com>
--- a/third_party/benchmark/BUILD.bazel
+++ b/third_party/benchmark/BUILD.bazel
@@ -1,9 +1,38 @@
+load("@rules_cc//cc:defs.bzl", "cc_library")
+
 licenses(["notice"])

+COPTS = [
+    "-pedantic",
+    "-pedantic-errors",
+    "-std=c++17",
+    "-Wall",
+    "-Wconversion",
+    "-Wextra",
+    "-Wshadow",
+    #    "-Wshorten-64-to-32",
+    "-Wfloat-equal",
+    "-fstrict-aliasing",
+    ## assert() are used a lot in tests upstream, which may be optimised out leading to
+    ## unused-variable warning.
+    "-Wno-unused-variable",
+    "-Werror=old-style-cast",
+]
+
+MSVC_COPTS = [
+    "/std:c++17",
+]
+
 config_setting(
    name = "windows",
-    values = {
-        "cpu": "x64_windows",
+    constraint_values = ["@platforms//os:windows"],
+    visibility = [":__subpackages__"],
+)
+
+config_setting(
+    name = "perfcounters",
+    define_values = {
+        "pfm": "1",
    },
    visibility = [":__subpackages__"],
 )
@@ -17,20 +46,51 @@ cc_library(
        ],
        exclude = ["src/benchmark_main.cc"],
    ),
-    hdrs = ["include/benchmark/benchmark.h"],
+    hdrs = [
+        "include/benchmark/benchmark.h",
+        "include/benchmark/export.h",
+    ],
+    copts = select({
+        ":windows": MSVC_COPTS,
+        "//conditions:default": COPTS,
+    }),
+    defines = [
+        "BENCHMARK_STATIC_DEFINE",
+        "BENCHMARK_VERSION=\\\"" + (module_version() if module_version() != None else "") + "\\\"",
+    ] + select({
+        ":perfcounters": ["HAVE_LIBPFM"],
+        "//conditions:default": [],
+    }),
+    includes = ["include"],
    linkopts = select({
        ":windows": ["-DEFAULTLIB:shlwapi.lib"],
        "//conditions:default": ["-pthread"],
    }),
-    strip_include_prefix = "include",
+    # Only static linking is allowed; no .so will be produced.
+    # Using `defines` (i.e. not `local_defines`) means that no
+    # dependent rules need to bother about defining the macro.
+    linkstatic = True,
+    local_defines = [
+        # Turn on Large-file Support
+        "_FILE_OFFSET_BITS=64",
+        "_LARGEFILE64_SOURCE",
+        "_LARGEFILE_SOURCE",
+    ],
    visibility = ["//visibility:public"],
+    deps = select({
+        ":perfcounters": ["@libpfm"],
+        "//conditions:default": [],
+    }),
 )

 cc_library(
    name = "benchmark_main",
    srcs = ["src/benchmark_main.cc"],
-    hdrs = ["include/benchmark/benchmark.h"],
-    strip_include_prefix = "include",
+    hdrs = [
+        "include/benchmark/benchmark.h",
+        "include/benchmark/export.h",
+    ],
+    includes = ["include"],
    visibility = ["//visibility:public"],
    deps = [":benchmark"],
 )
--- a/third_party/benchmark/CMakeLists.txt
+++ b/third_party/benchmark/CMakeLists.txt
@@ -1,27 +1,34 @@
-cmake_minimum_required (VERSION 2.8.12)
+# Require CMake 3.13. If available, use the policies up to CMake 3.22.
+cmake_minimum_required (VERSION 3.13...3.22)

-project (benchmark)
-
-foreach(p
-    CMP0054 # CMake 3.1
-    CMP0056 # export EXE_LINKER_FLAGS to try_run
-    CMP0057 # Support no if() IN_LIST operator
-    )
-  if(POLICY ${p})
-    cmake_policy(SET ${p} NEW)
-  endif()
-endforeach()
+project (benchmark VERSION 1.9.4 LANGUAGES CXX)

 option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
 option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
 option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
 option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
-if(NOT MSVC)
+option(BENCHMARK_ENABLE_WERROR "Build Release candidates with -Werror." ON)
+option(BENCHMARK_FORCE_WERROR "Build Release candidates with -Werror regardless of compiler issues." OFF)
+
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")
+  # PGC++ may be reporting false positives.
+  set(BENCHMARK_ENABLE_WERROR OFF)
+endif()
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "NVHPC")
+  set(BENCHMARK_ENABLE_WERROR OFF)
+endif()
+if(BENCHMARK_FORCE_WERROR)
+  set(BENCHMARK_ENABLE_WERROR ON)
+endif(BENCHMARK_FORCE_WERROR)
+
+if(NOT (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC"))
  option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
 else()
  set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE)
 endif()
 option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON)
+option(BENCHMARK_ENABLE_DOXYGEN "Build documentation with Doxygen." OFF)
+option(BENCHMARK_INSTALL_DOCS "Enable installation of documentation." ON)

 # Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which
 # may require downloading the source code.
@@ -30,6 +37,24 @@ option(BENCHMARK_DOWNLOAD_DEPENDENCIES "Allow the downloading and in-tree buildi
 # This option can be used to disable building and running unit tests which depend on gtest
 # in cases where it is not possible to build or find a valid version of gtest.
 option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON)
+option(BENCHMARK_USE_BUNDLED_GTEST "Use bundled GoogleTest. If disabled, the find_package(GTest) will be used." ON)
+
+option(BENCHMARK_ENABLE_LIBPFM "Enable performance counters provided by libpfm" OFF)
+
+# Export only public symbols
+set(CMAKE_CXX_VISIBILITY_PRESET hidden)
+set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
+
+if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    # As of CMake 3.18, CMAKE_SYSTEM_PROCESSOR is not set properly for MSVC and
+    # cross-compilation (e.g. Host=x86_64, target=aarch64) requires using the
+    # undocumented, but working variable.
+    # See https://gitlab.kitware.com/cmake/cmake/-/issues/15170
+    set(CMAKE_SYSTEM_PROCESSOR ${MSVC_CXX_ARCHITECTURE_ID})
+    if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ARM")
+      set(CMAKE_CROSSCOMPILING TRUE)
+    endif()
+endif()

 set(ENABLE_ASSEMBLY_TESTS_DEFAULT OFF)
 function(should_enable_assembly_tests)
@@ -41,7 +66,7 @@ function(should_enable_assembly_tests)
      return()
    endif()
  endif()
-  if (MSVC)
+  if (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC")
    return()
  elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
    return()
@@ -77,29 +102,63 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 include(GetGitVersion)
 get_git_version(GIT_VERSION)

+# If no git version can be determined, use the version
+# from the project() command
+if ("${GIT_VERSION}" STREQUAL "v0.0.0")
+  set(VERSION "v${benchmark_VERSION}")
+else()
+  set(VERSION "${GIT_VERSION}")
+endif()
+
+# Normalize version: drop "v" prefix, replace first "-" with ".",
+# drop everything after second "-" (including said "-").
+string(STRIP ${VERSION} VERSION)
+if(VERSION MATCHES v[^-]*-)
+   string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2"  NORMALIZED_VERSION ${VERSION})
+else()
+   string(REGEX REPLACE "v(.*)" "\\1" NORMALIZED_VERSION ${VERSION})
+endif()
+
 # Tell the user what versions we are using
-string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION})
-message(STATUS "Version: ${VERSION}")
+message(STATUS "Google Benchmark version: ${VERSION}, normalized to ${NORMALIZED_VERSION}")

 # The version of the libraries
-set(GENERIC_LIB_VERSION ${VERSION})
-string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION)
+set(GENERIC_LIB_VERSION ${NORMALIZED_VERSION})
+string(SUBSTRING ${NORMALIZED_VERSION} 0 1 GENERIC_LIB_SOVERSION)

 # Import our CMake modules
-include(CheckCXXCompilerFlag)
 include(AddCXXCompilerFlag)
+include(CheckCXXCompilerFlag)
+include(CheckLibraryExists)
 include(CXXFeatureCheck)

+check_library_exists(rt shm_open "" HAVE_LIB_RT)
+
 if (BENCHMARK_BUILD_32_BITS)
  add_required_cxx_compiler_flag(-m32)
 endif()

+set(BENCHMARK_CXX_STANDARD 17)
+
+set(CMAKE_CXX_STANDARD ${BENCHMARK_CXX_STANDARD})
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
 if (MSVC)
  # Turn compiler warnings up to 11
  string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
+
+  # MP flag only applies to cl, not cl frontends to other compilers (e.g. clang-cl, icx-cl etc)
+  if(CMAKE_CXX_COMPILER_ID MATCHES MSVC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+  endif()
  add_definitions(-D_CRT_SECURE_NO_WARNINGS)

+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-WX)
+  endif()
+
  if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
    add_cxx_compiler_flag(-EHs-)
    add_cxx_compiler_flag(-EHa-)
@@ -126,45 +185,48 @@ if (MSVC)
    set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG")
  endif()
 else()
-  # Try and enable C++11. Don't use C++14 because it doesn't work in some
-  # configurations.
-  add_cxx_compiler_flag(-std=c++11)
-  if (NOT HAVE_CXX_FLAG_STD_CXX11)
-    add_cxx_compiler_flag(-std=c++0x)
-  endif()
-
+  # Turn on Large-file Support
+  add_definitions(-D_FILE_OFFSET_BITS=64)
+  add_definitions(-D_LARGEFILE64_SOURCE)
+  add_definitions(-D_LARGEFILE_SOURCE)
  # Turn compiler warnings up to 11
-  if (NOT MSVC)
-    add_cxx_compiler_flag(-Wall)
-    add_cxx_compiler_flag(-Wextra)
-    add_cxx_compiler_flag(-Wshadow)
-    add_cxx_compiler_flag(-Werror RELEASE)
-    add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
-    add_cxx_compiler_flag(-Werror MINSIZEREL)
-    add_cxx_compiler_flag(-pedantic)
-    add_cxx_compiler_flag(-pedantic-errors)
-    add_cxx_compiler_flag(-Wshorten-64-to-32)
-    add_cxx_compiler_flag(-fstrict-aliasing)
-    # Disable warnings regarding deprecated parts of the library while building
-    # and testing those parts of the library.
-    add_cxx_compiler_flag(-Wno-deprecated-declarations)
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-      # Intel silently ignores '-Wno-deprecated-declarations',
-      # warning no. 1786 must be explicitly disabled.
-      # See #631 for rationale.
-      add_cxx_compiler_flag(-wd1786)
-    endif()
-    # Disable deprecation warnings for release builds (when -Werror is enabled).
-    add_cxx_compiler_flag(-Wno-deprecated RELEASE)
-    add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO)
-    add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL)
-    if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
-      add_cxx_compiler_flag(-fno-exceptions)
-    endif()
+  add_cxx_compiler_flag(-Wall)
+  add_cxx_compiler_flag(-Wextra)
+  add_cxx_compiler_flag(-Wshadow)
+  add_cxx_compiler_flag(-Wfloat-equal)
+  add_cxx_compiler_flag(-Wold-style-cast)
+  add_cxx_compiler_flag(-Wconversion)
+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-Werror)
+  endif()
+  if (NOT BENCHMARK_ENABLE_TESTING)
+    # Only enable this warning when tests are not built, as gtest does not use 'override'.
+    add_cxx_compiler_flag(-Wsuggest-override)
+  endif()
+  add_cxx_compiler_flag(-pedantic)
+  add_cxx_compiler_flag(-pedantic-errors)
+  add_cxx_compiler_flag(-Wshorten-64-to-32)
+  add_cxx_compiler_flag(-fstrict-aliasing)
+  # Disable warnings regarding deprecated parts of the library while building
+  # and testing those parts of the library.
+  add_cxx_compiler_flag(-Wno-deprecated-declarations)
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
+    # Intel silently ignores '-Wno-deprecated-declarations',
+    # warning no. 1786 must be explicitly disabled.
+    # See #631 for rationale.
+    add_cxx_compiler_flag(-wd1786)
+    add_cxx_compiler_flag(-fno-finite-math-only)
+  endif()
+  # Disable deprecation warnings for release builds (when -Werror is enabled).
+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-Wno-deprecated)
+  endif()
+  if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
+    add_cxx_compiler_flag(-fno-exceptions)
  endif()

  if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
-    if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing
+    if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") #ICC17u2: Many false positives for Wstrict-aliasing
      add_cxx_compiler_flag(-Wstrict-aliasing)
    endif()
  endif()
@@ -173,21 +235,26 @@ else()
  add_cxx_compiler_flag(-wd654)
  add_cxx_compiler_flag(-Wthread-safety)
  if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
-    cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
+    cxx_feature_check(THREAD_SAFETY_ATTRIBUTES "-DINCLUDE_DIRECTORIES=${PROJECT_SOURCE_DIR}/include")
  endif()

  # On most UNIX like platforms g++ and clang++ define _GNU_SOURCE as a
  # predefined macro, which turns on all of the wonderful libc extensions.
-  # However g++ doesn't do this in Cygwin so we have to define it ourselfs
+  # However g++ doesn't do this in Cygwin so we have to define it ourselves
  # since we depend on GNU/POSIX/BSD extensions.
  if (CYGWIN)
    add_definitions(-D_GNU_SOURCE=1)
  endif()

+  if (QNXNTO)
+    add_definitions(-D_QNX_SOURCE)
+  endif()
+
  # Link time optimisation
  if (BENCHMARK_ENABLE_LTO)
    add_cxx_compiler_flag(-flto)
-    if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
+    add_cxx_compiler_flag(-Wno-lto-type-mismatch)
+    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
      find_program(GCC_AR gcc-ar)
      if (GCC_AR)
        set(CMAKE_AR ${GCC_AR})
@@ -196,7 +263,7 @@ else()
      if (GCC_RANLIB)
        set(CMAKE_RANLIB ${GCC_RANLIB})
      endif()
-    elseif("${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
+    elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
      include(llvm-toolchain)
    endif()
  endif()
@@ -224,7 +291,8 @@ if (BENCHMARK_USE_LIBCXX)
  if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
    add_cxx_compiler_flag(-stdlib=libc++)
  elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR
-          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
+          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel" OR
+          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM")
    add_cxx_compiler_flag(-nostdinc++)
    message(WARNING "libc++ header path must be manually specified using CMAKE_CXX_FLAGS")
    # Adding -nodefaultlibs directly to CMAKE_<TYPE>_LINKER_FLAGS will break
@@ -250,9 +318,16 @@ if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX
        AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
  message(WARNING "Using std::regex with exceptions disabled is not fully supported")
 endif()
+
 cxx_feature_check(STEADY_CLOCK)
 # Ensure we have pthreads
+set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
+cxx_feature_check(PTHREAD_AFFINITY)
+
+if (BENCHMARK_ENABLE_LIBPFM)
+  find_package(PFM REQUIRED)
+endif()

 # Set up directories
 include_directories(${PROJECT_SOURCE_DIR}/include)
@@ -262,8 +337,18 @@ add_subdirectory(src)

 if (BENCHMARK_ENABLE_TESTING)
  enable_testing()
-  if (BENCHMARK_ENABLE_GTEST_TESTS)
-    include(HandleGTest)
+  if (BENCHMARK_ENABLE_GTEST_TESTS AND
+      NOT (TARGET gtest AND TARGET gtest_main AND
+           TARGET gmock AND TARGET gmock_main))
+    if (BENCHMARK_USE_BUNDLED_GTEST)
+      include(GoogleTest)
+    else()
+      find_package(GTest CONFIG REQUIRED)
+      add_library(gtest ALIAS GTest::gtest)
+      add_library(gtest_main ALIAS GTest::gtest_main)
+      add_library(gmock ALIAS GTest::gmock)
+      add_library(gmock_main ALIAS GTest::gmock_main)
+    endif()
  endif()
  add_subdirectory(test)
 endif()
--- a/third_party/benchmark/CONTRIBUTORS
+++ b/third_party/benchmark/CONTRIBUTORS
@@ -22,44 +22,75 @@
 #
 # Please keep the list sorted.

+Abhina Sreeskantharajan <abhina.sreeskantharajan@ibm.com>
 Albert Pretorius <pretoalb@gmail.com>
+Alex Steele <steelal123@gmail.com>
+Andriy Berestovskyy <berestovskyy@gmail.com>
 Arne Beer <arne@twobeer.de>
+Bátor Tallér <bator.taller@shapr3d.com>
 Billy Robert O'Neal III <billy.oneal@gmail.com> <bion@microsoft.com>
+Cezary Skrzyński <czars1988@gmail.com>
 Chris Kennelly <ckennelly@google.com> <ckennelly@ckennelly.com>
+Christian Wassermann <christian_wassermann@web.de>
 Christopher Seymour <chris.j.seymour@hotmail.com>
+Colin Braley <braley.colin@gmail.com>
+Cyrille Faucheux <cyrille.faucheux@gmail.com>
+Daniel Harvey <danielharvey458@gmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
 Deniz Evrenci <denizevrenci@gmail.com>
 Dominic Hamon <dma@stripysock.com> <dominic@google.com>
 Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Dominik Korman <kormandominik@gmail.com>
+Donald Aingworth <donalds_junk_mail@yahoo.com>
+Doug Evans <xdje42@gmail.com>
+Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
+Fabien Pichot <pichot.fabien@gmail.com>
+Fanbo Meng <fanbo.meng@ibm.com>
 Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Geoffrey Martin-Noble <gcmn@google.com> <gmngeoffrey@gmail.com>
+Gergely Meszaros <maetveis@gmail.com>
+Gergő Szitár <szitar.gergo@gmail.com>
+Hannes Hauswedell <h2@fsfe.org>
+Henrique Bucher <hbucher@gmail.com>
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
+Iakov Sergeev <yahontu@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
 Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
 John Millikin <jmillikin@stripe.com>
+Jordan Williams <jwillikers@protonmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
+Kaito Udagawa <umireon@gmail.com>
 Kai Wolf <kai.wolf@gmail.com>
 Kishan Kumar <kumar.kishan@outlook.com>
-Kaito Udagawa <umireon@gmail.com>
 Lei Xu <eddyxu@gmail.com>
+Marcel Jacobse <mjacobse@uni-bremen.de>
 Matt Clarkson <mattyclarkson@gmail.com>
 Maxim Vafin <maxvafin@gmail.com>
+Mike Apodaca <gatorfax@gmail.com>
+Min-Yih Hsu <yihshyng223@gmail.com>
 Nick Hutchinson <nshutchinson@gmail.com>
+Norman Heino <norman.heino@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Ori Livneh <ori.livneh@gmail.com>
 Pascal Leroy <phl@google.com>
 Paul Redmond <paul.redmond@gmail.com>
 Pierre Phaneuf <pphaneuf@google.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
+Raghu Raja <raghu@enfabrica.net>
+Rainer Orth <ro@cebitec.uni-bielefeld.de>
 Raul Marin <rmrodriguez@cartodb.com>
 Ray Glover <ray.glover@uk.ibm.com>
 Robert Guo <robert.guo@mongodb.com>
 Roman Lebedev <lebedev.ri@gmail.com>
+Sayan Bhattacharjee <aero.sayan@gmail.com>
 Shuo Chen <chenshuo@chenshuo.com>
+Steven Wan <wan.yu@ibm.com>
+Tobias Schmidt <tobias.schmidt@in.tum.de>
 Tobias Ulvgård <tobias.ulvgard@dirac.se>
 Tom Madams <tom.ej.madams@gmail.com> <tmadams@google.com>
 Yixuan Qiu <yixuanq@gmail.com>
--- a/third_party/benchmark/MODULE.bazel
+++ b/third_party/benchmark/MODULE.bazel
@@ -0,0 +1,41 @@
+module(
+    name = "google_benchmark",
+    version = "1.9.4",
+)
+
+bazel_dep(name = "bazel_skylib", version = "1.7.1")
+bazel_dep(name = "platforms", version = "0.0.10")
+bazel_dep(name = "rules_cc", version = "0.0.9")
+
+bazel_dep(name = "rules_python", version = "1.0.0", dev_dependency = True)
+bazel_dep(name = "googletest", version = "1.14.0", dev_dependency = True, repo_name = "com_google_googletest")
+
+bazel_dep(name = "libpfm", version = "4.11.0.bcr.1")
+
+# Register a toolchain for Python 3.9 to be able to build numpy. Python
+# versions >=3.10 are problematic.
+# A second reason for this is to be able to build Python hermetically instead
+# of relying on the changing default version from rules_python.
+
+python = use_extension("@rules_python//python/extensions:python.bzl", "python", dev_dependency = True)
+python.toolchain(python_version = "3.8")
+python.toolchain(python_version = "3.9")
+python.toolchain(python_version = "3.10")
+python.toolchain(python_version = "3.11")
+python.toolchain(
+    is_default = True,
+    python_version = "3.12",
+)
+python.toolchain(python_version = "3.13")
+
+pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip", dev_dependency = True)
+pip.parse(
+    hub_name = "tools_pip_deps",
+    python_version = "3.9",
+    requirements_lock = "//tools:requirements.txt",
+)
+use_repo(pip, "tools_pip_deps")
+
+# -- bazel_dep definitions -- #
+
+bazel_dep(name = "nanobind_bazel", version = "2.7.0", dev_dependency = True)
--- a/third_party/benchmark/README.md
+++ b/third_party/benchmark/README.md
--- a/third_party/benchmark/WORKSPACE
+++ b/third_party/benchmark/WORKSPACE
@@ -1,7 +1,20 @@
 workspace(name = "com_github_google_benchmark")

-http_archive(
-     name = "com_google_googletest",
-     urls = ["https://github.com/google/googletest/archive/3f0cf6b62ad1eb50d8736538363d3580dd640c3e.zip"],
-     strip_prefix = "googletest-3f0cf6b62ad1eb50d8736538363d3580dd640c3e",
+load("//:bazel/benchmark_deps.bzl", "benchmark_deps")
+
+benchmark_deps()
+
+load("@rules_python//python:repositories.bzl", "py_repositories")
+
+py_repositories()
+
+load("@rules_python//python:pip.bzl", "pip_parse")
+
+pip_parse(
+    name = "tools_pip_deps",
+    requirements_lock = "//tools:requirements.txt",
 )
+
+load("@tools_pip_deps//:requirements.bzl", "install_deps")
+
+install_deps()
--- a/third_party/benchmark/WORKSPACE.bzlmod
+++ b/third_party/benchmark/WORKSPACE.bzlmod
@@ -0,0 +1,2 @@
+# This file marks the root of the Bazel workspace.
+# See MODULE.bazel for dependencies and setup.
--- a/third_party/benchmark/_config.yml
+++ b/third_party/benchmark/_config.yml
@@ -0,0 +1,2 @@
+theme: jekyll-theme-midnight
+markdown: GFM
--- a/third_party/benchmark/appveyor.yml
+++ b/third_party/benchmark/appveyor.yml
@@ -41,7 +41,7 @@ build_script:
  - cmake --build . --config %configuration%

 test_script:
-  - ctest -c %configuration% --timeout 300 --output-on-failure
+  - ctest --build-config %configuration% --timeout 300 --output-on-failure

 artifacts:
  - path: '_build/CMakeFiles/*.log'
--- a/third_party/benchmark/bazel/benchmark_deps.bzl
+++ b/third_party/benchmark/bazel/benchmark_deps.bzl
@@ -0,0 +1,54 @@
+"""
+This file contains the Bazel build dependencies for Google Benchmark (both C++ source and Python bindings).
+"""
+
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "new_git_repository")
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+def benchmark_deps():
+    """Loads dependencies required to build Google Benchmark."""
+
+    if "bazel_skylib" not in native.existing_rules():
+        http_archive(
+            name = "bazel_skylib",
+            sha256 = "cd55a062e763b9349921f0f5db8c3933288dc8ba4f76dd9416aac68acee3cb94",
+            urls = [
+                "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
+                "https://github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
+            ],
+        )
+
+    if "rules_python" not in native.existing_rules():
+        http_archive(
+            name = "rules_python",
+            sha256 = "e85ae30de33625a63eca7fc40a94fea845e641888e52f32b6beea91e8b1b2793",
+            strip_prefix = "rules_python-0.27.1",
+            url = "https://github.com/bazelbuild/rules_python/releases/download/0.27.1/rules_python-0.27.1.tar.gz",
+        )
+
+    if "com_google_googletest" not in native.existing_rules():
+        new_git_repository(
+            name = "com_google_googletest",
+            remote = "https://github.com/google/googletest.git",
+            tag = "release-1.12.1",
+        )
+
+    if "nanobind" not in native.existing_rules():
+        new_git_repository(
+            name = "nanobind",
+            remote = "https://github.com/wjakob/nanobind.git",
+            tag = "v1.9.2",
+            build_file = "@//bindings/python:nanobind.BUILD",
+            recursive_init_submodules = True,
+        )
+
+    if "libpfm" not in native.existing_rules():
+        # Downloaded from v4.11.0 tag at https://sourceforge.net/p/perfmon2/libpfm4/ref/master/tags/
+        http_archive(
+            name = "libpfm",
+            build_file = str(Label("//tools:libpfm.BUILD.bazel")),
+            sha256 = "5da5f8872bde14b3634c9688d980f68bda28b510268723cc12973eedbab9fecc",
+            type = "tar.gz",
+            strip_prefix = "libpfm-4.11.0",
+            urls = ["https://sourceforge.net/projects/perfmon2/files/libpfm4/libpfm-4.11.0.tar.gz/download"],
+        )
--- a/third_party/benchmark/bindings/python/google_benchmark/BUILD
+++ b/third_party/benchmark/bindings/python/google_benchmark/BUILD
@@ -0,0 +1,34 @@
+load("@nanobind_bazel//:build_defs.bzl", "nanobind_extension", "nanobind_stubgen")
+load("@rules_python//python:defs.bzl", "py_library", "py_test")
+
+py_library(
+    name = "google_benchmark",
+    srcs = ["__init__.py"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":_benchmark",
+    ],
+)
+
+nanobind_extension(
+    name = "_benchmark",
+    srcs = ["benchmark.cc"],
+    deps = ["//:benchmark"],
+)
+
+nanobind_stubgen(
+    name = "benchmark_stubgen",
+    marker_file = "bindings/python/google_benchmark/py.typed",
+    module = ":_benchmark",
+)
+
+py_test(
+    name = "example",
+    srcs = ["example.py"],
+    python_version = "PY3",
+    srcs_version = "PY3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":google_benchmark",
+    ],
+)
--- a/third_party/benchmark/bindings/python/google_benchmark/__init__.py
+++ b/third_party/benchmark/bindings/python/google_benchmark/__init__.py
@@ -0,0 +1,145 @@
+# Copyright 2020 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Python benchmarking utilities.
+
+Example usage:
+  import google_benchmark as benchmark
+
+  @benchmark.register
+  def my_benchmark(state):
+      ...  # Code executed outside `while` loop is not timed.
+
+      while state:
+        ...  # Code executed within `while` loop is timed.
+
+  if __name__ == '__main__':
+    benchmark.main()
+"""
+
+import atexit
+
+from absl import app
+
+from google_benchmark import _benchmark
+from google_benchmark._benchmark import (
+    Counter as Counter,
+    State as State,
+    kMicrosecond as kMicrosecond,
+    kMillisecond as kMillisecond,
+    kNanosecond as kNanosecond,
+    kSecond as kSecond,
+    o1 as o1,
+    oAuto as oAuto,
+    oLambda as oLambda,
+    oLogN as oLogN,
+    oN as oN,
+    oNCubed as oNCubed,
+    oNLogN as oNLogN,
+    oNone as oNone,
+    oNSquared as oNSquared,
+)
+
+__version__ = "1.9.4"
+
+
+class __OptionMaker:
+    """A stateless class to collect benchmark options.
+
+    Collect all decorator calls like @option.range(start=0, limit=1<<5).
+    """
+
+    class Options:
+        """Pure data class to store options calls, along with the benchmarked
+        function."""
+
+        def __init__(self, func):
+            self.func = func
+            self.builder_calls = []
+
+    @classmethod
+    def make(cls, func_or_options):
+        """Make Options from Options or the benchmarked function."""
+        if isinstance(func_or_options, cls.Options):
+            return func_or_options
+        return cls.Options(func_or_options)
+
+    def __getattr__(self, builder_name):
+        """Append option call in the Options."""
+
+        # The function that gets returned on @option.range(start=0, limit=1<<5).
+        def __builder_method(*args, **kwargs):
+            # The decorator that gets called, either with the benchmarked function
+            # or the previous Options
+            def __decorator(func_or_options):
+                options = self.make(func_or_options)
+                options.builder_calls.append((builder_name, args, kwargs))
+                # The decorator returns Options so it is not technically a
+                # decorator and needs a final call to @register
+                return options
+
+            return __decorator
+
+        return __builder_method
+
+
+# Alias for nicer API.
+# We have to instantiate an object, even if stateless, to be able to use
+# __getattr__ on option.range
+option = __OptionMaker()
+
+
+def register(undefined=None, *, name=None):
+    """Register function for benchmarking."""
+    if undefined is None:
+        # Decorator was called with parentheses, so we return a decorator
+        return lambda f: register(f, name=name)
+
+    # We have either the function to benchmark (simple case) or an instance of
+    # Options (@option._ case).
+    options = __OptionMaker.make(undefined)
+
+    if name is None:
+        name = options.func.__name__
+
+    # We register the benchmark and reproduce all the @option._ calls onto the
+    # benchmark builder pattern
+    benchmark = _benchmark.RegisterBenchmark(name, options.func)
+    for name, args, kwargs in options.builder_calls[::-1]:
+        getattr(benchmark, name)(*args, **kwargs)
+
+    # return the benchmarked function because the decorator does not modify it
+    return options.func
+
+
+def _flags_parser(argv):
+    argv = _benchmark.Initialize(argv)
+    return app.parse_flags_with_usage(argv)
+
+
+def _run_benchmarks(argv):
+    if len(argv) > 1:
+        raise app.UsageError("Too many command-line arguments.")
+    return _benchmark.RunSpecifiedBenchmarks()
+
+
+def main(argv=None):
+    return app.run(_run_benchmarks, argv=argv, flags_parser=_flags_parser)
+
+
+# FIXME: can we rerun with disabled ASLR?
+
+# Methods for use with custom main function.
+initialize = _benchmark.Initialize
+run_benchmarks = _benchmark.RunSpecifiedBenchmarks
+atexit.register(_benchmark.ClearRegisteredBenchmarks)
--- a/third_party/benchmark/bindings/python/google_benchmark/benchmark.cc
+++ b/third_party/benchmark/bindings/python/google_benchmark/benchmark.cc
@@ -0,0 +1,184 @@
+// Benchmark for Python.
+
+#include "benchmark/benchmark.h"
+
+#include "nanobind/nanobind.h"
+#include "nanobind/operators.h"
+#include "nanobind/stl/bind_map.h"
+#include "nanobind/stl/string.h"
+#include "nanobind/stl/vector.h"
+
+NB_MAKE_OPAQUE(benchmark::UserCounters);
+
+namespace {
+namespace nb = nanobind;
+
+std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
+  // The `argv` pointers here become invalid when this function returns, but
+  // benchmark holds the pointer to `argv[0]`. We create a static copy of it
+  // so it persists, and replace the pointer below.
+  static std::string executable_name(argv[0]);
+  std::vector<char*> ptrs;
+  ptrs.reserve(argv.size());
+  for (auto& arg : argv) {
+    ptrs.push_back(const_cast<char*>(arg.c_str()));
+  }
+  ptrs[0] = const_cast<char*>(executable_name.c_str());
+  int argc = static_cast<int>(argv.size());
+  benchmark::Initialize(&argc, ptrs.data());
+  std::vector<std::string> remaining_argv;
+  remaining_argv.reserve(argc);
+  for (int i = 0; i < argc; ++i) {
+    remaining_argv.emplace_back(ptrs[i]);
+  }
+  return remaining_argv;
+}
+
+benchmark::internal::Benchmark* RegisterBenchmark(const std::string& name,
+                                                  nb::callable f) {
+  return benchmark::RegisterBenchmark(
+      name, [f](benchmark::State& state) { f(&state); });
+}
+
+NB_MODULE(_benchmark, m) {
+
+  using benchmark::TimeUnit;
+  nb::enum_<TimeUnit>(m, "TimeUnit")
+      .value("kNanosecond", TimeUnit::kNanosecond)
+      .value("kMicrosecond", TimeUnit::kMicrosecond)
+      .value("kMillisecond", TimeUnit::kMillisecond)
+      .value("kSecond", TimeUnit::kSecond)
+      .export_values();
+
+  using benchmark::BigO;
+  nb::enum_<BigO>(m, "BigO")
+      .value("oNone", BigO::oNone)
+      .value("o1", BigO::o1)
+      .value("oN", BigO::oN)
+      .value("oNSquared", BigO::oNSquared)
+      .value("oNCubed", BigO::oNCubed)
+      .value("oLogN", BigO::oLogN)
+      .value("oNLogN", BigO::oNLogN)
+      .value("oAuto", BigO::oAuto)
+      .value("oLambda", BigO::oLambda)
+      .export_values();
+
+  using benchmark::internal::Benchmark;
+  nb::class_<Benchmark>(m, "Benchmark")
+      // For methods returning a pointer to the current object, reference
+      // return policy is used to ask nanobind not to take ownership of the
+      // returned object and avoid calling delete on it.
+      // https://pybind11.readthedocs.io/en/stable/advanced/functions.html#return-value-policies
+      //
+      // For methods taking a const std::vector<...>&, a copy is created
+      // because it is bound to a Python list.
+      // https://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html
+      .def("unit", &Benchmark::Unit, nb::rv_policy::reference)
+      .def("arg", &Benchmark::Arg, nb::rv_policy::reference)
+      .def("args", &Benchmark::Args, nb::rv_policy::reference)
+      .def("range", &Benchmark::Range, nb::rv_policy::reference,
+           nb::arg("start"), nb::arg("limit"))
+      .def("dense_range", &Benchmark::DenseRange,
+           nb::rv_policy::reference, nb::arg("start"),
+           nb::arg("limit"), nb::arg("step") = 1)
+      .def("ranges", &Benchmark::Ranges, nb::rv_policy::reference)
+      .def("args_product", &Benchmark::ArgsProduct,
+           nb::rv_policy::reference)
+      .def("arg_name", &Benchmark::ArgName, nb::rv_policy::reference)
+      .def("arg_names", &Benchmark::ArgNames,
+           nb::rv_policy::reference)
+      .def("range_pair", &Benchmark::RangePair,
+           nb::rv_policy::reference, nb::arg("lo1"), nb::arg("hi1"),
+           nb::arg("lo2"), nb::arg("hi2"))
+      .def("range_multiplier", &Benchmark::RangeMultiplier,
+           nb::rv_policy::reference)
+      .def("min_time", &Benchmark::MinTime, nb::rv_policy::reference)
+      .def("min_warmup_time", &Benchmark::MinWarmUpTime,
+           nb::rv_policy::reference)
+      .def("iterations", &Benchmark::Iterations,
+           nb::rv_policy::reference)
+      .def("repetitions", &Benchmark::Repetitions,
+           nb::rv_policy::reference)
+      .def("report_aggregates_only", &Benchmark::ReportAggregatesOnly,
+           nb::rv_policy::reference, nb::arg("value") = true)
+      .def("display_aggregates_only", &Benchmark::DisplayAggregatesOnly,
+           nb::rv_policy::reference, nb::arg("value") = true)
+      .def("measure_process_cpu_time", &Benchmark::MeasureProcessCPUTime,
+           nb::rv_policy::reference)
+      .def("use_real_time", &Benchmark::UseRealTime,
+           nb::rv_policy::reference)
+      .def("use_manual_time", &Benchmark::UseManualTime,
+           nb::rv_policy::reference)
+      .def(
+          "complexity",
+          (Benchmark * (Benchmark::*)(benchmark::BigO)) & Benchmark::Complexity,
+          nb::rv_policy::reference,
+          nb::arg("complexity") = benchmark::oAuto);
+
+  using benchmark::Counter;
+  nb::class_<Counter> py_counter(m, "Counter");
+
+  nb::enum_<Counter::Flags>(py_counter, "Flags", nb::is_arithmetic(), nb::is_flag())
+      .value("kDefaults", Counter::Flags::kDefaults)
+      .value("kIsRate", Counter::Flags::kIsRate)
+      .value("kAvgThreads", Counter::Flags::kAvgThreads)
+      .value("kAvgThreadsRate", Counter::Flags::kAvgThreadsRate)
+      .value("kIsIterationInvariant", Counter::Flags::kIsIterationInvariant)
+      .value("kIsIterationInvariantRate",
+             Counter::Flags::kIsIterationInvariantRate)
+      .value("kAvgIterations", Counter::Flags::kAvgIterations)
+      .value("kAvgIterationsRate", Counter::Flags::kAvgIterationsRate)
+      .value("kInvert", Counter::Flags::kInvert)
+      .export_values();
+
+  nb::enum_<Counter::OneK>(py_counter, "OneK")
+      .value("kIs1000", Counter::OneK::kIs1000)
+      .value("kIs1024", Counter::OneK::kIs1024)
+      .export_values();
+
+  py_counter
+      .def(nb::init<double, Counter::Flags, Counter::OneK>(),
+           nb::arg("value") = 0., nb::arg("flags") = Counter::kDefaults,
+           nb::arg("k") = Counter::kIs1000)
+      .def("__init__",
+           ([](Counter* c, double value) { new (c) Counter(value); }))
+      .def_rw("value", &Counter::value)
+      .def_rw("flags", &Counter::flags)
+      .def_rw("oneK", &Counter::oneK)
+      .def(nb::init_implicit<double>());
+
+  nb::implicitly_convertible<nb::int_, Counter>();
+
+  nb::bind_map<benchmark::UserCounters>(m, "UserCounters");
+
+  using benchmark::State;
+  nb::class_<State>(m, "State")
+      .def("__bool__", &State::KeepRunning)
+      .def_prop_ro("keep_running", &State::KeepRunning)
+      .def("pause_timing", &State::PauseTiming)
+      .def("resume_timing", &State::ResumeTiming)
+      .def("skip_with_error", &State::SkipWithError)
+      .def_prop_ro("error_occurred", &State::error_occurred)
+      .def("set_iteration_time", &State::SetIterationTime)
+      .def_prop_rw("bytes_processed", &State::bytes_processed,
+                    &State::SetBytesProcessed)
+      .def_prop_rw("complexity_n", &State::complexity_length_n,
+                    &State::SetComplexityN)
+      .def_prop_rw("items_processed", &State::items_processed,
+                   &State::SetItemsProcessed)
+      .def("set_label", &State::SetLabel)
+      .def("range", &State::range, nb::arg("pos") = 0)
+      .def_prop_ro("iterations", &State::iterations)
+      .def_prop_ro("name", &State::name)
+      .def_rw("counters", &State::counters)
+      .def_prop_ro("thread_index", &State::thread_index)
+      .def_prop_ro("threads", &State::threads);
+
+  m.def("Initialize", Initialize);
+  m.def("RegisterBenchmark", RegisterBenchmark,
+        nb::rv_policy::reference);
+  m.def("RunSpecifiedBenchmarks",
+        []() { benchmark::RunSpecifiedBenchmarks(); });
+  m.def("ClearRegisteredBenchmarks", benchmark::ClearRegisteredBenchmarks);
+};
+}  // namespace
--- a/third_party/benchmark/bindings/python/google_benchmark/example.py
+++ b/third_party/benchmark/bindings/python/google_benchmark/example.py
@@ -0,0 +1,140 @@
+# Copyright 2020 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Example of Python using C++ benchmark framework.
+
+To run this example, you must first install the `google_benchmark` Python
+package.
+
+To install using `setup.py`, download and extract the `google_benchmark` source.
+In the extracted directory, execute:
+  python setup.py install
+"""
+
+import random
+import time
+
+import google_benchmark as benchmark
+from google_benchmark import Counter
+
+
+@benchmark.register
+def empty(state):
+    while state:
+        pass
+
+
+@benchmark.register
+def sum_million(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register
+def pause_timing(state):
+    """Pause timing every iteration."""
+    while state:
+        # Construct a list of random ints every iteration without timing it
+        state.pause_timing()
+        random_list = [random.randint(0, 100) for _ in range(100)]
+        state.resume_timing()
+        # Time the in place sorting algorithm
+        random_list.sort()
+
+
+@benchmark.register
+def skipped(state):
+    if True:  # Test some predicate here.
+        state.skip_with_error("some error")
+        return  # NOTE: You must explicitly return, or benchmark will continue.
+
+    # Benchmark code would be here.
+
+
+@benchmark.register
+@benchmark.option.use_manual_time()
+def manual_timing(state):
+    while state:
+        # Manually count Python CPU time
+        start = time.perf_counter()  # perf_counter_ns() in Python 3.7+
+        # Something to benchmark
+        time.sleep(0.01)
+        end = time.perf_counter()
+        state.set_iteration_time(end - start)
+
+
+@benchmark.register
+def custom_counters(state):
+    """Collect custom metric using benchmark.Counter."""
+    num_foo = 0.0
+    while state:
+        # Benchmark some code here
+        # Collect some custom metric named foo
+        num_foo += 0.13
+
+    # Automatic Counter from numbers.
+    state.counters["foo"] = num_foo
+    # Set a counter as a rate.
+    state.counters["foo_rate"] = Counter(num_foo, Counter.kIsRate)
+    #  Set a counter as an inverse of rate.
+    state.counters["foo_inv_rate"] = Counter(
+        num_foo, Counter.kIsRate | Counter.kInvert
+    )
+    # Set a counter as a thread-average quantity.
+    state.counters["foo_avg"] = Counter(num_foo, Counter.kAvgThreads)
+    # There's also a combined flag:
+    state.counters["foo_avg_rate"] = Counter(num_foo, Counter.kAvgThreadsRate)
+
+
+@benchmark.register
+@benchmark.option.measure_process_cpu_time()
+@benchmark.option.use_real_time()
+def with_options(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register(name="sum_million_microseconds")
+@benchmark.option.unit(benchmark.kMicrosecond)
+def with_options2(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register
+@benchmark.option.arg(100)
+@benchmark.option.arg(1000)
+def passing_argument(state):
+    while state:
+        sum(range(state.range(0)))
+
+
+@benchmark.register
+@benchmark.option.range(8, limit=8 << 10)
+def using_range(state):
+    while state:
+        sum(range(state.range(0)))
+
+
+@benchmark.register
+@benchmark.option.range_multiplier(2)
+@benchmark.option.range(1 << 10, 1 << 18)
+@benchmark.option.complexity(benchmark.oN)
+def computing_complexity(state):
+    while state:
+        sum(range(state.range(0)))
+    state.complexity_n = state.range(0)
+
+
+if __name__ == "__main__":
+    benchmark.main()
--- a/third_party/benchmark/cmake/AddCXXCompilerFlag.cmake
+++ b/third_party/benchmark/cmake/AddCXXCompilerFlag.cmake
@@ -34,9 +34,11 @@ function(add_cxx_compiler_flag FLAG)
  check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
  if(${MANGLED_FLAG})
-    set(VARIANT ${ARGV1})
-    if(ARGV1)
+    if(ARGC GREATER 1)
+      set(VARIANT ${ARGV1})
      string(TOUPPER "_${VARIANT}" VARIANT)
+    else()
+      set(VARIANT "")
    endif()
    set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${BENCHMARK_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
  endif()
@@ -49,9 +51,11 @@ function(add_required_cxx_compiler_flag FLAG)
  check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
  if(${MANGLED_FLAG})
-    set(VARIANT ${ARGV1})
-    if(ARGV1)
+    if(ARGC GREATER 1)
+      set(VARIANT ${ARGV1})
      string(TOUPPER "_${VARIANT}" VARIANT)
+    else()
+      set(VARIANT "")
    endif()
    set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
--- a/third_party/benchmark/cmake/CXXFeatureCheck.cmake
+++ b/third_party/benchmark/cmake/CXXFeatureCheck.cmake
@@ -17,6 +17,8 @@ if(__cxx_feature_check)
 endif()
 set(__cxx_feature_check INCLUDED)

+option(CXXFEATURECHECK_DEBUG "Print the compiler output when a C++ feature check fails to compile" OFF)
+
 function(cxx_feature_check FILE)
  string(TOLOWER ${FILE} FILE)
  string(TOUPPER ${FILE} VAR)
@@ -27,26 +29,38 @@ function(cxx_feature_check FILE)
    return()
  endif()

+  set(FEATURE_CHECK_CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
+  if (ARGC GREATER 1)
+    message(STATUS "Enabling additional flags: ${ARGV1}")
+    list(APPEND FEATURE_CHECK_CMAKE_FLAGS ${ARGV1})
+  endif()
+
  if (NOT DEFINED COMPILE_${FEATURE})
-    message(STATUS "Performing Test ${FEATURE}")
    if(CMAKE_CROSSCOMPILING)
+      message(STATUS "Cross-compiling to test ${FEATURE}")
      try_compile(COMPILE_${FEATURE}
              ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-              CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
-              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+              CXX_STANDARD 17
+              CXX_STANDARD_REQUIRED ON
+              CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
+              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
+              OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
      if(COMPILE_${FEATURE})
        message(WARNING
              "If you see build failures due to cross compilation, try setting HAVE_${VAR} to 0")
-        set(RUN_${FEATURE} 0)
+        set(RUN_${FEATURE} 0 CACHE INTERNAL "")
      else()
-        set(RUN_${FEATURE} 1)
+        set(RUN_${FEATURE} 1 CACHE INTERNAL "")
      endif()
    else()
-      message(STATUS "Performing Test ${FEATURE}")
+      message(STATUS "Compiling and running to test ${FEATURE}")
      try_run(RUN_${FEATURE} COMPILE_${FEATURE}
              ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-              CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
-              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+              CXX_STANDARD 17
+              CXX_STANDARD_REQUIRED ON
+              CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
+              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
+              COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
    endif()
  endif()

@@ -56,7 +70,11 @@ function(cxx_feature_check FILE)
    add_definitions(-DHAVE_${VAR})
  else()
    if(NOT COMPILE_${FEATURE})
-      message(STATUS "Performing Test ${FEATURE} -- failed to compile")
+      if(CXXFEATURECHECK_DEBUG)
+        message(STATUS "Performing Test ${FEATURE} -- failed to compile: ${COMPILE_OUTPUT_VAR}")
+      else()
+        message(STATUS "Performing Test ${FEATURE} -- failed to compile")
+      endif()
    else()
      message(STATUS "Performing Test ${FEATURE} -- compiled but failed to run")
    endif()
--- a/third_party/benchmark/cmake/Config.cmake.in
+++ b/third_party/benchmark/cmake/Config.cmake.in
@@ -1 +1,12 @@
+@PACKAGE_INIT@
+
+include (CMakeFindDependencyMacro)
+
+find_dependency (Threads)
+
+if (@BENCHMARK_ENABLE_LIBPFM@)
+    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
+    find_dependency (PFM)
+endif()
+
 include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
--- a/third_party/benchmark/cmake/GetGitVersion.cmake
+++ b/third_party/benchmark/cmake/GetGitVersion.cmake
@@ -20,35 +20,17 @@ set(__get_git_version INCLUDED)

 function(get_git_version var)
  if(GIT_EXECUTABLE)
-      execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
+      execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8 --dirty
          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
          RESULT_VARIABLE status
          OUTPUT_VARIABLE GIT_VERSION
          ERROR_QUIET)
-      if(${status})
+      if(status)
          set(GIT_VERSION "v0.0.0")
-      else()
-          string(STRIP ${GIT_VERSION} GIT_VERSION)
-          string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION ${GIT_VERSION})
-      endif()
-
-      # Work out if the repository is dirty
-      execute_process(COMMAND ${GIT_EXECUTABLE} update-index -q --refresh
-          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-          OUTPUT_QUIET
-          ERROR_QUIET)
-      execute_process(COMMAND ${GIT_EXECUTABLE} diff-index --name-only HEAD --
-          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-          OUTPUT_VARIABLE GIT_DIFF_INDEX
-          ERROR_QUIET)
-      string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
-      if (${GIT_DIRTY})
-          set(GIT_VERSION "${GIT_VERSION}-dirty")
      endif()
  else()
      set(GIT_VERSION "v0.0.0")
  endif()

-  message(STATUS "git Version: ${GIT_VERSION}")
  set(${var} ${GIT_VERSION} PARENT_SCOPE)
 endfunction()
--- a/third_party/benchmark/cmake/GoogleTest.cmake
+++ b/third_party/benchmark/cmake/GoogleTest.cmake
@@ -0,0 +1,58 @@
+# Download and unpack googletest at configure time
+set(GOOGLETEST_PREFIX "${benchmark_BINARY_DIR}/third_party/googletest")
+configure_file(${benchmark_SOURCE_DIR}/cmake/GoogleTest.cmake.in ${GOOGLETEST_PREFIX}/CMakeLists.txt @ONLY)
+
+set(GOOGLETEST_PATH "${CMAKE_CURRENT_SOURCE_DIR}/googletest" CACHE PATH "") # Mind the quotes
+execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}"
+  -DALLOW_DOWNLOADING_GOOGLETEST=${BENCHMARK_DOWNLOAD_DEPENDENCIES} -DGOOGLETEST_PATH:PATH=${GOOGLETEST_PATH} .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${GOOGLETEST_PREFIX}
+)
+
+if(result)
+  message(FATAL_ERROR "CMake step for googletest failed: ${result}")
+endif()
+
+execute_process(
+  COMMAND ${CMAKE_COMMAND} --build .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${GOOGLETEST_PREFIX}
+)
+
+if(result)
+  message(FATAL_ERROR "Build step for googletest failed: ${result}")
+endif()
+
+# Prevent overriding the parent project's compiler/linker
+# settings on Windows
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+
+include(${GOOGLETEST_PREFIX}/googletest-paths.cmake)
+
+# Add googletest directly to our build. This defines
+# the gtest and gtest_main targets.
+add_subdirectory(${GOOGLETEST_SOURCE_DIR}
+                 ${GOOGLETEST_BINARY_DIR}
+                 EXCLUDE_FROM_ALL)
+
+# googletest doesn't seem to want to stay build warning clean so let's not hurt ourselves.
+if (MSVC)
+  target_compile_options(gtest PRIVATE "/wd4244" "/wd4722")
+  target_compile_options(gtest_main PRIVATE "/wd4244" "/wd4722")
+  target_compile_options(gmock PRIVATE "/wd4244" "/wd4722")
+  target_compile_options(gmock_main PRIVATE "/wd4244" "/wd4722")
+else()
+  target_compile_options(gtest PRIVATE "-w")
+  target_compile_options(gtest_main PRIVATE "-w")
+  target_compile_options(gmock PRIVATE "-w")
+  target_compile_options(gmock_main PRIVATE "-w")
+endif()
+
+if(NOT DEFINED GTEST_COMPILE_COMMANDS)
+    set(GTEST_COMPILE_COMMANDS ON)
+endif()
+
+set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
--- a/third_party/benchmark/cmake/GoogleTest.cmake.in
+++ b/third_party/benchmark/cmake/GoogleTest.cmake.in
@@ -0,0 +1,60 @@
+cmake_minimum_required (VERSION 3.13...3.22)
+
+project(googletest-download NONE)
+
+# Enable ExternalProject CMake module
+include(ExternalProject)
+
+option(ALLOW_DOWNLOADING_GOOGLETEST "If googletest src tree is not found in location specified by GOOGLETEST_PATH, do fetch the archive from internet" OFF)
+set(GOOGLETEST_PATH "/usr/src/googletest" CACHE PATH
+                    "Path to the googletest root tree. Should contain googletest and googlemock subdirs. And CMakeLists.txt in root, and in both of these subdirs")
+
+# Download and install GoogleTest
+
+message(STATUS "Looking for Google Test sources")
+message(STATUS "Looking for Google Test sources in ${GOOGLETEST_PATH}")
+if(EXISTS "${GOOGLETEST_PATH}"            AND IS_DIRECTORY "${GOOGLETEST_PATH}"            AND EXISTS "${GOOGLETEST_PATH}/CMakeLists.txt" AND
+   EXISTS "${GOOGLETEST_PATH}/googletest" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googletest" AND EXISTS "${GOOGLETEST_PATH}/googletest/CMakeLists.txt" AND
+   EXISTS "${GOOGLETEST_PATH}/googlemock" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googlemock" AND EXISTS "${GOOGLETEST_PATH}/googlemock/CMakeLists.txt")
+  message(STATUS "Found Google Test in ${GOOGLETEST_PATH}")
+
+  ExternalProject_Add(
+    googletest
+    PREFIX            "${CMAKE_BINARY_DIR}"
+    DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
+    SOURCE_DIR        "${GOOGLETEST_PATH}" # use existing src dir.
+    BINARY_DIR        "${CMAKE_BINARY_DIR}/build"
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    INSTALL_COMMAND   ""
+    TEST_COMMAND      ""
+  )
+else()
+  if(NOT ALLOW_DOWNLOADING_GOOGLETEST)
+    message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_USE_BUNDLED_GTEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
+    return()
+  else()
+    message(STATUS "Did not find Google Test sources! Fetching from web...")
+    ExternalProject_Add(
+      googletest
+      GIT_REPOSITORY    https://github.com/google/googletest.git
+      GIT_TAG           "v1.15.2"
+      GIT_SHALLOW       "ON"
+      PREFIX            "${CMAKE_BINARY_DIR}"
+      STAMP_DIR         "${CMAKE_BINARY_DIR}/stamp"
+      DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
+      SOURCE_DIR        "${CMAKE_BINARY_DIR}/src"
+      BINARY_DIR        "${CMAKE_BINARY_DIR}/build"
+      CONFIGURE_COMMAND ""
+      BUILD_COMMAND     ""
+      INSTALL_COMMAND   ""
+      TEST_COMMAND      ""
+    )
+  endif()
+endif()
+
+ExternalProject_Get_Property(googletest SOURCE_DIR BINARY_DIR)
+file(WRITE googletest-paths.cmake
+"set(GOOGLETEST_SOURCE_DIR \"${SOURCE_DIR}\")
+set(GOOGLETEST_BINARY_DIR \"${BINARY_DIR}\")
+")
--- a/third_party/benchmark/cmake/HandleGTest.cmake
+++ b/third_party/benchmark/cmake/HandleGTest.cmake
@@ -1,113 +0,0 @@
-
-include(split_list)
-
-macro(build_external_gtest)
-  include(ExternalProject)
-  set(GTEST_FLAGS "")
-  if (BENCHMARK_USE_LIBCXX)
-    if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-      list(APPEND GTEST_FLAGS -stdlib=libc++)
-    else()
-      message(WARNING "Unsupported compiler (${CMAKE_CXX_COMPILER}) when using libc++")
-    endif()
-  endif()
-  if (BENCHMARK_BUILD_32_BITS)
-    list(APPEND GTEST_FLAGS -m32)
-  endif()
-  if (NOT "${CMAKE_CXX_FLAGS}" STREQUAL "")
-    list(APPEND GTEST_FLAGS ${CMAKE_CXX_FLAGS})
-  endif()
-  string(TOUPPER "${CMAKE_BUILD_TYPE}" GTEST_BUILD_TYPE)
-  if ("${GTEST_BUILD_TYPE}" STREQUAL "COVERAGE")
-    set(GTEST_BUILD_TYPE "DEBUG")
-  endif()
-  # FIXME: Since 10/Feb/2017 the googletest trunk has had a bug where
-  # -Werror=unused-function fires during the build on OS X. This is a temporary
-  # workaround to keep our travis bots from failing. It should be removed
-  # once gtest is fixed.
-  if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-    list(APPEND GTEST_FLAGS "-Wno-unused-function")
-  endif()
-  split_list(GTEST_FLAGS)
-  set(EXCLUDE_FROM_ALL_OPT "")
-  set(EXCLUDE_FROM_ALL_VALUE "")
-  if (${CMAKE_VERSION} VERSION_GREATER "3.0.99")
-      set(EXCLUDE_FROM_ALL_OPT "EXCLUDE_FROM_ALL")
-      set(EXCLUDE_FROM_ALL_VALUE "ON")
-  endif()
-  ExternalProject_Add(googletest
-      ${EXCLUDE_FROM_ALL_OPT} ${EXCLUDE_FROM_ALL_VALUE}
-      GIT_REPOSITORY https://github.com/google/googletest.git
-      GIT_TAG master
-      PREFIX "${CMAKE_BINARY_DIR}/googletest"
-      INSTALL_DIR "${CMAKE_BINARY_DIR}/googletest"
-      CMAKE_CACHE_ARGS
-        -DCMAKE_BUILD_TYPE:STRING=${GTEST_BUILD_TYPE}
-        -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-        -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
-        -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
-        -DCMAKE_INSTALL_LIBDIR:PATH=<INSTALL_DIR>/lib
-        -DCMAKE_CXX_FLAGS:STRING=${GTEST_FLAGS}
-        -Dgtest_force_shared_crt:BOOL=ON
-      )
-
-  ExternalProject_Get_Property(googletest install_dir)
-  set(GTEST_INCLUDE_DIRS ${install_dir}/include)
-  file(MAKE_DIRECTORY ${GTEST_INCLUDE_DIRS})
-
-  set(LIB_SUFFIX "${CMAKE_STATIC_LIBRARY_SUFFIX}")
-  set(LIB_PREFIX "${CMAKE_STATIC_LIBRARY_PREFIX}")
-  if("${GTEST_BUILD_TYPE}" STREQUAL "DEBUG")
-    set(LIB_SUFFIX "d${CMAKE_STATIC_LIBRARY_SUFFIX}")
-  endif()
-
-  # Use gmock_main instead of gtest_main because it initializes gtest as well.
-  # Note: The libraries are listed in reverse order of their dependancies.
-  foreach(LIB gtest gmock gmock_main)
-    add_library(${LIB} UNKNOWN IMPORTED)
-    set_target_properties(${LIB} PROPERTIES
-      IMPORTED_LOCATION ${install_dir}/lib/${LIB_PREFIX}${LIB}${LIB_SUFFIX}
-      INTERFACE_INCLUDE_DIRECTORIES ${GTEST_INCLUDE_DIRS}
-      INTERFACE_LINK_LIBRARIES "${GTEST_BOTH_LIBRARIES}"
-    )
-    add_dependencies(${LIB} googletest)
-    list(APPEND GTEST_BOTH_LIBRARIES ${LIB})
-  endforeach()
-endmacro(build_external_gtest)
-
-if (BENCHMARK_ENABLE_GTEST_TESTS)
-  if (IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/googletest)
-    set(GTEST_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/googletest")
-    set(INSTALL_GTEST OFF CACHE INTERNAL "")
-    set(INSTALL_GMOCK OFF CACHE INTERNAL "")
-    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/googletest)
-    set(GTEST_BOTH_LIBRARIES gtest gmock gmock_main)
-    foreach(HEADER test mock)
-      # CMake 2.8 and older don't respect INTERFACE_INCLUDE_DIRECTORIES, so we
-      # have to add the paths ourselves.
-      set(HFILE g${HEADER}/g${HEADER}.h)
-      set(HPATH ${GTEST_ROOT}/google${HEADER}/include)
-      find_path(HEADER_PATH_${HEADER} ${HFILE}
-          NO_DEFAULT_PATHS
-          HINTS ${HPATH}
-      )
-      if (NOT HEADER_PATH_${HEADER})
-        message(FATAL_ERROR "Failed to find header ${HFILE} in ${HPATH}")
-      endif()
-      list(APPEND GTEST_INCLUDE_DIRS ${HEADER_PATH_${HEADER}})
-    endforeach()
-  elseif(BENCHMARK_DOWNLOAD_DEPENDENCIES)
-    build_external_gtest()
-  else()
-    find_package(GTest REQUIRED)
-    find_path(GMOCK_INCLUDE_DIRS gmock/gmock.h
-        HINTS ${GTEST_INCLUDE_DIRS})
-    if (NOT GMOCK_INCLUDE_DIRS)
-      message(FATAL_ERROR "Failed to find header gmock/gmock.h with hint ${GTEST_INCLUDE_DIRS}")
-    endif()
-    set(GTEST_INCLUDE_DIRS ${GTEST_INCLUDE_DIRS} ${GMOCK_INCLUDE_DIRS})
-    # FIXME: We don't currently require the gmock library to build the tests,
-    # and it's likely we won't find it, so we don't try. As long as we've
-    # found the gmock/gmock.h header and gtest_main that should be good enough.
-  endif()
-endif()
--- a/third_party/benchmark/cmake/benchmark.pc.in
+++ b/third_party/benchmark/cmake/benchmark.pc.in
@@ -1,11 +1,12 @@
 prefix=@CMAKE_INSTALL_PREFIX@
 exec_prefix=${prefix}
-libdir=${prefix}/lib
-includedir=${prefix}/include
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@

 Name: @PROJECT_NAME@
 Description: Google microbenchmark framework
-Version: @VERSION@
+Version: @NORMALIZED_VERSION@

 Libs: -L${libdir} -lbenchmark
+Libs.private: -lpthread @BENCHMARK_PRIVATE_LINK_LIBRARIES@
 Cflags: -I${includedir}
--- a/third_party/benchmark/cmake/benchmark_main.pc.in
+++ b/third_party/benchmark/cmake/benchmark_main.pc.in
@@ -0,0 +1,7 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+
+Name: @PROJECT_NAME@
+Description: Google microbenchmark framework (with main() function)
+Version: @NORMALIZED_VERSION@
+Requires: benchmark
+Libs: -L${libdir} -lbenchmark_main
--- a/third_party/benchmark/cmake/pthread_affinity.cpp
+++ b/third_party/benchmark/cmake/pthread_affinity.cpp
@@ -0,0 +1,16 @@
+#include <pthread.h>
+int main() {
+  cpu_set_t set;
+  CPU_ZERO(&set);
+  for (int i = 0; i < CPU_SETSIZE; ++i) {
+    CPU_SET(i, &set);
+    CPU_CLR(i, &set);
+  }
+  pthread_t self = pthread_self();
+  int ret;
+  ret = pthread_getaffinity_np(self, sizeof(set), &set);
+  if (ret != 0) return ret;
+  ret = pthread_setaffinity_np(self, sizeof(set), &set);
+  if (ret != 0) return ret;
+  return 0;
+}
--- a/third_party/benchmark/docs/AssemblyTests.md
+++ b/third_party/benchmark/docs/AssemblyTests.md
@@ -111,6 +111,7 @@ between compilers or compiler versions. A common example of this
 is matching stack frame addresses. In this case regular expressions
 can be used to match the differing bits of output. For example:

+<!-- {% raw %} -->
 ```c++
 int ExternInt;
 struct Point { int x, y, z; };
@@ -127,6 +128,7 @@ extern "C" void test_store_point() {
    // CHECK: ret
 }
 ```
+<!-- {% endraw %} -->

 ## Current Requirements and Limitations

--- a/third_party/benchmark/docs/_config.yml
+++ b/third_party/benchmark/docs/_config.yml
@@ -0,0 +1,3 @@
+theme: jekyll-theme-minimal
+logo: /assets/images/icon_black.png
+show_downloads: true
--- a/third_party/benchmark/docs/assets/images/icon.png
+++ b/third_party/benchmark/docs/assets/images/icon.png
--- a/third_party/benchmark/docs/assets/images/icon.xcf
+++ b/third_party/benchmark/docs/assets/images/icon.xcf
--- a/third_party/benchmark/docs/assets/images/icon_black.png
+++ b/third_party/benchmark/docs/assets/images/icon_black.png
--- a/third_party/benchmark/docs/assets/images/icon_black.xcf
+++ b/third_party/benchmark/docs/assets/images/icon_black.xcf
--- a/third_party/benchmark/docs/dependencies.md
+++ b/third_party/benchmark/docs/dependencies.md
@@ -0,0 +1,19 @@
+# Build tool dependency policy
+
+We follow the [Foundational C++ support policy](https://opensource.google/documentation/policies/cplusplus-support) for our build tools. In
+particular the ["Build Systems" section](https://opensource.google/documentation/policies/cplusplus-support#build-systems).
+
+## CMake
+
+The current supported version is CMake 3.10 as of 2023-08-10. Most modern
+distributions include newer versions, for example:
+
+* Ubuntu 20.04 provides CMake 3.16.3
+* Debian 11.4 provides CMake 3.18.4
+* Ubuntu 22.04 provides CMake 3.22.1
+
+## Python
+
+The Python bindings require Python 3.10+ as of v1.9.0 (2024-08-16) for installation from PyPI.
+Building from source for older versions probably still works, though. See the [user guide](python_bindings.md) for details on how to build from source.
+The minimum theoretically supported version is Python 3.8, since the used bindings generator (nanobind) only supports Python 3.8+.
--- a/third_party/benchmark/docs/index.md
+++ b/third_party/benchmark/docs/index.md
@@ -0,0 +1,12 @@
+# Benchmark
+
+* [Assembly Tests](AssemblyTests.md)
+* [Dependencies](dependencies.md)
+* [Perf Counters](perf_counters.md)
+* [Platform Specific Build Instructions](platform_specific_build_instructions.md)
+* [Python Bindings](python_bindings.md)
+* [Random Interleaving](random_interleaving.md)
+* [Reducing Variance](reducing_variance.md)
+* [Releasing](releasing.md)
+* [Tools](tools.md)
+* [User Guide](user_guide.md)
--- a/third_party/benchmark/docs/perf_counters.md
+++ b/third_party/benchmark/docs/perf_counters.md
@@ -0,0 +1,35 @@
+<a name="perf-counters" />
+
+# User-Requested Performance Counters
+
+When running benchmarks, the user may choose to request collection of
+performance counters. This may be useful in investigation scenarios - narrowing
+down the cause of a regression; or verifying that the underlying cause of a
+performance improvement matches expectations.
+
+This feature is available if:
+
+* The benchmark is run on an architecture featuring a Performance Monitoring
+  Unit (PMU),
+* The benchmark is compiled with support for collecting counters. Currently,
+  this requires [libpfm](http://perfmon2.sourceforge.net/), which is built as a
+  dependency via Bazel.
+
+The feature does not require modifying benchmark code. Counter collection is
+handled at the boundaries where timer collection is also handled. 
+
+To opt-in:
+* If using a Bazel build, add `--define pfm=1` to your build flags
+* If using CMake:
+  * Install `libpfm4-dev`, e.g. `apt-get install libpfm4-dev`.
+  * Enable the CMake flag `BENCHMARK_ENABLE_LIBPFM` in `CMakeLists.txt`.
+
+To use, pass a comma-separated list of counter names through the
+`--benchmark_perf_counters` flag. The names are decoded through libpfm - meaning,
+they are platform specific, but some (e.g. `CYCLES` or `INSTRUCTIONS`) are
+mapped by libpfm to platform-specifics - see libpfm
+[documentation](http://perfmon2.sourceforge.net/docs.html) for more details.
+
+The counter values are reported back through the [User Counters](../README.md#custom-counters)
+mechanism, meaning, they are available in all the formats (e.g. JSON) supported
+by User Counters.
--- a/third_party/benchmark/docs/platform_specific_build_instructions.md
+++ b/third_party/benchmark/docs/platform_specific_build_instructions.md
@@ -0,0 +1,52 @@
+# Platform Specific Build Instructions
+
+## Building with GCC
+
+When the library is built using GCC it is necessary to link with the pthread
+library due to how GCC implements `std::thread`. Failing to link to pthread will
+lead to runtime exceptions (unless you're using libc++), not linker errors. See
+[issue #67](https://github.com/google/benchmark/issues/67) for more details. You
+can link to pthread by adding `-pthread` to your linker command. Note, you can
+also use `-lpthread`, but there are potential issues with ordering of command
+line parameters if you use that.
+
+On QNX, the pthread library is part of libc and usually included automatically
+(see
+[`pthread_create()`](https://www.qnx.com/developers/docs/7.1/index.html#com.qnx.doc.neutrino.lib_ref/topic/p/pthread_create.html)).
+There's no separate pthread library to link.
+
+## Building with Visual Studio 2015, 2017 or 2022
+
+The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following:
+
+```
+// Alternatively, can add libraries using linker options.
+
+// First, Add the path to the generated library files (directory containing the `benchmark.lib`) in `[Configuration Properties > Linker > General > Additional Library Directories]`. Then do the following:
+#ifdef _WIN32
+#pragma comment ( lib, "Shlwapi.lib" )
+#ifdef _DEBUG
+#pragma comment ( lib, "benchmark.lib" )
+#else
+#pragma comment ( lib, "benchmark.lib" )
+#endif
+#endif
+```
+
+When using the static library, make sure to add `BENCHMARK_STATIC_DEFINE` under `[Configuration Properties > C/C++ > Preprocessor > Preprocessor Definitions]`
+
+Can also use the graphical version of CMake:
+* Open `CMake GUI`.
+* Under `Where to build the binaries`, same path as source plus `build`.
+* Under `CMAKE_INSTALL_PREFIX`, same path as source plus `install`.
+* Click `Configure`, `Generate`, `Open Project`.
+* If build fails, try deleting entire directory and starting again, or unticking options to build less.
+
+## Building with Intel 2015 Update 1 or Intel System Studio Update 4
+
+See instructions for building with Visual Studio. Once built, right click on the solution and change the build to Intel.
+
+## Building on Solaris
+
+If you're running benchmarks on Solaris, you'll want the kstat library linked in
+too (`-lkstat`).
--- a/third_party/benchmark/docs/python_bindings.md
+++ b/third_party/benchmark/docs/python_bindings.md
@@ -0,0 +1,34 @@
+# Building and installing Python bindings
+
+Python bindings are available as wheels on [PyPI](https://pypi.org/project/google-benchmark/) for importing and 
+using Google Benchmark directly in Python. 
+Currently, pre-built wheels exist for macOS (both ARM64 and Intel x86), Linux x86-64 and 64-bit Windows.
+Supported Python versions are Python 3.10 - 3.12.
+
+To install Google Benchmark's Python bindings, run:
+
+```bash
+python -m pip install --upgrade pip  # for manylinux2014 support
+python -m pip install google-benchmark
+```
+
+In order to keep your system Python interpreter clean, it is advisable to run these commands in a virtual
+environment. See the [official Python documentation](https://docs.python.org/3/library/venv.html) 
+on how to create virtual environments.
+
+To build a wheel directly from source, you can follow these steps:
+```bash
+git clone https://github.com/google/benchmark.git
+cd benchmark
+# create a virtual environment and activate it
+python3 -m venv venv --system-site-packages
+source venv/bin/activate  # .\venv\Scripts\Activate.ps1 on Windows
+
+# upgrade Python's system-wide packages
+python -m pip install --upgrade pip build
+# builds the wheel and stores it in the directory "dist".
+python -m build
+```
+
+NB: Building wheels from source requires Bazel. For platform-specific instructions on how to install Bazel,
+refer to the [Bazel installation docs](https://bazel.build/install).
--- a/third_party/benchmark/docs/random_interleaving.md
+++ b/third_party/benchmark/docs/random_interleaving.md
@@ -0,0 +1,13 @@
+<a name="interleaving" />
+
+# Random Interleaving
+
+[Random Interleaving](https://github.com/google/benchmark/issues/1051) is a
+technique to lower run-to-run variance. It randomly interleaves repetitions of a
+microbenchmark with repetitions from other microbenchmarks in the same benchmark
+test. Data shows it is able to lower run-to-run variance by
+[40%](https://github.com/google/benchmark/issues/1051) on average.
+
+To use, you mainly need to set `--benchmark_enable_random_interleaving=true`,
+and optionally specify non-zero repetition count `--benchmark_repetitions=9`
+and optionally decrease the per-repetition time `--benchmark_min_time=0.1`.
--- a/third_party/benchmark/docs/reducing_variance.md
+++ b/third_party/benchmark/docs/reducing_variance.md
@@ -0,0 +1,133 @@
+# Reducing Variance
+
+<a name="disabling-cpu-frequency-scaling" />
+
+## Disabling CPU Frequency Scaling
+
+If you see this warning:
+
+```
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+```
+
+you might want to disable the CPU frequency scaling while running the
+benchmark, as well as consider other ways to stabilize the performance of
+your system while benchmarking.
+
+Exactly how to do this depends on the Linux distribution,
+desktop environment, and installed programs.  Specific details are a moving
+target, so we will not attempt to exhaustively document them here.
+
+One simple option is to use the `cpupower` program to change the
+performance governor to "performance".  This tool is maintained along with
+the Linux kernel and provided by your distribution.
+
+It must be run as root, like this:
+
+```bash
+sudo cpupower frequency-set --governor performance
+```
+
+After this you can verify that all CPUs are using the performance governor
+by running this command:
+
+```bash
+cpupower frequency-info -o proc
+```
+
+The benchmarks you subsequently run will have less variance.
+
+<a name="reducing-variance" />
+
+## Disabling ASLR
+
+If you see this warning:
+
+```
+***WARNING*** ASLR is enabled, the results may have unreproducible noise in them.
+```
+
+you might want to disable the ASLR security hardening feature while running the
+benchmark.
+
+The simplest way is to add
+```
+benchmark::MaybeReenterWithoutASLR(argc, argv);
+```
+as the first line of your `main()` function. It will try to disable ASLR
+for the current process, and, if successful, re-execute the binary.
+Note that `personality(2)` may be forbidden by e.g. seccomp (which happens
+by default if you are running in a Docker container).
+
+Note that if you link to `benchmark_main`, this is already done for you.
+
+To globally disable ASLR on Linux, run
+```
+echo 0 > /proc/sys/kernel/randomize_va_space
+```
+
+To run a single benchmark with ASLR disabled on Linux, do:
+```
+setarch `uname -m` -R ./a_benchmark
+```
+
+For information on how to disable ASLR on other operating systems,
+please refer to their documentation.
+
+## Reducing Variance in Benchmarks
+
+The Linux CPU frequency governor [discussed
+above](#disabling-cpu-frequency-scaling) is not the only source
+of noise in benchmarks.  Some, but not all, of the sources of variance
+include:
+
+1. On multi-core machines not all CPUs/CPU cores/CPU threads run the same
+   speed, so running a benchmark one time and then again may give a
+   different result depending on which CPU it ran on.
+2. CPU scaling features that run on the CPU, like Intel's Turbo Boost and
+   AMD Turbo Core and Precision Boost, can temporarily change the CPU
+   frequency even when using the "performance" governor on Linux.
+3. Context switching between CPUs, or scheduling competition on the CPU the
+   benchmark is running on.
+4. Intel Hyperthreading or AMD SMT causing the same issue as above.
+5. Cache effects caused by code running on other CPUs.
+6. Non-uniform memory architectures (NUMA).
+
+These can cause variance in benchmark results within a single run
+(`--benchmark_repetitions=N`) or across multiple runs of the benchmark
+program.
+
+Reducing sources of variance is OS and architecture dependent, which is one
+reason some companies maintain machines dedicated to performance testing.
+
+Some of the easier and effective ways of reducing variance on a typical
+Linux workstation are:
+
+1. Use the performance governor as [discussed
+above](#disabling-cpu-frequency-scaling).
+1. Disable processor boosting by:
+   ```sh
+   echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost
+   ```
+   See the Linux kernel's
+   [boost.txt](https://www.kernel.org/doc/Documentation/cpu-freq/boost.txt)
+   for more information.
+2. Set the benchmark program's task affinity to a fixed cpu.  For example:
+   ```sh
+   taskset -c 0 ./mybenchmark
+   ```
+3. Disabling Hyperthreading/SMT.  This can be done in the BIOS or using the
+   `/sys` file system (see the LLVM project's [Benchmarking
+   tips](https://llvm.org/docs/Benchmarking.html)).
+4. Close other programs that do non-trivial things based on timers, such as
+   your web browser, desktop environment, etc.
+5. Reduce the working set of your benchmark to fit within the L1 cache, but
+   do be aware that this may lead you to optimize for an unrealistic
+   situation.
+
+Further resources on this topic:
+
+1. The LLVM project's [Benchmarking
+   tips](https://llvm.org/docs/Benchmarking.html).
+1. The Arch Wiki [CPU frequency
+scaling](https://wiki.archlinux.org/title/CPU_frequency_scaling) page.
--- a/third_party/benchmark/docs/releasing.md
+++ b/third_party/benchmark/docs/releasing.md
@@ -0,0 +1,38 @@
+# How to release
+
+* Make sure you're on main and synced to HEAD
+* Ensure the project builds and tests run
+    * `parallel -j0 exec ::: test/*_test` can help ensure everything at least
+      passes
+* Prepare release notes
+    * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
+      commits between the last annotated tag and HEAD
+    * Pick the most interesting.
+* Create one last commit that updates the version saved in `CMakeLists.txt`, `MODULE.bazel`,
+  and `bindings/python/google_benchmark/__init__.py` to the release version you're creating.
+  (This version will be used if benchmark is installed from the archive you'll be creating
+  in the next step.)
+
+```
+# CMakeLists.txt
+project (benchmark VERSION 1.9.0 LANGUAGES CXX)
+```
+
+```
+# MODULE.bazel
+module(name = "com_github_google_benchmark", version="1.9.0")
+```
+
+```
+# google_benchmark/__init__.py
+__version__ = "1.9.0"
+```
+
+* Create a release through github's interface
+    * Note this will create a lightweight tag.
+    * Update this to an annotated tag:
+      * `git pull --tags`
+      * `git tag -a -f <tag> <tag>`
+      * `git push --force --tags origin`
+* Confirm that the "Build and upload Python wheels" action runs to completion
+    * Run it manually if it hasn't run.
--- a/third_party/benchmark/docs/tools.md
+++ b/third_party/benchmark/docs/tools.md
@@ -4,7 +4,11 @@

 The `compare.py` can be used to compare the result of benchmarks.

-**NOTE**: the utility relies on the scipy package which can be installed using [these instructions](https://www.scipy.org/install.html).
+### Dependencies
+The utility relies on the [scipy](https://www.scipy.org) package which can be installed using pip:
+```bash
+pip3 install -r requirements.txt
+```

 ### Displaying aggregates only

@@ -182,6 +186,146 @@ Benchmark                               Time             CPU      Time Old
 This is a mix of the previous two modes, two (potentially different) benchmark binaries are run, and a different filter is applied to each one.
 As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.

+### Note: Interpreting the output
+
+Performance measurements are an art, and performance comparisons are doubly so.
+Results are often noisy and don't necessarily have large absolute differences to
+them, so just by visual inspection, it is not at all apparent if two
+measurements are actually showing a performance change or not. It is even more
+confusing with multiple benchmark repetitions.
+
+Thankfully, what we can do, is use statistical tests on the results to determine
+whether the performance has statistically-significantly changed. `compare.py`
+uses [Mann–Whitney U
+test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test), with a null
+hypothesis being that there's no difference in performance.
+ 
+**The below output is a summary of a benchmark comparison with statistics
+provided for a multi-threaded process.**
+```
+Benchmark                                               Time        CPU    Time Old      Time New       CPU Old       CPU New
+-----------------------------------------------------------------------------------------------------------------------------
+benchmark/threads:1/process_time/real_time_pvalue     0.0000     0.0000    U Test, Repetitions: 27 vs 27
+benchmark/threads:1/process_time/real_time_mean      -0.1442    -0.1442          90            77            90            77
+benchmark/threads:1/process_time/real_time_median    -0.1444    -0.1444          90            77            90            77
+benchmark/threads:1/process_time/real_time_stddev    +0.3974    +0.3933           0             0             0             0
+benchmark/threads:1/process_time/real_time_cv        +0.6329    +0.6280           0             0             0             0
+OVERALL_GEOMEAN                                      -0.1442    -0.1442           0             0             0             0
+```
+--------------------------------------------
+Here's a breakdown of each row:
+
+**benchmark/threads:1/process_time/real_time_pvalue**: This shows the _p-value_ for
+the statistical test comparing the performance of the process running with one
+thread. A value of 0.0000 suggests a statistically significant difference in
+performance. The comparison was conducted using the U Test (Mann-Whitney
+U Test) with 27 repetitions for each case.
+
+**benchmark/threads:1/process_time/real_time_mean**: This shows the relative
+difference in mean execution time between two different cases. The negative
+value (-0.1442) implies that the new process is faster by about 14.42%. The old
+time was 90 units, while the new time is 77 units.
+
+**benchmark/threads:1/process_time/real_time_median**: Similarly, this shows the
+relative difference in the median execution time. Again, the new process is
+faster by 14.44%.
+
+**benchmark/threads:1/process_time/real_time_stddev**: This is the relative
+difference in the standard deviation of the execution time, which is a measure
+of how much variation or dispersion there is from the mean. A positive value
+(+0.3974) implies there is more variance in the execution time in the new
+process.
+
+**benchmark/threads:1/process_time/real_time_cv**: CV stands for Coefficient of
+Variation. It is the ratio of the standard deviation to the mean. It provides a
+standardized measure of dispersion. An increase (+0.6329) indicates more
+relative variability in the new process.
+
+**OVERALL_GEOMEAN**: Geomean stands for geometric mean, a type of average that is
+less influenced by outliers. The negative value indicates a general improvement
+in the new process. However, given the values are all zero for the old and new
+times, this seems to be a mistake or placeholder in the output.
+
+-----------------------------------------
+
+
+
+Let's first try to see what the different columns represent in the above
+`compare.py` benchmarking output:
+
+  1. **Benchmark:** The name of the function being benchmarked, along with the
+     size of the input (after the slash).
+
+  2. **Time:** The average time per operation, across all iterations.
+
+  3. **CPU:** The average CPU time per operation, across all iterations.
+
+  4. **Iterations:** The number of iterations the benchmark was run to get a
+     stable estimate.
+
+  5. **Time Old and Time New:** These represent the average time it takes for a
+     function to run in two different scenarios or versions. For example, you
+     might be comparing how fast a function runs before and after you make some
+     changes to it.
+
+  6. **CPU Old and CPU New:** These show the average amount of CPU time that the
+     function uses in two different scenarios or versions. This is similar to
+     Time Old and Time New, but focuses on CPU usage instead of overall time.
+
+In the comparison section, the relative differences in both time and CPU time
+are displayed for each input size.
+
+
+A statistically-significant difference is determined by a **p-value**, which is
+a measure of the probability that the observed difference could have occurred
+just by random chance. A smaller p-value indicates stronger evidence against the
+null hypothesis. 
+
+**Therefore:**
+  1. If the p-value is less than the chosen significance level (alpha), we
+     reject the null hypothesis and conclude the benchmarks are significantly
+     different.
+  2. If the p-value is greater than or equal to alpha, we fail to reject the
+     null hypothesis and treat the two benchmarks as similar.
+
+
+
+The result of said statistical test is additionally communicated through color coding:
+```diff
+ Green:
+```
+  The benchmarks are _**statistically different**_. This could mean the
+  performance has either **significantly improved** or **significantly
+  deteriorated**. You should look at the actual performance numbers to see which
+  is the case.
+```diff
+- Red:
+```
+  The benchmarks are _**statistically similar**_. This means the performance
+  **hasn't significantly changed**.
+
+In statistical terms, **'green'** means we reject the null hypothesis that
+there's no difference in performance, and **'red'** means we fail to reject the
+null hypothesis. This might seem counter-intuitive if you're expecting 'green'
+to mean 'improved performance' and 'red' to mean 'worsened performance'. 
+```bash
+  But remember, in this context:
+
+    'Success' means 'successfully finding a difference'.
+    'Failure' means 'failing to find a difference'.
+```
+
+
+Also, please note that **even if** we determine that there **is** a
+statistically-significant difference between the two measurements, it does not
+_necessarily_ mean that the actual benchmarks that were measured **are**
+different, or vice versa, even if we determine that there is **no**
+statistically-significant difference between the two measurements, it does not
+necessarily mean that the actual benchmarks that were measured **are not**
+different.
+
+
+
 ### U test

 If there is a sufficient repetition count of the benchmarks, the tool can do
--- a/third_party/benchmark/docs/user_guide.md
+++ b/third_party/benchmark/docs/user_guide.md
--- a/third_party/benchmark/include/benchmark/benchmark.h
+++ b/third_party/benchmark/include/benchmark/benchmark.h
--- a/third_party/benchmark/include/benchmark/export.h
+++ b/third_party/benchmark/include/benchmark/export.h
@@ -0,0 +1,47 @@
+#ifndef BENCHMARK_EXPORT_H
+#define BENCHMARK_EXPORT_H
+// Symbol visibility and deprecation macros for the benchmark library.
+#if defined(_WIN32)
+#define EXPORT_ATTR __declspec(dllexport)
+#define IMPORT_ATTR __declspec(dllimport)
+#define NO_EXPORT_ATTR
+#define DEPRECATE_ATTR __declspec(deprecated)
+#else  // _WIN32
+#define EXPORT_ATTR __attribute__((visibility("default")))
+#define IMPORT_ATTR __attribute__((visibility("default")))
+#define NO_EXPORT_ATTR __attribute__((visibility("hidden")))
+#define DEPRECATE_ATTR __attribute__((__deprecated__))
+#endif  // _WIN32
+#define DEPRECATED_ATTR DEPRECATE_ATTR  // Alias: former Windows-only spelling.
+#ifdef BENCHMARK_STATIC_DEFINE
+#define BENCHMARK_EXPORT
+#define BENCHMARK_NO_EXPORT
+#else  // BENCHMARK_STATIC_DEFINE
+#ifndef BENCHMARK_EXPORT
+#ifdef benchmark_EXPORTS
+/* We are building this library */
+#define BENCHMARK_EXPORT EXPORT_ATTR
+#else  // benchmark_EXPORTS
+/* We are using this library */
+#define BENCHMARK_EXPORT IMPORT_ATTR
+#endif  // benchmark_EXPORTS
+#endif  // !BENCHMARK_EXPORT
+
+#ifndef BENCHMARK_NO_EXPORT
+#define BENCHMARK_NO_EXPORT NO_EXPORT_ATTR
+#endif  // !BENCHMARK_NO_EXPORT
+#endif  // BENCHMARK_STATIC_DEFINE
+// Deprecation markers; each one can be overridden by predefining the macro.
+#ifndef BENCHMARK_DEPRECATED
+#define BENCHMARK_DEPRECATED DEPRECATE_ATTR
+#endif  // BENCHMARK_DEPRECATED
+
+#ifndef BENCHMARK_DEPRECATED_EXPORT
+#define BENCHMARK_DEPRECATED_EXPORT BENCHMARK_EXPORT BENCHMARK_DEPRECATED
+#endif  // BENCHMARK_DEPRECATED_EXPORT
+
+#ifndef BENCHMARK_DEPRECATED_NO_EXPORT
+#define BENCHMARK_DEPRECATED_NO_EXPORT BENCHMARK_NO_EXPORT BENCHMARK_DEPRECATED
+#endif  // BENCHMARK_DEPRECATED_NO_EXPORT
+
+#endif /* BENCHMARK_EXPORT_H */
--- a/third_party/benchmark/mingw.py
+++ b/third_party/benchmark/mingw.py
@@ -1,320 +0,0 @@
-#! /usr/bin/env python
-# encoding: utf-8
-
-import argparse
-import errno
-import logging
-import os
-import platform
-import re
-import sys
-import subprocess
-import tempfile
-
-try:
-    import winreg
-except ImportError:
-    import _winreg as winreg
-try:
-    import urllib.request as request
-except ImportError:
-    import urllib as request
-try:
-    import urllib.parse as parse
-except ImportError:
-    import urlparse as parse
-
-class EmptyLogger(object):
-    '''
-    Provides an implementation that performs no logging
-    '''
-    def debug(self, *k, **kw):
-        pass
-    def info(self, *k, **kw):
-        pass
-    def warn(self, *k, **kw):
-        pass
-    def error(self, *k, **kw):
-        pass
-    def critical(self, *k, **kw):
-        pass
-    def setLevel(self, *k, **kw):
-        pass
-
-urls = (
-    'http://downloads.sourceforge.net/project/mingw-w64/Toolchains%20'
-        'targetting%20Win32/Personal%20Builds/mingw-builds/installer/'
-        'repository.txt',
-    'http://downloads.sourceforge.net/project/mingwbuilds/host-windows/'
-        'repository.txt'
-)
-'''
-A list of mingw-build repositories
-'''
-
-def repository(urls = urls, log = EmptyLogger()):
-    '''
-    Downloads and parse mingw-build repository files and parses them
-    '''
-    log.info('getting mingw-builds repository')
-    versions = {}
-    re_sourceforge = re.compile(r'http://sourceforge.net/projects/([^/]+)/files')
-    re_sub = r'http://downloads.sourceforge.net/project/\1'
-    for url in urls:
-        log.debug(' - requesting: %s', url)
-        socket = request.urlopen(url)
-        repo = socket.read()
-        if not isinstance(repo, str):
-            repo = repo.decode();
-        socket.close()
-        for entry in repo.split('\n')[:-1]:
-            value = entry.split('|')
-            version = tuple([int(n) for n in value[0].strip().split('.')])
-            version = versions.setdefault(version, {})
-            arch = value[1].strip()
-            if arch == 'x32':
-                arch = 'i686'
-            elif arch == 'x64':
-                arch = 'x86_64'
-            arch = version.setdefault(arch, {})
-            threading = arch.setdefault(value[2].strip(), {})
-            exceptions = threading.setdefault(value[3].strip(), {})
-            revision = exceptions.setdefault(int(value[4].strip()[3:]),
-                re_sourceforge.sub(re_sub, value[5].strip()))
-    return versions
-
-def find_in_path(file, path=None):
-    '''
-    Attempts to find an executable in the path
-    '''
-    if platform.system() == 'Windows':
-        file += '.exe'
-    if path is None:
-        path = os.environ.get('PATH', '')
-    if type(path) is type(''):
-        path = path.split(os.pathsep)
-    return list(filter(os.path.exists,
-        map(lambda dir, file=file: os.path.join(dir, file), path)))
-
-def find_7zip(log = EmptyLogger()):
-    '''
-    Attempts to find 7zip for unpacking the mingw-build archives
-    '''
-    log.info('finding 7zip')
-    path = find_in_path('7z')
-    if not path:
-        key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\7-Zip')
-        path, _ = winreg.QueryValueEx(key, 'Path')
-        path = [os.path.join(path, '7z.exe')]
-    log.debug('found \'%s\'', path[0])
-    return path[0]
-
-find_7zip()
-
-def unpack(archive, location, log = EmptyLogger()):
-    '''
-    Unpacks a mingw-builds archive
-    '''
-    sevenzip = find_7zip(log)
-    log.info('unpacking %s', os.path.basename(archive))
-    cmd = [sevenzip, 'x', archive, '-o' + location, '-y']
-    log.debug(' - %r', cmd)
-    with open(os.devnull, 'w') as devnull:
-        subprocess.check_call(cmd, stdout = devnull)
-
-def download(url, location, log = EmptyLogger()):
-    '''
-    Downloads and unpacks a mingw-builds archive
-    '''
-    log.info('downloading MinGW')
-    log.debug(' - url: %s', url)
-    log.debug(' - location: %s', location)
-
-    re_content = re.compile(r'attachment;[ \t]*filename=(")?([^"]*)(")?[\r\n]*')
-
-    stream = request.urlopen(url)
-    try:
-        content = stream.getheader('Content-Disposition') or ''
-    except AttributeError:
-        content = stream.headers.getheader('Content-Disposition') or ''
-    matches = re_content.match(content)
-    if matches:
-        filename = matches.group(2)
-    else:
-        parsed = parse.urlparse(stream.geturl())
-        filename = os.path.basename(parsed.path)
-
-    try:
-        os.makedirs(location)
-    except OSError as e:
-        if e.errno == errno.EEXIST and os.path.isdir(location):
-            pass
-        else:
-            raise
-
-    archive = os.path.join(location, filename)
-    with open(archive, 'wb') as out:
-        while True:
-            buf = stream.read(1024)
-            if not buf:
-                break
-            out.write(buf)
-    unpack(archive, location, log = log)
-    os.remove(archive)
-
-    possible = os.path.join(location, 'mingw64')
-    if not os.path.exists(possible):
-        possible = os.path.join(location, 'mingw32')
-        if not os.path.exists(possible):
-            raise ValueError('Failed to find unpacked MinGW: ' + possible)
-    return possible
-
-def root(location = None, arch = None, version = None, threading = None,
-        exceptions = None, revision = None, log = EmptyLogger()):
-    '''
-    Returns the root folder of a specific version of the mingw-builds variant
-    of gcc. Will download the compiler if needed
-    '''
-
-    # Get the repository if we don't have all the information
-    if not (arch and version and threading and exceptions and revision):
-        versions = repository(log = log)
-
-    # Determine some defaults
-    version = version or max(versions.keys())
-    if not arch:
-        arch = platform.machine().lower()
-        if arch == 'x86':
-            arch = 'i686'
-        elif arch == 'amd64':
-            arch = 'x86_64'
-    if not threading:
-        keys = versions[version][arch].keys()
-        if 'posix' in keys:
-            threading = 'posix'
-        elif 'win32' in keys:
-            threading = 'win32'
-        else:
-            threading = keys[0]
-    if not exceptions:
-        keys = versions[version][arch][threading].keys()
-        if 'seh' in keys:
-            exceptions = 'seh'
-        elif 'sjlj' in keys:
-            exceptions = 'sjlj'
-        else:
-            exceptions = keys[0]
-    if revision == None:
-        revision = max(versions[version][arch][threading][exceptions].keys())
-    if not location:
-        location = os.path.join(tempfile.gettempdir(), 'mingw-builds')
-
-    # Get the download url
-    url = versions[version][arch][threading][exceptions][revision]
-
-    # Tell the user whatzzup
-    log.info('finding MinGW %s', '.'.join(str(v) for v in version))
-    log.debug(' - arch: %s', arch)
-    log.debug(' - threading: %s', threading)
-    log.debug(' - exceptions: %s', exceptions)
-    log.debug(' - revision: %s', revision)
-    log.debug(' - url: %s', url)
-
-    # Store each specific revision differently
-    slug = '{version}-{arch}-{threading}-{exceptions}-rev{revision}'
-    slug = slug.format(
-        version = '.'.join(str(v) for v in version),
-        arch = arch,
-        threading = threading,
-        exceptions = exceptions,
-        revision = revision
-    )
-    if arch == 'x86_64':
-        root_dir = os.path.join(location, slug, 'mingw64')
-    elif arch == 'i686':
-        root_dir = os.path.join(location, slug, 'mingw32')
-    else:
-        raise ValueError('Unknown MinGW arch: ' + arch)
-
-    # Download if needed
-    if not os.path.exists(root_dir):
-        downloaded = download(url, os.path.join(location, slug), log = log)
-        if downloaded != root_dir:
-            raise ValueError('The location of mingw did not match\n%s\n%s'
-                % (downloaded, root_dir))
-
-    return root_dir
-
-def str2ver(string):
-    '''
-    Converts a version string into a tuple
-    '''
-    try:
-        version = tuple(int(v) for v in string.split('.'))
-        if len(version) is not 3:
-            raise ValueError()
-    except ValueError:
-        raise argparse.ArgumentTypeError(
-            'please provide a three digit version string')
-    return version
-
-def main():
-    '''
-    Invoked when the script is run directly by the python interpreter
-    '''
-    parser = argparse.ArgumentParser(
-        description = 'Downloads a specific version of MinGW',
-        formatter_class = argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument('--location',
-        help = 'the location to download the compiler to',
-        default = os.path.join(tempfile.gettempdir(), 'mingw-builds'))
-    parser.add_argument('--arch', required = True, choices = ['i686', 'x86_64'],
-        help = 'the target MinGW architecture string')
-    parser.add_argument('--version', type = str2ver,
-        help = 'the version of GCC to download')
-    parser.add_argument('--threading', choices = ['posix', 'win32'],
-        help = 'the threading type of the compiler')
-    parser.add_argument('--exceptions', choices = ['sjlj', 'seh', 'dwarf'],
-        help = 'the method to throw exceptions')
-    parser.add_argument('--revision', type=int,
-        help = 'the revision of the MinGW release')
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument('-v', '--verbose', action='store_true',
-        help='increase the script output verbosity')
-    group.add_argument('-q', '--quiet', action='store_true',
-        help='only print errors and warning')
-    args = parser.parse_args()
-
-    # Create the logger
-    logger = logging.getLogger('mingw')
-    handler = logging.StreamHandler()
-    formatter = logging.Formatter('%(message)s')
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-    logger.setLevel(logging.INFO)
-    if args.quiet:
-        logger.setLevel(logging.WARN)
-    if args.verbose:
-        logger.setLevel(logging.DEBUG)
-
-    # Get MinGW
-    root_dir = root(location = args.location, arch = args.arch,
-        version = args.version, threading = args.threading,
-        exceptions = args.exceptions, revision = args.revision,
-        log = logger)
-
-    sys.stdout.write('%s\n' % os.path.join(root_dir, 'bin'))
-
-if __name__ == '__main__':
-    try:
-        main()
-    except IOError as e:
-        sys.stderr.write('IO error: %s\n' % e)
-        sys.exit(1)
-    except OSError as e:
-        sys.stderr.write('OS error: %s\n' % e)
-        sys.exit(1)
-    except KeyboardInterrupt as e:
-        sys.stderr.write('Killed\n')
-        sys.exit(1)
--- a/third_party/benchmark/pyproject.toml
+++ b/third_party/benchmark/pyproject.toml
@@ -0,0 +1,78 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "google_benchmark"
+description = "A library to benchmark code snippets."
+requires-python = ">=3.10"
+license = { file = "LICENSE" }
+keywords = ["benchmark"]
+
+authors = [{ name = "Google", email = "benchmark-discuss@googlegroups.com" }]
+
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Software Development :: Testing",
+    "Topic :: System :: Benchmark",
+]
+
+dynamic = ["readme", "version"]
+
+dependencies = ["absl-py>=0.7.1"]
+
+[project.optional-dependencies]
+dev = ["pre-commit>=3.3.3"]
+
+[project.urls]
+Homepage = "https://github.com/google/benchmark"
+Documentation = "https://github.com/google/benchmark/tree/main/docs"
+Repository = "https://github.com/google/benchmark.git"
+Discord = "https://discord.gg/cz7UX7wKC2"
+
+[tool.setuptools]
+package-dir = { "" = "bindings/python" }
+zip-safe = false
+
+[tool.setuptools.packages.find]
+where = ["bindings/python"]
+
+[tool.setuptools.dynamic]
+readme = { file = "README.md", content-type = "text/markdown" }
+version = { attr = "google_benchmark.__version__" }
+
+[tool.mypy]
+check_untyped_defs = true
+disallow_incomplete_defs = true
+pretty = true
+python_version = "3.11"
+strict_optional = false
+warn_unreachable = true
+
+[[tool.mypy.overrides]]
+module = ["yaml"]
+ignore_missing_imports = true
+
+[tool.ruff]
+# explicitly tell ruff the source directory to correctly identify first-party package.
+src = ["bindings/python"]
+
+line-length = 80
+target-version = "py311"
+
+[tool.ruff.lint]
+# Enable pycodestyle (`E`, `W`), Pyflakes (`F`), and isort (`I`), plus the additional rule sets listed in `select`.
+select = ["ASYNC", "B", "C4", "C90", "E", "F", "I", "PERF", "PIE", "PT018", "RUF", "SIM", "UP", "W"]
+ignore = [
+    "PLW2901",  # redefined-loop-name
+    "UP031",    # printf-string-formatting
+]
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
--- a/third_party/benchmark/releasing.md
+++ b/third_party/benchmark/releasing.md
@@ -1,16 +0,0 @@
-# How to release
-
-* Make sure you're on master and synced to HEAD
-* Ensure the project builds and tests run (sanity check only, obviously)
-    * `parallel -j0 exec ::: test/*_test` can help ensure everything at least
-      passes
-* Prepare release notes
-    * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
-      commits between the last annotated tag and HEAD
-    * Pick the most interesting.
-* Create a release through github's interface
-    * Note this will create a lightweight tag.
-    * Update this to an annotated tag:
-      * `git pull --tags`
-      * `git tag -a -f <tag> <tag>`
-      * `git push --force origin`
--- a/third_party/benchmark/setup.py
+++ b/third_party/benchmark/setup.py
@@ -0,0 +1,166 @@
+import contextlib
+import os
+import platform
+import re
+import shutil
+import sys
+from collections.abc import Generator
+from pathlib import Path
+from typing import Any
+
+import setuptools
+from setuptools.command import build_ext
+
+IS_WINDOWS = platform.system() == "Windows"
+IS_MAC = platform.system() == "Darwin"
+IS_LINUX = platform.system() == "Linux"
+
+# hardcoded SABI-related options. Requires that each Python interpreter
+# (hermetic or not) participating is of the same major-minor version.
+py_limited_api = sys.version_info >= (3, 12)
+options = {"bdist_wheel": {"py_limited_api": "cp312"}} if py_limited_api else {}
+
+
+def is_cibuildwheel() -> bool:
+    return os.getenv("CIBUILDWHEEL") is not None
+
+
+@contextlib.contextmanager
+def _maybe_patch_toolchains() -> Generator[None, None, None]:
+    """
+    Patch rules_python toolchains to ignore root user error
+    when run in a Docker container on Linux in cibuildwheel.
+    """
+
+    def fmt_toolchain_args(matchobj):
+        suffix = "ignore_root_user_error = True"
+        callargs = matchobj.group(1)
+        # toolchain def is broken over multiple lines
+        if callargs.endswith("\n"):
+            callargs = callargs + "    " + suffix + ",\n"
+        # toolchain def is on one line.
+        else:
+            callargs = callargs + ", " + suffix
+        return "python.toolchain(" + callargs + ")"
+
+    CIBW_LINUX = is_cibuildwheel() and IS_LINUX
+    module_bazel = Path("MODULE.bazel")
+    content: str = module_bazel.read_text()
+    try:
+        if CIBW_LINUX:
+            module_bazel.write_text(
+                re.sub(
+                    r"python.toolchain\(([\w\"\s,.=]*)\)",
+                    fmt_toolchain_args,
+                    content,
+                )
+            )
+        yield
+    finally:
+        if CIBW_LINUX:
+            module_bazel.write_text(content)
+
+
+class BazelExtension(setuptools.Extension):
+    """A C/C++ extension that is defined as a Bazel BUILD target."""
+
+    def __init__(self, name: str, bazel_target: str, **kwargs: Any):
+        super().__init__(name=name, sources=[], **kwargs)
+
+        self.bazel_target = bazel_target
+        stripped_target = bazel_target.split("//")[-1]
+        self.relpath, self.target_name = stripped_target.split(":")
+
+
+class BuildBazelExtension(build_ext.build_ext):
+    """A command that runs Bazel to build a C/C++ extension."""
+
+    def run(self):
+        for ext in self.extensions:
+            self.bazel_build(ext)
+        # explicitly call `bazel shutdown` for graceful exit
+        self.spawn(["bazel", "shutdown"])
+
+    def copy_extensions_to_source(self):
+        """
+        Copy generated extensions into the source tree.
+        This is done in the ``bazel_build`` method, so it's not necessary to
+        do again in the `build_ext` base class.
+        """
+
+    def bazel_build(self, ext: BazelExtension) -> None:  # noqa: C901
+        """Runs the bazel build to create the package."""
+        temp_path = Path(self.build_temp)
+
+        # We round to the minor version, which makes rules_python
+        # look up the latest available patch version internally.
+        python_version = "{}.{}".format(*sys.version_info[:2])
+
+        bazel_argv = [
+            "bazel",
+            "run",
+            ext.bazel_target,
+            f"--symlink_prefix={temp_path / 'bazel-'}",
+            f"--compilation_mode={'dbg' if self.debug else 'opt'}",
+            # C++17 is required by nanobind
+            f"--cxxopt={'/std:c++17' if IS_WINDOWS else '-std=c++17'}",
+            f"--@rules_python//python/config_settings:python_version={python_version}",
+        ]
+
+        if ext.py_limited_api:
+            bazel_argv += ["--@nanobind_bazel//:py-limited-api=cp312"]
+
+        if IS_WINDOWS:
+            # Link with python*.lib.
+            for library_dir in self.library_dirs:
+                bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
+        elif IS_MAC:
+            # C++17 needs macOS 10.14 at minimum
+            bazel_argv.append("--macos_minimum_os=10.14")
+
+        with _maybe_patch_toolchains():
+            self.spawn(bazel_argv)
+
+        if IS_WINDOWS:
+            suffix = ".pyd"
+        else:
+            suffix = ".abi3.so" if ext.py_limited_api else ".so"
+
+        # copy the Bazel build artifacts into setuptools' libdir,
+        # from where the wheel is built.
+        pkgname = "google_benchmark"
+        pythonroot = Path("bindings") / "python" / "google_benchmark"
+        srcdir = temp_path / "bazel-bin" / pythonroot
+        libdir = Path(self.build_lib) / pkgname
+        for root, dirs, files in os.walk(srcdir, topdown=True):
+            # exclude runfiles directories and children.
+            dirs[:] = [d for d in dirs if "runfiles" not in d]
+
+            for f in files:
+                fp = Path(f)
+                should_copy = False
+                # we do not want the bare .so file included
+                # when building for ABI3, so we require a
+                # full and exact match on the file extension.
+                if "".join(fp.suffixes) == suffix or fp.suffix == ".pyi":
+                    should_copy = True
+                elif Path(root) == srcdir and f == "py.typed":
+                    # copy py.typed, but only at the package root.
+                    should_copy = True
+
+                if should_copy:
+                    shutil.copyfile(root / fp, libdir / fp)
+
+
+setuptools.setup(
+    cmdclass={"build_ext": BuildBazelExtension},
+    package_data={"google_benchmark": ["py.typed", "*.pyi"]},
+    ext_modules=[
+        BazelExtension(
+            name="google_benchmark._benchmark",
+            bazel_target="//bindings/python/google_benchmark:benchmark_stubgen",
+            py_limited_api=py_limited_api,
+        )
+    ],
+    options=options,
+)
--- a/third_party/benchmark/src/CMakeLists.txt
+++ b/third_party/benchmark/src/CMakeLists.txt
@@ -1,4 +1,5 @@
-# Allow the source files to find headers in src/
+# Allow the source files to find headers in src/
+include(GNUInstallDirs)
 include_directories(${PROJECT_SOURCE_DIR}/src)

 if (DEFINED BENCHMARK_CXX_LINKER_FLAGS)
@@ -17,92 +18,166 @@ foreach(item ${BENCHMARK_MAIN})
 endforeach()

 add_library(benchmark ${SOURCE_FILES})
+add_library(benchmark::benchmark ALIAS benchmark)
 set_target_properties(benchmark PROPERTIES
  OUTPUT_NAME "benchmark"
  VERSION ${GENERIC_LIB_VERSION}
  SOVERSION ${GENERIC_LIB_SOVERSION}
 )
 target_include_directories(benchmark PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
-    )
+  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
+)
+
+set_property(
+  SOURCE benchmark.cc
+  APPEND
+  PROPERTY COMPILE_DEFINITIONS
+  BENCHMARK_VERSION="${VERSION}"
+)
+
+# libpfm, if available
+if (PFM_FOUND)
+  target_link_libraries(benchmark PRIVATE PFM::libpfm)
+  target_compile_definitions(benchmark PRIVATE -DHAVE_LIBPFM)
+  install(
+      FILES "${PROJECT_SOURCE_DIR}/cmake/Modules/FindPFM.cmake"
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
+endif()
+
+# pthread affinity, if available
+if(HAVE_PTHREAD_AFFINITY)
+  target_compile_definitions(benchmark PRIVATE -DBENCHMARK_HAS_PTHREAD_AFFINITY)
+endif()

 # Link threads.
-target_link_libraries(benchmark  ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-find_library(LIBRT rt)
-if(LIBRT)
-  target_link_libraries(benchmark ${LIBRT})
-endif()
+target_link_libraries(benchmark PRIVATE Threads::Threads)
+
+target_link_libraries(benchmark PRIVATE ${BENCHMARK_CXX_LIBRARIES})
+
+if(HAVE_LIB_RT)
+  target_link_libraries(benchmark PRIVATE rt)
+endif(HAVE_LIB_RT)
+

 # We need extra libraries on Windows
 if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
-  target_link_libraries(benchmark Shlwapi)
+  target_link_libraries(benchmark PRIVATE shlwapi)
 endif()

 # We need extra libraries on Solaris
 if(${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
-  target_link_libraries(benchmark kstat)
+  target_link_libraries(benchmark PRIVATE kstat)
+  set(BENCHMARK_PRIVATE_LINK_LIBRARIES -lkstat)
+endif()
+
+if (NOT BUILD_SHARED_LIBS)
+  target_compile_definitions(benchmark PUBLIC -DBENCHMARK_STATIC_DEFINE)
 endif()

 # Benchmark main library
 add_library(benchmark_main "benchmark_main.cc")
+add_library(benchmark::benchmark_main ALIAS benchmark_main)
 set_target_properties(benchmark_main PROPERTIES
  OUTPUT_NAME "benchmark_main"
  VERSION ${GENERIC_LIB_VERSION}
  SOVERSION ${GENERIC_LIB_SOVERSION}
+  DEFINE_SYMBOL benchmark_EXPORTS
 )
-target_include_directories(benchmark PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
-    )
-target_link_libraries(benchmark_main benchmark)
+target_link_libraries(benchmark_main PUBLIC benchmark::benchmark)

-set(include_install_dir "include")
-set(lib_install_dir "lib/")
-set(bin_install_dir "bin/")
-set(config_install_dir "lib/cmake/${PROJECT_NAME}")
-set(pkgconfig_install_dir "lib/pkgconfig")
-
-set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
+set(generated_dir "${PROJECT_BINARY_DIR}")

 set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
 set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
 set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc")
+set(pkg_config_main "${generated_dir}/${PROJECT_NAME}_main.pc")
+set(targets_to_export benchmark benchmark_main)
 set(targets_export_name "${PROJECT_NAME}Targets")

 set(namespace "${PROJECT_NAME}::")

 include(CMakePackageConfigHelpers)
+
+configure_package_config_file (
+  ${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in
+  ${project_config}
+  INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
+  NO_SET_AND_CHECK_MACRO
+  NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
 write_basic_package_version_file(
  "${version_config}" VERSION ${GENERIC_LIB_VERSION} COMPATIBILITY SameMajorVersion
 )

-configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY)
 configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY)
+configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark_main.pc.in" "${pkg_config_main}" @ONLY)
+
+export (
+  TARGETS ${targets_to_export}
+  NAMESPACE "${namespace}"
+  FILE ${generated_dir}/${targets_export_name}.cmake
+)

 if (BENCHMARK_ENABLE_INSTALL)
  # Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
  install(
-    TARGETS benchmark benchmark_main
+    TARGETS ${targets_to_export}
    EXPORT ${targets_export_name}
-    ARCHIVE DESTINATION ${lib_install_dir}
-    LIBRARY DESTINATION ${lib_install_dir}
-    RUNTIME DESTINATION ${bin_install_dir}
-    INCLUDES DESTINATION ${include_install_dir})
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

  install(
    DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
-    DESTINATION ${include_install_dir}
+              "${PROJECT_BINARY_DIR}/include/benchmark"
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
    FILES_MATCHING PATTERN "*.*h")

  install(
      FILES "${project_config}" "${version_config}"
-      DESTINATION "${config_install_dir}")
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")

  install(
-      FILES "${pkg_config}"
-      DESTINATION "${pkgconfig_install_dir}")
+      FILES "${pkg_config}" "${pkg_config_main}"
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")

  install(
      EXPORT "${targets_export_name}"
      NAMESPACE "${namespace}"
-      DESTINATION "${config_install_dir}")
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
+endif()
+
+if (BENCHMARK_ENABLE_DOXYGEN)
+  find_package(Doxygen REQUIRED)
+  set(DOXYGEN_QUIET YES)
+  set(DOXYGEN_RECURSIVE YES)
+  set(DOXYGEN_GENERATE_HTML YES)
+  set(DOXYGEN_GENERATE_MAN NO)
+  set(DOXYGEN_MARKDOWN_SUPPORT YES)
+  set(DOXYGEN_BUILTIN_STL_SUPPORT YES)
+  set(DOXYGEN_EXTRACT_PACKAGE YES)
+  set(DOXYGEN_EXTRACT_STATIC YES)
+  set(DOXYGEN_SHOW_INCLUDE_FILES YES)
+  set(DOXYGEN_BINARY_TOC YES)
+  set(DOXYGEN_TOC_EXPAND YES)
+  set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "index.md")
+  doxygen_add_docs(benchmark_doxygen
+    docs
+    include
+    src
+    ALL
+    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+    COMMENT "Building documentation with Doxygen.")
+  if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS)
+    install(
+      DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/html/"
+      DESTINATION ${CMAKE_INSTALL_DOCDIR})
+  endif()
+else()
+  if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS)
+    install(
+      DIRECTORY "${PROJECT_SOURCE_DIR}/docs/"
+      DESTINATION ${CMAKE_INSTALL_DOCDIR})
+  endif()
 endif()
--- a/third_party/benchmark/src/benchmark.cc
+++ b/third_party/benchmark/src/benchmark.cc
--- a/third_party/benchmark/src/benchmark_api_internal.cc
+++ b/third_party/benchmark/src/benchmark_api_internal.cc
@@ -1,15 +1,118 @@
 #include "benchmark_api_internal.h"

+#include <cinttypes>
+
+#include "string_util.h"
+
 namespace benchmark {
 namespace internal {

+BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
+                                     int per_family_instance_idx,
+                                     const std::vector<int64_t>& args,
+                                     int thread_count)
+    : benchmark_(*benchmark),
+      family_index_(family_idx),
+      per_family_instance_index_(per_family_instance_idx),
+      aggregation_report_mode_(benchmark_.aggregation_report_mode_),
+      args_(args),
+      time_unit_(benchmark_.GetTimeUnit()),
+      measure_process_cpu_time_(benchmark_.measure_process_cpu_time_),
+      use_real_time_(benchmark_.use_real_time_),
+      use_manual_time_(benchmark_.use_manual_time_),
+      complexity_(benchmark_.complexity_),
+      complexity_lambda_(benchmark_.complexity_lambda_),
+      statistics_(benchmark_.statistics_),
+      repetitions_(benchmark_.repetitions_),
+      min_time_(benchmark_.min_time_),
+      min_warmup_time_(benchmark_.min_warmup_time_),
+      iterations_(benchmark_.iterations_),
+      threads_(thread_count),
+      setup_(benchmark_.setup_),
+      teardown_(benchmark_.teardown_) {
+  name_.function_name = benchmark_.name_;
+
+  size_t arg_i = 0;
+  for (const auto& arg : args) {
+    if (!name_.args.empty()) {
+      name_.args += '/';
+    }
+
+    if (arg_i < benchmark->arg_names_.size()) {
+      const auto& arg_name = benchmark_.arg_names_[arg_i];
+      if (!arg_name.empty()) {
+        name_.args += StrFormat("%s:", arg_name.c_str());
+      }
+    }
+
+    name_.args += StrFormat("%" PRId64, arg);
+    ++arg_i;
+  }
+
+  if (!IsZero(benchmark->min_time_)) {
+    name_.min_time = StrFormat("min_time:%0.3f", benchmark_.min_time_);
+  }
+
+  if (!IsZero(benchmark->min_warmup_time_)) {
+    name_.min_warmup_time =
+        StrFormat("min_warmup_time:%0.3f", benchmark_.min_warmup_time_);
+  }
+
+  if (benchmark_.iterations_ != 0) {
+    name_.iterations = StrFormat(
+        "iterations:%lu", static_cast<unsigned long>(benchmark_.iterations_));
+  }
+
+  if (benchmark_.repetitions_ != 0) {
+    name_.repetitions = StrFormat("repeats:%d", benchmark_.repetitions_);
+  }
+
+  if (benchmark_.measure_process_cpu_time_) {
+    name_.time_type = "process_time";
+  }
+
+  if (benchmark_.use_manual_time_) {
+    if (!name_.time_type.empty()) {
+      name_.time_type += '/';
+    }
+    name_.time_type += "manual_time";
+  } else if (benchmark_.use_real_time_) {
+    if (!name_.time_type.empty()) {
+      name_.time_type += '/';
+    }
+    name_.time_type += "real_time";
+  }
+
+  if (!benchmark_.thread_counts_.empty()) {
+    name_.threads = StrFormat("threads:%d", threads_);
+  }
+}
+
 State BenchmarkInstance::Run(
-    size_t iters, int thread_id, internal::ThreadTimer* timer,
-    internal::ThreadManager* manager) const {
-  State st(iters, arg, thread_id, threads, timer, manager);
-  benchmark->Run(st);
+    IterationCount iters, int thread_id, internal::ThreadTimer* timer,
+    internal::ThreadManager* manager,
+    internal::PerfCountersMeasurement* perf_counters_measurement,
+    ProfilerManager* profiler_manager) const {
+  State st(name_.function_name, iters, args_, thread_id, threads_, timer,
+           manager, perf_counters_measurement, profiler_manager);
+  benchmark_.Run(st);
  return st;
 }

-}  // internal
-}  // benchmark
+void BenchmarkInstance::Setup() const {
+  if (setup_ != nullptr) {
+    State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
+             nullptr, nullptr, nullptr, nullptr);
+    setup_(st);
+  }
+}
+
+void BenchmarkInstance::Teardown() const {
+  if (teardown_ != nullptr) {
+    State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
+             nullptr, nullptr, nullptr, nullptr);
+    teardown_(st);
+  }
+}
+}  // namespace internal
+}  // namespace benchmark
--- a/third_party/benchmark/src/benchmark_api_internal.h
+++ b/third_party/benchmark/src/benchmark_api_internal.h
@@ -1,9 +1,6 @@
 #ifndef BENCHMARK_API_INTERNAL_H
 #define BENCHMARK_API_INTERNAL_H

-#include "benchmark/benchmark.h"
-#include "commandlineflags.h"
-
 #include <cmath>
 #include <iosfwd>
 #include <limits>
@@ -11,31 +8,71 @@
 #include <string>
 #include <vector>

+#include "benchmark/benchmark.h"
+#include "commandlineflags.h"
+
 namespace benchmark {
 namespace internal {

 // Information kept per benchmark we may want to run
-struct BenchmarkInstance {
-  std::string name;
-  Benchmark* benchmark;
-  AggregationReportMode aggregation_report_mode;
-  std::vector<int64_t> arg;
-  TimeUnit time_unit;
-  int range_multiplier;
-  bool use_real_time;
-  bool use_manual_time;
-  BigO complexity;
-  BigOFunc* complexity_lambda;
-  UserCounters counters;
-  const std::vector<Statistics>* statistics;
-  bool last_benchmark_instance;
-  int repetitions;
-  double min_time;
-  size_t iterations;
-  int threads;  // Number of concurrent threads to us
+class BenchmarkInstance {
+ public:
+  BenchmarkInstance(Benchmark* benchmark, int family_idx,
+                    int per_family_instance_idx,
+                    const std::vector<int64_t>& args, int thread_count);

-  State Run(size_t iters, int thread_id, internal::ThreadTimer* timer,
-            internal::ThreadManager* manager) const;
+  const BenchmarkName& name() const { return name_; }
+  int family_index() const { return family_index_; }
+  int per_family_instance_index() const { return per_family_instance_index_; }
+  AggregationReportMode aggregation_report_mode() const {
+    return aggregation_report_mode_;
+  }
+  TimeUnit time_unit() const { return time_unit_; }
+  bool measure_process_cpu_time() const { return measure_process_cpu_time_; }
+  bool use_real_time() const { return use_real_time_; }
+  bool use_manual_time() const { return use_manual_time_; }
+  BigO complexity() const { return complexity_; }
+  BigOFunc* complexity_lambda() const { return complexity_lambda_; }
+  const std::vector<Statistics>& statistics() const { return statistics_; }
+  int repetitions() const { return repetitions_; }
+  double min_time() const { return min_time_; }
+  double min_warmup_time() const { return min_warmup_time_; }
+  IterationCount iterations() const { return iterations_; }
+  int threads() const { return threads_; }
+  void Setup() const;
+  void Teardown() const;
+  const auto& GetUserThreadRunnerFactory() const {
+    return benchmark_.threadrunner_;
+  }
+
+  State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
+            internal::ThreadManager* manager,
+            internal::PerfCountersMeasurement* perf_counters_measurement,
+            ProfilerManager* profiler_manager) const;
+
+ private:
+  BenchmarkName name_;
+  Benchmark& benchmark_;
+  const int family_index_;
+  const int per_family_instance_index_;
+  AggregationReportMode aggregation_report_mode_;
+  const std::vector<int64_t>& args_;
+  TimeUnit time_unit_;
+  bool measure_process_cpu_time_;
+  bool use_real_time_;
+  bool use_manual_time_;
+  BigO complexity_;
+  BigOFunc* complexity_lambda_;
+  UserCounters counters_;
+  const std::vector<Statistics>& statistics_;
+  int repetitions_;
+  double min_time_;
+  double min_warmup_time_;
+  IterationCount iterations_;
+  int threads_;  // Number of concurrent threads to use
+
+  callback_function setup_;
+  callback_function teardown_;
 };

 bool FindBenchmarksInternal(const std::string& re,
@@ -44,6 +81,7 @@ bool FindBenchmarksInternal(const std::string& re,

 bool IsZero(double n);

+BENCHMARK_EXPORT
 ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false);

 }  // end namespace internal
--- a/third_party/benchmark/src/benchmark_main.cc
+++ b/third_party/benchmark/src/benchmark_main.cc
@@ -14,4 +14,5 @@

 #include "benchmark/benchmark.h"

+BENCHMARK_EXPORT int main(int /*argc*/, char** /*argv*/);
 BENCHMARK_MAIN();
--- a/third_party/benchmark/src/benchmark_name.cc
+++ b/third_party/benchmark/src/benchmark_name.cc
@@ -0,0 +1,59 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <benchmark/benchmark.h>
+
+namespace benchmark {
+
+namespace {
+
+// Compute the total size of a pack of std::strings
+size_t size_impl() { return 0; }
+
+template <typename Head, typename... Tail>
+size_t size_impl(const Head& head, const Tail&... tail) {
+  return head.size() + size_impl(tail...);
+}
+
+// Join a pack of std::strings using a delimiter
+// TODO(dominic): use absl::StrJoin
+void join_impl(std::string& /*unused*/, char /*unused*/) {}
+
+template <typename Head, typename... Tail>
+void join_impl(std::string& s, const char delimiter, const Head& head,
+               const Tail&... tail) {
+  if (!s.empty() && !head.empty()) {
+    s += delimiter;
+  }
+
+  s += head;
+
+  join_impl(s, delimiter, tail...);
+}
+
+template <typename... Ts>
+std::string join(char delimiter, const Ts&... ts) {
+  std::string s;
+  s.reserve(sizeof...(Ts) + size_impl(ts...));
+  join_impl(s, delimiter, ts...);
+  return s;
+}
+}  // namespace
+
+BENCHMARK_EXPORT
+std::string BenchmarkName::str() const {
+  return join('/', function_name, args, min_time, min_warmup_time, iterations,
+              repetitions, time_type, threads);
+}
+}  // namespace benchmark
--- a/third_party/benchmark/src/benchmark_register.cc
+++ b/third_party/benchmark/src/benchmark_register.cc
@@ -15,7 +15,7 @@
 #include "benchmark_register.h"

 #ifndef BENCHMARK_OS_WINDOWS
-#ifndef BENCHMARK_OS_FUCHSIA
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
 #include <sys/resource.h>
 #endif
 #include <sys/time.h>
@@ -24,6 +24,7 @@

 #include <algorithm>
 #include <atomic>
+#include <cinttypes>
 #include <condition_variable>
 #include <cstdio>
 #include <cstdlib>
@@ -31,6 +32,7 @@
 #include <fstream>
 #include <iostream>
 #include <memory>
+#include <numeric>
 #include <sstream>
 #include <thread>

@@ -51,10 +53,13 @@ namespace benchmark {

 namespace {
 // For non-dense Range, intermediate values are powers of kRangeMultiplier.
-static const int kRangeMultiplier = 8;
+constexpr int kRangeMultiplier = 8;
+
 // The size of a benchmark family determines is the number of inputs to repeat
 // the benchmark on. If this is "large" then warn the user during configuration.
-static const size_t kMaxFamilySize = 100;
+constexpr size_t kMaxFamilySize = 100;
+
+constexpr char kDisabledPrefix[] = "DISABLED_";
 }  // end namespace

 namespace internal {
@@ -77,7 +82,7 @@ class BenchmarkFamilies {

  // Extract the list of benchmark instances that match the specified
  // regular expression.
-  bool FindBenchmarks(std::string re,
+  bool FindBenchmarks(std::string spec,
                      std::vector<BenchmarkInstance>* benchmarks,
                      std::ostream* Err);

@@ -109,28 +114,35 @@ void BenchmarkFamilies::ClearBenchmarks() {
 bool BenchmarkFamilies::FindBenchmarks(
    std::string spec, std::vector<BenchmarkInstance>* benchmarks,
    std::ostream* ErrStream) {
-  CHECK(ErrStream);
+  BM_CHECK(ErrStream);
  auto& Err = *ErrStream;
  // Make regular expression out of command-line flag
  std::string error_msg;
  Regex re;
-  bool isNegativeFilter = false;
+  bool is_negative_filter = false;
  if (spec[0] == '-') {
    spec.replace(0, 1, "");
-    isNegativeFilter = true;
+    is_negative_filter = true;
  }
  if (!re.Init(spec, &error_msg)) {
-    Err << "Could not compile benchmark re: " << error_msg << std::endl;
+    Err << "Could not compile benchmark re: " << error_msg << '\n';
    return false;
  }

  // Special list of thread counts to use when none are specified
  const std::vector<int> one_thread = {1};

+  int next_family_index = 0;
+
  MutexLock l(mutex_);
  for (std::unique_ptr<Benchmark>& family : families_) {
+    int family_index = next_family_index;
+    int per_family_instance_index = 0;
+
    // Family was deleted or benchmark doesn't match
-    if (!family) continue;
+    if (!family) {
+      continue;
+    }

    if (family->ArgsCnt() == -1) {
      family->Args({});
@@ -147,67 +159,31 @@ bool BenchmarkFamilies::FindBenchmarks(
          << " will be repeated at least " << family_size << " times.\n";
    }
    // reserve in the special case the regex ".", since we know the final
-    // family size.
-    if (spec == ".") benchmarks->reserve(family_size);
+    // family size. This doesn't take into account any disabled benchmarks,
+    // so in the worst case we reserve more than we need.
+    if (spec == ".") {
+      benchmarks->reserve(benchmarks->size() + family_size);
+    }

    for (auto const& args : family->args_) {
      for (int num_threads : *thread_counts) {
-        BenchmarkInstance instance;
-        instance.name = family->name_;
-        instance.benchmark = family.get();
-        instance.aggregation_report_mode = family->aggregation_report_mode_;
-        instance.arg = args;
-        instance.time_unit = family->time_unit_;
-        instance.range_multiplier = family->range_multiplier_;
-        instance.min_time = family->min_time_;
-        instance.iterations = family->iterations_;
-        instance.repetitions = family->repetitions_;
-        instance.use_real_time = family->use_real_time_;
-        instance.use_manual_time = family->use_manual_time_;
-        instance.complexity = family->complexity_;
-        instance.complexity_lambda = family->complexity_lambda_;
-        instance.statistics = &family->statistics_;
-        instance.threads = num_threads;
+        BenchmarkInstance instance(family.get(), family_index,
+                                   per_family_instance_index, args,
+                                   num_threads);

-        // Add arguments to instance name
-        size_t arg_i = 0;
-        for (auto const& arg : args) {
-          instance.name += "/";
-
-          if (arg_i < family->arg_names_.size()) {
-            const auto& arg_name = family->arg_names_[arg_i];
-            if (!arg_name.empty()) {
-              instance.name +=
-                  StrFormat("%s:", family->arg_names_[arg_i].c_str());
-            }
-          }
-
-          instance.name += StrFormat("%d", arg);
-          ++arg_i;
-        }
-
-        if (!IsZero(family->min_time_))
-          instance.name += StrFormat("/min_time:%0.3f", family->min_time_);
-        if (family->iterations_ != 0)
-          instance.name += StrFormat("/iterations:%d", family->iterations_);
-        if (family->repetitions_ != 0)
-          instance.name += StrFormat("/repeats:%d", family->repetitions_);
-
-        if (family->use_manual_time_) {
-          instance.name += "/manual_time";
-        } else if (family->use_real_time_) {
-          instance.name += "/real_time";
-        }
-
-        // Add the number of threads used to the name
-        if (!family->thread_counts_.empty()) {
-          instance.name += StrFormat("/threads:%d", instance.threads);
-        }
-
-        if ((re.Match(instance.name) && !isNegativeFilter) ||
-            (!re.Match(instance.name) && isNegativeFilter)) {
-          instance.last_benchmark_instance = (&args == &family->args_.back());
+        const auto full_name = instance.name().str();
+        if (full_name.rfind(kDisabledPrefix, 0) != 0 &&
+            ((re.Match(full_name) && !is_negative_filter) ||
+             (!re.Match(full_name) && is_negative_filter))) {
          benchmarks->push_back(std::move(instance));
+
+          ++per_family_instance_index;
+
+          // Only bump the next family index once we've established that
+          // at least one instance of this family will be run.
+          if (next_family_index == family_index) {
+            ++next_family_index;
+          }
        }
      }
    }
@@ -215,11 +191,11 @@ bool BenchmarkFamilies::FindBenchmarks(
  return true;
 }

-Benchmark* RegisterBenchmarkInternal(Benchmark* bench) {
-  std::unique_ptr<Benchmark> bench_ptr(bench);
+Benchmark* RegisterBenchmarkInternal(std::unique_ptr<Benchmark> bench) {
+  Benchmark* bench_ptr = bench.get();
  BenchmarkFamilies* families = BenchmarkFamilies::GetInstance();
-  families->AddBenchmark(std::move(bench_ptr));
-  return bench;
+  families->AddBenchmark(std::move(bench));
+  return bench_ptr;
 }

 // FIXME: This function is a hack so that benchmark.cc can access
@@ -234,14 +210,17 @@ bool FindBenchmarksInternal(const std::string& re,
 //                               Benchmark
 //=============================================================================//

-Benchmark::Benchmark(const char* name)
+Benchmark::Benchmark(const std::string& name)
    : name_(name),
      aggregation_report_mode_(ARM_Unspecified),
-      time_unit_(kNanosecond),
+      time_unit_(GetDefaultTimeUnit()),
+      use_default_time_unit_(true),
      range_multiplier_(kRangeMultiplier),
      min_time_(0),
+      min_warmup_time_(0),
      iterations_(0),
      repetitions_(0),
+      measure_process_cpu_time_(false),
      use_real_time_(false),
      use_manual_time_(false),
      complexity_(oNone),
@@ -249,23 +228,30 @@ Benchmark::Benchmark(const char* name)
  ComputeStatistics("mean", StatisticsMean);
  ComputeStatistics("median", StatisticsMedian);
  ComputeStatistics("stddev", StatisticsStdDev);
+  ComputeStatistics("cv", StatisticsCV, kPercentage);
 }

 Benchmark::~Benchmark() {}

+Benchmark* Benchmark::Name(const std::string& name) {
+  SetName(name);
+  return this;
+}
+
 Benchmark* Benchmark::Arg(int64_t x) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
  args_.push_back({x});
  return this;
 }

 Benchmark* Benchmark::Unit(TimeUnit unit) {
  time_unit_ = unit;
+  use_default_time_unit_ = false;
  return this;
 }

 Benchmark* Benchmark::Range(int64_t start, int64_t limit) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
  std::vector<int64_t> arglist;
  AddRange(&arglist, start, limit, range_multiplier_);

@@ -277,54 +263,61 @@ Benchmark* Benchmark::Range(int64_t start, int64_t limit) {

 Benchmark* Benchmark::Ranges(
    const std::vector<std::pair<int64_t, int64_t>>& ranges) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
  std::vector<std::vector<int64_t>> arglists(ranges.size());
-  std::size_t total = 1;
  for (std::size_t i = 0; i < ranges.size(); i++) {
    AddRange(&arglists[i], ranges[i].first, ranges[i].second,
             range_multiplier_);
-    total *= arglists[i].size();
  }

-  std::vector<std::size_t> ctr(arglists.size(), 0);
+  ArgsProduct(arglists);

+  return this;
+}
+
+Benchmark* Benchmark::ArgsProduct(
+    const std::vector<std::vector<int64_t>>& arglists) {
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(arglists.size()));
+
+  std::vector<std::size_t> indices(arglists.size());
+  const std::size_t total = std::accumulate(
+      std::begin(arglists), std::end(arglists), std::size_t{1},
+      [](const std::size_t res, const std::vector<int64_t>& arglist) {
+        return res * arglist.size();
+      });
+  std::vector<int64_t> args;
+  args.reserve(arglists.size());
  for (std::size_t i = 0; i < total; i++) {
-    std::vector<int64_t> tmp;
-    tmp.reserve(arglists.size());
-
-    for (std::size_t j = 0; j < arglists.size(); j++) {
-      tmp.push_back(arglists[j].at(ctr[j]));
+    for (std::size_t arg = 0; arg < arglists.size(); arg++) {
+      args.push_back(arglists[arg][indices[arg]]);
    }
+    args_.push_back(args);
+    args.clear();

-    args_.push_back(std::move(tmp));
-
-    for (std::size_t j = 0; j < arglists.size(); j++) {
-      if (ctr[j] + 1 < arglists[j].size()) {
-        ++ctr[j];
-        break;
-      }
-      ctr[j] = 0;
-    }
+    std::size_t arg = 0;
+    do {
+      indices[arg] = (indices[arg] + 1) % arglists[arg].size();
+    } while (indices[arg++] == 0 && arg < arglists.size());
  }
+
  return this;
 }

 Benchmark* Benchmark::ArgName(const std::string& name) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
  arg_names_ = {name};
  return this;
 }

 Benchmark* Benchmark::ArgNames(const std::vector<std::string>& names) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
  arg_names_ = names;
  return this;
 }

 Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
-  CHECK_GE(start, 0);
-  CHECK_LE(start, limit);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK_LE(start, limit);
  for (int64_t arg = start; arg <= limit; arg += step) {
    args_.push_back({arg});
  }
@@ -332,7 +325,7 @@ Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
 }

 Benchmark* Benchmark::Args(const std::vector<int64_t>& args) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
  args_.push_back(args);
  return this;
 }
@@ -342,28 +335,60 @@ Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
  return this;
 }

+Benchmark* Benchmark::Setup(callback_function&& setup) {
+  BM_CHECK(setup != nullptr);
+  setup_ = std::forward<callback_function>(setup);
+  return this;
+}
+
+Benchmark* Benchmark::Setup(const callback_function& setup) {
+  BM_CHECK(setup != nullptr);
+  setup_ = setup;
+  return this;
+}
+
+Benchmark* Benchmark::Teardown(callback_function&& teardown) {
+  BM_CHECK(teardown != nullptr);
+  teardown_ = std::forward<callback_function>(teardown);
+  return this;
+}
+
+Benchmark* Benchmark::Teardown(const callback_function& teardown) {
+  BM_CHECK(teardown != nullptr);
+  teardown_ = teardown;
+  return this;
+}
+
 Benchmark* Benchmark::RangeMultiplier(int multiplier) {
-  CHECK(multiplier > 1);
+  BM_CHECK(multiplier > 1);
  range_multiplier_ = multiplier;
  return this;
 }

 Benchmark* Benchmark::MinTime(double t) {
-  CHECK(t > 0.0);
-  CHECK(iterations_ == 0);
+  BM_CHECK(t > 0.0);
+  BM_CHECK(iterations_ == 0);
  min_time_ = t;
  return this;
 }

-Benchmark* Benchmark::Iterations(size_t n) {
-  CHECK(n > 0);
-  CHECK(IsZero(min_time_));
+Benchmark* Benchmark::MinWarmUpTime(double t) {
+  BM_CHECK(t >= 0.0);
+  BM_CHECK(iterations_ == 0);
+  min_warmup_time_ = t;
+  return this;
+}
+
+Benchmark* Benchmark::Iterations(IterationCount n) {
+  BM_CHECK(n > 0);
+  BM_CHECK(IsZero(min_time_));
+  BM_CHECK(IsZero(min_warmup_time_));
  iterations_ = n;
  return this;
 }

 Benchmark* Benchmark::Repetitions(int n) {
-  CHECK(n > 0);
+  BM_CHECK(n > 0);
  repetitions_ = n;
  return this;
 }
@@ -389,15 +414,21 @@ Benchmark* Benchmark::DisplayAggregatesOnly(bool value) {
  return this;
 }

+Benchmark* Benchmark::MeasureProcessCPUTime() {
+  // Can be used together with UseRealTime() / UseManualTime().
+  measure_process_cpu_time_ = true;
+  return this;
+}
+
 Benchmark* Benchmark::UseRealTime() {
-  CHECK(!use_manual_time_)
+  BM_CHECK(!use_manual_time_)
      << "Cannot set UseRealTime and UseManualTime simultaneously.";
  use_real_time_ = true;
  return this;
 }

 Benchmark* Benchmark::UseManualTime() {
-  CHECK(!use_real_time_)
+  BM_CHECK(!use_real_time_)
      << "Cannot set UseRealTime and UseManualTime simultaneously.";
  use_manual_time_ = true;
  return this;
@@ -414,21 +445,22 @@ Benchmark* Benchmark::Complexity(BigOFunc* complexity) {
  return this;
 }

-Benchmark* Benchmark::ComputeStatistics(std::string name,
-                                        StatisticsFunc* statistics) {
-  statistics_.emplace_back(name, statistics);
+Benchmark* Benchmark::ComputeStatistics(const std::string& name,
+                                        StatisticsFunc* statistics,
+                                        StatisticUnit unit) {
+  statistics_.emplace_back(name, statistics, unit);
  return this;
 }

 Benchmark* Benchmark::Threads(int t) {
-  CHECK_GT(t, 0);
+  BM_CHECK_GT(t, 0);
  thread_counts_.push_back(t);
  return this;
 }

 Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
-  CHECK_GT(min_threads, 0);
-  CHECK_GE(max_threads, min_threads);
+  BM_CHECK_GT(min_threads, 0);
+  BM_CHECK_GE(max_threads, min_threads);

  AddRange(&thread_counts_, min_threads, max_threads, 2);
  return this;
@@ -436,9 +468,9 @@ Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {

 Benchmark* Benchmark::DenseThreadRange(int min_threads, int max_threads,
                                       int stride) {
-  CHECK_GT(min_threads, 0);
-  CHECK_GE(max_threads, min_threads);
-  CHECK_GE(stride, 1);
+  BM_CHECK_GT(min_threads, 0);
+  BM_CHECK_GE(max_threads, min_threads);
+  BM_CHECK_GE(stride, 1);

  for (auto i = min_threads; i < max_threads; i += stride) {
    thread_counts_.push_back(i);
@@ -452,16 +484,36 @@ Benchmark* Benchmark::ThreadPerCpu() {
  return this;
 }

-void Benchmark::SetName(const char* name) { name_ = name; }
+Benchmark* Benchmark::ThreadRunner(threadrunner_factory&& factory) {
+  threadrunner_ = std::move(factory);
+  return this;
+}
+
+void Benchmark::SetName(const std::string& name) { name_ = name; }
+
+const char* Benchmark::GetName() const { return name_.c_str(); }

 int Benchmark::ArgsCnt() const {
  if (args_.empty()) {
-    if (arg_names_.empty()) return -1;
+    if (arg_names_.empty()) {
+      return -1;
+    }
    return static_cast<int>(arg_names_.size());
  }
  return static_cast<int>(args_.front().size());
 }

+const char* Benchmark::GetArgName(int arg) const {
+  BM_CHECK_GE(arg, 0);
+  size_t uarg = static_cast<size_t>(arg);
+  BM_CHECK_LT(uarg, arg_names_.size());
+  return arg_names_[uarg].c_str();
+}
+
+TimeUnit Benchmark::GetTimeUnit() const {
+  return use_default_time_unit_ ? GetDefaultTimeUnit() : time_unit_;
+}
+
 //=============================================================================//
 //                            FunctionBenchmark
 //=============================================================================//
@@ -474,4 +526,19 @@ void ClearRegisteredBenchmarks() {
  internal::BenchmarkFamilies::GetInstance()->ClearBenchmarks();
 }

+std::vector<int64_t> CreateRange(int64_t lo, int64_t hi, int multi) {
+  std::vector<int64_t> args;
+  internal::AddRange(&args, lo, hi, multi);
+  return args;
+}
+
+std::vector<int64_t> CreateDenseRange(int64_t start, int64_t limit, int step) {
+  BM_CHECK_LE(start, limit);
+  std::vector<int64_t> args;
+  for (int64_t arg = start; arg <= limit; arg += step) {
+    args.push_back(arg);
+  }
+  return args;
+}
+
 }  // end namespace benchmark
--- a/third_party/benchmark/src/benchmark_register.h
+++ b/third_party/benchmark/src/benchmark_register.h
@@ -1,33 +1,109 @@
 #ifndef BENCHMARK_REGISTER_H
 #define BENCHMARK_REGISTER_H

+#include <algorithm>
+#include <limits>
 #include <vector>

 #include "check.h"

+namespace benchmark {
+namespace internal {
+
+// Append the powers of 'mult' in the closed interval [lo, hi].
+// Returns iterator to the start of the inserted range.
+template <typename T>
+typename std::vector<T>::iterator AddPowers(std::vector<T>* dst, T lo, T hi,
+                                            int mult) {
+  BM_CHECK_GE(lo, 0);
+  BM_CHECK_GE(hi, lo);
+  BM_CHECK_GE(mult, 2);
+
+  const size_t start_offset = dst->size();
+
+  static const T kmax = std::numeric_limits<T>::max();
+
+  // Space out the values in multiples of "mult"
+  for (T i = static_cast<T>(1); i <= hi; i = static_cast<T>(i * mult)) {
+    if (i >= lo) {
+      dst->push_back(i);
+    }
+    // Break the loop here since multiplying by
+    // 'mult' would move outside of the range of T
+    if (i > kmax / mult) break;
+  }
+
+  return dst->begin() + static_cast<int>(start_offset);
+}
+
+template <typename T>
+void AddNegatedPowers(std::vector<T>* dst, T lo, T hi, int mult) {
+  // We negate lo and hi so we require that they cannot be equal to 'min'.
+  BM_CHECK_GT(lo, std::numeric_limits<T>::min());
+  BM_CHECK_GT(hi, std::numeric_limits<T>::min());
+  BM_CHECK_GE(hi, lo);
+  BM_CHECK_LE(hi, 0);
+
+  // Add positive powers, then negate and reverse.
+  // Casts necessary since small integers get promoted
+  // to 'int' when negating.
+  const auto lo_complement = static_cast<T>(-lo);
+  const auto hi_complement = static_cast<T>(-hi);
+
+  const auto it = AddPowers(dst, hi_complement, lo_complement, mult);
+
+  std::for_each(it, dst->end(), [](T& t) { t = static_cast<T>(t * -1); });
+  std::reverse(it, dst->end());
+}
+
 template <typename T>
 void AddRange(std::vector<T>* dst, T lo, T hi, int mult) {
-  CHECK_GE(lo, 0);
-  CHECK_GE(hi, lo);
-  CHECK_GE(mult, 2);
+  static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
+                "Args type must be a signed integer");
+
+  BM_CHECK_GE(hi, lo);
+  BM_CHECK_GE(mult, 2);

  // Add "lo"
  dst->push_back(lo);

-  static const T kmax = std::numeric_limits<T>::max();
+  // Handle lo == hi as a special case, so we then know
+  // lo < hi and so it is safe to add 1 to lo and subtract 1
+  // from hi without falling outside of the range of T.
+  if (lo == hi) return;

-  // Now space out the benchmarks in multiples of "mult"
-  for (T i = 1; i < kmax / mult; i *= mult) {
-    if (i >= hi) break;
-    if (i > lo) {
-      dst->push_back(i);
-    }
+  // Ensure that lo_inner <= hi_inner below.
+  if (lo + 1 == hi) {
+    dst->push_back(hi);
+    return;
  }

-  // Add "hi" (if different from "lo")
-  if (hi != lo) {
+  // Add all powers of 'mult' in the range [lo+1, hi-1] (inclusive).
+  const auto lo_inner = static_cast<T>(lo + 1);
+  const auto hi_inner = static_cast<T>(hi - 1);
+
+  // Insert negative values
+  if (lo_inner < 0) {
+    AddNegatedPowers(dst, lo_inner, std::min(hi_inner, T{-1}), mult);
+  }
+
+  // Treat 0 as a special case (see discussion on #762).
+  if (lo < 0 && hi >= 0) {
+    dst->push_back(0);
+  }
+
+  // Insert positive values
+  if (hi_inner > 0) {
+    AddPowers(dst, std::max(lo_inner, T{1}), hi_inner, mult);
+  }
+
+  // Add "hi" (if different from last value).
+  if (hi != dst->back()) {
    dst->push_back(hi);
  }
 }

+}  // namespace internal
+}  // namespace benchmark
+
 #endif  // BENCHMARK_REGISTER_H
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Run Yu	44ef0af4a4	immediately GetMappedRange after MapAsync still would return null pointer	2025-11-21 16:53:03 -05:00
Run Yu	c9b5bfc4a4	Use a stage pool for WebGPU, but somehow GetMappedRange would sometimes return null, which breaks the code.	2025-11-21 10:57:02 -05:00
Doris Wu	f07176c0a2	Try to fix test_UboBatching (#9448 )	2025-11-20 15:14:19 +08:00
Doris Wu	15db141c7a	Add some unit tests for UboManager (#9446 )	2025-11-20 14:28:13 +08:00
Doris Wu	d4bbb7c591	buffer update opt: Some optimizations (#9438 )	2025-11-20 00:52:04 +00:00
Sungun Park	92e620d2ad	Simplify buffer object creation (#9436 ) Simplify the buffer object creation logic to streamline and help make the future integration of asynchronous features easier.	2025-11-19 22:03:05 +00:00
Mathias Agopian	311104da97	update google benchmark library to 1.9.4 (#9441 ) * benchmark: update README and add update script * update google benchmark library to 1.9.4 * update tnt CMakeLists to match the library new version	2025-11-19 11:50:34 -08:00
Filament Bot	3127632f96	[automated] Updating /docs due to commit `59f611b` Full commit hash is `59f611bfde` DOCS_ALLOW_DIRECT_EDITS	2025-11-19 19:32:56 +00:00