metal: implement memory mapping (#9454 )

fix a typo/bad merge that broke setPresentationTime (#9452 )
FIXES=[462533574]
2025-11-27 11:05:07 +08:00 · 2025-11-21 11:40:17 -08:00 · 2025-11-21 19:33:36 +00:00 · 2025-11-21 00:37:28 +00:00 · 2025-11-20 15:32:02 -08:00 · 2025-11-20 13:23:58 -08:00
211 changed files with 20178 additions and 8020 deletions
--- a/android/filament-utils-android/src/main/java/com/google/android/filament/utils/ModelViewer.kt
+++ b/android/filament-utils-android/src/main/java/com/google/android/filament/utils/ModelViewer.kt
@@ -16,6 +16,9 @@

 package com.google.android.filament.utils

+import android.graphics.Bitmap
+import android.os.Handler
+import android.os.Looper
 import android.view.MotionEvent
 import android.view.Surface
 import android.view.SurfaceView
@@ -26,6 +29,7 @@ import com.google.android.filament.android.UiHelper
 import com.google.android.filament.gltfio.*
 import kotlinx.coroutines.*
 import java.nio.Buffer
+import java.nio.ByteBuffer

 private const val kNearPlane = 0.05f     // 5 cm
 private const val kFarPlane = 1000.0f    // 1 km
@@ -119,6 +123,8 @@ class ModelViewer(
    private val target = DoubleArray(3)
    private val upward = DoubleArray(3)

+    private var debugFrameCallback: ((Bitmap) -> Unit)? = null
+
    init {
        renderer = engine.createRenderer()
        scene = engine.createScene()
@@ -305,10 +311,39 @@ class ModelViewer(
        // Render the scene, unless the renderer wants to skip the frame.
        if (renderer.beginFrame(swapChain!!, frameTimeNanos)) {
            renderer.render(view)
+
+            debugFrameCallback?.let {
+                val viewport = view.viewport
+                val bitmap = Bitmap.createBitmap(viewport.width, viewport.height,
+                        Bitmap.Config.ARGB_8888)
+                val buffer = ByteBuffer.allocateDirect(viewport.width * viewport.height * 4)
+
+                val handler = Handler(Looper.getMainLooper())
+                val pixelBufferDescriptor = Texture.PixelBufferDescriptor(buffer,
+                        Texture.Format.RGBA, Texture.Type.UBYTE, 1, 0, 0, 0, handler) {
+                    buffer.rewind()
+                    bitmap.copyPixelsFromBuffer(buffer)
+                    it(bitmap)
+                }
+                renderer.readPixels(viewport.left, viewport.bottom, viewport.width,
+                                    viewport.height, pixelBufferDescriptor)
+                debugFrameCallback = null
+            }
+
            renderer.endFrame()
        }
    }

+    /**
+     * Sets a callback that will be invoked with the next rendered frame as a [Bitmap]. Note that
+     * this is a one-time callback: it is cleared as soon as the frame's read-back is requested.
+     *
+     * @param callback   callback to be invoked with a rendered frame as [Bitmap]
+     */
+    fun debugGetNextFrameCallback(callback: (Bitmap) -> Unit) {
+        debugFrameCallback = callback
+    }
+
    private fun populateScene(asset: FilamentAsset) {
        val rcm = engine.renderableManager
        var count = 0
--- a/docs/dup/intro.html
+++ b/docs/dup/intro.html
@@ -181,7 +181,7 @@ important for <code>matc</code> (material compiler).</p>
 }

 dependencies {
-    implementation 'com.google.android.filament:filament-android:1.66.2'
+    implementation 'com.google.android.filament:filament-android:1.67.1'
 }
 </code></pre>
 <p>Here are all the libraries available in the group <code>com.google.android.filament</code>:</p>
@@ -196,7 +196,7 @@ dependencies {
 </div>
 <h3 id="ios"><a class="header" href="#ios">iOS</a></h3>
 <p>iOS projects can use CocoaPods to install the latest release:</p>
-<pre><code class="language-shell">pod 'Filament', '~&gt; 1.66.2'
+<pre><code class="language-shell">pod 'Filament', '~&gt; 1.67.1'
 </code></pre>
 <h2 id="documentation"><a class="header" href="#documentation">Documentation</a></h2>
 <ul>
--- a/docs/main/filament.html
+++ b/docs/main/filament.html
--- a/docs/main/materials.html
+++ b/docs/main/materials.html
--- a/docs/searchindex.js
+++ b/docs/searchindex.js
--- a/docs/searchindex.json
+++ b/docs/searchindex.json
--- a/filament/backend/src/AndroidNativeWindow.cpp
+++ b/filament/backend/src/AndroidNativeWindow.cpp
@@ -61,6 +61,16 @@ int NativeWindow::enableFrameTimestamps(ANativeWindow* anw, bool enable) {
    return pWindow->perform(anw, ENABLE_FRAME_TIMESTAMPS, enable);
 }

+int NativeWindow::frameTimestampsSupportsPresent(ANativeWindow* anw, bool* outSupportsPresent) {
+    NativeWindow const* pWindow = reinterpret_cast<NativeWindow const*>(anw);
+    int value = 0;
+    // Returns a status code like the sibling wrappers: 0 on success, non-zero on failure.
+    // NOTE(review): 18 sits next to IS_VALID, a *query* value; confirm perform() is correct.
+    int const status = pWindow->perform(anw, FRAME_TIMESTAMPS_SUPPORTS_PRESENT, &value);
+    if (status == 0) *outSupportsPresent = bool(value); // left untouched when the call fails
+    return status;
+}
+
 int NativeWindow::getCompositorTiming(ANativeWindow* anw,
        int64_t* compositeDeadline, int64_t* compositeInterval,
        int64_t* compositeToPresentLatency) {
--- a/filament/backend/src/AndroidNativeWindow.h
+++ b/filament/backend/src/AndroidNativeWindow.h
@@ -32,6 +32,7 @@ struct NativeWindow {
    // is valid query enum value
    enum {
        IS_VALID                = 17,
+        FRAME_TIMESTAMPS_SUPPORTS_PRESENT = 18,
        GET_NEXT_FRAME_ID       = 24,
        ENABLE_FRAME_TIMESTAMPS = 25,
        GET_COMPOSITOR_TIMING   = 26,
@@ -51,6 +52,7 @@ struct NativeWindow {

    static int getNextFrameId(ANativeWindow* anw, uint64_t* frameId);
    static int enableFrameTimestamps(ANativeWindow* anw, bool enable);
+    static int frameTimestampsSupportsPresent(ANativeWindow* anw, bool* outSupportsPresent);
    static int getCompositorTiming(ANativeWindow* anw,
            int64_t* compositeDeadline, int64_t* compositeInterval,
            int64_t* compositeToPresentLatency);
--- a/filament/backend/src/AndroidSwapChainHelper.cpp
+++ b/filament/backend/src/AndroidSwapChainHelper.cpp
@@ -19,6 +19,9 @@

 #include <android/native_window.h>

+#include <utils/compiler.h>
+#include <utils/Logger.h>
+
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -36,11 +39,23 @@ bool AndroidSwapChainHelper::setPresentFrameId(
    int const status = NativeWindow::getNextFrameId(anw, &sysFrameId);
    if (status == 0) {
        std::lock_guard const lock(mLock);
-        auto const pos = mFrameIdToSystemFrameId.find(frameId);
-        if (pos && *pos != sysFrameId) {
-            // we're trying to associate the same frame id to a different frame!
-            return false;
+        // frameIds must be strictly monotonic. If that's not the case (i.e. the new frameId is
+        // less than or equal to the last one in the map), we have to clear the map, because the
+        // map's find() assumes sorted keys.
+        // This case can happen if two different filament::Renderer are used with the same
+        // ANativeWindow (the Renderer would have different frameIds). This is expected to
+        // be a rare case.
+        if (UTILS_UNLIKELY(!mFrameIdToSystemFrameId.empty() &&
+                frameId <= mFrameIdToSystemFrameId.back().first)) {
+            // this log is expected to happen very rarely
+            DLOG(INFO) << "clearing frame history anw=" << anw
+                    << ", frameId=" << frameId
+                    << ", previous=" << mFrameIdToSystemFrameId.back().first
+                    << ", sysFrameId=" << sysFrameId;
+            // clear the frame history
+            mFrameIdToSystemFrameId.clear();
        }
+
        // oldest entry is removed
        mFrameIdToSystemFrameId.insert(frameId, sysFrameId);
        return true;
--- a/filament/backend/src/Platform.cpp
+++ b/filament/backend/src/Platform.cpp
@@ -146,12 +146,12 @@ void Platform::setBlobFunc(InsertBlobFunc&& insertBlob, RetrieveBlobFunc&& retri

 bool Platform::hasInsertBlobFunc() const noexcept {
    std::lock_guard<decltype(mMutex)> lock(mMutex);
-    return bool(mInsertBlob);
+    return mInsertBlob && bool(*mInsertBlob);
 }

 bool Platform::hasRetrieveBlobFunc() const noexcept {
    std::lock_guard<decltype(mMutex)> lock(mMutex);
-    return bool(mRetrieveBlob);
+    return mRetrieveBlob && bool(*mRetrieveBlob);
 }

 void Platform::insertBlob(void const* key, size_t keySize, void const* value, size_t valueSize) {
@@ -184,7 +184,7 @@ void Platform::setDebugUpdateStatFunc(DebugUpdateStatFunc&& debugUpdateStat) noe

 bool Platform::hasDebugUpdateStatFunc() const noexcept {
    std::lock_guard<decltype(mMutex)> lock(mMutex);
-    return mDebugUpdateStat != nullptr;
+    return mDebugUpdateStat && bool(*mDebugUpdateStat);
 }

 void Platform::debugUpdateStat(const char* key, uint64_t intValue) {
--- a/filament/backend/src/metal/MetalBuffer.h
+++ b/filament/backend/src/metal/MetalBuffer.h
@@ -165,7 +165,7 @@ public:
         size_t size, bool forceGpuBuffer = false);
    ~MetalBuffer();

-    [[nodiscard]] bool wasAllocationSuccessful() const noexcept { return mBuffer || mCpuBuffer; }
+    [[nodiscard]] bool wasAllocationSuccessful() const noexcept { return mBuffer; }

    MetalBuffer(const MetalBuffer& rhs) = delete;
    MetalBuffer& operator=(const MetalBuffer& rhs) = delete;
@@ -185,14 +185,12 @@ public:
     * Denotes that this buffer is used for a draw call ensuring that its allocation remains valid
     * until the end of the current frame.
     *
-     * @return The MTLBuffer representing the current state of the buffer to bind, or nil if there
-     * is no device allocation.
+     * @return The MTLBuffer representing the current state of the buffer to bind, it never returns
+     * nil.
     *
     */
    id<MTLBuffer> getGpuBufferForDraw() noexcept;

-    void* getCpuBuffer() const noexcept { return mCpuBuffer; }
-
    void setLabel(const utils::ImmutableCString& label) {
 #if FILAMENT_METAL_DEBUG_LABELS
        if (label.empty()) {
@@ -235,7 +233,6 @@ private:
    UploadStrategy mUploadStrategy;
    TrackedMetalBuffer mBuffer;
    size_t mBufferSize = 0;
-    void* mCpuBuffer = nullptr;
    MetalContext& mContext;
 };

--- a/filament/backend/src/metal/MetalBuffer.mm
+++ b/filament/backend/src/metal/MetalBuffer.mm
@@ -39,34 +39,30 @@ MetalBuffer::MetalBuffer(MetalContext& context, BufferObjectBinding bindingType,
        mUploadStrategy = UploadStrategy::POOL;
    }

-    // If the buffer is less than 4K in size and is updated frequently, we don't use an explicit
-    // buffer. Instead, we use immediate command encoder methods like setVertexBytes:length:atIndex:.
-    // This won't work for SSBOs, since they are read/write.
+    MTLResourceOptions options = MTLResourceStorageModePrivate;

-    /*
-    if (size <= 4 * 1024 && bindingType != BufferObjectBinding::SHADER_STORAGE &&
-            usage == BufferUsage::DYNAMIC && !forceGpuBuffer) {
-        mBuffer = nil;
-        mCpuBuffer = malloc(size);
-        return;
+    // The buffer will be memory mapped for write operations.
+    if (any(usage & BufferUsage::SHARED_WRITE_BIT)) {
+#if defined(FILAMENT_IOS) || defined(__arm64__) || defined(__aarch64__)
+        // iOS and Apple Silicon devices use UMA (Unified Memory Architecture), so we use Shared memory.
+        options = MTLResourceStorageModeShared;
+#else
+        // Intel Macs require Managed memory for CPU/GPU synchronization.
+        options = MTLResourceStorageModeManaged;
+#endif
    }
-    */

-    // Otherwise, we allocate a private GPU buffer.
    {
        ScopedAllocationTimer timer("generic");
-        mBuffer = { [context.device newBufferWithLength:size options:MTLResourceStorageModePrivate],
+        mBuffer = { [context.device newBufferWithLength:size options:options],
            TrackedMetalBuffer::Type::GENERIC };
    }
+
    // mBuffer might fail to be allocated. Clients can check for this by calling
    // wasAllocationSuccessful().
 }

-MetalBuffer::~MetalBuffer() {
-    if (mCpuBuffer) {
-        free(mCpuBuffer);
-    }
-}
+MetalBuffer::~MetalBuffer() = default;

 void MetalBuffer::copyIntoBuffer(
        void* src, size_t size, size_t byteOffset, TagResolver&& getHandleTag) {
@@ -83,12 +79,6 @@ void MetalBuffer::copyIntoBuffer(
    FILAMENT_CHECK_PRECONDITION(!(byteOffset & 0x3))
            << "byteOffset must be a multiple of 4, tag=" << getHandleTag();

-    // If we have a cpu buffer, we can directly copy into it.
-    if (mCpuBuffer) {
-        memcpy(static_cast<uint8_t*>(mCpuBuffer) + byteOffset, src, size);
-        return;
-    }
-
    switch (mUploadStrategy) {
        case UploadStrategy::BUMP_ALLOCATOR:
            uploadWithBumpAllocator(src, size, byteOffset, std::move(getHandleTag));
@@ -106,11 +96,6 @@ void MetalBuffer::copyIntoBufferUnsynchronized(
 }

 id<MTLBuffer> MetalBuffer::getGpuBufferForDraw() noexcept {
-    // If there's a CPU buffer, then we return nil here, as the CPU-side buffer will be bound
-    // separately.
-    if (mCpuBuffer) {
-        return nil;
-    }
    assert_invariant(mBuffer);
    return mBuffer.get();
 }
@@ -171,41 +156,6 @@ void MetalBuffer::bindBuffers(id<MTLCommandBuffer> cmdBuffer, id<MTLCommandEncod
                                                   offsets:metalOffsets.data()
                                                 withRange:bufferRange];
    }
-
-    for (size_t b = 0; b < count; b++) {
-        MetalBuffer* const buffer = buffers[b];
-        if (!buffer) {
-            continue;
-        }
-
-        const void* cpuBuffer = buffer->getCpuBuffer();
-        if (!cpuBuffer) {
-            continue;
-        }
-
-        const size_t bufferIndex = bufferStart + b;
-        const size_t offset = offsets[b];
-        auto* bytes = static_cast<const uint8_t*>(cpuBuffer);
-
-        if (stages & Stage::VERTEX) {
-            [(id<MTLRenderCommandEncoder>) encoder setVertexBytes:(bytes + offset)
-                                                           length:(buffer->getSize() - offset)
-                                                          atIndex:bufferIndex];
-        }
-        if (stages & Stage::FRAGMENT) {
-            [(id<MTLRenderCommandEncoder>) encoder setFragmentBytes:(bytes + offset)
-                                                             length:(buffer->getSize() - offset)
-                                                            atIndex:bufferIndex];
-        }
-        if (stages & Stage::COMPUTE) {
-            // TODO: using setBytes means the data is read-only, which currently isn't enforced.
-            // In practice this won't be an issue since MetalBuffer ensures all SSBOs are realized
-            // through actual id<MTLBuffer> allocations.
-            [(id<MTLComputeCommandEncoder>) encoder setBytes:(bytes + offset)
-                                                      length:(buffer->getSize() - offset)
-                                                     atIndex:bufferIndex];
-        }
-    }
 }

 void MetalBuffer::uploadWithPoolBuffer(
--- a/filament/backend/src/metal/MetalDriver.h
+++ b/filament/backend/src/metal/MetalDriver.h
@@ -61,6 +61,8 @@ public:

    MetalContext* getContext() { return mContext; }

+    using DriverBase::scheduleDestroy;
+
 private:

    friend class MetalSwapChain;
--- a/filament/backend/src/metal/MetalDriver.mm
+++ b/filament/backend/src/metal/MetalDriver.mm
@@ -2275,7 +2275,10 @@ MemoryMappedBufferHandle MetalDriver::mapBufferS() noexcept {
 void MetalDriver::mapBufferR(MemoryMappedBufferHandle mmbh,
        BufferObjectHandle boh, size_t offset,
        size_t size, MapBufferAccessFlags access, utils::ImmutableCString&& tag) {
-    construct_handle<MetalMemoryMappedBuffer>(mmbh, boh, offset, size, access);
+    assert_invariant(boh);
+    MetalBufferObject* bo = mHandleAllocator.handle_cast<MetalBufferObject*>(boh);
+    assert_invariant(bo);
+    construct_handle<MetalMemoryMappedBuffer>(mmbh, bo, offset, size, access);
    mHandleAllocator.associateTagToHandle(mmbh.getId(), std::move(tag));
 }

@@ -2283,21 +2286,16 @@ void MetalDriver::unmapBuffer(MemoryMappedBufferHandle mmbh) {
    if (UTILS_UNLIKELY(!mmbh)) {
        return;
    }
+
+    auto* mmb = handle_cast<MetalMemoryMappedBuffer>(mmbh);
+    mmb->unmap();
    destruct_handle<MetalMemoryMappedBuffer>(mmbh);
 }

 void MetalDriver::copyToMemoryMappedBuffer(MemoryMappedBufferHandle mmbh, size_t offset,
        BufferDescriptor&& data) {
-    auto mmb = handle_cast<MetalMemoryMappedBuffer>(mmbh);
-
-    assert_invariant(any(mmb->access & MapBufferAccessFlags::WRITE_BIT));
-    assert_invariant(offset + data.size <= mmb->size);
-
-    // TODO: this isa zero-effort implementation of copyToMemoryMappedBuffer(), where we just
-    //       call updateBufferObject(). This could be a fallback implementation for when
-    //       shared memory is not available.
-    //       On UMA systems, this should just be a memcpy into the memory-mapped buffer.
-    updateBufferObject(mmb->boh, std::move(data), mmb->offset + offset);
+    auto* mmb = handle_cast<MetalMemoryMappedBuffer>(mmbh);
+    mmb->copy(*this, offset, std::move(data));
 }

 // explicit instantiation of the Dispatcher
--- a/filament/backend/src/metal/MetalHandles.h
+++ b/filament/backend/src/metal/MetalHandles.h
@@ -569,14 +569,22 @@ struct MetalDescriptorSet : public HwDescriptorSet {


 struct MetalMemoryMappedBuffer : public HwMemoryMappedBuffer {
-    MetalMemoryMappedBuffer(BufferObjectHandle boh, size_t const offset,
-            size_t const size, MapBufferAccessFlags const access)
-        : boh(boh), access(access), size(size), offset(offset) {
-    }
-    BufferObjectHandle boh{};
    MapBufferAccessFlags access{};
-    uint32_t size = 0;
-    uint32_t offset = 0;
+    struct {
+        MetalBufferObject* bo;
+        void* vaddr = nullptr;
+        uint32_t size = 0;
+        uint32_t offset = 0;
+    } mtl;
+
+    MetalMemoryMappedBuffer(MetalBufferObject* bo, size_t offset, size_t size,
+            MapBufferAccessFlags access) noexcept;
+
+    ~MetalMemoryMappedBuffer();
+
+    void unmap();
+
+    void copy(MetalDriver& mtld, size_t offset, BufferDescriptor&& data) const;
 };

 } // namespace backend
--- a/filament/backend/src/metal/MetalHandles.mm
+++ b/filament/backend/src/metal/MetalHandles.mm
@@ -1664,5 +1664,44 @@ id<MTLBuffer> MetalDescriptorSet::finalizeAndGetBuffer(MetalDriver* driver, Shad
    return buffer.get();
 }

+MetalMemoryMappedBuffer::MetalMemoryMappedBuffer(MetalBufferObject* bo, size_t offset, size_t size,
+    MapBufferAccessFlags access) noexcept : access(access) {
+    MetalBuffer* buffer = bo->getBuffer();
+    assert_invariant(buffer);
+    id<MTLBuffer> mtlBuffer = buffer->getGpuBufferForDraw();
+    // The mapped range must fit inside the buffer object, and the buffer must not be Private.
+    assert_invariant(offset + size <= bo->byteCount);
+    assert_invariant(mtlBuffer.storageMode != MTLStorageModePrivate);
+    // Cache the CPU address of the start of the mapped range; copy() writes relative to it.
+    mtl.bo = bo;
+    mtl.vaddr = static_cast<char*>(mtlBuffer.contents) + offset;
+    mtl.size = size;
+    mtl.offset = offset;
+}
+
+MetalMemoryMappedBuffer::~MetalMemoryMappedBuffer() = default;
+
+void MetalMemoryMappedBuffer::unmap() {
+#if !defined(FILAMENT_IOS) && defined(__x86_64__)
+    // Intel Macs only: Managed storage needs didModifyRange to publish CPU writes to the GPU.
+    MetalBuffer* buffer = mtl.bo->getBuffer(); // the buffer object is stored in the `mtl` member
+    id<MTLBuffer> mtlBuffer = buffer->getGpuBufferForDraw();
+    if (mtlBuffer && mtlBuffer.storageMode == MTLStorageModeManaged) {
+        [mtlBuffer didModifyRange:NSMakeRange(mtl.offset, mtl.size)];
+    }
+#endif
+    // Shared memory on UMA systems is coherent; no explicit synchronization is required.
+}
+
+void MetalMemoryMappedBuffer::copy(MetalDriver& mtld, size_t offset, BufferDescriptor&& data) const {
+    assert_invariant(any(access & MapBufferAccessFlags::WRITE_BIT));
+    assert_invariant(offset + data.size <= mtl.size);
+    assert_invariant(mtl.vaddr);
+    // `offset` is relative to the start of the mapped range, which mtl.vaddr already points to.
+    memcpy(static_cast<char*>(mtl.vaddr) + offset, data.buffer, data.size);
+    // Data is copied; presumably this releases the descriptor later - confirm in DriverBase.
+    mtld.scheduleDestroy(std::move(data));
+}
+
 } // namespace backend
 } // namespace filament
--- a/filament/backend/src/opengl/platforms/PlatformEGLAndroid.cpp
+++ b/filament/backend/src/opengl/platforms/PlatformEGLAndroid.cpp
@@ -106,6 +106,8 @@ struct PlatformEGLAndroid::SwapChainEGLAndroid : public SwapChainEGL {
    void terminate(PlatformEGLAndroid& platform);
    bool setPresentFrameId(uint64_t frameId) const noexcept;
    uint64_t getFrameId(uint64_t frameId) const noexcept;
+    bool compositorTimingSupported = false;
+    bool frameTimestampsSupported = false;
 private:
    AndroidSwapChainHelper mImpl{};
 };
@@ -228,9 +230,9 @@ Driver* PlatformEGLAndroid::createDriver(void* sharedContext,
                    "eglGetNativeClientBufferANDROID"));

    if (ext.egl.ANDROID_presentation_time) {
-        eglGetNativeClientBufferANDROID =
-                PFNEGLGETNATIVECLIENTBUFFERANDROIDPROC(eglGetProcAddress(
-                        "eglGetNativeClientBufferANDROID"));
+        eglPresentationTimeANDROID =
+                PFNEGLPRESENTATIONTIMEANDROIDPROC(eglGetProcAddress(
+                        "eglPresentationTimeANDROID"));
    }

    if (ext.egl.ANDROID_get_frame_timestamps) {
@@ -289,11 +291,21 @@ bool PlatformEGLAndroid::queryCompositorTiming(SwapChain const* swapchain,
    outCompositorTiming->frameTime = preferredTimeline.frameTime;
    outCompositorTiming->expectedPresentTime = preferredTimeline.expectedPresentTime;
    outCompositorTiming->frameTimelineDeadline = preferredTimeline.frameTimelineDeadline;
+    outCompositorTiming->compositeDeadline = CompositorTiming::INVALID;
+    outCompositorTiming->compositeInterval = CompositorTiming::INVALID;
+    outCompositorTiming->compositeToPresentLatency = CompositorTiming::INVALID;
+
+    // From this point on, we always return "success" because some timings were returned.
+
+    if (!static_cast<SwapChainEGLAndroid const *>(swapchain)->compositorTimingSupported) {
+        // if this surface doesn't support it, don't attempt to query the values.
+        return true;
+    }

    if (UTILS_LIKELY(ext.egl.ANDROID_get_frame_timestamps)) {
        EGLSurface const sur = static_cast<SwapChainEGL const *>(swapchain)->sur;
        if (sur == EGL_NO_SURFACE) {
-            return false;
+            return true;
        }

        std::array<EGLnsecsANDROID, 3> values;
@@ -304,26 +316,16 @@ bool PlatformEGLAndroid::queryCompositorTiming(SwapChain const* swapchain,
        };
        EGLBoolean const success = eglGetCompositorTimingANDROID(getEglDisplay(), sur,
                names.size(), names.data(), values.data());
-        if (!success) {
-            return false;
+        if (UTILS_UNLIKELY(!success)) {
+            // reset current error to EGL_SUCCESS
+            eglGetError();
+        } else {
+            outCompositorTiming->compositeDeadline = values[0];
+            outCompositorTiming->compositeInterval = values[1];
+            outCompositorTiming->compositeToPresentLatency = values[2];
        }
-        outCompositorTiming->compositeDeadline = values[0];
-        outCompositorTiming->compositeInterval = values[1];
-        outCompositorTiming->compositeToPresentLatency = values[2];
-        return true;
    }
-
-    // fallback to private APIs
-    auto const anw = static_cast<SwapChainEGL const *>(swapchain)->nativeWindow;
-    int const status = NativeWindow::getCompositorTiming(anw,
-            &outCompositorTiming->compositeDeadline,
-            &outCompositorTiming->compositeInterval,
-            &outCompositorTiming->compositeToPresentLatency);
-    if (status == 0) {
-        return true;
-    }
-
-    return PlatformEGL::queryCompositorTiming(swapchain, outCompositorTiming);
+    return true;
 }

 bool PlatformEGLAndroid::setPresentFrameId(SwapChain const* swapchain,
@@ -348,6 +350,10 @@ bool PlatformEGLAndroid::queryFrameTimestamps(SwapChain const* swapchain, uint64
        return false;
    }

+    if (!static_cast<SwapChainEGLAndroid const *>(swapchain)->frameTimestampsSupported) {
+        return false;
+    }
+
    if (UTILS_LIKELY(ext.egl.ANDROID_get_frame_timestamps)) {
        EGLSurface const sur = sc->sur;
        if (sur == EGL_NO_SURFACE) {
@@ -368,7 +374,9 @@ bool PlatformEGLAndroid::queryFrameTimestamps(SwapChain const* swapchain, uint64
        };
        EGLBoolean const success = eglGetFrameTimestampsANDROID(getEglDisplay(), sur, hwFrameId,
                names.size(), names.data(), values.data());
-        if (!success) {
+        if (UTILS_UNLIKELY(!success)) {
+            // reset current error to EGL_SUCCESS
+            eglGetError();
            return false;
        }
        outFrameTimestamps->requestedPresentTime = values[0];
@@ -382,28 +390,44 @@ bool PlatformEGLAndroid::queryFrameTimestamps(SwapChain const* swapchain, uint64
        outFrameTimestamps->releaseTime = values[8];
        return true;
    }
-
-    // fallback to private APIs
-    auto const anw = sc->nativeWindow;
-    int const status = NativeWindow::getFrameTimestamps(anw, hwFrameId,
-            &outFrameTimestamps->requestedPresentTime,
-            &outFrameTimestamps->acquireTime,
-            &outFrameTimestamps->latchTime,
-            &outFrameTimestamps->firstCompositionStartTime,
-            &outFrameTimestamps->lastCompositionStartTime,
-            &outFrameTimestamps->gpuCompositionDoneTime,
-            &outFrameTimestamps->displayPresentTime,
-            &outFrameTimestamps->dequeueReadyTime,
-            &outFrameTimestamps->releaseTime);
-    if (status == 0) {
-        return true;
-    }
-
    return PlatformEGL::queryFrameTimestamps(swapchain, frameId, outFrameTimestamps);
 }

 Platform::SwapChain* PlatformEGLAndroid::createSwapChain(void* nativeWindow, uint64_t const flags) {
     auto* const sc = new(std::nothrow) SwapChainEGLAndroid(*this, nativeWindow, flags);
+    if (UTILS_UNLIKELY(!sc)) return nullptr; // new(std::nothrow) failed: nothing to query or log
+    if (UTILS_LIKELY(ext.egl.ANDROID_get_frame_timestamps)) {
+        EGLDisplay const dpy = getEglDisplay();
+        sc->compositorTimingSupported =
+                eglGetCompositorTimingSupportedANDROID(dpy, sc->sur,
+                        EGL_COMPOSITE_DEADLINE_ANDROID) &&
+                eglGetCompositorTimingSupportedANDROID(dpy, sc->sur,
+                        EGL_COMPOSITE_INTERVAL_ANDROID) &&
+                eglGetCompositorTimingSupportedANDROID(dpy, sc->sur,
+                        EGL_COMPOSITE_TO_PRESENT_LATENCY_ANDROID);
+        sc->frameTimestampsSupported =
+                eglGetFrameTimestampSupportedANDROID(dpy, sc->sur,
+                        EGL_REQUESTED_PRESENT_TIME_ANDROID) &&
+                eglGetFrameTimestampSupportedANDROID(dpy, sc->sur,
+                        EGL_RENDERING_COMPLETE_TIME_ANDROID) &&
+                eglGetFrameTimestampSupportedANDROID(dpy, sc->sur,
+                        EGL_COMPOSITION_LATCH_TIME_ANDROID) &&
+                eglGetFrameTimestampSupportedANDROID(dpy, sc->sur,
+                        EGL_FIRST_COMPOSITION_START_TIME_ANDROID) &&
+                eglGetFrameTimestampSupportedANDROID(dpy, sc->sur,
+                        EGL_LAST_COMPOSITION_START_TIME_ANDROID) &&
+                eglGetFrameTimestampSupportedANDROID(dpy, sc->sur,
+                        EGL_FIRST_COMPOSITION_GPU_FINISHED_TIME_ANDROID) &&
+                eglGetFrameTimestampSupportedANDROID(dpy, sc->sur,
+                        EGL_DISPLAY_PRESENT_TIME_ANDROID) &&
+                eglGetFrameTimestampSupportedANDROID(dpy, sc->sur,
+                        EGL_DEQUEUE_READY_TIME_ANDROID) &&
+                eglGetFrameTimestampSupportedANDROID(dpy, sc->sur, EGL_READS_DONE_TIME_ANDROID);
+    }
+    // This is expected to be a low frequency log, only turned on in debug builds
+    DLOG(INFO) << "anw: " << nativeWindow
+            << ", compositorTimingSupported=" << sc->compositorTimingSupported
+            << ", frameTimestampsSupported=" << sc->frameTimestampsSupported;
     return sc;
 }

@@ -725,8 +749,6 @@ PlatformEGLAndroid::SwapChainEGLAndroid::SwapChainEGLAndroid(PlatformEGLAndroid
            // we ignore the result, it doesn't matter much if it fails
            eglSurfaceAttrib(platform.getEglDisplay(), sur, EGL_TIMESTAMPS_ANDROID, EGL_TRUE);
        }
-    } else {
-        NativeWindow::enableFrameTimestamps(EGLNativeWindowType(nativeWindow), true);
    }
 }

--- a/filament/backend/src/vulkan/platform/VulkanPlatformAndroid.cpp
+++ b/filament/backend/src/vulkan/platform/VulkanPlatformAndroid.cpp
@@ -541,9 +541,15 @@ bool VulkanPlatformAndroid::queryCompositorTiming(SwapChain const* swapchain,
    outCompositorTiming->frameTime = preferredTimeline.frameTime;
    outCompositorTiming->expectedPresentTime = preferredTimeline.expectedPresentTime;
    outCompositorTiming->frameTimelineDeadline = preferredTimeline.frameTimelineDeadline;
+    outCompositorTiming->compositeDeadline = CompositorTiming::INVALID;
+    outCompositorTiming->compositeInterval = CompositorTiming::INVALID;
+    outCompositorTiming->compositeToPresentLatency = CompositorTiming::INVALID;
+
+    // From this point on, we always return "success" because some timings were returned.

    auto vulkanSwapchain = static_cast<VulkanPlatformSwapChainBase const *>(swapchain);
-    return vulkanSwapchain->queryCompositorTiming(outCompositorTiming);
+    vulkanSwapchain->queryCompositorTiming(outCompositorTiming);
+    return true;
 }

 bool VulkanPlatformAndroid::setPresentFrameId(SwapChain const* swapchain, uint64_t frameId) noexcept {
--- a/filament/backend/src/vulkan/platform/VulkanPlatformSwapChainImpl.cpp
+++ b/filament/backend/src/vulkan/platform/VulkanPlatformSwapChainImpl.cpp
@@ -361,13 +361,15 @@ bool VulkanPlatformSurfaceSwapChain::queryCompositorTiming(
        CompositorTiming* outCompositorTiming) const {
 #ifdef __ANDROID__
    // fallback to private APIs
-    int const status = NativeWindow::getCompositorTiming(
-            static_cast<ANativeWindow*>(mNativeWindow),
-            &outCompositorTiming->compositeDeadline,
-            &outCompositorTiming->compositeInterval,
-            &outCompositorTiming->compositeToPresentLatency);
-    if (status == 0) {
-        return true;
+    if (UTILS_VERY_LIKELY(mNativeWindow)) {
+        int const status = NativeWindow::getCompositorTiming(
+                static_cast<ANativeWindow*>(mNativeWindow),
+                &outCompositorTiming->compositeDeadline,
+                &outCompositorTiming->compositeInterval,
+                &outCompositorTiming->compositeToPresentLatency);
+        if (status == 0) {
+            return true;
+        }
    }
 #endif
    return VulkanPlatformSwapChainBase::queryCompositorTiming(outCompositorTiming);
--- a/filament/include/filament/Renderer.h
+++ b/filament/include/filament/Renderer.h
@@ -113,8 +113,10 @@ public:
    /**
     * Retrieve a history of frame timing information. The maximum frame history size is
     * given by getMaxFrameHistorySize().
+     * All or part of the history can be lost when using a different SwapChain in beginFrame().
     * @param historySize requested history size. The returned vector could be smaller.
     * @return A vector of FrameInfo.
+     * @see beginFrame()
     */
    utils::FixedCapacityVector<FrameInfo> getFrameInfoHistory(
            size_t historySize = 1) const noexcept;
@@ -326,6 +328,8 @@ public:
     *                                 or 0 if unknown. This value should be the timestamp of
     *                                 the last h/w vsync. It is expressed in the
     *                                 std::chrono::steady_clock time base.
+     *                                 On Android this should be the frame time received from
+     *                                 a Choreographer.
     * @param swapChain A pointer to the SwapChain instance to use.
     *
     * @return
@@ -337,6 +341,8 @@ public:
     *
     * @note
     * All calls to render() must happen *after* beginFrame().
+     * It is recommended to use the same swapChain for every call to beginFrame, failing to do
+     * so can result in losing all or part of the FrameInfo history.
     *
     * @see
     * endFrame()
--- a/filament/src/details/BufferAllocator.cpp
+++ b/filament/src/details/BufferAllocator.cpp
@@ -16,6 +16,7 @@

 #include "details/BufferAllocator.h"

+#include <private/utils/Tracing.h>
 #include <utils/Panic.h>
 #include <utils/debug.h>

@@ -169,6 +170,7 @@ void BufferAllocator::releaseGpu(AllocationId id) {
 }

 void BufferAllocator::releaseFreeSlots() {
+    FILAMENT_TRACING_CALL(FILAMENT_TRACING_CATEGORY_FILAMENT);
    if (!mHasPendingFrees) {
        return;
    }
--- a/filament/src/details/Engine.cpp
+++ b/filament/src/details/Engine.cpp
@@ -718,8 +718,7 @@ void FEngine::prepare() {

    if (useUboBatching) {
        assert_invariant(mUboManager != nullptr);
-
-        mUboManager->beginFrame(driver, mMaterialInstances);
+        mUboManager->beginFrame(driver);
    }

    UboManager* uboManager = mUboManager;
@@ -758,7 +757,7 @@ void FEngine::gc() {
 void FEngine::submitFrame() {
    if (isUboBatchingEnabled()) {
        DriverApi& driver = getDriverApi();
-        getUboManager()->endFrame(driver, getMaterialInstanceResourceList());
+        getUboManager()->endFrame(driver);
    }
 }

@@ -1285,11 +1284,6 @@ UTILS_NOINLINE
 bool FEngine::destroy(const FMaterialInstance* p) {
    if (p == nullptr) return true;

-    if (p->isUsingUboBatching()) {
-        assert_invariant(isUboBatchingEnabled());
-        mUboManager->retireSlot(p->getAllocationId());
-    }
-
    // Check that the material instance we're destroying is not in use in the RenderableManager
    // To do this, we currently need to inspect all render primitives in the RenderableManager
    EntityManager const& em = mEntityManager;
--- a/filament/src/details/MaterialInstance.cpp
+++ b/filament/src/details/MaterialInstance.cpp
@@ -95,6 +95,7 @@ FMaterialInstance::FMaterialInstance(FEngine& engine, FMaterial const* material,

    if (mUseUboBatching) {
        mUboData = BufferAllocator::UNALLOCATED;
+        engine.getUboManager()->manageMaterialInstance(this);
    } else {
        mUboData = driver.createBufferObject(mUniforms.getSize(), BufferObjectBinding::UNIFORM,
                BufferUsage::STATIC, ImmutableCString{ material->getName().c_str_safe() });
@@ -167,6 +168,7 @@ FMaterialInstance::FMaterialInstance(FEngine& engine,

    if (mUseUboBatching) {
        mUboData = BufferAllocator::UNALLOCATED;
+        engine.getUboManager()->manageMaterialInstance(this);
    } else {
        mUboData = driver.createBufferObject(mUniforms.getSize(), BufferObjectBinding::UNIFORM,
                BufferUsage::DYNAMIC, ImmutableCString{ material->getName().c_str_safe() });
@@ -211,6 +213,10 @@ FMaterialInstance::~FMaterialInstance() noexcept = default;
 void FMaterialInstance::terminate(FEngine& engine) {
    FEngine::DriverApi& driver = engine.getDriverApi();
    mDescriptorSet.terminate(driver);
+    if (mUseUboBatching) {
+        engine.getUboManager()->unmanageMaterialInstance(this);
+    }
+
    auto* ubHandle = std::get_if<Handle<HwBufferObject>>(&mUboData);
    if (ubHandle){
        driver.destroyBufferObject(*ubHandle);
--- a/filament/src/details/UboManager.cpp
+++ b/filament/src/details/UboManager.cpp
@@ -48,6 +48,7 @@ void UboManager::FenceManager::track(DriverApi& driver, std::unordered_set<Alloc

 void UboManager::FenceManager::reclaimCompletedResources(DriverApi& driver,
        std::function<void(AllocationId)> const& onReclaimed) {
+    FILAMENT_TRACING_CALL(FILAMENT_TRACING_CATEGORY_FILAMENT);
    uint32_t signaledCount = 0;
    bool seenSignaledFence = false;

@@ -115,8 +116,7 @@ UboManager::UboManager(DriverApi& driver, allocation_size_t defaultSlotSizeInByt
    reallocate(driver, defaultTotalSizeInBytes);
 }

-void UboManager::beginFrame(DriverApi& driver,
-        const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>& materialInstances) {
+void UboManager::beginFrame(DriverApi& driver) {
    FILAMENT_TRACING_CALL(FILAMENT_TRACING_CATEGORY_FILAMENT);
    // Check finished frames and decrement GPU count accordingly.
    mFenceManager.reclaimCompletedResources(driver,
@@ -126,7 +126,7 @@ void UboManager::beginFrame(DriverApi& driver,
    mAllocator.releaseFreeSlots();

    // Traverse all MIs and see which of them need slot allocation.
-    if (allocateOnDemand(materialInstances) == SUCCESS) {
+    if (allocateOnDemand() == SUCCESS) {
        // No need to grow the buffer, so we can just map the buffer for writing and return.
        mMemoryMappedBufferHandle = driver.mapBuffer(mUbHandle, 0, mUboSize, MapBufferAccessFlags::WRITE_BIT,
                "UboManager");
@@ -135,25 +135,19 @@ void UboManager::beginFrame(DriverApi& driver,
    }

    // Calculate the required size and grow the Ubo.
-    const allocation_size_t requiredSize = calculateRequiredSize(materialInstances);
+    const allocation_size_t requiredSize = calculateRequiredSize();
    reallocate(driver, requiredSize);

    // Allocate slots for each MI on the new Ubo.
-    allocateAllInstances(materialInstances);
+    allocateAllInstances();

    // Map the buffer so that we can write to it
    mMemoryMappedBufferHandle =
            driver.mapBuffer(mUbHandle, 0, mUboSize, MapBufferAccessFlags::WRITE_BIT, "UboManager");

    // Invalidate the migrated MIs, so that next commit() call must be triggered.
-    for (const auto& materialInstance : materialInstances) {
-        materialInstance.second.forEach([](const FMaterialInstance* mi) {
-            if (!mi->isUsingUboBatching()) {
-                return;
-            }
-
-            mi->getUniformBuffer().invalidate();
-        });
+    for (const auto* mi : mManagedInstances) {
+        mi->getUniformBuffer().invalidate();
    }
 }

@@ -164,24 +158,16 @@ void UboManager::finishBeginFrame(DriverApi& driver) {
    }
 }

-void UboManager::endFrame(DriverApi& driver,
-        const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>& materialInstances) {
-    BufferAllocator& allocator = mAllocator;
+void UboManager::endFrame(DriverApi& driver) {
    std::unordered_set<AllocationId> allocationIds;
-    for (const auto& materialInstance : materialInstances) {
-        materialInstance.second.forEach([&allocator, &allocationIds](const FMaterialInstance* mi) {
-            if (!mi->isUsingUboBatching()) {
-                return;
-            }
+    for (const auto* mi : mManagedInstances) {
+        const AllocationId id = mi->getAllocationId();
+        if (UTILS_UNLIKELY(!BufferAllocator::isValid(id))) {
+            continue;
+        }

-            const AllocationId id = mi->getAllocationId();
-            if (!BufferAllocator::isValid(id)) {
-                return;
-            }
-
-            allocator.acquireGpu(id);
-            allocationIds.insert(id);
-        });
+        mAllocator.acquireGpu(id);
+        allocationIds.insert(id);
    }

    mFenceManager.track(driver, std::move(allocationIds));
@@ -194,76 +180,90 @@ void UboManager::terminate(DriverApi& driver) {

 void UboManager::updateSlot(DriverApi& driver, AllocationId id,
        BufferDescriptor bufferDescriptor) const {
-    if (!mMemoryMappedBufferHandle)
+    if (!mMemoryMappedBufferHandle) {
        return;
+    }

    const allocation_size_t offset = mAllocator.getAllocationOffset(id);
    driver.copyToMemoryMappedBuffer(mMemoryMappedBufferHandle, offset, std::move(bufferDescriptor));
 }

-void UboManager::retireSlot(BufferAllocator::AllocationId id) {
-    if (!BufferAllocator::isValid(id))
-        return;
-    mAllocator.retire(id);
+void UboManager::manageMaterialInstance(FMaterialInstance* instance) {
+    mPendingInstances.insert(instance);
 }

-UboManager::AllocationResult UboManager::allocateOnDemand(
-        const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                materialInstances) {
-    // Collect all MIs that need allocation into two groups.
-    std::vector<FMaterialInstance*> newInstances;
-    std::vector<FMaterialInstance*> existingInstances;
-    for (const auto& [_, miList] : materialInstances) {
-        miList.forEach([&](FMaterialInstance* mi) {
-            if (!mi->isUsingUboBatching()) {
-                return;
-            }
-            if (BufferAllocator::isValid(mi->getAllocationId())) {
-                existingInstances.push_back(mi);
-            } else {
-                newInstances.push_back(mi);
-            }
-        });
+void UboManager::unmanageMaterialInstance(FMaterialInstance* materialInstance) {
+    AllocationId id = materialInstance->getAllocationId();
+    mPendingInstances.erase(materialInstance);
+    mManagedInstances.erase(materialInstance);
+
+    if (!BufferAllocator::isValid(id)) {
+        return;
    }

+    mAllocator.retire(id);
+    materialInstance->assignUboAllocation(mUbHandle, BufferAllocator::UNALLOCATED, 0);
+}
+
+UboManager::AllocationResult UboManager::allocateOnDemand() {
+    FILAMENT_TRACING_CALL(FILAMENT_TRACING_CATEGORY_FILAMENT);
    bool reallocationNeeded = false;

    // Pass 1: Allocate slots for new material instances (that don't have a slot yet).
-    for (FMaterialInstance* mi : newInstances) {
+    for (auto* mi : mPendingInstances) {
+        mManagedInstances.insert(mi);
        auto [newId, newOffset] = mAllocator.allocate(mi->getUniformBuffer().getSize());
+
+        // Even if the newId is not valid, we assign it to the MI so that the following process knows
+        // this material instance was not allocated successfully. Then we can calculate the new
+        // required UBO size properly.
        mi->assignUboAllocation(mUbHandle, newId, newOffset);
+
        if (!BufferAllocator::isValid(newId)) {
            reallocationNeeded = true;
        }
    }
+    mPendingInstances.clear();

    // Pass 2: Allocate slots for existing material instances that need to be orphaned.
-    for (FMaterialInstance* mi : existingInstances) {
-        if (mi->getUniformBuffer().isDirty() && mAllocator.isLockedByGpu(mi->getAllocationId())) {
-            mAllocator.retire(mi->getAllocationId());
-            auto [newId, newOffset] = mAllocator.allocate(mi->getUniformBuffer().getSize());
-            mi->assignUboAllocation(mUbHandle, newId, newOffset);
-            if (!BufferAllocator::isValid(newId)) {
-                reallocationNeeded = true;
-            }
+    for (auto* mi: mManagedInstances) {
+        if (!BufferAllocator::isValid(mi->getAllocationId())) {
+            continue;
+        }
+
+        // This instance doesn't need orphaning.
+        if (!mi->getUniformBuffer().isDirty() || !mAllocator.isLockedByGpu(mi->getAllocationId())) {
+            continue;
+        }
+
+        mAllocator.retire(mi->getAllocationId());
+
+        // Buffer already too small: skip allocating and tag the MI with the allocator's sentinel.
+        if (reallocationNeeded) {
+            mi->assignUboAllocation(mUbHandle, BufferAllocator::REALLOCATION_REQUIRED, 0);
+            continue;
+        }
+
+        auto [newId, newOffset] = mAllocator.allocate(mi->getUniformBuffer().getSize());
+
+        // Even if the newId is not valid, we assign it to the MI so that the following process knows
+        // this material instance was not allocated successfully. Then we can calculate the new
+        // required UBO size properly.
+        mi->assignUboAllocation(mUbHandle, newId, newOffset);
+
+        if (!BufferAllocator::isValid(newId)) {
+            reallocationNeeded = true;
        }
    }

    return reallocationNeeded ? REALLOCATION_REQUIRED : SUCCESS;
 }

-void UboManager::allocateAllInstances(
-        const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                materialInstances) {
-    for (const auto& [_, miList] : materialInstances) {
-        miList.forEach([this](FMaterialInstance* mi) {
-            if (!mi->isUsingUboBatching()) {
-                return;
-            }
-            auto [newId, newOffset] = mAllocator.allocate(mi->getUniformBuffer().getSize());
-            assert_invariant(BufferAllocator::isValid(newId));
-            mi->assignUboAllocation(mUbHandle, newId, newOffset);
-        });
+void UboManager::allocateAllInstances() {
+    for (auto* mi: mManagedInstances) {
+        auto [newId, newOffset] = mAllocator.allocate(mi->getUniformBuffer().getSize());
+        assert_invariant(BufferAllocator::isValid(newId));
+        mi->assignUboAllocation(mUbHandle, newId, newOffset);
    }
 }

@@ -288,28 +288,19 @@ void UboManager::reallocate(DriverApi& driver, allocation_size_t requiredSize) {
            BufferUsage::DYNAMIC | BufferUsage::SHARED_WRITE_BIT);
 }

-allocation_size_t UboManager::calculateRequiredSize(
-        const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                materialInstances) {
-    BufferAllocator& allocator = mAllocator;
+allocation_size_t UboManager::calculateRequiredSize() {
    allocation_size_t newBufferSize = 0;
-    for (const auto& materialInstance: materialInstances) {
-        materialInstance.second.forEach([&newBufferSize, &allocator](const FMaterialInstance* mi) {
-            if (!mi->isUsingUboBatching()) {
-                return;
-            }
-
-            const AllocationId allocationId = mi->getAllocationId();
-            if (allocationId == BufferAllocator::REALLOCATION_REQUIRED) {
-                // For MIs whose parameters have been updated, aside from the slot it is being
-                // occupied by the GPU, we need to preserve an additional slot for it.
-                newBufferSize += 2 * allocator.alignUp(mi->getUniformBuffer().getSize());
-            } else {
-                newBufferSize += allocator.alignUp(mi->getUniformBuffer().getSize());
-            }
-        });
+    for (const auto* mi: mManagedInstances) {
+        const AllocationId allocationId = mi->getAllocationId();
+        if (allocationId == BufferAllocator::REALLOCATION_REQUIRED) {
+            // For MIs whose parameters have been updated, aside from the slot that is still
+            // occupied by the GPU, we need to reserve an additional slot for the new data.
+            newBufferSize += 2 * mAllocator.alignUp(mi->getUniformBuffer().getSize());
+        } else {
+            newBufferSize += mAllocator.alignUp(mi->getUniformBuffer().getSize());
+        }
    }
-    return allocator.alignUp(newBufferSize * BUFFER_SIZE_GROWTH_MULTIPLIER);
+    return mAllocator.alignUp(newBufferSize * BUFFER_SIZE_GROWTH_MULTIPLIER);
 }

 } // namespace filament
--- a/filament/src/details/UboManager.h
+++ b/filament/src/details/UboManager.h
@@ -17,7 +17,6 @@
 #ifndef TNT_FILAMENT_DETAILS_UBOMANAGER_H
 #define TNT_FILAMENT_DETAILS_UBOMANAGER_H

-#include "ResourceList.h"
 #include "backend/DriverApiForward.h"

 #include "details/BufferAllocator.h"
@@ -29,6 +28,8 @@
 #include <unordered_set>
 #include <vector>

+class UboManagerTest;
+
 namespace filament {

 class FMaterial;
@@ -96,9 +97,8 @@ public:
    //    instances with modified uniforms).
    // 3. Reallocating a larger shared UBO if the current one is insufficient.
    // 4. Mapping the shared UBO into CPU-accessible memory to prepare for uniform data writes.
-    void beginFrame(backend::DriverApi& driver,
-            const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                    materialInstances);
+    // Note that it must happen before committing all MIs.
+    void beginFrame(backend::DriverApi& driver);

    // Unmap the buffer here
    void finishBeginFrame(backend::DriverApi& driver);
@@ -106,23 +106,31 @@ public:
    // Create a fence and associate it with a set of allocation ids.
    // The gpuUseCount of these allocations will be incremented, and they will be decremented
    // After the corresponding frame has been done.
-    void endFrame(backend::DriverApi& driver,
-            const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                    materialInstances);
+    void endFrame(backend::DriverApi& driver);

    void terminate(backend::DriverApi& driver);

    void updateSlot(backend::DriverApi& driver, BufferAllocator::AllocationId id,
            backend::BufferDescriptor bufferDescriptor) const;

-    // Call this when a material instance is no longer holding a slot. e.g. it is destroyed.
-    void retireSlot(BufferAllocator::AllocationId id);
+    // Call this to register a new material instance to UboManager.
+    void manageMaterialInstance(FMaterialInstance* instance);
+
+    // Call this when a material instance is destroyed.
+    void unmanageMaterialInstance(FMaterialInstance* materialInstance);

    // Returns the size of the actual UBO. Note that when there's allocation failed, it will be
    // reallocated to a bigger size at the next frame.
    [[nodiscard]] BufferAllocator::allocation_size_t getTotalSize() const noexcept;

+    // For testing
+    [[nodiscard]] backend::MemoryMappedBufferHandle getMemoryMappedBufferHandle() const noexcept {
+        return mMemoryMappedBufferHandle;
+    }
+
 private:
+    friend class ::UboManagerTest;
+
    constexpr static float BUFFER_SIZE_GROWTH_MULTIPLIER = 1.5f;

    enum AllocationResult {
@@ -134,23 +142,19 @@ private:
    [[nodiscard]] BufferAllocator::allocation_size_t getAllocationOffset(
            BufferAllocator::AllocationId id) const;

-    AllocationResult allocateOnDemand(
-            const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                    materialInstances);
+    AllocationResult allocateOnDemand();

-    void allocateAllInstances(
-            const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                    materialInstances);
+    void allocateAllInstances();

    void reallocate(backend::DriverApi& driver, BufferAllocator::allocation_size_t requiredSize);

-    BufferAllocator::allocation_size_t calculateRequiredSize(
-            const std::unordered_map<const FMaterial*, ResourceList<FMaterialInstance>>&
-                    materialInstances);
+    BufferAllocator::allocation_size_t calculateRequiredSize();

    backend::Handle<backend::HwBufferObject> mUbHandle;
    backend::MemoryMappedBufferHandle mMemoryMappedBufferHandle;
    BufferAllocator::allocation_size_t mUboSize{};
+    std::unordered_set<FMaterialInstance*> mPendingInstances;
+    std::unordered_set<FMaterialInstance*> mManagedInstances;

    FenceManager mFenceManager;
    BufferAllocator mAllocator;
--- a/filament/src/details/VertexBuffer.cpp
+++ b/filament/src/details/VertexBuffer.cpp
@@ -285,53 +285,45 @@ FVertexBuffer::FVertexBuffer(FEngine& engine, const Builder& builder)

    // calculate buffer sizes
    size_t bufferSizes[MAX_VERTEX_BUFFER_COUNT] = {};
-    #pragma nounroll
-    for (size_t i = 0, n = mAttributes.size(); i < n; ++i) {
-        if (mDeclaredAttributes[i]) {
-            const uint32_t offset = mAttributes[i].offset;
-            const uint8_t stride = mAttributes[i].stride;
-            const uint8_t slot = mAttributes[i].buffer;
-            const size_t end = offset + mVertexCount * stride;
-            if (slot != Attribute::BUFFER_UNUSED) {
-                assert_invariant(slot < MAX_VERTEX_BUFFER_COUNT);
-                bufferSizes[slot] = std::max(bufferSizes[slot], end);
-            }
-        }
-    }
+
+    auto shouldCreateBuffer = [this](size_t attributeIndex) {
+        const uint8_t slot = mAttributes[attributeIndex].buffer;
+        return mDeclaredAttributes[attributeIndex] && slot != Attribute::BUFFER_UNUSED &&
+                !mBufferObjects[slot];
+    };
+    auto updateBufferSize = [&bufferSizes, this](size_t attributeIndex) {
+        const uint32_t offset = mAttributes[attributeIndex].offset;
+        const uint8_t stride = mAttributes[attributeIndex].stride;
+        const uint8_t slot = mAttributes[attributeIndex].buffer;
+        const size_t end = offset + mVertexCount * stride;
+        assert_invariant(slot < MAX_VERTEX_BUFFER_COUNT);
+        bufferSizes[slot] = std::max(bufferSizes[slot], end);
+    };

    if (!mBufferObjectsEnabled) {
-        // If buffer objects are not enabled at the API level, then we create them internally.
        #pragma nounroll
-        for (size_t index = 0; index < MAX_VERTEX_ATTRIBUTE_COUNT; ++index) {
-            size_t const i = mAttributes[index].buffer;
-            if (i != Attribute::BUFFER_UNUSED) {
-                assert_invariant(bufferSizes[i] > 0);
-                if (!mBufferObjects[i]) {
-                    BufferObjectHandle const bo = driver.createBufferObject(bufferSizes[i],
-                            BufferObjectBinding::VERTEX, BufferUsage::STATIC,
-                            utils::ImmutableCString{ builder.getName() });
-                    driver.setVertexBufferObject(mHandle, i, bo);
-                    mBufferObjects[i] = bo;
-                }
+        for (size_t i = 0, n = mAttributes.size(); i < n; ++i) {
+            if (shouldCreateBuffer(i)) {
+                updateBufferSize(i);
            }
        }
-    } else {
-        // in advanced skinning mode, we manage the BONE_INDICES and BONE_WEIGHTS arrays ourselves,
-        // so we have to set the corresponding buffer objects.
-        if (mAdvancedSkinningEnabled) {
-            for (auto const index : { BONE_INDICES, BONE_WEIGHTS }) {
-                size_t const i = mAttributes[index].buffer;
-                assert_invariant(i != Attribute::BUFFER_UNUSED);
-                assert_invariant(bufferSizes[i] > 0);
-                if (!mBufferObjects[i]) {
-                    BufferObjectHandle const bo = driver.createBufferObject(bufferSizes[i],
-                            BufferObjectBinding::VERTEX, BufferUsage::STATIC,
-                            utils::ImmutableCString{ builder.getName() });
-                    driver.setVertexBufferObject(mHandle, i, bo);
-                    mBufferObjects[i] = bo;
-                }
-            }
+    } else if (mAdvancedSkinningEnabled) {
+        // For advanced skinning mode, only relevant buffers (BONE_INDICES & BONE_WEIGHTS) are
+        // created. We manually populated the relevant attributes for those buffers above.
+        updateBufferSize(BONE_INDICES);
+        updateBufferSize(BONE_WEIGHTS);
+    }
+
+    // create buffers
+    for (size_t i = 0; i < MAX_VERTEX_BUFFER_COUNT; ++i) {
+        if (bufferSizes[i] == 0 || mBufferObjects[i]) {
+            continue;
        }
+        BufferObjectHandle const bo = driver.createBufferObject(bufferSizes[i],
+                BufferObjectBinding::VERTEX, BufferUsage::STATIC,
+                utils::ImmutableCString{ builder.getName() });
+        driver.setVertexBufferObject(mHandle, i, bo);
+        mBufferObjects[i] = bo;
    }
 }

--- a/filament/test/CMakeLists.txt
+++ b/filament/test/CMakeLists.txt
@@ -50,14 +50,17 @@ if (TNT_DEV)
            test_BufferAllocatorStress.cpp
            test_CircularQueue.cpp
            test_FenceManager.cpp
+            test_UboManager.cpp
            filament_test_exposure.cpp
            filament_rendering_test.cpp
            filament_bimap_test.cpp
            filament_framegraph_test.cpp
-            filament_test.cpp)
+            filament_test.cpp
+            ${RESGEN_SOURCE})

    target_link_libraries(test_${TARGET} PRIVATE filament gtest)
    target_compile_options(test_${TARGET} PRIVATE ${COMPILER_FLAGS})
+    target_include_directories(test_${TARGET} PRIVATE ${RESOURCE_DIR})
    set_target_properties(test_${TARGET} PROPERTIES FOLDER Tests)

    add_executable(test_depth depth_test.cpp)
--- a/filament/test/test_UboManager.cpp
+++ b/filament/test/test_UboManager.cpp
@@ -0,0 +1,366 @@
+/*
+ * Copyright (C) 2025 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "MockDriver.h"
+#include "details/MaterialInstance.h"
+#include "details/UboManager.h"
+
+#include <private/backend/CommandBufferQueue.h>
+#include <private/backend/CommandStream.h>
+#include <private/backend/Driver.h>
+
+#include "filament_test_resources.h"
+
+namespace {
+using namespace filament;
+using namespace backend;
+
+using ::testing::NiceMock;
+
+using AllocationId = BufferAllocator::AllocationId;
+using allocation_size_t = BufferAllocator::allocation_size_t;
+} // anonymous namespace
+
+class UboManagerTest : public ::testing::Test {
+protected:
+    static constexpr size_t MIN_COMMAND_BUFFERS_SIZE = 1 * 1024 * 1024;
+    static constexpr size_t COMMAND_BUFFERS_SIZE = 3 * MIN_COMMAND_BUFFERS_SIZE;
+    static constexpr BufferAllocator::allocation_size_t DEFAULT_SLOT_SIZE = 64;
+    static constexpr BufferAllocator::allocation_size_t DEFAULT_TOTAL_SIZE = 1024;
+
+    UboManagerTest()
+            : mCommandBufferQueue(MIN_COMMAND_BUFFERS_SIZE, COMMAND_BUFFERS_SIZE, false),
+              mCommandStream(mMockDriver, mCommandBufferQueue.getCircularBuffer()),
+
+              mDriverApi(mCommandStream),
+              // The constructor will call reallocate, which calls createBufferObject.
+              // MockDriver's default ...S() implementation returns an incrementing handle.
+              // So, the first handle will be 1.
+              mUboManager(mDriverApi, DEFAULT_SLOT_SIZE, DEFAULT_TOTAL_SIZE),
+              mPendingInstances(mUboManager.mPendingInstances),
+              mManagedInstances(mUboManager.mManagedInstances),
+              mUbHandle(mUboManager.mUbHandle),
+              mAllocator(mUboManager.mAllocator) {
+        mEngine = Engine::Builder()
+                          .feature("material.enable_material_instance_uniform_batching", true)
+                          .backend(Backend::NOOP)
+                          .build();
+
+        mMaterial = Material::Builder()
+                            .package(FILAMENT_TEST_RESOURCES_TEST_MATERIAL_DATA,
+                                    FILAMENT_TEST_RESOURCES_TEST_MATERIAL_SIZE)
+                            .build(*mEngine);
+    }
+
+    void TearDown() override {
+        mEngine->destroy(mMaterial);
+        Engine::destroy(&mEngine);
+    }
+
+    // The engine is only for creating materials/material instances, we're not using the UboManager
+    // inside for testing.
+    Engine* mEngine = nullptr;
+    NiceMock<MockDriver> mMockDriver;
+    CommandBufferQueue mCommandBufferQueue;
+    CommandStream mCommandStream;
+    DriverApi& mDriverApi;
+    UboManager mUboManager;
+    Material const* mMaterial;
+    std::unordered_set<FMaterialInstance*>& mPendingInstances;
+    std::unordered_set<FMaterialInstance*>& mManagedInstances;
+    Handle<HwBufferObject>& mUbHandle;
+    BufferAllocator& mAllocator;
+};
+
+TEST_F(UboManagerTest, InitialState) {
+    EXPECT_EQ(mUboManager.getTotalSize(), DEFAULT_TOTAL_SIZE);
+    EXPECT_EQ(mMockDriver.nextFakeHandle, 2);
+    EXPECT_NE(mUbHandle.getId(), HandleBase::nullid);
+}
+
+TEST_F(UboManagerTest, BeginFrameWithoutReallocate) {
+    BufferAllocator::allocation_size_t originalBufferSize = mUboManager.getTotalSize();
+    auto mi1 = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+    EXPECT_EQ(mi1->getAllocationId(), BufferAllocator::UNALLOCATED);
+    ASSERT_TRUE(mi1->isUsingUboBatching());
+
+    // The mi1 should be put in the pending list.
+    mUboManager.manageMaterialInstance(mi1);
+    EXPECT_TRUE(mPendingInstances.contains(mi1));
+    EXPECT_FALSE(mManagedInstances.contains(mi1));
+
+    mUboManager.beginFrame(mDriverApi);
+
+    // The mi1 should be moved to managed list after beginFrame.
+    EXPECT_FALSE(mPendingInstances.contains(mi1));
+    EXPECT_TRUE(mManagedInstances.contains(mi1));
+    // Should have allocation after beginFrame.
+    EXPECT_TRUE(BufferAllocator::isValid(mi1->getAllocationId()));
+
+    // Reallocation is not triggered under this case.
+    EXPECT_EQ(mUboManager.getTotalSize(), originalBufferSize);
+    EXPECT_NE(mUboManager.getMemoryMappedBufferHandle().getId(), HandleBase::nullid);
+
+    mUboManager.finishBeginFrame(mDriverApi);
+    EXPECT_EQ(mUboManager.getMemoryMappedBufferHandle().getId(), HandleBase::nullid);
+
+    mUboManager.endFrame(mDriverApi);
+    EXPECT_TRUE(mAllocator.isLockedByGpu(mi1->getAllocationId()));
+
+    // We're not using the UboManager inside mEngine, so we need to unmanage the material instance
+    // by ourselves.
+    mUboManager.unmanageMaterialInstance(mi1);
+    EXPECT_FALSE(mPendingInstances.contains(mi1));
+    EXPECT_FALSE(mManagedInstances.contains(mi1));
+
+    mUboManager.terminate(mDriverApi);
+    mEngine->destroy(mi1);
+}
+
+TEST_F(UboManagerTest, BeginFrameWithReallocate) {
+    const allocation_size_t originalBufferSize = mUboManager.getTotalSize();
+    const Handle<HwBufferObject> originalBufferHandle = mUbHandle;
+
+    // Create enough material instances to trigger a reallocation.
+    constexpr size_t numInstances = (DEFAULT_TOTAL_SIZE / DEFAULT_SLOT_SIZE) + 1;
+    std::vector<FMaterialInstance*> instances;
+    instances.reserve(numInstances);
+
+    for (size_t i = 0; i < numInstances; ++i) {
+        auto mi = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+        instances.push_back(mi);
+        mUboManager.manageMaterialInstance(mi);
+    }
+
+    // Before beginFrame, all instances should be pending.
+    EXPECT_EQ(mPendingInstances.size(), numInstances);
+    EXPECT_TRUE(mManagedInstances.empty());
+
+    mUboManager.beginFrame(mDriverApi);
+
+    // After beginFrame, reallocation should have occurred.
+    EXPECT_NE(mUbHandle.getId(), originalBufferHandle.getId());
+    EXPECT_GT(mUboManager.getTotalSize(), originalBufferSize);
+
+    // All instances should now be managed and have valid allocations.
+    EXPECT_TRUE(mPendingInstances.empty());
+    EXPECT_EQ(mManagedInstances.size(), numInstances);
+    for (const auto* mi: instances) {
+        EXPECT_TRUE(mManagedInstances.contains(const_cast<FMaterialInstance*>(mi)));
+        EXPECT_TRUE(BufferAllocator::isValid(mi->getAllocationId()));
+    }
+
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.terminate(mDriverApi);
+
+    for (auto* mi: instances) {
+        // We're not using the UboManager inside mEngine, so we need to unmanage the material instance
+        // by ourselves.
+        mUboManager.unmanageMaterialInstance(mi);
+        mEngine->destroy(mi);
+    }
+}
+
+// Exercises slot recycling: once the (mocked) fence guarding an unmanaged instance's allocation
+// reports CONDITION_SATISFIED, a newly managed instance is expected to land on the same offset.
+TEST_F(UboManagerTest, RecycleSlot) {
+    auto* const retired = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+    mUboManager.manageMaterialInstance(retired);
+
+    // Frame 1: the instance is given an allocation; record its id and offset.
+    mUboManager.beginFrame(mDriverApi);
+    const AllocationId retiredId = retired->getAllocationId();
+    const allocation_size_t retiredOffset = mAllocator.getAllocationOffset(retiredId);
+    EXPECT_TRUE(BufferAllocator::isValid(retiredId));
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.endFrame(mDriverApi); // The allocation is GPU-locked from here on.
+
+    // Unmanaging retires the slot, but it must stay locked rather than being released right away.
+    mUboManager.unmanageMaterialInstance(retired);
+    EXPECT_TRUE(mAllocator.isLockedByGpu(retiredId));
+
+    // Frame 2: the frame-1 fence is expected to be polled. The mock answers TIMEOUT_EXPIRED, so
+    // the slot is not reclaimed yet.
+    EXPECT_CALL(mMockDriver, getFenceStatus(_)).WillOnce(Return(FenceStatus::TIMEOUT_EXPIRED));
+    mUboManager.beginFrame(mDriverApi);
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.endFrame(mDriverApi);
+
+    // Frame 3: the mock now answers CONDITION_SATISFIED, which should let the slot be reclaimed.
+    EXPECT_CALL(mMockDriver, getFenceStatus(_)).WillOnce(Return(FenceStatus::CONDITION_SATISFIED));
+
+    auto* const successor = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+    mUboManager.manageMaterialInstance(successor);
+
+    mUboManager.beginFrame(mDriverApi);
+
+    // The successor must hold a valid allocation located at the retired instance's offset.
+    const AllocationId successorId = successor->getAllocationId();
+    EXPECT_TRUE(BufferAllocator::isValid(successorId));
+    EXPECT_EQ(mAllocator.getAllocationOffset(successorId), retiredOffset);
+
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.unmanageMaterialInstance(successor);
+    mUboManager.terminate(mDriverApi);
+
+    mEngine->destroy(retired);
+    mEngine->destroy(successor);
+}
+
+// Exercises orphaning: an instance whose uniform buffer is invalidated while its allocation is
+// still locked is expected to come out of the next beginFrame with a different allocation id.
+TEST_F(UboManagerTest, OrphanSlot) {
+    auto* const instance = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+    mUboManager.manageMaterialInstance(instance);
+
+    // Frame 1: the instance obtains its initial allocation.
+    mUboManager.beginFrame(mDriverApi);
+    const AllocationId initialId = instance->getAllocationId();
+    EXPECT_TRUE(BufferAllocator::isValid(initialId));
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.endFrame(mDriverApi); // Locks the initial allocation.
+
+    // Frame 2: dirty the instance, keep the fence at TIMEOUT_EXPIRED, then start a new frame.
+    // This combination should trigger orphaning.
+    instance->getUniformBuffer().invalidate();
+    EXPECT_CALL(mMockDriver, getFenceStatus(_)).WillOnce(Return(FenceStatus::TIMEOUT_EXPIRED));
+    mUboManager.beginFrame(mDriverApi);
+
+    const AllocationId replacementId = instance->getAllocationId();
+    EXPECT_TRUE(BufferAllocator::isValid(replacementId));
+    EXPECT_NE(initialId, replacementId); // A fresh allocation must have been assigned.
+
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.endFrame(mDriverApi); // Locks the replacement allocation.
+
+    // Frame 3: the fence guarding the initial allocation should now be signaled.
+    EXPECT_CALL(mMockDriver, getFenceStatus(_))
+            .WillOnce(Return(FenceStatus::TIMEOUT_EXPIRED))      // Replacement allocation's fence
+            .WillOnce(Return(FenceStatus::CONDITION_SATISFIED)); // Initial allocation's fence
+    mUboManager.beginFrame(mDriverApi);
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.unmanageMaterialInstance(instance);
+    mUboManager.terminate(mDriverApi);
+
+    mEngine->destroy(instance);
+}
+
+// Checks that managing the same instance twice leaves exactly one entry in the pending set.
+TEST_F(UboManagerTest, DoubleManage) {
+    auto mi1 = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+    mUboManager.manageMaterialInstance(mi1);
+    // Unsigned literal: size() is presumably a size_t, and comparing it against a plain int
+    // triggers -Wsign-compare inside the gtest comparison helper.
+    EXPECT_EQ(mPendingInstances.size(), 1u);
+
+    // Managing the same instance again should be a no-op.
+    mUboManager.manageMaterialInstance(mi1);
+    EXPECT_EQ(mPendingInstances.size(), 1u);
+
+    // NOTE(review): mi1 is still pending here and is never unmanaged before being destroyed;
+    // confirm that terminate() makes that safe.
+    mUboManager.terminate(mDriverApi);
+    mEngine->destroy(mi1);
+}
+
+// Checks that an instance managed and then unmanaged before any frame starts ends up in neither
+// the pending nor the managed set, and is left without an allocation.
+TEST_F(UboManagerTest, ManageAndUnmanageBeforeBeginFrame) {
+    auto* const instance = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+
+    // Managing puts the instance in the pending set ...
+    mUboManager.manageMaterialInstance(instance);
+    EXPECT_TRUE(mPendingInstances.contains(instance));
+
+    // ... and unmanaging takes it out again.
+    mUboManager.unmanageMaterialInstance(instance);
+    EXPECT_FALSE(mPendingInstances.contains(instance));
+
+    // Starting a frame must not bring the instance back into either set or allocate for it.
+    mUboManager.beginFrame(mDriverApi);
+    EXPECT_FALSE(mPendingInstances.contains(instance));
+    EXPECT_FALSE(mManagedInstances.contains(instance));
+    EXPECT_EQ(instance->getAllocationId(), BufferAllocator::UNALLOCATED);
+
+    mUboManager.terminate(mDriverApi);
+    mEngine->destroy(instance);
+}
+
+// Checks that unmanaging an instance that was never handed to the manager is harmless.
+TEST_F(UboManagerTest, UnmanageUnmanaged) {
+    auto* const stranger = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+
+    // The manager has never seen this instance; the call must simply do nothing observable.
+    mUboManager.unmanageMaterialInstance(stranger);
+    EXPECT_FALSE(mPendingInstances.contains(stranger));
+    EXPECT_FALSE(mManagedInstances.contains(stranger));
+
+    mUboManager.terminate(mDriverApi);
+    mEngine->destroy(stranger);
+}
+
+// Checks that after one complete frame every managed instance's allocation is GPU-locked.
+TEST_F(UboManagerTest, AllAllocationsLockedAfterEndFrame) {
+    constexpr size_t kInstanceCount = 5;
+    std::vector<FMaterialInstance*> instances;
+    instances.reserve(kInstanceCount);
+
+    // Create the instances and hand each one to the manager as soon as it exists.
+    while (instances.size() < kInstanceCount) {
+        auto* const created = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+        instances.push_back(created);
+        mUboManager.manageMaterialInstance(created);
+    }
+
+    // Run one full frame.
+    mUboManager.beginFrame(mDriverApi);
+    mUboManager.finishBeginFrame(mDriverApi);
+    mUboManager.endFrame(mDriverApi);
+
+    for (FMaterialInstance const* instance : instances) {
+        EXPECT_TRUE(mAllocator.isLockedByGpu(instance->getAllocationId()));
+    }
+
+    mUboManager.terminate(mDriverApi);
+    for (FMaterialInstance* instance : instances) {
+        // This test drives its own UboManager rather than the one inside mEngine, so the
+        // instance has to be unmanaged explicitly before it is destroyed.
+        mUboManager.unmanageMaterialInstance(instance);
+        mEngine->destroy(instance);
+    }
+}
+
+// Same as AllAllocationsLockedAfterEndFrame, except that one instance in the middle is given a
+// REALLOCATION_REQUIRED id before endFrame; every instance that still has a valid id must be locked.
+TEST_F(UboManagerTest, AllAllocationsLockedAfterEndFrameWithInvalidIdInBetween) {
+    constexpr size_t kInstanceCount = 5;
+    std::vector<FMaterialInstance*> instances;
+    instances.reserve(kInstanceCount);
+
+    // Create the instances and hand each one to the manager as soon as it exists.
+    while (instances.size() < kInstanceCount) {
+        auto* const created = static_cast<FMaterialInstance*>(mMaterial->createInstance());
+        instances.push_back(created);
+        mUboManager.manageMaterialInstance(created);
+    }
+
+    mUboManager.beginFrame(mDriverApi);
+    mUboManager.finishBeginFrame(mDriverApi);
+
+    // Overwrite the middle instance's allocation id with REALLOCATION_REQUIRED before the frame
+    // ends. This should be rare in practice, but the other instances must still get locked.
+    instances[2]->assignUboAllocation(mUbHandle, BufferAllocator::REALLOCATION_REQUIRED, 0);
+    mUboManager.endFrame(mDriverApi);
+
+    for (FMaterialInstance const* instance : instances) {
+        if (!BufferAllocator::isValid(instance->getAllocationId())) {
+            continue; // Nothing to verify for an instance without a valid allocation.
+        }
+        EXPECT_TRUE(mAllocator.isLockedByGpu(instance->getAllocationId()));
+    }
+
+    mUboManager.terminate(mDriverApi);
+    for (FMaterialInstance* instance : instances) {
+        // This test drives its own UboManager rather than the one inside mEngine, so the
+        // instance has to be unmanaged explicitly before it is destroyed.
+        mUboManager.unmanageMaterialInstance(instance);
+        mEngine->destroy(instance);
+    }
+}
+
+// TODO: Add more tests for the beginFrame flow
--- a/filament/test/test_material.filamat
+++ b/filament/test/test_material.filamat
--- a/filament/test/test_material_transformname.filamat
+++ b/filament/test/test_material_transformname.filamat
--- a/libs/utils/include/utils/MonotonicRingMap.h
+++ b/libs/utils/include/utils/MonotonicRingMap.h
@@ -56,6 +56,9 @@ public:
    //! Returns true if the map is full.
    bool full() const noexcept { return mSize == N; }

+    //! Empties the map by resetting the element count and the head index to zero.
+    void clear() noexcept {
+        mHead = 0;
+        mSize = 0;
+    }
+
    /**
     * Inserts a new key-value pair.
     * The key must be greater than the key of the last inserted element.
@@ -65,7 +68,7 @@ public:
     */
    UTILS_NOINLINE void insert(key_type key, mapped_type value) {
        assert(empty() || key > back().first); // assert monotonic
-        if (full()) {
+        if (UTILS_LIKELY(full())) {
            // container is full, replace the oldest element
            mStorage[mHead] = { key, value };
            mHead = (mHead + 1) % N;
--- a/third_party/benchmark/.clang-format
+++ b/third_party/benchmark/.clang-format
@@ -1,5 +1,5 @@
 ---
 Language:        Cpp
 BasedOnStyle:  Google
+PointerAlignment: Left
 ...
-
--- a/third_party/benchmark/.clang-tidy
+++ b/third_party/benchmark/.clang-tidy
@@ -0,0 +1,37 @@
+---
+Checks: >
+  abseil-*,
+  bugprone-*,
+  clang-analyzer-*,
+  cppcoreguidelines-*,
+  google-*,
+  misc-*,
+  performance-*,
+  readability-*,
+  -clang-analyzer-deadcode*,
+  -clang-analyzer-optin*,
+  -readability-identifier-length
+WarningsAsErrors: ''
+HeaderFilterRegex: ''
+FormatStyle:     none
+CheckOptions:
+  llvm-else-after-return.WarnOnConditionVariables: 'false'
+  modernize-loop-convert.MinConfidence: reasonable
+  modernize-replace-auto-ptr.IncludeStyle: llvm
+  cert-str34-c.DiagnoseSignedUnsignedCharComparisons: 'false'
+  google-readability-namespace-comments.ShortNamespaceLines: '10'
+  cert-err33-c.CheckedFunctions: '::aligned_alloc;::asctime_s;::at_quick_exit;::atexit;::bsearch;::bsearch_s;::btowc;::c16rtomb;::c32rtomb;::calloc;::clock;::cnd_broadcast;::cnd_init;::cnd_signal;::cnd_timedwait;::cnd_wait;::ctime_s;::fclose;::fflush;::fgetc;::fgetpos;::fgets;::fgetwc;::fopen;::fopen_s;::fprintf;::fprintf_s;::fputc;::fputs;::fputwc;::fputws;::fread;::freopen;::freopen_s;::fscanf;::fscanf_s;::fseek;::fsetpos;::ftell;::fwprintf;::fwprintf_s;::fwrite;::fwscanf;::fwscanf_s;::getc;::getchar;::getenv;::getenv_s;::gets_s;::getwc;::getwchar;::gmtime;::gmtime_s;::localtime;::localtime_s;::malloc;::mbrtoc16;::mbrtoc32;::mbsrtowcs;::mbsrtowcs_s;::mbstowcs;::mbstowcs_s;::memchr;::mktime;::mtx_init;::mtx_lock;::mtx_timedlock;::mtx_trylock;::mtx_unlock;::printf_s;::putc;::putwc;::raise;::realloc;::remove;::rename;::scanf;::scanf_s;::setlocale;::setvbuf;::signal;::snprintf;::snprintf_s;::sprintf;::sprintf_s;::sscanf;::sscanf_s;::strchr;::strerror_s;::strftime;::strpbrk;::strrchr;::strstr;::strtod;::strtof;::strtoimax;::strtok;::strtok_s;::strtol;::strtold;::strtoll;::strtoul;::strtoull;::strtoumax;::strxfrm;::swprintf;::swprintf_s;::swscanf;::swscanf_s;::thrd_create;::thrd_detach;::thrd_join;::thrd_sleep;::time;::timespec_get;::tmpfile;::tmpfile_s;::tmpnam;::tmpnam_s;::tss_create;::tss_get;::tss_set;::ungetc;::ungetwc;::vfprintf;::vfprintf_s;::vfscanf;::vfscanf_s;::vfwprintf;::vfwprintf_s;::vfwscanf;::vfwscanf_s;::vprintf_s;::vscanf;::vscanf_s;::vsnprintf;::vsnprintf_s;::vsprintf;::vsprintf_s;::vsscanf;::vsscanf_s;::vswprintf;::vswprintf_s;::vswscanf;::vswscanf_s;::vwprintf_s;::vwscanf;::vwscanf_s;::wcrtomb;::wcschr;::wcsftime;::wcspbrk;::wcsrchr;::wcsrtombs;::wcsrtombs_s;::wcsstr;::wcstod;::wcstof;::wcstoimax;::wcstok;::wcstok_s;::wcstol;::wcstold;::wcstoll;::wcstombs;::wcstombs_s;::wcstoul;::wcstoull;::wcstoumax;::wcsxfrm;::wctob;::wctrans;::wctype;::wmemchr;::wprintf_s;::wscanf;::wscanf_s;'
+  cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField: 'false'
+  cert-dcl16-c.NewSuffixes: 'L;LL;LU;LLU'
+  google-readability-braces-around-statements.ShortStatementLines: '1'
+  cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic: 'true'
+  google-readability-namespace-comments.SpacesBeforeComments: '2'
+  modernize-loop-convert.MaxCopySize: '16'
+  modernize-pass-by-value.IncludeStyle: llvm
+  modernize-use-nullptr.NullMacros: 'NULL'
+  llvm-qualified-auto.AddConstToQualified: 'false'
+  modernize-loop-convert.NamingStyle: CamelCase
+  llvm-else-after-return.WarnOnUnfixable: 'false'
+  google-readability-function-size.StatementThreshold: '800'
+...
+
--- a/third_party/benchmark/.clang-tidy.ignore
+++ b/third_party/benchmark/.clang-tidy.ignore
@@ -0,0 +1 @@
+.*third_party/.*
--- a/third_party/benchmark/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/third_party/benchmark/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,32 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: "[BUG]"
+labels: ''
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**System**
+Which OS, compiler, and compiler version are you using:
+  - OS: 
+  - Compiler and version: 
+
+**To reproduce**
+Steps to reproduce the behavior:
+1. sync to commit ...
+2. cmake/bazel...
+3. make ...
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Additional context**
+Add any other context about the problem here.
--- a/third_party/benchmark/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/third_party/benchmark/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: "[FR]"
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
--- a/third_party/benchmark/.github/install_bazel.sh
+++ b/third_party/benchmark/.github/install_bazel.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# Downloads the Bazel 8.2.0 Linux release binary into $HOME/bin/bazel, unless a runnable
+# `bazel` is already available on PATH.
+
+# Stop at the first failing command so a failed download cannot go unnoticed.
+set -e
+
+if bazel version; then
+  # A working Bazel was found; nothing to install.
+  exit 0
+fi
+
+# The download URL uses "arm64" where `uname -m` reports "aarch64".
+arch=$(uname -m)
+if [ "$arch" = "aarch64" ]; then
+  arch="arm64"
+fi
+
+echo "Downloading $arch Bazel binary from GitHub releases."
+# -f makes curl fail on an HTTP error instead of saving the error page as the "binary".
+curl -fL -o "$HOME/bin/bazel" --create-dirs "https://github.com/bazelbuild/bazel/releases/download/8.2.0/bazel-8.2.0-linux-$arch"
+chmod +x "$HOME/bin/bazel"
--- a/third_party/benchmark/.github/libcxx-setup.sh
+++ b/third_party/benchmark/.github/libcxx-setup.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+# Fetches the parts of the LLVM 19.1.6 source tree needed for the runtimes build and builds
+# libc++ and libc++abi with Ninja in ./llvm-build.
+# Reads CC, CXX and LIBCXX_SANITIZER from the environment; BUILD_32_BITS is optional.
+
+# Abort on the first failing command.
+set -e
+
+# Checkout LLVM sources
+# (blob-less, depth-1 clone without a working tree; the sparse-checkout then limits the
+# checked-out directories to the ones listed below)
+git clone --filter=blob:none --depth=1 --branch llvmorg-19.1.6 --no-checkout https://github.com/llvm/llvm-project.git llvm-project
+cd llvm-project
+git sparse-checkout set --cone
+git checkout llvmorg-19.1.6
+git sparse-checkout set cmake llvm/cmake runtimes libcxx libcxxabi
+cd ..
+
+## Setup libc++ options
+# BUILD_32_BITS defaults to OFF when it is unset or empty.
+if [ -z "$BUILD_32_BITS" ]; then
+  export BUILD_32_BITS=OFF && echo disabling 32 bit build
+fi
+
+## Build libc++ and libc++abi in ./llvm-build.
+## NOTE: LIBCXX_ABI_UNSTABLE is OFF below, so the stable ABI is used, and no install step is
+## run even though CMAKE_INSTALL_PREFIX is set.
+mkdir llvm-build && cd llvm-build
+cmake -GNinja                                   \
+      -DCMAKE_C_COMPILER=${CC}                  \
+      -DCMAKE_CXX_COMPILER=${CXX}               \
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo         \
+      -DCMAKE_INSTALL_PREFIX=/usr               \
+      -DLIBCXX_ABI_UNSTABLE=OFF                 \
+      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER}  \
+      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS}     \
+      -DLIBCXXABI_USE_LLVM_UNWINDER=OFF         \
+      -DLLVM_INCLUDE_TESTS=OFF                  \
+      -DLIBCXX_INCLUDE_TESTS=OFF                \
+      -DLIBCXX_INCLUDE_BENCHMARKS=OFF           \
+      -DLLVM_ENABLE_RUNTIMES='libcxx;libcxxabi' \
+      ../llvm-project/runtimes/
+# Build only the cxx and cxxabi targets.
+cmake --build . -- cxx cxxabi
+cd ..
--- a/third_party/benchmark/.github/workflows/bazel.yml
+++ b/third_party/benchmark/.github/workflows/bazel.yml
@@ -0,0 +1,37 @@
+name: bazel
+
+on:
+  push: {}
+  pull_request: {}
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  build_and_test_default:
+    name: bazel.${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: mount bazel cache
+      uses: actions/cache@v4
+      env:
+        cache-name: bazel-cache
+      with:
+        path: "~/.cache/bazel"
+        key: ${{ env.cache-name }}-${{ matrix.os }}-${{ github.ref }}
+        restore-keys: |
+          ${{ env.cache-name }}-${{ matrix.os }}-main
+
+    - name: build
+      run: |
+        bazel build //:benchmark //:benchmark_main //test/...
+
+    - name: test
+      run: |
+        bazel test --test_output=all //test/...
--- a/third_party/benchmark/.github/workflows/build-and-test-min-cmake.yml
+++ b/third_party/benchmark/.github/workflows/build-and-test-min-cmake.yml
@@ -0,0 +1,49 @@
+name: build-and-test-min-cmake
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  job:
+    name: ${{ matrix.os }}.min-cmake
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: lukka/get-cmake@latest
+        with:
+          cmakeVersion: 3.13.0
+
+      - name: create build environment
+        run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+      - name: setup cmake initial cache
+        run: touch compiler-cache.cmake
+
+      - name: configure cmake
+        env:
+          CXX: ${{ matrix.compiler }}
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: >
+          cmake -C ${{ github.workspace }}/compiler-cache.cmake
+          $GITHUB_WORKSPACE
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DCMAKE_CXX_VISIBILITY_PRESET=hidden
+          -DCMAKE_VISIBILITY_INLINES_HIDDEN=ON
+
+      - name: build
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: cmake --build .
--- a/third_party/benchmark/.github/workflows/build-and-test-perfcounters.yml
+++ b/third_party/benchmark/.github/workflows/build-and-test-perfcounters.yml
@@ -0,0 +1,54 @@
+name: build-and-test-perfcounters
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  job:
+    # TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
+    name: ${{ matrix.os }}.${{ matrix.build_type }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        build_type: ['Release', 'Debug']
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: install libpfm
+      run: |
+        sudo apt update
+        sudo apt -y install libpfm4-dev
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: >
+        cmake $GITHUB_WORKSPACE
+        -DBENCHMARK_ENABLE_LIBPFM=1
+        -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+    - name: build
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake --build . --config ${{ matrix.build_type }}
+
+    # Skip testing, for now. It seems perf_event_open does not succeed on the
+    # hosting machine, very likely a permissions issue.
+    # TODO(mtrofin): Enable test.
+    # - name: test
+    #   shell: bash
+    #   working-directory: ${{ runner.workspace }}/_build
+    #   run: ctest -C ${{ matrix.build_type }} --rerun-failed --output-on-failure
+
--- a/third_party/benchmark/.github/workflows/build-and-test.yml
+++ b/third_party/benchmark/.github/workflows/build-and-test.yml
@@ -0,0 +1,151 @@
+name: build-and-test
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  # TODO: add 32-bit builds (g++ and clang++) for ubuntu
+  #   (requires g++-multilib and libc6:i386)
+  # TODO: add coverage build (requires lcov)
+  # TODO: add clang + libc++ builds for ubuntu
+  job:
+    name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.compiler }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-24.04, ubuntu-22.04, ubuntu-24.04-arm, macos-latest]
+        build_type: ['Release', 'Debug']
+        compiler: ['g++', 'clang++']
+        lib: ['shared', 'static']
+
+    steps:
+      - name: Install dependencies (macos)
+        if: runner.os == 'macOS'
+        run: brew install ninja
+
+      - uses: actions/checkout@v4
+
+      - name: build
+        uses: threeal/cmake-action@v2.1.0
+        with:
+          build-dir: ${{ runner.workspace }}/_build
+          cxx-compiler: ${{ matrix.compiler }}
+          options: |
+            BENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+            BUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+            CMAKE_BUILD_TYPE=${{ matrix.build_type }}
+            CMAKE_CXX_COMPILER=${{ matrix.compiler }}
+            CMAKE_CXX_VISIBILITY_PRESET=hidden
+            CMAKE_VISIBILITY_INLINES_HIDDEN=ON
+
+      - name: test
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: ctest -C ${{ matrix.build_type }} -VV
+
+  msvc:
+    name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.msvc }}
+    runs-on: ${{ matrix.os }}
+    defaults:
+        run:
+            shell: powershell
+    strategy:
+      fail-fast: false
+      matrix:
+        msvc:
+          - VS-16-2019
+          - VS-17-2022
+        build_type:
+          - Debug
+          - Release
+        lib:
+          - shared
+          - static
+        include:
+          - msvc: VS-16-2019
+            os: windows-2019
+            generator: 'Visual Studio 16 2019'
+          - msvc: VS-17-2022
+            os: windows-2022
+            generator: 'Visual Studio 17 2022'
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: lukka/get-cmake@latest
+
+      - name: configure cmake
+        run: >
+          cmake -S . -B ${{ runner.workspace }}/_build/
+          -G "${{ matrix.generator }}"
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+
+      - name: build
+        run: cmake --build ${{ runner.workspace }}/_build/ --config ${{ matrix.build_type }}
+
+      - name: test
+        run: ctest --test-dir ${{ runner.workspace }}/_build/ -C ${{ matrix.build_type }} -VV
+
+  msys2:
+    name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.msys2.msystem }}
+    runs-on: ${{ matrix.os }}
+    defaults:
+        run:
+            shell: msys2 {0}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ windows-latest ]
+        msys2:
+          - { msystem: MINGW64,    arch: x86_64,  family: GNU,  compiler: g++ }
+          - { msystem: CLANG64,    arch: x86_64,  family: LLVM, compiler: clang++ }
+          - { msystem: UCRT64,     arch: x86_64,  family: GNU,  compiler: g++ }
+        build_type:
+          - Debug
+          - Release
+        lib:
+          - shared
+          - static
+
+    steps:
+      - name: setup msys2
+        uses: msys2/setup-msys2@v2
+        with:
+          cache: false
+          msystem: ${{ matrix.msys2.msystem }}
+          update: true
+          install: >-
+            git
+            base-devel
+          pacboy: >-
+            gcc:p
+            clang:p
+            cmake:p
+            ninja:p
+
+      - uses: actions/checkout@v4
+
+      # NOTE: we can't use cmake actions here as we need to do everything in msys2 shell.
+      - name: configure cmake
+        env:
+          CXX: ${{ matrix.msys2.compiler }}
+        run: >
+          cmake -S . -B _build/
+          -GNinja
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+
+      - name: build
+        run: cmake --build _build/ --config ${{ matrix.build_type }}
+
+      - name: test
+        working-directory: _build
+        run: ctest -C ${{ matrix.build_type }} -VV
--- a/third_party/benchmark/.github/workflows/clang-format-lint.yml
+++ b/third_party/benchmark/.github/workflows/clang-format-lint.yml
@@ -0,0 +1,19 @@
+name: clang-format-lint
+on:
+  push: {}
+  pull_request: {}
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  job:
+    name: check-clang-format
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - uses: DoozyX/clang-format-lint-action@v0.18.2
+      with:
+        source: './include/benchmark ./src ./test'
+        clangFormatVersion: 18
--- a/third_party/benchmark/.github/workflows/clang-tidy-lint.yml
+++ b/third_party/benchmark/.github/workflows/clang-tidy-lint.yml
@@ -0,0 +1,41 @@
+name: clang-tidy
+
+on:
+  push: {}
+  pull_request: {}
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  job:
+    name: run-clang-tidy
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: install clang-tidy
+      run: sudo apt update && sudo apt -y install clang-tidy
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ github.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ github.workspace }}/_build
+      run: >
+        cmake $GITHUB_WORKSPACE
+        -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
+        -DBENCHMARK_ENABLE_LIBPFM=OFF
+        -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+        -DCMAKE_C_COMPILER=clang
+        -DCMAKE_CXX_COMPILER=clang++
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+        -DGTEST_COMPILE_COMMANDS=OFF
+
+    - name: run
+      shell: bash
+      working-directory: ${{ github.workspace }}/_build
+      run: run-clang-tidy -config-file=$GITHUB_WORKSPACE/.clang-tidy
--- a/third_party/benchmark/.github/workflows/doxygen.yml
+++ b/third_party/benchmark/.github/workflows/doxygen.yml
@@ -0,0 +1,31 @@
+name: doxygen
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  build-and-deploy:
+    name: Build HTML documentation
+    runs-on: ubuntu-latest
+    steps:
+    - name: Fetching sources
+      uses: actions/checkout@v4
+
+    - name: Installing build dependencies
+      run: |
+        sudo apt update
+        sudo apt install doxygen gcc git
+
+    - name: Creating build directory
+      run: mkdir build
+
+    - name: Building HTML documentation with Doxygen
+      run: |
+        cmake -S . -B build -DBENCHMARK_ENABLE_TESTING:BOOL=OFF -DBENCHMARK_ENABLE_DOXYGEN:BOOL=ON -DBENCHMARK_INSTALL_DOCS:BOOL=ON
+        cmake --build build --target benchmark_doxygen
--- a/third_party/benchmark/.github/workflows/pre-commit.yml
+++ b/third_party/benchmark/.github/workflows/pre-commit.yml
@@ -0,0 +1,41 @@
+name: python + Bazel pre-commit checks
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    env:
+      MYPY_CACHE_DIR: "${{ github.workspace }}/.cache/mypy"
+      RUFF_CACHE_DIR: "${{ github.workspace }}/.cache/ruff"
+      PRE_COMMIT_HOME: "${{ github.workspace }}/.cache/pre-commit"
+
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: 3.11
+        cache: pip
+        cache-dependency-path: pyproject.toml
+    - name: Install dependencies
+      run: python -m pip install ".[dev]"
+    - name: Cache pre-commit tools
+      uses: actions/cache@v4
+      with:
+        path: |
+          ${{ env.MYPY_CACHE_DIR }}
+          ${{ env.RUFF_CACHE_DIR }}
+          ${{ env.PRE_COMMIT_HOME }}
+        key: ${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }}-linter-cache
+    - name: Run pre-commit checks
+      run: pre-commit run --all-files --verbose --show-diff-on-failure
--- a/third_party/benchmark/.github/workflows/sanitizer.yml
+++ b/third_party/benchmark/.github/workflows/sanitizer.yml
@@ -0,0 +1,97 @@
+name: sanitizer
+
+on:
+  push: {}
+  pull_request: {}
+
+env:
+  CMAKE_GENERATOR: Ninja
+  UBSAN_OPTIONS: "print_stacktrace=1"
+
+jobs:
+  job:
+    name: ${{ matrix.sanitizer }}.${{ matrix.build_type }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: ['Debug', 'RelWithDebInfo']
+        sanitizer: ['asan', 'ubsan', 'tsan', 'msan']
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: configure msan env
+      if: matrix.sanitizer == 'msan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=MemoryWithOrigins" >> $GITHUB_ENV
+
+    - name: configure ubsan env
+      if: matrix.sanitizer == 'ubsan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Undefined" >> $GITHUB_ENV
+
+    - name: configure asan env
+      if: matrix.sanitizer == 'asan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=address -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Address" >> $GITHUB_ENV
+
+    - name: configure tsan env
+      if: matrix.sanitizer == 'tsan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Thread" >> $GITHUB_ENV
+
+    - name: fine-tune asan options
+      # in asan we get an error from std::regex. ignore it.
+      if: matrix.sanitizer == 'asan'
+      run: |
+        echo "ASAN_OPTIONS=alloc_dealloc_mismatch=0" >> $GITHUB_ENV
+
+    - name: setup clang
+      uses: egor-tensin/setup-clang@v1
+      with:
+        version: latest
+        platform: x64
+
+    - name: configure clang
+      run: |
+        echo "CC=cc" >> $GITHUB_ENV
+        echo "CXX=c++" >> $GITHUB_ENV
+
+    # Builds libc++ with the setup script, then exports the compiler/linker flags that point
+    # at the freshly built library and its headers in llvm-build.
+    - name: build libc++ (non-asan)
+      if: matrix.sanitizer != 'asan'
+      run: |
+        "${GITHUB_WORKSPACE}/.github/libcxx-setup.sh"
+        echo "EXTRA_CXX_FLAGS=-stdlib=libc++ -L${GITHUB_WORKSPACE}/llvm-build/lib -lc++abi -I${GITHUB_WORKSPACE}/llvm-build/include/c++/v1 -isystem ${GITHUB_WORKSPACE}/llvm-build/include/c++/v1 -Wl,-rpath,${GITHUB_WORKSPACE}/llvm-build/lib" >> $GITHUB_ENV
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: >
+        VERBOSE=1
+        cmake -GNinja $GITHUB_WORKSPACE
+        -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
+        -DBENCHMARK_ENABLE_LIBPFM=OFF
+        -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+        -DCMAKE_C_COMPILER=${{ env.CC }}
+        -DCMAKE_CXX_COMPILER=${{ env.CXX }}
+        -DCMAKE_C_FLAGS="${{ env.EXTRA_FLAGS }}"
+        -DCMAKE_CXX_FLAGS="${{ env.EXTRA_FLAGS }} ${{ env.EXTRA_CXX_FLAGS }}"
+        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+    - name: build
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake --build . --config ${{ matrix.build_type }}
+
+    - name: test
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: ctest -C ${{ matrix.build_type }} -VV
--- a/third_party/benchmark/.github/workflows/test_bindings.yml
+++ b/third_party/benchmark/.github/workflows/test_bindings.yml
@@ -0,0 +1,33 @@
+name: test-bindings
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  python_bindings:
+    name: Test GBM Python ${{ matrix.python-version }} bindings on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ ubuntu-latest, macos-latest, windows-latest ]
+        python-version: [ "3.10", "3.11", "3.12", "3.13" ]
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install GBM Python bindings on ${{ matrix.os }}
+        run: python -m pip install .
+      - name: Run example on ${{ matrix.os }} under Python ${{ matrix.python-version }}
+        run: python bindings/python/google_benchmark/example.py
--- a/third_party/benchmark/.github/workflows/wheels.yml
+++ b/third_party/benchmark/.github/workflows/wheels.yml
@@ -0,0 +1,83 @@
+name: Build and upload Python wheels
+
+on:
+  workflow_dispatch:
+  release:
+    types:
+      - published
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  build_sdist:
+    name: Build source distribution
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Install Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - run: python -m pip install build
+      - name: Build sdist
+        run: python -m build --sdist
+      - uses: actions/upload-artifact@v4
+        with:
+          name: dist-sdist
+          path: dist/*.tar.gz
+
+  build_wheels:
+    name: Build Google Benchmark wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, ubuntu-24.04-arm, macos-13, macos-14, windows-latest]
+    steps:
+      - name: Check out Google Benchmark
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - uses: actions/setup-python@v5
+        name: Install Python 3.12
+        with:
+          python-version: "3.12"
+      - run: pip install --upgrade pip uv
+
+      - name: Build wheels on ${{ matrix.os }} using cibuildwheel
+        uses: pypa/cibuildwheel@v2.23.2
+        env:
+          CIBW_BUILD: "cp310-* cp311-* cp312-*"
+          CIBW_BUILD_FRONTEND: "build[uv]"
+          CIBW_SKIP: "*-musllinux_*"
+          CIBW_ARCHS: auto64
+          CIBW_BEFORE_ALL_LINUX: bash .github/install_bazel.sh
+          # Grab the rootless Bazel installation inside the container.
+          CIBW_ENVIRONMENT_LINUX: PATH=$PATH:$HOME/bin
+          CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
+          # unused by Bazel, but needed explicitly by delocate on MacOS.
+          MACOSX_DEPLOYMENT_TARGET: "10.14"
+
+      - name: Upload Google Benchmark ${{ matrix.os }} wheels
+        uses: actions/upload-artifact@v4
+        with:
+          name: dist-${{ matrix.os }}
+          path: wheelhouse/*.whl
+
+  pypi_upload:
+    name: Publish google-benchmark wheels to PyPI
+    needs: [build_sdist, build_wheels]
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          path: dist
+          pattern: dist-*
+          merge-multiple: true
+      - uses: pypa/gh-action-pypi-publish@release/v1
--- a/third_party/benchmark/.gitignore
+++ b/third_party/benchmark/.gitignore
@@ -8,8 +8,10 @@
 !/cmake/*.cmake
 !/test/AssemblyTests.cmake
 *~
+*.swp
 *.pyc
 __pycache__
+.DS_Store

 # lcov
 *.lcov
@@ -44,6 +46,7 @@ rules.ninja

 # bazel output symlinks.
 bazel-*
+MODULE.bazel.lock

 # out-of-source build top-level folders.
 build/
@@ -56,3 +59,10 @@ build*/
 # Visual Studio 2015/2017 cache/options directory
 .vs/
 CMakeSettings.json
+
+# Visual Studio Code cache/options directory
+.vscode/
+
+# Python build stuff
+dist/
+*.egg-info*
--- a/third_party/benchmark/.pre-commit-config.yaml
+++ b/third_party/benchmark/.pre-commit-config.yaml
@@ -0,0 +1,18 @@
+repos:
+  -   repo: https://github.com/keith/pre-commit-buildifier
+      rev: 8.0.3
+      hooks:
+      -   id: buildifier
+      -   id: buildifier-lint
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.15.0
+    hooks:
+      - id: mypy
+        types_or: [ python, pyi ]
+        args: [ "--ignore-missing-imports", "--scripts-are-modules" ]
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.8
+    hooks:
+      - id: ruff
+        args: [ --fix, --exit-non-zero-on-fix ]
+      - id: ruff-format
--- a/third_party/benchmark/.travis-libcxx-setup.sh
+++ b/third_party/benchmark/.travis-libcxx-setup.sh
@@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-
-# Install a newer CMake version
-curl -sSL https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh -o install-cmake.sh
-chmod +x install-cmake.sh
-sudo ./install-cmake.sh --prefix=/usr/local --skip-license
-
-# Checkout LLVM sources
-git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source
-git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx
-git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi
-
-# Setup libc++ options
-if [ -z "$BUILD_32_BITS" ]; then
-  export BUILD_32_BITS=OFF && echo disabling 32 bit build
-fi
-
-# Build and install libc++ (Use unstable ABI for better sanitizer coverage)
-mkdir llvm-build && cd llvm-build
-cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \
-      -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \
-      -DLIBCXX_ABI_UNSTABLE=ON \
-      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
-      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
-      ../llvm-source
-make cxx -j2
-sudo make install-cxxabi install-cxx
-cd ../
--- a/third_party/benchmark/.travis.yml
+++ b/third_party/benchmark/.travis.yml
@@ -1,199 +0,0 @@
-sudo: required
-dist: trusty
-language: cpp
-
-env:
-  global:
-    - /usr/local/bin:$PATH
-
-matrix:
-  include:
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - lcov
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage
-    - compiler: gcc
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug
-    - compiler: gcc
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - g++-multilib
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug BUILD_32_BITS=ON
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - g++-multilib
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release BUILD_32_BITS=ON
-    - compiler: gcc
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=g++-6 C_COMPILER=gcc-6  BUILD_TYPE=Debug
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
-    - compiler: clang
-      env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug
-    - compiler: clang
-      env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release
-    # Clang w/ libc++
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1
-        - EXTRA_FLAGS="-stdlib=libc++"
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
-        - LIBCXX_BUILD=1
-        - EXTRA_FLAGS="-stdlib=libc++"
-    # Clang w/ 32bit libc++
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            - clang-3.8
-            - g++-multilib
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-stdlib=libc++ -m32"
-    # Clang w/ 32bit libc++
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            - clang-3.8
-            - g++-multilib
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
-        - LIBCXX_BUILD=1
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-stdlib=libc++ -m32"
-    # Clang w/ libc++, ASAN, UBSAN
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address"
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all"
-        - UBSAN_OPTIONS=print_stacktrace=1
-    # Clang w/ libc++ and MSAN
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins"
-    # Clang w/ libc++ and MSAN
-    - compiler: clang
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
-        - COMPILER=clang++ BUILD_TYPE=Debug
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
-        - COMPILER=clang++ BUILD_TYPE=Release
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
-        - COMPILER=clang++ BUILD_TYPE=Release BUILD_32_BITS=ON
-    - os: osx
-      osx_image: xcode8.3
-      compiler: gcc
-      env:
-        - COMPILER=g++-7 C_COMPILER=gcc-7  BUILD_TYPE=Debug
-
-before_script:
-  - if [ -n "${LIBCXX_BUILD}" ]; then
-      source .travis-libcxx-setup.sh;
-    fi
-  - if [ -n "${ENABLE_SANITIZER}" ]; then
-      export EXTRA_OPTIONS="-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF";
-    else
-      export EXTRA_OPTIONS="";
-    fi
-  - mkdir -p build && cd build
-
-before_install:
-  - if [ -z "$BUILD_32_BITS" ]; then
-      export BUILD_32_BITS=OFF && echo disabling 32 bit build;
-    fi
-  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
-      sudo add-apt-repository -y "ppa:ubuntu-toolchain-r/test";
-      sudo apt-get update --option Acquire::Retries=100 --option Acquire::http::Timeout="60";
-    fi
-
-install:
-  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
-      travis_wait sudo -E apt-get -yq --no-install-suggests --no-install-recommends install g++-6;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "linux" -a "${BUILD_32_BITS}" == "OFF" ]; then
-      travis_wait sudo -E apt-get -y --no-install-suggests --no-install-recommends install llvm-3.9-tools;
-      sudo cp /usr/lib/llvm-3.9/bin/FileCheck /usr/local/bin/;
-    fi
-  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
-      PATH=~/.local/bin:${PATH};
-      pip install --user --upgrade pip;
-      travis_wait pip install --user cpp-coveralls;
-    fi
-  - if [ "${C_COMPILER}" == "gcc-7" -a "${TRAVIS_OS_NAME}" == "osx" ]; then
-      rm -f /usr/local/include/c++;
-      brew update;
-      travis_wait brew install gcc@7;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "linux" ]; then
-      sudo apt-get update -qq;
-      sudo apt-get install -qq unzip;
-      wget https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-linux-x86_64.sh --output-document bazel-installer.sh;
-      travis_wait sudo bash bazel-installer.sh;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
-      curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-darwin-x86_64.sh;
-      travis_wait sudo bash bazel-installer.sh;
-    fi
-
-script:
-  - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS}" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ${EXTRA_OPTIONS} ..
-  - make
-  - ctest -C ${BUILD_TYPE} --output-on-failure
-  - bazel test -c dbg --define google_benchmark.have_regex=posix --announce_rc --verbose_failures --test_output=errors --keep_going //test/...
-
-after_success:
-  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
-      coveralls --include src --include include --gcov-options '\-lp' --root .. --build-root .;
-    fi
--- a/third_party/benchmark/.ycm_extra_conf.py
+++ b/third_party/benchmark/.ycm_extra_conf.py
@@ -1,25 +1,30 @@
 import os
+
 import ycm_core

 # These are the compilation flags that will be used in case there's no
 # compilation database set (by default, one is not set).
 # CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR.
 flags = [
-'-Wall',
-'-Werror',
-'-pedantic-errors',
-'-std=c++0x',
-'-fno-strict-aliasing',
-'-O3',
-'-DNDEBUG',
-# ...and the same thing goes for the magic -x option which specifies the
-# language that the files to be compiled are written in. This is mostly
-# relevant for c++ headers.
-# For a C project, you would set this to 'c' instead of 'c++'.
-'-x', 'c++',
-'-I', 'include',
-'-isystem', '/usr/include',
-'-isystem', '/usr/local/include',
+    "-Wall",
+    "-Werror",
+    "-pedantic-errors",
+    "-std=c++0x",
+    "-fno-strict-aliasing",
+    "-O3",
+    "-DNDEBUG",
+    # ...and the same thing goes for the magic -x option which specifies the
+    # language that the files to be compiled are written in. This is mostly
+    # relevant for c++ headers.
+    # For a C project, you would set this to 'c' instead of 'c++'.
+    "-x",
+    "c++",
+    "-I",
+    "include",
+    "-isystem",
+    "/usr/include",
+    "-isystem",
+    "/usr/local/include",
 ]


@@ -29,87 +34,87 @@ flags = [
 #
 # Most projects will NOT need to set this to anything; you can just change the
 # 'flags' list of compilation flags. Notice that YCM itself uses that approach.
-compilation_database_folder = ''
+compilation_database_folder = ""

-if os.path.exists( compilation_database_folder ):
-  database = ycm_core.CompilationDatabase( compilation_database_folder )
+if os.path.exists(compilation_database_folder):
+    database = ycm_core.CompilationDatabase(compilation_database_folder)
 else:
-  database = None
+    database = None
+
+SOURCE_EXTENSIONS = [".cc"]

-SOURCE_EXTENSIONS = [ '.cc' ]

 def DirectoryOfThisScript():
-  return os.path.dirname( os.path.abspath( __file__ ) )
+    return os.path.dirname(os.path.abspath(__file__))


-def MakeRelativePathsInFlagsAbsolute( flags, working_directory ):
-  if not working_directory:
-    return list( flags )
-  new_flags = []
-  make_next_absolute = False
-  path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ]
-  for flag in flags:
-    new_flag = flag
+def MakeRelativePathsInFlagsAbsolute(flags, working_directory):
+    if not working_directory:
+        return list(flags)
+    new_flags = []
+    make_next_absolute = False
+    path_flags = ["-isystem", "-I", "-iquote", "--sysroot="]
+    for flag in flags:
+        new_flag = flag

-    if make_next_absolute:
-      make_next_absolute = False
-      if not flag.startswith( '/' ):
-        new_flag = os.path.join( working_directory, flag )
+        if make_next_absolute:
+            make_next_absolute = False
+            if not flag.startswith("/"):
+                new_flag = os.path.join(working_directory, flag)

-    for path_flag in path_flags:
-      if flag == path_flag:
-        make_next_absolute = True
-        break
+        for path_flag in path_flags:
+            if flag == path_flag:
+                make_next_absolute = True
+                break

-      if flag.startswith( path_flag ):
-        path = flag[ len( path_flag ): ]
-        new_flag = path_flag + os.path.join( working_directory, path )
-        break
+            if flag.startswith(path_flag):
+                path = flag[len(path_flag) :]
+                new_flag = path_flag + os.path.join(working_directory, path)
+                break

-    if new_flag:
-      new_flags.append( new_flag )
-  return new_flags
+        if new_flag:
+            new_flags.append(new_flag)
+    return new_flags


-def IsHeaderFile( filename ):
-  extension = os.path.splitext( filename )[ 1 ]
-  return extension in [ '.h', '.hxx', '.hpp', '.hh' ]
+def IsHeaderFile(filename):
+    extension = os.path.splitext(filename)[1]
+    return extension in [".h", ".hxx", ".hpp", ".hh"]


-def GetCompilationInfoForFile( filename ):
-  # The compilation_commands.json file generated by CMake does not have entries
-  # for header files. So we do our best by asking the db for flags for a
-  # corresponding source file, if any. If one exists, the flags for that file
-  # should be good enough.
-  if IsHeaderFile( filename ):
-    basename = os.path.splitext( filename )[ 0 ]
-    for extension in SOURCE_EXTENSIONS:
-      replacement_file = basename + extension
-      if os.path.exists( replacement_file ):
-        compilation_info = database.GetCompilationInfoForFile(
-          replacement_file )
-        if compilation_info.compiler_flags_:
-          return compilation_info
-    return None
-  return database.GetCompilationInfoForFile( filename )
+def GetCompilationInfoForFile(filename):
+    # The compile_commands.json file generated by CMake does not have
+    # entries for header files. So we do our best by asking the db for flags for
+    # a corresponding source file, if any. If one exists, the flags for that
+    # file should be good enough.
+    if IsHeaderFile(filename):
+        basename = os.path.splitext(filename)[0]
+        for extension in SOURCE_EXTENSIONS:
+            replacement_file = basename + extension
+            if os.path.exists(replacement_file):
+                compilation_info = database.GetCompilationInfoForFile(
+                    replacement_file
+                )
+                if compilation_info.compiler_flags_:
+                    return compilation_info
+        return None
+    return database.GetCompilationInfoForFile(filename)


-def FlagsForFile( filename, **kwargs ):
-  if database:
-    # Bear in mind that compilation_info.compiler_flags_ does NOT return a
-    # python list, but a "list-like" StringVec object
-    compilation_info = GetCompilationInfoForFile( filename )
-    if not compilation_info:
-      return None
+def FlagsForFile(filename, **kwargs):
+    if database:
+        # Bear in mind that compilation_info.compiler_flags_ does NOT return a
+        # python list, but a "list-like" StringVec object
+        compilation_info = GetCompilationInfoForFile(filename)
+        if not compilation_info:
+            return None

-    final_flags = MakeRelativePathsInFlagsAbsolute(
-      compilation_info.compiler_flags_,
-      compilation_info.compiler_working_dir_ )
-  else:
-    relative_to = DirectoryOfThisScript()
-    final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to )
+        final_flags = MakeRelativePathsInFlagsAbsolute(
+            compilation_info.compiler_flags_,
+            compilation_info.compiler_working_dir_,
+        )
+    else:
+        relative_to = DirectoryOfThisScript()
+        final_flags = MakeRelativePathsInFlagsAbsolute(flags, relative_to)

-  return {
-    'flags': final_flags,
-    'do_cache': True
-  }
+    return {"flags": final_flags, "do_cache": True}
--- a/third_party/benchmark/AUTHORS
+++ b/third_party/benchmark/AUTHORS
@@ -9,40 +9,64 @@
 # Please keep the list sorted.

 Albert Pretorius <pretoalb@gmail.com>
+Alex Steele <steeleal123@gmail.com>
+Andriy Berestovskyy <berestovskyy@gmail.com>
 Arne Beer <arne@twobeer.de>
 Carto
+Cezary Skrzyński <czars1988@gmail.com>
+Christian Wassermann <christian_wassermann@web.de>
 Christopher Seymour <chris.j.seymour@hotmail.com>
+Colin Braley <braley.colin@gmail.com>
+Daniel Harvey <danielharvey458@gmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
 Deniz Evrenci <denizevrenci@gmail.com>
 Dirac Research 
 Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Dominik Korman <kormandominik@gmail.com>
+Donald Aingworth <donalds_junk_mail@yahoo.com>
+Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
+Fabien Pichot <pichot.fabien@gmail.com>
 Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Gergely Meszaros <maetveis@gmail.com>
+Gergő Szitár <szitar.gergo@gmail.com>
 Google Inc.
+Henrique Bucher <hbucher@gmail.com>
 International Business Machines Corporation
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
 Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
+Jordan Williams <jwillikers@protonmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kaito Udagawa <umireon@gmail.com>
 Kishan Kumar <kumar.kishan@outlook.com>
 Lei Xu <eddyxu@gmail.com>
+Marcel Jacobse <mjacobse@uni-bremen.de>
 Matt Clarkson <mattyclarkson@gmail.com>
 Maxim Vafin <maxvafin@gmail.com>
+Mike Apodaca <gatorfax@gmail.com>
+Min-Yih Hsu <yihshyng223@gmail.com>
 MongoDB Inc.
 Nick Hutchinson <nshutchinson@gmail.com>
+Norman Heino <norman.heino@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Ori Livneh <ori.livneh@gmail.com>
 Paul Redmond <paul.redmond@gmail.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
+Raghu Raja <raghu@enfabrica.net>
+Rainer Orth <ro@cebitec.uni-bielefeld.de>
 Roman Lebedev <lebedev.ri@gmail.com>
+Sayan Bhattacharjee <aero.sayan@gmail.com>
+Shapr3D <google-contributors@shapr3d.com>
 Shuo Chen <chenshuo@chenshuo.com>
+Staffan Tjernstrom <staffantj@gmail.com>
 Steinar H. Gunderson <sgunderson@bigfoot.com>
 Stripe, Inc.
+Tobias Schmidt <tobias.schmidt@in.tum.de>
 Yixuan Qiu <yixuanq@gmail.com>
 Yusuke Suzuki <utatane.tea@gmail.com>
 Zbigniew Skowron <zbychs@gmail.com>
--- a/third_party/benchmark/BUILD.bazel
+++ b/third_party/benchmark/BUILD.bazel
@@ -1,9 +1,38 @@
+load("@rules_cc//cc:defs.bzl", "cc_library")
+
 licenses(["notice"])

+COPTS = [
+    "-pedantic",
+    "-pedantic-errors",
+    "-std=c++17",
+    "-Wall",
+    "-Wconversion",
+    "-Wextra",
+    "-Wshadow",
+    #    "-Wshorten-64-to-32",
+    "-Wfloat-equal",
+    "-fstrict-aliasing",
+    ## assert() is used a lot in tests upstream and may be optimised out, leading to
+    ## unused-variable warnings.
+    "-Wno-unused-variable",
+    "-Werror=old-style-cast",
+]
+
+MSVC_COPTS = [
+    "/std:c++17",
+]
+
 config_setting(
    name = "windows",
-    values = {
-        "cpu": "x64_windows",
+    constraint_values = ["@platforms//os:windows"],
+    visibility = [":__subpackages__"],
+)
+
+config_setting(
+    name = "perfcounters",
+    define_values = {
+        "pfm": "1",
    },
    visibility = [":__subpackages__"],
 )
@@ -17,20 +46,51 @@ cc_library(
        ],
        exclude = ["src/benchmark_main.cc"],
    ),
-    hdrs = ["include/benchmark/benchmark.h"],
+    hdrs = [
+        "include/benchmark/benchmark.h",
+        "include/benchmark/export.h",
+    ],
+    copts = select({
+        ":windows": MSVC_COPTS,
+        "//conditions:default": COPTS,
+    }),
+    defines = [
+        "BENCHMARK_STATIC_DEFINE",
+        "BENCHMARK_VERSION=\\\"" + (module_version() if module_version() != None else "") + "\\\"",
+    ] + select({
+        ":perfcounters": ["HAVE_LIBPFM"],
+        "//conditions:default": [],
+    }),
+    includes = ["include"],
    linkopts = select({
        ":windows": ["-DEFAULTLIB:shlwapi.lib"],
        "//conditions:default": ["-pthread"],
    }),
-    strip_include_prefix = "include",
+    # Only static linking is allowed; no .so will be produced.
+    # Using `defines` (i.e. not `local_defines`) means that no
+    # dependent rules need to bother about defining the macro.
+    linkstatic = True,
+    local_defines = [
+        # Turn on Large-file Support
+        "_FILE_OFFSET_BITS=64",
+        "_LARGEFILE64_SOURCE",
+        "_LARGEFILE_SOURCE",
+    ],
    visibility = ["//visibility:public"],
+    deps = select({
+        ":perfcounters": ["@libpfm"],
+        "//conditions:default": [],
+    }),
 )

 cc_library(
    name = "benchmark_main",
    srcs = ["src/benchmark_main.cc"],
-    hdrs = ["include/benchmark/benchmark.h"],
-    strip_include_prefix = "include",
+    hdrs = [
+        "include/benchmark/benchmark.h",
+        "include/benchmark/export.h",
+    ],
+    includes = ["include"],
    visibility = ["//visibility:public"],
    deps = [":benchmark"],
 )
--- a/third_party/benchmark/CMakeLists.txt
+++ b/third_party/benchmark/CMakeLists.txt
@@ -1,27 +1,34 @@
-cmake_minimum_required (VERSION 2.8.12)
+# Require CMake 3.13. If available, use the policies up to CMake 3.22.
+cmake_minimum_required (VERSION 3.13...3.22)

-project (benchmark)
-
-foreach(p
-    CMP0054 # CMake 3.1
-    CMP0056 # export EXE_LINKER_FLAGS to try_run
-    CMP0057 # Support no if() IN_LIST operator
-    )
-  if(POLICY ${p})
-    cmake_policy(SET ${p} NEW)
-  endif()
-endforeach()
+project (benchmark VERSION 1.9.4 LANGUAGES CXX)

 option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
 option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
 option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
 option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
-if(NOT MSVC)
+option(BENCHMARK_ENABLE_WERROR "Build Release candidates with -Werror." ON)
+option(BENCHMARK_FORCE_WERROR "Build Release candidates with -Werror regardless of compiler issues." OFF)
+
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")
+  # PGC++ may be reporting false positives.
+  set(BENCHMARK_ENABLE_WERROR OFF)
+endif()
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "NVHPC")
+  set(BENCHMARK_ENABLE_WERROR OFF)
+endif()
+if(BENCHMARK_FORCE_WERROR)
+  set(BENCHMARK_ENABLE_WERROR ON)
+endif(BENCHMARK_FORCE_WERROR)
+
+if(NOT (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC"))
  option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
 else()
  set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE)
 endif()
 option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON)
+option(BENCHMARK_ENABLE_DOXYGEN "Build documentation with Doxygen." OFF)
+option(BENCHMARK_INSTALL_DOCS "Enable installation of documentation." ON)

 # Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which
 # may require downloading the source code.
@@ -30,6 +37,24 @@ option(BENCHMARK_DOWNLOAD_DEPENDENCIES "Allow the downloading and in-tree buildi
 # This option can be used to disable building and running unit tests which depend on gtest
 # in cases where it is not possible to build or find a valid version of gtest.
 option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON)
+option(BENCHMARK_USE_BUNDLED_GTEST "Use bundled GoogleTest. If disabled, the find_package(GTest) will be used." ON)
+
+option(BENCHMARK_ENABLE_LIBPFM "Enable performance counters provided by libpfm" OFF)
+
+# Export only public symbols
+set(CMAKE_CXX_VISIBILITY_PRESET hidden)
+set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
+
+if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    # As of CMake 3.18, CMAKE_SYSTEM_PROCESSOR is not set properly for MSVC and
+    # cross-compilation (e.g. Host=x86_64, target=aarch64) requires using the
+    # undocumented, but working variable.
+    # See https://gitlab.kitware.com/cmake/cmake/-/issues/15170
+    set(CMAKE_SYSTEM_PROCESSOR ${MSVC_CXX_ARCHITECTURE_ID})
+    if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ARM")
+      set(CMAKE_CROSSCOMPILING TRUE)
+    endif()
+endif()

 set(ENABLE_ASSEMBLY_TESTS_DEFAULT OFF)
 function(should_enable_assembly_tests)
@@ -41,7 +66,7 @@ function(should_enable_assembly_tests)
      return()
    endif()
  endif()
-  if (MSVC)
+  if (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC")
    return()
  elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
    return()
@@ -77,29 +102,63 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 include(GetGitVersion)
 get_git_version(GIT_VERSION)

+# If no git version can be determined, use the version
+# from the project() command
+if ("${GIT_VERSION}" STREQUAL "v0.0.0")
+  set(VERSION "v${benchmark_VERSION}")
+else()
+  set(VERSION "${GIT_VERSION}")
+endif()
+
+# Normalize version: drop "v" prefix, replace first "-" with ".",
+# drop everything after second "-" (including said "-").
+string(STRIP ${VERSION} VERSION)
+if(VERSION MATCHES v[^-]*-)
+   string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2"  NORMALIZED_VERSION ${VERSION})
+else()
+   string(REGEX REPLACE "v(.*)" "\\1" NORMALIZED_VERSION ${VERSION})
+endif()
+
 # Tell the user what versions we are using
-string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION})
-message(STATUS "Version: ${VERSION}")
+message(STATUS "Google Benchmark version: ${VERSION}, normalized to ${NORMALIZED_VERSION}")

 # The version of the libraries
-set(GENERIC_LIB_VERSION ${VERSION})
-string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION)
+set(GENERIC_LIB_VERSION ${NORMALIZED_VERSION})
+string(SUBSTRING ${NORMALIZED_VERSION} 0 1 GENERIC_LIB_SOVERSION)

 # Import our CMake modules
-include(CheckCXXCompilerFlag)
 include(AddCXXCompilerFlag)
+include(CheckCXXCompilerFlag)
+include(CheckLibraryExists)
 include(CXXFeatureCheck)

+check_library_exists(rt shm_open "" HAVE_LIB_RT)
+
 if (BENCHMARK_BUILD_32_BITS)
  add_required_cxx_compiler_flag(-m32)
 endif()

+set(BENCHMARK_CXX_STANDARD 17)
+
+set(CMAKE_CXX_STANDARD ${BENCHMARK_CXX_STANDARD})
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
 if (MSVC)
  # Turn compiler warnings up to 11
  string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
+
+  # MP flag only applies to cl, not cl frontends to other compilers (e.g. clang-cl, icx-cl etc)
+  if(CMAKE_CXX_COMPILER_ID MATCHES MSVC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+  endif()
  add_definitions(-D_CRT_SECURE_NO_WARNINGS)

+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-WX)
+  endif()
+
  if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
    add_cxx_compiler_flag(-EHs-)
    add_cxx_compiler_flag(-EHa-)
@@ -126,45 +185,48 @@ if (MSVC)
    set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG")
  endif()
 else()
-  # Try and enable C++11. Don't use C++14 because it doesn't work in some
-  # configurations.
-  add_cxx_compiler_flag(-std=c++11)
-  if (NOT HAVE_CXX_FLAG_STD_CXX11)
-    add_cxx_compiler_flag(-std=c++0x)
-  endif()
-
+  # Turn on Large-file Support
+  add_definitions(-D_FILE_OFFSET_BITS=64)
+  add_definitions(-D_LARGEFILE64_SOURCE)
+  add_definitions(-D_LARGEFILE_SOURCE)
  # Turn compiler warnings up to 11
-  if (NOT MSVC)
-    add_cxx_compiler_flag(-Wall)
-    add_cxx_compiler_flag(-Wextra)
-    add_cxx_compiler_flag(-Wshadow)
-    add_cxx_compiler_flag(-Werror RELEASE)
-    add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
-    add_cxx_compiler_flag(-Werror MINSIZEREL)
-    add_cxx_compiler_flag(-pedantic)
-    add_cxx_compiler_flag(-pedantic-errors)
-    add_cxx_compiler_flag(-Wshorten-64-to-32)
-    add_cxx_compiler_flag(-fstrict-aliasing)
-    # Disable warnings regarding deprecated parts of the library while building
-    # and testing those parts of the library.
-    add_cxx_compiler_flag(-Wno-deprecated-declarations)
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-      # Intel silently ignores '-Wno-deprecated-declarations',
-      # warning no. 1786 must be explicitly disabled.
-      # See #631 for rationale.
-      add_cxx_compiler_flag(-wd1786)
-    endif()
-    # Disable deprecation warnings for release builds (when -Werror is enabled).
-    add_cxx_compiler_flag(-Wno-deprecated RELEASE)
-    add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO)
-    add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL)
-    if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
-      add_cxx_compiler_flag(-fno-exceptions)
-    endif()
+  add_cxx_compiler_flag(-Wall)
+  add_cxx_compiler_flag(-Wextra)
+  add_cxx_compiler_flag(-Wshadow)
+  add_cxx_compiler_flag(-Wfloat-equal)
+  add_cxx_compiler_flag(-Wold-style-cast)
+  add_cxx_compiler_flag(-Wconversion)
+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-Werror)
+  endif()
+  if (NOT BENCHMARK_ENABLE_TESTING)
+    # Only enable this warning when tests are not built, as gtest does not use 'override'.
+    add_cxx_compiler_flag(-Wsuggest-override)
+  endif()
+  add_cxx_compiler_flag(-pedantic)
+  add_cxx_compiler_flag(-pedantic-errors)
+  add_cxx_compiler_flag(-Wshorten-64-to-32)
+  add_cxx_compiler_flag(-fstrict-aliasing)
+  # Disable warnings regarding deprecated parts of the library while building
+  # and testing those parts of the library.
+  add_cxx_compiler_flag(-Wno-deprecated-declarations)
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
+    # Intel silently ignores '-Wno-deprecated-declarations',
+    # warning no. 1786 must be explicitly disabled.
+    # See #631 for rationale.
+    add_cxx_compiler_flag(-wd1786)
+    add_cxx_compiler_flag(-fno-finite-math-only)
+  endif()
+  # Disable deprecation warnings for release builds (when -Werror is enabled).
+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-Wno-deprecated)
+  endif()
+  if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
+    add_cxx_compiler_flag(-fno-exceptions)
  endif()

  if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
-    if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing
+    if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") #ICC17u2: Many false positives for Wstrict-aliasing
      add_cxx_compiler_flag(-Wstrict-aliasing)
    endif()
  endif()
@@ -173,21 +235,26 @@ else()
  add_cxx_compiler_flag(-wd654)
  add_cxx_compiler_flag(-Wthread-safety)
  if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
-    cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
+    cxx_feature_check(THREAD_SAFETY_ATTRIBUTES "-DINCLUDE_DIRECTORIES=${PROJECT_SOURCE_DIR}/include")
  endif()

  # On most UNIX like platforms g++ and clang++ define _GNU_SOURCE as a
  # predefined macro, which turns on all of the wonderful libc extensions.
-  # However g++ doesn't do this in Cygwin so we have to define it ourselfs
+  # However g++ doesn't do this in Cygwin so we have to define it ourselves
  # since we depend on GNU/POSIX/BSD extensions.
  if (CYGWIN)
    add_definitions(-D_GNU_SOURCE=1)
  endif()

+  if (QNXNTO)
+    add_definitions(-D_QNX_SOURCE)
+  endif()
+
  # Link time optimisation
  if (BENCHMARK_ENABLE_LTO)
    add_cxx_compiler_flag(-flto)
-    if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
+    add_cxx_compiler_flag(-Wno-lto-type-mismatch)
+    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
      find_program(GCC_AR gcc-ar)
      if (GCC_AR)
        set(CMAKE_AR ${GCC_AR})
@@ -196,7 +263,7 @@ else()
      if (GCC_RANLIB)
        set(CMAKE_RANLIB ${GCC_RANLIB})
      endif()
-    elseif("${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
+    elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
      include(llvm-toolchain)
    endif()
  endif()
@@ -224,7 +291,8 @@ if (BENCHMARK_USE_LIBCXX)
  if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
    add_cxx_compiler_flag(-stdlib=libc++)
  elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR
-          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
+          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel" OR
+          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM")
    add_cxx_compiler_flag(-nostdinc++)
    message(WARNING "libc++ header path must be manually specified using CMAKE_CXX_FLAGS")
    # Adding -nodefaultlibs directly to CMAKE_<TYPE>_LINKER_FLAGS will break
@@ -250,9 +318,16 @@ if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX
        AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
  message(WARNING "Using std::regex with exceptions disabled is not fully supported")
 endif()
+
 cxx_feature_check(STEADY_CLOCK)
 # Ensure we have pthreads
+set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
+cxx_feature_check(PTHREAD_AFFINITY)
+
+if (BENCHMARK_ENABLE_LIBPFM)
+  find_package(PFM REQUIRED)
+endif()

 # Set up directories
 include_directories(${PROJECT_SOURCE_DIR}/include)
@@ -262,8 +337,18 @@ add_subdirectory(src)

 if (BENCHMARK_ENABLE_TESTING)
  enable_testing()
-  if (BENCHMARK_ENABLE_GTEST_TESTS)
-    include(HandleGTest)
+  if (BENCHMARK_ENABLE_GTEST_TESTS AND
+      NOT (TARGET gtest AND TARGET gtest_main AND
+           TARGET gmock AND TARGET gmock_main))
+    if (BENCHMARK_USE_BUNDLED_GTEST)
+      include(GoogleTest)
+    else()
+      find_package(GTest CONFIG REQUIRED)
+      add_library(gtest ALIAS GTest::gtest)
+      add_library(gtest_main ALIAS GTest::gtest_main)
+      add_library(gmock ALIAS GTest::gmock)
+      add_library(gmock_main ALIAS GTest::gmock_main)
+    endif()
  endif()
  add_subdirectory(test)
 endif()
--- a/third_party/benchmark/CONTRIBUTORS
+++ b/third_party/benchmark/CONTRIBUTORS
@@ -22,44 +22,75 @@
 #
 # Please keep the list sorted.

+Abhina Sreeskantharajan <abhina.sreeskantharajan@ibm.com>
 Albert Pretorius <pretoalb@gmail.com>
+Alex Steele <steelal123@gmail.com>
+Andriy Berestovskyy <berestovskyy@gmail.com>
 Arne Beer <arne@twobeer.de>
+Bátor Tallér <bator.taller@shapr3d.com>
 Billy Robert O'Neal III <billy.oneal@gmail.com> <bion@microsoft.com>
+Cezary Skrzyński <czars1988@gmail.com>
 Chris Kennelly <ckennelly@google.com> <ckennelly@ckennelly.com>
+Christian Wassermann <christian_wassermann@web.de>
 Christopher Seymour <chris.j.seymour@hotmail.com>
+Colin Braley <braley.colin@gmail.com>
+Cyrille Faucheux <cyrille.faucheux@gmail.com>
+Daniel Harvey <danielharvey458@gmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
 Deniz Evrenci <denizevrenci@gmail.com>
 Dominic Hamon <dma@stripysock.com> <dominic@google.com>
 Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Dominik Korman <kormandominik@gmail.com>
+Donald Aingworth <donalds_junk_mail@yahoo.com>
+Doug Evans <xdje42@gmail.com>
+Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
+Fabien Pichot <pichot.fabien@gmail.com>
+Fanbo Meng <fanbo.meng@ibm.com>
 Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Geoffrey Martin-Noble <gcmn@google.com> <gmngeoffrey@gmail.com>
+Gergely Meszaros <maetveis@gmail.com>
+Gergő Szitár <szitar.gergo@gmail.com>
+Hannes Hauswedell <h2@fsfe.org>
+Henrique Bucher <hbucher@gmail.com>
+Iakov Sergeev <yahontu@gmail.com>
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
 Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
 John Millikin <jmillikin@stripe.com>
+Jordan Williams <jwillikers@protonmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
+Kaito Udagawa <umireon@gmail.com>
 Kai Wolf <kai.wolf@gmail.com>
 Kishan Kumar <kumar.kishan@outlook.com>
-Kaito Udagawa <umireon@gmail.com>
 Lei Xu <eddyxu@gmail.com>
+Marcel Jacobse <mjacobse@uni-bremen.de>
 Matt Clarkson <mattyclarkson@gmail.com>
 Maxim Vafin <maxvafin@gmail.com>
+Mike Apodaca <gatorfax@gmail.com>
+Min-Yih Hsu <yihshyng223@gmail.com>
 Nick Hutchinson <nshutchinson@gmail.com>
+Norman Heino <norman.heino@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Ori Livneh <ori.livneh@gmail.com>
 Pascal Leroy <phl@google.com>
 Paul Redmond <paul.redmond@gmail.com>
 Pierre Phaneuf <pphaneuf@google.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
+Raghu Raja <raghu@enfabrica.net>
+Rainer Orth <ro@cebitec.uni-bielefeld.de>
 Raul Marin <rmrodriguez@cartodb.com>
 Ray Glover <ray.glover@uk.ibm.com>
 Robert Guo <robert.guo@mongodb.com>
 Roman Lebedev <lebedev.ri@gmail.com>
+Sayan Bhattacharjee <aero.sayan@gmail.com>
 Shuo Chen <chenshuo@chenshuo.com>
+Steven Wan <wan.yu@ibm.com>
+Tobias Schmidt <tobias.schmidt@in.tum.de>
 Tobias Ulvgård <tobias.ulvgard@dirac.se>
 Tom Madams <tom.ej.madams@gmail.com> <tmadams@google.com>
 Yixuan Qiu <yixuanq@gmail.com>
--- a/third_party/benchmark/MODULE.bazel
+++ b/third_party/benchmark/MODULE.bazel
@@ -0,0 +1,41 @@
+module(
+    name = "google_benchmark",
+    version = "1.9.4",
+)
+
+bazel_dep(name = "bazel_skylib", version = "1.7.1")
+bazel_dep(name = "platforms", version = "0.0.10")
+bazel_dep(name = "rules_cc", version = "0.0.9")
+
+bazel_dep(name = "rules_python", version = "1.0.0", dev_dependency = True)
+bazel_dep(name = "googletest", version = "1.14.0", dev_dependency = True, repo_name = "com_google_googletest")
+
+bazel_dep(name = "libpfm", version = "4.11.0.bcr.1")
+
+# Register hermetic Python toolchains (3.8 through 3.13, with 3.12 as the
+# default) instead of relying on the changing default version from
+# rules_python. The pip hub below stays pinned to Python 3.9, which was
+# originally chosen to be able to build numpy.
+
+python = use_extension("@rules_python//python/extensions:python.bzl", "python", dev_dependency = True)
+python.toolchain(python_version = "3.8")
+python.toolchain(python_version = "3.9")
+python.toolchain(python_version = "3.10")
+python.toolchain(python_version = "3.11")
+python.toolchain(
+    is_default = True,
+    python_version = "3.12",
+)
+python.toolchain(python_version = "3.13")
+
+pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip", dev_dependency = True)
+pip.parse(
+    hub_name = "tools_pip_deps",
+    python_version = "3.9",
+    requirements_lock = "//tools:requirements.txt",
+)
+use_repo(pip, "tools_pip_deps")
+
+# -- bazel_dep definitions -- #
+
+bazel_dep(name = "nanobind_bazel", version = "2.7.0", dev_dependency = True)
--- a/third_party/benchmark/README.md
+++ b/third_party/benchmark/README.md
--- a/third_party/benchmark/WORKSPACE
+++ b/third_party/benchmark/WORKSPACE
@@ -1,7 +1,20 @@
 workspace(name = "com_github_google_benchmark")

-http_archive(
-     name = "com_google_googletest",
-     urls = ["https://github.com/google/googletest/archive/3f0cf6b62ad1eb50d8736538363d3580dd640c3e.zip"],
-     strip_prefix = "googletest-3f0cf6b62ad1eb50d8736538363d3580dd640c3e",
+load("//:bazel/benchmark_deps.bzl", "benchmark_deps")
+
+benchmark_deps()
+
+load("@rules_python//python:repositories.bzl", "py_repositories")
+
+py_repositories()
+
+load("@rules_python//python:pip.bzl", "pip_parse")
+
+pip_parse(
+    name = "tools_pip_deps",
+    requirements_lock = "//tools:requirements.txt",
 )
+
+load("@tools_pip_deps//:requirements.bzl", "install_deps")
+
+install_deps()
--- a/third_party/benchmark/WORKSPACE.bzlmod
+++ b/third_party/benchmark/WORKSPACE.bzlmod
@@ -0,0 +1,2 @@
+# This file marks the root of the Bazel workspace.
+# See MODULE.bazel for dependencies and setup.
--- a/third_party/benchmark/_config.yml
+++ b/third_party/benchmark/_config.yml
@@ -0,0 +1,2 @@
+theme: jekyll-theme-midnight
+markdown: GFM
--- a/third_party/benchmark/appveyor.yml
+++ b/third_party/benchmark/appveyor.yml
@@ -41,7 +41,7 @@ build_script:
  - cmake --build . --config %configuration%

 test_script:
-  - ctest -c %configuration% --timeout 300 --output-on-failure
+  - ctest --build-config %configuration% --timeout 300 --output-on-failure

 artifacts:
  - path: '_build/CMakeFiles/*.log'
--- a/third_party/benchmark/bazel/benchmark_deps.bzl
+++ b/third_party/benchmark/bazel/benchmark_deps.bzl
@@ -0,0 +1,54 @@
+"""
+This file contains the Bazel build dependencies for Google Benchmark (both C++ source and Python bindings).
+"""
+
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "new_git_repository")
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+def benchmark_deps():
+    """Loads dependencies required to build Google Benchmark."""
+
+    if "bazel_skylib" not in native.existing_rules():
+        http_archive(
+            name = "bazel_skylib",
+            sha256 = "cd55a062e763b9349921f0f5db8c3933288dc8ba4f76dd9416aac68acee3cb94",
+            urls = [
+                "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
+                "https://github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
+            ],
+        )
+
+    if "rules_python" not in native.existing_rules():
+        http_archive(
+            name = "rules_python",
+            sha256 = "e85ae30de33625a63eca7fc40a94fea845e641888e52f32b6beea91e8b1b2793",
+            strip_prefix = "rules_python-0.27.1",
+            url = "https://github.com/bazelbuild/rules_python/releases/download/0.27.1/rules_python-0.27.1.tar.gz",
+        )
+
+    if "com_google_googletest" not in native.existing_rules():
+        new_git_repository(
+            name = "com_google_googletest",
+            remote = "https://github.com/google/googletest.git",
+            tag = "release-1.12.1",
+        )
+
+    if "nanobind" not in native.existing_rules():
+        new_git_repository(
+            name = "nanobind",
+            remote = "https://github.com/wjakob/nanobind.git",
+            tag = "v1.9.2",
+            build_file = "@//bindings/python:nanobind.BUILD",
+            recursive_init_submodules = True,
+        )
+
+    if "libpfm" not in native.existing_rules():
+        # Downloaded from v4.11.0 tag at https://sourceforge.net/p/perfmon2/libpfm4/ref/master/tags/
+        http_archive(
+            name = "libpfm",
+            build_file = str(Label("//tools:libpfm.BUILD.bazel")),
+            sha256 = "5da5f8872bde14b3634c9688d980f68bda28b510268723cc12973eedbab9fecc",
+            type = "tar.gz",
+            strip_prefix = "libpfm-4.11.0",
+            urls = ["https://sourceforge.net/projects/perfmon2/files/libpfm4/libpfm-4.11.0.tar.gz/download"],
+        )
--- a/third_party/benchmark/bindings/python/google_benchmark/BUILD
+++ b/third_party/benchmark/bindings/python/google_benchmark/BUILD
@@ -0,0 +1,34 @@
+load("@nanobind_bazel//:build_defs.bzl", "nanobind_extension", "nanobind_stubgen")
+load("@rules_python//python:defs.bzl", "py_library", "py_test")
+
+py_library(
+    name = "google_benchmark",
+    srcs = ["__init__.py"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":_benchmark",
+    ],
+)
+
+nanobind_extension(
+    name = "_benchmark",
+    srcs = ["benchmark.cc"],
+    deps = ["//:benchmark"],
+)
+
+nanobind_stubgen(
+    name = "benchmark_stubgen",
+    marker_file = "bindings/python/google_benchmark/py.typed",
+    module = ":_benchmark",
+)
+
+py_test(
+    name = "example",
+    srcs = ["example.py"],
+    python_version = "PY3",
+    srcs_version = "PY3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":google_benchmark",
+    ],
+)
--- a/third_party/benchmark/bindings/python/google_benchmark/__init__.py
+++ b/third_party/benchmark/bindings/python/google_benchmark/__init__.py
@@ -0,0 +1,145 @@
+# Copyright 2020 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Python benchmarking utilities.
+
+Example usage:
+  import google_benchmark as benchmark
+
+  @benchmark.register
+  def my_benchmark(state):
+      ...  # Code executed outside `while` loop is not timed.
+
+      while state:
+        ...  # Code executed within `while` loop is timed.
+
+  if __name__ == '__main__':
+    benchmark.main()
+"""
+
+import atexit
+
+from absl import app
+
+from google_benchmark import _benchmark
+from google_benchmark._benchmark import (
+    Counter as Counter,
+    State as State,
+    kMicrosecond as kMicrosecond,
+    kMillisecond as kMillisecond,
+    kNanosecond as kNanosecond,
+    kSecond as kSecond,
+    o1 as o1,
+    oAuto as oAuto,
+    oLambda as oLambda,
+    oLogN as oLogN,
+    oN as oN,
+    oNCubed as oNCubed,
+    oNLogN as oNLogN,
+    oNone as oNone,
+    oNSquared as oNSquared,
+)
+
+__version__ = "1.9.4"
+
+
+class __OptionMaker:
+    """A stateless class to collect benchmark options.
+
+    Collect all decorator calls like @option.range(start=0, limit=1<<5).
+    """
+
+    class Options:
+        """Pure data class to store options calls, along with the benchmarked
+        function."""
+
+        def __init__(self, func):
+            self.func = func
+            self.builder_calls = []
+
+    @classmethod
+    def make(cls, func_or_options):
+        """Make Options from Options or the benchmarked function."""
+        if isinstance(func_or_options, cls.Options):
+            return func_or_options
+        return cls.Options(func_or_options)
+
+    def __getattr__(self, builder_name):
+        """Append option call in the Options."""
+
+        # The function that gets returned on @option.range(start=0, limit=1<<5).
+        def __builder_method(*args, **kwargs):
+            # The decorator that gets called, either with the benchmarked function
+            # or the previous Options
+            def __decorator(func_or_options):
+                options = self.make(func_or_options)
+                options.builder_calls.append((builder_name, args, kwargs))
+                # The decorator returns Options so it is not technically a
+                # decorator and needs a final call to @register
+                return options
+
+            return __decorator
+
+        return __builder_method
+
+
+# Alias for nicer API.
+# We have to instantiate an object, even if stateless, to be able to use
+# __getattr__ on option.range
+option = __OptionMaker()
+
+
+def register(undefined=None, *, name=None):
+    """Register function for benchmarking (usable as @register or @register(name=...))."""
+    if undefined is None:
+        # Decorator is called with parentheses so we return a decorator
+        return lambda f: register(f, name=name)
+
+    # We have either the function to benchmark (simple case) or an instance of
+    # Options (@option._ case).
+    options = __OptionMaker.make(undefined)
+
+    if name is None:
+        name = options.func.__name__
+
+    # Register the benchmark, then replay the recorded @option._ calls on its
+    # builder; reversing restores top-to-bottom (source) decorator order.
+    benchmark = _benchmark.RegisterBenchmark(name, options.func)
+    for builder_name, args, kwargs in options.builder_calls[::-1]:
+        getattr(benchmark, builder_name)(*args, **kwargs)
+
+    # return the benchmarked function because the decorator does not modify it
+    return options.func
+
+
+def _flags_parser(argv):
+    argv = _benchmark.Initialize(argv)
+    return app.parse_flags_with_usage(argv)
+
+
+def _run_benchmarks(argv):
+    if len(argv) > 1:
+        raise app.UsageError("Too many command-line arguments.")
+    return _benchmark.RunSpecifiedBenchmarks()
+
+
+def main(argv=None):
+    return app.run(_run_benchmarks, argv=argv, flags_parser=_flags_parser)
+
+
+# FIXME: can we rerun with disabled ASLR?
+
+# Methods for use with custom main function.
+initialize = _benchmark.Initialize
+run_benchmarks = _benchmark.RunSpecifiedBenchmarks
+atexit.register(_benchmark.ClearRegisteredBenchmarks)
--- a/third_party/benchmark/bindings/python/google_benchmark/benchmark.cc
+++ b/third_party/benchmark/bindings/python/google_benchmark/benchmark.cc
@@ -0,0 +1,184 @@
+// Benchmark for Python.
+
+#include "benchmark/benchmark.h"
+
+#include "nanobind/nanobind.h"
+#include "nanobind/operators.h"
+#include "nanobind/stl/bind_map.h"
+#include "nanobind/stl/string.h"
+#include "nanobind/stl/vector.h"
+
+NB_MAKE_OPAQUE(benchmark::UserCounters);
+
+namespace {
+namespace nb = nanobind;
+
+std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
+  // The `argv` pointers here become invalid when this function returns, but
+  // benchmark holds the pointer to `argv[0]`. We create a static copy of it
+  // so it persists, and replace the pointer below.
+  static std::string executable_name(argv[0]);
+  std::vector<char*> ptrs;
+  ptrs.reserve(argv.size());
+  for (auto& arg : argv) {
+    ptrs.push_back(const_cast<char*>(arg.c_str()));
+  }
+  ptrs[0] = const_cast<char*>(executable_name.c_str());
+  int argc = static_cast<int>(argv.size());
+  benchmark::Initialize(&argc, ptrs.data());
+  std::vector<std::string> remaining_argv;
+  remaining_argv.reserve(argc);
+  for (int i = 0; i < argc; ++i) {
+    remaining_argv.emplace_back(ptrs[i]);
+  }
+  return remaining_argv;
+}
+
+benchmark::internal::Benchmark* RegisterBenchmark(const std::string& name,
+                                                  nb::callable f) {
+  return benchmark::RegisterBenchmark(
+      name, [f](benchmark::State& state) { f(&state); });
+}
+
+NB_MODULE(_benchmark, m) {
+
+  using benchmark::TimeUnit;
+  nb::enum_<TimeUnit>(m, "TimeUnit")
+      .value("kNanosecond", TimeUnit::kNanosecond)
+      .value("kMicrosecond", TimeUnit::kMicrosecond)
+      .value("kMillisecond", TimeUnit::kMillisecond)
+      .value("kSecond", TimeUnit::kSecond)
+      .export_values();
+
+  using benchmark::BigO;
+  nb::enum_<BigO>(m, "BigO")
+      .value("oNone", BigO::oNone)
+      .value("o1", BigO::o1)
+      .value("oN", BigO::oN)
+      .value("oNSquared", BigO::oNSquared)
+      .value("oNCubed", BigO::oNCubed)
+      .value("oLogN", BigO::oLogN)
+      .value("oNLogN", BigO::oNLogN)
+      .value("oAuto", BigO::oAuto)
+      .value("oLambda", BigO::oLambda)
+      .export_values();
+
+  using benchmark::internal::Benchmark;
+  nb::class_<Benchmark>(m, "Benchmark")
+      // For methods returning a pointer to the current object, reference
+      // return policy is used to ask nanobind not to take ownership of the
+      // returned object and avoid calling delete on it.
+      // https://pybind11.readthedocs.io/en/stable/advanced/functions.html#return-value-policies
+      //
+      // For methods taking a const std::vector<...>&, a copy is created
+      // because it is bound to a Python list.
+      // https://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html
+      .def("unit", &Benchmark::Unit, nb::rv_policy::reference)
+      .def("arg", &Benchmark::Arg, nb::rv_policy::reference)
+      .def("args", &Benchmark::Args, nb::rv_policy::reference)
+      .def("range", &Benchmark::Range, nb::rv_policy::reference,
+           nb::arg("start"), nb::arg("limit"))
+      .def("dense_range", &Benchmark::DenseRange,
+           nb::rv_policy::reference, nb::arg("start"),
+           nb::arg("limit"), nb::arg("step") = 1)
+      .def("ranges", &Benchmark::Ranges, nb::rv_policy::reference)
+      .def("args_product", &Benchmark::ArgsProduct,
+           nb::rv_policy::reference)
+      .def("arg_name", &Benchmark::ArgName, nb::rv_policy::reference)
+      .def("arg_names", &Benchmark::ArgNames,
+           nb::rv_policy::reference)
+      .def("range_pair", &Benchmark::RangePair,
+           nb::rv_policy::reference, nb::arg("lo1"), nb::arg("hi1"),
+           nb::arg("lo2"), nb::arg("hi2"))
+      .def("range_multiplier", &Benchmark::RangeMultiplier,
+           nb::rv_policy::reference)
+      .def("min_time", &Benchmark::MinTime, nb::rv_policy::reference)
+      .def("min_warmup_time", &Benchmark::MinWarmUpTime,
+           nb::rv_policy::reference)
+      .def("iterations", &Benchmark::Iterations,
+           nb::rv_policy::reference)
+      .def("repetitions", &Benchmark::Repetitions,
+           nb::rv_policy::reference)
+      .def("report_aggregates_only", &Benchmark::ReportAggregatesOnly,
+           nb::rv_policy::reference, nb::arg("value") = true)
+      .def("display_aggregates_only", &Benchmark::DisplayAggregatesOnly,
+           nb::rv_policy::reference, nb::arg("value") = true)
+      .def("measure_process_cpu_time", &Benchmark::MeasureProcessCPUTime,
+           nb::rv_policy::reference)
+      .def("use_real_time", &Benchmark::UseRealTime,
+           nb::rv_policy::reference)
+      .def("use_manual_time", &Benchmark::UseManualTime,
+           nb::rv_policy::reference)
+      .def(
+          "complexity",
+          (Benchmark * (Benchmark::*)(benchmark::BigO)) & Benchmark::Complexity,
+          nb::rv_policy::reference,
+          nb::arg("complexity") = benchmark::oAuto);
+
+  using benchmark::Counter;
+  nb::class_<Counter> py_counter(m, "Counter");
+
+  nb::enum_<Counter::Flags>(py_counter, "Flags", nb::is_arithmetic(), nb::is_flag())
+      .value("kDefaults", Counter::Flags::kDefaults)
+      .value("kIsRate", Counter::Flags::kIsRate)
+      .value("kAvgThreads", Counter::Flags::kAvgThreads)
+      .value("kAvgThreadsRate", Counter::Flags::kAvgThreadsRate)
+      .value("kIsIterationInvariant", Counter::Flags::kIsIterationInvariant)
+      .value("kIsIterationInvariantRate",
+             Counter::Flags::kIsIterationInvariantRate)
+      .value("kAvgIterations", Counter::Flags::kAvgIterations)
+      .value("kAvgIterationsRate", Counter::Flags::kAvgIterationsRate)
+      .value("kInvert", Counter::Flags::kInvert)
+      .export_values();
+
+  nb::enum_<Counter::OneK>(py_counter, "OneK")
+      .value("kIs1000", Counter::OneK::kIs1000)
+      .value("kIs1024", Counter::OneK::kIs1024)
+      .export_values();
+
+  py_counter
+      .def(nb::init<double, Counter::Flags, Counter::OneK>(),
+           nb::arg("value") = 0., nb::arg("flags") = Counter::kDefaults,
+           nb::arg("k") = Counter::kIs1000)
+      .def("__init__",
+           ([](Counter* c, double value) { new (c) Counter(value); }))
+      .def_rw("value", &Counter::value)
+      .def_rw("flags", &Counter::flags)
+      .def_rw("oneK", &Counter::oneK)
+      .def(nb::init_implicit<double>());
+
+  nb::implicitly_convertible<nb::int_, Counter>();
+
+  nb::bind_map<benchmark::UserCounters>(m, "UserCounters");
+
+  using benchmark::State;
+  nb::class_<State>(m, "State")
+      .def("__bool__", &State::KeepRunning)
+      .def_prop_ro("keep_running", &State::KeepRunning)
+      .def("pause_timing", &State::PauseTiming)
+      .def("resume_timing", &State::ResumeTiming)
+      .def("skip_with_error", &State::SkipWithError)
+      .def_prop_ro("error_occurred", &State::error_occurred)
+      .def("set_iteration_time", &State::SetIterationTime)
+      .def_prop_rw("bytes_processed", &State::bytes_processed,
+                    &State::SetBytesProcessed)
+      .def_prop_rw("complexity_n", &State::complexity_length_n,
+                    &State::SetComplexityN)
+      .def_prop_rw("items_processed", &State::items_processed,
+                   &State::SetItemsProcessed)
+      .def("set_label", &State::SetLabel)
+      .def("range", &State::range, nb::arg("pos") = 0)
+      .def_prop_ro("iterations", &State::iterations)
+      .def_prop_ro("name", &State::name)
+      .def_rw("counters", &State::counters)
+      .def_prop_ro("thread_index", &State::thread_index)
+      .def_prop_ro("threads", &State::threads);
+
+  m.def("Initialize", Initialize);
+  m.def("RegisterBenchmark", RegisterBenchmark,
+        nb::rv_policy::reference);
+  m.def("RunSpecifiedBenchmarks",
+        []() { benchmark::RunSpecifiedBenchmarks(); });
+  m.def("ClearRegisteredBenchmarks", benchmark::ClearRegisteredBenchmarks);
+};
+}  // namespace
--- a/third_party/benchmark/bindings/python/google_benchmark/example.py
+++ b/third_party/benchmark/bindings/python/google_benchmark/example.py
@@ -0,0 +1,140 @@
+# Copyright 2020 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Example of Python using C++ benchmark framework.
+
+To run this example, you must first install the `google_benchmark` Python
+package.
+
+To install using `setup.py`, download and extract the `google_benchmark` source.
+In the extracted directory, execute:
+  python setup.py install
+"""
+
+import random
+import time
+
+import google_benchmark as benchmark
+from google_benchmark import Counter
+
+
+@benchmark.register
+def empty(state):
+    while state:
+        pass
+
+
+@benchmark.register
+def sum_million(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register
+def pause_timing(state):
+    """Pause timing every iteration."""
+    while state:
+        # Construct a list of random ints every iteration without timing it
+        state.pause_timing()
+        random_list = [random.randint(0, 100) for _ in range(100)]
+        state.resume_timing()
+        # Time the in place sorting algorithm
+        random_list.sort()
+
+
+@benchmark.register
+def skipped(state):
+    if True:  # Test some predicate here.
+        state.skip_with_error("some error")
+        return  # NOTE: You must explicitly return, or benchmark will continue.
+
+    # Benchmark code would be here.
+
+
+@benchmark.register
+@benchmark.option.use_manual_time()
+def manual_timing(state):
+    while state:
+        # Manually measure elapsed (wall-clock) time
+        start = time.perf_counter()  # perf_counter_ns() in Python 3.7+
+        # Something to benchmark
+        time.sleep(0.01)
+        end = time.perf_counter()
+        state.set_iteration_time(end - start)
+
+
+@benchmark.register
+def custom_counters(state):
+    """Collect custom metric using benchmark.Counter."""
+    num_foo = 0.0
+    while state:
+        # Benchmark some code here
+        # Collect some custom metric named foo
+        num_foo += 0.13
+
+    # Automatic Counter from numbers.
+    state.counters["foo"] = num_foo
+    # Set a counter as a rate.
+    state.counters["foo_rate"] = Counter(num_foo, Counter.kIsRate)
+    # Set a counter as an inverse of rate.
+    state.counters["foo_inv_rate"] = Counter(
+        num_foo, Counter.kIsRate | Counter.kInvert
+    )
+    # Set a counter as a thread-average quantity.
+    state.counters["foo_avg"] = Counter(num_foo, Counter.kAvgThreads)
+    # There's also a combined flag:
+    state.counters["foo_avg_rate"] = Counter(num_foo, Counter.kAvgThreadsRate)
+
+
+@benchmark.register
+@benchmark.option.measure_process_cpu_time()
+@benchmark.option.use_real_time()
+def with_options(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register(name="sum_million_microseconds")
+@benchmark.option.unit(benchmark.kMicrosecond)
+def with_options2(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register
+@benchmark.option.arg(100)
+@benchmark.option.arg(1000)
+def passing_argument(state):
+    while state:
+        sum(range(state.range(0)))
+
+
+@benchmark.register
+@benchmark.option.range(8, limit=8 << 10)
+def using_range(state):
+    while state:
+        sum(range(state.range(0)))
+
+
+@benchmark.register
+@benchmark.option.range_multiplier(2)
+@benchmark.option.range(1 << 10, 1 << 18)
+@benchmark.option.complexity(benchmark.oN)
+def computing_complexity(state):
+    while state:
+        sum(range(state.range(0)))
+    state.complexity_n = state.range(0)
+
+
+if __name__ == "__main__":
+    benchmark.main()
--- a/third_party/benchmark/cmake/AddCXXCompilerFlag.cmake
+++ b/third_party/benchmark/cmake/AddCXXCompilerFlag.cmake
@@ -34,9 +34,11 @@ function(add_cxx_compiler_flag FLAG)
  check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
  if(${MANGLED_FLAG})
-    set(VARIANT ${ARGV1})
-    if(ARGV1)
+    if(ARGC GREATER 1)
+      set(VARIANT ${ARGV1})
      string(TOUPPER "_${VARIANT}" VARIANT)
+    else()
+      set(VARIANT "")
    endif()
    set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${BENCHMARK_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
  endif()
@@ -49,9 +51,11 @@ function(add_required_cxx_compiler_flag FLAG)
  check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
  if(${MANGLED_FLAG})
-    set(VARIANT ${ARGV1})
-    if(ARGV1)
+    if(ARGC GREATER 1)
+      set(VARIANT ${ARGV1})
      string(TOUPPER "_${VARIANT}" VARIANT)
+    else()
+      set(VARIANT "")
    endif()
    set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
--- a/third_party/benchmark/cmake/CXXFeatureCheck.cmake
+++ b/third_party/benchmark/cmake/CXXFeatureCheck.cmake
@@ -17,6 +17,8 @@ if(__cxx_feature_check)
 endif()
 set(__cxx_feature_check INCLUDED)

+option(CXXFEATURECHECK_DEBUG "Enable debugging of cxx_feature_check() probes" OFF)
+
 function(cxx_feature_check FILE)
  string(TOLOWER ${FILE} FILE)
  string(TOUPPER ${FILE} VAR)
@@ -27,26 +29,38 @@ function(cxx_feature_check FILE)
    return()
  endif()

+  set(FEATURE_CHECK_CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
+  if (ARGC GREATER 1)
+    message(STATUS "Enabling additional flags: ${ARGV1}")
+    list(APPEND FEATURE_CHECK_CMAKE_FLAGS ${ARGV1})
+  endif()
+
  if (NOT DEFINED COMPILE_${FEATURE})
-    message(STATUS "Performing Test ${FEATURE}")
    if(CMAKE_CROSSCOMPILING)
+      message(STATUS "Cross-compiling to test ${FEATURE}")
      try_compile(COMPILE_${FEATURE}
              ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-              CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
-              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+              CXX_STANDARD 17
+              CXX_STANDARD_REQUIRED ON
+              CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
+              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
+              OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
      if(COMPILE_${FEATURE})
        message(WARNING
              "If you see build failures due to cross compilation, try setting HAVE_${VAR} to 0")
-        set(RUN_${FEATURE} 0)
+        set(RUN_${FEATURE} 0 CACHE INTERNAL "")
      else()
-        set(RUN_${FEATURE} 1)
+        set(RUN_${FEATURE} 1 CACHE INTERNAL "")
      endif()
    else()
-      message(STATUS "Performing Test ${FEATURE}")
+      message(STATUS "Compiling and running to test ${FEATURE}")
      try_run(RUN_${FEATURE} COMPILE_${FEATURE}
              ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-              CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
-              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+              CXX_STANDARD 17
+              CXX_STANDARD_REQUIRED ON
+              CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
+              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
+              COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
    endif()
  endif()

@@ -56,7 +70,11 @@ function(cxx_feature_check FILE)
    add_definitions(-DHAVE_${VAR})
  else()
    if(NOT COMPILE_${FEATURE})
-      message(STATUS "Performing Test ${FEATURE} -- failed to compile")
+      if(CXXFEATURECHECK_DEBUG)
+        message(STATUS "Performing Test ${FEATURE} -- failed to compile: ${COMPILE_OUTPUT_VAR}")
+      else()
+        message(STATUS "Performing Test ${FEATURE} -- failed to compile")
+      endif()
    else()
      message(STATUS "Performing Test ${FEATURE} -- compiled but failed to run")
    endif()
--- a/third_party/benchmark/cmake/Config.cmake.in
+++ b/third_party/benchmark/cmake/Config.cmake.in
@@ -1 +1,12 @@
+@PACKAGE_INIT@
+
+include (CMakeFindDependencyMacro)
+
+find_dependency (Threads)
+
+if (@BENCHMARK_ENABLE_LIBPFM@)
+    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
+    find_dependency (PFM)
+endif()
+
 include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
--- a/third_party/benchmark/cmake/GetGitVersion.cmake
+++ b/third_party/benchmark/cmake/GetGitVersion.cmake
@@ -20,35 +20,17 @@ set(__get_git_version INCLUDED)

 function(get_git_version var)
  if(GIT_EXECUTABLE)
-      execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
+      execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8 --dirty
          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
          RESULT_VARIABLE status
          OUTPUT_VARIABLE GIT_VERSION
          ERROR_QUIET)
-      if(${status})
+      if(status)
          set(GIT_VERSION "v0.0.0")
-      else()
-          string(STRIP ${GIT_VERSION} GIT_VERSION)
-          string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION ${GIT_VERSION})
-      endif()
-
-      # Work out if the repository is dirty
-      execute_process(COMMAND ${GIT_EXECUTABLE} update-index -q --refresh
-          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-          OUTPUT_QUIET
-          ERROR_QUIET)
-      execute_process(COMMAND ${GIT_EXECUTABLE} diff-index --name-only HEAD --
-          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-          OUTPUT_VARIABLE GIT_DIFF_INDEX
-          ERROR_QUIET)
-      string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
-      if (${GIT_DIRTY})
-          set(GIT_VERSION "${GIT_VERSION}-dirty")
      endif()
  else()
      set(GIT_VERSION "v0.0.0")
  endif()

-  message(STATUS "git Version: ${GIT_VERSION}")
  set(${var} ${GIT_VERSION} PARENT_SCOPE)
 endfunction()
--- a/third_party/benchmark/cmake/GoogleTest.cmake
+++ b/third_party/benchmark/cmake/GoogleTest.cmake
@@ -0,0 +1,58 @@
+# Download and unpack googletest at configure time
+set(GOOGLETEST_PREFIX "${benchmark_BINARY_DIR}/third_party/googletest")
+configure_file(${benchmark_SOURCE_DIR}/cmake/GoogleTest.cmake.in ${GOOGLETEST_PREFIX}/CMakeLists.txt @ONLY)
+
+set(GOOGLETEST_PATH "${CMAKE_CURRENT_SOURCE_DIR}/googletest" CACHE PATH "") # Mind the quotes
+execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}"
+  -DALLOW_DOWNLOADING_GOOGLETEST=${BENCHMARK_DOWNLOAD_DEPENDENCIES} -DGOOGLETEST_PATH:PATH=${GOOGLETEST_PATH} .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${GOOGLETEST_PREFIX}
+)
+
+if(result)
+  message(FATAL_ERROR "CMake step for googletest failed: ${result}")
+endif()
+
+execute_process(
+  COMMAND ${CMAKE_COMMAND} --build .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${GOOGLETEST_PREFIX}
+)
+
+if(result)
+  message(FATAL_ERROR "Build step for googletest failed: ${result}")
+endif()
+
+# Prevent overriding the parent project's compiler/linker
+# settings on Windows
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+
+include(${GOOGLETEST_PREFIX}/googletest-paths.cmake)
+
+# Add googletest directly to our build. This defines
+# the gtest and gtest_main targets.
+add_subdirectory(${GOOGLETEST_SOURCE_DIR}
+                 ${GOOGLETEST_BINARY_DIR}
+                 EXCLUDE_FROM_ALL)
+
+# googletest doesn't seem to want to stay build warning clean so let's not hurt ourselves.
+if (MSVC)
+  target_compile_options(gtest PRIVATE "/wd4244" "/wd4722")
+  target_compile_options(gtest_main PRIVATE "/wd4244" "/wd4722")
+  target_compile_options(gmock PRIVATE "/wd4244" "/wd4722")
+  target_compile_options(gmock_main PRIVATE "/wd4244" "/wd4722")
+else()
+  target_compile_options(gtest PRIVATE "-w")
+  target_compile_options(gtest_main PRIVATE "-w")
+  target_compile_options(gmock PRIVATE "-w")
+  target_compile_options(gmock_main PRIVATE "-w")
+endif()
+
+if(NOT DEFINED GTEST_COMPILE_COMMANDS)
+    set(GTEST_COMPILE_COMMANDS ON)
+endif()
+
+set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
--- a/third_party/benchmark/cmake/GoogleTest.cmake.in
+++ b/third_party/benchmark/cmake/GoogleTest.cmake.in
@@ -0,0 +1,60 @@
+cmake_minimum_required (VERSION 3.13...3.22)
+
+project(googletest-download NONE)
+
+# Enable ExternalProject CMake module
+include(ExternalProject)
+
+option(ALLOW_DOWNLOADING_GOOGLETEST "If googletest src tree is not found in location specified by GOOGLETEST_PATH, do fetch the archive from internet" OFF)
+set(GOOGLETEST_PATH "/usr/src/googletest" CACHE PATH
+                    "Path to the googletest root tree. Should contain googletest and googlemock subdirs. And CMakeLists.txt in root, and in both of these subdirs")
+
+# Download and install GoogleTest
+
+message(STATUS "Looking for Google Test sources")
+message(STATUS "Looking for Google Test sources in ${GOOGLETEST_PATH}")
+if(EXISTS "${GOOGLETEST_PATH}"            AND IS_DIRECTORY "${GOOGLETEST_PATH}"            AND EXISTS "${GOOGLETEST_PATH}/CMakeLists.txt" AND
+   EXISTS "${GOOGLETEST_PATH}/googletest" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googletest" AND EXISTS "${GOOGLETEST_PATH}/googletest/CMakeLists.txt" AND
+   EXISTS "${GOOGLETEST_PATH}/googlemock" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googlemock" AND EXISTS "${GOOGLETEST_PATH}/googlemock/CMakeLists.txt")
+  message(STATUS "Found Google Test in ${GOOGLETEST_PATH}")
+
+  ExternalProject_Add(
+    googletest
+    PREFIX            "${CMAKE_BINARY_DIR}"
+    DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
+    SOURCE_DIR        "${GOOGLETEST_PATH}" # use existing src dir.
+    BINARY_DIR        "${CMAKE_BINARY_DIR}/build"
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    INSTALL_COMMAND   ""
+    TEST_COMMAND      ""
+  )
+else()
+  if(NOT ALLOW_DOWNLOADING_GOOGLETEST)
+    message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_USE_BUNDLED_GTEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
+    return()
+  else()
+    message(STATUS "Did not find Google Test sources! Fetching from web...")
+    ExternalProject_Add(
+      googletest
+      GIT_REPOSITORY    https://github.com/google/googletest.git
+      GIT_TAG           "v1.15.2"
+      GIT_SHALLOW       "ON"
+      PREFIX            "${CMAKE_BINARY_DIR}"
+      STAMP_DIR         "${CMAKE_BINARY_DIR}/stamp"
+      DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
+      SOURCE_DIR        "${CMAKE_BINARY_DIR}/src"
+      BINARY_DIR        "${CMAKE_BINARY_DIR}/build"
+      CONFIGURE_COMMAND ""
+      BUILD_COMMAND     ""
+      INSTALL_COMMAND   ""
+      TEST_COMMAND      ""
+    )
+  endif()
+endif()
+
+ExternalProject_Get_Property(googletest SOURCE_DIR BINARY_DIR)
+file(WRITE googletest-paths.cmake
+"set(GOOGLETEST_SOURCE_DIR \"${SOURCE_DIR}\")
+set(GOOGLETEST_BINARY_DIR \"${BINARY_DIR}\")
+")
--- a/third_party/benchmark/cmake/HandleGTest.cmake
+++ b/third_party/benchmark/cmake/HandleGTest.cmake
@@ -1,113 +0,0 @@
-
-include(split_list)
-
-macro(build_external_gtest)
-  include(ExternalProject)
-  set(GTEST_FLAGS "")
-  if (BENCHMARK_USE_LIBCXX)
-    if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-      list(APPEND GTEST_FLAGS -stdlib=libc++)
-    else()
-      message(WARNING "Unsupported compiler (${CMAKE_CXX_COMPILER}) when using libc++")
-    endif()
-  endif()
-  if (BENCHMARK_BUILD_32_BITS)
-    list(APPEND GTEST_FLAGS -m32)
-  endif()
-  if (NOT "${CMAKE_CXX_FLAGS}" STREQUAL "")
-    list(APPEND GTEST_FLAGS ${CMAKE_CXX_FLAGS})
-  endif()
-  string(TOUPPER "${CMAKE_BUILD_TYPE}" GTEST_BUILD_TYPE)
-  if ("${GTEST_BUILD_TYPE}" STREQUAL "COVERAGE")
-    set(GTEST_BUILD_TYPE "DEBUG")
-  endif()
-  # FIXME: Since 10/Feb/2017 the googletest trunk has had a bug where
-  # -Werror=unused-function fires during the build on OS X. This is a temporary
-  # workaround to keep our travis bots from failing. It should be removed
-  # once gtest is fixed.
-  if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-    list(APPEND GTEST_FLAGS "-Wno-unused-function")
-  endif()
-  split_list(GTEST_FLAGS)
-  set(EXCLUDE_FROM_ALL_OPT "")
-  set(EXCLUDE_FROM_ALL_VALUE "")
-  if (${CMAKE_VERSION} VERSION_GREATER "3.0.99")
-      set(EXCLUDE_FROM_ALL_OPT "EXCLUDE_FROM_ALL")
-      set(EXCLUDE_FROM_ALL_VALUE "ON")
-  endif()
-  ExternalProject_Add(googletest
-      ${EXCLUDE_FROM_ALL_OPT} ${EXCLUDE_FROM_ALL_VALUE}
-      GIT_REPOSITORY https://github.com/google/googletest.git
-      GIT_TAG master
-      PREFIX "${CMAKE_BINARY_DIR}/googletest"
-      INSTALL_DIR "${CMAKE_BINARY_DIR}/googletest"
-      CMAKE_CACHE_ARGS
-        -DCMAKE_BUILD_TYPE:STRING=${GTEST_BUILD_TYPE}
-        -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-        -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
-        -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
-        -DCMAKE_INSTALL_LIBDIR:PATH=<INSTALL_DIR>/lib
-        -DCMAKE_CXX_FLAGS:STRING=${GTEST_FLAGS}
-        -Dgtest_force_shared_crt:BOOL=ON
-      )
-
-  ExternalProject_Get_Property(googletest install_dir)
-  set(GTEST_INCLUDE_DIRS ${install_dir}/include)
-  file(MAKE_DIRECTORY ${GTEST_INCLUDE_DIRS})
-
-  set(LIB_SUFFIX "${CMAKE_STATIC_LIBRARY_SUFFIX}")
-  set(LIB_PREFIX "${CMAKE_STATIC_LIBRARY_PREFIX}")
-  if("${GTEST_BUILD_TYPE}" STREQUAL "DEBUG")
-    set(LIB_SUFFIX "d${CMAKE_STATIC_LIBRARY_SUFFIX}")
-  endif()
-
-  # Use gmock_main instead of gtest_main because it initializes gtest as well.
-  # Note: The libraries are listed in reverse order of their dependancies.
-  foreach(LIB gtest gmock gmock_main)
-    add_library(${LIB} UNKNOWN IMPORTED)
-    set_target_properties(${LIB} PROPERTIES
-      IMPORTED_LOCATION ${install_dir}/lib/${LIB_PREFIX}${LIB}${LIB_SUFFIX}
-      INTERFACE_INCLUDE_DIRECTORIES ${GTEST_INCLUDE_DIRS}
-      INTERFACE_LINK_LIBRARIES "${GTEST_BOTH_LIBRARIES}"
-    )
-    add_dependencies(${LIB} googletest)
-    list(APPEND GTEST_BOTH_LIBRARIES ${LIB})
-  endforeach()
-endmacro(build_external_gtest)
-
-if (BENCHMARK_ENABLE_GTEST_TESTS)
-  if (IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/googletest)
-    set(GTEST_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/googletest")
-    set(INSTALL_GTEST OFF CACHE INTERNAL "")
-    set(INSTALL_GMOCK OFF CACHE INTERNAL "")
-    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/googletest)
-    set(GTEST_BOTH_LIBRARIES gtest gmock gmock_main)
-    foreach(HEADER test mock)
-      # CMake 2.8 and older don't respect INTERFACE_INCLUDE_DIRECTORIES, so we
-      # have to add the paths ourselves.
-      set(HFILE g${HEADER}/g${HEADER}.h)
-      set(HPATH ${GTEST_ROOT}/google${HEADER}/include)
-      find_path(HEADER_PATH_${HEADER} ${HFILE}
-          NO_DEFAULT_PATHS
-          HINTS ${HPATH}
-      )
-      if (NOT HEADER_PATH_${HEADER})
-        message(FATAL_ERROR "Failed to find header ${HFILE} in ${HPATH}")
-      endif()
-      list(APPEND GTEST_INCLUDE_DIRS ${HEADER_PATH_${HEADER}})
-    endforeach()
-  elseif(BENCHMARK_DOWNLOAD_DEPENDENCIES)
-    build_external_gtest()
-  else()
-    find_package(GTest REQUIRED)
-    find_path(GMOCK_INCLUDE_DIRS gmock/gmock.h
-        HINTS ${GTEST_INCLUDE_DIRS})
-    if (NOT GMOCK_INCLUDE_DIRS)
-      message(FATAL_ERROR "Failed to find header gmock/gmock.h with hint ${GTEST_INCLUDE_DIRS}")
-    endif()
-    set(GTEST_INCLUDE_DIRS ${GTEST_INCLUDE_DIRS} ${GMOCK_INCLUDE_DIRS})
-    # FIXME: We don't currently require the gmock library to build the tests,
-    # and it's likely we won't find it, so we don't try. As long as we've
-    # found the gmock/gmock.h header and gtest_main that should be good enough.
-  endif()
-endif()
--- a/third_party/benchmark/cmake/benchmark.pc.in
+++ b/third_party/benchmark/cmake/benchmark.pc.in
@@ -1,11 +1,12 @@
 prefix=@CMAKE_INSTALL_PREFIX@
 exec_prefix=${prefix}
-libdir=${prefix}/lib
-includedir=${prefix}/include
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@

 Name: @PROJECT_NAME@
 Description: Google microbenchmark framework
-Version: @VERSION@
+Version: @NORMALIZED_VERSION@

 Libs: -L${libdir} -lbenchmark
+Libs.private: -lpthread @BENCHMARK_PRIVATE_LINK_LIBRARIES@
 Cflags: -I${includedir}
--- a/third_party/benchmark/cmake/benchmark_main.pc.in
+++ b/third_party/benchmark/cmake/benchmark_main.pc.in
@@ -0,0 +1,7 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+
+Name: @PROJECT_NAME@
+Description: Google microbenchmark framework (with main() function)
+Version: @NORMALIZED_VERSION@
+Requires: benchmark
+Libs: -L${libdir} -lbenchmark_main
--- a/third_party/benchmark/cmake/pthread_affinity.cpp
+++ b/third_party/benchmark/cmake/pthread_affinity.cpp
@@ -0,0 +1,16 @@
+#include <pthread.h>
+int main() {
+  cpu_set_t set;
+  CPU_ZERO(&set);
+  for (int i = 0; i < CPU_SETSIZE; ++i) {
+    CPU_SET(i, &set);
+    CPU_CLR(i, &set);
+  }
+  pthread_t self = pthread_self();
+  int ret;
+  ret = pthread_getaffinity_np(self, sizeof(set), &set);
+  if (ret != 0) return ret;
+  ret = pthread_setaffinity_np(self, sizeof(set), &set);
+  if (ret != 0) return ret;
+  return 0;
+}
--- a/third_party/benchmark/docs/AssemblyTests.md
+++ b/third_party/benchmark/docs/AssemblyTests.md
@@ -111,6 +111,7 @@ between compilers or compiler versions. A common example of this
 is matching stack frame addresses. In this case regular expressions
 can be used to match the differing bits of output. For example:

+<!-- {% raw %} -->
 ```c++
 int ExternInt;
 struct Point { int x, y, z; };
@@ -127,6 +128,7 @@ extern "C" void test_store_point() {
    // CHECK: ret
 }
 ```
+<!-- {% endraw %} -->

 ## Current Requirements and Limitations

--- a/third_party/benchmark/docs/_config.yml
+++ b/third_party/benchmark/docs/_config.yml
@@ -0,0 +1,3 @@
+theme: jekyll-theme-minimal
+logo: /assets/images/icon_black.png
+show_downloads: true
--- a/third_party/benchmark/docs/assets/images/icon.png
+++ b/third_party/benchmark/docs/assets/images/icon.png
--- a/third_party/benchmark/docs/assets/images/icon.xcf
+++ b/third_party/benchmark/docs/assets/images/icon.xcf
--- a/third_party/benchmark/docs/assets/images/icon_black.png
+++ b/third_party/benchmark/docs/assets/images/icon_black.png
--- a/third_party/benchmark/docs/assets/images/icon_black.xcf
+++ b/third_party/benchmark/docs/assets/images/icon_black.xcf
--- a/third_party/benchmark/docs/dependencies.md
+++ b/third_party/benchmark/docs/dependencies.md
@@ -0,0 +1,19 @@
+# Build tool dependency policy
+
+We follow the [Foundational C++ support policy](https://opensource.google/documentation/policies/cplusplus-support) for our build tools. In
+particular the ["Build Systems" section](https://opensource.google/documentation/policies/cplusplus-support#build-systems).
+
+## CMake
+
+The current supported version is CMake 3.10 as of 2023-08-10. Most modern
+distributions include newer versions, for example:
+
+* Ubuntu 20.04 provides CMake 3.16.3
+* Debian 11.4 provides CMake 3.18.4
+* Ubuntu 22.04 provides CMake 3.22.1
+
+## Python
+
+The Python bindings require Python 3.10+ as of v1.9.0 (2024-08-16) for installation from PyPI.
+Building from source for older versions probably still works, though. See the [user guide](python_bindings.md) for details on how to build from source.
+The minimum theoretically supported version is Python 3.8, since the used bindings generator (nanobind) only supports Python 3.8+.
--- a/third_party/benchmark/docs/index.md
+++ b/third_party/benchmark/docs/index.md
@@ -0,0 +1,12 @@
+# Benchmark
+
+* [Assembly Tests](AssemblyTests.md)
+* [Dependencies](dependencies.md)
+* [Perf Counters](perf_counters.md)
+* [Platform Specific Build Instructions](platform_specific_build_instructions.md)
+* [Python Bindings](python_bindings.md)
+* [Random Interleaving](random_interleaving.md)
+* [Reducing Variance](reducing_variance.md)
+* [Releasing](releasing.md)
+* [Tools](tools.md)
+* [User Guide](user_guide.md)
--- a/third_party/benchmark/docs/perf_counters.md
+++ b/third_party/benchmark/docs/perf_counters.md
@@ -0,0 +1,35 @@
+<a name="perf-counters" />
+
+# User-Requested Performance Counters
+
+When running benchmarks, the user may choose to request collection of
+performance counters. This may be useful in investigation scenarios - narrowing
+down the cause of a regression; or verifying that the underlying cause of a
+performance improvement matches expectations.
+
+This feature is available if:
+
+* The benchmark is run on an architecture featuring a Performance Monitoring
+  Unit (PMU),
+* The benchmark is compiled with support for collecting counters. Currently,
+  this requires [libpfm](http://perfmon2.sourceforge.net/), which is built as a
+  dependency via Bazel.
+
+The feature does not require modifying benchmark code. Counter collection is
+handled at the boundaries where timer collection is also handled. 
+
+To opt-in:
+* If using a Bazel build, add `--define pfm=1` to your build flags
+* If using CMake:
+  * Install `libpfm4-dev`, e.g. `apt-get install libpfm4-dev`.
+  * Enable the CMake flag `BENCHMARK_ENABLE_LIBPFM` in `CMakeLists.txt`.
+
+To use, pass a comma-separated list of counter names through the
+`--benchmark_perf_counters` flag. The names are decoded through libpfm - meaning,
+they are platform specific, but some (e.g. `CYCLES` or `INSTRUCTIONS`) are
+mapped by libpfm to platform-specifics - see libpfm
+[documentation](http://perfmon2.sourceforge.net/docs.html) for more details.
+
+The counter values are reported back through the [User Counters](../README.md#custom-counters)
+mechanism, meaning, they are available in all the formats (e.g. JSON) supported
+by User Counters.
--- a/third_party/benchmark/docs/platform_specific_build_instructions.md
+++ b/third_party/benchmark/docs/platform_specific_build_instructions.md
@@ -0,0 +1,52 @@
+# Platform Specific Build Instructions
+
+## Building with GCC
+
+When the library is built using GCC it is necessary to link with the pthread
+library due to how GCC implements `std::thread`. Failing to link to pthread will
+lead to runtime exceptions (unless you're using libc++), not linker errors. See
+[issue #67](https://github.com/google/benchmark/issues/67) for more details. You
+can link to pthread by adding `-pthread` to your linker command. Note, you can
+also use `-lpthread`, but there are potential issues with ordering of command
+line parameters if you use that.
+
+On QNX, the pthread library is part of libc and usually included automatically
+(see
+[`pthread_create()`](https://www.qnx.com/developers/docs/7.1/index.html#com.qnx.doc.neutrino.lib_ref/topic/p/pthread_create.html)).
+There's no separate pthread library to link.
+
+## Building with Visual Studio 2015, 2017 or 2022
+
+The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following:
+
+```
+// Alternatively, can add libraries using linker options.
+
+// First, Add the path to the generated library files (directory containing the `benchmark.lib`) in `[Configuration Properties > Linker > General > Additional Library Directories]`. Then do the following:
+#ifdef _WIN32
+#pragma comment ( lib, "Shlwapi.lib" )
+#ifdef _DEBUG
+#pragma comment ( lib, "benchmark.lib" )
+#else
+#pragma comment ( lib, "benchmark.lib" )
+#endif
+#endif
+```
+
+When using the static library, make sure to add `BENCHMARK_STATIC_DEFINE` under `[Configuration Properties > C/C++ > Preprocessor > Preprocessor Definitions]`
+
+Can also use the graphical version of CMake:
+* Open `CMake GUI`.
+* Under `Where to build the binaries`, same path as source plus `build`.
+* Under `CMAKE_INSTALL_PREFIX`, same path as source plus `install`.
+* Click `Configure`, `Generate`, `Open Project`.
+* If build fails, try deleting entire directory and starting again, or unticking options to build less.
+
+## Building with Intel 2015 Update 1 or Intel System Studio Update 4
+
+See instructions for building with Visual Studio. Once built, right click on the solution and change the build to Intel.
+
+## Building on Solaris
+
+If you're running benchmarks on Solaris, you'll want the kstat library linked in
+too (`-lkstat`).
--- a/third_party/benchmark/docs/python_bindings.md
+++ b/third_party/benchmark/docs/python_bindings.md
@@ -0,0 +1,34 @@
+# Building and installing Python bindings
+
+Python bindings are available as wheels on [PyPI](https://pypi.org/project/google-benchmark/) for importing and 
+using Google Benchmark directly in Python. 
+Currently, pre-built wheels exist for macOS (both ARM64 and Intel x86), Linux x86-64 and 64-bit Windows.
+Supported Python versions are Python 3.8 - 3.12.
+
+To install Google Benchmark's Python bindings, run:
+
+```bash
+python -m pip install --upgrade pip  # for manylinux2014 support
+python -m pip install google-benchmark
+```
+
+In order to keep your system Python interpreter clean, it is advisable to run these commands in a virtual
+environment. See the [official Python documentation](https://docs.python.org/3/library/venv.html) 
+on how to create virtual environments.
+
+To build a wheel directly from source, you can follow these steps:
+```bash
+git clone https://github.com/google/benchmark.git
+cd benchmark
+# create a virtual environment and activate it
+python3 -m venv venv --system-site-packages
+source venv/bin/activate  # .\venv\Scripts\Activate.ps1 on Windows
+
+# upgrade Python's system-wide packages
+python -m pip install --upgrade pip build
+# builds the wheel and stores it in the directory "dist".
+python -m build
+```
+
+NB: Building wheels from source requires Bazel. For platform-specific instructions on how to install Bazel,
+refer to the [Bazel installation docs](https://bazel.build/install).
--- a/third_party/benchmark/docs/random_interleaving.md
+++ b/third_party/benchmark/docs/random_interleaving.md
@@ -0,0 +1,13 @@
+<a name="interleaving" />
+
+# Random Interleaving
+
+[Random Interleaving](https://github.com/google/benchmark/issues/1051) is a
+technique to lower run-to-run variance. It randomly interleaves repetitions of a
+microbenchmark with repetitions from other microbenchmarks in the same benchmark
+test. Data shows it is able to lower run-to-run variance by
+[40%](https://github.com/google/benchmark/issues/1051) on average.
+
+To use, you mainly need to set `--benchmark_enable_random_interleaving=true`,
+and optionally specify non-zero repetition count `--benchmark_repetitions=9`
+and optionally decrease the per-repetition time `--benchmark_min_time=0.1`.
--- a/third_party/benchmark/docs/reducing_variance.md
+++ b/third_party/benchmark/docs/reducing_variance.md
@@ -0,0 +1,133 @@
+# Reducing Variance
+
+<a name="disabling-cpu-frequency-scaling" />
+
+## Disabling CPU Frequency Scaling
+
+If you see this error:
+
+```
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+```
+
+you might want to disable the CPU frequency scaling while running the
+benchmark, as well as consider other ways to stabilize the performance of
+your system while benchmarking.
+
+Exactly how to do this depends on the Linux distribution,
+desktop environment, and installed programs.  Specific details are a moving
+target, so we will not attempt to exhaustively document them here.
+
+One simple option is to use the `cpupower` program to change the
+performance governor to "performance".  This tool is maintained along with
+the Linux kernel and provided by your distribution.
+
+It must be run as root, like this:
+
+```bash
+sudo cpupower frequency-set --governor performance
+```
+
+After this you can verify that all CPUs are using the performance governor
+by running this command:
+
+```bash
+cpupower frequency-info -o proc
+```
+
+The benchmarks you subsequently run will have less variance.
+
+<a name="reducing-variance" />
+
+## Disabling ASLR
+
+If you see this error:
+
+```
+***WARNING*** ASLR is enabled, the results may have unreproducible noise in them.
+```
+
+you might want to disable the ASLR security hardening feature while running the
+benchmark.
+
+The simplest way is to add
+```
+benchmark::MaybeReenterWithoutASLR(argc, argv);
+```
+as the first line of your `main()` function. It will try to disable ASLR
+for the current process, and, if successful, re-execute the binary.
+Note that `personality(2)` may be forbidden by e.g. seccomp (which happens
+by default if you are running in a Docker container).
+
+Note that if you link to `benchmark_main`, it already does that for you.
+
+To globally disable ASLR on Linux, run
+```
+echo 0 > /proc/sys/kernel/randomize_va_space
+```
+
+To run a single benchmark with ASLR disabled on Linux, do:
+```
+setarch `uname -m` -R ./a_benchmark
+```
+
+Note that for the information on how to disable ASLR on other operating systems,
+please refer to their documentation.
+
+## Reducing Variance in Benchmarks
+
+The Linux CPU frequency governor [discussed
+above](#disabling-cpu-frequency-scaling) is not the only source
+of noise in benchmarks.  Some, but not all, of the sources of variance
+include:
+
+1. On multi-core machines not all CPUs/CPU cores/CPU threads run the same
+   speed, so running a benchmark one time and then again may give a
+   different result depending on which CPU it ran on.
+2. CPU scaling features that run on the CPU, like Intel's Turbo Boost and
+   AMD Turbo Core and Precision Boost, can temporarily change the CPU
+   frequency even when using the "performance" governor on Linux.
+3. Context switching between CPUs, or scheduling competition on the CPU the
+   benchmark is running on.
+4. Intel Hyperthreading or AMD SMT causing the same issue as above.
+5. Cache effects caused by code running on other CPUs.
+6. Non-uniform memory architectures (NUMA).
+
+These can cause variance in benchmarks results within a single run
+(`--benchmark_repetitions=N`) or across multiple runs of the benchmark
+program.
+
+Reducing sources of variance is OS and architecture dependent, which is one
+reason some companies maintain machines dedicated to performance testing.
+
+Some of the easier and effective ways of reducing variance on a typical
+Linux workstation are:
+
+1. Use the performance governor as [discussed
+above](#disabling-cpu-frequency-scaling).
+1. Disable processor boosting by:
+   ```sh
+   echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost
+   ```
+   See the Linux kernel's
+   [boost.txt](https://www.kernel.org/doc/Documentation/cpu-freq/boost.txt)
+   for more information.
+2. Set the benchmark program's task affinity to a fixed cpu.  For example:
+   ```sh
+   taskset -c 0 ./mybenchmark
+   ```
+3. Disable Hyperthreading/SMT.  This can be done in the BIOS or using the
+   `/sys` file system (see the LLVM project's [Benchmarking
+   tips](https://llvm.org/docs/Benchmarking.html)).
+4. Close other programs that do non-trivial things based on timers, such as
+   your web browser, desktop environment, etc.
+5. Reduce the working set of your benchmark to fit within the L1 cache, but
+   do be aware that this may lead you to optimize for an unrealistic
+   situation.
+
+Further resources on this topic:
+
+1. The LLVM project's [Benchmarking
+   tips](https://llvm.org/docs/Benchmarking.html).
+1. The Arch Wiki [CPU frequency
+scaling](https://wiki.archlinux.org/title/CPU_frequency_scaling) page.
--- a/third_party/benchmark/docs/releasing.md
+++ b/third_party/benchmark/docs/releasing.md
@@ -0,0 +1,38 @@
+# How to release
+
+* Make sure you're on main and synced to HEAD
+* Ensure the project builds and tests run
+    * `parallel -j0 exec ::: test/*_test` can help ensure everything at least
+      passes
+* Prepare release notes
+    * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
+      commits between the last annotated tag and HEAD
+    * Pick the most interesting.
+* Create one last commit that updates the version saved in `CMakeLists.txt`, `MODULE.bazel`,
+  and `bindings/python/google_benchmark/__init__.py` to the release version you're creating.
+  (This version will be used if benchmark is installed from the archive you'll be creating
+  in the next step.)
+
+```
+# CMakeLists.txt
+project (benchmark VERSION 1.9.0 LANGUAGES CXX)
+```
+
+```
+# MODULE.bazel
+module(name = "com_github_google_benchmark", version="1.9.0")
+```
+
+```
+# google_benchmark/__init__.py
+__version__ = "1.9.0"
+```
+
+* Create a release through GitHub's interface
+    * Note this will create a lightweight tag.
+    * Update this to an annotated tag:
+      * `git pull --tags`
+      * `git tag -a -f <tag> <tag>`
+      * `git push --force --tags origin`
+* Confirm that the "Build and upload Python wheels" action runs to completion
+    * Run it manually if it hasn't run.
--- a/third_party/benchmark/docs/tools.md
+++ b/third_party/benchmark/docs/tools.md
@@ -4,7 +4,11 @@

 The `compare.py` can be used to compare the result of benchmarks.

-**NOTE**: the utility relies on the scipy package which can be installed using [these instructions](https://www.scipy.org/install.html).
+### Dependencies
+The utility relies on the [scipy](https://www.scipy.org) package which can be installed using pip:
+```bash
+pip3 install -r requirements.txt
+```

 ### Displaying aggregates only

@@ -182,6 +186,146 @@ Benchmark                               Time             CPU      Time Old
 This is a mix of the previous two modes, two (potentially different) benchmark binaries are run, and a different filter is applied to each one.
 As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.

+### Note: Interpreting the output
+
+Performance measurements are an art, and performance comparisons are doubly so.
+Results are often noisy and don't necessarily have large absolute differences to
+them, so just by visual inspection, it is not at all apparent if two
+measurements are actually showing a performance change or not. It is even more
+confusing with multiple benchmark repetitions.
+
+Thankfully, we can use statistical tests on the results to determine whether
+the performance has changed in a statistically significant way. `compare.py`
+uses the [Mann–Whitney U
+test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test), with the null
+hypothesis being that there's no difference in performance.
+ 
+**The below output is a summary of a benchmark comparison with statistics
+provided for a multi-threaded process.**
+```
+Benchmark                                               Time        CPU    Time Old      Time New       CPU Old       CPU New
+-----------------------------------------------------------------------------------------------------------------------------
+benchmark/threads:1/process_time/real_time_pvalue     0.0000     0.0000    U Test, Repetitions: 27 vs 27
+benchmark/threads:1/process_time/real_time_mean      -0.1442    -0.1442          90            77            90            77
+benchmark/threads:1/process_time/real_time_median    -0.1444    -0.1444          90            77            90            77
+benchmark/threads:1/process_time/real_time_stddev    +0.3974    +0.3933           0             0             0             0
+benchmark/threads:1/process_time/real_time_cv        +0.6329    +0.6280           0             0             0             0
+OVERALL_GEOMEAN                                      -0.1442    -0.1442           0             0             0             0
+```
+--------------------------------------------
+Here's a breakdown of each row:
+
+**benchmark/threads:1/process_time/real_time_pvalue**: This shows the _p-value_ for
+the statistical test comparing the performance of the process running with one
+thread. A value of 0.0000 suggests a statistically significant difference in
+performance. The comparison was conducted using the U Test (Mann-Whitney
+U Test) with 27 repetitions for each case.
+
+**benchmark/threads:1/process_time/real_time_mean**: This shows the relative
+difference in mean execution time between two different cases. The negative
+value (-0.1442) implies that the new process is faster by about 14.42%. The old
+time was 90 units, while the new time is 77 units.
+
+**benchmark/threads:1/process_time/real_time_median**: Similarly, this shows the
+relative difference in the median execution time. Again, the new process is
+faster by 14.44%.
+
+**benchmark/threads:1/process_time/real_time_stddev**: This is the relative
+difference in the standard deviation of the execution time, which is a measure
+of how much variation or dispersion there is from the mean. A positive value
+(+0.3974) implies there is more variance in the execution time in the new
+process.
+
+**benchmark/threads:1/process_time/real_time_cv**: CV stands for Coefficient of
+Variation. It is the ratio of the standard deviation to the mean. It provides a
+standardized measure of dispersion. An increase (+0.6329) indicates more
+relative variability in the new process.
+
+**OVERALL_GEOMEAN**: Geomean stands for geometric mean, a type of average that is
+less influenced by outliers. The negative value indicates a general improvement
+in the new process. However, given the values are all zero for the old and new
+times, this seems to be a mistake or placeholder in the output.
+
+-----------------------------------------
+
+
+
+Let's first try to see what the different columns represent in the above
+`compare.py` benchmarking output:
+
+  1. **Benchmark:** The name of the function being benchmarked, along with the
+     size of the input (after the slash).
+
+  2. **Time:** The average time per operation, across all iterations.
+
+  3. **CPU:** The average CPU time per operation, across all iterations.
+
+  4. **Iterations:** The number of iterations the benchmark was run to get a
+     stable estimate.
+
+  5. **Time Old and Time New:** These represent the average time it takes for a
+     function to run in two different scenarios or versions. For example, you
+     might be comparing how fast a function runs before and after you make some
+     changes to it.
+
+  6. **CPU Old and CPU New:** These show the average amount of CPU time that the
+     function uses in two different scenarios or versions. This is similar to
+     Time Old and Time New, but focuses on CPU usage instead of overall time.
+
+In the comparison section, the relative differences in both time and CPU time
+are displayed for each input size.
+
+
+A statistically-significant difference is determined by a **p-value**, which is
+a measure of the probability that the observed difference could have occurred
+just by random chance. A smaller p-value indicates stronger evidence against the
+null hypothesis. 
+
+**Therefore:**
+  1. If the p-value is less than the chosen significance level (alpha), we
+     reject the null hypothesis and conclude the benchmarks are significantly
+     different.
+  2. If the p-value is greater than or equal to alpha, we fail to reject the
+     null hypothesis and treat the two benchmarks as similar.
+
+
+
+The result of said statistical test is additionally communicated through color coding:
+```diff
+ Green:
+```
+  The benchmarks are _**statistically different**_. This could mean the
+  performance has either **significantly improved** or **significantly
+  deteriorated**. You should look at the actual performance numbers to see which
+  is the case.
+```diff
+- Red:
+```
+  The benchmarks are _**statistically similar**_. This means the performance
+  **hasn't significantly changed**.
+
+In statistical terms, **'green'** means we reject the null hypothesis that
+there's no difference in performance, and **'red'** means we fail to reject the
+null hypothesis. This might seem counter-intuitive if you're expecting 'green'
+to mean 'improved performance' and 'red' to mean 'worsened performance'. 
+```bash
+  But remember, in this context:
+
+    'Success' means 'successfully finding a difference'.
+    'Failure' means 'failing to find a difference'.
+```
+
+
+Also, please note that **even if** we determine that there **is** a
+statistically-significant difference between the two measurements, it does not
+_necessarily_ mean that the actual benchmarks that were measured **are**
+different. Conversely, even if we determine that there is **no**
+statistically-significant difference between the two measurements, it does not
+necessarily mean that the actual benchmarks that were measured **are not**
+different.
+
+
+
 ### U test

 If there is a sufficient repetition count of the benchmarks, the tool can do
--- a/third_party/benchmark/docs/user_guide.md
+++ b/third_party/benchmark/docs/user_guide.md
--- a/third_party/benchmark/include/benchmark/benchmark.h
+++ b/third_party/benchmark/include/benchmark/benchmark.h
--- a/third_party/benchmark/include/benchmark/export.h
+++ b/third_party/benchmark/include/benchmark/export.h
@@ -0,0 +1,47 @@
+#ifndef BENCHMARK_EXPORT_H
+#define BENCHMARK_EXPORT_H  // Symbol visibility and deprecation macros.
+
+#if defined(_WIN32)
+#define EXPORT_ATTR __declspec(dllexport)
+#define IMPORT_ATTR __declspec(dllimport)
+#define NO_EXPORT_ATTR
+#define DEPRECATE_ATTR __declspec(deprecated)  // same name as the #else branch; BENCHMARK_DEPRECATED expands to it
+#else  // _WIN32
+#define EXPORT_ATTR __attribute__((visibility("default")))
+#define IMPORT_ATTR __attribute__((visibility("default")))
+#define NO_EXPORT_ATTR __attribute__((visibility("hidden")))
+#define DEPRECATE_ATTR __attribute__((__deprecated__))
+#endif  // _WIN32
+
+#ifdef BENCHMARK_STATIC_DEFINE  // static build: export markers expand to nothing
+#define BENCHMARK_EXPORT
+#define BENCHMARK_NO_EXPORT
+#else  // BENCHMARK_STATIC_DEFINE
+#ifndef BENCHMARK_EXPORT
+#ifdef benchmark_EXPORTS
+/* We are building this library */
+#define BENCHMARK_EXPORT EXPORT_ATTR
+#else  // benchmark_EXPORTS
+/* We are using this library */
+#define BENCHMARK_EXPORT IMPORT_ATTR
+#endif  // benchmark_EXPORTS
+#endif  // !BENCHMARK_EXPORT
+
+#ifndef BENCHMARK_NO_EXPORT
+#define BENCHMARK_NO_EXPORT NO_EXPORT_ATTR
+#endif  // !BENCHMARK_NO_EXPORT
+#endif  // BENCHMARK_STATIC_DEFINE
+
+#ifndef BENCHMARK_DEPRECATED  // may be pre-defined by the includer to override
+#define BENCHMARK_DEPRECATED DEPRECATE_ATTR
+#endif  // BENCHMARK_DEPRECATED
+
+#ifndef BENCHMARK_DEPRECATED_EXPORT
+#define BENCHMARK_DEPRECATED_EXPORT BENCHMARK_EXPORT BENCHMARK_DEPRECATED
+#endif  // BENCHMARK_DEPRECATED_EXPORT
+
+#ifndef BENCHMARK_DEPRECATED_NO_EXPORT
+#define BENCHMARK_DEPRECATED_NO_EXPORT BENCHMARK_NO_EXPORT BENCHMARK_DEPRECATED
+#endif  // BENCHMARK_DEPRECATED_NO_EXPORT
+
+#endif /* BENCHMARK_EXPORT_H */
--- a/third_party/benchmark/mingw.py
+++ b/third_party/benchmark/mingw.py
@@ -1,320 +0,0 @@
-#! /usr/bin/env python
-# encoding: utf-8
-
-import argparse
-import errno
-import logging
-import os
-import platform
-import re
-import sys
-import subprocess
-import tempfile
-
-try:
-    import winreg
-except ImportError:
-    import _winreg as winreg
-try:
-    import urllib.request as request
-except ImportError:
-    import urllib as request
-try:
-    import urllib.parse as parse
-except ImportError:
-    import urlparse as parse
-
-class EmptyLogger(object):
-    '''
-    Provides an implementation that performs no logging
-    '''
-    def debug(self, *k, **kw):
-        pass
-    def info(self, *k, **kw):
-        pass
-    def warn(self, *k, **kw):
-        pass
-    def error(self, *k, **kw):
-        pass
-    def critical(self, *k, **kw):
-        pass
-    def setLevel(self, *k, **kw):
-        pass
-
-urls = (
-    'http://downloads.sourceforge.net/project/mingw-w64/Toolchains%20'
-        'targetting%20Win32/Personal%20Builds/mingw-builds/installer/'
-        'repository.txt',
-    'http://downloads.sourceforge.net/project/mingwbuilds/host-windows/'
-        'repository.txt'
-)
-'''
-A list of mingw-build repositories
-'''
-
-def repository(urls = urls, log = EmptyLogger()):
-    '''
-    Downloads and parse mingw-build repository files and parses them
-    '''
-    log.info('getting mingw-builds repository')
-    versions = {}
-    re_sourceforge = re.compile(r'http://sourceforge.net/projects/([^/]+)/files')
-    re_sub = r'http://downloads.sourceforge.net/project/\1'
-    for url in urls:
-        log.debug(' - requesting: %s', url)
-        socket = request.urlopen(url)
-        repo = socket.read()
-        if not isinstance(repo, str):
-            repo = repo.decode();
-        socket.close()
-        for entry in repo.split('\n')[:-1]:
-            value = entry.split('|')
-            version = tuple([int(n) for n in value[0].strip().split('.')])
-            version = versions.setdefault(version, {})
-            arch = value[1].strip()
-            if arch == 'x32':
-                arch = 'i686'
-            elif arch == 'x64':
-                arch = 'x86_64'
-            arch = version.setdefault(arch, {})
-            threading = arch.setdefault(value[2].strip(), {})
-            exceptions = threading.setdefault(value[3].strip(), {})
-            revision = exceptions.setdefault(int(value[4].strip()[3:]),
-                re_sourceforge.sub(re_sub, value[5].strip()))
-    return versions
-
-def find_in_path(file, path=None):
-    '''
-    Attempts to find an executable in the path
-    '''
-    if platform.system() == 'Windows':
-        file += '.exe'
-    if path is None:
-        path = os.environ.get('PATH', '')
-    if type(path) is type(''):
-        path = path.split(os.pathsep)
-    return list(filter(os.path.exists,
-        map(lambda dir, file=file: os.path.join(dir, file), path)))
-
-def find_7zip(log = EmptyLogger()):
-    '''
-    Attempts to find 7zip for unpacking the mingw-build archives
-    '''
-    log.info('finding 7zip')
-    path = find_in_path('7z')
-    if not path:
-        key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\7-Zip')
-        path, _ = winreg.QueryValueEx(key, 'Path')
-        path = [os.path.join(path, '7z.exe')]
-    log.debug('found \'%s\'', path[0])
-    return path[0]
-
-find_7zip()
-
-def unpack(archive, location, log = EmptyLogger()):
-    '''
-    Unpacks a mingw-builds archive
-    '''
-    sevenzip = find_7zip(log)
-    log.info('unpacking %s', os.path.basename(archive))
-    cmd = [sevenzip, 'x', archive, '-o' + location, '-y']
-    log.debug(' - %r', cmd)
-    with open(os.devnull, 'w') as devnull:
-        subprocess.check_call(cmd, stdout = devnull)
-
-def download(url, location, log = EmptyLogger()):
-    '''
-    Downloads and unpacks a mingw-builds archive
-    '''
-    log.info('downloading MinGW')
-    log.debug(' - url: %s', url)
-    log.debug(' - location: %s', location)
-
-    re_content = re.compile(r'attachment;[ \t]*filename=(")?([^"]*)(")?[\r\n]*')
-
-    stream = request.urlopen(url)
-    try:
-        content = stream.getheader('Content-Disposition') or ''
-    except AttributeError:
-        content = stream.headers.getheader('Content-Disposition') or ''
-    matches = re_content.match(content)
-    if matches:
-        filename = matches.group(2)
-    else:
-        parsed = parse.urlparse(stream.geturl())
-        filename = os.path.basename(parsed.path)
-
-    try:
-        os.makedirs(location)
-    except OSError as e:
-        if e.errno == errno.EEXIST and os.path.isdir(location):
-            pass
-        else:
-            raise
-
-    archive = os.path.join(location, filename)
-    with open(archive, 'wb') as out:
-        while True:
-            buf = stream.read(1024)
-            if not buf:
-                break
-            out.write(buf)
-    unpack(archive, location, log = log)
-    os.remove(archive)
-
-    possible = os.path.join(location, 'mingw64')
-    if not os.path.exists(possible):
-        possible = os.path.join(location, 'mingw32')
-        if not os.path.exists(possible):
-            raise ValueError('Failed to find unpacked MinGW: ' + possible)
-    return possible
-
-def root(location = None, arch = None, version = None, threading = None,
-        exceptions = None, revision = None, log = EmptyLogger()):
-    '''
-    Returns the root folder of a specific version of the mingw-builds variant
-    of gcc. Will download the compiler if needed
-    '''
-
-    # Get the repository if we don't have all the information
-    if not (arch and version and threading and exceptions and revision):
-        versions = repository(log = log)
-
-    # Determine some defaults
-    version = version or max(versions.keys())
-    if not arch:
-        arch = platform.machine().lower()
-        if arch == 'x86':
-            arch = 'i686'
-        elif arch == 'amd64':
-            arch = 'x86_64'
-    if not threading:
-        keys = versions[version][arch].keys()
-        if 'posix' in keys:
-            threading = 'posix'
-        elif 'win32' in keys:
-            threading = 'win32'
-        else:
-            threading = keys[0]
-    if not exceptions:
-        keys = versions[version][arch][threading].keys()
-        if 'seh' in keys:
-            exceptions = 'seh'
-        elif 'sjlj' in keys:
-            exceptions = 'sjlj'
-        else:
-            exceptions = keys[0]
-    if revision == None:
-        revision = max(versions[version][arch][threading][exceptions].keys())
-    if not location:
-        location = os.path.join(tempfile.gettempdir(), 'mingw-builds')
-
-    # Get the download url
-    url = versions[version][arch][threading][exceptions][revision]
-
-    # Tell the user whatzzup
-    log.info('finding MinGW %s', '.'.join(str(v) for v in version))
-    log.debug(' - arch: %s', arch)
-    log.debug(' - threading: %s', threading)
-    log.debug(' - exceptions: %s', exceptions)
-    log.debug(' - revision: %s', revision)
-    log.debug(' - url: %s', url)
-
-    # Store each specific revision differently
-    slug = '{version}-{arch}-{threading}-{exceptions}-rev{revision}'
-    slug = slug.format(
-        version = '.'.join(str(v) for v in version),
-        arch = arch,
-        threading = threading,
-        exceptions = exceptions,
-        revision = revision
-    )
-    if arch == 'x86_64':
-        root_dir = os.path.join(location, slug, 'mingw64')
-    elif arch == 'i686':
-        root_dir = os.path.join(location, slug, 'mingw32')
-    else:
-        raise ValueError('Unknown MinGW arch: ' + arch)
-
-    # Download if needed
-    if not os.path.exists(root_dir):
-        downloaded = download(url, os.path.join(location, slug), log = log)
-        if downloaded != root_dir:
-            raise ValueError('The location of mingw did not match\n%s\n%s'
-                % (downloaded, root_dir))
-
-    return root_dir
-
-def str2ver(string):
-    '''
-    Converts a version string into a tuple
-    '''
-    try:
-        version = tuple(int(v) for v in string.split('.'))
-        if len(version) is not 3:
-            raise ValueError()
-    except ValueError:
-        raise argparse.ArgumentTypeError(
-            'please provide a three digit version string')
-    return version
-
-def main():
-    '''
-    Invoked when the script is run directly by the python interpreter
-    '''
-    parser = argparse.ArgumentParser(
-        description = 'Downloads a specific version of MinGW',
-        formatter_class = argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument('--location',
-        help = 'the location to download the compiler to',
-        default = os.path.join(tempfile.gettempdir(), 'mingw-builds'))
-    parser.add_argument('--arch', required = True, choices = ['i686', 'x86_64'],
-        help = 'the target MinGW architecture string')
-    parser.add_argument('--version', type = str2ver,
-        help = 'the version of GCC to download')
-    parser.add_argument('--threading', choices = ['posix', 'win32'],
-        help = 'the threading type of the compiler')
-    parser.add_argument('--exceptions', choices = ['sjlj', 'seh', 'dwarf'],
-        help = 'the method to throw exceptions')
-    parser.add_argument('--revision', type=int,
-        help = 'the revision of the MinGW release')
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument('-v', '--verbose', action='store_true',
-        help='increase the script output verbosity')
-    group.add_argument('-q', '--quiet', action='store_true',
-        help='only print errors and warning')
-    args = parser.parse_args()
-
-    # Create the logger
-    logger = logging.getLogger('mingw')
-    handler = logging.StreamHandler()
-    formatter = logging.Formatter('%(message)s')
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-    logger.setLevel(logging.INFO)
-    if args.quiet:
-        logger.setLevel(logging.WARN)
-    if args.verbose:
-        logger.setLevel(logging.DEBUG)
-
-    # Get MinGW
-    root_dir = root(location = args.location, arch = args.arch,
-        version = args.version, threading = args.threading,
-        exceptions = args.exceptions, revision = args.revision,
-        log = logger)
-
-    sys.stdout.write('%s\n' % os.path.join(root_dir, 'bin'))
-
-if __name__ == '__main__':
-    try:
-        main()
-    except IOError as e:
-        sys.stderr.write('IO error: %s\n' % e)
-        sys.exit(1)
-    except OSError as e:
-        sys.stderr.write('OS error: %s\n' % e)
-        sys.exit(1)
-    except KeyboardInterrupt as e:
-        sys.stderr.write('Killed\n')
-        sys.exit(1)
--- a/third_party/benchmark/pyproject.toml
+++ b/third_party/benchmark/pyproject.toml
@@ -0,0 +1,78 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "google_benchmark"
+description = "A library to benchmark code snippets."
+requires-python = ">=3.10"
+license = { file = "LICENSE" }
+keywords = ["benchmark"]
+
+authors = [{ name = "Google", email = "benchmark-discuss@googlegroups.com" }]
+
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Software Development :: Testing",
+    "Topic :: System :: Benchmark",
+]
+
+dynamic = ["readme", "version"]
+
+dependencies = ["absl-py>=0.7.1"]
+
+[project.optional-dependencies]
+dev = ["pre-commit>=3.3.3"]
+
+[project.urls]
+Homepage = "https://github.com/google/benchmark"
+Documentation = "https://github.com/google/benchmark/tree/main/docs"
+Repository = "https://github.com/google/benchmark.git"
+Discord = "https://discord.gg/cz7UX7wKC2"
+
+[tool.setuptools]
+package-dir = { "" = "bindings/python" }
+zip-safe = false
+
+[tool.setuptools.packages.find]
+where = ["bindings/python"]
+
+[tool.setuptools.dynamic]
+readme = { file = "README.md", content-type = "text/markdown" }
+version = { attr = "google_benchmark.__version__" }
+
+[tool.mypy]
+check_untyped_defs = true
+disallow_incomplete_defs = true
+pretty = true
+python_version = "3.11"
+strict_optional = false
+warn_unreachable = true
+
+[[tool.mypy.overrides]]
+module = ["yaml"]
+ignore_missing_imports = true
+
+[tool.ruff]
+# explicitly tell ruff the source directory to correctly identify first-party package.
+src = ["bindings/python"]
+
+line-length = 80
+target-version = "py311"
+
+[tool.ruff.lint]
+# Enable pycodestyle (`E`, `W`), Pyflakes (`F`), isort (`I`), and the additional rule sets listed in `select`.
+select = ["ASYNC", "B", "C4", "C90", "E", "F", "I", "PERF", "PIE", "PT018", "RUF", "SIM", "UP", "W"]
+ignore = [
+    "PLW2901",  # redefined-loop-name
+    "UP031",    # printf-string-formatting
+]
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
--- a/third_party/benchmark/releasing.md
+++ b/third_party/benchmark/releasing.md
@@ -1,16 +0,0 @@
-# How to release
-
-* Make sure you're on master and synced to HEAD
-* Ensure the project builds and tests run (sanity check only, obviously)
-    * `parallel -j0 exec ::: test/*_test` can help ensure everything at least
-      passes
-* Prepare release notes
-    * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
-      commits between the last annotated tag and HEAD
-    * Pick the most interesting.
-* Create a release through github's interface
-    * Note this will create a lightweight tag.
-    * Update this to an annotated tag:
-      * `git pull --tags`
-      * `git tag -a -f <tag> <tag>`
-      * `git push --force origin`
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Doris Wu	56050de5cd	metal: implement memory mapping (#9454 )	2025-11-27 11:05:07 +08:00
Mathias Agopian	772136decd	fix a typo/bad merge that broke setPresentationTime (#9452 ) FIXES=[462533574]	2025-11-21 11:40:17 -08:00
Powei Feng	f664601c51	android: add readPixels callback for ModelViewer (#9440 )	2025-11-21 19:33:36 +00:00
Ben Doherty	f06b27b7fb	Fix getter methods (#9450 )	2025-11-21 00:37:28 +00:00
Mathias Agopian	ef53ce88d4	don't assume compositor and frame timing are always available (#9449 ) - this is not true fro headless swapchains - and potentially some timings might not be available on a given nativewindow Previously the code would handle these errors gracefully, but the errors were still generated. Now, we query the availability and only make the calls if supported. Also on EGL, we don't attempt to use private APIs -- this code path should never be used anyways.	2025-11-20 15:32:02 -08:00
Mathias Agopian	ef18030e1a	frameId must be monotonic in the SwapChain (#9447 ) The frameId coming from a Renderer must be monotonic when seen from a SwapChain (Specifically a ANativeWindow on Android), if it's not the case, we must clear that part of the history. This can happen if a SwapChain is used with two different Renderer; at this point that SwapChain's history is no longer connected to that Renderer.	2025-11-20 13:23:58 -08:00
Doris Wu	f07176c0a2	Try to fix test_UboBatching (#9448 )	2025-11-20 15:14:19 +08:00
Doris Wu	15db141c7a	Add some unit tests for UboManager (#9446 )	2025-11-20 14:28:13 +08:00
Doris Wu	d4bbb7c591	buffer update opt: Some optimizations (#9438 )	2025-11-20 00:52:04 +00:00
Sungun Park	92e620d2ad	Simplify buffer object creation (#9436 ) Simplify the buffer object creation logic to streamline and help make the future integration of asynchronous features easier.	2025-11-19 22:03:05 +00:00
Mathias Agopian	311104da97	update google benchmark library to 1.9.4 (#9441 ) * benchmark: update README and add update script * update google benchmark library to 1.9.4 * update tnt CMakeLists to match the library new version	2025-11-19 11:50:34 -08:00
Filament Bot	3127632f96	[automated] Updating /docs due to commit `59f611b` Full commit hash is `59f611bfde` DOCS_ALLOW_DIRECT_EDITS	2025-11-19 19:32:56 +00:00