Log GPU errors through Platform debugStat API (#9434)

This commit is contained in:
Ben Doherty
2025-11-19 13:42:42 -05:00
committed by GitHub
parent 6550a63056
commit 65e7dba8b8
7 changed files with 161 additions and 19 deletions

View File

@@ -8,5 +8,5 @@ appropriate header in [RELEASE_NOTES.md](./RELEASE_NOTES.md).
## Release notes for next branch cut
- engine: add `View::getLastDynamicResolutionScale()` (b/457753622)
- Metal: report GPU errors to the platform via `debugUpdateStat` (b/431665753).
- materials: Make Material Instances' UBO descriptor use dynamic offsets. [⚠️ **Recompile Materials**]

View File

@@ -19,13 +19,17 @@
#ifndef TNT_FILAMENT_BACKEND_PLATFORM_H
#define TNT_FILAMENT_BACKEND_PLATFORM_H
#include <utils/CString.h>
#include <utils/compiler.h>
#include <utils/Invocable.h>
#include <utils/Mutex.h>
#include <stddef.h>
#include <stdint.h>
#include <atomic>
#include <memory>
#include <mutex>
namespace filament::backend {
@@ -508,13 +512,22 @@ public:
// --------------------------------------------------------------------------------------------
// Debugging APIs
using DebugUpdateStatFunc = utils::Invocable<void(const char* UTILS_NONNULL key, uint64_t value)>;
using DebugUpdateStatFunc = utils::Invocable<void(const char* UTILS_NONNULL key,
uint64_t intValue, utils::CString stringValue)>;
/**
* Sets the callback function that the backend can use to update backend-specific statistics
* to aid with debugging. This callback is guaranteed to be called on the Filament driver
* thread.
*
* The callback signature is (key, intValue, stringValue). Note that for any given call,
* only one of the value parameters (intValue or stringValue) will be meaningful, depending on
* the specific key.
*
* IMPORTANT_NOTE: because the callback is called on the driver thread, only quick, non-blocking
* work should be done inside it. Furthermore, no graphics API calls (such as GL calls) should
* be made, which could interfere with Filament's driver state.
*
* @param debugUpdateStat an Invocable that updates debug statistics
*/
void setDebugUpdateStatFunc(DebugUpdateStatFunc&& debugUpdateStat) noexcept;
@@ -533,15 +546,32 @@ public:
* This function is guaranteed to be called only on a single thread, the Filament driver
* thread.
*
* @param key a null-terminated C-string with the key of the debug statistic
* @param value the updated value of key
* @param key a null-terminated C-string with the key of the debug statistic
* @param intValue the updated integer value of key (the string value passed to the
* callback will be empty)
*/
void debugUpdateStat(const char* UTILS_NONNULL key, uint64_t value);
void debugUpdateStat(const char* UTILS_NONNULL key, uint64_t intValue);
/**
* To track backend-specific statistics, the backend implementation can call the
* application-provided callback function debugUpdateStatFunc to associate or update a value
* with a given key. It is possible for this function to be called multiple times with the
* same key, in which case newer values should overwrite older values.
*
* This function is guaranteed to be called only on a single thread, the Filament driver
* thread.
*
* @param key a null-terminated C-string with the key of the debug statistic
* @param stringValue the updated string value of key (the integer value passed to the
* callback will be 0)
*/
void debugUpdateStat(const char* UTILS_NONNULL key, utils::CString stringValue);
private:
InsertBlobFunc mInsertBlob;
RetrieveBlobFunc mRetrieveBlob;
DebugUpdateStatFunc mDebugUpdateStat;
std::shared_ptr<InsertBlobFunc> mInsertBlob;
std::shared_ptr<RetrieveBlobFunc> mRetrieveBlob;
std::shared_ptr<DebugUpdateStatFunc> mDebugUpdateStat;
mutable utils::Mutex mMutex;
};
} // namespace filament::backend

View File

@@ -139,42 +139,73 @@ bool Platform::queryFrameTimestamps(SwapChain const*, uint64_t, FrameTimestamps*
}
void Platform::setBlobFunc(InsertBlobFunc&& insertBlob, RetrieveBlobFunc&& retrieveBlob) noexcept {
mInsertBlob = std::move(insertBlob);
mRetrieveBlob = std::move(retrieveBlob);
std::lock_guard<decltype(mMutex)> lock(mMutex);
mInsertBlob = std::make_shared<InsertBlobFunc>(std::move(insertBlob));
mRetrieveBlob = std::make_shared<RetrieveBlobFunc>(std::move(retrieveBlob));
}
/**
 * Reports whether a usable insert-blob callback is currently installed.
 *
 * setBlobFunc() unconditionally wraps the Invocable it receives in a shared_ptr, so the pointer
 * is non-null even when the wrapped Invocable itself is empty. Both levels are checked so that
 * an empty Invocable is still reported as "no callback".
 *
 * @return true if an insert-blob callback is set and is not empty
 */
bool Platform::hasInsertBlobFunc() const noexcept {
    // mInsertBlob can be replaced concurrently by setBlobFunc(); read it under the same mutex.
    std::lock_guard<decltype(mMutex)> lock(mMutex);
    return mInsertBlob && bool(*mInsertBlob);
}
/**
 * Reports whether a usable retrieve-blob callback is currently installed.
 *
 * setBlobFunc() unconditionally wraps the Invocable it receives in a shared_ptr, so the pointer
 * is non-null even when the wrapped Invocable itself is empty. Both levels are checked so that
 * an empty Invocable is still reported as "no callback".
 *
 * @return true if a retrieve-blob callback is set and is not empty
 */
bool Platform::hasRetrieveBlobFunc() const noexcept {
    // mRetrieveBlob can be replaced concurrently by setBlobFunc(); read it under the same mutex.
    std::lock_guard<decltype(mMutex)> lock(mMutex);
    return mRetrieveBlob && bool(*mRetrieveBlob);
}
void Platform::insertBlob(void const* key, size_t keySize, void const* value, size_t valueSize) {
if (mInsertBlob) {
mInsertBlob(key, keySize, value, valueSize);
std::shared_ptr<InsertBlobFunc> callback;
{
std::unique_lock<decltype(mMutex)> lock(mMutex);
callback = mInsertBlob;
}
if (callback) {
(*callback)(key, keySize, value, valueSize);
}
}
size_t Platform::retrieveBlob(void const* key, size_t keySize, void* value, size_t valueSize) {
if (mRetrieveBlob) {
return mRetrieveBlob(key, keySize, value, valueSize);
std::shared_ptr<RetrieveBlobFunc> callback;
{
std::unique_lock<decltype(mMutex)> lock(mMutex);
callback = mRetrieveBlob;
}
if (callback) {
return (*callback)(key, keySize, value, valueSize);
}
return 0;
}
void Platform::setDebugUpdateStatFunc(DebugUpdateStatFunc&& debugUpdateStat) noexcept {
mDebugUpdateStat = std::move(debugUpdateStat);
std::lock_guard<decltype(mMutex)> lock(mMutex);
mDebugUpdateStat = std::make_shared<DebugUpdateStatFunc>(std::move(debugUpdateStat));
}
bool Platform::hasDebugUpdateStatFunc() const noexcept {
return bool(mDebugUpdateStat);
std::lock_guard<decltype(mMutex)> lock(mMutex);
return mDebugUpdateStat != nullptr;
}
void Platform::debugUpdateStat(const char* key, uint64_t value) {
if (mDebugUpdateStat) {
mDebugUpdateStat(key, value);
/**
 * Forwards an integer debug statistic to the application-provided callback, if one is set.
 *
 * The callback is invoked as (key, intValue, "") so the string value it receives is empty.
 *
 * @param key      null-terminated C-string naming the statistic
 * @param intValue the updated integer value for key
 */
void Platform::debugUpdateStat(const char* key, uint64_t intValue) {
    // Take a reference to the current callback under the lock, then release the lock before
    // invoking it so the user callback never runs while mMutex is held.
    std::shared_ptr<DebugUpdateStatFunc> callback;
    {
        std::unique_lock<decltype(mMutex)> lock(mMutex);
        callback = mDebugUpdateStat;
    }
    // setDebugUpdateStatFunc() always stores a non-null pointer, even around an empty
    // Invocable, so the Invocable itself must be checked before it is called.
    if (callback && *callback) {
        (*callback)(key, intValue, "");
    }
}
/**
 * Forwards a string debug statistic to the application-provided callback, if one is set.
 *
 * The callback is invoked as (key, 0, stringValue) so the integer value it receives is 0.
 *
 * @param key         null-terminated C-string naming the statistic
 * @param stringValue the updated string value for key; taken by value and handed over to the
 *                    callback
 */
void Platform::debugUpdateStat(const char* key, utils::CString stringValue) {
    // Take a reference to the current callback under the lock, then release the lock before
    // invoking it so the user callback never runs while mMutex is held.
    std::shared_ptr<DebugUpdateStatFunc> callback;
    {
        std::unique_lock<decltype(mMutex)> lock(mMutex);
        callback = mDebugUpdateStat;
    }
    // setDebugUpdateStatFunc() always stores a non-null pointer, even around an empty
    // Invocable, so the Invocable itself must be checked before it is called.
    if (callback && *callback) {
        // stringValue is our own copy and is not used afterwards: move it instead of copying.
        (*callback)(key, 0, std::move(stringValue));
    }
}

View File

@@ -17,6 +17,7 @@
#ifndef TNT_METALCONTEXT_H
#define TNT_METALCONTEXT_H
#include "MetalErrorQueue.h"
#include "MetalResourceTracker.h"
#include "MetalShaderCompiler.h"
#include "MetalState.h"
@@ -129,6 +130,7 @@ struct MetalContext {
id<MTLCommandBuffer> pendingCommandBuffer = nil;
id<MTLRenderCommandEncoder> currentRenderPassEncoder = nil;
uint32_t currentFrame = 0;
MetalErrorQueue commandBufferErrors;
std::atomic<bool> memorylessLimitsReached = false;

View File

@@ -153,6 +153,7 @@ id<MTLCommandBuffer> getPendingCommandBuffer(MetalContext* context) {
if (UTILS_UNLIKELY(errorCode != MTLCommandBufferErrorNone)) {
logMTLCommandBufferError(errorCode);
context->commandBufferErrors.push(buffer.error);
}
}];
FILAMENT_CHECK_POSTCONDITION(context->pendingCommandBuffer)

View File

@@ -254,6 +254,18 @@ MetalDriver::~MetalDriver() noexcept {
void MetalDriver::tick(int) {
executeTickOps();
executeDeferredOps();
// Notify platform of GPU errors.
auto& platform = mPlatform;
if (UTILS_UNLIKELY(!mContext->commandBufferErrors.isEmpty())) {
mContext->commandBufferErrors.flush([&platform](NSError* error) {
if (UTILS_VERY_UNLIKELY(!error)) {
return;
}
const utils::CString errorString(error.localizedDescription.UTF8String);
platform.debugUpdateStat("filament.metal.command_buffer_error", errorString);
});
}
}
void MetalDriver::beginFrame(int64_t monotonic_clock_ns,

View File

@@ -0,0 +1,66 @@
/*
* Copyright (C) 2025 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef TNT_FILAMENT_DRIVER_METALERRORQUEUE_H
#define TNT_FILAMENT_DRIVER_METALERRORQUEUE_H
#import <Foundation/Foundation.h>
#include <utils/compiler.h>
#include <atomic>
#include <functional>
#include <mutex>
#include <vector>
/**
 * A small mutex-protected queue of NSError objects.
 *
 * push() appends an error and flush() drains all queued errors, handing each one to a callback.
 * Both take the internal mutex, so they may be called from different threads. An atomic flag
 * mirrors "the queue is non-empty" so that isEmpty() (and the early-out in flush()) can be
 * answered without locking.
 */
class MetalErrorQueue {
public:
    /**
     * Lock-free emptiness hint.
     *
     * @return true if no push() has been observed since the last flush()
     */
    bool isEmpty() const {
        return !mHasErrors.load(std::memory_order_relaxed);
    }

    /**
     * Appends an error to the queue.
     *
     * @param error the error to enqueue
     */
    void push(NSError* error) {
        std::lock_guard<std::mutex> lock(mMutex);
        mErrors.push_back(error);
        // Set while the lock is still held so the flag and the vector change together.
        mHasErrors.store(true, std::memory_order_relaxed);
    }

    /**
     * Removes every queued error and invokes callback once per error, in insertion order.
     * Does nothing (and does not lock) when the queue is reported empty.
     *
     * @param callback invoked for each drained error, outside of the internal lock
     */
    void flush(const std::function<void(NSError*)>& callback) {
        if (UTILS_LIKELY(isEmpty())) {
            return;
        }
        // Steal the pending errors under the lock, then run the callbacks unlocked so that
        // push() is never blocked behind user code.
        std::vector<NSError*> errors;
        {
            std::lock_guard<std::mutex> lock(mMutex);
            std::swap(mErrors, errors);
            mHasErrors.store(false, std::memory_order_relaxed);
        }
        for (const auto& error: errors) {
            callback(error);
        }
    }

private:
    std::vector<NSError*> mErrors;
    std::mutex mMutex;
    // Optimization to avoid locking the mutex at each call to flush.
    // Explicitly initialized: before C++20 a default-constructed std::atomic holds an
    // indeterminate value, which would make isEmpty() unreliable until the first push()/flush().
    std::atomic<bool> mHasErrors{false};
};
#endif // TNT_FILAMENT_DRIVER_METALERRORQUEUE_H