mirror of
https://github.com/wolfpld/tracy.git
synced 2026-06-13 18:59:00 +00:00
Compare commits
1 Commits
slomp/webg
...
slomp/gl-c
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5240b96675 |
2
.github/workflows/linux.yml
vendored
2
.github/workflows/linux.yml
vendored
@@ -32,7 +32,7 @@ jobs:
|
||||
if [ "${ACT:-}" != "true" ] && [ "${FORGEJO_ACTIONS:-}" != "true" ]; then
|
||||
cmake --build profiler/build
|
||||
else
|
||||
cmake --build profiler/build --parallel 2
|
||||
cmake --build profiler/build --parallel
|
||||
fi
|
||||
- name: Update utility
|
||||
run: |
|
||||
|
||||
2
.github/workflows/macos.yml
vendored
2
.github/workflows/macos.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
||||
- name: Build profiler
|
||||
run: |
|
||||
cmake -B profiler/build -S profiler -DCMAKE_BUILD_TYPE=Release -DGIT_REV=${{ github.sha }}
|
||||
cmake --build profiler/build --parallel 2 --config Release
|
||||
cmake --build profiler/build --parallel --config Release
|
||||
- name: Build update
|
||||
run: |
|
||||
cmake -B update/build -S update -DCMAKE_BUILD_TYPE=Release -DGIT_REV=${{ github.sha }}
|
||||
|
||||
2
.github/workflows/windows.yml
vendored
2
.github/workflows/windows.yml
vendored
@@ -32,7 +32,7 @@ jobs:
|
||||
- name: Build profiler
|
||||
run: |
|
||||
cmake -B profiler/build -S profiler -DCMAKE_BUILD_TYPE=Release -DGIT_REV=${{ github.sha }}
|
||||
cmake --build profiler/build --parallel 2 --config Release
|
||||
cmake --build profiler/build --parallel --config Release
|
||||
- name: Build update
|
||||
run: |
|
||||
cmake -B update/build -S update -DCMAKE_BUILD_TYPE=Release -DGIT_REV=${{ github.sha }}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
### A real time, nanosecond resolution, remote telemetry, hybrid frame and sampling profiler for games and other applications.
|
||||
|
||||
Tracy supports profiling CPU (Direct support is provided for C, C++, Lua, Python and Fortran integration. At the same time, third-party bindings to many other languages exist on the internet, such as [Rust](https://github.com/nagisa/rust_tracy_client), [Zig](https://github.com/tealsnow/zig-tracy), [C#](https://github.com/clibequilibrium/Tracy-CSharp), [OCaml](https://github.com/imandra-ai/ocaml-tracy), [Odin](https://github.com/oskarnp/odin-tracy), etc.), GPU (All major graphics/compute APIs: OpenGL, Vulkan, Direct3D 11/12, Metal, OpenCL, CUDA, WebGPU.), memory allocations, locks, context switches, automatically attribute screenshots to captured frames, and much more.
|
||||
Tracy supports profiling CPU (Direct support is provided for C, C++, Lua, Python and Fortran integration. At the same time, third-party bindings to many other languages exist on the internet, such as [Rust](https://github.com/nagisa/rust_tracy_client), [Zig](https://github.com/tealsnow/zig-tracy), [C#](https://github.com/clibequilibrium/Tracy-CSharp), [OCaml](https://github.com/imandra-ai/ocaml-tracy), [Odin](https://github.com/oskarnp/odin-tracy), etc.), GPU (All major graphics APIs: OpenGL, Vulkan, Direct3D 11/12, Metal, OpenCL, CUDA.), memory allocations, locks, context switches, automatically attribute screenshots to captured frames, and much more.
|
||||
|
||||
- [Documentation](https://github.com/wolfpld/tracy/releases/latest/download/tracy.pdf) for usage and build process instructions
|
||||
- [Releases](https://github.com/wolfpld/tracy/releases) containing the documentation (`tracy.pdf`) and compiled Windows x64 binaries (`Tracy-<version>.7z`) as assets
|
||||
|
||||
@@ -1,164 +0,0 @@
|
||||
# CMakeLists.txt — WebGPU spinning triangle demo
|
||||
#
|
||||
# macOS:
|
||||
# clang++ -std=c++17 -ObjC++ spinning_triangle.cpp platform/platform_macos.mm \
|
||||
# -I/path/to/wgpu/include -L/path/to/wgpu/lib -lwgpu_native \
|
||||
# -Wl,-rpath,@executable_path \
|
||||
# -framework Cocoa -framework Metal -framework QuartzCore \
|
||||
# -framework Foundation -framework IOKit -framework IOSurface \
|
||||
# -o spinning_triangle
|
||||
#
|
||||
# Windows (MSVC):
|
||||
# cl /std:c++17 spinning_triangle.cpp platform/platform_windows.cpp \
|
||||
# /I\path\to\wgpu\include \path\to\wgpu\lib\wgpu_native.lib \
|
||||
# user32.lib gdi32.lib /Fe:spinning_triangle.exe
|
||||
#
|
||||
# Linux / Wayland:
|
||||
# g++ -std=c++17 spinning_triangle.cpp platform/platform_wayland.cpp \
|
||||
# xdg-shell-protocol.c \
|
||||
# -I/path/to/wgpu/include -L/path/to/wgpu/lib -lwgpu_native \
|
||||
# -lwayland-client -o spinning_triangle
|
||||
|
||||
cmake_minimum_required(VERSION 3.16)
|
||||
project(spinning_triangle LANGUAGES C CXX)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# WebGPU backend — set WGPU_PATH to your wgpu-native or Dawn installation.
|
||||
# The library name differs between backends:
|
||||
# wgpu-native → wgpu_native
|
||||
# Dawn → webgpu_dawn
|
||||
# ---------------------------------------------------------------------------
|
||||
set(WGPU_PATH "" CACHE PATH "Root of the WebGPU native installation (contains include/ and lib/)")
|
||||
set(WGPU_LIB "" CACHE STRING "WebGPU library name (wgpu_native or webgpu_dawn); auto-detected if empty")
|
||||
|
||||
if(NOT WGPU_PATH)
|
||||
message(FATAL_ERROR "Set WGPU_PATH to the root of your WebGPU native installation.")
|
||||
endif()
|
||||
|
||||
# When WGPU_PATH changes, discard a previously auto-detected WGPU_LIB so
# detection re-runs against the new path.
#
# Only a value that this script detected itself is discarded. A WGPU_LIB that
# the user supplied explicitly (e.g. -DWGPU_LIB=webgpu_dawn) is kept; this
# matters in particular on the very first configure, where _WGPU_PATH_LAST is
# still empty and therefore always differs from WGPU_PATH.
if(NOT "${WGPU_PATH}" STREQUAL "${_WGPU_PATH_LAST}")
    if(_WGPU_LIB_AUTODETECTED AND "${WGPU_LIB}" STREQUAL "${_WGPU_LIB_AUTODETECTED}")
        unset(WGPU_LIB CACHE)
        set(WGPU_LIB "" CACHE STRING "WebGPU library name (wgpu_native or webgpu_dawn); auto-detected if empty")
    endif()
endif()
set(_WGPU_PATH_LAST "${WGPU_PATH}" CACHE INTERNAL "")

if(NOT WGPU_LIB)
    # Drop stale find_library() results so the search really runs again.
    unset(_WGPU_NATIVE_LIB CACHE)
    unset(_WEBGPU_DAWN_LIB CACHE)
    find_library(_WGPU_NATIVE_LIB NAMES wgpu_native wgpu_native.dll PATHS "${WGPU_PATH}/lib" NO_DEFAULT_PATH)
    find_library(_WEBGPU_DAWN_LIB NAMES webgpu_dawn PATHS "${WGPU_PATH}/lib" NO_DEFAULT_PATH)
    # wgpu-native wins when both backends are present in the same directory.
    if(_WGPU_NATIVE_LIB)
        set(WGPU_LIB "wgpu_native" CACHE STRING "WebGPU library name (wgpu_native or webgpu_dawn); auto-detected if empty" FORCE)
    elseif(_WEBGPU_DAWN_LIB)
        set(WGPU_LIB "webgpu_dawn" CACHE STRING "WebGPU library name (wgpu_native or webgpu_dawn); auto-detected if empty" FORCE)
    else()
        message(FATAL_ERROR "Could not detect a WebGPU library in ${WGPU_PATH}/lib. Set WGPU_LIB explicitly (wgpu_native or webgpu_dawn).")
    endif()
    # Remember what was detected so a later WGPU_PATH change can tell an
    # auto-detected value apart from one the user chose.
    set(_WGPU_LIB_AUTODETECTED "${WGPU_LIB}" CACHE INTERNAL "WGPU_LIB value chosen by auto-detection")
    message(STATUS "WebGPU library auto-detected: ${WGPU_LIB}")
endif()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tracy root — defaults to three directories above this CMakeLists.txt.
|
||||
# ---------------------------------------------------------------------------
|
||||
set(TRACY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
|
||||
option(TRACY_ENABLE "Enable Tracy profiling" ON)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# macOS quarantine — pre-built WebGPU binaries downloaded from the internet
|
||||
# carry a com.apple.quarantine extended attribute that prevents dyld from
|
||||
# loading them ("damaged or incomplete" / Gatekeeper block). Strip it once
|
||||
# at configure time so the linker and the runtime loader can both access the
|
||||
# library directory without further user intervention.
|
||||
# ---------------------------------------------------------------------------
|
||||
if(APPLE)
|
||||
execute_process(
|
||||
COMMAND xattr -dr com.apple.quarantine "${WGPU_PATH}/lib"
|
||||
)
|
||||
endif()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Platform — RGFW (cross-platform windowing, fetched automatically)
|
||||
# ---------------------------------------------------------------------------
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(rgfw
|
||||
GIT_REPOSITORY https://github.com/ColleagueRiley/RGFW.git
|
||||
GIT_TAG main # pin to a specific commit for reproducible builds
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
FetchContent_MakeAvailable(rgfw)
|
||||
|
||||
set(PLATFORM_SOURCES platform/platform_rgfw.cpp)
|
||||
set(PLATFORM_INCLUDES ${rgfw_SOURCE_DIR})
|
||||
|
||||
if(APPLE)
|
||||
set(PLATFORM_LIBS
|
||||
"-framework Cocoa"
|
||||
"-framework Metal"
|
||||
"-framework QuartzCore"
|
||||
"-framework Foundation"
|
||||
"-framework IOKit"
|
||||
"-framework IOSurface"
|
||||
)
|
||||
elseif(WIN32)
|
||||
set(PLATFORM_LIBS user32 gdi32)
|
||||
else()
|
||||
find_package(X11 REQUIRED)
|
||||
if(NOT X11_Xrandr_FOUND)
|
||||
message(FATAL_ERROR "Xrandr not found — install libxrandr-dev")
|
||||
endif()
|
||||
set(PLATFORM_LIBS X11::X11 X11::Xrandr)
|
||||
endif()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Target
|
||||
# ---------------------------------------------------------------------------
|
||||
add_executable(spinning_triangle
|
||||
spinning_triangle.cpp
|
||||
"${TRACY_DIR}/public/TracyClient.cpp"
|
||||
${PLATFORM_SOURCES}
|
||||
)
|
||||
|
||||
# Treat TracyClient.cpp as third-party code — suppress all warnings so that
|
||||
# upstream changes don't pollute our build output.
|
||||
if(MSVC)
|
||||
set_source_files_properties("${TRACY_DIR}/public/TracyClient.cpp"
|
||||
PROPERTIES COMPILE_FLAGS "/w"
|
||||
)
|
||||
else()
|
||||
set_source_files_properties("${TRACY_DIR}/public/TracyClient.cpp"
|
||||
PROPERTIES COMPILE_FLAGS "-w"
|
||||
)
|
||||
endif()
|
||||
|
||||
target_compile_features(spinning_triangle PRIVATE cxx_std_17)
|
||||
|
||||
if(TRACY_ENABLE)
|
||||
target_compile_definitions(spinning_triangle PRIVATE TRACY_ENABLE)
|
||||
endif()
|
||||
|
||||
target_include_directories(spinning_triangle PRIVATE
|
||||
"${WGPU_PATH}/include"
|
||||
"${TRACY_DIR}/public"
|
||||
${PLATFORM_INCLUDES}
|
||||
)
|
||||
|
||||
target_link_directories(spinning_triangle PRIVATE "${WGPU_PATH}/lib")
|
||||
|
||||
target_link_libraries(spinning_triangle PRIVATE
|
||||
${WGPU_LIB}
|
||||
${PLATFORM_LIBS}
|
||||
)
|
||||
|
||||
# Embed the rpath so the binary finds the WebGPU dylib/so next to itself.
|
||||
if(APPLE)
|
||||
set_target_properties(spinning_triangle PROPERTIES
|
||||
BUILD_RPATH "${WGPU_PATH}/lib"
|
||||
INSTALL_RPATH "@executable_path"
|
||||
)
|
||||
elseif(UNIX)
|
||||
set_target_properties(spinning_triangle PROPERTIES
|
||||
BUILD_RPATH "${WGPU_PATH}/lib"
|
||||
INSTALL_RPATH "$ORIGIN"
|
||||
)
|
||||
endif()
|
||||
@@ -1,23 +0,0 @@
|
||||
// platform.h — interface between platform-agnostic code and platform backends
|
||||
//
|
||||
// Each platform_*.mm / platform_*.cpp file implements these four functions.
|
||||
// Exactly one backend must be linked into the final binary.
|
||||
|
||||
#pragma once
|
||||
#include <webgpu/webgpu.h>
|
||||
|
||||
// Initialize the windowing system and create a window of the given dimensions.
|
||||
// Returns true on success.
|
||||
bool platformInit(int width, int height, const char* title);
|
||||
|
||||
// Create a WebGPU surface backed by the platform window.
|
||||
// Must be called after wgpuCreateInstance() and platformInit().
|
||||
WGPUSurface platformCreateSurface(WGPUInstance instance);
|
||||
|
||||
// Elapsed wall-clock time in seconds since platformInit().
|
||||
double platformGetTime();
|
||||
|
||||
// Enter the platform event/render loop.
|
||||
// Calls render() each frame at ~60 fps.
|
||||
// Calls shutdown() exactly once before returning.
|
||||
void platformRunLoop(void (*render)(), void (*shutdown)());
|
||||
@@ -1,72 +0,0 @@
|
||||
// platform_rgfw.cpp — RGFW windowing backend for the WebGPU example
|
||||
// https://github.com/ColleagueRiley/RGFW
|
||||
|
||||
#include "platform.h" // webgpu/webgpu.h first so RGFW sees WGPUSurface
|
||||
|
||||
#define RGFW_WEBGPU
|
||||
#define RGFW_IMPLEMENTATION
|
||||
#include <RGFW.h>
|
||||
|
||||
#include <chrono>
|
||||
#include <cstdio>
|
||||
|
||||
#if defined(__linux__)
#include <X11/Xlib.h>
#endif

// Reports whether a display can be opened before any window is created.
// On Linux this probes X11 by opening and immediately closing a connection;
// on every other platform it reports success without probing.
static bool platformHasDisplay() {
#if defined(__linux__)
    // RGFW workaround: RGFW indiscriminately passes XOpenDisplay(0) unchecked
    // to X11 functions like XCreateWindow(), which will lead to SIGSEGV.
    if (Display* probe = XOpenDisplay(nullptr)) {
        XCloseDisplay(probe);
        return true;
    }
    fprintf(stderr, "ERROR: failed to open X11 display (is $DISPLAY set?)\n");
    return false;
#else
    return true;
#endif
}
|
||||
|
||||
static RGFW_window* sWin = nullptr;
|
||||
static std::chrono::steady_clock::time_point sStartTime;
|
||||
|
||||
bool platformInit(int width, int height, const char* title) {
|
||||
if (!platformHasDisplay()) {
|
||||
fprintf(stderr, "ERROR: no display found\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
sWin = RGFW_createWindow(title, 0, 0, width, height, RGFW_windowCenter);
|
||||
if (!sWin) {
|
||||
fprintf(stderr, "ERROR: failed to create window\n");
|
||||
return false;
|
||||
}
|
||||
RGFW_window_setExitKey(sWin, RGFW_keyEscape);
|
||||
sStartTime = std::chrono::steady_clock::now();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Asks RGFW to create a WebGPU surface for the window stored in sWin and
// returns whatever RGFW hands back.
WGPUSurface platformCreateSurface(WGPUInstance instance) {
    WGPUSurface windowSurface = RGFW_window_createSurface_WebGPU(sWin, instance);
    return windowSurface;
}
|
||||
|
||||
double platformGetTime() {
|
||||
return std::chrono::duration<double>(
|
||||
std::chrono::steady_clock::now() - sStartTime).count();
|
||||
}
|
||||
|
||||
void platformRunLoop(void (*render)(), void (*shutdown)()) {
|
||||
while (RGFW_window_shouldClose(sWin) == RGFW_FALSE) {
|
||||
RGFW_event event;
|
||||
while (RGFW_window_checkEvent(sWin, &event)) {
|
||||
if (event.type == RGFW_windowClose) goto done;
|
||||
}
|
||||
render();
|
||||
}
|
||||
done:
|
||||
shutdown();
|
||||
RGFW_window_close(sWin);
|
||||
sWin = nullptr;
|
||||
}
|
||||
@@ -1,352 +0,0 @@
|
||||
// spinning_triangle.cpp — platform-agnostic WebGPU spinning triangle demo.
|
||||
|
||||
#include "platform/platform.h"
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <webgpu/webgpu.h>
|
||||
|
||||
#include <tracy/Tracy.hpp>
|
||||
#include <tracy/TracyWebGPU.hpp>
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Globals
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static const int kWidth = 800;
|
||||
static const int kHeight = 600;
|
||||
|
||||
static WGPUInstance gInstance = nullptr;
|
||||
static WGPUSurface gSurface = nullptr;
|
||||
static WGPUAdapter gAdapter = nullptr;
|
||||
static WGPUDevice gDevice = nullptr;
|
||||
static WGPUQueue gQueue = nullptr;
|
||||
static WGPURenderPipeline gPipeline = nullptr;
|
||||
static WGPUBuffer gUniformBuf = nullptr;
|
||||
static WGPUBindGroup gBindGroup = nullptr;
|
||||
|
||||
static TracyWebGPUCtx gTracyCtx = nullptr;
|
||||
|
||||
static WGPUTextureFormat gSurfaceFormat = WGPUTextureFormat_BGRA8Unorm;
|
||||
|
||||
// TODO: this can become platformError() instead
|
||||
// Logs `message` together with `code` on stderr and returns `code` unchanged,
// so a caller can write `return error(code, "...")` in a single statement.
int error(int code, const char* message) {
    std::fprintf(stderr, "ERROR: %s (code: %d)\n", message, code);
    return code;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// WGSL shader — vertex colours baked in, rotation via a uniform float.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static const char* kShaderSource = R"(
|
||||
struct Uniforms {
|
||||
angle: f32,
|
||||
};
|
||||
@group(0) @binding(0) var<uniform> u: Uniforms;
|
||||
|
||||
struct VSOut {
|
||||
@builtin(position) pos: vec4f,
|
||||
@location(0) color: vec3f,
|
||||
};
|
||||
|
||||
@vertex
|
||||
fn vs_main(@builtin(vertex_index) vi: u32) -> VSOut {
|
||||
var positions = array<vec2f, 3>(
|
||||
vec2f( 0.0, 0.5),
|
||||
vec2f(-0.433, -0.25),
|
||||
vec2f( 0.433, -0.25),
|
||||
);
|
||||
var colors = array<vec3f, 3>(
|
||||
vec3f(1.0, 0.0, 0.0),
|
||||
vec3f(0.0, 1.0, 0.0),
|
||||
vec3f(0.0, 0.0, 1.0),
|
||||
);
|
||||
|
||||
let c = cos(u.angle);
|
||||
let s = sin(u.angle);
|
||||
let p = positions[vi];
|
||||
let rotated = vec2f(p.x * c - p.y * s, p.x * s + p.y * c);
|
||||
|
||||
var out: VSOut;
|
||||
out.pos = vec4f(rotated, 0.0, 1.0);
|
||||
out.color = colors[vi];
|
||||
return out;
|
||||
}
|
||||
|
||||
@fragment
|
||||
fn fs_main(@location(0) color: vec3f) -> @location(0) vec4f {
|
||||
return vec4f(color, 1.0);
|
||||
}
|
||||
)";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Adapter / Device request callbacks (current wgpu-native API)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void onAdapterReady(WGPURequestAdapterStatus status,
|
||||
WGPUAdapter adapter,
|
||||
WGPUStringView message,
|
||||
void* userdata1, void* /*userdata2*/) {
|
||||
if (status == WGPURequestAdapterStatus_Success) {
|
||||
*(WGPUAdapter*)userdata1 = adapter;
|
||||
} else {
|
||||
fprintf(stderr, "Adapter request failed: %.*s\n",
|
||||
(int)message.length, message.data);
|
||||
}
|
||||
}
|
||||
|
||||
static void onDeviceReady(WGPURequestDeviceStatus status,
|
||||
WGPUDevice device,
|
||||
WGPUStringView message,
|
||||
void* userdata1, void* /*userdata2*/) {
|
||||
if (status == WGPURequestDeviceStatus_Success) {
|
||||
*(WGPUDevice*)userdata1 = device;
|
||||
} else {
|
||||
fprintf(stderr, "Device request failed: %.*s\n",
|
||||
(int)message.length, message.data);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// WebGPU init
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static int initWebGPU() {
|
||||
// Adapter
|
||||
WGPURequestAdapterOptions adapterOpts = {};
|
||||
adapterOpts.compatibleSurface = gSurface;
|
||||
|
||||
WGPURequestAdapterCallbackInfo adapterCB = {};
|
||||
adapterCB.mode = WGPUCallbackMode_AllowProcessEvents;
|
||||
adapterCB.callback = onAdapterReady;
|
||||
adapterCB.userdata1 = &gAdapter;
|
||||
wgpuInstanceRequestAdapter(gInstance, &adapterOpts, adapterCB);
|
||||
while (!gAdapter) { wgpuInstanceProcessEvents(gInstance); }
|
||||
if (!gAdapter) return error(11, "No adapter");
|
||||
|
||||
WGPUUncapturedErrorCallbackInfo errorCB = {};
|
||||
errorCB.callback = [](WGPUDevice const*, WGPUErrorType type,
|
||||
WGPUStringView message, void*, void*) {
|
||||
fprintf(stderr, "[WGPU ERROR] type=%d %.*s\n",
|
||||
(int)type, (int)message.length, message.data);
|
||||
};
|
||||
|
||||
WGPUDeviceDescriptor deviceDesc = {};
|
||||
deviceDesc.uncapturedErrorCallbackInfo = errorCB;
|
||||
|
||||
TracyWebGPUSetupDeviceDescriptor(deviceDesc);
|
||||
|
||||
WGPURequestDeviceCallbackInfo deviceCB = {};
|
||||
deviceCB.mode = WGPUCallbackMode_AllowProcessEvents;
|
||||
deviceCB.callback = onDeviceReady;
|
||||
deviceCB.userdata1 = &gDevice;
|
||||
wgpuAdapterRequestDevice(gAdapter, &deviceDesc, deviceCB);
|
||||
while (!gDevice) { wgpuInstanceProcessEvents(gInstance); }
|
||||
if (!gDevice) return error(12, "No device");
|
||||
|
||||
gQueue = wgpuDeviceGetQueue(gDevice);
|
||||
gTracyCtx = TracyWebGPUContext(gInstance, gDevice, gQueue);
|
||||
TracyWebGPUContextName(gTracyCtx, "WebGPU", 6);
|
||||
|
||||
// Configure surface
|
||||
WGPUSurfaceConfiguration config = {};
|
||||
config.device = gDevice;
|
||||
config.format = gSurfaceFormat;
|
||||
config.usage = WGPUTextureUsage_RenderAttachment;
|
||||
config.alphaMode = WGPUCompositeAlphaMode_Opaque;
|
||||
config.width = kWidth;
|
||||
config.height = kHeight;
|
||||
config.presentMode = WGPUPresentMode_Fifo;
|
||||
wgpuSurfaceConfigure(gSurface, &config);
|
||||
|
||||
// Shader module
|
||||
WGPUShaderSourceWGSL wgslSrc = {};
|
||||
wgslSrc.chain.sType = WGPUSType_ShaderSourceWGSL;
|
||||
wgslSrc.code = { kShaderSource, WGPU_STRLEN };
|
||||
|
||||
WGPUShaderModuleDescriptor smDesc = {};
|
||||
smDesc.nextInChain = (WGPUChainedStruct*)&wgslSrc;
|
||||
WGPUShaderModule shaderMod = wgpuDeviceCreateShaderModule(gDevice, &smDesc);
|
||||
|
||||
// Uniform buffer (one f32 for rotation angle)
|
||||
WGPUBufferDescriptor bufDesc = {};
|
||||
bufDesc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
|
||||
bufDesc.size = sizeof(float);
|
||||
gUniformBuf = wgpuDeviceCreateBuffer(gDevice, &bufDesc);
|
||||
|
||||
// Bind group layout + bind group
|
||||
WGPUBindGroupLayoutEntry bglEntry = {};
|
||||
bglEntry.binding = 0;
|
||||
bglEntry.visibility = WGPUShaderStage_Vertex;
|
||||
bglEntry.buffer.type = WGPUBufferBindingType_Uniform;
|
||||
bglEntry.buffer.minBindingSize = sizeof(float);
|
||||
|
||||
WGPUBindGroupLayoutDescriptor bglDesc = {};
|
||||
bglDesc.entryCount = 1;
|
||||
bglDesc.entries = &bglEntry;
|
||||
WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(gDevice, &bglDesc);
|
||||
|
||||
WGPUBindGroupEntry bgEntry = {};
|
||||
bgEntry.binding = 0;
|
||||
bgEntry.buffer = gUniformBuf;
|
||||
bgEntry.size = sizeof(float);
|
||||
|
||||
WGPUBindGroupDescriptor bgDesc = {};
|
||||
bgDesc.layout = bgl;
|
||||
bgDesc.entryCount = 1;
|
||||
bgDesc.entries = &bgEntry;
|
||||
gBindGroup = wgpuDeviceCreateBindGroup(gDevice, &bgDesc);
|
||||
|
||||
// Pipeline layout
|
||||
WGPUPipelineLayoutDescriptor plDesc = {};
|
||||
plDesc.bindGroupLayoutCount = 1;
|
||||
plDesc.bindGroupLayouts = &bgl;
|
||||
WGPUPipelineLayout pipelineLayout = wgpuDeviceCreatePipelineLayout(gDevice, &plDesc);
|
||||
|
||||
// Render pipeline
|
||||
WGPUColorTargetState colorTarget = {};
|
||||
colorTarget.format = gSurfaceFormat;
|
||||
colorTarget.writeMask = WGPUColorWriteMask_All;
|
||||
|
||||
WGPUFragmentState fragState = {};
|
||||
fragState.module = shaderMod;
|
||||
fragState.entryPoint = { "fs_main", WGPU_STRLEN };
|
||||
fragState.targetCount = 1;
|
||||
fragState.targets = &colorTarget;
|
||||
|
||||
WGPURenderPipelineDescriptor rpDesc = {};
|
||||
rpDesc.layout = pipelineLayout;
|
||||
rpDesc.vertex.module = shaderMod;
|
||||
rpDesc.vertex.entryPoint = { "vs_main", WGPU_STRLEN };
|
||||
rpDesc.primitive.topology = WGPUPrimitiveTopology_TriangleList;
|
||||
rpDesc.multisample.count = 1;
|
||||
rpDesc.multisample.mask = 0xFFFFFFFF;
|
||||
rpDesc.fragment = &fragState;
|
||||
|
||||
gPipeline = wgpuDeviceCreateRenderPipeline(gDevice, &rpDesc);
|
||||
|
||||
// Cleanup intermediates
|
||||
wgpuShaderModuleRelease(shaderMod);
|
||||
wgpuPipelineLayoutRelease(pipelineLayout);
|
||||
wgpuBindGroupLayoutRelease(bgl);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Frame rendering
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Returns the surface texture for the current frame, or {.texture=nullptr} on
|
||||
// a skippable condition (timeout, occlusion) or an error.
|
||||
static WGPUSurfaceTexture getWindowSurface() {
    WGPUSurfaceTexture surfTex = {};
    wgpuSurfaceGetCurrentTexture(gSurface, &surfTex);

    // Both success flavours hand the texture straight to the caller.
    const bool acquired =
        surfTex.status == WGPUSurfaceGetCurrentTextureStatus_SuccessOptimal ||
        surfTex.status == WGPUSurfaceGetCurrentTextureStatus_SuccessSuboptimal;
    if (acquired) {
        return surfTex;
    }

    // Timeout and Occluded are normal OS events (window covered / on a
    // different Space), so they are not reported.
    bool expected = (surfTex.status == WGPUSurfaceGetCurrentTextureStatus_Timeout);
#ifdef WGPU_H_
    if (surfTex.status == (WGPUSurfaceGetCurrentTextureStatus)WGPUSurfaceGetCurrentTextureStatus_Occluded) {
        expected = true;
    }
#endif
    if (!expected) {
        fprintf(stderr, "Failed to get surface texture (status %d)\n", surfTex.status);
    }

    // Drop any texture that came back with a non-success status and hand the
    // caller a null texture so the frame is skipped.
    if (surfTex.texture != nullptr) {
        wgpuTextureRelease(surfTex.texture);
        surfTex.texture = nullptr;
    }
    return surfTex;
}
|
||||
|
||||
static void renderFrame() {
|
||||
ZoneScoped;
|
||||
|
||||
// Update rotation angle
|
||||
float angle = (float)platformGetTime();
|
||||
wgpuQueueWriteBuffer(gQueue, gUniformBuf, 0, &angle, sizeof(float));
|
||||
|
||||
WGPUSurfaceTexture surfTex = getWindowSurface();
|
||||
if (!surfTex.texture) return;
|
||||
|
||||
WGPUTextureView view = wgpuTextureCreateView(surfTex.texture, nullptr);
|
||||
|
||||
// Command encoder
|
||||
WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(gDevice, nullptr);
|
||||
|
||||
// Render pass
|
||||
WGPURenderPassColorAttachment colorAtt = {};
|
||||
colorAtt.view = view;
|
||||
colorAtt.loadOp = WGPULoadOp_Clear;
|
||||
colorAtt.storeOp = WGPUStoreOp_Store;
|
||||
colorAtt.clearValue = { 0.05, 0.05, 0.08, 1.0 };
|
||||
colorAtt.depthSlice = WGPU_DEPTH_SLICE_UNDEFINED;
|
||||
|
||||
WGPURenderPassDescriptor passDesc = {};
|
||||
passDesc.colorAttachmentCount = 1;
|
||||
passDesc.colorAttachments = &colorAtt;
|
||||
|
||||
{
|
||||
ZoneScopedN("render-pass");
|
||||
TracyWebGPUNamedZone(gTracyCtx, tracyZone, encoder, passDesc, "triangle draw", true);
|
||||
WGPURenderPassEncoder pass = wgpuCommandEncoderBeginRenderPass(encoder, &passDesc);
|
||||
wgpuRenderPassEncoderSetPipeline(pass, gPipeline);
|
||||
wgpuRenderPassEncoderSetBindGroup(pass, 0, gBindGroup, 0, nullptr);
|
||||
wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);
|
||||
wgpuRenderPassEncoderEnd(pass);
|
||||
wgpuRenderPassEncoderRelease(pass);
|
||||
}
|
||||
|
||||
// Submit
|
||||
WGPUCommandBuffer cmdBuf = wgpuCommandEncoderFinish(encoder, nullptr);
|
||||
wgpuQueueSubmit(gQueue, 1, &cmdBuf);
|
||||
|
||||
// Present
|
||||
wgpuSurfacePresent(gSurface);
|
||||
|
||||
// Process Events
|
||||
wgpuInstanceProcessEvents(gInstance);
|
||||
TracyWebGPUCollect(gTracyCtx);
|
||||
|
||||
// Cleanup
|
||||
wgpuCommandBufferRelease(cmdBuf);
|
||||
wgpuCommandEncoderRelease(encoder);
|
||||
wgpuTextureViewRelease(view);
|
||||
wgpuTextureRelease(surfTex.texture);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Shutdown
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void shutdown() {
|
||||
fprintf(stderr, "application is shutting down...\n");
|
||||
TracyWebGPUDestroy(gTracyCtx);
|
||||
if (gBindGroup) wgpuBindGroupRelease(gBindGroup);
|
||||
if (gUniformBuf) wgpuBufferRelease(gUniformBuf);
|
||||
if (gPipeline) wgpuRenderPipelineRelease(gPipeline);
|
||||
if (gQueue) wgpuQueueRelease(gQueue);
|
||||
if (gDevice) wgpuDeviceRelease(gDevice);
|
||||
if (gAdapter) wgpuAdapterRelease(gAdapter);
|
||||
if (gSurface) wgpuSurfaceRelease(gSurface);
|
||||
if (gInstance) wgpuInstanceRelease(gInstance);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// main
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
if (!platformInit(kWidth, kHeight, "WebGPU Spinning Triangle"))
|
||||
return 1;
|
||||
|
||||
gInstance = wgpuCreateInstance(nullptr);
|
||||
if (!gInstance) return error(2, "Failed to create WebGPU instance.");
|
||||
|
||||
gSurface = platformCreateSurface(gInstance);
|
||||
if (!gSurface) return error(3, "Failed to create surface.");
|
||||
|
||||
if (initWebGPU() != 0) return 4;
|
||||
|
||||
platformRunLoop(renderFrame, shutdown);
|
||||
return 0;
|
||||
}
|
||||
@@ -11,7 +11,7 @@ The user manual
|
||||
|
||||
**Bartosz Taudul** [\<wolf@nereid.pl\>](mailto:wolf@nereid.pl)
|
||||
|
||||
2026-06-09 <https://github.com/wolfpld/tracy>
|
||||
2026-06-06 <https://github.com/wolfpld/tracy>
|
||||
|
||||
# Quick overview {#quick-overview .unnumbered}
|
||||
|
||||
@@ -1495,12 +1495,6 @@ You also need to periodically collect the GPU events using the `TracyGpuCollect`
|
||||
|
||||
[^49]: Because Apple is unable to implement standards properly.
|
||||
|
||||
##### Calibrated context
|
||||
|
||||
By default, the OpenGL context is uncalibrated: the CPU and GPU clocks are aligned only once, when the context is created, so over long captures the two time domains may drift apart (section [5.4](#options) describes correcting this drift manually). Defining `TRACY_OPENGL_AUTO_CALIBRATION` before including `TracyOpenGL.hpp` enables periodic recalibration instead: roughly once per second Tracy samples the GPU and CPU clocks together and emits a calibration event, allowing the profiler to track and remove the drift automatically.
|
||||
|
||||
This is opt-in because OpenGL exposes no atomic CPU+GPU timestamp query (unlike Vulkan's `VK_EXT_calibrated_timestamps` or Direct3D 12, whose contexts are always calibrated). Recalibration therefore reads the GPU clock with `glGetInteger64v(GL_TIMESTAMP)`, which forces a CPU/GPU synchronization (a pipeline stall) each time it runs. Enable it only when the improved long-capture alignment is worth the periodic stall.
|
||||
|
||||
### Vulkan
|
||||
|
||||
Similarly, for Vulkan support you should include the `public/tracy/TracyVulkan.hpp` header file. Tracing Vulkan devices and queues is a bit more involved, and the Vulkan initialization macro `TracyVkContext(physdev, device, queue, cmdbuf)` returns an instance of `TracyVkCtx` object, which tracks an associated Vulkan queue. Cleanup is performed using the `TracyVkDestroy(ctx)` macro. You may create multiple Vulkan contexts. To set a custom name for the context, use the `TracyVkContextName(ctx, name, size)` macro.
|
||||
@@ -1800,10 +1794,6 @@ By default, tracy client resolves callstack symbols in a background thread at ru
|
||||
|
||||
The generated Tracy capture will have callstack frame symbols showing `[unresolved]`. The `update` tool can be used to load that capture, perform symbol resolution offline (by passing `-r`), and write out a new capture with symbols resolved. By default, `update` will use the original shared library paths that were recorded in the capture (which assumes running on the same machine, or on a machine with a filesystem setup identical to the one used to run the Tracy-instrumented application). You can use the `-p` option to perform any number of path substitutions in order to use symbols located elsewhere.
|
||||
|
||||
By default symbol resolution is performed with the platform's native facility: the DbgHelp library on Windows, and the `addr2line` tool found in `PATH` elsewhere. You can override this with the `-a` option, passing the path to a custom `addr2line`-compatible tool (for instance an `addr2line` from a cross-compilation toolchain, or `llvm-addr2line`). The `-a` option works on all platforms, including Windows, and takes precedence over the platform default.
|
||||
|
||||
Extra arguments can be passed verbatim to the resolution tool with the `-A` option. Tracy records callstack frame offsets relative to the image base, but `addr2line`-compatible tools expect a full virtual address for images that have a non-zero preferred image base (such as PE on Windows or Mach-O on Apple). For these, pass `-A "--relative-address"` so that `llvm-addr2line` or `llvm-symbolizer` adds the image base back. ELF images need no such adjustment.
|
||||
|
||||
> [!IMPORTANT]
|
||||
> **Important**
|
||||
>
|
||||
@@ -1998,39 +1988,6 @@ After you release the lock use the `TracyCLockAfterUnlock` macro:
|
||||
|
||||
You can optionally mark the location of where the lock is held by using the `TracyCLockMark` macro, this should be done after acquiring the lock.
|
||||
|
||||
Similarly, you can use the following macros to mark a shared lock using the C API:
|
||||
|
||||
- `TracyCSharedLockAnnounce(lock_ctx)`
|
||||
|
||||
- `TracyCSharedLockTerminate(lock_ctx)`
|
||||
|
||||
- `TracyCSharedLockBeforeLock(lock_ctx)`
|
||||
|
||||
- `TracyCSharedLockAfterLock(lock_ctx)`
|
||||
|
||||
- `TracyCSharedLockAfterUnlock(lock_ctx)`
|
||||
|
||||
- `TracyCSharedLockAfterTryLock(lock_ctx, acquired)`
|
||||
|
||||
- `TracyCSharedLockBeforeSharedLock(lock_ctx)`
|
||||
|
||||
- `TracyCSharedLockAfterSharedLock(lock_ctx)`
|
||||
|
||||
- `TracyCSharedLockAfterSharedUnlock(lock_ctx)`
|
||||
|
||||
- `TracyCSharedLockAfterTrySharedLock(lock_ctx, acquired)`
|
||||
|
||||
- `TracyCSharedLockMark(lock_ctx)`
|
||||
|
||||
- `TracyCSharedLockCustomName(lock_ctx, name, size)`
|
||||
|
||||
A shared lock context has to be defined next to the shared lock that it will be marking:
|
||||
|
||||
TracyCSharedLockCtx tracy_shared_lock_ctx;
|
||||
HANDLE shared_lock;
|
||||
|
||||
The same rules apply to shared locks as to regular locks, but you need to use the shared lock macros instead. Lock implementations in classes `Lockable` and `SharedLockable` show how to properly perform context handling.
|
||||
|
||||
### Memory profiling {#cmemoryprofiling}
|
||||
|
||||
Use the following macros in your implementations of `malloc` and `free`:
|
||||
@@ -3625,7 +3582,7 @@ You can freely adjust each time range on the timeline by clicking the left mouse
|
||||
|
||||
Tracy allows adding custom notes to the trace. For example, you may want to mark a region to ignore because the application was out-of-focus or a region where a new user was connecting to the game, which resulted in a frame drop that needs to be investigated.
|
||||
|
||||
Methods of specifying the annotation region are described in section [5.3](#timeranges). When a new annotation is added, it is assigned a semi-unique random name to make it distinguishable. The settings window is also opened (section [5.21](#annotationsettings)), allowing you to enter your own description of the annotation.
|
||||
Methods of specifying the annotation region are described in section [5.3](#timeranges). When a new annotation is added, a settings window is displayed (section [5.21](#annotationsettings)), allowing you to enter a description.
|
||||
|
||||
Annotations are displayed on the timeline, as presented in figure [21](#annotation). Clicking on the circle next to the text description will open the annotation settings window, in which you can modify or remove the region. A list of all annotations in the trace is available in the annotation list window described in section [5.22](#annotationlist), which is accessible through the * Tools* button on the control menu.
|
||||
|
||||
@@ -4168,9 +4125,7 @@ The information about the selected memory allocation is displayed in this window
|
||||
|
||||
## Trace information window {#traceinfo}
|
||||
|
||||
This window contains information about the current trace: captured program name, time of the capture, profiler version which performed the capture.
|
||||
|
||||
There's a text entry field for an optional custom description of the trace for you to fill in. This description will appear on the profiler window title bar, or when comparing two traces (section [5.8](#compare)), enabling you to quickly recognize what the trace contains. For some people it's fine to just have *any* semi-unique description to be able to identify a specific trace. For such purposes there's a * Generate name* button, which will set the trace description to an abstract meaningless identifier.
|
||||
This window contains information about the current trace: captured program name, time of the capture, profiler version which performed the capture, and a custom trace description, which you can fill in.
|
||||
|
||||
If the * Public sidecar* option is selected, the file containing trace-specific user settings (see section [9.2](#tracespecific)) will be saved on disk next to the trace file.
|
||||
|
||||
@@ -4204,7 +4159,6 @@ If an application should crash during profiling (section [2.5](#crashhandling))
|
||||
|
||||
-----
|
||||
|
||||
- Dice icon
|
||||
- User Gear icon
|
||||
|
||||
## Zone information window {#zoneinfo}
|
||||
@@ -4608,12 +4562,7 @@ The profiled program is highlighted using green color. Furthermore, the yellow h
|
||||
|
||||
## Annotation settings window {#annotationsettings}
|
||||
|
||||
In this window, you may modify how a timeline annotation (section [5.3.1](#annotatingtrace)) is presented by setting its text description or selecting region highlight color. A random annotation description can be set with the * Generate name* button. If the note is no longer needed, you may also remove it here.
|
||||
|
||||
|
||||
-----
|
||||
|
||||
- Dice icon
|
||||
In this window, you may modify how a timeline annotation (section [5.3.1](#annotatingtrace)) is presented by setting its text description or selecting region highlight color. If the note is no longer needed, you may also remove it here.
|
||||
|
||||
## Annotation list window {#annotationlist}
|
||||
|
||||
|
||||
@@ -141,7 +141,7 @@ There's much more Tracy can do, which can be explored by carefully reading this
|
||||
\section{A quick look at Tracy Profiler}
|
||||
\label{quicklook}
|
||||
|
||||
Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that you can use for remote or embedded telemetry of games and other applications. It can profile CPU\footnote{Direct support is provided for C, C++, Lua, Python and Fortran integration. At the same time, third-party bindings to many other languages exist on the internet, such as Rust, Zig, C\#, OCaml, Odin, etc.}, GPU\footnote{All major graphics/compute APIs: OpenGL, Vulkan, Direct3D 11/12, Metal, OpenCL, CUDA, WebGPU.}, memory allocations, locks, context switches, automatically attribute screenshots to captured frames, and much more.
|
||||
Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that you can use for remote or embedded telemetry of games and other applications. It can profile CPU\footnote{Direct support is provided for C, C++, Lua, Python and Fortran integration. At the same time, third-party bindings to many other languages exist on the internet, such as Rust, Zig, C\#, OCaml, Odin, etc.}, GPU\footnote{All major graphic APIs: OpenGL, Vulkan, Direct3D 11/12, Metal, OpenCL.}, memory allocations, locks, context switches, automatically attribute screenshots to captured frames, and much more.
|
||||
|
||||
While Tracy can perform statistical analysis of sampled call stack data, just like other \emph{statistical profilers} (such as VTune, perf, or Very Sleepy), it mainly focuses on manual markup of the source code. Such markup allows frame-by-frame inspection of the program execution. For example, you will be able to see exactly which functions are called, how much time they require, and how they interact with each other in a multi-threaded environment. In contrast, the statistical analysis may show you the hot spots in your code, but it cannot accurately pinpoint the underlying cause for semi-random frame stutter that may occur every couple of seconds.
|
||||
|
||||
@@ -1050,8 +1050,6 @@ Memory & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faXm
|
||||
GPU zones (OpenGL) & \faCheck & \faCheck & \faCheck & \faPoo & \faPoo & & \faXmark \\
|
||||
GPU zones (Vulkan) & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & & \faXmark \\
|
||||
GPU zones (Metal) & \faXmark & \faXmark & \faXmark & \faCheck\textsuperscript{\emph{b}} & \faCheck\textsuperscript{\emph{b}} & \faXmark & \faXmark \\
|
||||
GPU zones (CUDA) & \faCheck & \faCheck & \faXmark & \faXmark & \faXmark & \faQuestion & \faXmark \\
|
||||
GPU zones (WebGPU) & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faQuestion & \faQuestion \\
|
||||
Call stacks & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faXmark \\
|
||||
Symbol resolution & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\
|
||||
Crash handling & \faCheck & \faCheck & \faCheck & \faXmark & \faXmark & \faXmark & \faXmark \\
|
||||
@@ -1647,7 +1645,7 @@ To mark that a separate memory pool is to be tracked you should use the named ve
|
||||
\subsection{GPU profiling}
|
||||
\label{gpuprofiling}
|
||||
|
||||
Tracy provides bindings for profiling OpenGL, Vulkan, Direct3D 11, Direct3D 12, Metal, OpenCL, CUDA and WebGPU execution time on GPU.
|
||||
Tracy provides bindings for profiling OpenGL, Vulkan, Direct3D 11, Direct3D 12, Metal, OpenCL and CUDA execution time on GPU.
|
||||
|
||||
Note that the CPU and GPU timers may be unsynchronized unless you create a calibrated context, but the availability of calibrated contexts is limited. You can try to correct the desynchronization of uncalibrated contexts in the profiler's options (section~\ref{options}).
|
||||
|
||||
@@ -1793,16 +1791,6 @@ Unlike other GPU backends in Tracy, there is no need to call \texttt{TracyCUDACo
|
||||
|
||||
To stop profiling, call the \texttt{TracyCUDAStopProfiling(ctx)} macro.
|
||||
|
||||
\subsubsection{WebGPU}
|
||||
|
||||
WebGPU support is enabled by including the \texttt{public/tracy/TracyWebGPU.hpp} header file. Both major implementations of WebGPU (Dawn and wgpu-native) are supported.
|
||||
|
||||
Before creating the WebGPU device, make sure to call \texttt{TracyWebGPUSetupDeviceDescriptor()} to let Tracy request the device features and extensions necessary for profiling. After the device is created, use the \texttt{TracyWebGPUContext()} macro to instantiate the \texttt{WebGPUQueueCtx} object required for GPU instrumentation. The object should later be cleaned up with the \texttt{TracyWebGPUDestroy()} macro. To set a custom name for the context, use the \texttt{TracyWebGPUContextName()} macro.
|
||||
|
||||
To instrument a GPU zone, use the various \texttt{TracyWebGPU*Zone*()} macros. Note that WebGPU only offers command instrumentation at the "pass"-level. While command-level granularity is possible through implementation-specific WebGPU extensions, Tracy does not support it at the moment. Supply the corresponding WebGPU pass descriptor to the instrumentation macro \textit{before} creating the WebGPU pass encoder.
|
||||
|
||||
You are required to periodically collect the GPU events using the \texttt{TracyWebGPUCollect()} macro. Good places for collection are: after synchronous waits, after event processing \texttt{wgpuInstanceProcessEvents}, after present drawable calls (\texttt{wgpuSurfacePresent}), and inside the completion callback of command queues (\texttt{wgpuQueueOnSubmittedWorkDone}).
|
||||
|
||||
\subsubsection{ROCm}
|
||||
|
||||
On Linux, if rocprofiler-sdk is installed, tracy can automatically trace GPU dispatches and collect
|
||||
@@ -1836,13 +1824,13 @@ sudo amd-smi set -g 0 -l stable_std
|
||||
|
||||
Putting more than one GPU zone macro in a single scope features the same issue as with the \texttt{ZoneScoped} macros, described in section~\ref{multizone} (but this time the variable name is \texttt{\_\_\_tracy\_gpu\_zone}).
|
||||
|
||||
To solve this problem, in case of OpenGL use the \texttt{TracyGpuNamedZone} macro in place of \texttt{TracyGpuZone} (or the color variant). The same applies to Vulkan, Direct3D 11/12, Metal and WebGPU -- replace \texttt{TracyVkZone} with \texttt{TracyVkNamedZone}, \texttt{TracyD3D11Zone}/\texttt{TracyD3D12Zone} with \texttt{TracyD3D11NamedZone}/\texttt{TracyD3D12NamedZone}, \texttt{TracyMetalZone} with \texttt{TracyMetalNamedZone}, and \texttt{TracyWebGPUZone} with \texttt{TracyWebGPUNamedZone}.
|
||||
To solve this problem, in case of OpenGL use the \texttt{TracyGpuNamedZone} macro in place of \texttt{TracyGpuZone} (or the color variant). The same applies to Vulkan, Direct3D 11/12 and Metal -- replace \texttt{TracyVkZone} with \texttt{TracyVkNamedZone}, \texttt{TracyD3D11Zone}/\texttt{TracyD3D12Zone} with \texttt{TracyD3D11NamedZone}/\texttt{TracyD3D12NamedZone}, and \texttt{TracyMetalZone} with \texttt{TracyMetalNamedZone}.
|
||||
|
||||
Remember to provide your name for the created stack variable as the first parameter to the macros.
|
||||
|
||||
\subsubsection{Transient GPU zones}
|
||||
|
||||
Transient zones (see section~\ref{transientzones} for details) are available in OpenGL, Vulkan, Direct3D 11/12 and WebGPU macros. Transient zones are not available for Metal at this moment.
|
||||
Transient zones (see section~\ref{transientzones} for details) are available in OpenGL, Vulkan, and Direct3D 11/12 macros. Transient zones are not available for Metal at this moment.
|
||||
|
||||
\subsection{Fibers}
|
||||
\label{fibers}
|
||||
@@ -3889,7 +3877,7 @@ You will find the zones with locks and their associated threads on this combined
|
||||
The left-hand side \emph{index area} of the timeline view displays various labels (threads, locks), which can be categorized in the following way:
|
||||
|
||||
\begin{itemize}
|
||||
\item \emph{Light blue label} -- GPU context. Multi-threaded Vulkan, OpenCL, Direct3D 12, Metal and WebGPU contexts are additionally split into separate threads.
|
||||
\item \emph{Light blue label} -- GPU context. Multi-threaded Vulkan, OpenCL, Direct3D 12 and Metal contexts are additionally split into separate threads.
|
||||
\item \emph{Pink label} -- CPU data graph.
|
||||
\item \emph{White label} -- A CPU thread. It will be replaced by a bright red label in a thread that has crashed (section~\ref{crashhandling}). If automated sampling was performed, clicking the~\LMB{}~left mouse button on the \emph{\faGhost{}~ghost zones} button will switch zone display mode between 'instrumented' and 'ghost.'
|
||||
\item \emph{Green label} -- Fiber, coroutine, or any other sort of cooperative multitasking 'green thread.'
|
||||
@@ -3911,7 +3899,7 @@ In an example in figure~\ref{zoneslocks} you can see that there are two threads:
|
||||
|
||||
Meanwhile, the \emph{Streaming thread} is performing some \emph{Streaming jobs}. The first \emph{Streaming job} sent a message (section~\ref{messagelog}). In addition to being listed in the message log, it is indicated by a triangle over the thread separator. When multiple messages are in one place, the triangle outline shape changes to a filled triangle.
|
||||
|
||||
The GPU zones are displayed just like CPU zones, with an OpenGL/Vulkan/Direct3D/Metal/OpenCL/CUDA/WebGPU context in place of a thread name.
|
||||
The GPU zones are displayed just like CPU zones, with an OpenGL/Vulkan/Direct3D/Metal/OpenCL context in place of a thread name.
|
||||
|
||||
Hovering the \faArrowPointer{} mouse pointer over a zone will highlight all other zones that have the exact source location with a white outline. Clicking the \LMB{}~left mouse button on a zone will open the zone information window (section~\ref{zoneinfo}). Holding the \keys{\ctrl} key and clicking the \LMB{}~left mouse button on a zone will open the zone statistics window (section~\ref{findzone}). Clicking the \MMB{}~middle mouse button on a zone will zoom the view to the extent of the zone.
|
||||
|
||||
@@ -4083,7 +4071,7 @@ You can freely adjust each time range on the timeline by clicking the \LMB{}~lef
|
||||
|
||||
Tracy allows adding custom notes to the trace. For example, you may want to mark a region to ignore because the application was out-of-focus or a region where a new user was connecting to the game, which resulted in a frame drop that needs to be investigated.
|
||||
|
||||
Methods of specifying the annotation region are described in section~\ref{timeranges}. When a new annotation is added, it is assigned a semi-unique random name to make it distinguishable. The settings window is also opened (section~\ref{annotationsettings}), allowing you to enter your own description of the annotation.
|
||||
Methods of specifying the annotation region are described in section~\ref{timeranges}. When a new annotation is added, a settings window is displayed (section~\ref{annotationsettings}), allowing you to enter a description.
|
||||
|
||||
Annotations are displayed on the timeline, as presented in figure~\ref{annotation}. Clicking on the circle next to the text description will open the annotation settings window, in which you can modify or remove the region. A list of all annotations in the trace is available in the annotation list window described in section~\ref{annotationlist}, which is accessible through the \emph{\faScrewdriverWrench{} Tools} button on the control menu.
|
||||
|
||||
@@ -4120,7 +4108,7 @@ In this window, you can set various trace-related options. For example, the time
|
||||
\begin{itemize}
|
||||
\item \emph{\faSignature{} Draw CPU usage graph} -- You can disable drawing of the CPU usage graph here.
|
||||
\end{itemize}
|
||||
\item \emph{\faEye{} Draw GPU zones} -- Allows disabling display of OpenGL/Vulkan/Metal/Direct3D/OpenCL/CUDA/WebGPU zones. The \emph{GPU zones} drop-down allows disabling individual GPU contexts and setting CPU/GPU drift offsets of uncalibrated contexts (see section~\ref{gpuprofiling} for more information). The \emph{\faRobot~Auto} button automatically measures the GPU drift value\footnote{There is an assumption that drift is linear. Automated measurement calculates and removes change over time in delay-to-execution of GPU zones. Resulting value may still be incorrect.}.
|
||||
\item \emph{\faEye{} Draw GPU zones} -- Allows disabling display of OpenGL/Vulkan/Metal/Direct3D/OpenCL zones. The \emph{GPU zones} drop-down allows disabling individual GPU contexts and setting CPU/GPU drift offsets of uncalibrated contexts (see section~\ref{gpuprofiling} for more information). The \emph{\faRobot~Auto} button automatically measures the GPU drift value\footnote{There is an assumption that drift is linear. Automated measurement calculates and removes change over time in delay-to-execution of GPU zones. Resulting value may still be incorrect.}.
|
||||
\item \emph{\faMicrochip{} Draw CPU zones} -- Determines whether CPU zones are displayed.
|
||||
\begin{itemize}
|
||||
\item \emph{\faGhost{} Draw ghost zones} -- Controls if ghost zones should be displayed in threads which don't have any instrumented zones available.
|
||||
@@ -4594,9 +4582,7 @@ The information about the selected memory allocation is displayed in this window
|
||||
\subsection{Trace information window}
|
||||
\label{traceinfo}
|
||||
|
||||
This window contains information about the current trace: captured program name, time of the capture, profiler version which performed the capture.
|
||||
|
||||
There's a text entry field for an optional custom description of the trace for you to fill in. This description will appear on the profiler window title bar, or when comparing two traces (section~\ref{compare}), enabling you to quickly recognize what the trace contains. For some people it's fine to just have \emph{any} semi-unique description to be able to identify a specific trace. For such purposes there's a \emph{\faDice{}~Generate name} button, which will set the trace description to an abstract meaningless identifier.
|
||||
This window contains information about the current trace: captured program name, time of the capture, profiler version which performed the capture, and a custom trace description, which you can fill in.
|
||||
|
||||
If the \emph{\faUserGear{}~Public sidecar} option is selected, the file containing trace-specific user settings (see section~\ref{tracespecific}) will be saved on disk next to the trace file.
|
||||
|
||||
@@ -4936,7 +4922,7 @@ The profiled program is highlighted using green color. Furthermore, the yellow h
|
||||
\subsection{Annotation settings window}
|
||||
\label{annotationsettings}
|
||||
|
||||
In this window, you may modify how a timeline annotation (section~\ref{annotatingtrace}) is presented by setting its text description or selecting region highlight color. A random annotation description can be set with the \emph{\faDice{}~Generate name} button. If the note is no longer needed, you may also remove it here.
|
||||
In this window, you may modify how a timeline annotation (section~\ref{annotatingtrace}) is presented by setting its text description or selecting region highlight color. If the note is no longer needed, you may also remove it here.
|
||||
|
||||
\subsection{Annotation list window}
|
||||
\label{annotationlist}
|
||||
|
||||
@@ -70,7 +70,6 @@ set(SERVER_FILES
|
||||
TracyMarkdown.cpp
|
||||
TracyMicroArchitecture.cpp
|
||||
TracyMouse.cpp
|
||||
TracyNameGen.cpp
|
||||
TracyProtoHistory.cpp
|
||||
TracySourceContents.cpp
|
||||
TracySourceTokenizer.cpp
|
||||
|
||||
@@ -162,15 +162,6 @@ static ImGuiKey TranslateKeyCode( const char* code )
|
||||
return ImGuiKey_None;
|
||||
}
|
||||
|
||||
static void UpdateKeyModifiers( const EmscriptenKeyboardEvent* e )
|
||||
{
|
||||
ImGuiIO& io = ImGui::GetIO();
|
||||
io.AddKeyEvent( ImGuiMod_Ctrl, e->ctrlKey );
|
||||
io.AddKeyEvent( ImGuiMod_Shift, e->shiftKey );
|
||||
io.AddKeyEvent( ImGuiMod_Alt, e->altKey );
|
||||
io.AddKeyEvent( ImGuiMod_Super, e->metaKey );
|
||||
}
|
||||
|
||||
Backend::Backend( const char* title, const std::function<void()>& redraw, const std::function<void(float)>& scaleChanged, const std::function<int(void)>& isBusy, RunQueue* mainThreadTasks )
|
||||
{
|
||||
constexpr EGLint eglConfigAttrib[] = {
|
||||
@@ -252,7 +243,6 @@ Backend::Backend( const char* title, const std::function<void()>& redraw, const
|
||||
return EM_TRUE;
|
||||
} );
|
||||
emscripten_set_keydown_callback( EMSCRIPTEN_EVENT_TARGET_WINDOW, nullptr, EM_TRUE, [] ( int, const EmscriptenKeyboardEvent* e, void* ) -> EM_BOOL {
|
||||
UpdateKeyModifiers( e );
|
||||
const auto code = TranslateKeyCode( e->code );
|
||||
if( code == ImGuiKey_None ) return EM_FALSE;
|
||||
ImGui::GetIO().AddKeyEvent( code, true );
|
||||
@@ -260,7 +250,6 @@ Backend::Backend( const char* title, const std::function<void()>& redraw, const
|
||||
return EM_TRUE;
|
||||
} );
|
||||
emscripten_set_keyup_callback( EMSCRIPTEN_EVENT_TARGET_WINDOW, nullptr, EM_TRUE, [] ( int, const EmscriptenKeyboardEvent* e, void* ) -> EM_BOOL {
|
||||
UpdateKeyModifiers( e );
|
||||
const auto code = TranslateKeyCode( e->code );
|
||||
if( code == ImGuiKey_None ) return EM_FALSE;
|
||||
ImGui::GetIO().AddKeyEvent( code, false );
|
||||
|
||||
@@ -290,7 +290,7 @@ static constexpr const uint32_t AsmSyntaxColors[] = {
|
||||
|
||||
[[maybe_unused]] static tracy_force_inline void TooltipIfHovered( const char* text )
|
||||
{
|
||||
if( !ImGui::IsItemHovered( ImGuiHoveredFlags_AllowWhenDisabled ) ) return;
|
||||
if( !ImGui::IsItemHovered() ) return;
|
||||
ImGui::BeginTooltip();
|
||||
ImGui::TextUnformatted( text );
|
||||
ImGui::EndTooltip();
|
||||
|
||||
@@ -1,221 +0,0 @@
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <random>
|
||||
#include <vector>
|
||||
|
||||
#include "TracyNameGen.hpp"
|
||||
|
||||
namespace tracy
|
||||
{
|
||||
|
||||
struct NameBank
|
||||
{
|
||||
const char* const* adjectives;
|
||||
const char* const* nouns;
|
||||
size_t numAdjectives;
|
||||
size_t numNouns;
|
||||
};
|
||||
|
||||
constexpr const char* AnalysisAdjectives[] = {
|
||||
"Granular", "Forensic", "Acute", "Lucid", "Precise",
|
||||
"Deep", "Exact", "Critical", "Analytical", "Transparent",
|
||||
"Subtle", "Sharp", "Rigid", "Focused", "Absolute",
|
||||
"Meticulous", "Spectral", "Diagnostic", "Pervasive", "Introspective",
|
||||
"Systematic", "Optical", "Minute", "Piercing", "Detailed",
|
||||
"Scrutinized", "Clear", "Keen", "Rigorous", "Vast",
|
||||
"Incisive", "Exhaustive", "Lateral", "Prismatic", "Observant"
|
||||
};
|
||||
constexpr const char* AnalysisNouns[] = {
|
||||
"Probe", "Trace", "Lens", "Scope", "Metric",
|
||||
"Insight", "Scan", "Audit", "Point", "Vector",
|
||||
"Signal", "Marker", "Frame", "Detail", "View",
|
||||
"Spectrum", "Snapshot", "Blueprint", "Aperture", "Index",
|
||||
"Radar", "Prism", "Gauge", "Focal", "Pattern",
|
||||
"Echo", "Signature", "Horizon", "Mirror", "Scale",
|
||||
"Telemetry", "Graph", "Stratum", "Artifact", "Aspect"
|
||||
};
|
||||
|
||||
constexpr const char* PerformanceAdjectives[] = {
|
||||
"Swift", "Lean", "Kinetic", "Agile", "Hyper",
|
||||
"Rapid", "Fluid", "Peak", "Instant", "Nimble",
|
||||
"Optimal", "Sonic", "Linear", "Warp", "Turbo",
|
||||
"Frictionless", "Seamless", "Electric", "Blazing", "Aerodynamic",
|
||||
"Quantum", "Prompt", "Direct", "Streamlined", "Volatile",
|
||||
"Highgain", "Rapidfire", "Torrential", "Sleek", "Velocity",
|
||||
"Dynamic", "Active", "Persistent", "Lightweight", "Snappy"
|
||||
};
|
||||
constexpr const char* PerformanceNouns[] = {
|
||||
"Pulse", "Flow", "Cycle", "Burst", "Stream",
|
||||
"Tick", "Glide", "Shift", "Velocity", "Spike",
|
||||
"Pace", "Rhythm", "Drive", "Path", "Edge",
|
||||
"Sprint", "Torrent", "Current", "Surge", "Momentum",
|
||||
"Flux", "Wave", "Accelerator", "Spark", "Jet",
|
||||
"Thrust", "Orbit", "Apex", "Bolt", "Phase",
|
||||
"Rush", "Impact", "Frequency", "Lapse", "Kick"
|
||||
};
|
||||
|
||||
constexpr const char* CoreAdjectives[] = {
|
||||
"Binary", "Raw", "Atomic", "Static", "Core",
|
||||
"Virtual", "Base", "Solid", "Dense", "Opaque",
|
||||
"Primitive", "Native", "Hard", "Stable", "Immutable",
|
||||
"Monolithic", "Bare", "Rigid", "Concrete", "Fundamental",
|
||||
"Discrete", "Fixed", "Heavy", "Latent", "Symmetric",
|
||||
"Implicit", "Explicit", "Cold", "Basic", "Granite",
|
||||
"Stark", "Brute", "Firm", "Stout", "Coarse"
|
||||
};
|
||||
constexpr const char* CoreNouns[] = {
|
||||
"Stack", "Heap", "Node", "Buffer", "Segment",
|
||||
"Thread", "Kernel", "Block", "Page", "Shell",
|
||||
"Layer", "Bit", "Logic", "Port", "Root",
|
||||
"Register", "Pointer", "Address", "Cache", "Opcode",
|
||||
"Slab", "Pipeline", "Bus", "Socket", "Sector",
|
||||
"Vault", "Anchor", "Pillar", "Base", "Primitive",
|
||||
"Offset", "Handle", "Struct", "Memory", "Word"
|
||||
};
|
||||
|
||||
constexpr const char* ModernAdjectives[] = {
|
||||
"Synthetic", "Neural", "Async", "Elastic", "Cloud",
|
||||
"Distributed", "Reactive", "Orbital", "Poly", "Infinite",
|
||||
"Parallel", "Modular", "Virtualized", "Scalable", "Agnostic",
|
||||
"Adaptive", "Hybrid", "Autonomous", "Global", "Synergic",
|
||||
"Omnipresent", "Evolving", "Abstract", "Unified", "Concurrent",
|
||||
"Remote", "Digital", "Cluster", "Ephemeral", "Stateful",
|
||||
"Stateless", "Serverless", "Decoupled", "Fluent", "Native"
|
||||
};
|
||||
constexpr const char* ModernNouns[] = {
|
||||
"Nexus", "Grid", "Matrix", "Vertex", "Sync",
|
||||
"Axiom", "Sphere", "Hub", "Mesh", "Bridge",
|
||||
"Link", "Unit", "Fabric", "Cluster", "Portal",
|
||||
"Ecosystem", "Catalyst", "Interface", "Domain", "Gateway",
|
||||
"Lattice", "Cloud", "Instance", "Schema", "Registry",
|
||||
"Tenant", "Namespace", "Pod", "Stream", "Endpoint",
|
||||
"Payload", "Relay", "Orchestrator", "Broker", "Agent"
|
||||
};
|
||||
|
||||
constexpr const char* FailureAdjectives[] = {
|
||||
"Clumsy", "Wobbly", "Confused", "Chaotic", "Sneaky",
|
||||
"Lazy", "Dizzy", "Broken", "Leaky", "Fragile",
|
||||
"Shaky", "Erratic", "Sleepy", "Lost", "Random",
|
||||
"Glitchy", "Unstable", "Paradoxical", "Cluttery", "Hiccupy",
|
||||
"Wonky", "Flaky", "Stubborn", "Moody", "Nervous",
|
||||
"Fumbling", "Drifting", "Tangled", "Blurred", "Absent",
|
||||
"Haphazard", "Spasmodic", "Clunky", "Jittery", "Bewildered"
|
||||
};
|
||||
constexpr const char* FailureNouns[] = {
|
||||
"Crash", "Bug", "Leak", "Hang", "Timeout",
|
||||
"Panic", "Loop", "Spill", "Hiccup", "Glitch",
|
||||
"Wobble", "Tumble", "Void", "Abyss", "Maze",
|
||||
"Knot", "Static", "Noise", "Drift", "Stumble",
|
||||
"Gap", "Fragment", "Shard", "Spark", "Bubble",
|
||||
"Slip", "Trip", "Fall", "Ghost", "Shadow",
|
||||
"Blur", "Overflow", "Sinkhole", "Echo", "Mirage"
|
||||
};
|
||||
|
||||
constexpr const char* MythicAdjectives[] = {
|
||||
"Mythic", "Arcane", "Ancient", "Eternal", "Sacred",
|
||||
"Divine", "Forgotten", "Elder", "Primordial", "Venerable",
|
||||
"Runic", "Prophetic", "Colossal", "Imperial", "Regal",
|
||||
"Sovereign", "Mystic", "Occult", "Hidden", "Cryptic",
|
||||
"Ethereal", "Celestial", "Gnostic", "Hermetic", "Alchemical",
|
||||
"Astral", "Golden", "Iron", "Bronze", "Obsidian",
|
||||
"Silver", "Timeless", "Boundless", "Omnipotent", "Everlasting"
|
||||
};
|
||||
constexpr const char* MythicNouns[] = {
|
||||
"Aegis", "Helios", "Oracle", "Titan", "Rune",
|
||||
"Lex", "Codex", "Obelisk", "Monolith", "Temple",
|
||||
"Altar", "Scepter", "Crown", "Sigil", "Glyph",
|
||||
"Tome", "Relic", "Artifact", "Sanctum", "Citadel",
|
||||
"Bastion", "Spire", "Pillar", "Throne", "Vault",
|
||||
"Key", "Gate", "Bridge", "Seal", "Pact",
|
||||
"Covenant", "Legacy", "Epoch", "Era", "Myth"
|
||||
};
|
||||
|
||||
constexpr const char* CosmosAdjectives[] = {
|
||||
"Relativistic", "Baryonic", "Intergalactic", "Event-Horizon", "Singular",
|
||||
"Celestial", "Nebular", "Void-Born", "Astral", "Luminous",
|
||||
"Spectral", "Ionized", "Gravitational", "Ecliptic", "Zenithal",
|
||||
"Stellar", "Cosmological", "Parallactic", "Zero-Point", "Dark-Matter",
|
||||
"Radiant", "Orbital", "Supernova", "Hyper-Spatial", "Aetheric",
|
||||
"Cold-Void", "Infinite", "Dimensional", "Crystalline", "Tidal",
|
||||
"Planetary", "Solar", "Lunar", "Galactic", "Oblique"
|
||||
};
|
||||
constexpr const char* CosmosNouns[] = {
|
||||
"Pulsar", "Quasar", "Singularity", "Void", "Nebula",
|
||||
"Horizon", "Apex", "Zenith", "Equinox", "Corona",
|
||||
"Aperture", "Axis", "Parallax", "Cluster", "Constellation",
|
||||
"Vacuum", "Symmetry", "Continuum", "Flux", "Vortex",
|
||||
"Nova", "Eclipse", "Solenoid", "Sphere", "Vector",
|
||||
"Siderostat", "Sextant", "Obliquity", "Precession", "Azimuth",
|
||||
"Wavelength", "Frequency", "Radiance", "Entropy", "Magnitude"
|
||||
};
|
||||
|
||||
constexpr const char* GameAdjectives[] = {
|
||||
"Frame-Locked", "Pixel-Perfect", "Arcade", "Retro", "Hardcore",
|
||||
"Unlocked", "Godlike", "Buffed", "Nerfed", "Overclocked",
|
||||
"Clutch", "Lagless", "Sweaty", "Tryhard", "Broken",
|
||||
"Turbo", "Min-Max", "Rage-Quit", "No-Scope", "Frame-Perfect",
|
||||
"Savescum", "Co-Op", "Modded", "Patched", "Hotfixed",
|
||||
"Debugged", "Optimized", "Smoothed", "Playtest", "Sandbox",
|
||||
"Scripted", "Speedrun", "Cheat-Code", "Invincible", "Flawless"
|
||||
};
|
||||
constexpr const char* GameNouns[] = {
|
||||
"Frame", "Tick", "Sprite", "Polygon", "Shader",
|
||||
"Texture", "Voxel", "Render", "Hitbox", "Hurtbox",
|
||||
"Collision", "Input", "Viewport", "Level", "Checkpoint",
|
||||
"Boss", "Loot", "Quest", "Spawn", "Respawn",
|
||||
"Grind", "Scroll", "Tilemap", "Backdrop", "Rig",
|
||||
"Build", "Frag", "Gib", "Drawcall", "Pass",
|
||||
"Batch", "Delta", "Pool", "Arena", "Worker"
|
||||
};
|
||||
|
||||
constexpr std::array NameBanks = {
|
||||
NameBank { AnalysisAdjectives, AnalysisNouns, sizeof(AnalysisAdjectives) / sizeof(AnalysisAdjectives[0]), sizeof(AnalysisNouns) / sizeof(AnalysisNouns[0]) },
|
||||
NameBank { PerformanceAdjectives, PerformanceNouns, sizeof(PerformanceAdjectives) / sizeof(PerformanceAdjectives[0]), sizeof(PerformanceNouns) / sizeof(PerformanceNouns[0]) },
|
||||
NameBank { CoreAdjectives, CoreNouns, sizeof(CoreAdjectives) / sizeof(CoreAdjectives[0]), sizeof(CoreNouns) / sizeof(CoreNouns[0]) },
|
||||
NameBank { ModernAdjectives, ModernNouns, sizeof(ModernAdjectives) / sizeof(ModernAdjectives[0]), sizeof(ModernNouns) / sizeof(ModernNouns[0]) },
|
||||
NameBank { FailureAdjectives, FailureNouns, sizeof(FailureAdjectives) / sizeof(FailureAdjectives[0]), sizeof(FailureNouns) / sizeof(FailureNouns[0]) },
|
||||
NameBank { MythicAdjectives, MythicNouns, sizeof(MythicAdjectives) / sizeof(MythicAdjectives[0]), sizeof(MythicNouns) / sizeof(MythicNouns[0]) },
|
||||
NameBank { CosmosAdjectives, CosmosNouns, sizeof(CosmosAdjectives) / sizeof(CosmosAdjectives[0]), sizeof(CosmosNouns) / sizeof(CosmosNouns[0]) },
|
||||
NameBank { GameAdjectives, GameNouns, sizeof(GameAdjectives) / sizeof(GameAdjectives[0]), sizeof(GameNouns) / sizeof(GameNouns[0]) },
|
||||
};
|
||||
|
||||
constexpr std::array NameStructure = { "an", "aan", "nn" };
|
||||
|
||||
|
||||
std::string GenerateAbstractName()
|
||||
{
|
||||
std::random_device rd;
|
||||
std::default_random_engine gen( rd() );
|
||||
std::uniform_int_distribution<uint32_t> dist( 0, UINT32_MAX );
|
||||
|
||||
const auto baseBank = NameBanks[dist( gen ) % NameBanks.size()];
|
||||
const char* structure = NameStructure[dist( gen ) % NameStructure.size()];
|
||||
|
||||
std::vector<std::string> parts;
|
||||
while( *structure )
|
||||
{
|
||||
const auto type = *structure++;
|
||||
assert( type == 'a' || type == 'n' );
|
||||
const auto bank = dist( gen ) % 6 == 0 ? NameBanks[dist( gen ) % NameBanks.size()] : baseBank;
|
||||
for(;;)
|
||||
{
|
||||
auto part = std::string( type == 'a' ? bank.adjectives[dist( gen ) % bank.numAdjectives] : bank.nouns[dist( gen ) % bank.numNouns] );
|
||||
if( std::ranges::find( parts, part ) == parts.end() )
|
||||
{
|
||||
parts.emplace_back( std::move( part ) );
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
std::string ret = parts[0];
|
||||
for( size_t i=1; i<parts.size(); i++ )
|
||||
{
|
||||
ret += " " + parts[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,13 +0,0 @@
|
||||
#ifndef __TRACYNAMEGEN_HPP__
|
||||
#define __TRACYNAMEGEN_HPP__
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace tracy
|
||||
{
|
||||
|
||||
// Returns a randomly generated name consisting of two or three distinct words
// (adjectives and/or nouns drawn from built-in word lists) joined by spaces.
std::string GenerateAbstractName();
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -295,7 +295,6 @@ bool UserData::Load()
|
||||
LoadValue( v, "min", a->range.min );
|
||||
LoadValue( v, "max", a->range.max );
|
||||
LoadValue( v, "color", a->color );
|
||||
a->range.active = true;
|
||||
m_annotations.emplace_back( std::move( a ) );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,8 +49,7 @@ constexpr const char* GpuContextNames[] = {
|
||||
"Metal",
|
||||
"Custom",
|
||||
"CUDA",
|
||||
"Rocprof",
|
||||
"WebGPU"
|
||||
"Rocprof"
|
||||
};
|
||||
|
||||
struct MemoryPage;
|
||||
|
||||
@@ -1,7 +1,4 @@
|
||||
#include <string.h>
|
||||
|
||||
#include "TracyImGui.hpp"
|
||||
#include "TracyNameGen.hpp"
|
||||
#include "TracyPrint.hpp"
|
||||
#include "TracyView.hpp"
|
||||
#include "tracy_pdqsort.h"
|
||||
@@ -13,7 +10,6 @@ namespace tracy
|
||||
void View::AddAnnotation( int64_t start, int64_t end )
|
||||
{
|
||||
auto ann = std::make_shared<Annotation>();
|
||||
ann->text = GenerateAbstractName();
|
||||
ann->range.active = true;
|
||||
ann->range.min = start;
|
||||
ann->range.max = end;
|
||||
@@ -56,22 +52,7 @@ void View::DrawSelectedAnnotation()
|
||||
char buf[1024];
|
||||
buf[descsz] = '\0';
|
||||
memcpy( buf, desc, descsz );
|
||||
|
||||
const char* buttonText = ICON_FA_DICE;
|
||||
auto buttonSize = ImGui::CalcTextSize( buttonText );
|
||||
buttonSize.x += ImGui::GetStyle().FramePadding.x * 2.0f + ImGui::GetStyle().ItemSpacing.x;
|
||||
ImGui::SetNextItemWidth( ImGui::GetContentRegionAvail().x - buttonSize.x );
|
||||
bool changed = ImGui::InputTextWithHint( "##anndesc", "Describe annotation", buf, 256 );
|
||||
ImGui::SameLine();
|
||||
if( ImGui::Button( buttonText ) )
|
||||
{
|
||||
changed = true;
|
||||
const auto name = GenerateAbstractName();
|
||||
const auto len = std::min( sizeof( buf ) - 1, name.size() );
|
||||
memcpy( buf, name.c_str(), len );
|
||||
buf[len] = '\0';
|
||||
}
|
||||
if( changed )
|
||||
if( ImGui::InputTextWithHint( "##anndesc", "Describe annotation", buf, 256 ) )
|
||||
{
|
||||
m_selectedAnnotation->text.assign( buf );
|
||||
}
|
||||
|
||||
@@ -299,22 +299,6 @@ void View::DrawTimeline()
|
||||
v->range.StartFrame();
|
||||
HandleRange( v->range, timespan, ImGui::GetCursorScreenPos(), w );
|
||||
}
|
||||
if( IsMouseClicked( 0 ) )
|
||||
{
|
||||
const auto ty = ImGui::GetTextLineHeight();
|
||||
for( auto& ann : m_annotations )
|
||||
{
|
||||
if( ann->range.min >= m_vd.zvEnd || ann->range.max <= m_vd.zvStart ) continue;
|
||||
const auto aMin = ( ann->range.min - m_vd.zvStart ) * pxns;
|
||||
const auto aMax = ( ann->range.max - m_vd.zvStart ) * pxns;
|
||||
if( ImGui::IsMouseHoveringRect( linepos + ImVec2( aMin, lineh - ty * 1.5f ), linepos + ImVec2( aMax, lineh ) ) )
|
||||
{
|
||||
m_selectedAnnotation = ann.get();
|
||||
ConsumeMouseEvents( 0 );
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
HandleTimelineMouse( timespan, ImGui::GetCursorScreenPos(), w );
|
||||
}
|
||||
if( ImGui::IsWindowFocused( ImGuiHoveredFlags_ChildWindows | ImGuiHoveredFlags_AllowWhenBlockedByActiveItem ) )
|
||||
@@ -376,8 +360,9 @@ void View::DrawTimeline()
|
||||
bool hover = ImGui::IsWindowHovered() && ImGui::IsMouseHoveringRect( wpos, wpos + ImVec2( w, h ) );
|
||||
draw = ImGui::GetWindowDrawList();
|
||||
|
||||
const auto scale = GetScale();
|
||||
const auto ty = ImGui::GetTextLineHeight();
|
||||
const auto to = 9.f;
|
||||
const auto th = ( ty - to ) * sqrt( 3 ) * 0.5;
|
||||
|
||||
if( m_vd.drawGpuZones )
|
||||
{
|
||||
@@ -430,24 +415,17 @@ void View::DrawTimeline()
|
||||
|
||||
m_lockHighlight = m_nextLockHighlight;
|
||||
|
||||
const auto iconSize = ImGui::CalcTextSize( ICON_FA_NOTE_STICKY );
|
||||
for( auto& ann : m_annotations )
|
||||
{
|
||||
if( ann->range.min < m_vd.zvEnd && ann->range.max > m_vd.zvStart )
|
||||
{
|
||||
uint32_t c0 = ( ann->color & 0xFFFFFF ) | ( m_selectedAnnotation == ann.get() ? 0x22000000 : 0x11000000 );
|
||||
uint32_t c1 = ( ann->color & 0xFFFFFF ) | ( m_selectedAnnotation == ann.get() ? 0x88000000 : 0x66000000 );
|
||||
uint32_t c2 = ( ann->color & 0xFFFFFF ) | ( m_selectedAnnotation == ann.get() ? 0xDD000000 : 0xBB000000 );
|
||||
|
||||
const auto aMin = ( ann->range.min - m_vd.zvStart ) * pxns;
|
||||
const auto aMax = ( ann->range.max - m_vd.zvStart ) * pxns;
|
||||
|
||||
draw->AddRectFilled( linepos + ImVec2( aMin, 0 ), linepos + ImVec2( aMax, lineh ), c0 );
|
||||
draw->AddRectFilled( linepos + ImVec2( aMin + 1, lineh - ty * 1.5f ), linepos + ImVec2( aMax - 1, lineh ), 0x88000000 );
|
||||
DrawLine( draw, linepos + ImVec2( aMin + 0.5f, 0.5f ), linepos + ImVec2( aMin + 0.5f, lineh + 0.5f ), ann->range.hiMin ? c2 : c1, ann->range.hiMin ? 2 : 1 );
|
||||
DrawLine( draw, linepos + ImVec2( aMax - 0.5f, 0.5f ), linepos + ImVec2( aMax - 0.5f, lineh + 0.5f ), ann->range.hiMax ? c2 : c1, ann->range.hiMax ? 2 : 1 );
|
||||
|
||||
if( drawMouseLine && ImGui::IsMouseHoveringRect( linepos + ImVec2( aMin, 0 ), linepos + ImVec2( aMax, lineh ) ) )
|
||||
uint32_t c0 = ( ann->color & 0xFFFFFF ) | ( m_selectedAnnotation == ann.get() ? 0x44000000 : 0x22000000 );
|
||||
uint32_t c1 = ( ann->color & 0xFFFFFF ) | ( m_selectedAnnotation == ann.get() ? 0x66000000 : 0x44000000 );
|
||||
uint32_t c2 = ( ann->color & 0xFFFFFF ) | ( m_selectedAnnotation == ann.get() ? 0xCC000000 : 0xAA000000 );
|
||||
draw->AddRectFilled( linepos + ImVec2( ( ann->range.min - m_vd.zvStart ) * pxns, 0 ), linepos + ImVec2( ( ann->range.max - m_vd.zvStart ) * pxns, lineh ), c0 );
|
||||
DrawLine( draw, linepos + ImVec2( ( ann->range.min - m_vd.zvStart ) * pxns + 0.5f, 0.5f ), linepos + ImVec2( ( ann->range.min - m_vd.zvStart ) * pxns + 0.5f, lineh + 0.5f ), ann->range.hiMin ? c2 : c1, ann->range.hiMin ? 2 : 1 );
|
||||
DrawLine( draw, linepos + ImVec2( ( ann->range.max - m_vd.zvStart ) * pxns + 0.5f, 0.5f ), linepos + ImVec2( ( ann->range.max - m_vd.zvStart ) * pxns + 0.5f, lineh + 0.5f ), ann->range.hiMax ? c2 : c1, ann->range.hiMax ? 2 : 1 );
|
||||
if( drawMouseLine && ImGui::IsMouseHoveringRect( linepos + ImVec2( ( ann->range.min - m_vd.zvStart ) * pxns, 0 ), linepos + ImVec2( ( ann->range.max - m_vd.zvStart ) * pxns, lineh ) ) )
|
||||
{
|
||||
ImGui::BeginTooltip();
|
||||
if( ann->text.empty() )
|
||||
@@ -464,22 +442,27 @@ void View::DrawTimeline()
|
||||
TextFocused( "Annotation length:", TimeToString( ann->range.max - ann->range.min ) );
|
||||
ImGui::EndTooltip();
|
||||
}
|
||||
|
||||
const auto aw = ( ann->range.max - ann->range.min ) * pxns;
|
||||
if( aw > ty + iconSize.x )
|
||||
if( aw > th * 4 )
|
||||
{
|
||||
draw->AddText( linepos + ImVec2( aMin + ty * 0.5f, lineh - ty * 1.25f ), ann->color | 0xFF000000, ICON_FA_NOTE_STICKY );
|
||||
draw->AddCircleFilled( linepos + ImVec2( ( ann->range.min - m_vd.zvStart ) * pxns + th * 2, th * 2 ), th, 0x88AABB22 );
|
||||
draw->AddCircle( linepos + ImVec2( ( ann->range.min - m_vd.zvStart ) * pxns + th * 2, th * 2 ), th, 0xAAAABB22 );
|
||||
if( drawMouseLine && IsMouseClicked( 0 ) && ImGui::IsMouseHoveringRect( linepos + ImVec2( ( ann->range.min - m_vd.zvStart ) * pxns + th, th ), linepos + ImVec2( ( ann->range.min - m_vd.zvStart ) * pxns + th * 3, th * 3 ) ) )
|
||||
{
|
||||
m_selectedAnnotation = ann.get();
|
||||
}
|
||||
|
||||
if( !ann->text.empty() )
|
||||
{
|
||||
const auto tw = ImGui::CalcTextSize( ann->text.c_str() ).x;
|
||||
if( aw > ty + iconSize.x + tw )
|
||||
if( aw - th*4 > tw )
|
||||
{
|
||||
draw->AddText( linepos + ImVec2( aMin + ty + iconSize.x, lineh - ty * 1.25f ), 0xFFFFFFFF, ann->text.c_str() );
|
||||
draw->AddText( linepos + ImVec2( ( ann->range.min - m_vd.zvStart ) * pxns + th * 4, th * 0.5 ), 0xFFFFFFFF, ann->text.c_str() );
|
||||
}
|
||||
else
|
||||
{
|
||||
draw->PushClipRect( linepos + ImVec2( aMin + 1, lineh - ty * 1.5f ), linepos + ImVec2( aMax - 1, lineh ) );
|
||||
draw->AddText( linepos + ImVec2( aMin + ty + iconSize.x, lineh - ty * 1.25f ), 0xFFFFFFFF, ann->text.c_str() );
|
||||
draw->PushClipRect( linepos + ImVec2( ( ann->range.min - m_vd.zvStart ) * pxns, 0 ), linepos + ImVec2( ( ann->range.max - m_vd.zvStart ) * pxns, lineh ), true );
|
||||
draw->AddText( linepos + ImVec2( ( ann->range.min - m_vd.zvStart ) * pxns + th * 4, th * 0.5 ), 0xFFFFFFFF, ann->text.c_str() );
|
||||
draw->PopClipRect();
|
||||
}
|
||||
}
|
||||
@@ -502,6 +485,7 @@ void View::DrawTimeline()
|
||||
draw->AddRect( ImVec2( wpos.x + px0, linepos.y ), ImVec2( wpos.x + px1, linepos.y + lineh ), 0x4488DD88 );
|
||||
}
|
||||
|
||||
const auto scale = GetScale();
|
||||
if( m_findZone.range.active && ( m_findZone.show || m_showRanges ) )
|
||||
{
|
||||
const auto px0 = ( m_findZone.range.min - m_vd.zvStart ) * pxns;
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#include <inttypes.h>
|
||||
|
||||
#include "TracyImGui.hpp"
|
||||
#include "TracyNameGen.hpp"
|
||||
#include "TracyPrint.hpp"
|
||||
#include "TracyView.hpp"
|
||||
#include "tracy_pdqsort.h"
|
||||
@@ -56,22 +55,8 @@ void View::DrawInfo()
|
||||
char buf[256];
|
||||
buf[descsz] = '\0';
|
||||
memcpy( buf, desc.c_str(), descsz );
|
||||
|
||||
const char* buttonText = ICON_FA_DICE;
|
||||
auto buttonSize = ImGui::CalcTextSize( buttonText );
|
||||
buttonSize.x += ImGui::GetStyle().FramePadding.x * 2.0f + ImGui::GetStyle().ItemSpacing.x;
|
||||
ImGui::SetNextItemWidth( ImGui::GetContentRegionAvail().x - buttonSize.x );
|
||||
bool changed = ImGui::InputTextWithHint( "##traceDesc", "Enter description of the trace", buf, 256 );
|
||||
ImGui::SameLine();
|
||||
if( ImGui::Button( buttonText ) )
|
||||
{
|
||||
changed = true;
|
||||
const auto name = GenerateAbstractName();
|
||||
const auto len = std::min( sizeof( buf ) - 1, name.size() );
|
||||
memcpy( buf, name.c_str(), len );
|
||||
buf[len] = '\0';
|
||||
}
|
||||
if( changed )
|
||||
ImGui::SetNextItemWidth( -1 );
|
||||
if( ImGui::InputTextWithHint( "##traceDesc", "Enter description of the trace", buf, 256 ) )
|
||||
{
|
||||
m_userData.SetDescription( buf );
|
||||
if( m_stcb ) UpdateTitle();
|
||||
|
||||
@@ -524,7 +524,7 @@ static const char* GetHostInfo()
|
||||
auto ptr = buf;
|
||||
#if defined _WIN32
|
||||
# if defined TRACY_WIN32_NO_DESKTOP
|
||||
auto GetVersion = &::GetVersionExW;
|
||||
auto GetVersion = &::GetVersionEx;
|
||||
# else
|
||||
auto GetVersion = (t_RtlGetVersion)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlGetVersion" );
|
||||
# endif
|
||||
@@ -1408,30 +1408,9 @@ namespace
|
||||
// 1a. But s_queue is needed for initialization of variables in point 2.
|
||||
extern moodycamel::ConcurrentQueue<QueueItem> s_queue;
|
||||
|
||||
// A producer token may be created before s_initTime is constructed (the dynamic loader
|
||||
// runs shared object initializers before any of the executable's constructors, and such
|
||||
// an initializer may emit a zone). Remember the time of such an early token creation, so
|
||||
// that the init time can be backdated accordingly and no event timestamp precedes the
|
||||
// trace epoch.
|
||||
static std::atomic<int64_t> s_earlyTokenTime { 0 };
|
||||
static bool s_initTimeConstructed = false;
|
||||
|
||||
// 2. If these variables would be in the .CRT$XCB section, they would be initialized only in main thread.
|
||||
thread_local moodycamel::ProducerToken init_order(107) s_token_detail( s_queue );
|
||||
|
||||
// Obtains the explicit producer associated with this thread's s_token_detail
// and, as long as s_initTimeConstructed is still false, records the current
// profiler time in s_earlyTokenTime (keeping the smallest value seen).
// Returns the producer pointer obtained from s_queue.
static moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* CreateProducerToken()
|
||||
{
|
||||
auto ptr = s_queue.get_explicit_producer( s_token_detail );
|
||||
// Only tokens created before the init time has been set up are of interest.
if( !s_initTimeConstructed )
|
||||
{
|
||||
const auto t = Profiler::GetTime();
|
||||
auto e = s_earlyTokenTime.load( std::memory_order_relaxed );
|
||||
// Atomic "store minimum": 0 means no time has been recorded yet. On failure
// compare_exchange_weak reloads e, so the condition is re-evaluated against
// the value another thread may have just written.
while( ( e == 0 || t < e ) && !s_earlyTokenTime.compare_exchange_weak( e, t, std::memory_order_relaxed ) ) {}
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
thread_local ProducerWrapper init_order(108) s_token { CreateProducerToken() };
|
||||
thread_local ProducerWrapper init_order(108) s_token { s_queue.get_explicit_producer( s_token_detail ) };
|
||||
thread_local ThreadHandleWrapper init_order(104) s_threadHandle { detail::GetThreadHandleImpl() };
|
||||
|
||||
# ifdef _MSC_VER
|
||||
@@ -1440,36 +1419,12 @@ thread_local ThreadHandleWrapper init_order(104) s_threadHandle { detail::GetThr
|
||||
# pragma init_seg( ".CRT$XCB" )
|
||||
# endif
|
||||
|
||||
static int64_t GetInitTimeImpl()
|
||||
{
|
||||
auto t = SetupHwTimer();
|
||||
const auto e = s_earlyTokenTime.load( std::memory_order_relaxed );
|
||||
if( e != 0 && e < t ) t = e;
|
||||
s_initTimeConstructed = true;
|
||||
return t;
|
||||
}
|
||||
static InitTimeWrapper init_order(101) s_initTime { GetInitTimeImpl() };
|
||||
static InitTimeWrapper init_order(101) s_initTime { SetupHwTimer() };
|
||||
std::atomic<int> init_order(102) RpInitDone( 0 );
|
||||
std::atomic<int> init_order(102) RpInitLock( 0 );
|
||||
thread_local bool RpThreadInitDone = false;
|
||||
thread_local bool RpThreadShutdown = false;
|
||||
moodycamel::ConcurrentQueue<QueueItem> init_order(103) s_queue( QueuePrealloc );
|
||||
|
||||
# ifndef _MSC_VER
|
||||
// An instrumented shared object may emit zones from its static initializers, which the
|
||||
// dynamic loader runs before any of the executable's constructors, including the
|
||||
// priority-ordered constructor of s_queue above. The main thread producer token (s_token)
|
||||
// is then lazily created against the zero-initialized queue memory, and the queue
|
||||
// constructor subsequently orphans it, making all zones emitted on the main thread
|
||||
// invisible to the consumer. Re-adopt such a producer here. If no zones were emitted up
|
||||
// to this point, this only triggers construction of s_token, which is a no-op repair.
|
||||
struct EarlyMainThreadTokenRepair
|
||||
{
|
||||
EarlyMainThreadTokenRepair() { if( s_token.ptr ) s_queue.readopt_orphaned_producer( s_token.ptr ); }
|
||||
};
|
||||
static EarlyMainThreadTokenRepair init_order(104) s_earlyMainThreadTokenRepair;
|
||||
# endif
|
||||
|
||||
std::atomic<uint32_t> init_order(104) s_lockCounter( 0 );
|
||||
std::atomic<uint8_t> init_order(104) s_gpuCtxCounter( 0 );
|
||||
|
||||
@@ -5302,7 +5257,7 @@ TRACY_API int32_t ___tracy_before_lock_shared_shared_lockable_ctx( struct __trac
|
||||
return static_cast<int32_t>(true);
|
||||
}
|
||||
|
||||
TRACY_API void ___tracy_after_lock_shared_shared_lockable_ctx( struct __tracy_shared_lockable_context_data* lockdata )
|
||||
TRACY_API void ___tracy_after_locked_shared_shared_lockable_ctx( struct __tracy_shared_lockable_context_data* lockdata )
|
||||
{
|
||||
auto item = tracy::Profiler::QueueSerial();
|
||||
tracy::MemWrite( &item->hdr.type, tracy::QueueType::LockSharedObtain );
|
||||
|
||||
@@ -52,8 +52,20 @@ public:
|
||||
RingBuffer( const RingBuffer& ) = delete;
|
||||
RingBuffer& operator=( const RingBuffer& ) = delete;
|
||||
|
||||
RingBuffer( RingBuffer&& other ) = delete;
|
||||
RingBuffer& operator=( RingBuffer&& other ) = delete;
|
||||
// Move constructor: takes over the complete state of `other` and leaves
// `other` invalidated (IsValid() returns false for it afterwards).
//
// Fix: the previous code passed the memcpy arguments in the wrong order
// (destination first, source second), so it overwrote `other` with the
// uninitialized contents of the object under construction and then cleared
// the new object instead of the moved-from one.
RingBuffer( RingBuffer&& other )
{
    memcpy( (char*)this, (char*)&other, sizeof( RingBuffer ) );
    // Detach the source so that it no longer refers to the transferred
    // resources. NOTE(review): assumes a null m_metadata / zero m_fd is what
    // the destructor treats as "nothing to release" - confirm.
    other.m_metadata = nullptr;
    other.m_fd = 0;
}
|
||||
|
||||
// Move assignment: this object takes over the state of `other`.
//
// Fix: the previous code passed the memcpy arguments in the wrong order
// (destination first, source second), overwriting `other` with this object's
// state and then invalidating this object, i.e. the opposite of a move. It
// also had no protection against self-assignment.
//
// The two objects exchange their raw state, so whatever this object held
// before ends up in `other`.
// NOTE(review): this relies on `other` being destroyed normally so that the
// previous state is released there - confirm against the destructor.
RingBuffer& operator=( RingBuffer&& other )
{
    if( this != &other )
    {
        char previous[sizeof( RingBuffer )];
        memcpy( previous, (char*)this, sizeof( RingBuffer ) );
        memcpy( (char*)this, (char*)&other, sizeof( RingBuffer ) );
        memcpy( (char*)&other, previous, sizeof( RingBuffer ) );
    }
    return *this;
}
|
||||
|
||||
bool IsValid() const { return m_metadata != nullptr; }
|
||||
int GetId() const { return m_id; }
|
||||
|
||||
@@ -171,8 +171,8 @@ struct ConcurrentQueueDefaultTraits
|
||||
#if defined(malloc) || defined(free)
|
||||
// Gah, this is 2015, stop defining macros that break standard code already!
|
||||
// Work around malloc/free being special macros:
|
||||
static inline void* WORKAROUND_malloc(size_t size) { return tracy::tracy_malloc(size); }
|
||||
static inline void WORKAROUND_free(void* ptr) { return tracy::tracy_free(ptr); }
|
||||
static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); }
|
||||
static inline void WORKAROUND_free(void* ptr) { return free(ptr); }
|
||||
static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); }
|
||||
static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); }
|
||||
#else
|
||||
@@ -1210,21 +1210,6 @@ private:
|
||||
return static_cast<ExplicitProducer*>(token.producer);
|
||||
}
|
||||
|
||||
// If a producer token is created before the constructor of a statically allocated
|
||||
// queue runs (which may happen due to the undefined order of static initialization
|
||||
// across module boundaries), the constructor will orphan it by resetting the
|
||||
// producer list. Such a producer is functional, as producer creation works on the
|
||||
// zero-initialized queue memory, but the consumer is not able to see the data it
|
||||
// enqueues. This method links the producer back into the list.
|
||||
// Links `producer` into the producer list unless it is already there.
// Walks the list starting at producerListTail; returns false if the producer
// is found (nothing is changed), otherwise adds it and returns true.
bool readopt_orphaned_producer(ExplicitProducer* producer)
{
	const auto wanted = static_cast<ProducerBase*>(producer);

	auto node = producerListTail.load(std::memory_order_relaxed);
	while (node != nullptr) {
		if (node == wanted) {
			return false;
		}
		node = node->next_prod();
	}

	add_producer(wanted);
	return true;
}
|
||||
|
||||
private:
|
||||
|
||||
//////////////////////////////////
|
||||
|
||||
@@ -492,8 +492,7 @@ enum class GpuContextType : uint8_t
|
||||
Metal,
|
||||
Custom,
|
||||
CUDA,
|
||||
Rocprof,
|
||||
WebGPU
|
||||
Rocprof
|
||||
};
|
||||
|
||||
enum GpuContextFlags : uint8_t
|
||||
|
||||
@@ -391,7 +391,7 @@ TRACY_API void ___tracy_after_lock_shared_lockable_ctx( struct __tracy_shared_lo
|
||||
TRACY_API void ___tracy_after_unlock_shared_lockable_ctx( struct __tracy_shared_lockable_context_data* lockdata );
|
||||
TRACY_API void ___tracy_after_try_lock_shared_lockable_ctx( struct __tracy_shared_lockable_context_data* lockdata, int32_t acquired );
|
||||
TRACY_API int32_t ___tracy_before_lock_shared_shared_lockable_ctx( struct __tracy_shared_lockable_context_data* lockdata );
|
||||
TRACY_API void ___tracy_after_lock_shared_shared_lockable_ctx( struct __tracy_shared_lockable_context_data* lockdata );
|
||||
TRACY_API void ___tracy_after_locked_shared_shared_lockable_ctx( struct __tracy_shared_lockable_context_data* lockdata );
|
||||
TRACY_API void ___tracy_after_unlock_shared_shared_lockable_ctx( struct __tracy_shared_lockable_context_data* lockdata );
|
||||
TRACY_API void ___tracy_after_try_lock_shared_shared_lockable_ctx( struct __tracy_shared_lockable_context_data* lockdata, int32_t acquired );
|
||||
TRACY_API void ___tracy_mark_shared_lockable_ctx( struct __tracy_shared_lockable_context_data* lockdata, const struct ___tracy_source_location_data* srcloc );
|
||||
@@ -414,7 +414,7 @@ TRACY_API void ___tracy_custom_name_shared_lockable_ctx( struct __tracy_shared_l
|
||||
#define TracyCSharedLockAfterUnlock( lock ) ___tracy_after_unlock_shared_lockable_ctx( lock );
|
||||
#define TracyCSharedLockAfterTryLock( lock, acquired ) ___tracy_after_try_lock_shared_lockable_ctx( lock, acquired );
|
||||
#define TracyCSharedLockBeforeSharedLock( lock ) ___tracy_before_lock_shared_shared_lockable_ctx( lock );
|
||||
#define TracyCSharedLockAfterSharedLock( lock ) ___tracy_after_lock_shared_shared_lockable_ctx( lock );
|
||||
#define TracyCSharedLockAfterSharedLock( lock ) ___tracy_after_locked_shared_shared_lockable_ctx( lock );
|
||||
#define TracyCSharedLockAfterSharedUnlock( lock ) ___tracy_after_unlock_shared_shared_lockable_ctx( lock );
|
||||
#define TracyCSharedLockAfterTrySharedLock( lock, acquired ) ___tracy_after_try_lock_shared_shared_lockable_ctx( lock, acquired );
|
||||
#define TracyCSharedLockMark( lock ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, 0 }; ___tracy_mark_shared_lockable_ctx( lock, &TracyConcat(__tracy_source_location,TracyLine) );
|
||||
|
||||
@@ -34,9 +34,7 @@ public:
|
||||
#include <atomic>
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef TRACY_OPENGL_AUTO_CALIBRATION
|
||||
# include <chrono>
|
||||
#endif
|
||||
#include <chrono>
|
||||
|
||||
#include "Tracy.hpp"
|
||||
#include "../client/TracyProfiler.hpp"
|
||||
@@ -102,34 +100,24 @@ public:
|
||||
|
||||
glGenQueries( QueryCount, m_query );
|
||||
|
||||
int64_t tgpu;
|
||||
glGetInteger64v( GL_TIMESTAMP, &tgpu );
|
||||
int64_t tcpu = Profiler::GetTime();
|
||||
|
||||
GLint bits;
|
||||
glGetQueryiv( GL_TIMESTAMP, GL_QUERY_COUNTER_BITS, &bits );
|
||||
|
||||
#ifdef TRACY_OPENGL_AUTO_CALIBRATION
|
||||
// The anchor above is never refreshed; advertise calibration and emit periodic
|
||||
// GpuCalibration events to correct CPU/GPU drift (see Recalibrate). Opt-in,
|
||||
// because Recalibrate() calls glGetInteger64v( GL_TIMESTAMP ), which forces a
|
||||
// CPU/GPU sync.
|
||||
int64_t tcpu, tgpu;
|
||||
double period;
|
||||
CalibrateClocks( tcpu, tgpu, period, 100 );
|
||||
m_prevCalibration = GetHostTimeNs();
|
||||
|
||||
GpuContextFlags flags = GpuContextFlags(0);
|
||||
#ifdef TRACY_OPENGL_AUTO_CALIBRATION
|
||||
flags = GpuContextFlags::GpuContextCalibration;
|
||||
#endif
|
||||
|
||||
const float period = 1.f;
|
||||
const auto thread = GetThreadHandle();
|
||||
TracyLfqPrepare( QueueType::GpuNewContext );
|
||||
MemWrite( &item->gpuNewContext.cpuTime, tcpu );
|
||||
MemWrite( &item->gpuNewContext.gpuTime, tgpu );
|
||||
MemWrite( &item->gpuNewContext.thread, thread );
|
||||
MemWrite( &item->gpuNewContext.period, period );
|
||||
MemWrite( &item->gpuNewContext.period, (float)period );
|
||||
MemWrite( &item->gpuNewContext.context, m_context );
|
||||
#ifdef TRACY_OPENGL_AUTO_CALIBRATION
|
||||
MemWrite( &item->gpuNewContext.flags, GpuContextFlags( GpuContextCalibration ) );
|
||||
#else
|
||||
MemWrite( &item->gpuNewContext.flags, GpuContextFlags( 0 ) );
|
||||
#endif
|
||||
MemWrite( &item->gpuNewContext.flags, flags );
|
||||
MemWrite( &item->gpuNewContext.type, GpuContextType::OpenGl );
|
||||
|
||||
#ifdef TRACY_ON_DEMAND
|
||||
@@ -194,7 +182,6 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
#ifdef TRACY_OPENGL_AUTO_CALIBRATION
|
||||
// Monotonic host ns for the inter-calibration interval (cpuDelta), kept
|
||||
// separate from Profiler::GetTime() as in the D3D12/Vulkan backends.
|
||||
static tracy_force_inline int64_t GetHostTimeNs()
|
||||
@@ -203,28 +190,65 @@ private:
|
||||
std::chrono::steady_clock::now().time_since_epoch() ).count();
|
||||
}
|
||||
|
||||
// OpenGL has no atomic CPU+GPU timestamp query, so sample back-to-back; the
|
||||
// gap is negligible against the recalibration interval below. Note this forces
|
||||
// a CPU/GPU sync, which is why the whole path is opt-in (TRACY_OPENGL_AUTO_CALIBRATION).
|
||||
tracy_force_inline void Recalibrate()
|
||||
{
|
||||
ZoneScopedC( Color::Red4 );
|
||||
|
||||
const int64_t hostNow = GetHostTimeNs();
|
||||
const int64_t delta = hostNow - m_prevCalibration;
|
||||
if( delta < 1000ll * 1000 * 1000 ) return; // throttle: ~once per second
|
||||
|
||||
int64_t tgpu;
|
||||
glGetInteger64v( GL_TIMESTAMP, &tgpu );
|
||||
const int64_t refCpu = Profiler::GetTime();
|
||||
int64_t tcpu, tgpu;
|
||||
double period;
|
||||
// perform a single iteration of CalibrateClocks
|
||||
CalibrateClocks( tcpu, tgpu, period, 0 );
|
||||
m_prevCalibration = hostNow;
|
||||
|
||||
TracyLfqPrepare( QueueType::GpuCalibration );
|
||||
MemWrite( &item->gpuCalibration.gpuTime, tgpu );
|
||||
MemWrite( &item->gpuCalibration.cpuTime, refCpu );
|
||||
MemWrite( &item->gpuCalibration.cpuTime, tcpu );
|
||||
MemWrite( &item->gpuCalibration.cpuDelta, delta );
|
||||
MemWrite( &item->gpuCalibration.context, m_context );
|
||||
TracyLfqCommit;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void CalibrateClocks( int64_t& outCpuTime, int64_t& outGpuTime, double& outPeriod, int timeout )
|
||||
{
|
||||
ZoneScopedC( Color::Red4 );
|
||||
|
||||
using Clock = std::chrono::steady_clock;
|
||||
const auto deadline = Clock::now() + std::chrono::milliseconds( timeout );
|
||||
|
||||
int64_t bestRange = int64_t( ~uint64_t(0) >> 1 );
|
||||
int64_t bestCpu = 0;
|
||||
int64_t bestGpu = 0;
|
||||
|
||||
for( int i = 0; i < 1000; ++i )
|
||||
{
|
||||
int64_t tgpu;
|
||||
const int64_t cpu0 = Profiler::GetTime();
|
||||
// querying the GPU timestamp with glGetInteger64v will block until done
|
||||
glGetInteger64v( GL_TIMESTAMP, &tgpu );
|
||||
const int64_t cpu1 = Profiler::GetTime();
|
||||
|
||||
// the tightest CPU interval wins (less uncertainty, better correlation)
|
||||
const int64_t range = cpu1 - cpu0;
|
||||
if( range < bestRange )
|
||||
{
|
||||
bestRange = range;
|
||||
//bestCpu = cpu0 + range / 2; // mid-point estimate
|
||||
bestCpu = cpu1; // right-bias estimate
|
||||
bestGpu = tgpu;
|
||||
}
|
||||
|
||||
if( Clock::now() >= deadline ) break;
|
||||
}
|
||||
|
||||
outCpuTime = bestCpu;
|
||||
outGpuTime = bestGpu;
|
||||
// ARB_timer_query stipulates GL_TIMESTAMP values to be in nanoseconds
|
||||
outPeriod = 1.0; // 1ns / gpu-tick
|
||||
}
|
||||
|
||||
tracy_force_inline unsigned int NextQueryId()
|
||||
{
|
||||
@@ -250,9 +274,7 @@ private:
|
||||
unsigned int m_head;
|
||||
unsigned int m_tail;
|
||||
|
||||
#ifdef TRACY_OPENGL_AUTO_CALIBRATION
|
||||
int64_t m_prevCalibration; // host-ns timestamp of the last emitted calibration
|
||||
#endif
|
||||
};
|
||||
|
||||
class GpuCtxScope
|
||||
|
||||
@@ -1,968 +0,0 @@
|
||||
#ifndef __TRACYWEBGPU_HPP__
|
||||
#define __TRACYWEBGPU_HPP__
|
||||
|
||||
// WebGPU, unlike other graphics APIs, has many annoying restrictions that complicate
|
||||
// the design of the Tracy WebGPU back-end:
|
||||
// - there's no CPU/GPU clock calibration API
|
||||
// - submitting GPU commands that touch a buffer that the host is mapping is not permitted
|
||||
// - resolving timestamps requires destination offsets aligned to 256 bytes
|
||||
// - timestamps are only available at pass granularity (implementations may need to emulate this)
|
||||
// - spec mandates timestamps to be in nanoseconds (implementations may need to emulate this)
|
||||
|
||||
#ifndef TRACY_ENABLE
|
||||
|
||||
#define TracyWebGPUSetupDeviceDescriptor(deviceDescriptor)
|
||||
|
||||
#define TracyWebGPUContext(instance, device, queue) nullptr
|
||||
#define TracyWebGPUDestroy(ctx)
|
||||
#define TracyWebGPUContextName(ctx, name, size)
|
||||
|
||||
#define TracyWebGPUZone(ctx, encoder, passDesc, name)
|
||||
#define TracyWebGPUZoneC(ctx, encoder, passDesc, name, color)
|
||||
#define TracyWebGPUNamedZone(ctx, varname, encoder, passDesc, name, active)
|
||||
#define TracyWebGPUNamedZoneC(ctx, varname, encoder, passDesc, name, color, active)
|
||||
#define TracyWebGPUZoneTransient(ctx, varname, encoder, passDesc, name, active)
|
||||
|
||||
#define TracyWebGPUZoneS(ctx, encoder, passDesc, name, depth)
|
||||
#define TracyWebGPUZoneCS(ctx, encoder, passDesc, name, color, depth)
|
||||
#define TracyWebGPUNamedZoneS(ctx, varname, encoder, passDesc, name, depth, active)
|
||||
#define TracyWebGPUNamedZoneCS(ctx, varname, encoder, passDesc, name, color, depth, active)
|
||||
#define TracyWebGPUZoneTransientS(ctx, varname, encoder, passDesc, name, depth, active)
|
||||
|
||||
#define TracyWebGPUCollect(ctx)
|
||||
|
||||
namespace tracy
|
||||
{
|
||||
// Empty stand-in for the zone scope type, used when TRACY_ENABLE is not defined.
class WebGPUZoneScope {};
|
||||
}
|
||||
|
||||
using TracyWebGPUCtx = void*;
|
||||
|
||||
#else
|
||||
|
||||
#include "Tracy.hpp"
|
||||
#include "../client/TracyProfiler.hpp"
|
||||
#include "../client/TracyCallstack.hpp"
|
||||
#include "../common/TracyAlign.hpp"
|
||||
#include "../common/TracyAlloc.hpp"
|
||||
|
||||
#include <atomic>
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
|
||||
#include <webgpu/webgpu.h>
|
||||
|
||||
// piggy-back on WGPU_DAWN_TOGGLES_DESCRIPTOR_INIT to detect Dawn header
|
||||
#ifdef WGPU_DAWN_TOGGLES_DESCRIPTOR_INIT
|
||||
#define TRACY_WEBGPU_DAWN_NATIVE (1)
|
||||
#include <dawn/native/DawnNative.h>
|
||||
#else
|
||||
#define TRACY_WEBGPU_WGPU_NATIVE (1)
|
||||
#include <webgpu/wgpu.h>
|
||||
#endif
|
||||
|
||||
#ifndef TRACY_WEBGPU_DEBUG_LEVEL
|
||||
#define TRACY_WEBGPU_DEBUG_LEVEL (0)
|
||||
#endif//TRACY_WEBGPU_DEBUG_LEVEL
|
||||
|
||||
#if TRACY_WEBGPU_DEBUG_LEVEL
|
||||
#define TracyWebGPUDebug(...) __VA_ARGS__;
|
||||
#if defined(_MSC_VER)
|
||||
extern "C" int32_t IsDebuggerPresent(void);
|
||||
#define TracyWebGPUBreak() if (IsDebuggerPresent()) __debugbreak()
|
||||
#else
|
||||
#define TracyWebGPUBreak() ((void)0)
|
||||
#endif
|
||||
#define TracyWebGPUAssert(predicate, ...) if (predicate) {} else { __VA_ARGS__; TracyWebGPUBreak(); }
|
||||
#else
|
||||
#define TracyWebGPUDebug(...)
|
||||
#define TracyWebGPUBreak()
|
||||
#define TracyWebGPUAssert(predicate, ...) assert(predicate);
|
||||
#endif
|
||||
|
||||
#define TracyWebGPULog(severity, msg) fprintf(stdout, "%s", msg), tracy::Profiler::LogString( tracy::MessageSourceType::Tracy, tracy::MessageSeverity::severity, tracy::Color::Red4, 0, msg );
|
||||
#define TracyWebGPUPanic(msg, ...) do { TracyWebGPULog(Error, msg); TracyWebGPUAssert(false && "TracyWebGPU: " msg); __VA_ARGS__; } while(false);
|
||||
|
||||
namespace tracy
|
||||
{
|
||||
|
||||
class WebGPUQueueCtx
|
||||
{
|
||||
friend class WebGPUZoneScope;
|
||||
|
||||
uint8_t m_contextId = 255; // 255 represents "invalid id"
|
||||
|
||||
std::mutex m_collectionMutex;
|
||||
|
||||
WGPUInstance m_instance = nullptr;
|
||||
WGPUDevice m_device = nullptr;
|
||||
WGPUQueue m_queue = nullptr;
|
||||
|
||||
// State of one readback slot; m_readbackReel holds three of these.
struct ReadbackStage
|
||||
{
|
||||
// NOTE(review): presumably the mappable buffer timestamps are copied into - confirm.
WGPUBuffer buffer = nullptr;
|
||||
// NOTE(review): looks like a query-counter watermark for this slot - confirm against its users.
std::atomic<uint64_t> copiedUpto {0};
|
||||
// Atomic map status; value-initialized here.
std::atomic<WGPUMapAsyncStatus> mapStatus = {};
|
||||
// NOTE(review): presumably the future of an in-flight map request - confirm.
WGPUFuture pendingFuture = {};
|
||||
};
|
||||
static_assert(std::atomic<WGPUMapAsyncStatus>::is_always_lock_free, "WGPUMapAsyncStatus must be lock-free atomic");
|
||||
|
||||
WGPUQuerySet m_querySet = nullptr;
|
||||
WGPUBuffer m_resolveBuffer = nullptr;
|
||||
ReadbackStage m_readbackReel [3];
|
||||
std::atomic<int> m_writeIdx {0};
|
||||
|
||||
using atomic_counter = std::atomic<uint64_t>;
|
||||
atomic_counter m_queryCounter = 0;
|
||||
atomic_counter m_previousCheckpoint = 0;
|
||||
|
||||
uint32_t m_queryLimit = 0;
|
||||
|
||||
std::vector<uint64_t> m_shadowBuffer;
|
||||
|
||||
// Time point type of std::chrono::steady_clock, used for the wait deadlines below.
using WallTime = std::chrono::steady_clock::time_point;
|
||||
// Returns the current std::chrono::steady_clock time.
static tracy_force_inline auto GetWallTime() { return WallTime::clock::now(); }
|
||||
// Converts a millisecond count into a std::chrono::milliseconds duration.
static tracy_force_inline auto Milliseconds(int value) { return std::chrono::milliseconds(value); }
|
||||
|
||||
// Pumps instance events until all work submitted to `queue` so far has completed,
// or until a 2-second deadline expires.
// Returns true when the work-done callback fired (whatever status it reported),
// false when the deadline was hit first.
static bool WaitQueueIdle(WGPUQueue queue, WGPUInstance instance)
{
    // The completion flag lives on the heap rather than on the stack: if we give up
    // at the deadline, the callback may still run during a later
    // wgpuInstanceProcessEvents() call and must not write into a dead stack frame.
    // Exactly one side wins the Pending -> {Done, Abandoned} transition; the side
    // that observes the other's transition is the one that frees the flag.
    enum : int { Pending = 0, Done = 1, Abandoned = 2 };
    using Flag = std::atomic<int>;
    auto* flag = new (tracy_malloc(sizeof(Flag))) Flag(Pending);

    WGPUQueueWorkDoneCallbackInfo doneCB = {};
    doneCB.mode = WGPUCallbackMode_AllowProcessEvents;
    doneCB.callback = [](WGPUQueueWorkDoneStatus, WGPUStringView, void* userData, void*) {
        auto* f = static_cast<Flag*>(userData);
        int expected = Pending;
        if (!f->compare_exchange_strong(expected, Done))
            tracy_free(f);  // the waiter already timed out and abandoned the flag
    };
    doneCB.userdata1 = flag;
    wgpuQueueOnSubmittedWorkDone(queue, doneCB);

    const auto deadline = GetWallTime() + Milliseconds(2000);
    while (flag->load() == Pending && GetWallTime() < deadline)
        wgpuInstanceProcessEvents(instance);

    int expected = Pending;
    if (flag->compare_exchange_strong(expected, Abandoned))
        return false;       // timed out: ownership of the flag passes to the late callback

    tracy_free(flag);       // callback already ran; nobody else references the flag
    return true;
}
|
||||
|
||||
// Maps the first two 64-bit values of `buffer` for reading and waits (pumping
// instance events) for the request to complete, for at most 2 seconds.
// Returns a pointer to the mapped range, or nullptr when the request timed out or
// finished with a status other than WGPUMapAsyncStatus_Success.
// The caller is responsible for calling wgpuBufferUnmap() after a successful map.
static const uint64_t* MapBufferSync(WGPUBuffer buffer, WGPUInstance instance)
{
    // The callback state lives on the heap rather than on the stack: after a timeout
    // the callback may still run during a later wgpuInstanceProcessEvents() call and
    // must not write into a dead stack frame. Exactly one side wins the
    // Pending -> {Done, Abandoned} transition; the other side frees the state.
    enum : int { Pending = 0, Done = 1, Abandoned = 2 };
    struct MapCtx
    {
        std::atomic<int> phase { Pending };
        WGPUMapAsyncStatus status = {};
    };
    auto* ctx = new (tracy_malloc(sizeof(MapCtx))) MapCtx;

    WGPUBufferMapCallbackInfo cbInfo = {};
    cbInfo.mode = WGPUCallbackMode_AllowProcessEvents;
    cbInfo.callback = [](WGPUMapAsyncStatus status, WGPUStringView, void* userData, void*) {
        auto* c = static_cast<MapCtx*>(userData);
        c->status = status;     // published to the waiter by the exchange below
        int expected = Pending;
        if (!c->phase.compare_exchange_strong(expected, Done))
            tracy_free(c);      // the waiter already timed out and abandoned the state
    };
    cbInfo.userdata1 = ctx;

    const size_t offset = 0;
    const size_t size = 2 * sizeof(uint64_t);
    wgpuBufferMapAsync(buffer, WGPUMapMode_Read, offset, size, cbInfo);

    const auto deadline = GetWallTime() + Milliseconds(2000);
    while (ctx->phase.load() == Pending && GetWallTime() < deadline)
        wgpuInstanceProcessEvents(instance);

    int expected = Pending;
    if (ctx->phase.compare_exchange_strong(expected, Abandoned))
        return nullptr;         // timed out: the late callback now owns (and frees) ctx

    const WGPUMapAsyncStatus status = ctx->status;
    tracy_free(ctx);
    if (status != WGPUMapAsyncStatus_Success) return nullptr;

    auto data = wgpuBufferGetConstMappedRange(buffer, offset, size);
    return static_cast<const uint64_t*>(data);
}
|
||||
|
||||
struct Calibration {
|
||||
int64_t minCpuRange = ~uint64_t(0) >> 1;
|
||||
// Incremental (single-pass) simple linear regression y = Slope()*x + Intercept().
// All accumulators are 64-bit integers, so the running means are updated with
// truncating integer division.
struct Regression
{
    int64_t n { 0 };        // number of samples seen
    int64_t mean_x { 0 };   // running mean of x
    int64_t mean_y { 0 };   // running mean of y
    int64_t S_xx { 0 };     // accumulated x-deviation products
    int64_t S_xy { 0 };     // accumulated x/y co-deviation products

    // Folds one (x, y) sample into the running means and sums.
    void Update(int64_t x, int64_t y)
    {
        const int64_t count = ++n;
        // deviations from the means *before* this sample is folded in
        const int64_t dx = x - mean_x;
        const int64_t dy = y - mean_y;
        mean_x += dx / count;
        mean_y += dy / count;
        // residuals against the means *after* this sample is folded in
        const int64_t rx = x - mean_x;
        const int64_t ry = y - mean_y;
        S_xx += dx * rx;
        S_xy += dx * ry;
    }

    // Slope of the fitted line; not finite while S_xx is zero.
    double Slope() const
    {
        const double numerator = static_cast<double>(S_xy);
        const double denominator = static_cast<double>(S_xx);
        return numerator / denominator;
    }

    // y-intercept of the fitted line (the line passes through the mean point).
    double Intercept() const
    {
        return static_cast<double>(mean_y) - Slope() * static_cast<double>(mean_x);
    }
};
|
||||
Regression cpuToGpuModel; // cpu-ticks to gpu-ticks
|
||||
Regression cpuRangeModel; // cpu-tick interval uncertainty
|
||||
Regression wallToGpuModel; // nanoseconds to gpu-ticks
|
||||
void GetReferenceTime(uint64_t& cpuTime, uint64_t& gpuTime) const
|
||||
{
|
||||
// the mean belongs to the regression line
|
||||
cpuTime = cpuToGpuModel.mean_x;
|
||||
gpuTime = cpuToGpuModel.mean_y;
|
||||
}
|
||||
// Inverse of the wall-to-gpu slope. The slope is gpu ticks per wall-clock nanosecond,
// so the result is nanoseconds per gpu tick. Not finite while the model's slope is
// zero or undefined.
double Period() const { return 1.0 / wallToGpuModel.Slope(); } // ns/tick
|
||||
bool AcceptX(const Regression& r, int64_t x, double threshold = 3.0) const {
|
||||
if (r.n < 2) return true;
|
||||
auto dx = x - r.mean_x;
|
||||
if (dx <= 0) return true; // always accept "tighter" outliers
|
||||
double variance = double(r.S_xx) / (r.n - 1);
|
||||
if (variance == 0.0) return true;
|
||||
// WARN: dx*dx "could" overflow, but very unlikely in practice
|
||||
double zz = (double)(dx*dx) / variance;
|
||||
return zz <= (threshold*threshold);
|
||||
}
|
||||
// Feeds one calibration sample into the models.
// [twall0, twall1] and [tcpu0, tcpu1] bracket the GPU submission on the wall clock
// and on the profiler clock; `tgpu` is the GPU timestamp measured in between.
// Returns false when the sample is rejected because its cpu bracket is an outlier.
bool Update(WallTime twall0, WallTime twall1, uint64_t tcpu0, uint64_t tcpu1, uint64_t tgpu)
{
    // Width of the cpu bracket: the narrower it is, the more trustworthy the sample.
    const uint64_t cpuSpan = tcpu1 - tcpu0;
    const int64_t cpuRange = cpuSpan;
    cpuRangeModel.Update(cpuRange, 0);
    const bool accepted = AcceptX(cpuRangeModel, cpuRange, 1.0);
    if (!accepted)
        return false;

    // Use the mid-point of each bracket as the host-side time of the sample.
    const int64_t tcpu = tcpu0 + cpuSpan / 2;
    const auto wallMid = twall0 + (twall1 - twall0) / 2;
    const int64_t twall =
        std::chrono::duration_cast<std::chrono::nanoseconds>(wallMid.time_since_epoch()).count();

    // Incremental regression against the GPU timestamp.
    cpuToGpuModel.Update(tcpu, tgpu);
    wallToGpuModel.Update(twall, tgpu);
    TracyWebGPUDebug( fprintf(stderr, "----- (sample accepted! wall = %lld | cpu = %lld | gpu = %lld | period = %f)\n", twall, tcpu, tgpu, Period()) );
    return true;
}
|
||||
} m_calibration;
|
||||
|
||||
// Completes a serial-queue item obtained from Profiler::QueueSerial().
// In on-demand builds the item is additionally handed to Profiler::DeferItem.
tracy_force_inline void SubmitQueueItem(tracy::QueueItem* item)
{
#if defined(TRACY_ON_DEMAND)
    tracy::QueueItem& queued = *item;
    GetProfiler().DeferItem(queued);
#endif
    Profiler::QueueSerialFinish();
}
|
||||
|
||||
bool CalibrateClocks(uint64_t& outCpuTime, uint64_t& outGpuTime, double& period)
|
||||
{
|
||||
// WebGPU does not have any clock calibration API.
|
||||
// This routine attempts to estimates a reasonable (cpuTime, gpuTime) correlation
|
||||
// by sampling CPU and GPU timestamps around a "synchronous" draw call.
|
||||
// Several samples are taken to tighten the estimation.
|
||||
|
||||
ZoneScoped;
|
||||
|
||||
WGPUShaderSourceWGSL wgslSrc = {};
|
||||
wgslSrc.chain.sType = WGPUSType_ShaderSourceWGSL;
|
||||
wgslSrc.code =
|
||||
{
|
||||
R"(
|
||||
@vertex fn vs(@builtin(vertex_index) i: u32) -> @builtin(position) vec4f {
|
||||
var p = array(vec4f(-1,-1,.5,1), vec4f(3,-1,.5,1), vec4f(-1,3,.5,1));
|
||||
return p[i];
|
||||
}
|
||||
@fragment fn fs() -> @location(0) vec4f { return vec4f(0.0); }
|
||||
)",
|
||||
WGPU_STRLEN
|
||||
};
|
||||
WGPUShaderModuleDescriptor smDesc = {};
|
||||
smDesc.nextInChain = reinterpret_cast<WGPUChainedStruct*>(&wgslSrc);
|
||||
WGPUShaderModule calibShader = wgpuDeviceCreateShaderModule(m_device, &smDesc);
|
||||
if (!calibShader) { TracyWebGPUPanic("Failed to create calibration shader.", return false); }
|
||||
|
||||
WGPUTextureDescriptor texDesc = {};
|
||||
texDesc.usage = WGPUTextureUsage_RenderAttachment;
|
||||
texDesc.dimension = WGPUTextureDimension_2D;
|
||||
texDesc.size = { 1, 1, 1 };
|
||||
texDesc.format = WGPUTextureFormat_BGRA8Unorm;
|
||||
texDesc.mipLevelCount = 1;
|
||||
texDesc.sampleCount = 1;
|
||||
WGPUTexture tex = wgpuDeviceCreateTexture(m_device, &texDesc);
|
||||
if (!tex) { wgpuShaderModuleRelease(calibShader); TracyWebGPUPanic("Failed to create calibration scratch texture.", return false); }
|
||||
WGPUTextureView texView = wgpuTextureCreateView(tex, nullptr);
|
||||
if (!texView) { wgpuTextureRelease(tex); wgpuShaderModuleRelease(calibShader); TracyWebGPUPanic("Failed to create calibration scratch texture view.", return false); }
|
||||
|
||||
WGPUColorTargetState colorTarget = {};
|
||||
colorTarget.format = WGPUTextureFormat_BGRA8Unorm;
|
||||
colorTarget.writeMask = WGPUColorWriteMask_All;
|
||||
WGPUFragmentState fragState = {};
|
||||
fragState.module = calibShader;
|
||||
fragState.entryPoint = { "fs", WGPU_STRLEN };
|
||||
fragState.targetCount = 1;
|
||||
fragState.targets = &colorTarget;
|
||||
WGPURenderPipelineDescriptor pipeDesc = {};
|
||||
pipeDesc.vertex.module = calibShader;
|
||||
pipeDesc.vertex.entryPoint = { "vs", WGPU_STRLEN };
|
||||
pipeDesc.primitive.topology = WGPUPrimitiveTopology_TriangleList;
|
||||
pipeDesc.multisample.count = 1;
|
||||
pipeDesc.fragment = &fragState;
|
||||
WGPURenderPipeline calibPipeline = wgpuDeviceCreateRenderPipeline(m_device, &pipeDesc);
|
||||
if (!calibPipeline) { wgpuTextureViewRelease(texView); wgpuTextureRelease(tex); wgpuShaderModuleRelease(calibShader); TracyWebGPUPanic("Failed to create calibration pipeline.", return false); }
|
||||
|
||||
uint32_t queryId = 0;
|
||||
WGPUPassTimestampWrites anchorTs = {};
|
||||
anchorTs.querySet = m_querySet;
|
||||
anchorTs.beginningOfPassWriteIndex = queryId;
|
||||
anchorTs.endOfPassWriteIndex = queryId+1;
|
||||
|
||||
WGPURenderPassColorAttachment att = {};
|
||||
att.view = texView;
|
||||
att.loadOp = WGPULoadOp_Clear;
|
||||
att.storeOp = WGPUStoreOp_Store;
|
||||
att.depthSlice = WGPU_DEPTH_SLICE_UNDEFINED;
|
||||
|
||||
WGPURenderPassDescriptor passDesc = {};
|
||||
passDesc.colorAttachmentCount = 1;
|
||||
passDesc.colorAttachments = &att;
|
||||
passDesc.timestampWrites = &anchorTs;
|
||||
|
||||
// calibration loop
|
||||
const auto deadline = GetWallTime() + Milliseconds(100);
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
{
|
||||
// loop until time budget (100ms) allows, but ensure at least 5 iterations
|
||||
if ((GetWallTime() >= deadline) && (i > 5))
|
||||
break;
|
||||
|
||||
WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(m_device, nullptr);
|
||||
if (!enc) { TracyWebGPUPanic("Failed to create command encoder for time calibration.", return false); }
|
||||
|
||||
WGPURenderPassEncoder pass = wgpuCommandEncoderBeginRenderPass(enc, &passDesc);
|
||||
wgpuRenderPassEncoderSetPipeline(pass, calibPipeline);
|
||||
wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);
|
||||
wgpuRenderPassEncoderEnd(pass);
|
||||
wgpuRenderPassEncoderRelease(pass);
|
||||
|
||||
WGPUBuffer readBackBuffer = m_readbackReel[0].buffer;
|
||||
uint32_t byteOffset = queryId * sizeof(uint64_t);
|
||||
uint32_t sizeInBytes = 2 * sizeof(uint64_t);
|
||||
wgpuCommandEncoderResolveQuerySet(enc, m_querySet, queryId, 2, m_resolveBuffer, byteOffset);
|
||||
wgpuCommandEncoderCopyBufferToBuffer(enc, m_resolveBuffer, byteOffset, readBackBuffer, byteOffset, sizeInBytes);
|
||||
|
||||
WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, nullptr);
|
||||
wgpuCommandEncoderRelease(enc);
|
||||
if (!cmd) { TracyWebGPUPanic("Failed to finish calibration command encoder.", return false); }
|
||||
|
||||
WaitQueueIdle(m_queue, m_instance);
|
||||
int64_t cpu [2] = {};
|
||||
int64_t gpu [2] = {};
|
||||
WallTime wall [2] = {};
|
||||
cpu[0] = Profiler::GetTime();
|
||||
wall[0] = GetWallTime();
|
||||
wgpuQueueSubmit(m_queue, 1, &cmd);
|
||||
wgpuCommandBufferRelease(cmd);
|
||||
WaitQueueIdle(m_queue, m_instance);
|
||||
wall[1] = GetWallTime();
|
||||
cpu[1] = Profiler::GetTime();
|
||||
auto gpuTimestamps = MapBufferSync(readBackBuffer, m_instance);
|
||||
TracyWebGPUAssert(gpuTimestamps != nullptr);
|
||||
gpu[0] = gpuTimestamps[0];
|
||||
gpu[1] = gpuTimestamps[1];
|
||||
wgpuBufferUnmap(readBackBuffer);
|
||||
TracyWebGPUDebug(
|
||||
fprintf(stdout, "[%03d] CalibrateClocks() [CPU] %16lld | %16lld | /// %lld\n", i, cpu[0], cpu[1], cpu[1]-cpu[0]);
|
||||
fprintf(stdout, "----------------------- [GPU] %16llu | %16llu | /// %lld\n", gpu[0], gpu[1], gpu[1]-gpu[0]);
|
||||
uint64_t cpuTimeRef, gpuTimeRef;
|
||||
m_calibration.GetReferenceTime(cpuTimeRef, gpuTimeRef);
|
||||
if (gpu[0] < gpuTimeRef)
|
||||
fprintf(stdout, "!!!!! CalibrateClocks() -> WARNING!!! going backwards!\n%llu\n%llu\n%lld\n", gpuTimeRef, gpu[0], gpu[0] - gpuTimeRef);
|
||||
);
|
||||
|
||||
// skip first sample since it is quite jittery (lazy intialization of WebGPU objects)
|
||||
if (i == 0)
|
||||
continue;
|
||||
|
||||
m_calibration.Update(wall[0], wall[1], cpu[0], cpu[1], gpu[0]);
|
||||
};
|
||||
|
||||
TracyWebGPUDebug(
|
||||
fprintf(stdout, "##### CalibrateClocks() WALL = %lld | CPU = %lld | GPU = %lld | period = %f\n",
|
||||
m_calibration.wallToGpuModel.mean_x,
|
||||
m_calibration.cpuToGpuModel.mean_x,
|
||||
m_calibration.cpuToGpuModel.mean_y,
|
||||
m_calibration.Period());
|
||||
);
|
||||
|
||||
wgpuRenderPipelineRelease(calibPipeline);
|
||||
wgpuShaderModuleRelease(calibShader);
|
||||
wgpuTextureViewRelease(texView);
|
||||
wgpuTextureRelease(tex);
|
||||
|
||||
m_calibration.GetReferenceTime(outCpuTime, outGpuTime);
|
||||
period = m_calibration.Period();
|
||||
// assume 1 ns/tick if the period estimation is close enough to 1
|
||||
if (std::abs(period - 1.0) < 0.001)
|
||||
period = 1.0;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public:
|
||||
class Requirements
|
||||
{
|
||||
private:
|
||||
# if (TRACY_WEBGPU_DAWN_NATIVE)
|
||||
WGPUDawnTogglesDescriptor dawnTogglesDesc = {};
|
||||
static constexpr int NumExtras = 0;
|
||||
# elif (TRACY_WEBGPU_WGPU_NATIVE)
|
||||
static constexpr int NumExtras = 1;
|
||||
# endif
|
||||
|
||||
public:
|
||||
static constexpr int NumFeatures = 1 + NumExtras;
|
||||
WGPUFeatureName features [NumFeatures] = {};
|
||||
WGPUChainedStruct* togglesDesc = nullptr;
|
||||
|
||||
Requirements()
|
||||
{
|
||||
this->features[0] = WGPUFeatureName_TimestampQuery;
|
||||
# if (TRACY_WEBGPU_WGPU_NATIVE)
|
||||
this->features[1] = (WGPUFeatureName)WGPUNativeFeature_TimestampQueryInsideEncoders;
|
||||
# endif
|
||||
# if (TRACY_WEBGPU_DAWN_NATIVE)
|
||||
static const char* dawnDisabledToggles[] = { "timestamp_quantization" };
|
||||
static const char* dawnEnabledToggles[] = { "disable_timestamp_query_conversion" };
|
||||
this->dawnTogglesDesc.chain.sType = WGPUSType_DawnTogglesDescriptor;
|
||||
this->dawnTogglesDesc.disabledToggles = dawnDisabledToggles;
|
||||
this->dawnTogglesDesc.disabledToggleCount = 1;
|
||||
this->dawnTogglesDesc.enabledToggles = dawnEnabledToggles;
|
||||
this->dawnTogglesDesc.enabledToggleCount = 1;
|
||||
this->togglesDesc = reinterpret_cast<WGPUChainedStruct*>(&this->dawnTogglesDesc);
|
||||
# endif
|
||||
}
|
||||
|
||||
static bool VerifyDevice(WGPUDevice device)
|
||||
{
|
||||
if (device == nullptr)
|
||||
return false;
|
||||
if (wgpuDeviceHasFeature(device, WGPUFeatureName_TimestampQuery) == WGPU_FALSE)
|
||||
return false;
|
||||
# if (TRACY_WEBGPU_DAWN_NATIVE)
|
||||
bool hasDisableConversion = false, hasQuantization = false;
|
||||
for (const char* t : ::dawn::native::GetTogglesUsed(device))
|
||||
{
|
||||
if (strcmp(t, "disable_timestamp_query_conversion") == 0)
|
||||
hasDisableConversion = true;
|
||||
if (strcmp(t, "timestamp_quantization") == 0)
|
||||
hasQuantization = true;
|
||||
}
|
||||
return hasDisableConversion && !hasQuantization;
|
||||
# elif (TRACY_WEBGPU_WGPU_NATIVE)
|
||||
if (wgpuDeviceHasFeature(device, (WGPUFeatureName)WGPUNativeFeature_TimestampQueryInsideEncoders) == WGPU_FALSE)
|
||||
return false;
|
||||
return true;
|
||||
# endif
|
||||
return false;
|
||||
}
|
||||
|
||||
void ApplyToDeviceDescriptor(WGPUDeviceDescriptor& deviceDescriptor)
|
||||
{
|
||||
size_t userCount = deviceDescriptor.requiredFeatureCount;
|
||||
size_t totalCount = userCount + NumFeatures;
|
||||
// NOTE: this allocation will leak...
|
||||
auto* mergedFeatures = static_cast<WGPUFeatureName*>(tracy_malloc(totalCount * sizeof(WGPUFeatureName)));
|
||||
if (userCount > 0 && deviceDescriptor.requiredFeatures)
|
||||
memcpy(mergedFeatures, deviceDescriptor.requiredFeatures, userCount * sizeof(WGPUFeatureName));
|
||||
memcpy(mergedFeatures + userCount, features, NumFeatures * sizeof(WGPUFeatureName));
|
||||
deviceDescriptor.requiredFeatures = mergedFeatures;
|
||||
deviceDescriptor.requiredFeatureCount = totalCount;
|
||||
|
||||
if (togglesDesc)
|
||||
{
|
||||
togglesDesc->next = deviceDescriptor.nextInChain;
|
||||
deviceDescriptor.nextInChain = togglesDesc;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Sets up timestamp profiling for `queue`: retains the handles, creates the query
// set plus resolve/readback buffers, calibrates the clocks and announces the new
// GPU context to the profiler. On any failure the constructor returns early and
// the context id stays at 255 (invalid).
WebGPUQueueCtx(WGPUInstance instance, WGPUDevice device, WGPUQueue queue)
{
    ZoneScopedC(Color::Red4);

    const bool deviceUsable = Requirements::VerifyDevice(device);
    if (!deviceUsable)
    {
        TracyWebGPUPanic("GPU profiling disabled because the device did not enable the necessary features.", return);
    }

    // Retain the handles for the lifetime of this context.
    TracyWebGPUAssert(instance);
    wgpuInstanceAddRef(instance);
    m_instance = instance;

    TracyWebGPUAssert(device);
    wgpuDeviceAddRef(device);
    m_device = device;

    TracyWebGPUAssert(queue);
    wgpuQueueAddRef(queue);
    m_queue = queue;

    // Setup Query Set: must have even size since queries are issued in pairs.
    // (The WebGPU spec mandates 4096, with no way to query the device limit.)
    // On failure the size is halved and creation retried, down to 128 entries.
    WGPUQuerySetDescriptor querySetDesc = {};
    querySetDesc.type = WGPUQueryType_Timestamp;
    querySetDesc.count = 4096;
    do
    {
        m_querySet = wgpuDeviceCreateQuerySet(m_device, &querySetDesc);
    }
    while (m_querySet == nullptr && (querySetDesc.count /= 2) >= 128);
    if (m_querySet == nullptr)
    {
        TracyWebGPUPanic("Failed to create timestamp query set.", return);
    }
    m_queryLimit = querySetDesc.count;

    // One 64-bit timestamp per query, in every buffer.
    const uint64_t timestampBytes = static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t);

    WGPUBufferDescriptor resolveBufferDesc = {};
    resolveBufferDesc.usage = WGPUBufferUsage_QueryResolve | WGPUBufferUsage_CopySrc;
    resolveBufferDesc.size = timestampBytes;
    m_resolveBuffer = wgpuDeviceCreateBuffer(m_device, &resolveBufferDesc);
    if (m_resolveBuffer == nullptr)
    {
        TracyWebGPUPanic("Failed to create timestamp resolve buffer.", return);
    }

    WGPUBufferDescriptor readbackBufferDesc = {};
    readbackBufferDesc.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
    readbackBufferDesc.size = timestampBytes;
    for (ReadbackStage& stage : m_readbackReel)
    {
        stage.buffer = wgpuDeviceCreateBuffer(m_device, &readbackBufferDesc);
        stage.copiedUpto = 0;
        if (stage.buffer == nullptr)
        {
            TracyWebGPUPanic("Failed to create timestamp readback buffer.", return);
        }
    }

    uint64_t cpuTimestamp = 0;
    uint64_t gpuTimestamp = 0;
    double period = 0.0; // in nanoseconds per gpu-tick
    const bool calibrated = CalibrateClocks(cpuTimestamp, gpuTimestamp, period);
    if (!calibrated)
    {
        TracyWebGPUPanic("Failed to calibrate CPU/GPU clocks.", return);
    }

    TracyWebGPUDebug( fprintf(stdout, "[WebGPUQueueCtx] cpuTimestamp: %llu | gpuTimestamp: %llu | period: %f\n", cpuTimestamp, gpuTimestamp, period) );
    // Every shadow slot starts at the reference GPU time; Collect() compares
    // incoming timestamps against these values.
    m_shadowBuffer.resize(m_queryLimit, gpuTimestamp);

    // All setup completed: register the context.
    m_contextId = GetGpuCtxCounter().fetch_add(1);
    ZoneValue(m_contextId);

    auto* item = Profiler::QueueSerial();
    MemWrite(&item->hdr.type, QueueType::GpuNewContext);
    MemWrite(&item->gpuNewContext.cpuTime, static_cast<int64_t>(cpuTimestamp));
    MemWrite(&item->gpuNewContext.gpuTime, static_cast<int64_t>(gpuTimestamp));
    MemWrite(&item->gpuNewContext.thread, static_cast<uint32_t>(0));
    MemWrite(&item->gpuNewContext.period, static_cast<float>(period));
    MemWrite(&item->gpuNewContext.context, static_cast<uint8_t>(GetId()));
    MemWrite(&item->gpuNewContext.flags, GpuContextFlags(0)); // no calibration available
    MemWrite(&item->gpuNewContext.type, GpuContextType::WebGPU);
    SubmitQueueItem(item);
}
|
||||
|
||||
// Runs one last Collect() and then releases every WebGPU object held by the context.
~WebGPUQueueCtx()
{
    // TODO: a few problems to address later during this final Collect():
    // 1. ensure "partial" query batches are collected
    // 2. ensure all readback stages are collected and empty
    // 3. ensure readback buffers are not mapped before deleting them
    Collect();

    // Buffers and the query set go first, then the retained queue/device/instance.
    for (ReadbackStage& stage : m_readbackReel)
    {
        if (stage.buffer == nullptr)
            continue;
        wgpuBufferRelease(stage.buffer);
        stage.buffer = nullptr;
    }
    if (m_resolveBuffer != nullptr)
    {
        wgpuBufferRelease(m_resolveBuffer);
        m_resolveBuffer = nullptr;
    }
    if (m_querySet != nullptr)
    {
        wgpuQuerySetRelease(m_querySet);
        m_querySet = nullptr;
    }
    if (m_queue != nullptr)
    {
        wgpuQueueRelease(m_queue);
        m_queue = nullptr;
    }
    if (m_device != nullptr)
    {
        wgpuDeviceRelease(m_device);
        m_device = nullptr;
    }
    if (m_instance != nullptr)
    {
        wgpuInstanceRelease(m_instance);
        m_instance = nullptr;
    }
}
|
||||
|
||||
// Profiler-side id of this GPU context; 255 means the context failed to initialize.
tracy_force_inline uint8_t GetId() const { return m_contextId; }
|
||||
|
||||
void Name(const char* name, uint16_t len)
|
||||
{
|
||||
auto ptr = (char*)tracy_malloc(len);
|
||||
memcpy(ptr, name, len);
|
||||
|
||||
auto item = Profiler::QueueSerial();
|
||||
MemWrite(&item->hdr.type, QueueType::GpuContextName);
|
||||
MemWrite(&item->gpuContextNameFat.context, GetId());
|
||||
MemWrite(&item->gpuContextNameFat.ptr, (uint64_t)ptr);
|
||||
MemWrite(&item->gpuContextNameFat.size, len);
|
||||
SubmitQueueItem(item);
|
||||
}
|
||||
|
||||
void Collect(bool webgpuProcessEvents=false)
|
||||
{
|
||||
#ifdef TRACY_ON_DEMAND
|
||||
if (!GetProfiler().IsConnected()) return;
|
||||
#endif
|
||||
if (!m_collectionMutex.try_lock()) return;
|
||||
std::unique_lock<std::mutex> lock(m_collectionMutex, std::adopt_lock);
|
||||
|
||||
ZoneScopedC(Color::Red4);
|
||||
|
||||
if (Distance(m_previousCheckpoint, m_queryCounter) <= 0)
|
||||
return;
|
||||
|
||||
// Current Readback "Reel" Stages:
|
||||
const int state = m_writeIdx;
|
||||
const int fillingIdx = (state + 0) % 3; // this is where instrumentation is pushing new queries
|
||||
const int pendingIdx = (state + 1) % 3; // instrumentation is done here; ready to be collected
|
||||
const int collectIdx = (state + 2) % 3; // this is where queries are being collected right now
|
||||
|
||||
// Ensure readback buffer has been mapped to the host
|
||||
auto& collectStage = m_readbackReel[collectIdx];
|
||||
if (collectStage.pendingFuture.id != 0)
|
||||
{
|
||||
if (webgpuProcessEvents)
|
||||
wgpuInstanceProcessEvents(m_instance);
|
||||
if (collectStage.mapStatus == WGPUMapAsyncStatus{})
|
||||
return; // callback hasn't fired yet
|
||||
collectStage.pendingFuture = {};
|
||||
if (collectStage.mapStatus != WGPUMapAsyncStatus_Success)
|
||||
TracyWebGPUPanic("Colect(): unable to map readback buffer.", return);
|
||||
}
|
||||
|
||||
if (collectStage.mapStatus == WGPUMapAsyncStatus_Success)
|
||||
{
|
||||
const uint64_t* ts = static_cast<const uint64_t*>(
|
||||
wgpuBufferGetConstMappedRange(collectStage.buffer, 0,
|
||||
static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t)));
|
||||
if (ts)
|
||||
{
|
||||
uint64_t ticket = m_previousCheckpoint;
|
||||
const uint64_t end = collectStage.copiedUpto;
|
||||
TracyWebGPUDebug( fprintf(stdout, "[TWG] Collect [%d] (%llu, %llu)\n", collectIdx, ticket, end) );
|
||||
for (; Distance(ticket, end) > 0; ticket += 2)
|
||||
{
|
||||
const uint32_t slotB = RingIndex(ticket);
|
||||
const uint32_t slotE = slotB + 1;
|
||||
TracyWebGPUDebug(
|
||||
fprintf(stderr,
|
||||
"[TWG] slot B=%4u E=%4u ts[B]=%llu ts[E]=%llu shadow[E]=%llu ts-diff=%lld shadow-diff=%lld\n",
|
||||
slotB, slotE,
|
||||
ts[slotB], ts[slotE], m_shadowBuffer[slotE],
|
||||
Distance(ts[slotB], ts[slotE]),
|
||||
Distance(m_shadowBuffer[slotE], ts[slotE]));
|
||||
);
|
||||
if (Distance(m_shadowBuffer[slotE], ts[slotE]) <= 0)
|
||||
break; // GPU hasn't written this timestamp yet; retry next Collect()
|
||||
EmitGpuTime(ts[slotB], slotB);
|
||||
EmitGpuTime(ts[slotE], slotE);
|
||||
}
|
||||
m_previousCheckpoint = ticket;
|
||||
|
||||
if (Distance(ticket, end) > 0)
|
||||
return; // still unresolved queries in this buffer; come back next Collect()
|
||||
}
|
||||
|
||||
// All queries resolved (or getMappedRange failed): unmap and fall through to rotate.
|
||||
wgpuBufferUnmap(collectStage.buffer);
|
||||
collectStage.mapStatus = {};
|
||||
}
|
||||
|
||||
// At this point, all queries in the collect buffer have been processed.
|
||||
// (it's now tie to "rotate" the buffers around...)
|
||||
|
||||
// Has any ResolveQueryBatch call landed in this reel stage since it was last recycled?
|
||||
// (Are there any queries to resolve and collect at all?)
|
||||
if (m_readbackReel[fillingIdx].copiedUpto <= m_previousCheckpoint)
|
||||
return;
|
||||
|
||||
// Rotate/Cycle the Readback Pipeline State:
|
||||
// the buffer that was just collected shall now be used for instrumentation
|
||||
collectStage.copiedUpto = m_previousCheckpoint.load();
|
||||
m_writeIdx = collectIdx; // atomically commit the pipeline rotation
|
||||
|
||||
auto& nextToCollect = m_readbackReel[pendingIdx];
|
||||
WGPUBufferMapCallbackInfo cbInfo = {};
|
||||
cbInfo.mode = WGPUCallbackMode_AllowProcessEvents;
|
||||
cbInfo.callback = [](WGPUMapAsyncStatus status, WGPUStringView, void* userData, void*)
|
||||
{
|
||||
auto* stage = static_cast<ReadbackStage*>(userData);
|
||||
stage->mapStatus = status;
|
||||
};
|
||||
cbInfo.userdata1 = &nextToCollect;
|
||||
nextToCollect.pendingFuture = wgpuBufferMapAsync(
|
||||
nextToCollect.buffer, WGPUMapMode_Read, 0,
|
||||
static_cast<uint64_t>(m_queryLimit) * sizeof(uint64_t), cbInfo);
|
||||
}
|
||||
|
||||
private:
|
||||
void EmitGpuTime(uint64_t gpuTimestamp, uint32_t queryId)
|
||||
{
|
||||
auto* item = Profiler::QueueSerial();
|
||||
MemWrite(&item->hdr.type, QueueType::GpuTime);
|
||||
MemWrite(&item->gpuTime.gpuTime, static_cast<int64_t>(gpuTimestamp));
|
||||
MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(queryId));
|
||||
MemWrite(&item->gpuTime.context, GetId());
|
||||
Profiler::QueueSerialFinish();
|
||||
m_shadowBuffer[queryId] = gpuTimestamp;
|
||||
}
|
||||
|
||||
// Number of query slots in the ring (the size of the query set that was created).
tracy_force_inline uint32_t RingCapacity() const { return m_queryLimit; }
|
||||
|
||||
// Maps a monotonically increasing raw ticket onto its slot in the query ring.
tracy_force_inline uint32_t RingIndex(uint64_t t) const
{
    const uint64_t slot = t % RingCapacity();
    return static_cast<uint32_t>(slot);
}
|
||||
|
||||
// Signed distance from `begin` to `end`, computed with unsigned wrap-around and
// then reinterpreted as signed: positive when `end` is ahead of `begin`.
tracy_force_inline static int64_t Distance(uint64_t begin, uint64_t end)
{
    const uint64_t delta = end - begin;
    return static_cast<int64_t>(delta);
}
|
||||
|
||||
// Reserves a pair of consecutive query tickets (begin/end) and returns the first.
// When the number of uncollected tickets has reached the ring capacity, a warning
// is logged and one Collect() attempt is made before returning.
tracy_force_inline uint64_t NextQueryId()
{
    const uint64_t ticket = m_queryCounter.fetch_add(2, std::memory_order_relaxed);
    const int64_t outstanding = Distance(m_previousCheckpoint, ticket);
    const auto capacity = static_cast<int64_t>(RingCapacity());
    if (outstanding < capacity)
        return ticket;

    TracyWebGPULog(Warning, "Too many pending GPU queries: stalling!");
    Collect();
    return ticket;
}
|
||||
};
|
||||
|
||||
class WebGPUZoneScope
|
||||
{
|
||||
const bool m_active;
|
||||
WebGPUQueueCtx* m_ctx = nullptr;
|
||||
WGPUCommandEncoder m_encoder = nullptr;
|
||||
uint64_t m_rawTicket = 0;
|
||||
uint32_t m_queryId = 0;
|
||||
|
||||
WGPUPassTimestampWrites m_timestampWrites = {};
|
||||
|
||||
void ResolveQueryBatch(uint32_t queryBatchStartId)
|
||||
{
|
||||
// 32 queries = 32 * 8 bytes = 256 bytes
|
||||
TracyWebGPUAssert(queryBatchStartId % 32 == 0, return);
|
||||
queryBatchStartId = m_ctx->RingIndex(queryBatchStartId);
|
||||
|
||||
const uint64_t blockOffset = static_cast<uint64_t>(queryBatchStartId) * sizeof(uint64_t);
|
||||
wgpuCommandEncoderResolveQuerySet(
|
||||
m_encoder,
|
||||
m_ctx->m_querySet,
|
||||
queryBatchStartId, 32,
|
||||
m_ctx->m_resolveBuffer,
|
||||
blockOffset // MUST be a multiple of (aligned to) 256...
|
||||
);
|
||||
|
||||
auto& stage = m_ctx->m_readbackReel[m_ctx->m_writeIdx];
|
||||
auto readbackBuffer = stage.buffer;
|
||||
wgpuCommandEncoderCopyBufferToBuffer(
|
||||
m_encoder,
|
||||
m_ctx->m_resolveBuffer,
|
||||
blockOffset,
|
||||
readbackBuffer,
|
||||
blockOffset,
|
||||
32 * sizeof(uint64_t)
|
||||
);
|
||||
|
||||
// Advance this stage's high-water mark to cover the block just encoded.
|
||||
// TODO: maybe we can use fetch_add to increment the atomic and not need
|
||||
// to keep track of the raw ticket; Collect would need to derive the raw
|
||||
// end ticket number.
|
||||
const uint64_t blockEnd = m_rawTicket;
|
||||
uint64_t prev = stage.copiedUpto;
|
||||
while ((WebGPUQueueCtx::Distance(prev, blockEnd) > 0) &&
|
||||
!stage.copiedUpto.compare_exchange_weak(prev, blockEnd)) {}
|
||||
TracyWebGPUDebug( fprintf(stdout, "[TWG] WebGPUZoneScope [%d] (%d,%d)\n", (int)m_ctx->m_writeIdx, queryBatchStartId, queryBatchStartId+32) );
|
||||
}
|
||||
|
||||
tracy_force_inline void WriteQueueItem(const SourceLocationData* srcLocation, int32_t callstackDepth, uint32_t sourceLine, const char* sourceFile, size_t sourceFileLen, const char* functionName, size_t functionNameLen, const char* zoneName, size_t zoneNameLen)
|
||||
{
|
||||
if (!m_active) return;
|
||||
|
||||
const bool captureCallstack = callstackDepth > 0 && has_callstack();
|
||||
const bool transientZone = srcLocation == nullptr;
|
||||
uint64_t srcLocationAddr = reinterpret_cast<uint64_t>(srcLocation);
|
||||
|
||||
QueueItem* item = nullptr;
|
||||
QueueType itemType;
|
||||
if (transientZone)
|
||||
{
|
||||
srcLocationAddr = Profiler::AllocSourceLocation(sourceLine, sourceFile, sourceFileLen, functionName, functionNameLen, zoneName, zoneNameLen);
|
||||
if (captureCallstack)
|
||||
{
|
||||
item = Profiler::QueueSerialCallstack(Callstack(callstackDepth));
|
||||
itemType = QueueType::GpuZoneBeginAllocSrcLocCallstackSerial;
|
||||
}
|
||||
else
|
||||
{
|
||||
item = Profiler::QueueSerial();
|
||||
itemType = QueueType::GpuZoneBeginAllocSrcLocSerial;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (captureCallstack)
|
||||
{
|
||||
item = Profiler::QueueSerialCallstack(Callstack(callstackDepth));
|
||||
itemType = QueueType::GpuZoneBeginCallstackSerial;
|
||||
}
|
||||
else
|
||||
{
|
||||
item = Profiler::QueueSerial();
|
||||
itemType = QueueType::GpuZoneBeginSerial;
|
||||
}
|
||||
}
|
||||
|
||||
MemWrite(&item->hdr.type, itemType);
|
||||
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
|
||||
MemWrite(&item->gpuZoneBegin.srcloc, srcLocationAddr);
|
||||
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
|
||||
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
|
||||
MemWrite(&item->gpuZoneBegin.context, m_ctx->GetId());
|
||||
Profiler::QueueSerialFinish();
|
||||
}
|
||||
|
||||
// Fills in m_timestampWrites and assigns its address to passDesc.timestampWrites.
|
||||
// Works with both WGPURenderPassDescriptor and WGPUComputePassDescriptor.
|
||||
template<typename PassDescriptor>
|
||||
tracy_force_inline void InitBase(WebGPUQueueCtx* ctx, WGPUCommandEncoder encoder, PassDescriptor& passDesc)
|
||||
{
|
||||
m_ctx = ctx;
|
||||
m_encoder = encoder;
|
||||
|
||||
m_rawTicket = m_ctx->NextQueryId();
|
||||
m_queryId = m_ctx->RingIndex(m_rawTicket);
|
||||
|
||||
m_timestampWrites.querySet = m_ctx->m_querySet;
|
||||
m_timestampWrites.beginningOfPassWriteIndex = m_queryId;
|
||||
m_timestampWrites.endOfPassWriteIndex = m_queryId + 1;
|
||||
passDesc.timestampWrites = &m_timestampWrites;
|
||||
}
|
||||
|
||||
public:
|
||||
template<typename PassDescriptor>
|
||||
tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, WGPUCommandEncoder encoder, PassDescriptor& passDesc, const SourceLocationData* srcLocation, bool active)
|
||||
#ifdef TRACY_ON_DEMAND
|
||||
: m_active(active && GetProfiler().IsConnected())
|
||||
#else
|
||||
: m_active(active)
|
||||
#endif
|
||||
{
|
||||
if (!m_active || !ctx) return;
|
||||
InitBase(ctx, encoder, passDesc);
|
||||
WriteQueueItem(srcLocation, 0, 0, nullptr, 0, nullptr, 0, nullptr, 0);
|
||||
}
|
||||
|
||||
template<typename PassDescriptor>
|
||||
tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, WGPUCommandEncoder encoder, PassDescriptor& passDesc, const SourceLocationData* srcLocation, int32_t depth, bool active)
|
||||
#ifdef TRACY_ON_DEMAND
|
||||
: m_active(active && GetProfiler().IsConnected())
|
||||
#else
|
||||
: m_active(active)
|
||||
#endif
|
||||
{
|
||||
if (!m_active || !ctx) return;
|
||||
InitBase(ctx, encoder, passDesc);
|
||||
WriteQueueItem(srcLocation, depth, 0, nullptr, 0, nullptr, 0, nullptr, 0);
|
||||
}
|
||||
|
||||
template<typename PassDescriptor>
|
||||
tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, WGPUCommandEncoder encoder, PassDescriptor& passDesc, bool active)
|
||||
#ifdef TRACY_ON_DEMAND
|
||||
: m_active(active && GetProfiler().IsConnected())
|
||||
#else
|
||||
: m_active(active)
|
||||
#endif
|
||||
{
|
||||
if (!m_active || !ctx) return;
|
||||
InitBase(ctx, encoder, passDesc);
|
||||
WriteQueueItem(nullptr, 0, line, source, sourceSz, function, functionSz, name, nameSz);
|
||||
}
|
||||
|
||||
template<typename PassDescriptor>
|
||||
tracy_force_inline WebGPUZoneScope(WebGPUQueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, WGPUCommandEncoder encoder, PassDescriptor& passDesc, int32_t depth, bool active)
|
||||
#ifdef TRACY_ON_DEMAND
|
||||
: m_active(active && GetProfiler().IsConnected())
|
||||
#else
|
||||
: m_active(active)
|
||||
#endif
|
||||
{
|
||||
if (!m_active || !ctx) return;
|
||||
InitBase(ctx, encoder, passDesc);
|
||||
WriteQueueItem(nullptr, depth, line, source, sourceSz, function, functionSz, name, nameSz);
|
||||
}
|
||||
|
||||
// Closes the zone. For an active scope with a context this queues a
// GpuZoneEndSerial event (CPU time, thread, query id m_queryId + 1, context id)
// on the serial queue, then - when m_queryId is a multiple of 32 - requests
// resolution of the batch identified by m_queryId - 32.
tracy_force_inline ~WebGPUZoneScope()
{
    if (m_active && m_ctx)
    {
        auto* endEvent = Profiler::QueueSerial();
        MemWrite(&endEvent->hdr.type, QueueType::GpuZoneEndSerial);
        MemWrite(&endEvent->gpuZoneEnd.cpuTime, Profiler::GetTime());
        MemWrite(&endEvent->gpuZoneEnd.thread, GetThreadHandle());
        // The end event is tagged with the id following m_queryId.
        MemWrite(&endEvent->gpuZoneEnd.queryId, static_cast<uint16_t>(m_queryId + 1));
        MemWrite(&endEvent->gpuZoneEnd.context, m_ctx->GetId());
        Profiler::QueueSerialFinish();

        // NOTE(review): for m_queryId == 0 the argument is 0 - 32; confirm that
        // ResolveQueryBatch tolerates that value (wrap-around / negative).
        const bool multipleOf32 = (m_queryId % 32 == 0);
        if (multipleOf32)
        {
            ResolveQueryBatch(m_queryId-32);
        }
    }
}
|
||||
};
|
||||
|
||||
// Destroys a WebGPUQueueCtx and releases its storage with tracy_free.
// A null `ctx` is accepted and ignored.
static inline void DestroyWebGPUContext(WebGPUQueueCtx* ctx)
{
    if (ctx)
    {
        // Run the destructor explicitly before handing the storage back to tracy_free.
        ctx->~WebGPUQueueCtx();
        tracy_free(ctx);
    }
}
|
||||
|
||||
// Allocates storage with tracy_malloc and constructs a WebGPUQueueCtx in it
// from the given instance, device and queue.
//
// Returns the new context, or nullptr when
//   - the allocation fails, or
//   - the constructed context reports id 255 (presumably the "no valid
//     context id" sentinel - confirm against WebGPUQueueCtx); in that case the
//     context is destroyed again before returning.
// A non-null result is released with DestroyWebGPUContext.
static inline WebGPUQueueCtx* CreateWebGPUContext(WGPUInstance instance, WGPUDevice device, WGPUQueue queue)
{
    void* storage = tracy_malloc(sizeof(WebGPUQueueCtx));
    // Placement-new on a null pointer is undefined behaviour, so bail out first.
    if (!storage) return nullptr;

    auto* ctx = new (storage) WebGPUQueueCtx{ instance, device, queue };
    if (ctx->GetId() == 255)
    {
        DestroyWebGPUContext(ctx);
        return nullptr;
    }
    return ctx;
}
|
||||
|
||||
}
|
||||
|
||||
// Drop the TracyWebGPU* diagnostic helper macros (presumably defined earlier in
// this header) so they are no longer defined past this point.
#undef TRACY_WEBGPU_DEBUG_LEVEL
#undef TracyWebGPUDebug
#undef TracyWebGPUBreak
#undef TracyWebGPUAssert
#undef TracyWebGPULog
#undef TracyWebGPUPanic
|
||||
|
||||
// Handle type used with the TracyWebGPU* macros below.
using TracyWebGPUCtx = tracy::WebGPUQueueCtx*;

// Declares a line-unique tracy::WebGPUQueueCtx::Requirements local and calls
// ApplyToDeviceDescriptor(deviceDescriptor) on it. Because it declares a
// variable it must be used at statement scope; the caller supplies the ';'.
#define TracyWebGPUSetupDeviceDescriptor(deviceDescriptor) tracy::WebGPUQueueCtx::Requirements TracyConcat(__tracy_wgpu_setup_, TracyLine); TracyConcat(__tracy_wgpu_setup_, TracyLine).ApplyToDeviceDescriptor(deviceDescriptor)

// Context creation / destruction. The trailing ';' is part of the expansion
// and is kept so existing call sites continue to compile unchanged.
#define TracyWebGPUContext(instance, device, queue) tracy::CreateWebGPUContext(instance, device, queue);
#define TracyWebGPUDestroy(ctx) tracy::DestroyWebGPUContext(ctx);

// Names the context; a null context is ignored. The argument is parenthesised
// before '->' so expressions such as `*pCtx` or `a ? b : c` expand correctly.
#define TracyWebGPUContextName(ctx, name, size) if (ctx) (ctx)->Name(name, size);
|
||||
|
||||
// Variable name used by the zone macros that do not take an explicit `varname`.
#define TracyWebGPUUnnamedZone ___tracy_gpu_webgpu_zone

// Line-unique identifier for the static source location object of a zone.
#define TracyWebGPUSrcLocSymbol \
    TracyConcat(__tracy_webgpu_source_location,TracyLine)

// Defines that static constexpr source location object from the zone name, the
// current function / file / line and the requested color.
#define TracyWebGPUSrcLocObject(name, color) \
    static constexpr tracy::SourceLocationData TracyWebGPUSrcLocSymbol { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color };
|
||||
|
||||
// Zone macros that take no explicit depth argument.
// With both TRACY_HAS_CALLSTACK and TRACY_CALLSTACK defined they pass
// TRACY_CALLSTACK in the depth position; otherwise they use the
// WebGPUZoneScope constructors that take no depth.
#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK

#  define TracyWebGPUZone(ctx, encoder, passDesc, name) \
       TracyWebGPUNamedZoneS(ctx, TracyWebGPUUnnamedZone, encoder, passDesc, name, TRACY_CALLSTACK, true)

#  define TracyWebGPUZoneC(ctx, encoder, passDesc, name, color) \
       TracyWebGPUNamedZoneCS(ctx, TracyWebGPUUnnamedZone, encoder, passDesc, name, color, TRACY_CALLSTACK, true)

#  define TracyWebGPUNamedZone(ctx, varname, encoder, passDesc, name, active) \
       TracyWebGPUSrcLocObject(name, 0); \
       tracy::WebGPUZoneScope varname{ ctx, encoder, passDesc, &TracyWebGPUSrcLocSymbol, TRACY_CALLSTACK, active };

#  define TracyWebGPUNamedZoneC(ctx, varname, encoder, passDesc, name, color, active) \
       TracyWebGPUSrcLocObject(name, color); \
       tracy::WebGPUZoneScope varname{ ctx, encoder, passDesc, &TracyWebGPUSrcLocSymbol, TRACY_CALLSTACK, active };

#  define TracyWebGPUZoneTransient(ctx, varname, encoder, passDesc, name, active) \
       TracyWebGPUZoneTransientS(ctx, varname, encoder, passDesc, name, TRACY_CALLSTACK, active)

#else

#  define TracyWebGPUZone(ctx, encoder, passDesc, name) \
       TracyWebGPUNamedZone(ctx, TracyWebGPUUnnamedZone, encoder, passDesc, name, true)

#  define TracyWebGPUZoneC(ctx, encoder, passDesc, name, color) \
       TracyWebGPUNamedZoneC(ctx, TracyWebGPUUnnamedZone, encoder, passDesc, name, color, true)

#  define TracyWebGPUNamedZone(ctx, varname, encoder, passDesc, name, active) \
       TracyWebGPUSrcLocObject(name, 0); \
       tracy::WebGPUZoneScope varname{ ctx, encoder, passDesc, &TracyWebGPUSrcLocSymbol, active };

#  define TracyWebGPUNamedZoneC(ctx, varname, encoder, passDesc, name, color, active) \
       TracyWebGPUSrcLocObject(name, color); \
       tracy::WebGPUZoneScope varname{ ctx, encoder, passDesc, &TracyWebGPUSrcLocSymbol, active };

#  define TracyWebGPUZoneTransient(ctx, varname, encoder, passDesc, name, active) \
       tracy::WebGPUZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), encoder, passDesc, active };

#endif
|
||||
|
||||
// Zone macros that take an explicit `depth` argument.
// With TRACY_HAS_CALLSTACK the depth is handed to the WebGPUZoneScope
// constructor; without it the depth argument is dropped and the macros forward
// to their depth-less counterparts.
#ifdef TRACY_HAS_CALLSTACK

#  define TracyWebGPUZoneS(ctx, encoder, passDesc, name, depth) \
       TracyWebGPUNamedZoneS(ctx, TracyWebGPUUnnamedZone, encoder, passDesc, name, depth, true)

#  define TracyWebGPUZoneCS(ctx, encoder, passDesc, name, color, depth) \
       TracyWebGPUNamedZoneCS(ctx, TracyWebGPUUnnamedZone, encoder, passDesc, name, color, depth, true)

#  define TracyWebGPUNamedZoneS(ctx, varname, encoder, passDesc, name, depth, active) \
       TracyWebGPUSrcLocObject(name, 0); \
       tracy::WebGPUZoneScope varname{ ctx, encoder, passDesc, &TracyWebGPUSrcLocSymbol, depth, active };

#  define TracyWebGPUNamedZoneCS(ctx, varname, encoder, passDesc, name, color, depth, active) \
       TracyWebGPUSrcLocObject(name, color); \
       tracy::WebGPUZoneScope varname{ ctx, encoder, passDesc, &TracyWebGPUSrcLocSymbol, depth, active };

#  define TracyWebGPUZoneTransientS(ctx, varname, encoder, passDesc, name, depth, active) \
       tracy::WebGPUZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), encoder, passDesc, depth, active };

#else

#  define TracyWebGPUZoneS(ctx, encoder, passDesc, name, depth) \
       TracyWebGPUZone(ctx, encoder, passDesc, name)

#  define TracyWebGPUZoneCS(ctx, encoder, passDesc, name, color, depth) \
       TracyWebGPUZoneC(ctx, encoder, passDesc, name, color)

#  define TracyWebGPUNamedZoneS(ctx, varname, encoder, passDesc, name, depth, active) \
       TracyWebGPUNamedZone(ctx, varname, encoder, passDesc, name, active)

#  define TracyWebGPUNamedZoneCS(ctx, varname, encoder, passDesc, name, color, depth, active) \
       TracyWebGPUNamedZoneC(ctx, varname, encoder, passDesc, name, color, active)

#  define TracyWebGPUZoneTransientS(ctx, varname, encoder, passDesc, name, depth, active) \
       TracyWebGPUZoneTransient(ctx, varname, encoder, passDesc, name, active)

#endif
|
||||
|
||||
// Calls Collect() on the context; a null context is ignored. The argument is
// parenthesised before '->' so expressions such as `*pCtx` or `a ? b : c`
// expand correctly. The trailing ';' is kept for existing call sites.
#define TracyWebGPUCollect(ctx) if (ctx) (ctx)->Collect();
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1033,15 +1033,14 @@ PYBIND11_MODULE( TracyServerBindings, m )
|
||||
// --- GPU contexts ---
|
||||
.def( "get_gpu_contexts", []( const Worker& w ) {
|
||||
static const char* gpuTypeStr[] = {
|
||||
"Invalid", "OpenGL", "Vulkan", "OpenCL", "Direct3D12", "Direct3D11", "Metal", "Custom", "CUDA", "Rocprof", "WebGPU" };
|
||||
static size_t numTypes = sizeof(gpuTypeStr) / sizeof(gpuTypeStr[0]);
|
||||
"Invalid", "OpenGL", "Vulkan", "OpenCL", "Direct3D12", "Direct3D11", "Metal", "Custom", "CUDA", "Rocprof" };
|
||||
std::vector<GpuContextSummary> result;
|
||||
for( const auto* ctx : w.GetGpuData() )
|
||||
{
|
||||
if( !ctx ) continue;
|
||||
const std::string name = ctx->name.Active() ? w.GetString( ctx->name ) : "";
|
||||
const uint8_t typeIdx = (uint8_t)ctx->type;
|
||||
const char* typeStr = typeIdx < numTypes ? gpuTypeStr[typeIdx] : "Unknown";
|
||||
const char* typeStr = typeIdx < 10 ? gpuTypeStr[typeIdx] : "Unknown";
|
||||
result.push_back( GpuContextSummary{
|
||||
name, ctx->count, std::string( typeStr ), ctx->thread } );
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user