diff --git a/extra/mcp/.gitignore b/extra/mcp/.gitignore new file mode 100644 index 00000000..733edd20 --- /dev/null +++ b/extra/mcp/.gitignore @@ -0,0 +1,3 @@ +tracy_mcp.port +tracy_mcp.pid +*.local.sh diff --git a/extra/mcp/eval_guide.md b/extra/mcp/eval_guide.md new file mode 100644 index 00000000..b67ccbe8 --- /dev/null +++ b/extra/mcp/eval_guide.md @@ -0,0 +1,72 @@ +# Tracy MCP eval guide + +This document covers the bindings-layer detail that the curated catalog +(`tracy://catalog`) and analysis guidance (`tracy://prompt`) do not. + +## ctx + +`ctx` is a `TracyServerBindings.Worker` — the same object Tracy Assist's +C++ tools query through `Worker::Get*`. The pybind methods are the canonical +data surface. Common entry points: + +- Zones: `get_all_zone_stats()` (every callsite, large), `get_root_zone_stats()` + (top-level zones only, useful for "where is the program spending time"), + `get_zone_stats(srcloc_id)`, `get_child_zone_stats(srcloc_id)` (subtract for + self-time), `get_zone_durations(name)`, `get_zone_count()`, + `get_all_zone_source_locations()` +- GPU zones: `get_all_gpu_zone_stats()`, `get_gpu_zone_durations(...)`, + `get_gpu_contexts()` +- Frames: `get_frame_count()`, `get_frame_times()`, `get_frame_times_named(name)`, + `get_frame_boundaries()`, `get_zones_in_frame(...)` +- Threads: `get_threads()`, `get_thread_name(tid)`, `get_thread_context_switches(tid)` +- Messages / plots / locks / memory / callstacks: `get_messages()`, `get_plots()`, + `get_locks()`, `get_memory_events()`, `get_callstack_frames(...)` +- Capture metadata: `get_capture_name()`, `get_capture_program()`, + `get_first_time()`, `get_last_time()`, `get_resolution()`, `get_host_info()` + +Run `print([m for m in dir(ctx) if not m.startswith('_')])` for the full list. + +## Units and conventions + +- All time values returned by Worker methods are **nanoseconds** (int). + `get_first_time()` / `get_last_time()` bound the capture timeline. 
+- `ZoneStats` fields: `count`, `total`, `min`, `max`, `avg`, `sum_sq`. `total` + is the inclusive aggregate; use `get_child_zone_stats(srcloc_id)` to subtract + child time when you need self-time. +- `get_all_zone_stats()` returns `dict[str, ZoneStats]` keyed by an opaque label + of the form `'name (addr)[arch] <id>'`. The trailing `<id>` is the + source-location ID — the int accepted by `get_zone_stats(int)`, + `get_zone_durations_by_id`, and friends. Parse it with a regex if you need + to join across calls. +- Source-location IDs from `get_all_zone_source_locations()` are the join key + between zone-name lookups and per-callsite queries. + +## Translating catalog entries to ctx Python + +The catalog (`tracy://catalog`) lists curated queries. Each maps to a small +Python snippet: + +```python +# zone_list — top 10 hottest zones by total time +top = sorted(ctx.get_all_zone_stats().items(), + key=lambda kv: kv[1].total, reverse=True)[:10] +for k, v in top: + print(f"{v.total/1e6:.2f}ms count={v.count} {k}") + +# frame_list — primary frame set timing +times = ctx.get_frame_times() # ns per frame +print(f"frames={len(times)} avg={sum(times)/len(times)/1e6:.2f}ms " + f"p99={sorted(times)[int(len(times)*0.99)]/1e6:.2f}ms") + +# zone_stats for a named zone — find the srcloc id, then drill in +import re +matches = [k for k in ctx.get_all_zone_stats() if k.startswith("MyFunc ")] +sid = int(re.search(r"<(\d+)>$", matches[0]).group(1)) +stats = ctx.get_zone_stats(sid) +``` + +## Async mode + +For long-running queries pass `async_mode=True` to `eval`; it returns +`{task_id, status: "running"}`. Poll with the `task` tool +(`action="poll", task_id=...`). diff --git a/extra/mcp/start_mcp.sh b/extra/mcp/start_mcp.sh new file mode 100644 index 00000000..2b50afbe --- /dev/null +++ b/extra/mcp/start_mcp.sh @@ -0,0 +1,17 @@ +#!/bin/sh +# Start the Tracy MCP server. +# +# Set PYTHONPATH to the directory containing TracyServerBindings.so/.pyd. 
+# Adjust the Release/Debug suffix to match your CMake build configuration. +PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$(dirname "$0")/../../build/python/Release" +export PYTHONPATH + +# Machine-local overrides (not committed). Create start_mcp.local.sh next to +# this file to set TRACY_CAPTURES_DIR, TRACY_MCP_PORT, or any other env var: +# export TRACY_CAPTURES_DIR=/path/to/captures +# export TRACY_MCP_PORT=47380 +if [ -f "$(dirname "$0")/start_mcp.local.sh" ]; then + . "$(dirname "$0")/start_mcp.local.sh" +fi + +exec python3 "$(dirname "$0")/tracy_mcp.py" "$@" diff --git a/extra/mcp/tracy_mcp.py b/extra/mcp/tracy_mcp.py new file mode 100644 index 00000000..bff765db --- /dev/null +++ b/extra/mcp/tracy_mcp.py @@ -0,0 +1,596 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import asyncio +import atexit +import builtins +import concurrent.futures +import glob +import io +import os +import logging +import re +import socket +import struct +import sys +import time +import uuid +from contextlib import redirect_stdout + +import mcp.server.fastmcp as fastmcp + +# Suppress noisy ASGI shutdown errors known to occur with SSE and Control-C. +# These occur when Starlette attempts to send a 500 error after the loop is cancelled +# but after the SSE 200 OK headers have already been sent. Global level suppression +# is used because surgical filtering of ASGI exceptions is unreliable in this stack. +logging.getLogger("uvicorn.error").setLevel(logging.CRITICAL) +logging.getLogger("starlette").setLevel(logging.CRITICAL) + +_HERE = os.path.dirname(os.path.abspath(__file__)) +_PORT_FILE = os.path.join(_HERE, "tracy_mcp.port") +_PID_FILE = os.path.join(_HERE, "tracy_mcp.pid") +_PREFERRED_PORT = int(os.environ.get("TRACY_MCP_PORT", "47380")) + +# Shared documentation surfaces. system.prompt.md is Tracy Assist's source +# system prompt; exposing it as an MCP resource keeps analysis guidance in +# sync across both surfaces with no plumbing. 
eval_guide.md covers +# bindings-layer detail (ctx object model, units, source-location ID joins). +_LLM_DIR = os.path.normpath(os.path.join(_HERE, "..", "..", "profiler", "src", "llm")) +_PROMPT_PATH = os.path.join(_LLM_DIR, "system.prompt.md") +_EVAL_GUIDE_PATH = os.path.join(_HERE, "eval_guide.md") + + +def _read_text(path: str) -> str: + try: + with open(path, encoding="utf-8") as f: + return f.read() + except Exception as e: + return f"(unavailable: {e})" + + +# Tracy UDP broadcast packet support. Tracy clients announce themselves on +# port 8086 with a BroadcastMessage (see public/common/TracyProtocol.hpp). +# The dev GUI reads protocolVersion from the broadcast and refuses connection +# on mismatch instead of hitting an opaque TCP timeout. We do the same. +_PROTOCOL_HPP = os.path.normpath( + os.path.join(_HERE, "..", "..", "public", "common", "TracyProtocol.hpp") +) +_BROADCAST_PORT = 8086 +_PROGRAM_NAME_SIZE = 64 + + +def _read_bindings_protocol_version() -> int | None: + """Parse ProtocolVersion from TracyProtocol.hpp at startup so our 'expected' + version stays in sync with the bindings build without extra C++ wiring.""" + try: + with open(_PROTOCOL_HPP, encoding="utf-8") as f: + for line in f: + m = re.search(r"constexpr\s+uint32_t\s+ProtocolVersion\s*=\s*(\d+)", line) + if m: + return int(m.group(1)) + except Exception: + pass + return None + + +_OUR_PROTOCOL_VERSION = _read_bindings_protocol_version() + + +def _parse_broadcast(data: bytes) -> dict | None: + """Parse a Tracy BroadcastMessage. Handles broadcast versions 0-3. 
+ + Fixed-field sizes (from TracyProtocol.hpp, packed): + v3: u16 bv, u16 lp, u32 pv, u64 pid, i32 at, char[<=64] name (>=20 + name) + v2: u16 bv, u16 lp, u32 pv, i32 at, char[<=64] name (>=12 + name) + v1: u32 bv, u32 pv, u32 lp, u32 at, char[<=64] name (>=16 + name) + v0: u32 bv, u32 pv, u32 at, char[<=64] name (>=12 + name) + + The programName field is variable-length on the wire — the sender writes + only the actual name plus null terminator, not the full 64-byte buffer. + """ + if len(data) < 4: + return None + + def _name(buf: bytes) -> str: + return buf[:_PROGRAM_NAME_SIZE].split(b"\0", 1)[0].decode("utf-8", "replace") + + bv16 = struct.unpack_from("= 21: + bv, lp, pv, pid, at = struct.unpack_from("= 13: + bv, lp, pv, at = struct.unpack_from("= 17: + bv, pv, lp, at = struct.unpack_from("= 13: + bv, pv, at = struct.unpack_from(" list[dict]: + """Listen briefly on UDP 8086 for Tracy client announcements. + + Returns a list of parsed broadcasts (deduplicated by listen_port). Empty + list means no broadcast received — the target may use TRACY_ON_DEMAND, + a non-default broadcast port, or simply isn't running. 
+ """ + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + try: + s.bind(("", _BROADCAST_PORT)) + except OSError: + s.close() + return [] + s.setblocking(False) + loop = asyncio.get_running_loop() + seen: dict[int | None, dict] = {} + deadline = loop.time() + timeout_s + try: + while loop.time() < deadline: + remaining = deadline - loop.time() + if remaining <= 0: + break + try: + fut = loop.sock_recvfrom(s, 2048) + data, _addr = await asyncio.wait_for(fut, timeout=remaining) + except (asyncio.TimeoutError, BlockingIOError): + break + parsed = _parse_broadcast(data) + if parsed: + seen.setdefault(parsed.get("listen_port"), parsed) + finally: + s.close() + return list(seen.values()) + + +def _is_our_server_running() -> tuple[bool, int]: + """ + Check the PID file to see if our server is already running. + Returns (running, port). Uses os.kill(pid, 0) to confirm the process is alive. + """ + try: + with open(_PID_FILE) as f: + pid = int(f.read().strip()) + with open(_PORT_FILE) as f: + port = int(f.read().strip()) + os.kill(pid, 0) # raises OSError if process is gone + return True, port + except Exception: + return False, 0 + + +def _find_free_port() -> int: + """Scan from preferred port upward; fall back to OS-assigned if the range is exhausted.""" + for port in range(_PREFERRED_PORT, _PREFERRED_PORT + 16): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + try: + s.bind(("127.0.0.1", port)) + s.close() + return port + except OSError: + s.close() + # Let OS assign any free port + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.bind(("127.0.0.1", 0)) + port = s.getsockname()[1] + s.close() + return port + + +def _write_pid_and_port(port: int) -> None: + try: + with open(_PID_FILE, "w") as f: + f.write(str(os.getpid())) + with open(_PORT_FILE, "w") as f: + f.write(str(port)) + except Exception: + pass + + +def _cleanup_pid_files() -> 
None: + for path in (_PID_FILE, _PORT_FILE): + try: + os.unlink(path) + except Exception: + pass + + +# Attempt to import Tracy Server bindings +try: + import TracyServerBindings as tracy_server +except ImportError: + sys.path.append(os.path.join(os.path.dirname(__file__), "../../build/python")) + try: + import TracyServerBindings as tracy_server + except ImportError: + tracy_server = None + +mcp_server = fastmcp.FastMCP("Tracy Profiler") +executor = concurrent.futures.ThreadPoolExecutor(max_workers=4) + + +class Task: + def __init__(self, task_id: str, code: str): + self.id = task_id + self.code = code + self.status = "pending" + self.result = None + self.error = None + self.start_time = time.time() + self.end_time = None + + +class TracyInstance: + def __init__(self, name: str, worker: object | None = None): + self.name = name + self.worker = worker + self.path = None + self.mtime = None + + +instances: dict[str, TracyInstance] = {} +tasks: dict[str, Task] = {} +captures_dir: str | None = os.environ.get("TRACY_CAPTURES_DIR") + + +@mcp_server.resource("tracy://prompt") +def _prompt_resource() -> str: + """Tracy Assist's analysis guidance (system.prompt.md). Contains workflows + for optimization, callstack inspection, and privacy rules. 
%TIME%, %USER%, + and %PROGRAMNAME% are placeholders filled by the in-app chat — ignore them + when reading from MCP.""" + return _read_text(_PROMPT_PATH) + + +@mcp_server.resource("tracy://eval-guide") +def _eval_guide_resource() -> str: + """Bindings-layer guide for the eval tool: ctx object model, time units, + source-location ID semantics, and worked examples translating catalog + entries into ctx Python.""" + return _read_text(_EVAL_GUIDE_PATH) + + +@mcp_server.tool() +async def list_captures() -> list[str]: + """List .tracy capture files in the TRACY_CAPTURES_DIR directory (non-recursive).""" + if not captures_dir: + return [] + return sorted(glob.glob(os.path.join(captures_dir, "*.tracy"))) + + +@mcp_server.tool() +async def list_instances() -> list[dict]: + """List all loaded Tracy instances and captures with metadata.""" + return [ + { + "id": name, + "path": inst.path, + "mtime": inst.mtime, + "live": inst.path is None + } + for name, inst in instances.items() + ] + + +@mcp_server.tool() +async def discover_instances(port_range: str = "8086-8095") -> list[dict]: + """ + Scan for running Tracy-instrumented applications on local ports. + + Returns a list of discovered ports that are listening. + """ + start_port, end_port = map(int, port_range.split("-")) + discovered = [] + + async def check_port(port: int) -> None: + try: + _, writer = await asyncio.wait_for( + asyncio.open_connection("127.0.0.1", port), timeout=0.1 + ) + writer.close() + await writer.wait_closed() + discovered.append({"port": port, "address": "127.0.0.1"}) + except (OSError, asyncio.TimeoutError, ConnectionRefusedError): + pass + + await asyncio.gather(*(check_port(p) for p in range(start_port, end_port + 1))) + return discovered + + +@mcp_server.tool() +async def live_connect(address: str = "127.0.0.1", port: int = 8086, alias: str | None = None) -> str: + """ + Connect to a live running Tracy-instrumented application. + + Wraps Worker(addr, port, memoryLimit=-1). 
Returns the instance_id. + """ + if not tracy_server: + return "Error: Tracy Server bindings not found." + + # Pre-flight: read Tracy's UDP broadcast on port 8086 and compare protocol + # versions before attempting TCP. Mirrors what the Tracy GUI does so a + # version mismatch produces a precise error instead of an opaque timeout. + # Tracy clients broadcast every ~3s (TracyProfiler.cpp), so we listen a + # little longer to guarantee catching at least one beat. + broadcasts = await _listen_broadcasts(timeout_s=3.5) + match = next((b for b in broadcasts if b.get("listen_port") == port), None) + if match and _OUR_PROTOCOL_VERSION is not None: + if match["protocol_version"] != _OUR_PROTOCOL_VERSION: + return ( + f"Protocol mismatch: target program '{match['program']}' " + f"announces Tracy protocol v{match['protocol_version']} on " + f"{address}:{port}, but these server bindings are built " + f"against v{_OUR_PROTOCOL_VERSION}. Rebuild the bindings or " + f"the target against a matching Tracy version." + ) + + try: + w = tracy_server.Worker(address, port) + except Exception as e: + return f"Failed to connect: {str(e)}" + + # Worker construction returns immediately even on protocol failure (the + # bindings expose no error state — is_connected() is the only signal). + # Probe briefly so silent failures (e.g. TRACY_ON_DEMAND with no profiler + # request yet, or a target broadcasting on a non-default port) surface + # cleanly even when broadcast pre-flight didn't catch them. + deadline_s = 2.0 + step_s = 0.1 + elapsed = 0.0 + while elapsed < deadline_s and not w.is_connected(): + await asyncio.sleep(step_s) + elapsed += step_s + + if not w.is_connected(): + try: + w.shutdown() + except Exception: + pass + hint = "" + if broadcasts and not match: + seen = ", ".join( + f"'{b['program']}' on port {b.get('listen_port')} (protocol v{b['protocol_version']})" + for b in broadcasts + ) + hint = f" Detected other Tracy broadcasts: {seen}." 
+ elif not broadcasts: + hint = ( + " No Tracy broadcasts were received on port 8086 in 3.5s — " + "the target may use TRACY_ON_DEMAND, a non-default broadcast " + "port, or isn't running." + ) + return ( + f"Reached {address}:{port} but the Tracy handshake did not complete " + f"within {deadline_s:.1f}s.{hint} Common causes: (1) the Tracy " + f"client version embedded in the target program differs from these " + f"server bindings; (2) the target was built with TRACY_ON_DEMAND " + f"and is awaiting a profiler request; (3) another client is " + f"already attached." + ) + + name = alias or f"live_{address}_{port}" + instances[name] = TracyInstance(name, w) + return ( + f"Connected to live instance as '{name}'. " + f"Before your first eval, read resources tracy://prompt " + f"(analysis guidance) and tracy://eval-guide (ctx object model, " + f"ns time units, srcloc IDs)." + ) + + +@mcp_server.tool() +async def load_capture(path: str, alias: str | None = None) -> str: + """ + Load a .tracy capture file by absolute path. + + Parameters: + path — absolute path to a .tracy file. On Windows use backslashes + (e.g. 'E:\\\\traces\\\\foo.tracy'). + alias — optional instance name; overwrites existing on collision. + If omitted, an ID is derived from filename and mtime. + + If you don't already have a path, call `list_captures` first — it lists + .tracy files in the TRACY_CAPTURES_DIR environment directory. + """ + if not tracy_server: + return "Error: Tracy Server bindings not found." + try: + mtime = os.path.getmtime(path) + if alias: + name = alias + else: + # unique name including mtime to avoid version collision + name = f"{os.path.basename(path)}@{int(mtime):x}" + + if name in instances: + inst = instances[name] + if inst.path == path and inst.mtime == mtime: + return f"Instance '{name}' is already loaded and up to date." 
+ + f = tracy_server.open_file(path) + w = tracy_server.create_worker_from_file(f) + inst = TracyInstance(name, w) + inst.path = path + inst.mtime = mtime + instances[name] = inst + return ( + f"Loaded as '{name}'. " + f"Before your first eval, read resources tracy://prompt " + f"(analysis guidance) and tracy://eval-guide (ctx object model, " + f"ns time units, srcloc IDs)." + ) + except Exception as e: + return f"Failed to load: {str(e)}" + + +@mcp_server.tool() +async def unload_capture(instance_id: str) -> str: + """Unload a Tracy instance and release its memory.""" + if instance_id in instances: + del instances[instance_id] + return f"Instance '{instance_id}' unloaded." + return f"Instance '{instance_id}' not found." + + +@mcp_server.tool(name="eval") +async def tracy_eval(code: str, instance_id: str, async_mode: bool = False) -> object: + """ + Execute Python code against a specific Tracy Worker bound as `ctx`. + + On first use, read the `tracy://prompt` (analysis guidance) and + `tracy://eval-guide` (ctx object model, units, source-location ID joins) + resources. Time values returned by Worker methods are nanoseconds. + + If async_mode=True, returns a task_id immediately; poll via the `task` tool. + """ + if instance_id not in instances: + return f"Error: Instance '{instance_id}' not found. Use list_instances to find valid IDs." + + instance = instances[instance_id] + if not instance.worker: + return f"Error: Instance '{instance_id}' has no worker." 
+ + if not async_mode: + return await _execute_eval(code, instance.worker) + + # Async mode: spawn task and return immediately + task_id = str(uuid.uuid4()) + task = Task(task_id, code) + tasks[task_id] = task + asyncio.get_running_loop().run_in_executor( + executor, _run_task_sync, task, instance.worker + ) + return {"task_id": task_id, "status": "running"} + + +def _run_task_sync(task: Task, worker: object) -> None: + """Run a background eval task in the thread pool.""" + task.status = "running" + try: + task.result = _execute_eval_sync(task.code, worker) + task.status = "completed" + except Exception as e: + task.error = str(e) + task.status = "failed" + finally: + task.end_time = time.time() + + +def _execute_eval_sync(code: str, ctx: object) -> str: + """Execute *code* with `ctx` bound to the Tracy worker. Captures stdout.""" + global_vars = { + "__builtins__": builtins, + "ctx": ctx, + "tracy": tracy_server, + "instances": {name: inst.worker for name, inst in instances.items()}, + } + buf = io.StringIO() + with redirect_stdout(buf): + try: + result = eval(compile(code, "", "eval"), global_vars) + except SyntaxError: + exec(compile(code, "", "exec"), global_vars) + result = None + output = buf.getvalue() + if result is None: + return output or "" + return str(result) + + +async def _execute_eval(code: str, ctx: object) -> str: + """Async wrapper: runs `_execute_eval_sync` in the thread-pool executor.""" + return await asyncio.get_running_loop().run_in_executor( + executor, _execute_eval_sync, code, ctx + ) + + +@mcp_server.tool() +async def task(action: str, task_id: str | None = None) -> object: + """ + Manage background analysis tasks. + + Actions: poll, cancel, list + """ + if action == "list": + return [ + {"id": t.id, "status": t.status, "elapsed": time.time() - t.start_time} + for t in tasks.values() + ] + + if not task_id or task_id not in tasks: + return "Error: Task ID not found." 
+ + t = tasks[task_id] + if action == "poll": + res: dict = {"id": t.id, "status": t.status} + if t.status == "completed": + res["result"] = t.result + elif t.status == "failed": + res["error"] = t.error + return res + + if action == "cancel": + # Cancellation of thread-pool work is not possible post-submission; + # mark the task so callers know it was abandoned. + if t.status == "running": + t.status = "cancelled" + return f"Task {task_id} marked as cancelled." + return f"Task {task_id} is not running." + + return "Error: Unknown action." + + +@mcp_server.tool() +async def shutdown_server() -> str: + """ + Shut down the Tracy MCP server. + + Because the server runs as a singleton (SSE transport, one process shared + across all VS Code windows), this releases the TracyServerBindings.pyd lock + for all clients at once. Restart tracy_mcp.py after rebuilding. + """ + import threading + def _exit() -> None: + time.sleep(0.2) + os._exit(0) + threading.Thread(target=_exit, daemon=True).start() + return "Server shutting down. Restart tracy_mcp.py to reconnect." + + +if __name__ == "__main__": + atexit.register(_cleanup_pid_files) + + running, existing_port = _is_our_server_running() + if running: + print( + f"Tracy MCP already running on port {existing_port}. 
" + "All VS Code windows share that instance.", + file=sys.stderr, + ) + sys.exit(0) + + port = _find_free_port() + _write_pid_and_port(port) + + print(f"Tracy MCP listening on http://127.0.0.1:{port}/sse", file=sys.stderr) + + mcp_server.settings.host = "127.0.0.1" + mcp_server.settings.port = port + try: + mcp_server.run(transport="sse") + except KeyboardInterrupt: + print("\nTracy MCP server stopped.", file=sys.stderr) + sys.exit(0) diff --git a/manual/tracy.md b/manual/tracy.md index da295997..a17b3aef 100644 --- a/manual/tracy.md +++ b/manual/tracy.md @@ -1,4 +1,4 @@ ---- +--- bibliography: - tracy.bib --- diff --git a/manual/tracy.tex b/manual/tracy.tex index 23946940..28f9370b 100644 --- a/manual/tracy.tex +++ b/manual/tracy.tex @@ -2473,6 +2473,123 @@ The following additional CMake options are available when building the Python pa Be aware that the memory allocated by this buffer is global and is not freed, see section~\ref{uniquepointers}. +\subsection{MCP Server} +\label{mcpserver} + +Tracy provides an optional MCP (Model Context Protocol\footnote{\url{https://modelcontextprotocol.io}}) server that allows AI coding assistants to load and analyze Tracy captures as part of automated workflows. It runs as a separate Python sidecar process and does not integrate with or depend on Tracy Assist (section~\ref{tracyassist}). No Python interpreter is required to run Tracy itself. + +The primary use case is agentic tooling: an AI agent can load a \texttt{.tracy} capture, execute arbitrary analysis code against the \texttt{Worker} bindings (see below), and compare results across multiple captures — for example, validating that a proposed optimization reduced frame time. 
+ +\subsubsection{Building} + +The MCP server requires the Tracy Server Python bindings, which are built alongside the client bindings when \texttt{TRACY\_CLIENT\_PYTHON} is enabled: + +\begin{lstlisting} +cmake -B build -DTRACY_CLIENT_PYTHON=ON +cmake --build build --config Release +\end{lstlisting} + +\subsubsection{Running} + +\begin{lstlisting} +pip install mcp +python extra/mcp/tracy_mcp.py +\end{lstlisting} + +Set the following environment variables before launching (or export them in your shell): + +\begin{lstlisting} +PYTHONPATH=/path/to/tracy/build/python/Release +TRACY_CAPTURES_DIR=/path/to/captures # enables list_captures +TRACY_MCP_PORT=47380 # optional; default 47380 +\end{lstlisting} + +\subsubsection{Integrating with an AI assistant} + +The server runs as a singleton on SSE transport (port 47380 by default). Only one process loads \texttt{TracyServerBindings} regardless of how many editor windows are open; subsequent launches detect the running server through its PID file (\texttt{extra/mcp/tracy\_mcp.pid}) and exit immediately. + +The server prints its URL on startup and writes the chosen port number to \texttt{extra/mcp/tracy\_mcp.port}: + +\begin{lstlisting} +Tracy MCP listening on http://127.0.0.1:47380/sse +\end{lstlisting} + +Configure your AI assistant using that URL. For example, for a JSON-based MCP configuration: + +\begin{lstlisting} +{ + "mcpServers": { + "tracy": { + "url": "http://127.0.0.1:47380/sse" + } + } +} +\end{lstlisting} + +\subsubsection{Available tools} + +\begin{itemize} +\item \texttt{list\_captures} --- List \texttt{*.tracy} files in \texttt{TRACY\_CAPTURES\_DIR} (top-level only). +\item \texttt{list\_instances} --- List all captures and live connections currently loaded in the server. +\item \texttt{load\_capture} --- Load a \texttt{.tracy} file by path, optionally giving it an alias. +\item \texttt{unload\_capture} --- Unload a loaded instance and release its memory. +\item \texttt{live\_connect} --- Connect to a running Tracy-instrumented application by address and port. 
+\item \texttt{discover\_instances} --- Scan a port range for running Tracy-instrumented applications. +\item \texttt{eval} --- Execute arbitrary Python against the \texttt{Worker} of the instance selected by \texttt{instance\_id} (available as \texttt{ctx}). Supports \texttt{async\_mode=True} for long-running queries. +\item \texttt{task} --- Poll, cancel, or list background analysis tasks started with \texttt{async\_mode=True}. +\end{itemize} + +\subsubsection{Worker API (available via \texttt{eval})} + +Inside \texttt{eval}, the variable \texttt{ctx} is a \texttt{Worker} instance. All time values are in nanoseconds. The following methods are available: + +\paragraph{Capture metadata} +\begin{itemize} +\item \texttt{get\_capture\_name()} / \texttt{get\_capture\_program()} --- Name and program string stored in the trace. +\item \texttt{get\_host\_info()} --- OS, CPU, RAM, and compiler info as a string. +\item \texttt{get\_resolution()} --- Timer resolution in nanoseconds. +\item \texttt{get\_first\_time()} / \texttt{get\_last\_time()} --- Trace time range in nanoseconds. +\end{itemize} + +\paragraph{CPU zones} +\begin{itemize} +\item \texttt{get\_all\_zone\_stats()} --- Returns a \texttt{dict[str, ZoneStats]} keyed by zone name. Each \texttt{ZoneStats} has \texttt{min}, \texttt{max}, \texttt{total}, \texttt{avg}, \texttt{count}, \texttt{sum\_sq} (time fields in nanoseconds). Includes nested zones. +\item \texttt{get\_root\_zone\_stats()} --- Like \texttt{get\_all\_zone\_stats()} but aggregates only top-level zones per thread. Safe to sum across zones. +\item \texttt{get\_zone\_stats(srcloc\_id)} --- Stats for a single source location. +\item \texttt{get\_zone\_durations(name)} --- List of individual zone durations (ns) for distribution analysis. +\item \texttt{get\_zone\_source\_location(name)} --- Returns \texttt{\{"name", "function", "file", "line", "color"\}} for the named zone. 
+\end{itemize} + +\paragraph{GPU zones} +\begin{itemize} +\item \texttt{get\_all\_gpu\_zone\_stats()} --- Returns a \texttt{dict[str, GpuZoneStats]}. +\item \texttt{get\_gpu\_contexts()} --- Returns a list of \texttt{GpuContextSummary} objects. +\item \texttt{get\_gpu\_zone\_durations(name)} --- Individual GPU zone durations (ns). +\end{itemize} + +\paragraph{Frames} +\begin{itemize} +\item \texttt{get\_frame\_times()} --- Per-frame durations (ns) for the default frame set. +\item \texttt{get\_frame\_times\_named(name)} --- Per-frame durations for a named frame set. +\item \texttt{get\_frame\_boundaries()} --- List of \texttt{(start\_ns, end\_ns)} tuples for each frame. +\item \texttt{get\_frame\_count()} --- Frame count for the default frame set. +\end{itemize} + +\paragraph{Threads, messages, plots, memory, and locks} +\begin{itemize} +\item \texttt{get\_threads()} --- List of \texttt{ThreadData} objects with \texttt{id}, \texttt{count}, \texttt{is\_fiber}. +\item \texttt{get\_messages()} --- List of \texttt{MessageInfo} objects with \texttt{time}, \texttt{text}, \texttt{color}, \texttt{thread}. +\item \texttt{get\_plots()} --- List of \texttt{PlotSummary} objects with \texttt{name}, \texttt{type}, \texttt{min}, \texttt{max}, \texttt{sum}, \texttt{avg}, \texttt{count}. +\item \texttt{get\_memory\_events()} --- List of raw allocation events including pointer, size, alloc/free times, and callstack index. +\item \texttt{get\_locks()} --- List of \texttt{LockSummary} objects. Use \texttt{get\_lock\_wait\_stats()} for contention analysis. +\item \texttt{get\_symbol\_stats()} --- Callstack-sample hit counts per symbol. Sort by \texttt{excl} to find hot functions. +\item \texttt{get\_callstack\_frames(callstack\_idx)} --- Resolve a callstack index to a list of \texttt{\{"name", "file", "line", "addr"\}} frames. 
+\end{itemize} + +\subsubsection{Loading a capture} + +Traces must be explicitly loaded through the MCP server — opening a file in the Tracy GUI does not make it available to the server. Use \texttt{load\_capture} with the full path to a \texttt{.tracy} file, or use \texttt{list\_captures} first if \texttt{TRACY\_CAPTURES\_DIR} is configured. + \subsection{Fortran API} \label{fortranapi} diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index be867665..f9e019b5 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -20,13 +20,24 @@ target_link_libraries(TracyClientBindings PUBLIC ${Python_LIBRARIES}) target_compile_definitions(TracyClientBindings PUBLIC BUFFER_SIZE=${BUFFER_SIZE}) target_compile_definitions(TracyClientBindings PUBLIC NAME_LENGTH=${NAME_LENGTH}) +include(${CMAKE_CURRENT_LIST_DIR}/../cmake/config.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/../cmake/vendor.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/../cmake/server.cmake) + +pybind11_add_module(TracyServerBindings SHARED bindings/ServerModule.cpp) +target_link_libraries(TracyServerBindings PUBLIC TracyServer) +target_link_libraries(TracyServerBindings PUBLIC ${Python_LIBRARIES}) + if (UNIX) set_target_properties(TracyClientBindings PROPERTIES BUILD_RPATH_USE_ORIGIN TRUE INSTALL_RPATH "\$ORIGIN/lib") + set_target_properties(TracyServerBindings PROPERTIES + BUILD_RPATH_USE_ORIGIN TRUE + INSTALL_RPATH "\$ORIGIN/lib") endif () -install(TARGETS TracyClientBindings +install(TARGETS TracyClientBindings TracyServerBindings RUNTIME DESTINATION . LIBRARY DESTINATION . 
) diff --git a/python/bindings/ServerModule.cpp b/python/bindings/ServerModule.cpp new file mode 100644 index 00000000..1dbd0d52 --- /dev/null +++ b/python/bindings/ServerModule.cpp @@ -0,0 +1,1089 @@ +#include +#include +#include + +#ifdef _MSC_VER +# pragma warning( push ) +# pragma warning( disable : 4244 4267 ) // third-party ppqsort: narrowing conversions +#elif defined( __GNUC__ ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" +# pragma GCC diagnostic ignored "-Wnarrowing" +#endif +#include "../../server/TracyFileRead.hpp" +#include "../../server/TracyWorker.hpp" +#ifdef _MSC_VER +# pragma warning( pop ) +#elif defined( __GNUC__ ) +# pragma GCC diagnostic pop +#endif + +namespace py = pybind11; +using namespace pybind11::literals; + +namespace tracy +{ + +PYBIND11_MODULE( TracyServerBindings, m ) +{ + m.doc() = "Tracy Server (Analysis) Bindings"; + + // ------------------------------------------------------------------------- + // SourceLocation + // ------------------------------------------------------------------------- + py::class_( m, "SourceLocation" ) + .def_readonly( "line", &SourceLocation::line ); + + // ------------------------------------------------------------------------- + // ZoneStats — POD summary returned by zone stat helpers + // ------------------------------------------------------------------------- + struct ZoneStats + { + int64_t min; + int64_t max; + int64_t total; + double sumSq; + size_t count; + double avg; + }; + + py::class_( m, "ZoneStats" ) + .def_readonly( "min", &ZoneStats::min ) + .def_readonly( "max", &ZoneStats::max ) + .def_readonly( "total", &ZoneStats::total ) + .def_readonly( "sum_sq", &ZoneStats::sumSq ) + .def_readonly( "count", &ZoneStats::count ) + .def_readonly( "avg", &ZoneStats::avg ); + + // GpuZoneStats — GPU timestamps are the same int64_t nanosecond type; + // reuse ZoneStats rather than duplicating the struct. 
+ using GpuZoneStats = ZoneStats; + + // ------------------------------------------------------------------------- + // FrameStats — per-frame-set timing summary + // ------------------------------------------------------------------------- + struct FrameStats + { + std::string name; + int64_t min; + int64_t max; + int64_t total; + double sumSq; + size_t count; + double avg; + }; + + py::class_( m, "FrameStats" ) + .def_readonly( "name", &FrameStats::name ) + .def_readonly( "min", &FrameStats::min ) + .def_readonly( "max", &FrameStats::max ) + .def_readonly( "total", &FrameStats::total ) + .def_readonly( "sum_sq", &FrameStats::sumSq ) + .def_readonly( "count", &FrameStats::count ) + .def_readonly( "avg", &FrameStats::avg ); + + // ------------------------------------------------------------------------- + // PlotSummary + // ------------------------------------------------------------------------- + struct PlotSummary + { + std::string name; + double min; + double max; + double sum; + size_t count; + double avg; + std::string type; + }; + + py::class_( m, "PlotSummary" ) + .def_readonly( "name", &PlotSummary::name ) + .def_readonly( "min", &PlotSummary::min ) + .def_readonly( "max", &PlotSummary::max ) + .def_readonly( "sum", &PlotSummary::sum ) + .def_readonly( "count", &PlotSummary::count ) + .def_readonly( "avg", &PlotSummary::avg ) + .def_readonly( "type", &PlotSummary::type ); + + // ------------------------------------------------------------------------- + // MemPoolSummary + // ------------------------------------------------------------------------- + struct MemPoolSummary + { + std::string name; + uint64_t high; + uint64_t low; + uint64_t usage; + size_t alloc_count; + }; + + py::class_( m, "MemPoolSummary" ) + .def_readonly( "name", &MemPoolSummary::name ) + .def_readonly( "high", &MemPoolSummary::high ) + .def_readonly( "low", &MemPoolSummary::low ) + .def_readonly( "usage", &MemPoolSummary::usage ) + .def_readonly( "alloc_count", 
&MemPoolSummary::alloc_count ); + + // ------------------------------------------------------------------------- + // LockSummary + // ------------------------------------------------------------------------- + struct LockSummary + { + std::string name; + bool is_contended; + std::string type; + int64_t time_announce; + int64_t time_terminate; + std::vector threads; + }; + + py::class_( m, "LockSummary" ) + .def_readonly( "name", &LockSummary::name ) + .def_readonly( "is_contended", &LockSummary::is_contended ) + .def_readonly( "type", &LockSummary::type ) + .def_readonly( "time_announce", &LockSummary::time_announce ) + .def_readonly( "time_terminate", &LockSummary::time_terminate ) + .def_readonly( "threads", &LockSummary::threads ); + + // ------------------------------------------------------------------------- + // GpuContextSummary + // ------------------------------------------------------------------------- + struct GpuContextSummary + { + std::string name; + uint64_t count; + std::string type; + uint64_t thread; + }; + + py::class_( m, "GpuContextSummary" ) + .def_readonly( "name", &GpuContextSummary::name ) + .def_readonly( "count", &GpuContextSummary::count ) + .def_readonly( "type", &GpuContextSummary::type ) + .def_readonly( "thread", &GpuContextSummary::thread ); + + // ------------------------------------------------------------------------- + // MessageInfo + // ------------------------------------------------------------------------- + struct MessageInfo + { + int64_t time; + std::string text; + uint32_t color; + uint64_t thread; + }; + + py::class_( m, "MessageInfo" ) + .def_readonly( "time", &MessageInfo::time ) + .def_readonly( "text", &MessageInfo::text ) + .def_readonly( "color", &MessageInfo::color ) + .def_readonly( "thread", &MessageInfo::thread ); + + // ThreadData — get_threads() returns plain dicts to avoid pybind11 + // raw-pointer ownership issues, so no class registration is needed. 
+ + // ------------------------------------------------------------------------- + // Worker + // ------------------------------------------------------------------------- + auto worker_cls = py::class_( m, "Worker" ); + worker_cls + // Construction + .def( py::init(), "addr"_a, "port"_a, "memoryLimit"_a = -1 ) + + // --- Capture metadata --- + .def( "get_capture_name", &Worker::GetCaptureName ) + .def( "get_capture_program", &Worker::GetCaptureProgram ) + .def( "get_capture_time", &Worker::GetCaptureTime ) + .def( "get_host_info", &Worker::GetHostInfo ) + .def( "get_pid", &Worker::GetPid ) + .def( "get_resolution", &Worker::GetResolution ) + .def( "get_first_time", &Worker::GetFirstTime ) + .def( "get_last_time", &Worker::GetLastTime ) + .def( "get_cpu_manufacturer", &Worker::GetCpuManufacturer ) + + // --- Counts --- + .def( "get_zone_count", &Worker::GetZoneCount ) + .def( "get_gpu_zone_count", &Worker::GetGpuZoneCount ) + .def( "get_lock_count", &Worker::GetLockCount ) + .def( "get_plot_count", &Worker::GetPlotCount ) + .def( "get_context_switch_count", &Worker::GetContextSwitchCount ) + .def( "get_src_loc_count", &Worker::GetSrcLocCount ) + .def( "get_callstack_sample_count", &Worker::GetCallstackSampleCount ) + .def( "get_message_count", []( const Worker& w ) { + return w.GetMessages().size(); + } ) + + // --- Source locations / zones --- + .def( "get_src_loc", []( const Worker& w, int16_t id ) { + return w.GetSourceLocation( id ); + } ).def( "get_zone_name", []( const Worker& w, int16_t id ) { + return w.GetZoneName( w.GetSourceLocation( id ) ); + } ) +#ifndef TRACY_NO_STATISTICS + .def( "get_zone_stats", []( const Worker& w, int16_t id ) { + const auto& stats = w.GetZonesForSourceLocation( id ); + const size_t cnt = stats.zones.size(); + return ZoneStats{ stats.min, stats.max, stats.total, stats.sumSq, cnt, cnt ? 
(double)stats.total / cnt : 0.0 };
+        } )
+#endif
+        // Per-source-location aggregate stats, keyed by zone name. Source locations
+        // with no recorded zones are skipped, so the avg division is never by zero.
+        // NOTE(review): the dict key is the zone name only; when several source
+        // locations share a name the later entry overwrites the earlier one — confirm
+        // whether callers expect per-srcloc or merged entries.
+        .def( "get_all_zone_stats", []( const Worker& w ) {
+            py::dict result;
+#ifndef TRACY_NO_STATISTICS
+            for( const auto& kv : w.GetSourceLocationZones() )
+            {
+                const auto& stats = kv.second;
+                if( stats.zones.size() == 0 ) continue;
+                const auto& sl = w.GetSourceLocation( kv.first );
+                const char* name = w.GetZoneName( sl );
+                const size_t cnt = stats.zones.size();
+                result[name] = ZoneStats{ stats.min, stats.max, stats.total, stats.sumSq, cnt, (double)stats.total / cnt };
+            }
+#endif
+            return result;
+        } ).def( "get_root_zone_stats", []( const Worker& w ) {
+            // Aggregate stats for top-level (root) zones only — no nesting, safe to sum
+            // File-loaded data uses is_magic() — zones stored inline, not as short_ptr
+            // Returns {zone_name: ZoneStats}; zones without a valid end time are ignored.
+            struct Acc
+            {
+                int64_t min = INT64_MAX, max = INT64_MIN, total = 0;
+                double sumSq = 0;
+                size_t count = 0;
+            };
+            std::unordered_map<int16_t, Acc> acc;
+            auto processRoot = [&]( const ZoneEvent& z ) {
+                if( !z.IsEndValid() ) return;
+                const int64_t dur = z.End() - z.Start();
+                auto& s = acc[z.SrcLoc()];
+                s.total += dur;
+                s.count++;
+                // Accumulate the sum of squared durations so ZoneStats.sum_sq is
+                // populated (previously it was always reported as 0 for root zones).
+                s.sumSq += (double)dur * dur;
+                if( dur < s.min ) s.min = dur;
+                if( dur > s.max ) s.max = dur;
+            };
+            for( const auto* td : w.GetThreadData() )
+            {
+                if( !td ) continue;
+                if( td->timeline.is_magic() )
+                {
+                    for( const auto& z : *(const Vector<ZoneEvent>*)&td->timeline ) processRoot( z );
+                }
+                else
+                {
+                    for( const auto& zptr : td->timeline )
+                    {
+                        if( const ZoneEvent* z = zptr.get() ) processRoot( *z );
+                    }
+                }
+            }
+            py::dict result;
+            for( const auto& kv : acc )
+            {
+                // An Acc entry only exists once processRoot counted a zone, so count >= 1.
+                const auto& s = kv.second;
+                const double avg = (double)s.total / s.count;
+                const char* name = w.GetZoneName( w.GetSourceLocation( kv.first ) );
+                result[name] = ZoneStats{ s.min, s.max, s.total, s.sumSq, s.count, avg };
+            }
+            return result;
+        } )
+
+        // --- Per-occurrence zone data (for temporal correlation / distribution) ---
+        .def( "get_zone_durations", []( const Worker& w, const std::string& name, size_t maxSamples ) {
+            // Accumulates across ALL srclocs
with this name (same name can appear at multiple srclocs) + std::vector result; +#ifndef TRACY_NO_STATISTICS + for( const auto& kv : w.GetSourceLocationZones() ) + { + if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue; + for( const auto& ztd : kv.second.zones ) + { + if( result.size() >= maxSamples ) goto done_durations; + const auto* z = ztd.Zone(); + if( z && z->IsEndValid() ) result.push_back( z->End() - z->Start() ); + } + } + done_durations:; +#endif + return result; + }, "name"_a, "max_samples"_a = 100000 ) + .def( "get_zone_occurrences", []( const Worker& w, const std::string& name, size_t maxSamples ) { + // Returns list of (start_ns, duration_ns) — accumulates across all srclocs with this name + std::vector> result; +#ifndef TRACY_NO_STATISTICS + for( const auto& kv : w.GetSourceLocationZones() ) + { + if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue; + for( const auto& ztd : kv.second.zones ) + { + if( result.size() >= maxSamples ) goto done_occurrences; + const auto* z = ztd.Zone(); + if( z && z->IsEndValid() ) result.emplace_back( z->Start(), z->End() - z->Start() ); + } + } + done_occurrences:; +#endif + return result; + }, "name"_a, "max_samples"_a = 100000 ) + .def( "get_zone_annotations", []( const Worker& w, const std::string& name, size_t maxSamples ) { + // Returns text annotations attached to individual zone occurrences + std::vector result; +#ifndef TRACY_NO_STATISTICS + for( const auto& kv : w.GetSourceLocationZones() ) + { + if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue; + for( const auto& ztd : kv.second.zones ) + { + if( result.size() >= maxSamples ) goto done_annotations; + const auto* z = ztd.Zone(); + if( z && w.HasZoneExtra( *z ) ) + { + const auto& extra = w.GetZoneExtra( *z ); + if( extra.text.Active() ) result.push_back( w.GetString( extra.text ) ); + } + } + } + done_annotations:; +#endif + return result; + }, 
"name"_a, "max_samples"_a = 10000 ) + .def( "get_gpu_zone_durations", []( const Worker& w, const std::string& name, size_t maxSamples ) { + std::vector result; + for( const auto& kv : w.GetGpuSourceLocationZones() ) + { + if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue; + for( const auto& ztd : kv.second.zones ) + { + if( result.size() >= maxSamples ) goto done_gpu_dur; + const auto* z = ztd.Zone(); + if( z && z->GpuEnd() >= 0 ) result.push_back( z->GpuEnd() - z->GpuStart() ); + } + } + done_gpu_dur:; + return result; + }, "name"_a, "max_samples"_a = 100000 ) + .def( "get_gpu_zone_occurrences", []( const Worker& w, const std::string& name, size_t maxSamples ) { + std::vector> result; + for( const auto& kv : w.GetGpuSourceLocationZones() ) + { + if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue; + for( const auto& ztd : kv.second.zones ) + { + if( result.size() >= maxSamples ) goto done_gpu_occ; + const auto* z = ztd.Zone(); + if( z && z->GpuEnd() >= 0 ) result.emplace_back( z->GpuStart(), z->GpuEnd() - z->GpuStart() ); + } + } + done_gpu_occ:; + return result; + }, "name"_a, "max_samples"_a = 100000 ) + + // --- Callstack resolution --- + .def( "get_callstack_frames", []( const Worker& w, uint32_t callstackIdx ) { + py::list result; + const auto& cs = w.GetCallstack( callstackIdx ); + for( size_t i = 0; i < cs.size(); ++i ) + { + const auto* fd = w.GetCallstackFrame( cs[i] ); + if( !fd ) continue; + for( uint8_t j = 0; j < fd->size; ++j ) + { + const auto& frame = fd->data[j]; + py::dict d; + d["name"] = std::string( w.GetString( frame.name ) ); + d["file"] = std::string( w.GetString( frame.file ) ); + d["line"] = frame.line; + d["addr"] = frame.symAddr; + result.append( d ); + } + } + return result; + }, "callstack_idx"_a ) + + // --- Context switches per thread --- + .def( "get_thread_context_switches", []( const Worker& w, uint64_t tid, size_t maxSamples ) { + py::list result; + const 
auto* cs = const_cast<Worker&>( w ).GetContextSwitchData( tid );
+            if( !cs ) return result;
+            for( const auto& ev : cs->v )
+            {
+                if( (size_t)result.size() >= maxSamples ) break;
+                if( !ev.IsEndValid() ) continue;
+                py::dict d;
+                d["start"] = ev.Start();
+                d["end"] = ev.End();
+                d["cpu"] = (int)ev.Cpu();
+                d["reason"] = (int)ev.Reason();
+                result.append( d );
+            }
+            return result;
+        }, "tid"_a, "max_samples"_a = 50000 )
+
+        // --- CPU thread running time / migrations ---
+        .def( "get_cpu_thread_data", []( const Worker& w ) {
+            py::dict result;
+            for( const auto& kv : w.GetCpuThreadData() )
+            {
+                py::dict d;
+                d["running_time"] = kv.second.runningTime;
+                d["running_regions"] = kv.second.runningRegions;
+                d["migrations"] = kv.second.migrations;
+                result[py::int_( kv.first )] = d;
+            }
+            return result;
+        } )
+
+        // --- Zone occurrences with thread attribution ---
+        .def( "get_zone_occurrences_with_thread", []( const Worker& w, const std::string& name, size_t maxSamples ) {
+            // Returns list of (start_ns, duration_ns, thread_id) — thread_id is the OS thread ID
+            // Accumulates across all srclocs with this name, capped at maxSamples entries.
+            std::vector<std::tuple<int64_t, int64_t, uint64_t>> result;
+#ifndef TRACY_NO_STATISTICS
+            for( const auto& kv : w.GetSourceLocationZones() )
+            {
+                if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue;
+                for( const auto& ztd : kv.second.zones )
+                {
+                    if( result.size() >= maxSamples ) goto done_occ_thread;
+                    const auto* z = ztd.Zone();
+                    if( !z || !z->IsEndValid() ) continue;
+                    // ztd.Thread() is Tracy's compressed thread index, not a position in
+                    // GetThreadData(); map it back to the OS thread id through the worker
+                    // so the value joins with get_threads() / get_thread_name().
+                    const uint64_t tid = w.DecompressThread( ztd.Thread() );
+                    result.emplace_back( z->Start(), z->End() - z->Start(), tid );
+                }
+            }
+        done_occ_thread:;
+#endif
+            return result;
+        }, "name"_a, "max_samples"_a = 100000 )
+
+        // --- Child zone stats: aggregate direct children of all occurrences of a parent zone ---
+        .def( "get_child_zone_stats", []( const Worker& w, const std::string& name, size_t maxParents ) {
+            // Uses SourceLocationZones for O(occurrences) lookup — avoids walking the full zone tree.
+            // File-loaded data sets is_magic() on child vectors (inline ZoneEvent, not short_ptr).
+            struct Acc
+            {
+                int64_t min = INT64_MAX, max = INT64_MIN, total = 0;
+                double sumSq = 0.0;
+                size_t count = 0;
+            };
+            std::unordered_map<int16_t, Acc> acc;
+            size_t parentCount = 0;
+
+            auto accumChild = [&]( const ZoneEvent& c ) {
+                if( !c.IsEndValid() ) return;
+                const int64_t dur = c.End() - c.Start();
+                auto& s = acc[c.SrcLoc()];
+                s.total += dur;
+                s.count++;
+                s.sumSq += (double)dur * dur;
+                if( dur < s.min ) s.min = dur;
+                if( dur > s.max ) s.max = dur;
+            };
+
+#ifndef TRACY_NO_STATISTICS
+            for( const auto& kv : w.GetSourceLocationZones() )
+            {
+                if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue;
+                for( const auto& ztd : kv.second.zones )
+                {
+                    if( parentCount >= maxParents ) goto done_children;
+                    const auto* z = ztd.Zone();
+                    if( !z || !z->IsEndValid() || !z->HasChildren() ) continue;
+                    parentCount++;
+                    const auto& ch = w.GetZoneChildren( z->Child() );
+                    if( ch.is_magic() )
+                    {
+                        for( const auto& c : *(const Vector<ZoneEvent>*)&ch ) accumChild( c );
+                    }
+                    else
+                    {
+                        for( const auto& cptr : ch )
+                        {
+                            if( const ZoneEvent* c = cptr.get() ) accumChild( *c );
+                        }
+                    }
+                }
+            }
+        done_children:;
+#endif
+            py::dict result;
+            for( const auto& kv : acc )
+            {
+                const auto& s = kv.second;
+                if( s.count == 0 ) continue;
+                const double avg = (double)s.total / (double)s.count;
+                const char* cname = w.GetZoneName( w.GetSourceLocation( kv.first ) );
+                result[cname] = ZoneStats{ s.min, s.max, s.total, s.sumSq,
s.count, avg }; + } + return result; + }, "name"_a, "max_parents"_a = 100000 ) + + // --- Zone source location (file / line / function for LLM code navigation) --- + .def( "get_zone_source_location", []( const Worker& w, const std::string& name ) { + py::dict result; +#ifndef TRACY_NO_STATISTICS + for( const auto& kv : w.GetSourceLocationZones() ) + { + const auto& sl = w.GetSourceLocation( kv.first ); + if( std::string( w.GetZoneName( sl ) ) != name ) continue; + result["name"] = name; + result["function"] = std::string( w.GetString( sl.function ) ); + result["file"] = std::string( w.GetString( sl.file ) ); + result["line"] = sl.line; + result["color"] = sl.color; + break; + } +#endif + return result; + }, "name"_a ) + .def( "get_all_zone_source_locations", []( const Worker& w ) { + // Returns {zone_name: {file, line, function, color}} for every unique zone name. + // Uses first srcloc found per name — sufficient for navigation purposes. + py::dict result; +#ifndef TRACY_NO_STATISTICS + for( const auto& kv : w.GetSourceLocationZones() ) + { + const auto& sl = w.GetSourceLocation( kv.first ); + const char* name = w.GetZoneName( sl ); + if( result.contains( name ) ) continue; + py::dict d; + d["function"] = std::string( w.GetString( sl.function ) ); + d["file"] = std::string( w.GetString( sl.file ) ); + d["line"] = sl.line; + d["color"] = sl.color; + result[name] = d; + } +#endif + return result; + } ) + + // --- Per-zone callstack samples (call paths leading into a zone) --- + .def( "get_zone_callstacks", []( const Worker& w, const std::string& name, size_t maxSamples ) { + py::list result; +#ifndef TRACY_NO_STATISTICS + for( const auto& kv : w.GetSourceLocationZones() ) + { + if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue; + for( const auto& ztd : kv.second.zones ) + { + if( (size_t)result.size() >= maxSamples ) goto done_callstacks; + const auto* z = ztd.Zone(); + if( !z || !w.HasZoneExtra( *z ) ) continue; + const auto& 
extra = w.GetZoneExtra( *z ); + const uint32_t csIdx = extra.callstack.Val(); + if( csIdx == 0 ) continue; + py::list frames; + const auto& cs = w.GetCallstack( csIdx ); + for( size_t i = 0; i < cs.size(); ++i ) + { + const auto* fd = w.GetCallstackFrame( cs[i] ); + if( !fd ) continue; + for( uint8_t j = 0; j < fd->size; ++j ) + { + const auto& frame = fd->data[j]; + py::dict d; + d["name"] = std::string( w.GetString( frame.name ) ); + d["file"] = std::string( w.GetString( frame.file ) ); + d["line"] = frame.line; + d["addr"] = frame.symAddr; + frames.append( d ); + } + } + result.append( frames ); + } + } + done_callstacks:; +#endif + return result; + }, "name"_a, "max_samples"_a = 1000 ) + + // --- Symbol-level sampling stats (inclusive / exclusive counts from call-stack profiling) --- + .def( "get_symbol_stats", []( const Worker& w ) { + py::list result; + for( const auto& kv : w.GetSymbolStats() ) + { + const uint64_t addr = kv.first; + const auto& stats = kv.second; + py::dict d; + d["addr"] = addr; + d["incl"] = stats.incl; + d["excl"] = stats.excl; + const auto* sym = w.GetSymbolData( addr ); + if( sym ) + { + d["name"] = std::string( w.GetString( sym->name ) ); + d["file"] = std::string( w.GetString( sym->file ) ); + d["line"] = sym->line; + d["image"] = std::string( w.GetString( sym->imageName ) ); + } + result.append( d ); + } + return result; + } ) + + // --- Timestamps of all call-stack samples hitting a specific symbol --- + .def( "get_samples_for_symbol", []( const Worker& w, uint64_t symAddr ) { + py::list result; + const auto* samples = w.GetSamplesForSymbol( symAddr ); + if( !samples ) return result; + for( const auto& s : *samples ) + { + py::dict d; + d["time"] = s.time.Val(); + d["thread"] = (uint32_t)s.thread; + result.append( d ); + } + return result; + }, "sym_addr"_a ) + + // --- Hardware performance counter summary per symbol (IPC, cache-miss rate, branch-miss rate) --- + .def( "get_hw_sample_summary", []( const Worker& w ) { + py::list 
result; + for( const auto& kv : w.GetSymbolStats() ) + { + const uint64_t addr = kv.first; + auto* hw = const_cast( w ).GetHwSampleData( addr ); + if( !hw || ( hw->cycles.empty() && hw->retired.empty() ) ) continue; + auto mean = []( const auto& v ) -> double { + if( v.empty() ) return 0.0; + double sum = 0.0; + for( const auto& x : v ) sum += (double)x.Val(); + return sum / (double)v.size(); + }; + const double cyc = mean( hw->cycles ); + const double ret = mean( hw->retired ); + const double cmr = mean( hw->cacheRef ); + const double cmm = mean( hw->cacheMiss ); + const double brr = mean( hw->branchRetired ); + const double brm = mean( hw->branchMiss ); + py::dict d; + d["addr"] = addr; + d["samples"] = hw->cycles.empty() ? hw->retired.size() : hw->cycles.size(); + d["cycles_mean"] = cyc; + d["retired_mean"] = ret; + d["cache_ref_mean"] = cmr; + d["cache_miss_mean"] = cmm; + d["branch_ret_mean"] = brr; + d["branch_miss_mean"] = brm; + d["ipc"] = ( cyc > 0.0 && ret > 0.0 ) ? ret / cyc : -1.0; + d["cache_miss_rate"] = ( cmr > 0.0 ) ? cmm / cmr : -1.0; + d["branch_miss_rate"] = ( brr > 0.0 ) ? brm / brr : -1.0; + const auto* sym = w.GetSymbolData( addr ); + d["name"] = sym ? std::string( w.GetString( sym->name ) ) : std::string( "" ); + d["file"] = sym ? std::string( w.GetString( sym->file ) ) : std::string( "" ); + d["line"] = sym ? sym->line : 0u; + d["image"] = sym ? std::string( w.GetString( sym->imageName ) ) : std::string( "" ); + result.append( d ); + } + return result; + } ) + + // --- Raw memory allocation events (ptr, size, timestamps) for temporal zone correlation --- + .def( "get_memory_events", []( const Worker& w, size_t maxCount, const std::string& poolName ) { + py::list result; + for( const auto& kv : w.GetMemNameMap() ) + { + const std::string name = kv.first == 0 + ? 
std::string( "(default)" ) + : std::string( w.GetString( kv.first ) ); + if( !poolName.empty() && name != poolName ) continue; + const MemData* md = kv.second; + for( const auto& ev : md->data ) + { + if( (size_t)result.size() >= maxCount ) break; + py::dict d; + d["pool"] = name; + d["ptr"] = ev.Ptr(); + d["size"] = ev.Size(); + d["time_alloc"] = ev.TimeAlloc(); + d["time_free"] = ev.TimeFree(); + d["thread_alloc"] = (uint32_t)ev.ThreadAlloc(); + d["callstack_idx"] = (uint32_t)ev.CsAlloc(); + result.append( d ); + } + if( !poolName.empty() ) break; + } + return result; + }, "max_count"_a = 100000, "pool_name"_a = "" ) + + // --- Per-lock wait/contention stats (total and average wait time) --- + .def( "get_lock_wait_stats", []( const Worker& w ) { + py::list result; + for( const auto& kv : w.GetLockMap() ) + { + const LockMap* lm = kv.second; + if( !lm || !lm->valid || !lm->isContended ) continue; + std::string name; + if( lm->customName.Active() ) + name = w.GetString( lm->customName ); + else + name = w.GetZoneName( w.GetSourceLocation( lm->srcloc ) ); + int64_t totalWaitNs = 0; + uint64_t contentionCount = 0; + std::unordered_map pendingWait; + for( const auto& evPtr : lm->timeline ) + { + const auto* ev = evPtr.ptr.get(); + if( !ev ) continue; + if( ev->type == LockEvent::Type::Wait || ev->type == LockEvent::Type::WaitShared ) + { + pendingWait[ev->thread] = ev->Time(); + } + else if( ev->type == LockEvent::Type::Obtain || ev->type == LockEvent::Type::ObtainShared ) + { + auto it = pendingWait.find( ev->thread ); + if( it != pendingWait.end() ) + { + totalWaitNs += ev->Time() - it->second; + contentionCount++; + pendingWait.erase( it ); + } + } + } + if( contentionCount == 0 ) continue; + py::dict d; + d["name"] = name; + d["total_wait_ns"] = totalWaitNs; + d["avg_wait_ns"] = (double)totalWaitNs / (double)contentionCount; + d["contention_count"] = contentionCount; + d["threads"] = lm->threadList; + result.append( d ); + } + return result; + } ) + + // --- GPU 
zone stats --- + .def( "get_all_gpu_zone_stats", []( const Worker& w ) { + py::dict result; + for( const auto& kv : w.GetGpuSourceLocationZones() ) + { + const auto& sl = w.GetSourceLocation( kv.first ); + const char* name = w.GetZoneName( sl ); + const auto& s = kv.second; + const size_t cnt = s.zones.size(); + if( cnt > 0 ) + result[name] = GpuZoneStats{ s.min, s.max, s.total, s.sumSq, cnt, (double)s.total / cnt }; + } + return result; + } ).def( "get_gpu_child_zone_stats", []( const Worker& w, const std::string& name, size_t maxParents ) { + // GPU equivalent of get_child_zone_stats — returns per-child-name GPU duration stats + // for all occurrences of the named parent GPU zone. + struct Acc + { + int64_t min = INT64_MAX, max = INT64_MIN, total = 0; + double sumSq = 0.0; + size_t count = 0; + }; + std::unordered_map acc; + size_t parentCount = 0; + + auto accumChild = [&]( const GpuEvent& c ) { + if( c.GpuEnd() < 0 ) return; + const int64_t dur = c.GpuEnd() - c.GpuStart(); + if( dur < 0 ) return; + auto& s = acc[c.SrcLoc()]; + s.total += dur; + s.count++; + s.sumSq += (double)dur * dur; + if( dur < s.min ) s.min = dur; + if( dur > s.max ) s.max = dur; + }; + + for( const auto& kv : w.GetGpuSourceLocationZones() ) + { + if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue; + for( const auto& ztd : kv.second.zones ) + { + if( parentCount >= maxParents ) goto done_gpu_child; + const auto* z = ztd.Zone(); + if( !z || z->GpuEnd() < 0 || z->Child() < 0 ) continue; + parentCount++; + for( const auto& cptr : w.GetGpuChildren( z->Child() ) ) + { + if( const GpuEvent* c = cptr.get() ) accumChild( *c ); + } + } + } + done_gpu_child:; + + py::dict result; + for( const auto& kv : acc ) + { + const auto& s = kv.second; + if( s.count == 0 ) continue; + const char* cname = w.GetZoneName( w.GetSourceLocation( kv.first ) ); + result[cname] = GpuZoneStats{ s.min, s.max, s.total, s.sumSq, s.count, (double)s.total / s.count }; + } + return result; 
+ }, "name"_a, "max_parents"_a = 100000 ) + + // --- Frame sets --- + .def( "get_frame_count", []( const Worker& w ) { + auto frames = w.GetFramesBase(); + return frames ? w.GetFrameCount( *frames ) : 0; + } ).def( "get_all_frame_stats", []( const Worker& w ) { + std::vector result; + for( const auto* fd : w.GetFrames() ) + { + if( !fd ) continue; + const size_t cnt = fd->frames.size(); + const std::string name = w.GetString( fd->name ); + result.push_back( FrameStats{ + name, fd->min, fd->max, fd->total, fd->sumSq, + cnt, cnt ? (double)fd->total / cnt : 0.0 } ); + } + return result; + } ).def( "get_frame_boundaries", []( const Worker& w ) { + auto* fd = w.GetFramesBase(); + if( !fd ) return std::vector>{}; + const size_t cnt = w.GetFrameCount( *fd ); + std::vector> result; + result.reserve( cnt ); + for( size_t i = 0; i < cnt; ++i ) + result.emplace_back( w.GetFrameBegin( *fd, i ), w.GetFrameEnd( *fd, i ) ); + return result; + } ).def( "get_frame_times", []( const Worker& w ) { + auto* fd = w.GetFramesBase(); + if( !fd ) return std::vector{}; + const size_t cnt = w.GetFrameCount( *fd ); + std::vector times; + times.reserve( cnt ); + for( size_t i = 0; i < cnt; ++i ) + times.push_back( w.GetFrameTime( *fd, i ) ); + return times; + } ).def( "get_frame_times_named", []( const Worker& w, const std::string& name ) { + for( const auto* fd : w.GetFrames() ) + { + if( !fd ) continue; + if( w.GetString( fd->name ) == name ) + { + const size_t cnt = w.GetFrameCount( *fd ); + std::vector times; + times.reserve( cnt ); + for( size_t i = 0; i < cnt; ++i ) + times.push_back( w.GetFrameTime( *fd, i ) ); + return times; + } + } + return std::vector{}; + } ).def( "get_zones_in_frame", []( const Worker& w, size_t frameIdx ) { + // Returns {zone_name: {count, total_ns}} for all CPU zones that STARTED within + // the specified frame's time window. Uses sorted thread timelines for early exit. 
+ auto* fd = w.GetFramesBase(); + if( !fd || frameIdx >= (size_t)w.GetFrameCount( *fd ) ) return py::dict{}; + + const int64_t frameStart = w.GetFrameBegin( *fd, (int)frameIdx ); + const int64_t frameEnd = w.GetFrameEnd( *fd, (int)frameIdx ); + + struct Acc + { + int64_t total = 0; + size_t count = 0; + }; + std::unordered_map acc; + + // Returns false when a zone starts at or after frameEnd (prune signal + // for sorted sibling lists). Uses a local struct instead of std::function + // to avoid per-call heap allocation on the hot recursive path. + struct Visitor + { + const Worker& w; + std::unordered_map& acc; + int64_t frameStart, frameEnd; + + bool operator()( const ZoneEvent& z ) + { + if( !z.IsEndValid() ) return true; + const int64_t zs = z.Start(); + if( zs >= frameEnd ) return false; + if( zs >= frameStart ) + { + auto& s = acc[z.SrcLoc()]; + s.total += z.End() - zs; + s.count++; + } + if( z.HasChildren() && z.End() > frameStart ) + { + const auto& ch = w.GetZoneChildren( z.Child() ); + if( ch.is_magic() ) + { + for( const auto& c : *(const Vector*)&ch ) + { + if( !( *this )( c ) ) break; + } + } + else + { + for( const auto& cptr : ch ) + { + if( const ZoneEvent* c = cptr.get() ) + { + if( !( *this )( *c ) ) break; + } + } + } + } + return true; + } + } visit{ w, acc, frameStart, frameEnd }; + + for( const auto* td : w.GetThreadData() ) + { + if( !td ) continue; + if( td->timeline.is_magic() ) + { + for( const auto& z : *(const Vector*)&td->timeline ) + { + if( !visit( z ) ) break; + } + } + else + { + for( const auto& zptr : td->timeline ) + { + const ZoneEvent* z = zptr.get(); + if( z && !visit( *z ) ) break; + } + } + } + + py::dict result; + for( const auto& kv : acc ) + { + py::dict d; + d["count"] = kv.second.count; + d["total_ns"] = kv.second.total; + const char* zname = w.GetZoneName( w.GetSourceLocation( kv.first ) ); + result[zname] = d; + } + return result; + }, "frame_idx"_a ) + + // --- Messages --- + .def( "get_messages", []( const Worker& w 
) {
+            // Returns every message as MessageInfo{time, text, color, thread}.
+            const auto& msgs = w.GetMessages();
+            std::vector<MessageInfo> result;
+            result.reserve( msgs.size() );
+            for( const auto& m_ptr : msgs )
+            {
+                const auto& msg = *m_ptr;
+                // msg.thread is Tracy's compressed thread index; expose the OS thread id
+                // instead so it joins with get_threads() / get_thread_name( tid ).
+                result.push_back( MessageInfo{
+                    msg.time,
+                    std::string( w.GetString( msg.ref ) ),
+                    msg.color,
+                    w.DecompressThread( msg.thread ) } );
+            }
+            return result;
+        } )
+
+        // --- Plots ---
+        .def( "get_plots", []( const Worker& w ) {
+            // Index-to-name table for the plot type; values outside it map to "Unknown".
+            static const char* plotTypeStr[] = { "User", "Memory", "SysTime", "Power" };
+            std::vector<PlotSummary> result;
+            for( const auto* pd : w.GetPlots() )
+            {
+                if( !pd ) continue;
+                const size_t cnt = pd->data.size();
+                const std::string name = w.GetString( pd->name );
+                const char* typeStr = (uint8_t)pd->type < 4 ? plotTypeStr[(uint8_t)pd->type] : "Unknown";
+                result.push_back( PlotSummary{
+                    name, pd->min, pd->max, pd->sum,
+                    cnt, cnt ? pd->sum / cnt : 0.0,
+                    std::string( typeStr ) } );
+            }
+            return result;
+        } )
+
+        // --- Memory pools ---
+        .def( "get_memory_pools", []( const Worker& w ) {
+            // One summary per entry of the memory name map; key 0 is reported as "(default)".
+            std::vector<MemPoolSummary> result;
+            for( const auto& kv : w.GetMemNameMap() )
+            {
+                const MemData* md = kv.second;
+                const std::string name = kv.first == 0 ? "(default)" : std::string( w.GetString( kv.first ) );
+                result.push_back( MemPoolSummary{
+                    name, md->high, md->low, md->usage, md->data.size() } );
+            }
+            return result;
+        } )
+
+        // --- Locks ---
+        .def( "get_locks", []( const Worker& w ) {
+            std::vector<LockSummary> result;
+            for( const auto& kv : w.GetLockMap() )
+            {
+                const LockMap* lm = kv.second;
+                if( !lm || !lm->valid ) continue;
+                // Prefer the user-assigned lock name; fall back to the source-location name.
+                std::string name;
+                if( lm->customName.Active() )
+                    name = w.GetString( lm->customName );
+                else
+                    name = w.GetZoneName( w.GetSourceLocation( lm->srcloc ) );
+                const char* typeStr = lm->type == LockType::Lockable ?
"Lockable" : "SharedLockable"; + result.push_back( LockSummary{ + name, + lm->isContended, + std::string( typeStr ), + lm->timeAnnounce, + lm->timeTerminate, + lm->threadList } ); + } + return result; + } ) + + // --- GPU contexts --- + .def( "get_gpu_contexts", []( const Worker& w ) { + static const char* gpuTypeStr[] = { + "Invalid", "OpenGL", "Vulkan", "OpenCL", "Direct3D12", "Direct3D11", "Metal", "Custom", "CUDA", "Rocprof" }; + std::vector result; + for( const auto* ctx : w.GetGpuData() ) + { + if( !ctx ) continue; + const std::string name = ctx->name.Active() ? w.GetString( ctx->name ) : ""; + const uint8_t typeIdx = (uint8_t)ctx->type; + const char* typeStr = typeIdx < 10 ? gpuTypeStr[typeIdx] : "Unknown"; + result.push_back( GpuContextSummary{ + name, ctx->count, std::string( typeStr ), ctx->thread } ); + } + return result; + } ) + + // --- Threads --- + .def( "get_threads", []( const Worker& w ) { + // Returns list of dicts to avoid raw-pointer pybind11 ownership issues + py::list result; + for( const auto& t : w.GetThreadData() ) + { + if( !t ) continue; + py::dict d; + d["id"] = t->id; + d["count"] = t->count; + d["is_fiber"] = (bool)t->isFiber; + d["name"] = std::string( w.GetThreadName( t->id ) ); + result.append( d ); + } + return result; + } ).def( "get_thread_name", []( const Worker& w, uint64_t tid ) { + return w.GetThreadName( tid ); + } ) + + // --- Connection control --- + .def( "is_connected", &Worker::IsConnected ) + .def( "shutdown", &Worker::Shutdown ) + .def( "disconnect", &Worker::Disconnect ); + + // ------------------------------------------------------------------------- + // FileRead + // ------------------------------------------------------------------------- + m.def( "open_file", []( const char* path ) -> std::shared_ptr { + auto f = FileRead::Open( path ); + if( !f ) throw std::runtime_error( "Could not open file" ); + return std::shared_ptr( f ); + } ); + + py::class_>( m, "FileRead" ); + + m.def( "create_worker_from_file", []( 
std::shared_ptr f ) { + return std::make_unique( *f ); + } ); +} + +} // namespace tracy