diff --git a/extra/mcp/.gitignore b/extra/mcp/.gitignore
new file mode 100644
index 00000000..733edd20
--- /dev/null
+++ b/extra/mcp/.gitignore
@@ -0,0 +1,3 @@
+tracy_mcp.port
+tracy_mcp.pid
+*.local.sh
diff --git a/extra/mcp/eval_guide.md b/extra/mcp/eval_guide.md
new file mode 100644
index 00000000..b67ccbe8
--- /dev/null
+++ b/extra/mcp/eval_guide.md
@@ -0,0 +1,72 @@
+# Tracy MCP eval guide
+
+This document covers the bindings-layer detail that the curated catalog
+(`tracy://catalog`) and analysis guidance (`tracy://prompt`) do not.
+
+## ctx
+
+`ctx` is a `TracyServerBindings.Worker` — the same object Tracy Assist's
+C++ tools query through `Worker::Get*`. The pybind methods are the canonical
+data surface. Common entry points:
+
+- Zones: `get_all_zone_stats()` (every callsite, large), `get_root_zone_stats()`
+  (top-level zones only, useful for "where is the program spending time"),
+  `get_zone_stats(srcloc_id)`, `get_child_zone_stats(srcloc_id)` (subtract for
+  self-time), `get_zone_durations(name)`, `get_zone_count()`,
+  `get_all_zone_source_locations()`
+- GPU zones: `get_all_gpu_zone_stats()`, `get_gpu_zone_durations(...)`,
+  `get_gpu_contexts()`
+- Frames: `get_frame_count()`, `get_frame_times()`, `get_frame_times_named(name)`,
+  `get_frame_boundaries()`, `get_zones_in_frame(...)`
+- Threads: `get_threads()`, `get_thread_name(tid)`, `get_thread_context_switches(tid)`
+- Messages / plots / locks / memory / callstacks: `get_messages()`, `get_plots()`,
+  `get_locks()`, `get_memory_events()`, `get_callstack_frames(...)`
+- Capture metadata: `get_capture_name()`, `get_capture_program()`,
+  `get_first_time()`, `get_last_time()`, `get_resolution()`, `get_host_info()`
+
+Run `print([m for m in dir(ctx) if not m.startswith('_')])` for the full list.
+
+## Units and conventions
+
+- All time values returned by Worker methods are **nanoseconds** (int).
+  `get_first_time()` / `get_last_time()` bound the capture timeline.
+- `ZoneStats` fields: `count`, `total`, `min`, `max`, `avg`, `sum_sq`. `total`
+  is the inclusive aggregate; use `get_child_zone_stats(srcloc_id)` to subtract
+  child time when you need self-time.
+- `get_all_zone_stats()` returns `dict[str, ZoneStats]` keyed by an opaque label
+  of the form `'name (addr)[arch] <srcloc_id>'`. The trailing `<id>` is the
+  source-location ID — the int accepted by `get_zone_stats(int)`,
+  `get_zone_durations_by_id`, and friends. Parse it with a regex if you need
+  to join across calls.
+- Source-location IDs from `get_all_zone_source_locations()` are the join key
+  between zone-name lookups and per-callsite queries.
+
+## Translating catalog entries to ctx Python
+
+The catalog (`tracy://catalog`) lists curated queries. Each maps to a small
+Python snippet:
+
+```python
+# zone_list — top 10 hottest zones by total time
+top = sorted(ctx.get_all_zone_stats().items(),
+             key=lambda kv: kv[1].total, reverse=True)[:10]
+for k, v in top:
+    print(f"{v.total/1e6:.2f}ms  count={v.count}  {k}")
+
+# frame_list — primary frame set timing
+times = ctx.get_frame_times()  # ns per frame
+print(f"frames={len(times)}  avg={sum(times)/len(times)/1e6:.2f}ms  "
+      f"p99={sorted(times)[int(len(times)*0.99)]/1e6:.2f}ms")
+
+# zone_stats for a named zone — find the srcloc id, then drill in
+import re
+matches = [k for k in ctx.get_all_zone_stats() if k.startswith("MyFunc ")]
+sid = int(re.search(r"<(\d+)>$", matches[0]).group(1))
+stats = ctx.get_zone_stats(sid)
+```
+
+## Async mode
+
+For long-running queries pass `async_mode=True` to `eval`; it returns
+`{task_id, status: "running"}`. Poll with the `task` tool
+(`action="poll", task_id=...`).
diff --git a/extra/mcp/start_mcp.sh b/extra/mcp/start_mcp.sh
new file mode 100644
index 00000000..2b50afbe
--- /dev/null
+++ b/extra/mcp/start_mcp.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+# Start the Tracy MCP server.
+#
+# Set PYTHONPATH to the directory containing TracyServerBindings.so/.pyd.
+# Adjust the Release/Debug suffix to match your CMake build configuration.
+PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$(dirname "$0")/../../build/python/Release"
+export PYTHONPATH
+
+# Machine-local overrides (not committed). Create start_mcp.local.sh next to
+# this file to set TRACY_CAPTURES_DIR, TRACY_MCP_PORT, or any other env var:
+#   export TRACY_CAPTURES_DIR=/path/to/captures
+#   export TRACY_MCP_PORT=47380
+if [ -f "$(dirname "$0")/start_mcp.local.sh" ]; then
+    . "$(dirname "$0")/start_mcp.local.sh"
+fi
+
+exec python3 "$(dirname "$0")/tracy_mcp.py" "$@"
diff --git a/extra/mcp/tracy_mcp.py b/extra/mcp/tracy_mcp.py
new file mode 100644
index 00000000..bff765db
--- /dev/null
+++ b/extra/mcp/tracy_mcp.py
@@ -0,0 +1,596 @@
+# -*- coding: utf-8 -*-
+from __future__ import annotations
+
+import asyncio
+import atexit
+import builtins
+import concurrent.futures
+import glob
+import io
+import os
+import logging
+import re
+import socket
+import struct
+import sys
+import time
+import uuid
+from contextlib import redirect_stdout
+
+import mcp.server.fastmcp as fastmcp
+
+# Suppress noisy ASGI shutdown errors known to occur with SSE and Control-C.
+# These occur when Starlette attempts to send a 500 error after the loop is cancelled
+# but after the SSE 200 OK headers have already been sent. Global level suppression
+# is used because surgical filtering of ASGI exceptions is unreliable in this stack.
+logging.getLogger("uvicorn.error").setLevel(logging.CRITICAL)
+logging.getLogger("starlette").setLevel(logging.CRITICAL)
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_PORT_FILE = os.path.join(_HERE, "tracy_mcp.port")
+_PID_FILE  = os.path.join(_HERE, "tracy_mcp.pid")
+_PREFERRED_PORT = int(os.environ.get("TRACY_MCP_PORT", "47380"))
+
+# Shared documentation surfaces. system.prompt.md is Tracy Assist's source
+# system prompt; exposing it as an MCP resource keeps analysis guidance in
+# sync across both surfaces with no plumbing. eval_guide.md covers
+# bindings-layer detail (ctx object model, units, source-location ID joins).
+_LLM_DIR = os.path.normpath(os.path.join(_HERE, "..", "..", "profiler", "src", "llm"))
+_PROMPT_PATH = os.path.join(_LLM_DIR, "system.prompt.md")
+_EVAL_GUIDE_PATH = os.path.join(_HERE, "eval_guide.md")
+
+
+def _read_text(path: str) -> str:
+    try:
+        with open(path, encoding="utf-8") as f:
+            return f.read()
+    except Exception as e:
+        return f"(unavailable: {e})"
+
+
+# Tracy UDP broadcast packet support. Tracy clients announce themselves on
+# port 8086 with a BroadcastMessage (see public/common/TracyProtocol.hpp).
+# The dev GUI reads protocolVersion from the broadcast and refuses connection
+# on mismatch instead of hitting an opaque TCP timeout. We do the same.
+_PROTOCOL_HPP = os.path.normpath(
+    os.path.join(_HERE, "..", "..", "public", "common", "TracyProtocol.hpp")
+)
+_BROADCAST_PORT = 8086
+_PROGRAM_NAME_SIZE = 64
+
+
+def _read_bindings_protocol_version() -> int | None:
+    """Parse ProtocolVersion from TracyProtocol.hpp at startup so our 'expected'
+    version stays in sync with the bindings build without extra C++ wiring."""
+    try:
+        with open(_PROTOCOL_HPP, encoding="utf-8") as f:
+            for line in f:
+                m = re.search(r"constexpr\s+uint32_t\s+ProtocolVersion\s*=\s*(\d+)", line)
+                if m:
+                    return int(m.group(1))
+    except Exception:
+        pass
+    return None
+
+
+_OUR_PROTOCOL_VERSION = _read_bindings_protocol_version()
+
+
+def _parse_broadcast(data: bytes) -> dict | None:
+    """Parse a Tracy BroadcastMessage. Handles broadcast versions 0-3.
+
+    Fixed-field sizes (from TracyProtocol.hpp, packed):
+      v3: u16 bv, u16 lp, u32 pv, u64 pid, i32 at, char[<=64] name  (>=20 + name)
+      v2: u16 bv, u16 lp, u32 pv, i32 at, char[<=64] name           (>=12 + name)
+      v1: u32 bv, u32 pv, u32 lp, u32 at, char[<=64] name           (>=16 + name)
+      v0: u32 bv, u32 pv, u32 at, char[<=64] name                   (>=12 + name)
+
+    The programName field is variable-length on the wire — the sender writes
+    only the actual name plus null terminator, not the full 64-byte buffer.
+    """
+    if len(data) < 4:
+        return None
+
+    def _name(buf: bytes) -> str:
+        return buf[:_PROGRAM_NAME_SIZE].split(b"\0", 1)[0].decode("utf-8", "replace")
+
+    bv16 = struct.unpack_from("<H", data, 0)[0]
+    if bv16 == 3 and len(data) >= 21:
+        bv, lp, pv, pid, at = struct.unpack_from("<HHIQi", data, 0)
+        return {"broadcast_version": bv, "listen_port": lp,
+                "protocol_version": pv, "pid": pid,
+                "active_seconds": at, "program": _name(data[20:])}
+    if bv16 == 2 and len(data) >= 13:
+        bv, lp, pv, at = struct.unpack_from("<HHIi", data, 0)
+        return {"broadcast_version": bv, "listen_port": lp,
+                "protocol_version": pv, "active_seconds": at,
+                "program": _name(data[12:])}
+    bv32 = struct.unpack_from("<I", data, 0)[0]
+    if bv32 == 1 and len(data) >= 17:
+        bv, pv, lp, at = struct.unpack_from("<IIII", data, 0)
+        return {"broadcast_version": bv, "listen_port": lp,
+                "protocol_version": pv, "active_seconds": at,
+                "program": _name(data[16:])}
+    if bv32 == 0 and len(data) >= 13:
+        bv, pv, at = struct.unpack_from("<III", data, 0)
+        return {"broadcast_version": bv, "listen_port": None,
+                "protocol_version": pv, "active_seconds": at,
+                "program": _name(data[12:])}
+    return None
+
+
+async def _listen_broadcasts(timeout_s: float = 1.5) -> list[dict]:
+    """Listen briefly on UDP 8086 for Tracy client announcements.
+
+    Returns a list of parsed broadcasts (deduplicated by listen_port). Empty
+    list means no broadcast received — the target may use TRACY_ON_DEMAND,
+    a non-default broadcast port, or simply isn't running.
+    """
+    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    try:
+        s.bind(("", _BROADCAST_PORT))
+    except OSError:
+        s.close()
+        return []
+    s.setblocking(False)
+    loop = asyncio.get_running_loop()
+    seen: dict[int | None, dict] = {}
+    deadline = loop.time() + timeout_s
+    try:
+        while loop.time() < deadline:
+            remaining = deadline - loop.time()
+            if remaining <= 0:
+                break
+            try:
+                fut = loop.sock_recvfrom(s, 2048)
+                data, _addr = await asyncio.wait_for(fut, timeout=remaining)
+            except (asyncio.TimeoutError, BlockingIOError):
+                break
+            parsed = _parse_broadcast(data)
+            if parsed:
+                seen.setdefault(parsed.get("listen_port"), parsed)
+    finally:
+        s.close()
+    return list(seen.values())
+
+
+def _is_our_server_running() -> tuple[bool, int]:
+    """Return (running, port) based on the PID and port files of a prior launch.
+    Any failure (missing/corrupt file, dead process) yields (False, 0)."""
+    try:
+        with open(_PID_FILE) as pid_f, open(_PORT_FILE) as port_f:
+            pid, port = int(pid_f.read().strip()), int(port_f.read().strip())
+        if os.name == "nt":
+            # os.kill() would terminate the process on Windows; probe the port.
+            socket.create_connection(("127.0.0.1", port), timeout=0.5).close()
+        else:
+            os.kill(pid, 0)   # signal 0: existence check only; OSError if gone
+        return True, port
+    except Exception:
+        return False, 0
+
+
+def _find_free_port() -> int:
+    """Scan from preferred port upward; fall back to OS-assigned if the range is exhausted."""
+    for port in range(_PREFERRED_PORT, _PREFERRED_PORT + 16):
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            # SO_REUSEADDR tolerates lingering TIME_WAIT sockets on POSIX; on Windows
+            # it would let bind() succeed on a port that is already in use, so skip it.
+            if os.name != "nt":
+                s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            try:
+                s.bind(("127.0.0.1", port))
+                return port
+            except OSError:
+                pass
+    # Let OS assign any free port
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
+
+
+def _write_pid_and_port(port: int) -> None:
+    try:
+        with open(_PID_FILE, "w") as f:
+            f.write(str(os.getpid()))
+        with open(_PORT_FILE, "w") as f:
+            f.write(str(port))
+    except Exception:
+        pass
+
+
+def _cleanup_pid_files() -> None:
+    for path in (_PID_FILE, _PORT_FILE):
+        try:
+            os.unlink(path)
+        except Exception:
+            pass
+
+
+# Attempt to import Tracy Server bindings
+try:
+    import TracyServerBindings as tracy_server
+except ImportError:
+    sys.path.append(os.path.join(os.path.dirname(__file__), "../../build/python"))
+    try:
+        import TracyServerBindings as tracy_server
+    except ImportError:
+        tracy_server = None
+
+mcp_server = fastmcp.FastMCP("Tracy Profiler")
+executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
+
+
+class Task:
+    def __init__(self, task_id: str, code: str):
+        self.id = task_id
+        self.code = code
+        self.status = "pending"
+        self.result = None
+        self.error = None
+        self.start_time = time.time()
+        self.end_time = None
+
+
+class TracyInstance:
+    def __init__(self, name: str, worker: object | None = None):
+        self.name = name
+        self.worker = worker
+        self.path = None
+        self.mtime = None
+
+
+instances: dict[str, TracyInstance] = {}
+tasks: dict[str, Task] = {}
+captures_dir: str | None = os.environ.get("TRACY_CAPTURES_DIR")
+
+
+@mcp_server.resource("tracy://prompt")
+def _prompt_resource() -> str:
+    """Tracy Assist's analysis guidance (system.prompt.md). Contains workflows
+    for optimization, callstack inspection, and privacy rules. %TIME%, %USER%,
+    and %PROGRAMNAME% are placeholders filled by the in-app chat — ignore them
+    when reading from MCP."""
+    return _read_text(_PROMPT_PATH)
+
+
+@mcp_server.resource("tracy://eval-guide")
+def _eval_guide_resource() -> str:
+    """Bindings-layer guide for the eval tool: ctx object model, time units,
+    source-location ID semantics, and worked examples translating catalog
+    entries into ctx Python."""
+    return _read_text(_EVAL_GUIDE_PATH)
+
+
+@mcp_server.tool()
+async def list_captures() -> list[str]:
+    """List .tracy capture files in the TRACY_CAPTURES_DIR directory (non-recursive)."""
+    if not captures_dir:
+        return []
+    return sorted(glob.glob(os.path.join(captures_dir, "*.tracy")))
+
+
+@mcp_server.tool()
+async def list_instances() -> list[dict]:
+    """List all loaded Tracy instances and captures with metadata."""
+    return [
+        {
+            "id": name,
+            "path": inst.path,
+            "mtime": inst.mtime,
+            "live": inst.path is None
+        }
+        for name, inst in instances.items()
+    ]
+
+
+@mcp_server.tool()
+async def discover_instances(port_range: str = "8086-8095") -> list[dict]:
+    """
+    Scan for running Tracy-instrumented applications on local ports.
+
+    Returns a list of discovered ports that are listening.
+    """
+    start_port, end_port = map(int, port_range.split("-"))
+    discovered = []
+
+    async def check_port(port: int) -> None:
+        try:
+            _, writer = await asyncio.wait_for(
+                asyncio.open_connection("127.0.0.1", port), timeout=0.1
+            )
+            writer.close()
+            await writer.wait_closed()
+            discovered.append({"port": port, "address": "127.0.0.1"})
+        except (OSError, asyncio.TimeoutError, ConnectionRefusedError):
+            pass
+
+    await asyncio.gather(*(check_port(p) for p in range(start_port, end_port + 1)))
+    return discovered
+
+
+@mcp_server.tool()
+async def live_connect(address: str = "127.0.0.1", port: int = 8086, alias: str | None = None) -> str:
+    """
+    Connect to a live running Tracy-instrumented application.
+
+    Wraps Worker(addr, port, memoryLimit=-1). Returns the instance_id.
+    """
+    if not tracy_server:
+        return "Error: Tracy Server bindings not found."
+
+    # Pre-flight: read Tracy's UDP broadcast on port 8086 and compare protocol
+    # versions before attempting TCP. Mirrors what the Tracy GUI does so a
+    # version mismatch produces a precise error instead of an opaque timeout.
+    # Tracy clients broadcast every ~3s (TracyProfiler.cpp), so we listen a
+    # little longer to guarantee catching at least one beat.
+    broadcasts = await _listen_broadcasts(timeout_s=3.5)
+    match = next((b for b in broadcasts if b.get("listen_port") == port), None)
+    if match and _OUR_PROTOCOL_VERSION is not None:
+        if match["protocol_version"] != _OUR_PROTOCOL_VERSION:
+            return (
+                f"Protocol mismatch: target program '{match['program']}' "
+                f"announces Tracy protocol v{match['protocol_version']} on "
+                f"{address}:{port}, but these server bindings are built "
+                f"against v{_OUR_PROTOCOL_VERSION}. Rebuild the bindings or "
+                f"the target against a matching Tracy version."
+            )
+
+    try:
+        w = tracy_server.Worker(address, port)
+    except Exception as e:
+        return f"Failed to connect: {str(e)}"
+
+    # Worker construction returns immediately even on protocol failure (the
+    # bindings expose no error state — is_connected() is the only signal).
+    # Probe briefly so silent failures (e.g. TRACY_ON_DEMAND with no profiler
+    # request yet, or a target broadcasting on a non-default port) surface
+    # cleanly even when broadcast pre-flight didn't catch them.
+    deadline_s = 2.0
+    step_s = 0.1
+    elapsed = 0.0
+    while elapsed < deadline_s and not w.is_connected():
+        await asyncio.sleep(step_s)
+        elapsed += step_s
+
+    if not w.is_connected():
+        try:
+            w.shutdown()
+        except Exception:
+            pass
+        hint = ""
+        if broadcasts and not match:
+            seen = ", ".join(
+                f"'{b['program']}' on port {b.get('listen_port')} (protocol v{b['protocol_version']})"
+                for b in broadcasts
+            )
+            hint = f" Detected other Tracy broadcasts: {seen}."
+        elif not broadcasts:
+            hint = (
+                " No Tracy broadcasts were received on port 8086 in 3.5s — "
+                "the target may use TRACY_ON_DEMAND, a non-default broadcast "
+                "port, or isn't running."
+            )
+        return (
+            f"Reached {address}:{port} but the Tracy handshake did not complete "
+            f"within {deadline_s:.1f}s.{hint} Common causes: (1) the Tracy "
+            f"client version embedded in the target program differs from these "
+            f"server bindings; (2) the target was built with TRACY_ON_DEMAND "
+            f"and is awaiting a profiler request; (3) another client is "
+            f"already attached."
+        )
+
+    name = alias or f"live_{address}_{port}"
+    instances[name] = TracyInstance(name, w)
+    return (
+        f"Connected to live instance as '{name}'. "
+        f"Before your first eval, read resources tracy://prompt "
+        f"(analysis guidance) and tracy://eval-guide (ctx object model, "
+        f"ns time units, srcloc IDs)."
+    )
+
+
+@mcp_server.tool()
+async def load_capture(path: str, alias: str | None = None) -> str:
+    """
+    Load a .tracy capture file by absolute path.
+
+    Parameters:
+      path  — absolute path to a .tracy file. On Windows use backslashes
+              (e.g. 'E:\\\\traces\\\\foo.tracy').
+      alias — optional instance name; overwrites existing on collision.
+              If omitted, an ID is derived from filename and mtime.
+
+    If you don't already have a path, call `list_captures` first — it lists
+    .tracy files in the TRACY_CAPTURES_DIR environment directory.
+    """
+    if not tracy_server:
+        return "Error: Tracy Server bindings not found."
+    try:
+        mtime = os.path.getmtime(path)
+        if alias:
+            name = alias
+        else:
+            # unique name including mtime to avoid version collision
+            name = f"{os.path.basename(path)}@{int(mtime):x}"
+
+        if name in instances:
+            inst = instances[name]
+            if inst.path == path and inst.mtime == mtime:
+                return f"Instance '{name}' is already loaded and up to date."
+
+        f = tracy_server.open_file(path)
+        w = tracy_server.create_worker_from_file(f)
+        inst = TracyInstance(name, w)
+        inst.path = path
+        inst.mtime = mtime
+        instances[name] = inst
+        return (
+            f"Loaded as '{name}'. "
+            f"Before your first eval, read resources tracy://prompt "
+            f"(analysis guidance) and tracy://eval-guide (ctx object model, "
+            f"ns time units, srcloc IDs)."
+        )
+    except Exception as e:
+        return f"Failed to load: {str(e)}"
+
+
+@mcp_server.tool()
+async def unload_capture(instance_id: str) -> str:
+    """Unload a Tracy instance and release its memory."""
+    if instance_id in instances:
+        del instances[instance_id]
+        return f"Instance '{instance_id}' unloaded."
+    return f"Instance '{instance_id}' not found."
+
+
+@mcp_server.tool(name="eval")
+async def tracy_eval(code: str, instance_id: str, async_mode: bool = False) -> object:
+    """
+    Execute Python code against a specific Tracy Worker bound as `ctx`.
+
+    On first use, read the `tracy://prompt` (analysis guidance) and
+    `tracy://eval-guide` (ctx object model, units, source-location ID joins)
+    resources. Time values returned by Worker methods are nanoseconds.
+
+    If async_mode=True, returns a task_id immediately; poll via the `task` tool.
+    """
+    if instance_id not in instances:
+        return f"Error: Instance '{instance_id}' not found. Use list_instances to find valid IDs."
+
+    instance = instances[instance_id]
+    if not instance.worker:
+        return f"Error: Instance '{instance_id}' has no worker."
+
+    if not async_mode:
+        return await _execute_eval(code, instance.worker)
+
+    # Async mode: spawn task and return immediately
+    task_id = str(uuid.uuid4())
+    task = Task(task_id, code)
+    tasks[task_id] = task
+    asyncio.get_running_loop().run_in_executor(
+        executor, _run_task_sync, task, instance.worker
+    )
+    return {"task_id": task_id, "status": "running"}
+
+
+def _run_task_sync(task: Task, worker: object) -> None:
+    """Run a background eval task; a "cancelled" mark set meanwhile is preserved."""
+    task.status, outcome = "running", "failed"
+    try:
+        task.result = _execute_eval_sync(task.code, worker)
+        outcome = "completed"
+    except Exception as e:
+        task.error = str(e)
+    finally:
+        task.end_time = time.time()
+        task.status = outcome if task.status == "running" else task.status
+
+
+def _execute_eval_sync(code: str, ctx: object) -> str:
+    """Execute *code* with `ctx` bound to the Tracy worker. Captures stdout."""
+    global_vars = {
+        "__builtins__": builtins,
+        "ctx": ctx,
+        "tracy": tracy_server,
+        "instances": {name: inst.worker for name, inst in instances.items()},
+    }
+    # NOTE(security): runs arbitrary caller-supplied code in-process by design.
+    # Choose the mode at compile time only: a SyntaxError raised while the code
+    # runs must propagate rather than trigger a second execution via exec().
+    try:
+        compiled, runner = compile(code, "<eval>", "eval"), eval
+    except SyntaxError:
+        compiled, runner = compile(code, "<exec>", "exec"), exec
+    buf = io.StringIO()
+    with redirect_stdout(buf):
+        result = runner(compiled, global_vars)  # exec() returns None
+    return buf.getvalue() if result is None else str(result)
+
+
+async def _execute_eval(code: str, ctx: object) -> str:
+    """Async wrapper: runs `_execute_eval_sync` in the thread-pool executor."""
+    return await asyncio.get_running_loop().run_in_executor(
+        executor, _execute_eval_sync, code, ctx
+    )
+
+
+@mcp_server.tool()
+async def task(action: str, task_id: str | None = None) -> object:
+    """
+    Manage background analysis tasks.
+
+    Actions: poll, cancel, list
+    """
+    if action == "list":
+        return [
+            {"id": t.id, "status": t.status, "elapsed": time.time() - t.start_time}
+            for t in tasks.values()
+        ]
+
+    if not task_id or task_id not in tasks:
+        return "Error: Task ID not found."
+
+    t = tasks[task_id]
+    if action == "poll":
+        res: dict = {"id": t.id, "status": t.status}
+        if t.status == "completed":
+            res["result"] = t.result
+        elif t.status == "failed":
+            res["error"] = t.error
+        return res
+
+    if action == "cancel":
+        # Cancellation of thread-pool work is not possible post-submission;
+        # mark the task so callers know it was abandoned.
+        if t.status == "running":
+            t.status = "cancelled"
+            return f"Task {task_id} marked as cancelled."
+        return f"Task {task_id} is not running."
+
+    return "Error: Unknown action."
+
+
+@mcp_server.tool()
+async def shutdown_server() -> str:
+    """Shut down the Tracy MCP server.
+
+    Because the server runs as a singleton (SSE transport, one process shared
+    across all VS Code windows), this releases the TracyServerBindings.pyd lock
+    for all clients at once. Restart tracy_mcp.py after rebuilding.
+    """
+    import threading
+    def _exit() -> None:
+        time.sleep(0.2)   # give the tool response time to be sent
+        _cleanup_pid_files()   # os._exit() skips atexit handlers
+        os._exit(0)
+    threading.Thread(target=_exit, daemon=True).start()
+    return "Server shutting down. Restart tracy_mcp.py to reconnect."
+
+
+if __name__ == "__main__":
+    running, existing_port = _is_our_server_running()
+    if running:
+        print(
+            f"Tracy MCP already running on port {existing_port}. "
+            "All VS Code windows share that instance.",
+            file=sys.stderr,
+        )
+        sys.exit(0)
+
+    port = _find_free_port()
+    # Register cleanup here, not earlier: exiting above must keep the live server's files.
+    atexit.register(_cleanup_pid_files)
+    _write_pid_and_port(port)
+
+    print(f"Tracy MCP listening on http://127.0.0.1:{port}/sse", file=sys.stderr)
+
+    mcp_server.settings.host = "127.0.0.1"
+    mcp_server.settings.port = port
+    try:
+        mcp_server.run(transport="sse")
+    except KeyboardInterrupt:
+        print("\nTracy MCP server stopped.", file=sys.stderr)
+        sys.exit(0)
diff --git a/manual/tracy.md b/manual/tracy.md
index da295997..a17b3aef 100644
--- a/manual/tracy.md
+++ b/manual/tracy.md
@@ -1,4 +1,4 @@
----
+---
 bibliography:
 - tracy.bib
 ---
diff --git a/manual/tracy.tex b/manual/tracy.tex
index 23946940..28f9370b 100644
--- a/manual/tracy.tex
+++ b/manual/tracy.tex
@@ -2473,6 +2473,123 @@ The following additional CMake options are available when building the Python pa
 
 Be aware that the memory allocated by this buffer is global and is not freed, see section~\ref{uniquepointers}.
 
+\subsection{MCP Server}
+\label{mcpserver}
+
+Tracy provides an optional MCP (Model Context Protocol\footnote{\url{https://modelcontextprotocol.io}}) server that allows AI coding assistants to load and analyze Tracy captures as part of automated workflows. It runs as a separate Python sidecar process and does not integrate with or depend on Tracy Assist (section~\ref{tracyassist}). No Python interpreter is required to run Tracy itself.
+
+The primary use case is agentic tooling: an AI agent can load a \texttt{.tracy} capture, execute arbitrary analysis code against the \texttt{Worker} bindings (see below), and compare results across multiple captures — for example, validating that a proposed optimization reduced frame time.
+
+\subsubsection{Building}
+
+The MCP server requires the Tracy Server Python bindings, which are built alongside the client bindings when \texttt{TRACY\_CLIENT\_PYTHON} is enabled:
+
+\begin{lstlisting}
+cmake -B build -DTRACY_CLIENT_PYTHON=ON
+cmake --build build --config Release
+\end{lstlisting}
+
+\subsubsection{Running}
+
+\begin{lstlisting}
+pip install mcp
+python extra/mcp/tracy_mcp.py
+\end{lstlisting}
+
+Set the following environment variables before launching (or export them in your shell):
+
+\begin{lstlisting}
+PYTHONPATH=/path/to/tracy/build/python/Release
+TRACY_CAPTURES_DIR=/path/to/captures   # enables list_captures
+TRACY_MCP_PORT=47380                   # optional; default 47380
+\end{lstlisting}
+
+\subsubsection{Integrating with an AI assistant}
+
+The server runs as a singleton on SSE transport (port 47380 by default). Only one process loads \texttt{TracyServerBindings} regardless of how many editor windows are open; subsequent launches detect the already-running server (recorded in \texttt{extra/mcp/tracy\_mcp.pid}) and exit immediately.
+
+The server prints its URL on startup and writes the port number to \texttt{extra/mcp/tracy\_mcp.port}:
+
+\begin{lstlisting}
+Tracy MCP listening on http://127.0.0.1:47380/sse
+\end{lstlisting}
+
+Configure your AI assistant using that URL. For example, for a JSON-based MCP configuration:
+
+\begin{lstlisting}
+{
+  "mcpServers": {
+    "tracy": {
+      "url": "http://127.0.0.1:47380/sse"
+    }
+  }
+}
+\end{lstlisting}
+
+\subsubsection{Available tools}
+
+\begin{itemize}
+\item \texttt{list\_captures} --- List \texttt{*.tracy} files in \texttt{TRACY\_CAPTURES\_DIR} (top-level only).
+\item \texttt{list\_instances} --- List all captures currently loaded in the server.
+\item \texttt{load\_capture} --- Load a \texttt{.tracy} file by path, optionally giving it an alias.
+\item \texttt{unload\_capture} --- Unload a loaded instance by its ID and release its memory.
+\item \texttt{live\_connect} --- Connect to a running Tracy-instrumented application by address and port.
+\item \texttt{discover\_instances} --- Scan a port range for running Tracy-instrumented applications.
+\item \texttt{eval} --- Execute arbitrary Python against the \texttt{Worker} of the instance selected by \texttt{instance\_id} (available as \texttt{ctx}). Supports \texttt{async\_mode=True} for long-running queries.
+\item \texttt{task} --- Poll, cancel, or list background analysis tasks started with \texttt{async\_mode=True}.
+\end{itemize}
+
+\subsubsection{Worker API (available via \texttt{eval})}
+
+Inside \texttt{eval}, the variable \texttt{ctx} is a \texttt{Worker} instance. All time values are in nanoseconds. The following methods are available:
+
+\paragraph{Capture metadata}
+\begin{itemize}
+\item \texttt{get\_capture\_name()} / \texttt{get\_capture\_program()} --- Name and program string stored in the trace.
+\item \texttt{get\_host\_info()} --- OS, CPU, RAM, and compiler info as a string.
+\item \texttt{get\_resolution()} --- Timer resolution in nanoseconds.
+\item \texttt{get\_first\_time()} / \texttt{get\_last\_time()} --- Trace time range in nanoseconds.
+\end{itemize}
+
+\paragraph{CPU zones}
+\begin{itemize}
+\item \texttt{get\_all\_zone\_stats()} --- Returns a \texttt{dict[str, ZoneStats]} keyed by zone name. Each \texttt{ZoneStats} has \texttt{min}, \texttt{max}, \texttt{total}, \texttt{avg}, \texttt{count}, \texttt{sum\_sq} (all in nanoseconds). Includes nested zones.
+\item \texttt{get\_root\_zone\_stats()} --- Like \texttt{get\_all\_zone\_stats()} but aggregates only top-level zones per thread. Safe to sum across zones.
+\item \texttt{get\_zone\_stats(srcloc\_id)} --- Stats for a single source location.
+\item \texttt{get\_zone\_durations(name)} --- List of individual zone durations (ns) for distribution analysis.
+\item \texttt{get\_zone\_source\_location(name)} --- Returns \texttt{\{"name", "function", "file", "line", "color"\}} for the named zone.
+\end{itemize}
+
+\paragraph{GPU zones}
+\begin{itemize}
+\item \texttt{get\_all\_gpu\_zone\_stats()} --- Returns a \texttt{dict[str, GpuZoneStats]}.
+\item \texttt{get\_gpu\_contexts()} --- Returns a list of \texttt{GpuContextSummary} objects.
+\item \texttt{get\_gpu\_zone\_durations(name)} --- Individual GPU zone durations (ns).
+\end{itemize}
+
+\paragraph{Frames}
+\begin{itemize}
+\item \texttt{get\_frame\_times()} --- Per-frame durations (ns) for the default frame set.
+\item \texttt{get\_frame\_times\_named(name)} --- Per-frame durations for a named frame set.
+\item \texttt{get\_frame\_boundaries()} --- List of \texttt{(start\_ns, end\_ns)} tuples for each frame.
+\item \texttt{get\_frame\_count()} --- Frame count for the default frame set.
+\end{itemize}
+
+\paragraph{Threads, messages, plots, memory, and locks}
+\begin{itemize}
+\item \texttt{get\_threads()} --- List of \texttt{ThreadData} objects with \texttt{id}, \texttt{count}, \texttt{is\_fiber}.
+\item \texttt{get\_messages()} --- List of \texttt{MessageInfo} objects with \texttt{time}, \texttt{text}, \texttt{color}, \texttt{thread}.
+\item \texttt{get\_plots()} --- List of \texttt{PlotSummary} objects with \texttt{name}, \texttt{type}, \texttt{min}, \texttt{max}, \texttt{sum}, \texttt{avg}, \texttt{count}.
+\item \texttt{get\_memory\_events()} --- List of raw allocation events including pointer, size, alloc/free times, and callstack index.
+\item \texttt{get\_locks()} --- List of \texttt{LockSummary} objects. Use \texttt{get\_lock\_wait\_stats()} for contention analysis.
+\item \texttt{get\_symbol\_stats()} --- Callstack-sample hit counts per symbol. Sort by \texttt{excl} to find hot functions.
+\item \texttt{get\_callstack\_frames(callstack\_idx)} --- Resolve a callstack index to a list of \texttt{\{"name", "file", "line", "addr"\}} frames.
+\end{itemize}
+
+\subsubsection{Loading a capture}
+
+Traces must be explicitly loaded through the MCP server — opening a file in the Tracy GUI does not make it available to the server. Use \texttt{load\_capture} with the full path to a \texttt{.tracy} file, or use \texttt{list\_captures} first if \texttt{TRACY\_CAPTURES\_DIR} is configured.
+
 \subsection{Fortran API}
 \label{fortranapi}
 
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index be867665..f9e019b5 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -20,13 +20,24 @@ target_link_libraries(TracyClientBindings PUBLIC ${Python_LIBRARIES})
 target_compile_definitions(TracyClientBindings PUBLIC BUFFER_SIZE=${BUFFER_SIZE})
 target_compile_definitions(TracyClientBindings PUBLIC NAME_LENGTH=${NAME_LENGTH})
 
+include(${CMAKE_CURRENT_LIST_DIR}/../cmake/config.cmake)
+include(${CMAKE_CURRENT_LIST_DIR}/../cmake/vendor.cmake)
+include(${CMAKE_CURRENT_LIST_DIR}/../cmake/server.cmake)
+
+pybind11_add_module(TracyServerBindings SHARED bindings/ServerModule.cpp)
+target_link_libraries(TracyServerBindings PUBLIC TracyServer)
+target_link_libraries(TracyServerBindings PUBLIC ${Python_LIBRARIES})
+
 if (UNIX)
   set_target_properties(TracyClientBindings PROPERTIES
     BUILD_RPATH_USE_ORIGIN TRUE
     INSTALL_RPATH "\$ORIGIN/lib")
+  set_target_properties(TracyServerBindings PROPERTIES
+    BUILD_RPATH_USE_ORIGIN TRUE
+    INSTALL_RPATH "\$ORIGIN/lib")
 endif ()
 
-install(TARGETS TracyClientBindings
+install(TARGETS TracyClientBindings TracyServerBindings
   RUNTIME DESTINATION .
   LIBRARY DESTINATION .
 )
diff --git a/python/bindings/ServerModule.cpp b/python/bindings/ServerModule.cpp
new file mode 100644
index 00000000..1dbd0d52
--- /dev/null
+++ b/python/bindings/ServerModule.cpp
@@ -0,0 +1,1089 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <unordered_map>
+
+#ifdef _MSC_VER
+#  pragma warning( push )
+#  pragma warning( disable : 4244 4267 )  // third-party ppqsort: narrowing conversions
+#elif defined( __GNUC__ )
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wconversion"
+#  pragma GCC diagnostic ignored "-Wnarrowing"
+#endif
+#include "../../server/TracyFileRead.hpp"
+#include "../../server/TracyWorker.hpp"
+#ifdef _MSC_VER
+#  pragma warning( pop )
+#elif defined( __GNUC__ )
+#  pragma GCC diagnostic pop
+#endif
+
+namespace py = pybind11;
+using namespace pybind11::literals;
+
+namespace tracy
+{
+
+PYBIND11_MODULE( TracyServerBindings, m )
+{
+    m.doc() = "Tracy Server (Analysis) Bindings";
+
+    // -------------------------------------------------------------------------
+    // SourceLocation
+    // -------------------------------------------------------------------------
+    py::class_<SourceLocation>( m, "SourceLocation" )
+        .def_readonly( "line", &SourceLocation::line );
+
+    // -------------------------------------------------------------------------
+    // ZoneStats — POD summary returned by zone stat helpers
+    // -------------------------------------------------------------------------
+    struct ZoneStats  // plain value summary of one group of zones; time fields are in nanoseconds
+    {
+        int64_t min;    // shortest single duration in the group
+        int64_t max;    // longest single duration in the group
+        int64_t total;  // sum of all durations
+        double sumSq;   // sum of squared durations (exposed to Python as sum_sq)
+        size_t count;   // number of zones summarised
+        double avg;     // total / count, computed by the caller that fills the struct
+    };
+
+    py::class_<ZoneStats>( m, "ZoneStats" )
+        .def_readonly( "min", &ZoneStats::min )
+        .def_readonly( "max", &ZoneStats::max )
+        .def_readonly( "total", &ZoneStats::total )
+        .def_readonly( "sum_sq", &ZoneStats::sumSq )
+        .def_readonly( "count", &ZoneStats::count )
+        .def_readonly( "avg", &ZoneStats::avg );
+
+    // GpuZoneStats — GPU timestamps are the same int64_t nanosecond type;
+    // reuse ZoneStats rather than duplicating the struct.
+    using GpuZoneStats = ZoneStats;
+
+    // -------------------------------------------------------------------------
+    // FrameStats — per-frame-set timing summary
+    // -------------------------------------------------------------------------
+    struct FrameStats
+    {
+        std::string name;
+        int64_t min;
+        int64_t max;
+        int64_t total;
+        double sumSq;
+        size_t count;
+        double avg;
+    };
+
+    py::class_<FrameStats>( m, "FrameStats" )
+        .def_readonly( "name", &FrameStats::name )
+        .def_readonly( "min", &FrameStats::min )
+        .def_readonly( "max", &FrameStats::max )
+        .def_readonly( "total", &FrameStats::total )
+        .def_readonly( "sum_sq", &FrameStats::sumSq )
+        .def_readonly( "count", &FrameStats::count )
+        .def_readonly( "avg", &FrameStats::avg );
+
+    // -------------------------------------------------------------------------
+    // PlotSummary
+    // -------------------------------------------------------------------------
+    struct PlotSummary
+    {
+        std::string name;
+        double min;
+        double max;
+        double sum;
+        size_t count;
+        double avg;
+        std::string type;
+    };
+
+    py::class_<PlotSummary>( m, "PlotSummary" )
+        .def_readonly( "name", &PlotSummary::name )
+        .def_readonly( "min", &PlotSummary::min )
+        .def_readonly( "max", &PlotSummary::max )
+        .def_readonly( "sum", &PlotSummary::sum )
+        .def_readonly( "count", &PlotSummary::count )
+        .def_readonly( "avg", &PlotSummary::avg )
+        .def_readonly( "type", &PlotSummary::type );
+
+    // -------------------------------------------------------------------------
+    // MemPoolSummary
+    // -------------------------------------------------------------------------
+    struct MemPoolSummary
+    {
+        std::string name;
+        uint64_t high;
+        uint64_t low;
+        uint64_t usage;
+        size_t alloc_count;
+    };
+
+    py::class_<MemPoolSummary>( m, "MemPoolSummary" )
+        .def_readonly( "name", &MemPoolSummary::name )
+        .def_readonly( "high", &MemPoolSummary::high )
+        .def_readonly( "low", &MemPoolSummary::low )
+        .def_readonly( "usage", &MemPoolSummary::usage )
+        .def_readonly( "alloc_count", &MemPoolSummary::alloc_count );
+
+    // -------------------------------------------------------------------------
+    // LockSummary
+    // -------------------------------------------------------------------------
+    struct LockSummary
+    {
+        std::string name;
+        bool is_contended;
+        std::string type;
+        int64_t time_announce;
+        int64_t time_terminate;
+        std::vector<uint64_t> threads;
+    };
+
+    py::class_<LockSummary>( m, "LockSummary" )
+        .def_readonly( "name", &LockSummary::name )
+        .def_readonly( "is_contended", &LockSummary::is_contended )
+        .def_readonly( "type", &LockSummary::type )
+        .def_readonly( "time_announce", &LockSummary::time_announce )
+        .def_readonly( "time_terminate", &LockSummary::time_terminate )
+        .def_readonly( "threads", &LockSummary::threads );
+
+    // -------------------------------------------------------------------------
+    // GpuContextSummary
+    // -------------------------------------------------------------------------
+    struct GpuContextSummary
+    {
+        std::string name;
+        uint64_t count;
+        std::string type;
+        uint64_t thread;
+    };
+
+    py::class_<GpuContextSummary>( m, "GpuContextSummary" )
+        .def_readonly( "name", &GpuContextSummary::name )
+        .def_readonly( "count", &GpuContextSummary::count )
+        .def_readonly( "type", &GpuContextSummary::type )
+        .def_readonly( "thread", &GpuContextSummary::thread );
+
+    // -------------------------------------------------------------------------
+    // MessageInfo
+    // -------------------------------------------------------------------------
+    struct MessageInfo
+    {
+        int64_t time;
+        std::string text;
+        uint32_t color;
+        uint64_t thread;
+    };
+
+    py::class_<MessageInfo>( m, "MessageInfo" )
+        .def_readonly( "time", &MessageInfo::time )
+        .def_readonly( "text", &MessageInfo::text )
+        .def_readonly( "color", &MessageInfo::color )
+        .def_readonly( "thread", &MessageInfo::thread );
+
+    // ThreadData — get_threads() returns plain dicts to avoid pybind11
+    // raw-pointer ownership issues, so no class registration is needed.
+
+    // -------------------------------------------------------------------------
+    // Worker
+    // -------------------------------------------------------------------------
+    auto worker_cls = py::class_<Worker>( m, "Worker" );
+    worker_cls
+        // Construction
+        .def( py::init<const char*, uint16_t, int64_t>(), "addr"_a, "port"_a, "memoryLimit"_a = -1 )
+
+        // --- Capture metadata ---
+        .def( "get_capture_name", &Worker::GetCaptureName )
+        .def( "get_capture_program", &Worker::GetCaptureProgram )
+        .def( "get_capture_time", &Worker::GetCaptureTime )
+        .def( "get_host_info", &Worker::GetHostInfo )
+        .def( "get_pid", &Worker::GetPid )
+        .def( "get_resolution", &Worker::GetResolution )
+        .def( "get_first_time", &Worker::GetFirstTime )
+        .def( "get_last_time", &Worker::GetLastTime )
+        .def( "get_cpu_manufacturer", &Worker::GetCpuManufacturer )
+
+        // --- Counts ---
+        .def( "get_zone_count", &Worker::GetZoneCount )
+        .def( "get_gpu_zone_count", &Worker::GetGpuZoneCount )
+        .def( "get_lock_count", &Worker::GetLockCount )
+        .def( "get_plot_count", &Worker::GetPlotCount )
+        .def( "get_context_switch_count", &Worker::GetContextSwitchCount )
+        .def( "get_src_loc_count", &Worker::GetSrcLocCount )
+        .def( "get_callstack_sample_count", &Worker::GetCallstackSampleCount )
+        .def( "get_message_count", []( const Worker& w ) {
+        return w.GetMessages().size();
+    } )
+
+        // --- Source locations / zones ---
+        .def( "get_src_loc", []( const Worker& w, int16_t id ) {
+        return w.GetSourceLocation( id );
+    } ).def( "get_zone_name", []( const Worker& w, int16_t id ) {
+        return w.GetZoneName( w.GetSourceLocation( id ) );
+    } )
+#ifndef TRACY_NO_STATISTICS
+        .def( "get_zone_stats", []( const Worker& w, int16_t id ) {
+        const auto& stats = w.GetZonesForSourceLocation( id );
+        const size_t cnt = stats.zones.size();
+        return ZoneStats{ stats.min, stats.max, stats.total, stats.sumSq, cnt, cnt ? (double)stats.total / cnt : 0.0 };
+    } )
+#endif
+        .def( "get_all_zone_stats", []( const Worker& w ) {  // returns {zone name: ZoneStats}; the dict stays empty when built with TRACY_NO_STATISTICS
+        py::dict result;
+#ifndef TRACY_NO_STATISTICS
+        for( const auto& kv : w.GetSourceLocationZones() )
+        {
+            const auto& stats = kv.second;
+            if( stats.zones.size() == 0 ) continue;  // skip source locations without zones; also keeps the avg division below safe
+            const auto& sl = w.GetSourceLocation( kv.first );
+            const char* name = w.GetZoneName( sl );
+            const size_t cnt = stats.zones.size();
+            result[name] = ZoneStats{ stats.min, stats.max, stats.total, stats.sumSq, cnt, (double)stats.total / cnt };  // NOTE(review): keyed by name, so source locations sharing a name overwrite each other — confirm intended
+        }
+#endif
+        return result;
+    } ).def( "get_root_zone_stats", []( const Worker& w ) {
+            // Aggregate stats for top-level (root) zones only — no nesting, safe to sum. Returns {zone name: ZoneStats}.
+            // File-loaded data uses is_magic() — zones stored inline, not as short_ptr
+        struct Acc
+        {
+            int64_t min = INT64_MAX, max = INT64_MIN, total = 0;
+            double sumSq = 0;
+            size_t count = 0;
+        };
+        std::unordered_map<int16_t, Acc> acc;  // keyed by source-location id
+        auto processRoot = [&]( const ZoneEvent& z ) {
+            if( !z.IsEndValid() ) return;  // zones without a valid end are not counted
+            const int64_t dur = z.End() - z.Start();
+            auto& s = acc[z.SrcLoc()];
+            s.total += dur;
+            s.sumSq += (double)dur * dur;  // keep sum_sq meaningful, same accumulation as get_child_zone_stats
+            s.count++;
+            if( dur < s.min ) s.min = dur;
+            if( dur > s.max ) s.max = dur;
+        };
+        for( const auto* td : w.GetThreadData() )
+        {
+            if( !td ) continue;
+            if( td->timeline.is_magic() )
+            {
+                for( const auto& z : *(const Vector<ZoneEvent>*)&td->timeline ) processRoot( z );
+            }
+            else
+            {
+                for( const auto& zptr : td->timeline )
+                {
+                    if( const ZoneEvent* z = zptr.get() ) processRoot( *z );
+                }
+            }
+        }
+        py::dict result;
+        for( const auto& kv : acc )
+        {
+            const auto& s = kv.second;  // entries are only created in processRoot, so s.count >= 1 here
+            const char* name = w.GetZoneName( w.GetSourceLocation( kv.first ) );  // NOTE(review): distinct srclocs sharing a name overwrite each other below
+            result[name] = ZoneStats{ s.min, s.max, s.total, s.sumSq, s.count, (double)s.total / (double)s.count };
+        }
+        return result;
+    } )
+
+        // --- Per-occurrence zone data (for temporal correlation / distribution) ---
+        .def( "get_zone_durations", []( const Worker& w, const std::string& name, size_t maxSamples ) {
+            // Accumulates across ALL srclocs with this name (same name can appear at multiple srclocs)
+        std::vector<int64_t> result;
+#ifndef TRACY_NO_STATISTICS
+        for( const auto& kv : w.GetSourceLocationZones() )
+        {
+            if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue;
+            for( const auto& ztd : kv.second.zones )
+            {
+                if( result.size() >= maxSamples ) goto done_durations;
+                const auto* z = ztd.Zone();
+                if( z && z->IsEndValid() ) result.push_back( z->End() - z->Start() );
+            }
+        }
+    done_durations:;
+#endif
+        return result;
+    }, "name"_a, "max_samples"_a = 100000 )
+        .def( "get_zone_occurrences", []( const Worker& w, const std::string& name, size_t maxSamples ) {
+            // Returns list of (start_ns, duration_ns) — accumulates across all srclocs with this name
+        std::vector<std::pair<int64_t, int64_t>> result;
+#ifndef TRACY_NO_STATISTICS
+        for( const auto& kv : w.GetSourceLocationZones() )
+        {
+            if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue;
+            for( const auto& ztd : kv.second.zones )
+            {
+                if( result.size() >= maxSamples ) goto done_occurrences;
+                const auto* z = ztd.Zone();
+                if( z && z->IsEndValid() ) result.emplace_back( z->Start(), z->End() - z->Start() );
+            }
+        }
+    done_occurrences:;
+#endif
+        return result;
+    }, "name"_a, "max_samples"_a = 100000 )
+        .def( "get_zone_annotations", []( const Worker& w, const std::string& name, size_t maxSamples ) {
+            // Returns text annotations attached to individual zone occurrences
+        std::vector<std::string> result;
+#ifndef TRACY_NO_STATISTICS
+        for( const auto& kv : w.GetSourceLocationZones() )
+        {
+            if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue;
+            for( const auto& ztd : kv.second.zones )
+            {
+                if( result.size() >= maxSamples ) goto done_annotations;
+                const auto* z = ztd.Zone();
+                if( z && w.HasZoneExtra( *z ) )
+                {
+                    const auto& extra = w.GetZoneExtra( *z );
+                    if( extra.text.Active() ) result.push_back( w.GetString( extra.text ) );
+                }
+            }
+        }
+    done_annotations:;
+#endif
+        return result;
+    }, "name"_a, "max_samples"_a = 10000 )
+        .def( "get_gpu_zone_durations", []( const Worker& w, const std::string& name, size_t maxSamples ) {
+        std::vector<int64_t> result;
+        for( const auto& kv : w.GetGpuSourceLocationZones() )
+        {
+            if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue;
+            for( const auto& ztd : kv.second.zones )
+            {
+                if( result.size() >= maxSamples ) goto done_gpu_dur;
+                const auto* z = ztd.Zone();
+                if( z && z->GpuEnd() >= 0 ) result.push_back( z->GpuEnd() - z->GpuStart() );
+            }
+        }
+    done_gpu_dur:;
+        return result;
+    }, "name"_a, "max_samples"_a = 100000 )
+        .def( "get_gpu_zone_occurrences", []( const Worker& w, const std::string& name, size_t maxSamples ) {
+        std::vector<std::pair<int64_t, int64_t>> result;
+        for( const auto& kv : w.GetGpuSourceLocationZones() )
+        {
+            if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue;
+            for( const auto& ztd : kv.second.zones )
+            {
+                if( result.size() >= maxSamples ) goto done_gpu_occ;
+                const auto* z = ztd.Zone();
+                if( z && z->GpuEnd() >= 0 ) result.emplace_back( z->GpuStart(), z->GpuEnd() - z->GpuStart() );
+            }
+        }
+    done_gpu_occ:;
+        return result;
+    }, "name"_a, "max_samples"_a = 100000 )
+
+        // --- Callstack resolution: callstack index -> list of {"name", "file", "line", "addr"} dicts (empty list for index 0) ---
+        .def( "get_callstack_frames", []( const Worker& w, uint32_t callstackIdx ) {
+        py::list result;
+        if( callstackIdx == 0 ) return result;  // 0 is treated as "no callstack", the same guard get_zone_callstacks applies
+        const auto& cs = w.GetCallstack( callstackIdx );
+        for( size_t i = 0; i < cs.size(); ++i )
+        {
+            const auto* fd = w.GetCallstackFrame( cs[i] );
+            for( uint8_t j = 0; fd && j < fd->size; ++j )  // a null fd (no frame data) contributes nothing
+            {
+                const auto& frame = fd->data[j];
+                py::dict d;
+                d["name"] = std::string( w.GetString( frame.name ) );
+                d["file"] = std::string( w.GetString( frame.file ) );
+                d["line"] = frame.line;
+                d["addr"] = frame.symAddr;
+                result.append( d );
+            }
+        }
+        return result;
+    }, "callstack_idx"_a )
+
+        // --- Context switches per thread ---
+        .def( "get_thread_context_switches", []( const Worker& w, uint64_t tid, size_t maxSamples ) {
+        py::list result;
+        const auto* cs = const_cast<Worker&>( w ).GetContextSwitchData( tid );
+        if( !cs ) return result;
+        for( const auto& ev : cs->v )
+        {
+            if( (size_t)result.size() >= maxSamples ) break;
+            if( !ev.IsEndValid() ) continue;
+            py::dict d;
+            d["start"] = ev.Start();
+            d["end"] = ev.End();
+            d["cpu"] = (int)ev.Cpu();
+            d["reason"] = (int)ev.Reason();
+            result.append( d );
+        }
+        return result;
+    }, "tid"_a, "max_samples"_a = 50000 )
+
+        // --- CPU thread running time / migrations ---
+        .def( "get_cpu_thread_data", []( const Worker& w ) {
+        py::dict result;
+        for( const auto& kv : w.GetCpuThreadData() )
+        {
+            py::dict d;
+            d["running_time"] = kv.second.runningTime;
+            d["running_regions"] = kv.second.runningRegions;
+            d["migrations"] = kv.second.migrations;
+            result[py::int_( kv.first )] = d;
+        }
+        return result;
+    } )
+
+        // --- Zone occurrences with thread attribution ---
+        .def( "get_zone_occurrences_with_thread", []( const Worker& w, const std::string& name, size_t maxSamples ) {
+            // Returns list of (start_ns, duration_ns, thread_id) — thread_id is the OS thread ID; at most max_samples entries
+        std::vector<std::tuple<int64_t, int64_t, uint64_t>> result;
+#ifndef TRACY_NO_STATISTICS
+            // Accumulates across all srclocs with this name; zones without a valid end are skipped.
+        for( const auto& kv : w.GetSourceLocationZones() )
+        {
+            if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue;
+            for( const auto& ztd : kv.second.zones )
+            {
+                if( result.size() >= maxSamples ) goto done_occ_thread;
+                const auto* z = ztd.Zone();
+                if( !z || !z->IsEndValid() ) continue;
+                    // ztd.Thread() is Tracy's compressed thread index, not a position in GetThreadData(); let the worker expand it.
+                const uint64_t tid = w.DecompressThread( ztd.Thread() );
+                result.emplace_back( z->Start(), z->End() - z->Start(), tid );
+            }
+        }
+    done_occ_thread:;
+#endif
+        return result;
+    }, "name"_a, "max_samples"_a = 100000 )
+
+        // --- Child zone stats: aggregate direct children of all occurrences of a parent zone ---
+        .def( "get_child_zone_stats", []( const Worker& w, const std::string& name, size_t maxParents ) {
+            // Uses SourceLocationZones for O(occurrences) lookup — avoids walking the full zone tree.
+            // File-loaded data sets is_magic() on child vectors (inline ZoneEvent, not short_ptr).
+        struct Acc
+        {
+            int64_t min = INT64_MAX, max = INT64_MIN, total = 0;
+            double sumSq = 0.0;
+            size_t count = 0;
+        };
+        std::unordered_map<int16_t, Acc> acc;
+        size_t parentCount = 0;
+
+        auto accumChild = [&]( const ZoneEvent& c ) {
+            if( !c.IsEndValid() ) return;
+            const int64_t dur = c.End() - c.Start();
+            auto& s = acc[c.SrcLoc()];
+            s.total += dur;
+            s.count++;
+            s.sumSq += (double)dur * dur;
+            if( dur < s.min ) s.min = dur;
+            if( dur > s.max ) s.max = dur;
+        };
+
+#ifndef TRACY_NO_STATISTICS
+        for( const auto& kv : w.GetSourceLocationZones() )
+        {
+            if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue;
+            for( const auto& ztd : kv.second.zones )
+            {
+                if( parentCount >= maxParents ) goto done_children;
+                const auto* z = ztd.Zone();
+                if( !z || !z->IsEndValid() || !z->HasChildren() ) continue;
+                parentCount++;
+                const auto& ch = w.GetZoneChildren( z->Child() );
+                if( ch.is_magic() )
+                {
+                    for( const auto& c : *(const Vector<ZoneEvent>*)&ch ) accumChild( c );
+                }
+                else
+                {
+                    for( const auto& cptr : ch )
+                    {
+                        if( const ZoneEvent* c = cptr.get() ) accumChild( *c );
+                    }
+                }
+            }
+        }
+    done_children:;
+#endif
+        py::dict result;
+        for( const auto& kv : acc )
+        {
+            const auto& s = kv.second;
+            if( s.count == 0 ) continue;
+            const double avg = (double)s.total / (double)s.count;
+            const char* cname = w.GetZoneName( w.GetSourceLocation( kv.first ) );
+            result[cname] = ZoneStats{ s.min, s.max, s.total, s.sumSq, s.count, avg };
+        }
+        return result;
+    }, "name"_a, "max_parents"_a = 100000 )
+
+        // --- Zone source location (file / line / function for LLM code navigation) ---
+        .def( "get_zone_source_location", []( const Worker& w, const std::string& name ) {
+        py::dict result;
+#ifndef TRACY_NO_STATISTICS
+        for( const auto& kv : w.GetSourceLocationZones() )
+        {
+            const auto& sl = w.GetSourceLocation( kv.first );
+            if( std::string( w.GetZoneName( sl ) ) != name ) continue;
+            result["name"] = name;
+            result["function"] = std::string( w.GetString( sl.function ) );
+            result["file"] = std::string( w.GetString( sl.file ) );
+            result["line"] = sl.line;
+            result["color"] = sl.color;
+            break;
+        }
+#endif
+        return result;
+    }, "name"_a )
+        .def( "get_all_zone_source_locations", []( const Worker& w ) {
+            // Returns {zone_name: {file, line, function, color}} for every unique zone name.
+            // Uses first srcloc found per name — sufficient for navigation purposes.
+        py::dict result;
+#ifndef TRACY_NO_STATISTICS
+        for( const auto& kv : w.GetSourceLocationZones() )
+        {
+            const auto& sl = w.GetSourceLocation( kv.first );
+            const char* name = w.GetZoneName( sl );
+            if( result.contains( name ) ) continue;
+            py::dict d;
+            d["function"] = std::string( w.GetString( sl.function ) );
+            d["file"] = std::string( w.GetString( sl.file ) );
+            d["line"] = sl.line;
+            d["color"] = sl.color;
+            result[name] = d;
+        }
+#endif
+        return result;
+    } )
+
+        // --- Per-zone callstack samples (call paths leading into a zone) ---
+        .def( "get_zone_callstacks", []( const Worker& w, const std::string& name, size_t maxSamples ) {
+        py::list result;
+#ifndef TRACY_NO_STATISTICS
+        for( const auto& kv : w.GetSourceLocationZones() )
+        {
+            if( std::string( w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) != name ) continue;
+            for( const auto& ztd : kv.second.zones )
+            {
+                if( (size_t)result.size() >= maxSamples ) goto done_callstacks;
+                const auto* z = ztd.Zone();
+                if( !z || !w.HasZoneExtra( *z ) ) continue;
+                const auto& extra = w.GetZoneExtra( *z );
+                const uint32_t csIdx = extra.callstack.Val();
+                if( csIdx == 0 ) continue;
+                py::list frames;
+                const auto& cs = w.GetCallstack( csIdx );
+                for( size_t i = 0; i < cs.size(); ++i )
+                {
+                    const auto* fd = w.GetCallstackFrame( cs[i] );
+                    if( !fd ) continue;
+                    for( uint8_t j = 0; j < fd->size; ++j )
+                    {
+                        const auto& frame = fd->data[j];
+                        py::dict d;
+                        d["name"] = std::string( w.GetString( frame.name ) );
+                        d["file"] = std::string( w.GetString( frame.file ) );
+                        d["line"] = frame.line;
+                        d["addr"] = frame.symAddr;
+                        frames.append( d );
+                    }
+                }
+                result.append( frames );
+            }
+        }
+    done_callstacks:;
+#endif
+        return result;
+    }, "name"_a, "max_samples"_a = 1000 )
+
+        // --- Symbol-level sampling stats (inclusive / exclusive counts from call-stack profiling) ---
+        .def( "get_symbol_stats", []( const Worker& w ) {
+        py::list result;
+        for( const auto& kv : w.GetSymbolStats() )
+        {
+            const uint64_t addr = kv.first;
+            const auto& stats = kv.second;
+            py::dict d;
+            d["addr"] = addr;
+            d["incl"] = stats.incl;
+            d["excl"] = stats.excl;
+            const auto* sym = w.GetSymbolData( addr );
+            if( sym )
+            {
+                d["name"] = std::string( w.GetString( sym->name ) );
+                d["file"] = std::string( w.GetString( sym->file ) );
+                d["line"] = sym->line;
+                d["image"] = std::string( w.GetString( sym->imageName ) );
+            }
+            result.append( d );
+        }
+        return result;
+    } )
+
+        // --- Timestamps of all call-stack samples hitting a specific symbol ---
+        .def( "get_samples_for_symbol", []( const Worker& w, uint64_t symAddr ) {
+        py::list result;
+        const auto* samples = w.GetSamplesForSymbol( symAddr );
+        if( !samples ) return result;
+        for( const auto& s : *samples )
+        {
+            py::dict d;
+            d["time"] = s.time.Val();
+            d["thread"] = (uint32_t)s.thread;
+            result.append( d );
+        }
+        return result;
+    }, "sym_addr"_a )
+
+        // --- Hardware performance counter summary per symbol (IPC, cache-miss rate, branch-miss rate) ---
+        .def( "get_hw_sample_summary", []( const Worker& w ) {
+        py::list result;
+        for( const auto& kv : w.GetSymbolStats() )
+        {
+            const uint64_t addr = kv.first;
+            auto* hw = const_cast<Worker&>( w ).GetHwSampleData( addr );
+            if( !hw || ( hw->cycles.empty() && hw->retired.empty() ) ) continue;
+            auto mean = []( const auto& v ) -> double {
+                if( v.empty() ) return 0.0;
+                double sum = 0.0;
+                for( const auto& x : v ) sum += (double)x.Val();
+                return sum / (double)v.size();
+            };
+            const double cyc = mean( hw->cycles );
+            const double ret = mean( hw->retired );
+            const double cmr = mean( hw->cacheRef );
+            const double cmm = mean( hw->cacheMiss );
+            const double brr = mean( hw->branchRetired );
+            const double brm = mean( hw->branchMiss );
+            py::dict d;
+            d["addr"] = addr;
+            d["samples"] = hw->cycles.empty() ? hw->retired.size() : hw->cycles.size();
+            d["cycles_mean"] = cyc;
+            d["retired_mean"] = ret;
+            d["cache_ref_mean"] = cmr;
+            d["cache_miss_mean"] = cmm;
+            d["branch_ret_mean"] = brr;
+            d["branch_miss_mean"] = brm;
+            d["ipc"] = ( cyc > 0.0 && ret > 0.0 ) ? ret / cyc : -1.0;
+            d["cache_miss_rate"] = ( cmr > 0.0 ) ? cmm / cmr : -1.0;
+            d["branch_miss_rate"] = ( brr > 0.0 ) ? brm / brr : -1.0;
+            const auto* sym = w.GetSymbolData( addr );
+            d["name"] = sym ? std::string( w.GetString( sym->name ) ) : std::string( "" );
+            d["file"] = sym ? std::string( w.GetString( sym->file ) ) : std::string( "" );
+            d["line"] = sym ? sym->line : 0u;
+            d["image"] = sym ? std::string( w.GetString( sym->imageName ) ) : std::string( "" );
+            result.append( d );
+        }
+        return result;
+    } )
+
+        // --- Raw memory allocation events (ptr, size, timestamps) for temporal zone correlation ---
+        .def( "get_memory_events", []( const Worker& w, size_t maxCount, const std::string& poolName ) {  // list of per-allocation dicts; an empty pool_name selects every pool
+        py::list result;
+        for( const auto& kv : w.GetMemNameMap() )
+        {
+            const std::string name = kv.first == 0
+                                         ? std::string( "(default)" )
+                                         : std::string( w.GetString( kv.first ) );  // map key 0 is reported under the name "(default)"
+            if( !poolName.empty() && name != poolName ) continue;
+            const MemData* md = kv.second;
+            for( const auto& ev : md->data )
+            {
+                if( (size_t)result.size() >= maxCount ) break;  // cap applies to the total across pools; this only leaves the inner loop
+                py::dict d;
+                d["pool"] = name;
+                d["ptr"] = ev.Ptr();
+                d["size"] = ev.Size();
+                d["time_alloc"] = ev.TimeAlloc();
+                d["time_free"] = ev.TimeFree();
+                d["thread_alloc"] = (uint32_t)ev.ThreadAlloc();  // NOTE(review): presumably Tracy's compressed thread index rather than an OS thread id — confirm
+                d["callstack_idx"] = (uint32_t)ev.CsAlloc();  // allocation callstack index; presumably the value get_callstack_frames() accepts
+                result.append( d );
+            }
+            if( !poolName.empty() ) break;  // the requested pool has been handled; stop scanning the map
+        }
+        return result;
+    }, "max_count"_a = 100000, "pool_name"_a = "" )
+
+        // --- Per-lock wait/contention stats (total and average wait time) ---
+        .def( "get_lock_wait_stats", []( const Worker& w ) {  // one dict per contended lock that has at least one completed wait -> obtain pair
+        py::list result;
+        for( const auto& kv : w.GetLockMap() )
+        {
+            const LockMap* lm = kv.second;
+            if( !lm || !lm->valid || !lm->isContended ) continue;  // only valid locks flagged as contended are reported
+            std::string name;
+            if( lm->customName.Active() )
+                name = w.GetString( lm->customName );
+            else
+                name = w.GetZoneName( w.GetSourceLocation( lm->srcloc ) );  // no custom name: fall back to the lock's source-location name
+            int64_t totalWaitNs = 0;
+            uint64_t contentionCount = 0;
+            std::unordered_map<uint8_t, int64_t> pendingWait;  // ev->thread -> time of that thread's wait not yet matched by an obtain
+            for( const auto& evPtr : lm->timeline )
+            {
+                const auto* ev = evPtr.ptr.get();
+                if( !ev ) continue;
+                if( ev->type == LockEvent::Type::Wait || ev->type == LockEvent::Type::WaitShared )
+                {
+                    pendingWait[ev->thread] = ev->Time();  // a repeated wait on the same thread replaces the earlier timestamp
+                }
+                else if( ev->type == LockEvent::Type::Obtain || ev->type == LockEvent::Type::ObtainShared )
+                {
+                    auto it = pendingWait.find( ev->thread );
+                    if( it != pendingWait.end() )  // an obtain with no recorded wait adds nothing to the totals
+                    {
+                        totalWaitNs += ev->Time() - it->second;
+                        contentionCount++;
+                        pendingWait.erase( it );
+                    }
+                }
+            }
+            if( contentionCount == 0 ) continue;  // nothing to report; also keeps the average division below safe
+            py::dict d;
+            d["name"] = name;
+            d["total_wait_ns"] = totalWaitNs;
+            d["avg_wait_ns"] = (double)totalWaitNs / (double)contentionCount;
+            d["contention_count"] = contentionCount;
+            d["threads"] = lm->threadList;
+            result.append( d );
+        }
+        return result;
+    } )
+
+        // --- GPU zone stats ---
+        .def( "get_all_gpu_zone_stats", []( const Worker& w ) {
+            // {zone_name: GpuZoneStats}; source locations sharing a name overwrite each other.
+        py::dict result;
+        for( const auto& kv : w.GetGpuSourceLocationZones() )
+        {
+            const auto& s = kv.second;
+            const size_t cnt = s.zones.size();
+            if( cnt == 0 ) continue;  // nothing recorded, and avoids dividing by zero below
+            const char* name = w.GetZoneName( w.GetSourceLocation( kv.first ) );
+            result[name] = GpuZoneStats{ s.min, s.max, s.total, s.sumSq, cnt, (double)s.total / cnt };
+        }
+        return result;
+    } ).def( "get_gpu_child_zone_stats", []( const Worker& w, const std::string& name, size_t maxParents ) {
+            // GPU counterpart of get_child_zone_stats: duration stats of direct children, by child name.
+        struct Acc
+        {
+            int64_t min = INT64_MAX, max = INT64_MIN, total = 0;
+            double sumSq = 0.0;
+            size_t count = 0;
+        };
+        std::unordered_map<int16_t, Acc> acc;  // keyed by child source location
+        auto accumChild = [&acc]( const GpuEvent& c ) {
+            const int64_t dur = c.GpuEnd() - c.GpuStart();
+            if( c.GpuEnd() < 0 || dur < 0 ) return;  // skip children with a negative end or duration
+            auto& s = acc[c.SrcLoc()];
+            s.total += dur;
+            s.count++;
+            s.sumSq += (double)dur * dur;
+            if( dur < s.min ) s.min = dur;
+            if( dur > s.max ) s.max = dur;
+        };
+            // A "magic" vector holds the events inline (assumed to match the CPU zone layout), not pointers.
+        auto accumChildren = [&accumChild]( const auto& ch ) {
+            if( ch.is_magic() )
+                for( const auto& c : *(const Vector<GpuEvent>*)&ch ) accumChild( c );
+            else
+                for( const auto& cptr : ch )
+                    if( const GpuEvent* c = cptr.get() ) accumChild( *c );
+        };
+        size_t parentCount = 0;  // parents actually scanned; capped by max_parents
+        for( const auto& kv : w.GetGpuSourceLocationZones() )
+        {
+            if( parentCount >= maxParents ) break;
+            if( name != w.GetZoneName( w.GetSourceLocation( kv.first ) ) ) continue;
+            for( const auto& ztd : kv.second.zones )
+            {
+                if( parentCount >= maxParents ) break;
+                const auto* z = ztd.Zone();
+                if( !z || z->GpuEnd() < 0 || z->Child() < 0 ) continue;  // negative end or no children
+                parentCount++;
+                accumChildren( w.GetGpuChildren( z->Child() ) );
+            }
+        }
+        py::dict result;
+        for( const auto& kv : acc )
+        {
+            const auto& s = kv.second;
+            if( s.count == 0 ) continue;
+            const char* cname = w.GetZoneName( w.GetSourceLocation( kv.first ) );
+            result[cname] = GpuZoneStats{ s.min, s.max, s.total, s.sumSq, s.count, (double)s.total / s.count };
+        }
+        return result;
+    }, "name"_a, "max_parents"_a = 100000 )
+
+        // --- Frame sets ---
+        .def( "get_frame_count", []( const Worker& w ) {
+        const auto* base = w.GetFramesBase();  // may be null, in which case 0 is reported
+        return base ? w.GetFrameCount( *base ) : 0;
+    } ).def( "get_all_frame_stats", []( const Worker& w ) {
+            // One FrameStats per non-null frame set; avg is 0.0 for a set without frames.
+        std::vector<FrameStats> stats;
+        for( const auto* frameSet : w.GetFrames() )
+        {
+            if( !frameSet ) continue;
+            const size_t n = frameSet->frames.size();
+            stats.push_back( FrameStats{
+                std::string( w.GetString( frameSet->name ) ), frameSet->min, frameSet->max, frameSet->total,
+                frameSet->sumSq, n, n ? (double)frameSet->total / n : 0.0 } );
+        }
+        return stats;
+    } ).def( "get_frame_boundaries", []( const Worker& w ) {
+            // (begin, end) pair for every frame of the base set; empty when there is no base set.
+        std::vector<std::pair<int64_t, int64_t>> spans;
+        const auto* base = w.GetFramesBase();
+        if( !base ) return spans;
+        const size_t n = w.GetFrameCount( *base );
+        spans.reserve( n );
+        for( size_t i = 0; i < n; ++i ) spans.emplace_back( w.GetFrameBegin( *base, i ), w.GetFrameEnd( *base, i ) );
+        return spans;
+    } ).def( "get_frame_times", []( const Worker& w ) {
+            // GetFrameTime of every frame of the base set, in index order; empty without a base set.
+        std::vector<int64_t> times;
+        const auto* base = w.GetFramesBase();
+        if( !base ) return times;
+        const size_t n = w.GetFrameCount( *base );
+        times.reserve( n );
+        for( size_t i = 0; i < n; ++i ) times.push_back( w.GetFrameTime( *base, i ) );
+        return times;
+    } ).def( "get_frame_times_named", []( const Worker& w, const std::string& name ) {
+            // Like get_frame_times, but for the first frame set whose name equals `name`;
+            // an empty list is returned when no set matches.
+        std::vector<int64_t> times;
+        for( const auto* frameSet : w.GetFrames() )
+        {
+            if( !frameSet ) continue;
+            if( name != w.GetString( frameSet->name ) ) continue;
+            const size_t n = w.GetFrameCount( *frameSet );
+            times.reserve( n );
+            for( size_t i = 0; i < n; ++i )
+                times.push_back( w.GetFrameTime( *frameSet, i ) );
+            break;
+        }
+        return times;
+    } ).def( "get_zones_in_frame", []( const Worker& w, size_t frameIdx ) {
+            // Returns {zone_name: {count, total_ns}} for all finished CPU zones that STARTED within
+            // frame `frame_idx` of the base set ({} if out of range). Unfinished zones are not counted,
+            // but their children are; source locations sharing a name overwrite one another.
+        auto* fd = w.GetFramesBase();
+        if( !fd || frameIdx >= (size_t)w.GetFrameCount( *fd ) ) return py::dict{};
+
+        const int64_t frameStart = w.GetFrameBegin( *fd, frameIdx );
+        const int64_t frameEnd = w.GetFrameEnd( *fd, frameIdx );
+
+        struct Acc
+        {
+            int64_t total = 0;
+            size_t count = 0;
+        };
+        std::unordered_map<int16_t, Acc> acc;
+
+            // Returns false when a zone starts at or after frameEnd (prune signal for sorted sibling
+            // lists). A local struct avoids std::function allocation on the hot recursive path.
+        struct Visitor
+        {
+            const Worker& w;
+            std::unordered_map<int16_t, Acc>& acc;
+            int64_t frameStart, frameEnd;
+
+            bool operator()( const ZoneEvent& z )
+            {
+                const int64_t zs = z.Start();
+                if( zs >= frameEnd ) return false;
+                const bool finished = z.IsEndValid();
+                if( finished && zs >= frameStart )
+                {
+                    auto& s = acc[z.SrcLoc()];
+                    s.total += z.End() - zs;
+                    s.count++;
+                }
+                if( z.HasChildren() && ( !finished || z.End() > frameStart ) )  // open zones still span the frame
+                {
+                    const auto& ch = w.GetZoneChildren( z.Child() );
+                    if( ch.is_magic() )
+                    {
+                        for( const auto& c : *(const Vector<ZoneEvent>*)&ch )
+                        {
+                            if( !( *this )( c ) ) break;
+                        }
+                    }
+                    else
+                    {
+                        for( const auto& cptr : ch )
+                        {
+                            if( const ZoneEvent* c = cptr.get() )
+                            {
+                                if( !( *this )( *c ) ) break;
+                            }
+                        }
+                    }
+                }
+                return true;
+            }
+        } visit{ w, acc, frameStart, frameEnd };
+
+        for( const auto* td : w.GetThreadData() )
+        {
+            if( !td ) continue;
+            if( td->timeline.is_magic() )
+            {
+                for( const auto& z : *(const Vector<ZoneEvent>*)&td->timeline )
+                {
+                    if( !visit( z ) ) break;
+                }
+            }
+            else
+            {
+                for( const auto& zptr : td->timeline )
+                {
+                    const ZoneEvent* z = zptr.get();
+                    if( z && !visit( *z ) ) break;
+                }
+            }
+        }
+
+        py::dict result;
+        for( const auto& kv : acc )
+        {
+            py::dict d;
+            d["count"] = kv.second.count;
+            d["total_ns"] = kv.second.total;
+            const char* zname = w.GetZoneName( w.GetSourceLocation( kv.first ) );
+            result[zname] = d;
+        }
+        return result;
+    }, "frame_idx"_a )
+
+        // --- Messages ---
+        .def( "get_messages", []( const Worker& w ) {
+            // Copies every message into a MessageInfo (time, text, color, thread), in stored order.
+            // NOTE(review): msg.thread is widened as-is; confirm it is a real thread id and not a
+            // compact per-capture index that would need translating before it matches get_threads().
+        const auto& source = w.GetMessages();
+        std::vector<MessageInfo> out;
+        out.reserve( source.size() );
+        for( const auto& holder : source )
+        {
+            const auto& msg = *holder;
+            out.push_back(
+                MessageInfo{ msg.time, std::string( w.GetString( msg.ref ) ), msg.color, (uint64_t)msg.thread } );
+        }
+        return out;
+    } )
+
+        // --- Plots ---
+        .def( "get_plots", []( const Worker& w ) {
+            // One PlotSummary per non-null plot; type values past the label table read "Unknown".
+        static const char* kTypeLabels[] = { "User", "Memory", "SysTime", "Power" };
+        std::vector<PlotSummary> summaries;
+        for( const auto* plot : w.GetPlots() )
+        {
+            if( plot == nullptr ) continue;
+            const auto typeIdx = (uint8_t)plot->type;
+            const std::string label = typeIdx < 4 ? kTypeLabels[typeIdx] : "Unknown";
+            const size_t samples = plot->data.size();
+            const auto mean = samples ? plot->sum / samples : 0.0;  // 0.0 for a plot without samples
+            summaries.push_back( PlotSummary{
+                std::string( w.GetString( plot->name ) ), plot->min, plot->max, plot->sum, samples, mean, label } );
+        }
+        return summaries;
+    } )
+
+        // --- Memory pools ---
+        .def( "get_memory_pools", []( const Worker& w ) {
+            // One MemPoolSummary per entry of the memory name map; key 0 is reported as "(default)".
+        std::vector<MemPoolSummary> pools;
+        for( const auto& entry : w.GetMemNameMap() )
+        {
+            const MemData& mem = *entry.second;
+            const std::string label = entry.first != 0 ? std::string( w.GetString( entry.first ) ) : "(default)";
+            pools.push_back( MemPoolSummary{ label, mem.high, mem.low, mem.usage, mem.data.size() } );
+        }
+        return pools;
+    } )
+
+        // --- Locks ---
+        .def( "get_locks", []( const Worker& w ) {
+            // One LockSummary per valid lock: display name (custom name if set, otherwise the
+            // source-location zone name), contention flag, kind label, announce/terminate times
+            // and the lock's thread list.
+        std::vector<LockSummary> locks;
+        for( const auto& entry : w.GetLockMap() )
+        {
+            const LockMap* lock = entry.second;
+            if( lock == nullptr || !lock->valid ) continue;
+            const std::string label = lock->customName.Active()
+                ? w.GetString( lock->customName )
+                : w.GetZoneName( w.GetSourceLocation( lock->srcloc ) );
+                // Every type other than Lockable is labelled SharedLockable.
+            const bool exclusive = lock->type == LockType::Lockable;
+            locks.push_back( LockSummary{
+                label, lock->isContended,
+                std::string( exclusive ? "Lockable" : "SharedLockable" ),
+                lock->timeAnnounce, lock->timeTerminate,
+                lock->threadList } );
+        }
+        return locks;
+    } )
+
+        // --- GPU contexts ---
+        .def( "get_gpu_contexts", []( const Worker& w ) {
+            // One GpuContextSummary per non-null context; contexts without an active name get "".
+        static const char* kApiLabels[] = { "Invalid", "OpenGL", "Vulkan", "OpenCL", "Direct3D12",
+                                            "Direct3D11", "Metal", "Custom", "CUDA", "Rocprof" };
+        std::vector<GpuContextSummary> contexts;
+        for( const auto* gpu : w.GetGpuData() )
+        {
+            if( gpu == nullptr ) continue;
+            const std::string label = gpu->name.Active() ? w.GetString( gpu->name ) : "";
+            const auto apiIdx = (uint8_t)gpu->type;
+            const std::string api( apiIdx < 10 ? kApiLabels[apiIdx] : "Unknown" );  // past the table
+            contexts.push_back( GpuContextSummary{ label, gpu->count, api, gpu->thread } );
+        }
+        return contexts;
+    } )
+
+        // --- Threads ---
+        .def( "get_threads", []( const Worker& w ) {
+            // Plain dicts {id, count, is_fiber, name} sidestep raw-pointer pybind11 ownership issues.
+        py::list threads;
+        for( const auto& td : w.GetThreadData() )
+        {
+            if( !td ) continue;
+            py::dict info;
+            info["id"] = td->id;
+            info["count"] = td->count;
+            info["is_fiber"] = (bool)td->isFiber;
+            info["name"] = std::string( w.GetThreadName( td->id ) );
+            threads.append( info );
+        }
+        return threads;
+    } ).def( "get_thread_name", []( const Worker& w, uint64_t tid ) {
+        return w.GetThreadName( tid );  // looked up by the same id that get_threads reports as "id"
+    } )
+
+        // --- Connection control: bound directly to the Worker member functions; ends the .def chain ---
+        .def( "is_connected", &Worker::IsConnected )
+        .def( "shutdown", &Worker::Shutdown )
+        .def( "disconnect", &Worker::Disconnect );
+
+    // -------------------------------------------------------------------------
+    // FileRead: open_file(path) returns a shared FileRead, or throws runtime_error naming the path.
+    // -------------------------------------------------------------------------
+    m.def( "open_file", []( const char* path ) -> std::shared_ptr<FileRead> {
+        auto* raw = path ? FileRead::Open( path ) : nullptr;  // a null path (None) never reaches Open
+        if( !raw ) throw std::runtime_error( std::string( "Could not open file: " ) + ( path ? path : "(null)" ) );
+        return std::shared_ptr<FileRead>( raw );
+    } );
+
+    py::class_<FileRead, std::shared_ptr<FileRead>>( m, "FileRead" );  // opaque handle, no methods bound
+    m.def( "create_worker_from_file", []( std::shared_ptr<FileRead> f ) {  // builds a Worker from *f
+        if( !f ) throw std::runtime_error( "create_worker_from_file: file is None" );  // empty holder
+        return std::make_unique<Worker>( *f );
+    } );
+}
+
+} // namespace tracy