API exploration for kernel metrics

2026-06-26 00:58:56 +00:00 · 2026-06-24 16:13:03 -07:00
1 changed files with 319 additions and 0 deletions
--- a/public/tracy/TracyCUDA.hpp
+++ b/public/tracy/TracyCUDA.hpp
@@ -10,6 +10,10 @@
 #define TracyCUDAStartProfiling(ctx)
 #define TracyCUDAStopProfiling(ctx)

+#define TracyCUDAEnableKernelMetrics(ctx)
+#define TracyCUDADisableKernelMetrics(ctx)
+#define TracyCUDADumpKernelMetrics(ctx)
+
 #define TracyCUDACollect(ctx)

 namespace tracy{
@@ -57,6 +61,24 @@ using CUDACtx = std::nullptr_t;
 #define TRACY_CUDA_ENABLE_CUDA_CALL_STATS (0)
 #endif//TRACY_CUDA_ENABLE_CUDA_CALL_STATS

+// Opt-in CUPTI Range Profiler support: collects hardware metrics for kernel
+// dispatches / graph launches and surfaces them as Tracy plots. OFF by default
+// because it pulls in an extra dependency (the NVPW host library) and may
+// require elevated GPU performance-counter permissions at runtime.
+#ifndef TRACY_CUDA_ENABLE_KERNEL_METRICS
+#define TRACY_CUDA_ENABLE_KERNEL_METRICS (0)
+#endif//TRACY_CUDA_ENABLE_KERNEL_METRICS
+
+#if TRACY_CUDA_ENABLE_KERNEL_METRICS
+#if CUDA_VERSION < 12060
+#error "kernel metrics require CUDA v12.6 (or later)"
+#endif
+#include <cupti_target.h>
+#include <cupti_profiler_target.h>
+#include <cupti_profiler_host.h>
+#include <cupti_range_profiler.h>
+#endif//TRACY_CUDA_ENABLE_KERNEL_METRICS
+
 namespace {

 // TODO(marcos): wrap these in structs for better type safety
@@ -568,6 +590,22 @@ namespace tracy
            printStats();
        }

+        void EnableKernelMetrics()
+        {
+            ZoneScoped;
+            #if TRACY_CUDA_ENABLE_KERNEL_METRICS
+            CUPTI::BeginKernelMetrics(this);
+            #endif//TRACY_CUDA_ENABLE_KERNEL_METRICS
+        }
+
+        void DisableKernelMetrics()
+        {
+            ZoneScoped;
+            #if TRACY_CUDA_ENABLE_KERNEL_METRICS
+            CUPTI::EndKernelMetrics();
+            #endif//TRACY_CUDA_ENABLE_KERNEL_METRICS
+        }
+
        void Name(const char *name, uint16_t len)
        {
            auto ptr = (char*)tracyMalloc(len);
@@ -1263,6 +1301,262 @@ namespace tracy
            //CUPTI_ACTIVITY_KIND_DRIVER,
        };

+        #if TRACY_CUDA_ENABLE_KERNEL_METRICS
+        // Curated set of PerfWorks metrics collected per range (dispatch / graph
+        // launch). Kept deliberately small so the whole set fits in a SINGLE
+        // collection pass (no kernel replay): all are high-level "% of peak"
+        // rollups, so each maps to only a few raw counters. BeginKernelMetrics
+        // asserts the resulting config needs exactly one pass; trim this list if
+        // a given architecture's counter budget is exceeded. Note that some
+        // metric names can vary across GPU architectures.
+        static constexpr const char* kKernelMetricNames[] = {
+            // Compute efficiency
+            "sm__throughput.avg.pct_of_peak_sustained_elapsed",                 // overall SM/compute throughput
+            "smsp__inst_executed.avg.per_cycle_active",                         // instructions issued per active cycle (IPC)
+            // Memory utilization
+            "gpu__compute_memory_throughput.avg.pct_of_peak_sustained_elapsed", // overall memory-subsystem throughput
+            "dram__throughput.avg.pct_of_peak_sustained_elapsed",               // device-memory (DRAM) bandwidth
+            "lts__t_sector_hit_rate.pct",                                       // L2 cache hit rate
+            // Occupancy / latency hiding
+            "sm__warps_active.avg.pct_of_peak_sustained_active",                // achieved occupancy
+            // Stalls (issue-slot utilization: low value => warps stalled)
+            "smsp__issue_active.avg.pct_of_peak_sustained_active",
+        };
+        static constexpr size_t kKernelMetricCount = sizeof(kKernelMetricNames) / sizeof(kKernelMetricNames[0]);
+
+        // Upper bound on ranges (dispatches / graph launches) buffered in one
+        // counter-data image before it must be decoded and reset. Tunable; sizes
+        // the counter-data allocation and caps ranges-per-pass at SetConfig time.
+        static constexpr size_t kMaxRangesPerPass = 64;
+
+        // Range Profiler bring-up: one-time process-wide profiler init plus
+        // creation of a Range Profiler object bound to the current CUDA context.
+        // Metric configuration (chip name + curated single-pass metric list ->
+        // config image), cuptiRangeProfilerSetConfig() in user-range mode, and
+        // cuptiRangeProfilerStart() are wired up in a later step.
+        static void BeginKernelMetrics(CUDACtx* profilerHost) {
+            ZoneScoped;
+            UNREFERENCED(profilerHost);
+            auto& rangeProfiler = PersistentState::Get().rangeProfiler;
+            if (rangeProfiler != nullptr) {
+                return; // already initialized
+            }
+
+            // Process-wide CUPTI profiler initialization (idempotent).
+            CUpti_Profiler_Initialize_Params initParams = { CUpti_Profiler_Initialize_Params_STRUCT_SIZE };
+            CUPTI_API_CALL(cuptiProfilerInitialize(&initParams));
+
+            // Bind a Range Profiler object to the calling thread's current
+            // context (a null ctx tells CUPTI to use the current context).
+            CUcontext cuCtx = nullptr;
+            DRIVER_API_CALL(cuCtxGetCurrent(&cuCtx));
+            CUpti_RangeProfiler_Enable_Params createParams = { CUpti_RangeProfiler_Enable_Params_STRUCT_SIZE };
+            createParams.ctx = cuCtx;
+            CUPTI_API_CALL(cuptiRangeProfilerEnable(&createParams));
+            rangeProfiler = createParams.pRangeProfilerObject;
+
+            // Query the device chip name: the host profiler is chip-specific and
+            // needs it to resolve metric names into hardware counters.
+            CUdevice cuDevice = 0;
+            DRIVER_API_CALL(cuCtxGetDevice(&cuDevice));
+            CUpti_Device_GetChipName_Params chipNameParams = { CUpti_Device_GetChipName_Params_STRUCT_SIZE };
+            chipNameParams.deviceIndex = (size_t)cuDevice;
+            CUPTI_API_CALL(cuptiDeviceGetChipName(&chipNameParams));
+            auto& chipName = PersistentState::Get().chipName;
+            chipName = chipNameParams.pChipName;
+
+            // Snapshot which counters are actually available on this context so
+            // the host can reject unsupported metrics up front. The first call
+            // reports the image size; the second fills the allocated buffer.
+            auto& counterAvailabilityImage = PersistentState::Get().counterAvailabilityImage;
+            CUpti_Profiler_GetCounterAvailability_Params availabilityParams = { CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE };
+            availabilityParams.ctx = cuCtx;
+            CUPTI_API_CALL(cuptiProfilerGetCounterAvailability(&availabilityParams));
+            counterAvailabilityImage.resize(availabilityParams.counterAvailabilityImageSize);
+            availabilityParams.pCounterAvailabilityImage = counterAvailabilityImage.data();
+            CUPTI_API_CALL(cuptiProfilerGetCounterAvailability(&availabilityParams));
+
+            // Create the host profiler object: it builds metric config images
+            // and later evaluates counter data into metric values.
+            CUpti_Profiler_Host_Initialize_Params hostInitParams = { CUpti_Profiler_Host_Initialize_Params_STRUCT_SIZE };
+            hostInitParams.profilerType = CUPTI_PROFILER_TYPE_RANGE_PROFILER;
+            hostInitParams.pChipName = chipName.c_str();
+            hostInitParams.pCounterAvailabilityImage = counterAvailabilityImage.data();
+            CUPTI_API_CALL(cuptiProfilerHostInitialize(&hostInitParams));
+            auto* hostObject = hostInitParams.pHostObject;
+            PersistentState::Get().profilerHostObject = hostObject;
+
+            // Add the curated metric list to the host config and bake a config
+            // image describing the hardware counters to collect.
+            CUpti_Profiler_Host_ConfigAddMetrics_Params addMetricsParams = { CUpti_Profiler_Host_ConfigAddMetrics_Params_STRUCT_SIZE };
+            addMetricsParams.pHostObject = hostObject;
+            addMetricsParams.ppMetricNames = const_cast<const char**>(kKernelMetricNames);
+            addMetricsParams.numMetrics = kKernelMetricCount;
+            CUPTI_API_CALL(cuptiProfilerHostConfigAddMetrics(&addMetricsParams));
+
+            auto& configImage = PersistentState::Get().configImage;
+            CUpti_Profiler_Host_GetConfigImageSize_Params configSizeParams = { CUpti_Profiler_Host_GetConfigImageSize_Params_STRUCT_SIZE };
+            configSizeParams.pHostObject = hostObject;
+            CUPTI_API_CALL(cuptiProfilerHostGetConfigImageSize(&configSizeParams));
+            configImage.resize(configSizeParams.configImageSize);
+            CUpti_Profiler_Host_GetConfigImage_Params configImageParams = { CUpti_Profiler_Host_GetConfigImage_Params_STRUCT_SIZE };
+            configImageParams.pHostObject = hostObject;
+            configImageParams.pConfigImage = configImage.data();
+            configImageParams.configImageSize = configImage.size();
+            CUPTI_API_CALL(cuptiProfilerHostGetConfigImage(&configImageParams));
+
+            // Enforce the single-pass (no-replay) design constraint up front: if
+            // the selected metrics don't fit one pass, trim kKernelMetricNames.
+            CUpti_Profiler_Host_GetNumOfPasses_Params numPassesParams = { CUpti_Profiler_Host_GetNumOfPasses_Params_STRUCT_SIZE };
+            numPassesParams.pConfigImage = configImage.data();
+            numPassesParams.configImageSize = configImage.size();
+            CUPTI_API_CALL(cuptiProfilerHostGetNumOfPasses(&numPassesParams));
+            if (numPassesParams.numOfPasses != 1) {
+                fprintf(stderr, "ERROR:\tTracyCUDA kernel metrics need a single-pass config, "
+                    "but the selected metrics require %llu passes; trim kKernelMetricNames.\n",
+                    (unsigned long long)numPassesParams.numOfPasses);
+                assert(numPassesParams.numOfPasses == 1);
+            }
+
+            // Allocate and initialize the counter-data image the Range Profiler
+            // fills during collection (sized for the metric set and range batch).
+            auto& counterDataImage = PersistentState::Get().counterDataImage;
+            CUpti_RangeProfiler_GetCounterDataSize_Params counterDataSizeParams = { CUpti_RangeProfiler_GetCounterDataSize_Params_STRUCT_SIZE };
+            counterDataSizeParams.pRangeProfilerObject = rangeProfiler;
+            counterDataSizeParams.pMetricNames = const_cast<const char**>(kKernelMetricNames);
+            counterDataSizeParams.numMetrics = kKernelMetricCount;
+            counterDataSizeParams.maxNumOfRanges = kMaxRangesPerPass;
+            counterDataSizeParams.maxNumRangeTreeNodes = kMaxRangesPerPass;
+            CUPTI_API_CALL(cuptiRangeProfilerGetCounterDataSize(&counterDataSizeParams));
+            counterDataImage.resize(counterDataSizeParams.counterDataSize);
+            CUpti_RangeProfiler_CounterDataImage_Initialize_Params counterDataInitParams = { CUpti_RangeProfiler_CounterDataImage_Initialize_Params_STRUCT_SIZE };
+            counterDataInitParams.pRangeProfilerObject = rangeProfiler;
+            counterDataInitParams.counterDataSize = counterDataImage.size();
+            counterDataInitParams.pCounterData = counterDataImage.data();
+            CUPTI_API_CALL(cuptiRangeProfilerCounterDataImageInitialize(&counterDataInitParams));
+
+            // Bind config + counter-data to the Range Profiler in user-range mode.
+            // A single pass means the user-replay loop runs exactly once, so there
+            // is no kernel re-execution and no forced serialization of overlap.
+            CUpti_RangeProfiler_SetConfig_Params setConfigParams = { CUpti_RangeProfiler_SetConfig_Params_STRUCT_SIZE };
+            setConfigParams.pRangeProfilerObject = rangeProfiler;
+            setConfigParams.pConfig = configImage.data();
+            setConfigParams.configSize = configImage.size();
+            setConfigParams.pCounterDataImage = counterDataImage.data();
+            setConfigParams.counterDataImageSize = counterDataImage.size();
+            setConfigParams.range = CUPTI_UserRange;
+            setConfigParams.replayMode = CUPTI_UserReplay;
+            setConfigParams.maxRangesPerPass = kMaxRangesPerPass;
+            setConfigParams.numNestingLevels = 1;
+            setConfigParams.minNestingLevel = 1;
+            CUPTI_API_CALL(cuptiRangeProfilerSetConfig(&setConfigParams));
+        }
+
+        static void EndKernelMetrics() {
+            ZoneScoped;
+            auto& rangeProfiler = PersistentState::Get().rangeProfiler;
+            if (rangeProfiler == nullptr) {
+                return;
+            }
+            // TODO(metrics): cuptiRangeProfilerStop() once Start() is wired up.
+            CUpti_RangeProfiler_Disable_Params destroyParams = { CUpti_RangeProfiler_Disable_Params_STRUCT_SIZE };
+            destroyParams.pRangeProfilerObject = rangeProfiler;
+            CUPTI_API_CALL(cuptiRangeProfilerDisable(&destroyParams));
+            rangeProfiler = nullptr;
+
+            auto& profilerHostObject = PersistentState::Get().profilerHostObject;
+            if (profilerHostObject != nullptr) {
+                CUpti_Profiler_Host_Deinitialize_Params hostDeinitParams = { CUpti_Profiler_Host_Deinitialize_Params_STRUCT_SIZE };
+                hostDeinitParams.pHostObject = profilerHostObject;
+                CUPTI_API_CALL(cuptiProfilerHostDeinitialize(&hostDeinitParams));
+                profilerHostObject = nullptr;
+            }
+        }
+
+        // Debug utility: print host/chip information and the metrics available
+        // for collection on the current device. Self-contained — spins up a
+        // temporary host object and tears it down, so it is safe to call without
+        // an active profiling session. Output goes to stdout and can be verbose.
+        static void DumpAvailableMetrics() {
+            ZoneScoped;
+            CUcontext cuCtx = nullptr;
+            DRIVER_API_CALL(cuCtxGetCurrent(&cuCtx));
+            if (cuCtx == nullptr) {
+                fprintf(stderr, "TracyCUDA: no current CUDA context; cannot list metrics.\n");
+                return;
+            }
+
+            // Device identity (human-readable), independent of the profiler.
+            CUdevice cuDevice = 0;
+            DRIVER_API_CALL(cuCtxGetDevice(&cuDevice));
+            char deviceName[256] = {};
+            DRIVER_API_CALL(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
+            int ccMajor = 0, ccMinor = 0;
+            DRIVER_API_CALL(cuDeviceGetAttribute(&ccMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+            DRIVER_API_CALL(cuDeviceGetAttribute(&ccMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+
+            // Profiler-side chip name (what the host object keys metrics on).
+            CUpti_Profiler_Initialize_Params initParams = { CUpti_Profiler_Initialize_Params_STRUCT_SIZE };
+            CUPTI_API_CALL(cuptiProfilerInitialize(&initParams));
+            CUpti_Device_GetChipName_Params chipNameParams = { CUpti_Device_GetChipName_Params_STRUCT_SIZE };
+            chipNameParams.deviceIndex = (size_t)cuDevice;
+            CUPTI_API_CALL(cuptiDeviceGetChipName(&chipNameParams));
+
+            fprintf(stdout, "\nTracyCUDA kernel-metrics host info:\n");
+            fprintf(stdout, "  Device : %s (compute capability %d.%d)\n", deviceName, ccMajor, ccMinor);
+            fprintf(stdout, "  Chip   : %s\n", chipNameParams.pChipName);
+
+            // Listing metrics is a pure host-side query keyed on the chip name,
+            // so it must NOT touch the device: we deliberately skip
+            // cuptiProfilerGetCounterAvailability() (which reserves the perf
+            // counters and is admin-gated -> INSUFFICIENT_PRIVILEGES / HARDWARE_BUSY)
+            // and pass a null counter-availability image. Per the CUPTI docs the
+            // image is only required for chips newer than the CUPTI build; for a
+            // chip known at release time the name alone enumerates every metric.
+            // The listing therefore reflects everything the chip defines rather
+            // than what this context can currently collect — which is exactly
+            // what a static "available metrics" dump should report.
+            CUpti_Profiler_Host_Initialize_Params hostInitParams = { CUpti_Profiler_Host_Initialize_Params_STRUCT_SIZE };
+            hostInitParams.profilerType = CUPTI_PROFILER_TYPE_RANGE_PROFILER;
+            hostInitParams.pChipName = chipNameParams.pChipName;
+            hostInitParams.pCounterAvailabilityImage = nullptr;
+            CUPTI_API_CALL(cuptiProfilerHostInitialize(&hostInitParams));
+            auto* hostObject = hostInitParams.pHostObject;
+
+            // Enumerate base metrics per type, and each base metric's submetrics
+            // (the fully-qualified, collectible names like "<base>.avg.pct_...").
+            const CUpti_MetricType metricTypes[] = {
+                CUPTI_METRIC_TYPE_COUNTER,
+                CUPTI_METRIC_TYPE_RATIO,
+                CUPTI_METRIC_TYPE_THROUGHPUT,
+            };
+            const char* const metricTypeNames[] = { "counter", "ratio", "throughput" };
+            for (size_t t = 0; t < sizeof(metricTypes) / sizeof(metricTypes[0]); ++t) {
+                CUpti_Profiler_Host_GetBaseMetrics_Params baseParams = { CUpti_Profiler_Host_GetBaseMetrics_Params_STRUCT_SIZE };
+                baseParams.pHostObject = hostObject;
+                baseParams.metricType = metricTypes[t];
+                CUPTI_API_CALL(cuptiProfilerHostGetBaseMetrics(&baseParams));
+                fprintf(stdout, "\n  %zu %s metrics:\n", (size_t)baseParams.numMetrics, metricTypeNames[t]);
+                for (size_t i = 0; i < baseParams.numMetrics; ++i) {
+                    const char* baseName = baseParams.ppMetricNames[i];
+                    fprintf(stdout, "    %s\n", baseName);
+                    CUpti_Profiler_Host_GetSubMetrics_Params subParams = { CUpti_Profiler_Host_GetSubMetrics_Params_STRUCT_SIZE };
+                    subParams.pHostObject = hostObject;
+                    subParams.metricType = metricTypes[t];
+                    subParams.pMetricName = baseName;
+                    CUPTI_API_CALL(cuptiProfilerHostGetSubMetrics(&subParams));
+                    for (size_t s = 0; s < subParams.numOfSubmetrics; ++s) {
+                        fprintf(stdout, "      .%s\n", subParams.ppSubMetrics[s]);
+                    }
+                }
+            }
+
+            CUpti_Profiler_Host_Deinitialize_Params hostDeinitParams = { CUpti_Profiler_Host_Deinitialize_Params_STRUCT_SIZE };
+            hostDeinitParams.pHostObject = hostObject;
+            CUPTI_API_CALL(cuptiProfilerHostDeinitialize(&hostDeinitParams));
+        }
+        #endif//TRACY_CUDA_ENABLE_KERNEL_METRICS
+
        static void BeginInstrumentation(CUDACtx* profilerHost) {
            auto& currentProfilerHost = PersistentState::Get().profilerHost;
            if (currentProfilerHost != nullptr) {
@@ -1274,6 +1568,8 @@ namespace tracy
            // CUDA API calls and device activities that happens past this point
            cudaDeviceSynchronize();

+            DumpAvailableMetrics();
+
            auto& subscriber = PersistentState::Get().subscriber;
            CUPTI_API_CALL(cuptiSubscribe(&subscriber, CUPTI::OnCallbackAPI, profilerHost));
            CUPTI_API_CALL(cuptiActivityRegisterCallbacks(CUPTI::OnBufferRequested, CUPTI::OnBufferCompleted));
@@ -1399,6 +1695,25 @@ namespace tracy
            CUpti_SubscriberHandle subscriber = {};
            CUDACtx* profilerHost = nullptr;

+            #if TRACY_CUDA_ENABLE_KERNEL_METRICS
+            // CUPTI Range Profiler object, created in BeginKernelMetrics() and
+            // destroyed in EndKernelMetrics(). nullptr when metrics are inactive.
+            CUpti_RangeProfiler_Object* rangeProfiler = nullptr;
+            // Host-side profiler object: builds metric config images from a
+            // metric list and decodes counter data into metric values. It is
+            // chip-specific, hence the cached chip name and counter availability
+            // snapshot it is initialized from.
+            CUpti_Profiler_Host_Object* profilerHostObject = nullptr;
+            std::string chipName;
+            std::vector<uint8_t> counterAvailabilityImage;
+            // Config image: which counters to collect (baked from the metric
+            // list). Counter-data image: where the Range Profiler writes the
+            // collected values. Both must outlive the active profiling session,
+            // as cuptiRangeProfilerSetConfig() retains pointers into them.
+            std::vector<uint8_t> configImage;
+            std::vector<uint8_t> counterDataImage;
+            #endif//TRACY_CUDA_ENABLE_KERNEL_METRICS
+
            Collector collector;

            static PersistentState& Get() {
@@ -1491,6 +1806,10 @@ namespace tracy
 #define TracyCUDAStartProfiling(ctx) ctx->StartProfiling()
 #define TracyCUDAStopProfiling(ctx) ctx->StopProfiling()

+#define TracyCUDAEnableKernelMetrics(ctx) ctx->EnableKernelMetrics()
+#define TracyCUDADisableKernelMetrics(ctx) ctx->DisableKernelMetrics()
+#define TracyCUDADumpKernelMetrics(ctx) ctx->DumpKernelMetrics()
+
 #define TracyCUDACollect(ctx) ctx->Collect()

 #endif