From 64797dc735e4fd8dd57d8b9dd6dd68a51dced299 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Fri, 5 Jun 2026 11:00:37 -0700 Subject: [PATCH] adding basic CUDA graph example --- examples/cuda/README.md | 0 examples/cuda/graph/CMakeLists.txt | 39 +++++++ examples/cuda/graph/build.sh | 11 ++ examples/cuda/graph/cuda-graph-demo.cu | 146 +++++++++++++++++++++++++ 4 files changed, 196 insertions(+) create mode 100644 examples/cuda/README.md create mode 100644 examples/cuda/graph/CMakeLists.txt create mode 100644 examples/cuda/graph/build.sh create mode 100644 examples/cuda/graph/cuda-graph-demo.cu diff --git a/examples/cuda/README.md b/examples/cuda/README.md new file mode 100644 index 00000000..e69de29b diff --git a/examples/cuda/graph/CMakeLists.txt b/examples/cuda/graph/CMakeLists.txt new file mode 100644 index 00000000..7a784aa5 --- /dev/null +++ b/examples/cuda/graph/CMakeLists.txt @@ -0,0 +1,39 @@ +cmake_minimum_required(VERSION 3.18) +project(CUDAGraphDemo LANGUAGES CXX CUDA) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CUDA_STANDARD 17) + +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24") + set(CMAKE_CUDA_ARCHITECTURES native) +endif() + +set(TRACY_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../.." + CACHE PATH "Root of the Tracy repository") +set(TRACY_PUBLIC "${TRACY_PATH}/public") + +find_package(CUDAToolkit REQUIRED) +find_package(Threads REQUIRED) + +# cuda-graph-demo.cu embeds Tracy via #include (unity build), +# so no separate TracyClient library is needed — just expose the public headers. +add_executable(cuda-graph-demo cuda-graph-demo.cu) +target_include_directories(cuda-graph-demo PRIVATE ${TRACY_PUBLIC}) +target_link_libraries(cuda-graph-demo PRIVATE + CUDA::cupti CUDA::cuda_driver Threads::Threads ${CMAKE_DL_LIBS}) + +# ctest-related integration below +# to run the binaries via ctest: +# ctest --test-dir -R -C + +enable_testing() +add_test(NAME cuda-graph-demo COMMAND cuda-graph-demo) + +# On Windows, CUPTI's DLL must be on PATH at runtime. +if(WIN32) + set(_cupti_dir "$") + set_target_properties(cuda-graph-demo PROPERTIES + VS_DEBUGGER_ENVIRONMENT "PATH=${_cupti_dir};$ENV{PATH}") + set_tests_properties(cuda-graph-demo PROPERTIES + ENVIRONMENT "PATH=${_cupti_dir};$ENV{PATH}") +endif() diff --git a/examples/cuda/graph/build.sh b/examples/cuda/graph/build.sh new file mode 100644 index 00000000..b372bd88 --- /dev/null +++ b/examples/cuda/graph/build.sh @@ -0,0 +1,11 @@ +TRACY_PATH= +CUDA_TOOLKIT_PATH=/usr/local/cuda +CUDA_CUPTI_PATH=${CUDA_TOOLKIT_PATH}/extras/CUPTI + +# pass -v to nvcc for verbose build information +nvcc -O2 -std=c++17 cuda-graph-demo.cu \ + -o cuda-graph-demo \ + -I "${TRACY_PATH}/public" \ + -I "${CUDA_CUPTI_PATH}/include" -I "${CUDA_TOOLKIT_PATH}/include" \ + -L "${CUDA_CUPTI_PATH}/lib64" -L "${CUDA_TOOLKIT_PATH}/lib64" \ + -lcupti -lcuda diff --git a/examples/cuda/graph/cuda-graph-demo.cu b/examples/cuda/graph/cuda-graph-demo.cu new file mode 100644 index 00000000..96678a9f --- /dev/null +++ b/examples/cuda/graph/cuda-graph-demo.cu @@ -0,0 +1,146 @@ +#include + +// WARN: for simplicity, we enable and "embed" the Tracy client directly into the code +#define TRACY_ENABLE +#include + +#include +#include + +#include +#include +#include + +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err__ = (call); \ + if (err__ != cudaSuccess) { \ + std::fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + cudaGetErrorName(err__), __FILE__, __LINE__, \ + cudaGetErrorString(err__)); \ + std::exit(EXIT_FAILURE); \ + } \ + } while (0) + +__global__ void saxpy(float a, const float* x, float* y, int n) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) y[i] = a * x[i] + y[i]; +} + +int main() +{ + // CUPTI-backed Tracy context. Auto-captures all CUDA activity from the + // point StartProfiling() is called until StopProfiling(). The background + // collector thread flushes activity into Tracy; the explicit Collect() + // calls below just force a flush at known phase boundaries. + auto* cudaCtx = TracyCUDAContext(); + { + constexpr char ctxName[] = "CUDA Graph Demo"; + TracyCUDAContextName(cudaCtx, ctxName, sizeof(ctxName) - 1); + } + TracyCUDAStartProfiling(cudaCtx); + + constexpr int N = 1 << 16; // small N => kernel is short => launch overhead dominates + constexpr int KERNELS_PER_GRAPH = 32; // chain length captured into the graph + constexpr int OUTER_ITERS = 2000; // how many times we replay the chain + + // allocate device buffers + float *dX = nullptr, *dY = nullptr; + CUDA_CHECK(cudaMalloc(&dX, N * sizeof(float))); + CUDA_CHECK(cudaMalloc(&dY, N * sizeof(float))); + + std::vector hX(N, 1.0f); + CUDA_CHECK(cudaMemcpy(dX, hX.data(), N * sizeof(float), cudaMemcpyHostToDevice)); + + cudaStream_t stream = nullptr; + CUDA_CHECK(cudaStreamCreate(&stream)); + + const dim3 block(256); + const dim3 grid((N + block.x - 1) / block.x); + + cudaEvent_t evStart, evStop; + CUDA_CHECK(cudaEventCreate(&evStart)); + CUDA_CHECK(cudaEventCreate(&evStop)); + + // warm-up (so first-launch lazy-init and/or JIT doesn't bias the measurement) + saxpy<<>>(0.0f, dX, dY, N); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // baseline: launch each kernel directly on the stream + float msStream = 0.0f; + { + ZoneScopedN("stream-launches"); + CUDA_CHECK(cudaMemsetAsync(dY, 0, N * sizeof(float), stream)); + CUDA_CHECK(cudaEventRecord(evStart, stream)); + for (int outer = 0; outer < OUTER_ITERS; ++outer) { + for (int k = 0; k < KERNELS_PER_GRAPH; ++k) { + saxpy<<>>(1.0e-6f, dX, dY, N); + } + } + CUDA_CHECK(cudaEventRecord(evStop, stream)); + CUDA_CHECK(cudaEventSynchronize(evStop)); + CUDA_CHECK(cudaEventElapsedTime(&msStream, evStart, evStop)); + TracyCUDACollect(cudaCtx); + } + + // capture: record the same kernel chain into a graph + cudaGraph_t graph = nullptr; + cudaGraphExec_t graphExec = nullptr; + { + ZoneScopedN("graph-capture"); + // cudaStreamCaptureModeRelaxed allows the calling thread to perform + // unrelated CUDA work during capture; ThreadLocal is stricter if you need + // isolation. Most short, single-stream captures work fine in either mode. + CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed)); + for (int k = 0; k < KERNELS_PER_GRAPH; ++k) { + saxpy<<>>(1.0e-6f, dX, dY, N); + } + CUDA_CHECK(cudaStreamEndCapture(stream, &graph)); + + // Instantiate once -> reusable executable graph. + CUDA_CHECK(cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + + // The template graph isn't needed once instantiated. + CUDA_CHECK(cudaGraphDestroy(graph)); + } + + // replay: launch the instantiated graph OUTER_ITERS times + float msGraph = 0.0f; + { + ZoneScopedN("graph-launches"); + CUDA_CHECK(cudaMemsetAsync(dY, 0, N * sizeof(float), stream)); + CUDA_CHECK(cudaEventRecord(evStart, stream)); + for (int outer = 0; outer < OUTER_ITERS; ++outer) { + CUDA_CHECK(cudaGraphLaunch(graphExec, stream)); + } + CUDA_CHECK(cudaEventRecord(evStop, stream)); + CUDA_CHECK(cudaEventSynchronize(evStop)); + CUDA_CHECK(cudaEventElapsedTime(&msGraph, evStart, evStop)); + TracyCUDACollect(cudaCtx); + } + + // sanity check: y[i] = OUTER_ITERS * KERNELS_PER_GRAPH * 1e-6 * x[i] + std::vector hY(N); + CUDA_CHECK(cudaMemcpy(hY.data(), dY, N * sizeof(float), cudaMemcpyDeviceToHost)); + const float expected = float(OUTER_ITERS) * float(KERNELS_PER_GRAPH) * 1.0e-6f; + + std::printf("Stream launches: %8.3f ms (%d kernels)\n", + msStream, OUTER_ITERS * KERNELS_PER_GRAPH); + std::printf("Graph launches: %8.3f ms (%d graph launches x %d kernels)\n", + msGraph, OUTER_ITERS, KERNELS_PER_GRAPH); + std::printf("Speedup : %8.2fx\n", msStream / msGraph); + std::printf("hY[0] = %.6e (expected %.6e)\n", hY[0], expected); + + // shutdown + CUDA_CHECK(cudaGraphExecDestroy(graphExec)); + CUDA_CHECK(cudaEventDestroy(evStart)); + CUDA_CHECK(cudaEventDestroy(evStop)); + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaFree(dX)); + CUDA_CHECK(cudaFree(dY)); + + TracyCUDAStopProfiling(cudaCtx); + TracyCUDAContextDestroy(cudaCtx); + return 0; +}