Use atomic flag for retirement check to avoid mutex on hot path

Replace the mutex-guarded empty check in OnBufferCompleted with a
std::atomic<bool> dirty flag. The flag is set when a graph exec is
queued for retirement and cleared once the pending set drains, so the
mutex is acquired only when there is actual retirement work to do.
Also update the stale comment on cudaGraphCurrentLaunch that said
"let them leak".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Basil Milanich
2026-04-13 10:21:07 -05:00
parent 8f6249c12f
commit 58809f95ff

View File

@@ -686,12 +686,8 @@ namespace tracy
// Check if any graph execs are pending retirement before entering
// the record loop — avoids the graphIdsSeenInBuffer heap allocation
// on the hot path when no execs have been destroyed.
bool trackRetirement;
{
auto& state = PersistentState::Get();
std::lock_guard<std::mutex> lock(state.graphRetireMutex);
trackRetirement = !state.graphExecPendingRetire.empty();
}
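// Uses an atomic flag so we skip the mutex entirely in the common case.
// The unlocked load is racy by design: the flag is set whenever an exec is
// queued and only cleared once the pending set is empty, so an exec queued
// just after this load is picked up by a later call instead of being lost.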
bool trackRetirement = PersistentState::Get().graphRetirePending.load(std::memory_order_acquire);
CUptiResult status;
CUpti_Activity* record = nullptr;
std::unordered_set<GraphID> graphIdsSeenInBuffer;
@@ -729,6 +725,9 @@ namespace tracy
++it;
}
}
if (state.graphExecPendingRetire.empty()) {
state.graphRetirePending.store(false, std::memory_order_release);
}
}
PersistentState::Get().profilerHost->OnEventsProcessed();
@@ -944,6 +943,7 @@ namespace tracy
auto& state = PersistentState::Get();
std::lock_guard<std::mutex> lock(state.graphRetireMutex);
state.graphExecPendingRetire.insert(retireGraphId);
state.graphRetirePending.store(true, std::memory_order_release);
}
}
}
@@ -1376,11 +1376,16 @@ namespace tracy
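// NOTE(marcos): these objects do not need to persist, but their relative
// footprint is trivial enough that we don't care if we let them leak
// (the exception is cudaGraphCurrentLaunch, whose entries are retired;
// see the comment on that member).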
ConcurrentHashMap<CorrelationID, APICallInfo> cudaCallSiteInfo;
// Graph launch cache: entries are retired via graphExecPendingRetire
// when the corresponding cudaGraphExec is destroyed.
ConcurrentHashMap<GraphID, APICallInfo> cudaGraphCurrentLaunch;
ConcurrentHashMap<uintptr_t, int> memAllocAddress;
// Pending retirement: graphIds whose exec handles have been destroyed.
// Entries are erased from cudaGraphCurrentLaunch in OnBufferCompleted
// once no further activity records for the exec arrive in a buffer.
// graphRetireMutex guards graphExecPendingRetire. graphRetirePending is
// an atomic dirty flag mirroring "the pending set is non-empty": it is
// set on insert and cleared when the set drains, so OnBufferCompleted can
// skip the mutex on the hot path when no execs have been destroyed.
std::atomic<bool> graphRetirePending{false};
std::mutex graphRetireMutex;
std::unordered_set<GraphID> graphExecPendingRetire;
CUpti_SubscriberHandle subscriber = {};