/* * Copyright (C) 2016 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // Note: The overhead of SYSTRACE_TAG_JOBSYSTEM is not negligible especially with parallel_for(). #ifndef SYSTRACE_TAG //#define SYSTRACE_TAG SYSTRACE_TAG_JOBSYSTEM #define SYSTRACE_TAG SYSTRACE_TAG_NEVER #endif // when SYSTRACE_TAG_JOBSYSTEM is used, enables even heavier systraces #define HEAVY_SYSTRACE 0 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(WIN32) # define NOMINMAX # include # include # else # include #endif #ifdef __ANDROID__ // see https://developer.android.com/topic/performance/threads#priority # include # include # ifndef ANDROID_PRIORITY_URGENT_DISPLAY # define ANDROID_PRIORITY_URGENT_DISPLAY (-8) # endif # ifndef ANDROID_PRIORITY_DISPLAY # define ANDROID_PRIORITY_DISPLAY (-4) # endif # ifndef ANDROID_PRIORITY_NORMAL # define ANDROID_PRIORITY_NORMAL (0) # endif # ifndef ANDROID_PRIORITY_BACKGROUND # define ANDROID_PRIORITY_BACKGROUND (10) # endif #elif defined(__linux__) // There is no glibc wrapper for gettid on linux so we need to syscall it. # include # include # define gettid() syscall(SYS_gettid) #endif #if HEAVY_SYSTRACE # define HEAVY_SYSTRACE_CALL() SYSTRACE_CALL() # define HEAVY_SYSTRACE_NAME(name) SYSTRACE_NAME(name) # define HEAVY_SYSTRACE_VALUE32(name, v) SYSTRACE_VALUE32(name, v) #else # define HEAVY_SYSTRACE_CALL() # define HEAVY_SYSTRACE_NAME(name) # define HEAVY_SYSTRACE_VALUE32(name, v) #endif namespace utils { void JobSystem::setThreadName(const char* name) noexcept { #if defined(__linux__) pthread_setname_np(pthread_self(), name); #elif defined(__APPLE__) pthread_setname_np(name); #elif defined(WIN32) std::string_view u8name(name); size_t size = MultiByteToWideChar(CP_UTF8, 0, u8name.data(), u8name.size(), nullptr, 0); std::wstring u16name; u16name.resize(size); MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, u8name.data(), u8name.size(), u16name.data(), u16name.size()); SetThreadDescription(GetCurrentThread(), u16name.data()); #endif } void JobSystem::setThreadPriority(Priority priority) noexcept { #ifdef __ANDROID__ int androidPriority = 0; switch (priority) { case Priority::BACKGROUND: androidPriority = ANDROID_PRIORITY_BACKGROUND; break; case Priority::NORMAL: androidPriority = ANDROID_PRIORITY_NORMAL; break; case Priority::DISPLAY: androidPriority = ANDROID_PRIORITY_DISPLAY; break; case Priority::URGENT_DISPLAY: androidPriority = ANDROID_PRIORITY_URGENT_DISPLAY; break; } errno = 0; UTILS_UNUSED_IN_RELEASE int error; error = setpriority(PRIO_PROCESS, 0, androidPriority); #ifndef NDEBUG if (UTILS_UNLIKELY(error)) { slog.w << "setpriority failed: " << strerror(errno) << io::endl; } #endif #elif defined(__APPLE__) qos_class_t qosClass = QOS_CLASS_DEFAULT; switch (priority) { case Priority::BACKGROUND: qosClass = QOS_CLASS_BACKGROUND; break; case Priority::NORMAL: qosClass = QOS_CLASS_DEFAULT; break; case Priority::DISPLAY: qosClass = QOS_CLASS_USER_INTERACTIVE; break; case Priority::URGENT_DISPLAY: qosClass = QOS_CLASS_USER_INTERACTIVE; break; } errno = 0; UTILS_UNUSED_IN_RELEASE int error; error = pthread_set_qos_class_self_np(qosClass, 0); #ifndef NDEBUG if (UTILS_UNLIKELY(error)) { slog.w << "pthread_set_qos_class_self_np failed: " << strerror(errno) << io::endl; } #endif #endif } void JobSystem::setThreadAffinityById(size_t id) noexcept { #if defined(__linux__) cpu_set_t set; CPU_ZERO(&set); CPU_SET(id, &set); sched_setaffinity(gettid(), sizeof(set), &set); #endif } JobSystem::JobSystem(const size_t userThreadCount, const size_t adoptableThreadsCount) noexcept : mJobPool("JobSystem Job pool", MAX_JOB_COUNT * sizeof(Job)), mJobStorageBase(static_cast(mJobPool.getAllocator().getCurrent())) { SYSTRACE_ENABLE(); unsigned int threadPoolCount = userThreadCount; if (threadPoolCount == 0) { // default value, system dependant unsigned int hwThreads = std::thread::hardware_concurrency(); if (UTILS_HAS_HYPER_THREADING) { // For now we avoid using HT, this simplifies profiling. // TODO: figure-out what to do with Hyper-threading // since we assumed HT, always round-up to an even number of cores (to play it safe) hwThreads = (hwThreads + 1) / 2; } // one of the thread will be the user thread threadPoolCount = hwThreads - 1; } // make sure we have at least one thread in the thread pool threadPoolCount = std::max(1u, threadPoolCount); // and also limit the pool to 32 threads threadPoolCount = std::min(UTILS_HAS_THREADING ? 32u : 0u, threadPoolCount); mThreadStates = aligned_vector(threadPoolCount + adoptableThreadsCount); mThreadCount = uint16_t(threadPoolCount); mParallelSplitCount = (uint8_t)std::ceil((std::log2f(threadPoolCount + adoptableThreadsCount))); static_assert(std::atomic::is_always_lock_free); static_assert(std::atomic::is_always_lock_free); std::random_device rd; const size_t hardwareThreadCount = mThreadCount; auto& states = mThreadStates; #pragma nounroll for (size_t i = 0, n = states.size(); i < n; i++) { auto& state = states[i]; state.rndGen = default_random_engine(rd()); state.js = this; if (i < hardwareThreadCount) { // don't start a thread of adoptable thread slots state.thread = std::thread(&JobSystem::loop, this, &state); } } } JobSystem::~JobSystem() { requestExit(); #pragma nounroll for (auto &state : mThreadStates) { // adopted threads are not joinable if (state.thread.joinable()) { state.thread.join(); } } } inline void JobSystem::incRef(Job const* job) noexcept { // no action is taken when incrementing the reference counter, therefore we can safely use // memory_order_relaxed. job->refCount.fetch_add(1, std::memory_order_relaxed); } UTILS_NOINLINE void JobSystem::decRef(Job const* job) noexcept { // We must ensure that accesses from other threads happen before deleting the Job. // To accomplish this, we need to guarantee that no read/writes are reordered after the // dec-ref, because ANOTHER thread could hold the last reference (after us) and that thread // needs to see all accesses completed before it deletes the object. This is done // with memory_order_release. // Similarly, we need to guarantee that no read/write are reordered before the last decref, // or some other thread could see a destroyed object before the ref-count is 0. This is done // with memory_order_acquire. auto c = job->refCount.fetch_sub(1, std::memory_order_acq_rel); assert(c > 0); if (c == 1) { // This was the last reference, it's safe to destroy the job. mJobPool.destroy(job); } } void JobSystem::requestExit() noexcept { mExitRequested.store(true); std::lock_guard const lock(mWaiterLock); mWaiterCondition.notify_all(); } inline bool JobSystem::exitRequested() const noexcept { // memory_order_relaxed is safe because the only action taken is to exit the thread return mExitRequested.load(std::memory_order_relaxed); } inline bool JobSystem::hasActiveJobs() const noexcept { return mActiveJobs.load(std::memory_order_relaxed) > 0; } inline bool JobSystem::hasJobCompleted(Job const* job) noexcept { return (job->runningJobCount.load(std::memory_order_acquire) & JOB_COUNT_MASK) == 0; } inline void JobSystem::wait(std::unique_lock& lock) noexcept { HEAVY_SYSTRACE_CALL(); mWaiterCondition.wait(lock); } inline uint32_t JobSystem::wait(std::unique_lock& lock, Job* const job) noexcept { HEAVY_SYSTRACE_CALL(); // signal we are waiting if (hasActiveJobs() || exitRequested()) { return job->runningJobCount.load(std::memory_order_acquire); } uint32_t runningJobCount = job->runningJobCount.fetch_add(1 << WAITER_COUNT_SHIFT, std::memory_order_relaxed); if (runningJobCount & JOB_COUNT_MASK) { mWaiterCondition.wait(lock); } runningJobCount = job->runningJobCount.fetch_sub(1 << WAITER_COUNT_SHIFT, std::memory_order_acquire); assert_invariant((runningJobCount >> WAITER_COUNT_SHIFT) >= 1); return runningJobCount; } UTILS_NOINLINE void JobSystem::wakeAll() noexcept { // wakeAll() is called when a job finishes (to wake up any thread that might be waiting on it) SYSTRACE_CALL(); mWaiterLock.lock(); // this empty critical section is needed -- it guarantees that notify_all() happens // either before the condition is checked, or after the condition variable sleeps. mWaiterLock.unlock(); // notify_all() can be pretty slow, and it doesn't need to be inside the lock. mWaiterCondition.notify_all(); } void JobSystem::wakeOne() noexcept { // wakeOne() is called when a new job is added to a queue HEAVY_SYSTRACE_CALL(); mWaiterLock.lock(); // this empty critical section is needed -- it guarantees that notify_one() happens // either before the condition is checked, or after the condition variable sleeps. mWaiterLock.unlock(); // notify_one() can be pretty slow, and it doesn't need to be inside the lock. mWaiterCondition.notify_one(); } inline JobSystem::ThreadState& JobSystem::getState() noexcept { std::lock_guard const lock(mThreadMapLock); auto iter = mThreadMap.find(std::this_thread::get_id()); FILAMENT_CHECK_PRECONDITION(iter != mThreadMap.end()) << "This thread has not been adopted."; return *iter->second; } JobSystem::Job* JobSystem::allocateJob() noexcept { return mJobPool.make(); } void JobSystem::put(WorkQueue& workQueue, Job* job) noexcept { assert(job); size_t const index = job - mJobStorageBase; assert(index >= 0 && index < MAX_JOB_COUNT); // put the job into the queue workQueue.push(uint16_t(index + 1)); // increase our active job count (the order in which we're doing this must not matter // because we're not using std::memory_order_seq_cst (here or in WorkQueue::push()). mActiveJobs.fetch_add(1, std::memory_order_relaxed); // Note: it's absolutely possible for mActiveJobs to be 0 here, because the job could have // been handled by a zealous worker already. In that case we could avoid calling wakeOne(), // but that is not the common case. wakeOne(); } JobSystem::Job* JobSystem::pop(WorkQueue& workQueue) noexcept { size_t const index = workQueue.pop(); assert(index <= MAX_JOB_COUNT); Job* const job = !index ? nullptr : &mJobStorageBase[index - 1]; if (UTILS_LIKELY(job)) { mActiveJobs.fetch_sub(1, std::memory_order_relaxed); } return job; } JobSystem::Job* JobSystem::steal(WorkQueue& workQueue) noexcept { size_t const index = workQueue.steal(); assert_invariant(index <= MAX_JOB_COUNT); Job* const job = !index ? nullptr : &mJobStorageBase[index - 1]; if (UTILS_LIKELY(job)) { mActiveJobs.fetch_sub(1, std::memory_order_relaxed); } return job; } inline JobSystem::ThreadState* JobSystem::getStateToStealFrom(ThreadState& state) noexcept { auto& threadStates = mThreadStates; // memory_order_relaxed is okay because we don't take any action that has data dependency // on this value (in particular mThreadStates, is always initialized properly). uint16_t const adopted = mAdoptedThreads.load(std::memory_order_relaxed); uint16_t const threadCount = mThreadCount + adopted; ThreadState* stateToStealFrom = nullptr; // don't try to steal from someone else if we're the only thread (infinite loop) if (threadCount >= 2) { do { // This is biased, but frankly, we don't care. It's fast. uint16_t const index = uint16_t(state.rndGen() % threadCount); assert(index < threadStates.size()); stateToStealFrom = &threadStates[index]; // don't steal from our own queue } while (stateToStealFrom == &state); } return stateToStealFrom; } JobSystem::Job* JobSystem::steal(ThreadState& state) noexcept { HEAVY_SYSTRACE_CALL(); Job* job = nullptr; do { ThreadState* const stateToStealFrom = getStateToStealFrom(state); if (stateToStealFrom) { job = steal(stateToStealFrom->workQueue); } // nullptr -> nothing to steal in that queue either, if there are active jobs, // continue to try stealing one. } while (!job && hasActiveJobs()); return job; } bool JobSystem::execute(ThreadState& state) noexcept { HEAVY_SYSTRACE_CALL(); Job* job = pop(state.workQueue); // It is beneficial for some benchmarks to poll on steal() for a bit, because going back to // sleep and waking up is pretty expensive. However, it is unclear it helps in practice with // larger jobs or when parallel_for is used. constexpr size_t const STEAL_TRY_COUNT = 1; for (size_t i = 0; UTILS_UNLIKELY(!job && i < STEAL_TRY_COUNT); i++) { // our queue is empty, try to steal a job job = steal(state); } if (UTILS_LIKELY(job)) { assert((job->runningJobCount.load(std::memory_order_relaxed) & JOB_COUNT_MASK) >= 1); if (UTILS_LIKELY(job->function)) { HEAVY_SYSTRACE_NAME("job->function"); job->id = std::distance(mThreadStates.data(), &state); job->function(job->storage, *this, job); job->id = invalidThreadId; } finish(job); } return job != nullptr; } void JobSystem::loop(ThreadState* state) noexcept { setThreadName("JobSystem::loop"); setThreadPriority(Priority::DISPLAY); // record our work queue std::unique_lock lock(mThreadMapLock); bool const inserted = mThreadMap.emplace(std::this_thread::get_id(), state).second; lock.unlock(); FILAMENT_CHECK_PRECONDITION(inserted) << "This thread is already in a loop."; // run our main loop... do { if (!execute(*state)) { std::unique_lock lock(mWaiterLock); while (!exitRequested() && !hasActiveJobs()) { wait(lock); } } } while (!exitRequested()); } UTILS_NOINLINE void JobSystem::finish(Job* job) noexcept { HEAVY_SYSTRACE_CALL(); bool notify = false; // terminate this job and notify its parent Job* const storage = mJobStorageBase; do { // std::memory_order_release here is needed to synchronize with JobSystem::wait() // which needs to "see" all changes that happened before the job terminated. uint32_t const v = job->runningJobCount.fetch_sub(1, std::memory_order_acq_rel); uint32_t const runningJobCount = v & JOB_COUNT_MASK; assert(runningJobCount > 0); if (runningJobCount == 1) { // no more work, destroy this job and notify its parent uint32_t const waiters = v >> WAITER_COUNT_SHIFT; if (waiters) { notify = true; } Job* const parent = job->parent == 0x7FFF ? nullptr : &storage[job->parent]; decRef(job); job = parent; } else { // there is still work (e.g.: children), we're done. break; } } while (job); // wake-up all threads that could potentially be waiting on this job finishing if (UTILS_UNLIKELY(notify)) { // but avoid calling notify_all() at all cost, because it's always expensive wakeAll(); } } // ----------------------------------------------------------------------------------------------- // public API... JobSystem::Job* JobSystem::create(Job* parent, JobFunc func) noexcept { parent = (parent == nullptr) ? mRootJob : parent; Job* const job = allocateJob(); if (UTILS_LIKELY(job)) { size_t index = 0x7FFF; if (parent) { // add a reference to the parent to make sure it can't be terminated. // memory_order_relaxed is safe because no action is taken at this point // (the job is not started yet). UTILS_UNUSED_IN_RELEASE auto const parentJobCount = parent->runningJobCount.fetch_add(1, std::memory_order_relaxed); // can't create a child job of a terminated parent assert((parentJobCount & JOB_COUNT_MASK) > 0); index = parent - mJobStorageBase; assert(index < MAX_JOB_COUNT); } job->function = func; job->parent = uint16_t(index); } return job; } void JobSystem::cancel(Job*& job) noexcept { finish(job); job = nullptr; } JobSystem::Job* JobSystem::retain(Job* job) noexcept { Job* retained = job; incRef(retained); return retained; } void JobSystem::release(Job*& job) noexcept { decRef(job); job = nullptr; } void JobSystem::run(Job*& job) noexcept { HEAVY_SYSTRACE_CALL(); ThreadState& state(getState()); put(state.workQueue, job); // after run() returns, the job is virtually invalid (it'll die on its own) job = nullptr; } void JobSystem::run(Job*& job, uint8_t id) noexcept { HEAVY_SYSTRACE_CALL(); ThreadState& state = mThreadStates[id]; assert_invariant(&state == &getState()); put(state.workQueue, job); // after run() returns, the job is virtually invalid (it'll die on its own) job = nullptr; } JobSystem::Job* JobSystem::runAndRetain(Job* job) noexcept { Job* retained = retain(job); run(job); return retained; } void JobSystem::waitAndRelease(Job*& job) noexcept { SYSTRACE_CALL(); assert(job); assert(job->refCount.load(std::memory_order_relaxed) >= 1); ThreadState& state(getState()); do { if (UTILS_UNLIKELY(!execute(state))) { // test if job has completed first, to possibly avoid taking the lock if (hasJobCompleted(job)) { break; } // the only way we can be here is if the job we're waiting on it being handled // by another thread: // - we returned from execute() which means all queues are empty // - yet our job hasn't completed yet // ergo, it's being run in another thread // // this could take time however, so we will wait with a condition, and // continue to handle more jobs, as they get added. std::unique_lock lock(mWaiterLock); uint32_t const runningJobCount = wait(lock, job); // we could be waking up because either: // - the job we're waiting on has completed // - more jobs where added to the JobSystem // - we're asked to exit if ((runningJobCount & JOB_COUNT_MASK) == 0 || exitRequested()) { break; } // if we get here, it means that // - the job we're waiting on is still running, and // - we're not asked to exit, and // - there were some active jobs // So we try to handle one. continue; } } while (UTILS_LIKELY(!hasJobCompleted(job) && !exitRequested())); if (job == mRootJob) { mRootJob = nullptr; } release(job); } void JobSystem::runAndWait(Job*& job) noexcept { SYSTRACE_CALL(); runAndRetain(job); waitAndRelease(job); } void JobSystem::adopt() { const auto tid = std::this_thread::get_id(); std::unique_lock lock(mThreadMapLock); auto iter = mThreadMap.find(tid); ThreadState* const state = iter == mThreadMap.end() ? nullptr : iter->second; lock.unlock(); if (state) { // we're already part of a JobSystem, do nothing. FILAMENT_CHECK_PRECONDITION(this == state->js) << "Called adopt on a thread owned by another JobSystem (" << state->js << "), this=" << this << "!"; return; } // memory_order_relaxed is safe because we don't take action on this value. uint16_t const adopted = mAdoptedThreads.fetch_add(1, std::memory_order_relaxed); size_t const index = mThreadCount + adopted; FILAMENT_CHECK_POSTCONDITION(index < mThreadStates.size()) << "Too many calls to adopt(). No more adoptable threads!"; // all threads adopted by the JobSystem need to run at the same priority setThreadPriority(Priority::DISPLAY); // This thread's queue will be selectable immediately (i.e.: before we set its TLS) // however, it's not a problem since mThreadState is pre-initialized and valid // (e.g.: the queue is empty). lock.lock(); mThreadMap[tid] = &mThreadStates[index]; } void JobSystem::emancipate() { const auto tid = std::this_thread::get_id(); std::unique_lock const lock(mThreadMapLock); auto iter = mThreadMap.find(tid); ThreadState* const state = iter == mThreadMap.end() ? nullptr : iter->second; FILAMENT_CHECK_PRECONDITION(state) << "this thread is not an adopted thread"; FILAMENT_CHECK_PRECONDITION(state->js == this) << "this thread is not adopted by us"; mThreadMap.erase(iter); } io::ostream& operator<<(io::ostream& out, JobSystem const& js) { for (auto const& item : js.mThreadStates) { size_t const id = std::distance(js.mThreadStates.data(), &item); out << id << ": " << item.workQueue.getCount() << io::endl; } return out; } } // namespace utils