diff --git a/platforms/posix/include/windows_shim/pthread.h b/platforms/posix/include/windows_shim/pthread.h index 5eb1a6e9ac..74071408c3 100644 --- a/platforms/posix/include/windows_shim/pthread.h +++ b/platforms/posix/include/windows_shim/pthread.h @@ -178,6 +178,11 @@ int pthread_cond_signal(pthread_cond_t *cond); int pthread_cond_broadcast(pthread_cond_t *cond); /** @} */ +#if defined(_MSC_VER) && !defined(__clang__) +typedef void (*px4_pthread_cond_notify_callback_t)(pthread_cond_t *cond, int broadcast); +int px4_pthread_cond_set_notify_callback(px4_pthread_cond_notify_callback_t callback); +#endif + /** @name Thread lifecycle functions * * Wrap CreateThread/WaitForSingleObject/CloseHandle with pthread-compatible @@ -191,6 +196,8 @@ int pthread_detach(pthread_t thread); void pthread_exit(void *value_ptr); pthread_t pthread_self(void); int pthread_equal(pthread_t t1, pthread_t t2); +int pthread_getschedparam(pthread_t thread, int *policy, struct sched_param *param); +int pthread_setschedparam(pthread_t thread, int policy, const struct sched_param *param); int pthread_cancel(pthread_t thread); int pthread_kill(pthread_t thread, int sig); /** @} */ @@ -244,3 +251,17 @@ int pthread_getname_np(pthread_t thread, char *name, size_t len); #ifndef PTHREAD_STACK_MIN #define PTHREAD_STACK_MIN 16384 #endif + +#if (defined(__PX4_WINDOWS) || defined(_WIN32)) && !defined(_MSC_VER) && \ + (defined(ENABLE_LOCKSTEP_SCHEDULER) || defined(PX4_WINDOWS_PTHREAD_LOCKSTEP_BRIDGE)) +#ifdef __cplusplus +extern "C" { +#endif +int px4_lockstep_pthread_cond_signal(pthread_cond_t *cond); +int px4_lockstep_pthread_cond_broadcast(pthread_cond_t *cond); +#ifdef __cplusplus +} +#endif +#define pthread_cond_signal(cond_) px4_lockstep_pthread_cond_signal(cond_) +#define pthread_cond_broadcast(cond_) px4_lockstep_pthread_cond_broadcast(cond_) +#endif diff --git a/platforms/posix/include/windows_shim/time.h b/platforms/posix/include/windows_shim/time.h index 5f556c906b..4388b5a201 100644 --- 
a/platforms/posix/include/windows_shim/time.h +++ b/platforms/posix/include/windows_shim/time.h @@ -67,12 +67,14 @@ typedef int clockid_t; #ifndef CLOCK_MONOTONIC_COARSE #define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC #endif -#ifndef _TIMEVAL_DEFINED -#define _TIMEVAL_DEFINED -struct timeval { - long tv_sec; - long tv_usec; -}; +/* The MSVC SDK declares `struct timeval` only inside / + * , and does so unconditionally — no header guard. Pull it + * from there so any later -via- include doesn't + * trigger a "redefinition" (C2011). NOMINMAX / WIN32_LEAN_AND_MEAN are + * already in effect via the SITL compile flags, so the cost here is + * mostly the winsock typedefs. */ +#ifndef _WINSOCK2API_ +#include #endif #else #include_next diff --git a/platforms/posix/include/windows_shim/unistd.h b/platforms/posix/include/windows_shim/unistd.h index b5cf8c9b9c..60762a8642 100644 --- a/platforms/posix/include/windows_shim/unistd.h +++ b/platforms/posix/include/windows_shim/unistd.h @@ -46,6 +46,15 @@ #if defined(_MSC_VER) && !defined(__clang__) #include +#elif defined(_WIN32) +/* + * MinGW declares its own usleep() in . Pull the rest of that + * header through normally, but hide only that declaration so PX4 can provide + * the same high-resolution Windows implementation for system_usleep. + */ +#define usleep _px4_mingw_runtime_usleep +#include_next +#undef usleep #else #include_next #endif @@ -115,14 +124,258 @@ extern "C" { #endif +#if defined(_WIN32) +/* CREATE_WAITABLE_TIMER_HIGH_RESOLUTION (Windows 10 1803+; build 17134) + * may not be defined in older SDK headers - fall back to the literal + * value documented by Microsoft. Same for the manual-reset flag. 
*/ +#ifndef CREATE_WAITABLE_TIMER_MANUAL_RESET +#define CREATE_WAITABLE_TIMER_MANUAL_RESET 0x00000001 +#endif +#ifndef CREATE_WAITABLE_TIMER_HIGH_RESOLUTION +#define CREATE_WAITABLE_TIMER_HIGH_RESOLUTION 0x00000002 +#endif + #if defined(_MSC_VER) && !defined(__clang__) -/** @brief Sleep for at least @p usec microseconds using Windows Sleep(). */ +#define PX4_WINDOWS_SLEEP_TLS __declspec(thread) +#else +#define PX4_WINDOWS_SLEEP_TLS __thread +#endif + +/** + * Runtime-tuned thresholds that drive the spin-residual hybrid below. + * + * Defined and (optionally) auto-calibrated by + * px4_windows_calibrate_usleep_threshold() in + * platforms/posix/src/px4/windows/runtime/init.cpp. The calibration runs + * immediately after timeBeginPeriod(1), before any module thread starts + * calling usleep(), so the first usleep() the process performs already + * sees the tuned value. + * + * Override at process startup with the PX4_USLEEP_SPIN_US environment + * variable (accepted range [0, 50000] microseconds; values outside it are + * ignored with a warning). 0 effectively forces every wait > 0 us through the timer + + * spin-tail path. + * + * @c g_usleep_spin_tail_us is the *upper bound* of the QPC spin closing + * the residual after the high-resolution waitable timer wakes. The + * adaptive controller in usleep() shrinks the tail per-thread toward + * the observed timer overshoot via an EWMA, so a quiet host pays only + * ~p95 jitter of CPU spin per call instead of the worst-case bound. + */ +extern long g_usleep_pure_spin_us; +extern long g_usleep_spin_tail_us; + +/* Floor for the per-thread adaptive spin tail. Initialised by the + * calibration routine to the host-measured P95 waitable-timer jitter so + * the controller never trims the spin below the value we already know + * is needed to cover this host's observed long-tail wakes. Defaults to + * a conservative 700 us when calibration cannot run.
*/ +extern long g_usleep_adaptive_min_tail_us; + +/** + * @brief Sleep for at least @p usec microseconds with microsecond accuracy. + * + * Windows Sleep() is quantized to the system timer tick (~15.6 ms by + * default; 1 ms after timeBeginPeriod(1) in init.cpp). A 4 ms sleep + * therefore rounds up to a full HPET tick, throttling SITL sim time. + * + * The naive Sleep() path loses ~10 % of wall time. A pure HPET-backed + * waitable timer wakes within 0.3 - 0.7 ms of the target on a quiet + * system but quantizes to 1 ms under load, so a tight SITL producer + * (250 Hz - 1 kHz lockstep loop) accumulates 5 - 10 % drift. + * + * The current implementation is a spin-residual hybrid: + * + * - Requests <= @c g_usleep_pure_spin_us are held entirely on the QPC + * deadline. This covers SIH's normal lockstep wall-sleep cadence + * (200 Hz - 2 kHz, 500 - 5000 us). Even a single 0.5 - 1 ms + * scheduler-late wake in that band becomes visible as sim/wall + * drift, so the short simulation waits pay CPU for determinism. + * + * - For requests > @c g_usleep_pure_spin_us the bulk of the wait runs + * on a high-resolution waitable timer (CreateWaitableTimerExW + + * CREATE_WAITABLE_TIMER_HIGH_RESOLUTION, Windows 10 1803+). The + * timer is armed to wake @c g_usleep_spin_tail_us microseconds + * early and the residual is closed by a QueryPerformanceCounter + * busy-loop. This trades ~g_usleep_spin_tail_us of CPU per call for + * microsecond-scale accuracy against the absolute QPC target. + * + * The HANDLE is cached per-thread in compiler-native TLS so we pay one + * CreateWaitableTimerExW per thread for the lifetime of the process. + */ static inline int usleep(useconds_t usec) { - Sleep((DWORD)((usec + 999U) / 1000U)); + if (usec == 0) { + return 0; + } + + // Snapshot the tuned thresholds once per call. Reads of a naturally aligned + // long are atomic on x86_64; the calibration in init.cpp runs before + // any other thread starts, so no further synchronization is needed.
+ const long pure_spin_us = g_usleep_pure_spin_us; + const long spin_tail_us = g_usleep_spin_tail_us; + const long adaptive_floor_us = g_usleep_adaptive_min_tail_us; + + LARGE_INTEGER qpc_freq; + LARGE_INTEGER qpc_start; + QueryPerformanceFrequency(&qpc_freq); + QueryPerformanceCounter(&qpc_start); + + // Absolute QPC target = start + usec. The conversion uses 64-bit + // integer math throughout: at 10 MHz QPC and a 1-second sleep the + // product is 1e13, well within LONGLONG range. + const LONGLONG qpc_target = qpc_start.QuadPart + + ((LONGLONG)usec * qpc_freq.QuadPart) / 1000000LL; + + if ((LONGLONG)usec > (LONGLONG)pure_spin_us) { // 64-bit compare: long is 32-bit on Windows, so (long)usec would wrap negative for usec >= 2^31 and spin for the whole wait + // Use compiler-native TLS instead of C++ thread_local because this + // header is also included from .c translation units. + static PX4_WINDOWS_SLEEP_TLS HANDLE timer = NULL; + // Per-thread adaptive spin-tail state. We track the timer wake + // overshoot (how late WaitForSingleObject returned past the + // requested bulk deadline) as an EWMA in microseconds, then size + // the spin tail at (overshoot_ewma + small_margin), bounded by + // [adaptive_floor_us, spin_tail_us]. + // The EWMA is initialized with sentinel -1 so the first call + // uses the configured upper-bound tail; subsequent calls + // converge toward the host's actual jitter and trim the spin. + static PX4_WINDOWS_SLEEP_TLS long adaptive_tail_us = -1; + static PX4_WINDOWS_SLEEP_TLS long overshoot_ewma_us = -1; + + if (timer == NULL) { + timer = CreateWaitableTimerExW(NULL, NULL, + CREATE_WAITABLE_TIMER_HIGH_RESOLUTION + | CREATE_WAITABLE_TIMER_MANUAL_RESET, + TIMER_ALL_ACCESS); + + if (timer == NULL) { + // Older Windows: legacy manual-reset timer + // still honors timeBeginPeriod(1). + timer = CreateWaitableTimerW(NULL, TRUE, NULL); + } + } + + // Decide the spin tail for this call. First call (sentinel) - + // fall back to the configured upper bound so we definitely + // cover the deadline while we collect data. Subsequent calls + // use the EWMA-derived value.
+ long tail_us = (adaptive_tail_us < 0) ? spin_tail_us : adaptive_tail_us; + + // Floor at the host-measured P95 jitter (set by the calibration + // routine in init.cpp). Trimming below this would force the QPC + // spin to absorb wakes past the deadline, which directly bleeds + // into sim/wall ratio. + if (tail_us < adaptive_floor_us) { tail_us = adaptive_floor_us; } + + if (tail_us > spin_tail_us) { tail_us = spin_tail_us; } + + if (timer != NULL) { + LARGE_INTEGER due; + // Wake tail_us early and close the gap by spin. + // Negative due time = relative interval, 100 ns units. + // Clamp the bulk wait to >= 0 in case the caller asked + // for a value just above pure_spin_us with a larger + // spin_tail_us; the QPC spin still enforces the deadline. + const LONGLONG bulk_us = (LONGLONG)usec - (LONGLONG)tail_us; + const LONGLONG bulk_us_clamped = bulk_us > 0 ? bulk_us : 0; + due.QuadPart = -(bulk_us_clamped * 10); + + if (SetWaitableTimer(timer, &due, 0, NULL, NULL, FALSE)) { + // Use a millisecond timeout slightly longer than + // the requested sleep rather than INFINITE: a + // rare WaitForSingleObject misbehavior on Windows + // (observed under heavy SITL lockstep load) can + // otherwise hang the producer thread permanently. + // The QPC spin below still enforces the absolute + // deadline, so a premature wake is harmless. + const DWORD wait_ms_bulk = (DWORD)((bulk_us_clamped + 999LL) / 1000LL); + const DWORD wait_ms = wait_ms_bulk + 5U; // +5 ms safety margin + WaitForSingleObject(timer, wait_ms); + + // Adaptive update: measure how late we woke vs the + // requested bulk deadline (qpc_target - tail_us). + // Negative = woke early (good); positive = woke late + // and the spin tail had to absorb it. We track an + // upper-envelope EWMA: a late wake snaps the value up + // immediately, a stretch of clean wakes decays it down + // at 1/64 per call (time constant ~64 calls, ~256 ms at 250 Hz).
The plain + // mean would undersize the tail because the timer jitter + // distribution has a long upper tail and 5 % of waits + // can wake far past the mean - each such miss bleeds + // 100 - 1000 us into wall time and accumulates as + // sim/wall ratio drift. + LARGE_INTEGER wake_now; + QueryPerformanceCounter(&wake_now); + const LONGLONG bulk_target_qpc = qpc_target + - ((LONGLONG)tail_us * qpc_freq.QuadPart) / 1000000LL; + LONGLONG overshoot_qpc = wake_now.QuadPart - bulk_target_qpc; + if (overshoot_qpc < 0) { overshoot_qpc = 0; } + const long overshoot_us = + (long)((overshoot_qpc * 1000000LL) / qpc_freq.QuadPart); + + if (overshoot_ewma_us < 0) { + // First sample: seed at the configured upper + // bound so we don't undershoot before any + // data has been gathered. + overshoot_ewma_us = spin_tail_us; + } + + // Fast attack, slow decay. + if (overshoot_us > overshoot_ewma_us) { + overshoot_ewma_us = overshoot_us; + + } else { + overshoot_ewma_us = + (overshoot_ewma_us * 63 + overshoot_us + 32) / 64; + } + + // Size the next call's tail at envelope + 200 us + // margin. The margin covers the residual gap between + // the slow-decay envelope and the instantaneous + // worst-case wake jitter; the floor and upper-bound + // clamps keep the controller from collapsing or + // running away. + adaptive_tail_us = overshoot_ewma_us + 200; + + if (adaptive_tail_us < adaptive_floor_us) { + adaptive_tail_us = adaptive_floor_us; + } + + if (adaptive_tail_us > spin_tail_us) { + adaptive_tail_us = spin_tail_us; + } + + } else { + // Arming failed (very rare). Fall through to + // the QPC spin below; it will still hit the + // deadline, just with a brief CPU burn. + } + } else { + // No timer available at all: kernel Sleep() rounded + // up to the nearest millisecond. The spin tail below + // still corrects the residual. + const LONGLONG bulk_us = (LONGLONG)usec - (LONGLONG)tail_us; + const LONGLONG bulk_us_clamped = bulk_us > 0 ? 
bulk_us : 0; + Sleep((DWORD)((bulk_us_clamped + 999LL) / 1000LL)); + } + } + + // Close the residual against the absolute QPC target. For longer waits + // this is at most ~spin_tail_us of spin (often less because the waitable + // timer wakes slightly late). For SIH-sized waits it is the full request. + LARGE_INTEGER now; + + do { + YieldProcessor(); + QueryPerformanceCounter(&now); + } while (now.QuadPart < qpc_target); + return 0; } +#undef PX4_WINDOWS_SLEEP_TLS + +#if defined(_MSC_VER) && !defined(__clang__) /** @brief Sleep for at least @p seconds seconds using Windows Sleep(). */ static inline unsigned int sleep(unsigned int seconds) { @@ -130,6 +383,7 @@ static inline unsigned int sleep(unsigned int seconds) return 0; } #endif +#endif /* POSIX pipe(fd[2]) - default to 64 KiB buffer and binary mode. */ #ifndef _PX4_PIPE_SHIM_DEFINED diff --git a/platforms/posix/src/px4/windows/posix/proc/pthread.cpp b/platforms/posix/src/px4/windows/posix/proc/pthread.cpp index ee31f9d2f4..e8f20edeeb 100644 --- a/platforms/posix/src/px4/windows/posix/proc/pthread.cpp +++ b/platforms/posix/src/px4/windows/posix/proc/pthread.cpp @@ -46,7 +46,11 @@ #if defined(_MSC_VER) && !defined(__clang__) +#include #include +#include +#include +#include namespace { @@ -59,6 +63,72 @@ struct PX4ThreadStart { pthread_t self; }; +/* POSIX requires pthread_key_create() destructors to run on thread exit. The + * Win32 TLS API has no such hook, so we keep our own registry of keys with + * non-null destructors and walk it from the thread trampoline tail. Without + * this, every per-thread allocation registered via pthread_setspecific() leaks + * on the MSVC SITL build (e.g. CmdThreadSpecificData in px4_daemon::Server). 
*/ +std::mutex &tls_destructor_mutex() +{ + static std::mutex m; + return m; +} + +std::unordered_map &tls_destructors() +{ + static std::unordered_map map; + return map; +} + +std::atomic &cond_notify_callback() +{ + static std::atomic callback{nullptr}; + return callback; +} + +void run_tls_destructors_on_exit() +{ + /* POSIX allows up to PTHREAD_DESTRUCTOR_ITERATIONS (typically 4) passes + * because a destructor may install new TLS values. Snapshot under the + * mutex, run unlocked so destructors can call pthread_key_delete()/ + * pthread_setspecific() without deadlocking. */ + for (int pass = 0; pass < 4; ++pass) { + struct Pending { + pthread_key_t key; + void (*destructor)(void *); + void *value; + }; + std::vector pending; + { + std::lock_guard guard(tls_destructor_mutex()); + pending.reserve(tls_destructors().size()); + + for (const auto &entry : tls_destructors()) { + if (entry.second == nullptr) { + continue; + } + + void *value = TlsGetValue(entry.first); + + if (value == nullptr) { + continue; + } + + pending.push_back({entry.first, entry.second, value}); + } + } + + if (pending.empty()) { + return; + } + + for (const auto &p : pending) { + TlsSetValue(p.key, nullptr); + p.destructor(p.value); + } + } +} + BOOL CALLBACK init_mutex_once(PINIT_ONCE, PVOID parameter, PVOID *) { /* Static pthread mutex initializers cannot run a constructor. 
INIT_ONCE lets @@ -95,6 +165,30 @@ static HANDLE handle_for_pthread(pthread_t thread) return reinterpret_cast(thread); } +static int windows_thread_priority(int priority) +{ + if (priority >= THREAD_PRIORITY_TIME_CRITICAL) { + return THREAD_PRIORITY_TIME_CRITICAL; + + } else if (priority >= THREAD_PRIORITY_HIGHEST) { + return THREAD_PRIORITY_HIGHEST; + + } else if (priority >= THREAD_PRIORITY_ABOVE_NORMAL) { + return THREAD_PRIORITY_ABOVE_NORMAL; + + } else if (priority == THREAD_PRIORITY_NORMAL) { + return THREAD_PRIORITY_NORMAL; + + } else if (priority <= THREAD_PRIORITY_IDLE) { + return THREAD_PRIORITY_IDLE; + + } else if (priority <= THREAD_PRIORITY_LOWEST) { + return THREAD_PRIORITY_LOWEST; + } + + return THREAD_PRIORITY_BELOW_NORMAL; +} + static DWORD abstime_to_timeout_ms(const timespec *abstime) { if (!abstime) { @@ -128,6 +222,7 @@ unsigned __stdcall thread_trampoline(void *arg) delete start; const uintptr_t result = reinterpret_cast(entry(entry_arg)); + run_tls_destructors_on_exit(); _endthreadex(static_cast(result)); return static_cast(result); } @@ -459,6 +554,10 @@ int pthread_cond_signal(pthread_cond_t *cond) return EINVAL; } + if (px4_pthread_cond_notify_callback_t callback = cond_notify_callback().load(std::memory_order_acquire)) { + callback(cond, 0); + } + WakeConditionVariable(cond); return 0; } @@ -469,10 +568,20 @@ int pthread_cond_broadcast(pthread_cond_t *cond) return EINVAL; } + if (px4_pthread_cond_notify_callback_t callback = cond_notify_callback().load(std::memory_order_acquire)) { + callback(cond, 1); + } + WakeAllConditionVariable(cond); return 0; } +int px4_pthread_cond_set_notify_callback(px4_pthread_cond_notify_callback_t callback) +{ + cond_notify_callback().store(callback, std::memory_order_release); + return 0; +} + int pthread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine)(void *), void *arg) { if (!thread || !start_routine) { @@ -499,7 +608,7 @@ int pthread_create(pthread_t *thread, const 
pthread_attr_t *attr, void *(*start_ *thread = static_cast(handle); if (attr && attr->sched.sched_priority != 0) { - SetThreadPriority(reinterpret_cast(handle), attr->sched.sched_priority); + SetThreadPriority(reinterpret_cast(handle), windows_thread_priority(attr->sched.sched_priority)); } ResumeThread(reinterpret_cast(handle)); @@ -522,6 +631,10 @@ int pthread_join(pthread_t thread, void **value_ptr) HANDLE handle = reinterpret_cast(thread); if (WaitForSingleObject(handle, INFINITE) == WAIT_FAILED) { + /* Even on failure the caller has handed ownership of the handle + * to pthread_join() per POSIX semantics; close it so we don't + * leak the Win32 thread object. */ + CloseHandle(handle); return ESRCH; } @@ -547,6 +660,7 @@ int pthread_detach(pthread_t thread) void pthread_exit(void *value_ptr) { + run_tls_destructors_on_exit(); _endthreadex(static_cast(reinterpret_cast(value_ptr))); } @@ -564,6 +678,34 @@ int pthread_equal(pthread_t t1, pthread_t t2) return t1 == t2; } +int pthread_getschedparam(pthread_t thread, int *policy, struct sched_param *param) +{ + if (!policy || !param) { + return EINVAL; + } + + const int priority = GetThreadPriority(handle_for_pthread(thread)); + + if (priority == THREAD_PRIORITY_ERROR_RETURN && GetLastError() != ERROR_SUCCESS) { + return ESRCH; + } + + *policy = SCHED_OTHER; + param->sched_priority = priority; + return 0; +} + +int pthread_setschedparam(pthread_t thread, int policy, const struct sched_param *param) +{ + (void)policy; + + if (!param) { + return EINVAL; + } + + return SetThreadPriority(handle_for_pthread(thread), windows_thread_priority(param->sched_priority)) ? 
0 : ESRCH; +} + int pthread_cancel(pthread_t thread) { if (thread == 0) { @@ -584,8 +726,6 @@ int pthread_kill(pthread_t thread, int sig) int pthread_key_create(pthread_key_t *key, void (*destructor)(void *)) { - (void)destructor; - if (!key) { return EINVAL; } @@ -596,12 +736,21 @@ int pthread_key_create(pthread_key_t *key, void (*destructor)(void *)) return EAGAIN; } + if (destructor) { + std::lock_guard guard(tls_destructor_mutex()); + tls_destructors()[index] = destructor; + } + *key = index; return 0; } int pthread_key_delete(pthread_key_t key) { + { + std::lock_guard guard(tls_destructor_mutex()); + tls_destructors().erase(key); + } return TlsFree(key) ? 0 : EINVAL; } diff --git a/platforms/posix/src/px4/windows/posix/sys/time.cpp b/platforms/posix/src/px4/windows/posix/sys/time.cpp index feeab625c4..b77c0f4039 100644 --- a/platforms/posix/src/px4/windows/posix/sys/time.cpp +++ b/platforms/posix/src/px4/windows/posix/sys/time.cpp @@ -61,16 +61,27 @@ int clock_gettime(clockid_t clk_id, struct timespec *tp) if (clk_id == CLOCK_MONOTONIC) { /* QPC is monotonic and high resolution, but relative to an arbitrary - * boot-time counter. That is exactly what CLOCK_MONOTONIC promises. */ - LARGE_INTEGER frequency {}; + * boot-time counter. That is exactly what CLOCK_MONOTONIC promises. + * + * Per Microsoft's QueryPerformanceCounter guidance, the QPC frequency + * is fixed at system boot and consistent across processors, so we + * only need to query it once. clock_gettime is on PX4's hot path + * (drv_hrt's hrt_absolute_time, lockstep_scheduler, every uORB + * publish/subscribe), and a syscall here adds up quickly. 
+ */ + static const int64_t frequency = []() { + LARGE_INTEGER f {}; + QueryPerformanceFrequency(&f); + return f.QuadPart; + }(); + LARGE_INTEGER counter {}; - QueryPerformanceFrequency(&frequency); QueryPerformanceCounter(&counter); - const uint64_t seconds = static_cast(counter.QuadPart / frequency.QuadPart); - const uint64_t remainder = static_cast(counter.QuadPart % frequency.QuadPart); + const uint64_t seconds = static_cast(counter.QuadPart / frequency); + const uint64_t remainder = static_cast(counter.QuadPart % frequency); tp->tv_sec = static_cast(seconds); - tp->tv_nsec = static_cast((remainder * 1000000000ULL) / static_cast(frequency.QuadPart)); + tp->tv_nsec = static_cast((remainder * 1000000000ULL) / static_cast(frequency)); return 0; } diff --git a/platforms/posix/src/px4/windows/runtime/init.cpp b/platforms/posix/src/px4/windows/runtime/init.cpp index 12790d39b4..c16b709029 100644 --- a/platforms/posix/src/px4/windows/runtime/init.cpp +++ b/platforms/posix/src/px4/windows/runtime/init.cpp @@ -43,7 +43,35 @@ #include "px4_windows_internal.h" +#include #include +#include +#include +#include + +// timeBeginPeriod / timeEndPeriod live in winmm. Without raising the +// system timer resolution, the Windows scheduler quantizes Sleep() to +// the default ~15.6 ms HPET tick, which throttles SITL sim time to +// ~40 % of wall time. Requesting 1 ms resolution drops the floor to +// the documented minimum. +#include + +// CREATE_WAITABLE_TIMER_HIGH_RESOLUTION (Windows 10 1803+, build 17134) +// may not be defined in older SDK headers. Mirror the literal values +// documented by Microsoft - same fallback as windows_shim/unistd.h. +#ifndef CREATE_WAITABLE_TIMER_MANUAL_RESET +#define CREATE_WAITABLE_TIMER_MANUAL_RESET 0x00000001 +#endif +#ifndef CREATE_WAITABLE_TIMER_HIGH_RESOLUTION +#define CREATE_WAITABLE_TIMER_HIGH_RESOLUTION 0x00000002 +#endif + +#if defined(_MSC_VER) +// MSVC CRT debug heap: dumps unfreed allocations to stderr. 
+// We invoke it explicitly from px4_windows_exit() because the daemon +// shuts down via ExitProcess(), which bypasses _CRTDBG_LEAK_CHECK_DF. +#include +#endif /* -------------------------------------------------------------------------- * One-time process-wide initialisation. @@ -57,6 +85,181 @@ * the extern declaration in proc/ids.cpp resolves. */ volatile LONG g_px4_session_id = 0; +/* Runtime-tuned thresholds consumed by the inline usleep() shim in + * platforms/posix/include/windows_shim/unistd.h. The defaults are sized + * to give a high-resolution-timer-equipped host (Windows 10 1803+) a + * safe starting point: 5 ms pure-spin ceiling and 1 ms spin-tail. The + * tail is the *upper bound* the thread-local adaptive controller in + * usleep() may expand to; it shrinks toward the observed timer overshoot + * via an EWMA so a quiet host pays only ~p95-jitter of CPU spin per + * call. They live at file scope because every translation unit that + * includes on Windows references them through the inline body + * of usleep(). */ +extern "C" long g_usleep_pure_spin_us = 5000; +extern "C" long g_usleep_spin_tail_us = 1000; +/* Floor for the per-thread adaptive spin tail. Initialised by + * px4_windows_calibrate_usleep_threshold() to the host-measured P95 + * waitable-timer jitter so the controller never collapses below the + * value we already know is needed to cover the observed long-tail wakes. + * Defaults to a conservative 700 us when calibration cannot run (e.g. + * pre-1803 Windows with no high-resolution timer). */ +extern "C" long g_usleep_adaptive_min_tail_us = 700; + +/** + * @brief Auto-tune g_usleep_pure_spin_us against the host's measured + * high-resolution waitable-timer jitter and apply an optional + * environment override. + * + * Must be invoked exactly once and BEFORE any thread starts calling + * usleep(). 
The constructor of PX4WindowsGlobalInit calls it directly + * after timeBeginPeriod(1) - the earliest hookable point in the PX4 + * Windows startup sequence. + * + * Honors PX4_USLEEP_SPIN_US (microseconds, accepted range [0, 50000]; other values are ignored with a warning). When + * unset, probes the *exact* primitive the inline usleep() shim uses for + * the bulk wait: a CREATE_WAITABLE_TIMER_HIGH_RESOLUTION waitable timer + * armed for 1 ms via SetWaitableTimer + WaitForSingleObject. The chosen + * threshold is g_usleep_spin_tail_us + p95_jitter + 500 us margin, so + * any wait above the threshold can be served by (timer + spin tail) and + * still hit the absolute QPC deadline. Floored to 500 us and capped at + * 5000 us. + * + * The previous heuristic measured Sleep(1) jitter, but Sleep is not on + * the hot path - usleep() uses the high-resolution waitable timer, which + * is far more accurate than Sleep on Win10 1803+. Probing the wrong + * primitive made the auto-tune saturate at 5000 us on quiet hosts, which + * forced every SIH 4 ms tick into pure-spin and pegged one full core. + */ +static void px4_windows_calibrate_usleep_threshold() +{ + // 1. Honor an explicit env override first; most users / CI runs set + // this from the launcher script, so skip the probe entirely when present. + if (const char *env = std::getenv("PX4_USLEEP_SPIN_US")) { + char *end = nullptr; + long v = std::strtol(env, &end, 10); + + if (end != env && v >= 0 && v <= 50000) { + g_usleep_pure_spin_us = v; + // Env override skips probing the host so we + // have no measured P95 - keep the conservative + // upper-bound default for the adaptive floor. + g_usleep_adaptive_min_tail_us = g_usleep_spin_tail_us; + std::printf("INFO [px4_windows] usleep spin threshold (env): %ld us " + "(adaptive tail floor: %ld us)\n", + v, g_usleep_adaptive_min_tail_us); + std::fflush(stdout); + return; + } + + std::printf("WARN [px4_windows] PX4_USLEEP_SPIN_US=\"%s\" out of range [0, 50000], ignored\n", + env); + std::fflush(stdout); + } + + // 2.
Probe the actual primitive usleep() uses for the bulk wait: a + // CREATE_WAITABLE_TIMER_HIGH_RESOLUTION waitable timer armed via + // SetWaitableTimer + WaitForSingleObject. On Win10 1803+ this gives + // sub-millisecond accuracy; the residual is closed by the QPC spin + // tail (g_usleep_spin_tail_us). The threshold we want is the smallest + // value such that (waitable_timer + spin_tail) reliably hits the + // deadline. + HANDLE timer = CreateWaitableTimerExW(NULL, NULL, + CREATE_WAITABLE_TIMER_HIGH_RESOLUTION + | CREATE_WAITABLE_TIMER_MANUAL_RESET, + TIMER_ALL_ACCESS); + + if (timer == NULL) { + // Older Windows (pre-1803) lacks the high-res flag. Keep the + // historical 5000 us default - on those hosts the legacy timer + // quantizes to ~1 ms tick and the wide spin band is the safest + // behavior available. + g_usleep_pure_spin_us = 5000; + // Without a high-res timer the legacy 1 ms tick dominates; + // lock the adaptive floor to spin_tail_us so the controller + // can't shrink the spin below the safe bound on this host. + g_usleep_adaptive_min_tail_us = g_usleep_spin_tail_us; + std::printf("INFO [px4_windows] usleep spin threshold (auto): 5000 us " + "(high-res waitable timer unavailable, using legacy default)\n"); + std::fflush(stdout); + return; + } + + LARGE_INTEGER freq; + QueryPerformanceFrequency(&freq); + + // N=500 keeps ~25 samples in the p95 tail (vs 5 at N=100), which removes + // the intermittent low-p95 outlier that under-provisioned the spin tail + // and tripped the sim/wall ratio below 0.99 once every few cold boots. + // Probe cost is ~500 ms of one-time startup time (each iteration waits 1 + // ms on the high-res timer); negligible vs the robustness gain. + constexpr int N = 500; + long jitter_us[N]; + + for (int i = 0; i < N; ++i) { + LARGE_INTEGER t0; + LARGE_INTEGER t1; + LARGE_INTEGER due; + // Ask for 1 ms (10 000 x 100 ns units, negative = relative). 
+ due.QuadPart = -10000; + QueryPerformanceCounter(&t0); + + if (SetWaitableTimer(timer, &due, 0, NULL, NULL, FALSE)) { + WaitForSingleObject(timer, INFINITE); + } + + QueryPerformanceCounter(&t1); + const long actual_us = (long)(((t1.QuadPart - t0.QuadPart) * 1000000LL) / freq.QuadPart); + long delta = actual_us - 1000; + + if (delta < 0) { delta = 0; } + + jitter_us[i] = delta; + } + + CloseHandle(timer); + std::sort(jitter_us, jitter_us + N); + const long p95 = jitter_us[(int)(0.95 * N)]; + + // 3. Choose threshold = spin_tail + p95_jitter + 500 us margin. Any + // wait above this is served by (waitable timer wakes ~p95 us late + // at most + spin tail closes the residual). Floor at 500 us so the + // short-sleep band never collapses (avoids degenerate 0-cost loops); + // cap at 5000 us so a freakishly noisy host still falls back to the + // legacy behavior. + long chosen = g_usleep_spin_tail_us + p95 + 500; + + if (chosen < 500) { chosen = 500; } + + if (chosen > 5000) { chosen = 5000; } + + g_usleep_pure_spin_us = chosen; + // Right-size the spin-tail upper bound to (P95 + 500 us). This is + // the largest value the per-thread adaptive controller in usleep() + // is allowed to grow to, so we trade a couple hundred microseconds + // of CPU spin per call for a robust deadline guarantee. The +500 + // (was +300) absorbs the residual P95 underestimate even when the + // N=500 probe still under-samples a freakishly quiet host. Floored + // at 700 us (so quiet hosts still cover the typical Win10 1803+ + // jitter floor) and capped at 2000 us (the historical safe value). 
+ long sized_tail = p95 + 500; + if (sized_tail < 700) { sized_tail = 700; } + if (sized_tail > 2000) { sized_tail = 2000; } + g_usleep_spin_tail_us = sized_tail; + // Set the adaptive tail floor to the host-measured P95 jitter + // (clamped to [200, sized_tail]) so the per-thread controller in + // usleep() can never trim the spin below the value we already know + // is needed to cover this host's observed long-tail wakes. + long adaptive_floor = p95; + if (adaptive_floor < 200) { adaptive_floor = 200; } + if (adaptive_floor > sized_tail) { adaptive_floor = sized_tail; } + g_usleep_adaptive_min_tail_us = adaptive_floor; + std::printf("INFO [px4_windows] usleep spin threshold (auto): %ld us " + "(p95 high-res timer jitter: %ld us [N=%d], spin tail: %ld us, " + "adaptive tail floor: %ld us)\n", + chosen, p95, N, sized_tail, adaptive_floor); + std::fflush(stdout); +} + namespace { @@ -134,6 +337,7 @@ struct PX4WindowsGlobalInit { // Inline Linux syscall helpers (x86_64 ABI). static long long linux_syscall1(long long num, long long a) { +#if defined(__GNUC__) || defined(__clang__) long long ret; __asm__ volatile ( "syscall" @@ -142,10 +346,16 @@ struct PX4WindowsGlobalInit { : "rcx", "r11", "memory" ); return ret; +#else + (void)num; + (void)a; + return -1; +#endif } static long long linux_syscall3(long long num, long long a, long long b, long long c) { +#if defined(__GNUC__) || defined(__clang__) long long ret; __asm__ volatile ( "syscall" @@ -154,6 +364,13 @@ struct PX4WindowsGlobalInit { : "rcx", "r11", "memory" ); return ret; +#else + (void)num; + (void)a; + (void)b; + (void)c; + return -1; +#endif } static long long open_host_tty() @@ -260,6 +477,8 @@ struct PX4WindowsGlobalInit { } } + bool timer_resolution_raised = false; + PX4WindowsGlobalInit() { WSADATA wsaData; @@ -268,6 +487,24 @@ struct PX4WindowsGlobalInit { } SetConsoleOutputCP(CP_UTF8); + // Raise the global timer resolution to 1 ms. 
The default + // (~15.6 ms) makes every usleep() round up to a full HPET + // tick, throttling SITL sim time to ~40 % of wall time. The + // matching timeEndPeriod(1) lives in the destructor; Windows + // also clears the request on process exit, so a hard + // ExitProcess() path is still safe. + if (timeBeginPeriod(1) == TIMERR_NOERROR) { + timer_resolution_raised = true; + } + + // Tune g_usleep_pure_spin_us either from PX4_USLEEP_SPIN_US or + // by probing this host's Sleep(1) jitter. Must run AFTER + // timeBeginPeriod(1) so the probe sees the same scheduler + // behavior usleep() will see, and BEFORE any module thread + // has had a chance to start (we are in a static constructor, + // so PX4 main() has not yet been entered). + px4_windows_calibrate_usleep_threshold(); + // PX4 stores binary data (parameters.bson, dataman) and expects // read/write to preserve bytes exactly. MSVCRT's default text // mode maps CRLF<->LF, which corrupts arbitrary binary content. @@ -326,11 +563,101 @@ struct PX4WindowsGlobalInit { { restore_console_modes(); WSACleanup(); + + if (timer_resolution_raised) { + timeEndPeriod(1); + timer_resolution_raised = false; + } } }; static PX4WindowsGlobalInit _px4_win_init; + +// Filesystem paths the process owns and must remove on any exit path. +// Used by px4_windows_exit() to undo the byte-range lock files that the +// daemon installs in %TEMP% via set_server_running(); the explicit unlink +// in main.cpp only runs when the pxh shell loop returns normally, but the +// `pxh shutdown` command leaves via px4_platform_exit() -> ExitProcess() +// and would otherwise leak the lock and PID-companion files. +std::mutex _px4_exit_unlink_mutex; +std::vector _px4_exit_unlink_paths; + +// File descriptors held open for the lifetime of the process (typically the +// byte-range lock fd installed by set_server_running). 
Windows refuses to +// unlink a file while any handle to it is open in the same process, so the +// exit path must close these BEFORE running the registered unlinks. +std::vector _px4_exit_close_fds; + +void px4_run_exit_unlinks() +{ + std::lock_guard lock(_px4_exit_unlink_mutex); + + // Close fds first so subsequent unlink() calls don't hit ERROR_SHARING_VIOLATION. + for (int fd : _px4_exit_close_fds) { + if (fd >= 0) { + (void)::_close(fd); + } + } + + _px4_exit_close_fds.clear(); + + for (const std::string &path : _px4_exit_unlink_paths) { + // Best effort: ignore errors — the path may already be gone if a + // different shutdown route ran the explicit cleanup first. + (void)::_unlink(path.c_str()); + } + + _px4_exit_unlink_paths.clear(); +} } // namespace +extern "C" void px4_windows_register_exit_unlink(const char *path) +{ + if (path == nullptr || path[0] == '\0') { + return; + } + + std::lock_guard lock(_px4_exit_unlink_mutex); + + for (const std::string &existing : _px4_exit_unlink_paths) { + if (existing == path) { + return; // already registered + } + } + + // Hard cap on entries so a buggy caller can't grow this unboundedly; + // the daemon only registers two paths (lock + .pid). 
+ constexpr std::size_t kMaxRegistered = 16; + + if (_px4_exit_unlink_paths.size() >= kMaxRegistered) { + return; + } + + _px4_exit_unlink_paths.emplace_back(path); +} + +extern "C" void px4_windows_register_exit_close_fd(int fd) +{ + if (fd < 0) { + return; + } + + std::lock_guard lock(_px4_exit_unlink_mutex); + + for (int existing : _px4_exit_close_fds) { + if (existing == fd) { + return; // already registered + } + } + + constexpr std::size_t kMaxRegistered = 16; + + if (_px4_exit_close_fds.size() >= kMaxRegistered) { + return; + } + + _px4_exit_close_fds.push_back(fd); +} + extern "C" void px4_windows_restore_console_modes() { _px4_win_init.restore_console_modes(); @@ -381,12 +708,33 @@ extern "C" void px4_windows_exit(int status) { fflush(stdout); fflush(stderr); + + // Drop server lock + PID-companion files before tearing down the + // console. Done early so a follow-up launch racing this process can + // re-acquire the byte-range lock without falling through to the + // stale-lock recovery path in get_server_running(). + px4_run_exit_unlinks(); + +#if defined(_MSC_VER) + // ExitProcess()/TerminateProcess() skip the CRT exit chain, so + // _CRTDBG_LEAK_CHECK_DF never runs. Dump the leak report explicitly + // here, BEFORE FreeConsole() invalidates the stderr handle the CRT + // would write to. + _CrtDumpMemoryLeaks(); + fflush(stderr); +#endif + _px4_win_init.restore_console_modes(); if (!_px4_win_init.running_under_wine) { FreeConsole(); } + // Static dtors do not run under ExitProcess()/TerminateProcess(). + // Match WSAStartup() from the constructor by calling WSACleanup() + // explicitly so a soft-exit path does not appear to leak winsock state. + WSACleanup(); + if (_px4_win_init.running_under_wine) { TerminateProcess(GetCurrentProcess(), static_cast(status)); }