feat(posix): tune Windows sleep primitives

Add host-calibrated sleep and spin-tail handling for Windows SITL, plus scheduler and clock helpers needed to keep short lockstep waits precise under MSVC and MinGW.

Signed-off-by: Nuno Marques <n.marques21@hotmail.com>
This commit is contained in:
Nuno Marques
2026-05-05 20:56:58 -07:00
parent 88af828724
commit 83e69f98d4
6 changed files with 802 additions and 17 deletions
@@ -178,6 +178,11 @@ int pthread_cond_signal(pthread_cond_t *cond);
int pthread_cond_broadcast(pthread_cond_t *cond);
/** @} */
#if defined(_MSC_VER) && !defined(__clang__)
typedef void (*px4_pthread_cond_notify_callback_t)(pthread_cond_t *cond, int broadcast);
int px4_pthread_cond_set_notify_callback(px4_pthread_cond_notify_callback_t callback);
#endif
/** @name Thread lifecycle functions
*
* Wrap CreateThread/WaitForSingleObject/CloseHandle with pthread-compatible
@@ -191,6 +196,8 @@ int pthread_detach(pthread_t thread);
void pthread_exit(void *value_ptr);
pthread_t pthread_self(void);
int pthread_equal(pthread_t t1, pthread_t t2);
int pthread_getschedparam(pthread_t thread, int *policy, struct sched_param *param);
int pthread_setschedparam(pthread_t thread, int policy, const struct sched_param *param);
int pthread_cancel(pthread_t thread);
int pthread_kill(pthread_t thread, int sig);
/** @} */
@@ -244,3 +251,17 @@ int pthread_getname_np(pthread_t thread, char *name, size_t len);
#ifndef PTHREAD_STACK_MIN
#define PTHREAD_STACK_MIN 16384
#endif
#if (defined(__PX4_WINDOWS) || defined(_WIN32)) && !defined(_MSC_VER) && \
(defined(ENABLE_LOCKSTEP_SCHEDULER) || defined(PX4_WINDOWS_PTHREAD_LOCKSTEP_BRIDGE))
#ifdef __cplusplus
extern "C" {
#endif
int px4_lockstep_pthread_cond_signal(pthread_cond_t *cond);
int px4_lockstep_pthread_cond_broadcast(pthread_cond_t *cond);
#ifdef __cplusplus
}
#endif
#define pthread_cond_signal(cond_) px4_lockstep_pthread_cond_signal(cond_)
#define pthread_cond_broadcast(cond_) px4_lockstep_pthread_cond_broadcast(cond_)
#endif
+8 -6
View File
@@ -67,12 +67,14 @@ typedef int clockid_t;
#ifndef CLOCK_MONOTONIC_COARSE
#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC
#endif
#ifndef _TIMEVAL_DEFINED
#define _TIMEVAL_DEFINED
struct timeval {
long tv_sec;
long tv_usec;
};
/* The MSVC SDK declares `struct timeval` only inside <winsock.h> /
* <winsock2.h>, and does so unconditionally — no header guard. Pull it
* from there so any later <sys/socket.h>-via-<winsock2.h> include doesn't
* trigger a "redefinition" (C2011). NOMINMAX / WIN32_LEAN_AND_MEAN are
* already in effect via the SITL compile flags, so the cost here is
* mostly the winsock typedefs. */
#ifndef _WINSOCK2API_
#include <winsock2.h>
#endif
#else
#include_next <time.h>
+256 -2
View File
@@ -46,6 +46,15 @@
#if defined(_MSC_VER) && !defined(__clang__)
#include <sys/types.h>
#elif defined(_WIN32)
/*
* MinGW declares its own usleep() in <unistd.h>. Pull the rest of that
* header through normally, but hide only that declaration so PX4 can provide
* the same high-resolution Windows implementation for system_usleep.
*/
#define usleep _px4_mingw_runtime_usleep
#include_next <unistd.h>
#undef usleep
#else
#include_next <unistd.h>
#endif
@@ -115,14 +124,258 @@
extern "C" {
#endif
#if defined(_WIN32)
/* CREATE_WAITABLE_TIMER_HIGH_RESOLUTION (Windows 10 1803+; build 17134)
* may not be defined in older SDK headers - fall back to the literal
* value documented by Microsoft. Same for the manual-reset flag. */
#ifndef CREATE_WAITABLE_TIMER_MANUAL_RESET
#define CREATE_WAITABLE_TIMER_MANUAL_RESET 0x00000001
#endif
#ifndef CREATE_WAITABLE_TIMER_HIGH_RESOLUTION
#define CREATE_WAITABLE_TIMER_HIGH_RESOLUTION 0x00000002
#endif
#if defined(_MSC_VER) && !defined(__clang__)
/* Compiler-native thread-local storage qualifier used by usleep() below. */
#define PX4_WINDOWS_SLEEP_TLS __declspec(thread)
#else
#define PX4_WINDOWS_SLEEP_TLS __thread
#endif
/**
* Runtime-tuned thresholds that drive the spin-residual hybrid below.
*
* Defined and (optionally) auto-calibrated by
* px4_windows_calibrate_usleep_threshold() in
* platforms/posix/src/px4/windows/runtime/init.cpp. The calibration runs
* immediately after timeBeginPeriod(1), before any module thread starts
* calling usleep(), so the first usleep() the process performs already
* sees the tuned value.
*
* Override at process startup with the PX4_USLEEP_SPIN_US environment
* variable (microseconds). Values in [0, 50000] are accepted as-is; values
* outside that range are ignored with a warning and the auto-calibration
* runs instead. 0 effectively forces every wait > 0 us through the timer +
* spin-tail path.
*
* @c g_usleep_spin_tail_us is the *upper bound* of the QPC spin closing
* the residual after the high-resolution waitable timer wakes. The
* adaptive controller in usleep() shrinks the tail per-thread toward
* the observed timer overshoot via an EWMA, so a quiet host pays only
* ~p95 jitter of CPU spin per call instead of the worst-case bound.
*/
extern long g_usleep_pure_spin_us;
extern long g_usleep_spin_tail_us;
/* Floor for the per-thread adaptive spin tail. Initialised by the
* calibration routine to the host-measured P95 waitable-timer jitter so
* the controller never trims the spin below the value we already know
* is needed to cover this host's observed long-tail wakes. Defaults to
* a conservative 700 us when calibration cannot run. */
extern long g_usleep_adaptive_min_tail_us;
/**
 * @brief Sleep for at least @p usec microseconds with microsecond accuracy.
 *
 * Windows Sleep() is quantized to the system timer tick (~15.6 ms by
 * default; 1 ms after timeBeginPeriod(1) in init.cpp). A 4 ms sleep
 * therefore rounds up to a full timer tick, throttling SITL sim time.
 *
 * The naive Sleep() path loses ~10 % of wall time. A pure waitable
 * timer wakes within 0.3 - 0.7 ms of the target on a quiet system but
 * quantizes to 1 ms under load, so a tight SITL producer (250 Hz - 1 kHz
 * lockstep loop) accumulates 5 - 10 % drift.
 *
 * The implementation is a spin-residual hybrid:
 *
 * - Requests <= @c g_usleep_pure_spin_us are held entirely on the QPC
 *   deadline. This covers SIH's normal lockstep wall-sleep cadence
 *   (200 Hz - 2 kHz, 500 - 5000 us). Even a single 0.5 - 1 ms
 *   scheduler-late wake in that band becomes visible as sim/wall
 *   drift, so the short simulation waits pay CPU for determinism.
 *
 * - For requests > @c g_usleep_pure_spin_us the bulk of the wait runs
 *   on a high-resolution waitable timer (CreateWaitableTimerExW +
 *   CREATE_WAITABLE_TIMER_HIGH_RESOLUTION, Windows 10 1803+). The
 *   timer is armed to wake up to @c g_usleep_spin_tail_us microseconds
 *   early and the residual is closed by a QueryPerformanceCounter
 *   busy-loop. This trades up to ~g_usleep_spin_tail_us of CPU per call
 *   for microsecond-scale accuracy against the absolute QPC target.
 *
 * The HANDLE is cached per-thread in compiler-native TLS so we pay one
 * CreateWaitableTimerExW per thread for the lifetime of the process.
 *
 * @param usec Requested delay in microseconds; 0 returns immediately.
 * @return Always 0.
 */
static inline int usleep(useconds_t usec)
{
	// NOTE: there must be no coarse Sleep() ahead of the precise wait - it
	// would add a full rounded-up millisecond on top of every request.
	if (usec == 0) {
		return 0;
	}

	// Snapshot the tuned thresholds once per call. Reads of a naturally
	// aligned long are atomic on x86_64; the calibration in init.cpp runs
	// before any other thread starts, so no further synchronization is needed.
	const long pure_spin_us = g_usleep_pure_spin_us;
	const long spin_tail_us = g_usleep_spin_tail_us;
	const long adaptive_floor_us = g_usleep_adaptive_min_tail_us;

	LARGE_INTEGER qpc_freq;
	LARGE_INTEGER qpc_start;
	QueryPerformanceFrequency(&qpc_freq);
	QueryPerformanceCounter(&qpc_start);

	// Absolute QPC target = start + usec. The conversion uses 64-bit
	// integer math throughout: at 10 MHz QPC and a 1-second sleep the
	// product is 1e13, well within LONGLONG range.
	const LONGLONG qpc_target = qpc_start.QuadPart
				    + ((LONGLONG)usec * qpc_freq.QuadPart) / 1000000LL;

	// Compare in 64 bits: `long` is 32-bit on Windows, so casting a request
	// >= 2^31 us to long would wrap negative and send a very long sleep down
	// the pure-spin path.
	if ((LONGLONG)usec > (LONGLONG)pure_spin_us) {
		// Use compiler-native TLS instead of C++ thread_local because this
		// header is also included from .c translation units.
		static PX4_WINDOWS_SLEEP_TLS HANDLE timer = NULL;

		// Per-thread adaptive spin-tail state. We track the timer wake
		// overshoot (how late WaitForSingleObject returned past the
		// requested bulk deadline) as an upper-envelope EWMA in
		// microseconds, then size the spin tail at
		// (overshoot_ewma + small_margin), bounded by
		// [g_usleep_adaptive_min_tail_us, g_usleep_spin_tail_us].
		// Both values start at sentinel -1 so the first call uses the
		// configured upper-bound tail; subsequent calls converge toward
		// the host's actual jitter and trim the spin.
		static PX4_WINDOWS_SLEEP_TLS long adaptive_tail_us = -1;
		static PX4_WINDOWS_SLEEP_TLS long overshoot_ewma_us = -1;

		if (timer == NULL) {
			timer = CreateWaitableTimerExW(NULL, NULL,
						       CREATE_WAITABLE_TIMER_HIGH_RESOLUTION
						       | CREATE_WAITABLE_TIMER_MANUAL_RESET,
						       TIMER_ALL_ACCESS);

			if (timer == NULL) {
				// Older Windows: legacy manual-reset timer
				// still honors timeBeginPeriod(1).
				timer = CreateWaitableTimerW(NULL, TRUE, NULL);
			}
		}

		// Decide the spin tail for this call. First call (sentinel) -
		// fall back to the configured upper bound so we definitely
		// cover the deadline while we collect data. Subsequent calls
		// use the EWMA-derived value.
		long tail_us = (adaptive_tail_us < 0) ? spin_tail_us : adaptive_tail_us;

		// Floor at the host-measured P95 jitter (set by the calibration
		// routine in init.cpp). Trimming below this would let late timer
		// wakes land past the deadline, which directly bleeds into the
		// sim/wall ratio. The upper-bound clamp is applied last and wins.
		if (tail_us < adaptive_floor_us) { tail_us = adaptive_floor_us; }

		if (tail_us > spin_tail_us) { tail_us = spin_tail_us; }

		if (timer != NULL) {
			LARGE_INTEGER due;
			// Wake tail_us early and close the gap by spin.
			// Negative due time = relative interval, 100 ns units.
			// Clamp the bulk wait to >= 0 in case the caller asked
			// for a value just above pure_spin_us with a larger
			// spin_tail_us; the QPC spin still enforces the deadline.
			const LONGLONG bulk_us = (LONGLONG)usec - (LONGLONG)tail_us;
			const LONGLONG bulk_us_clamped = bulk_us > 0 ? bulk_us : 0;
			due.QuadPart = -(bulk_us_clamped * 10);

			if (SetWaitableTimer(timer, &due, 0, NULL, NULL, FALSE)) {
				// Use a millisecond timeout slightly longer than
				// the requested sleep rather than INFINITE: a
				// rare WaitForSingleObject misbehavior on Windows
				// (observed under heavy SITL lockstep load) can
				// otherwise hang the producer thread permanently.
				// The QPC spin below still enforces the absolute
				// deadline, so a premature wake is harmless.
				const DWORD wait_ms_bulk = (DWORD)((bulk_us_clamped + 999LL) / 1000LL);
				const DWORD wait_ms = wait_ms_bulk + 5U; // +5 ms safety margin
				WaitForSingleObject(timer, wait_ms);

				// Adaptive update: measure how late we woke vs the
				// requested bulk deadline (qpc_target - tail_us).
				// An early wake is clamped to 0; a late wake is the
				// part the spin tail had to absorb. We track an
				// upper-envelope EWMA: a late wake snaps the value up
				// immediately, a stretch of clean wakes decays it
				// down by 1/64 of the gap per call (time constant of
				// 64 calls, i.e. ~256 ms at 250 Hz). The plain mean
				// would undersize the tail because the timer jitter
				// distribution has a long upper tail and 5 % of waits
				// can wake far past the mean - each such miss bleeds
				// 100 - 1000 us into wall time and accumulates as
				// sim/wall ratio drift.
				LARGE_INTEGER wake_now;
				QueryPerformanceCounter(&wake_now);
				const LONGLONG bulk_target_qpc = qpc_target
								 - ((LONGLONG)tail_us * qpc_freq.QuadPart) / 1000000LL;
				LONGLONG overshoot_qpc = wake_now.QuadPart - bulk_target_qpc;

				if (overshoot_qpc < 0) { overshoot_qpc = 0; }

				const long overshoot_us =
					(long)((overshoot_qpc * 1000000LL) / qpc_freq.QuadPart);

				if (overshoot_ewma_us < 0) {
					// First sample: seed at the configured upper
					// bound so we don't undershoot before any
					// data has been gathered.
					overshoot_ewma_us = spin_tail_us;
				}

				// Fast attack, slow decay.
				if (overshoot_us > overshoot_ewma_us) {
					overshoot_ewma_us = overshoot_us;

				} else {
					overshoot_ewma_us =
						(overshoot_ewma_us * 63 + overshoot_us + 32) / 64;
				}

				// Size the next call's tail at envelope + 200 us
				// margin. The margin covers the residual gap between
				// the slow-decay envelope and the instantaneous
				// worst-case wake jitter; the floor and upper-bound
				// clamps keep the controller from collapsing or
				// running away.
				adaptive_tail_us = overshoot_ewma_us + 200;

				if (adaptive_tail_us < adaptive_floor_us) {
					adaptive_tail_us = adaptive_floor_us;
				}

				if (adaptive_tail_us > spin_tail_us) {
					adaptive_tail_us = spin_tail_us;
				}

			} else {
				// Arming failed (very rare). Fall through to
				// the QPC spin below; it will still hit the
				// deadline, just with a longer CPU burn.
			}

		} else {
			// No timer available at all: kernel Sleep() rounded
			// up to the nearest millisecond. The spin tail below
			// still corrects the residual.
			const LONGLONG bulk_us = (LONGLONG)usec - (LONGLONG)tail_us;
			const LONGLONG bulk_us_clamped = bulk_us > 0 ? bulk_us : 0;
			Sleep((DWORD)((bulk_us_clamped + 999LL) / 1000LL));
		}
	}

	// Close the residual against the absolute QPC target. For longer waits
	// this is at most ~spin_tail_us of spin (often less because the waitable
	// timer wakes slightly late). For SIH-sized waits it is the full request.
	LARGE_INTEGER now;

	do {
		YieldProcessor();
		QueryPerformanceCounter(&now);
	} while (now.QuadPart < qpc_target);

	return 0;
}
#undef PX4_WINDOWS_SLEEP_TLS
#if defined(_MSC_VER) && !defined(__clang__)
/** @brief Sleep for at least @p seconds seconds using Windows Sleep(). */
static inline unsigned int sleep(unsigned int seconds)
{
@@ -130,6 +383,7 @@ static inline unsigned int sleep(unsigned int seconds)
return 0;
}
#endif
#endif
/* POSIX pipe(fd[2]) - default to 64 KiB buffer and binary mode. */
#ifndef _PX4_PIPE_SHIM_DEFINED
@@ -46,7 +46,11 @@
#if defined(_MSC_VER) && !defined(__clang__)
#include <atomic>
#include <process.h>
#include <mutex>
#include <unordered_map>
#include <vector>
namespace
{
@@ -59,6 +63,72 @@ struct PX4ThreadStart {
pthread_t self;
};
/* POSIX requires pthread_key_create() destructors to run on thread exit. The
* Win32 TLS API has no such hook, so we keep our own registry of keys with
* non-null destructors and walk it from the thread trampoline tail. Without
* this, every per-thread allocation registered via pthread_setspecific() leaks
* on the MSVC SITL build (e.g. CmdThreadSpecificData in px4_daemon::Server). */
/* Accessor for the single mutex guarding the TLS destructor registry.
 * A function-local static is used so the mutex is constructed on first use. */
std::mutex &tls_destructor_mutex()
{
	static std::mutex registry_lock;
	return registry_lock;
}
/* Accessor for the key -> destructor registry. Callers in this file take
 * tls_destructor_mutex() before touching the returned map. */
std::unordered_map<pthread_key_t, void (*)(void *)> &tls_destructors()
{
	using DestructorMap = std::unordered_map<pthread_key_t, void (*)(void *)>;
	static DestructorMap registry;
	return registry;
}
std::atomic<px4_pthread_cond_notify_callback_t> &cond_notify_callback()
{
static std::atomic<px4_pthread_cond_notify_callback_t> callback{nullptr};
return callback;
}
/* Runs the registered TLS destructors for the calling thread.
 *
 * POSIX allows up to PTHREAD_DESTRUCTOR_ITERATIONS (typically 4) passes
 * because a destructor may install new TLS values. Each pass snapshots the
 * work under the registry mutex and then invokes the destructors with the
 * mutex released, so a destructor can call pthread_key_delete() /
 * pthread_setspecific() without deadlocking. */
void run_tls_destructors_on_exit()
{
	/* One destructor invocation that is due for this thread. */
	struct Pending {
		pthread_key_t key;
		void (*destructor)(void *);
		void *value;
	};

	int passes_left = 4;

	while (passes_left-- > 0) {
		std::vector<Pending> todo;

		{
			std::lock_guard<std::mutex> guard(tls_destructor_mutex());
			auto &registry = tls_destructors();
			todo.reserve(registry.size());

			for (const auto &slot : registry) {
				void (*const dtor)(void *) = slot.second;

				if (dtor == nullptr) {
					continue;
				}

				/* Only slots that currently hold a value on this thread
				 * need their destructor invoked. */
				void *const current = TlsGetValue(slot.first);

				if (current != nullptr) {
					todo.push_back({slot.first, dtor, current});
				}
			}
		}

		/* Nothing left to destroy: no further passes are needed. */
		if (todo.empty()) {
			break;
		}

		for (const Pending &item : todo) {
			/* Clear the slot before the call so a later pass only sees
			 * values the destructor itself (re)installed. */
			TlsSetValue(item.key, nullptr);
			item.destructor(item.value);
		}
	}
}
BOOL CALLBACK init_mutex_once(PINIT_ONCE, PVOID parameter, PVOID *)
{
/* Static pthread mutex initializers cannot run a constructor. INIT_ONCE lets
@@ -95,6 +165,30 @@ static HANDLE handle_for_pthread(pthread_t thread)
return reinterpret_cast<HANDLE>(thread);
}
/* Snaps an arbitrary integer priority onto one of the seven Win32
 * THREAD_PRIORITY_* levels accepted by SetThreadPriority().
 *
 * Values at or above a level map down to the highest level they reach,
 * values at or below a negative level map up to the lowest level they
 * reach, and anything left over becomes THREAD_PRIORITY_BELOW_NORMAL. */
static int windows_thread_priority(int priority)
{
	/* Upper half, checked from the top level downwards. */
	if (priority >= THREAD_PRIORITY_TIME_CRITICAL) {
		return THREAD_PRIORITY_TIME_CRITICAL;
	}

	if (priority >= THREAD_PRIORITY_HIGHEST) {
		return THREAD_PRIORITY_HIGHEST;
	}

	if (priority >= THREAD_PRIORITY_ABOVE_NORMAL) {
		return THREAD_PRIORITY_ABOVE_NORMAL;
	}

	if (priority == THREAD_PRIORITY_NORMAL) {
		return THREAD_PRIORITY_NORMAL;
	}

	/* Lower half, checked from the bottom level upwards. */
	if (priority <= THREAD_PRIORITY_IDLE) {
		return THREAD_PRIORITY_IDLE;
	}

	return (priority <= THREAD_PRIORITY_LOWEST)
	       ? THREAD_PRIORITY_LOWEST
	       : THREAD_PRIORITY_BELOW_NORMAL;
}
static DWORD abstime_to_timeout_ms(const timespec *abstime)
{
if (!abstime) {
@@ -128,6 +222,7 @@ unsigned __stdcall thread_trampoline(void *arg)
delete start;
const uintptr_t result = reinterpret_cast<uintptr_t>(entry(entry_arg));
run_tls_destructors_on_exit();
_endthreadex(static_cast<unsigned>(result));
return static_cast<unsigned>(result);
}
@@ -459,6 +554,10 @@ int pthread_cond_signal(pthread_cond_t *cond)
return EINVAL;
}
if (px4_pthread_cond_notify_callback_t callback = cond_notify_callback().load(std::memory_order_acquire)) {
callback(cond, 0);
}
WakeConditionVariable(cond);
return 0;
}
@@ -469,10 +568,20 @@ int pthread_cond_broadcast(pthread_cond_t *cond)
return EINVAL;
}
if (px4_pthread_cond_notify_callback_t callback = cond_notify_callback().load(std::memory_order_acquire)) {
callback(cond, 1);
}
WakeAllConditionVariable(cond);
return 0;
}
/* Installs (or, with nullptr, removes) the hook that pthread_cond_signal()
 * and pthread_cond_broadcast() invoke before waking waiters.
 * The store uses release ordering to pair with the acquire loads on the
 * signal/broadcast side. Always returns 0. */
int px4_pthread_cond_set_notify_callback(px4_pthread_cond_notify_callback_t callback)
{
	std::atomic<px4_pthread_cond_notify_callback_t> &slot = cond_notify_callback();
	slot.store(callback, std::memory_order_release);
	return 0;
}
int pthread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine)(void *), void *arg)
{
if (!thread || !start_routine) {
@@ -499,7 +608,7 @@ int pthread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*start_
*thread = static_cast<pthread_t>(handle);
if (attr && attr->sched.sched_priority != 0) {
SetThreadPriority(reinterpret_cast<HANDLE>(handle), attr->sched.sched_priority);
SetThreadPriority(reinterpret_cast<HANDLE>(handle), windows_thread_priority(attr->sched.sched_priority));
}
ResumeThread(reinterpret_cast<HANDLE>(handle));
@@ -522,6 +631,10 @@ int pthread_join(pthread_t thread, void **value_ptr)
HANDLE handle = reinterpret_cast<HANDLE>(thread);
if (WaitForSingleObject(handle, INFINITE) == WAIT_FAILED) {
/* Even on failure the caller has handed ownership of the handle
* to pthread_join() per POSIX semantics; close it so we don't
* leak the Win32 thread object. */
CloseHandle(handle);
return ESRCH;
}
@@ -547,6 +660,7 @@ int pthread_detach(pthread_t thread)
/* Terminates the calling thread. TLS destructors are run first, because
 * _endthreadex() does not return; the pointer value is then truncated to
 * the unsigned exit code that _endthreadex() accepts. */
void pthread_exit(void *value_ptr)
{
	run_tls_destructors_on_exit();

	const uintptr_t raw_result = reinterpret_cast<uintptr_t>(value_ptr);
	_endthreadex(static_cast<unsigned>(raw_result));
}
@@ -564,6 +678,34 @@ int pthread_equal(pthread_t t1, pthread_t t2)
return t1 == t2;
}
/* Reports the scheduling parameters of @p thread.
 *
 * The policy is always reported as SCHED_OTHER and the priority is the raw
 * value returned by GetThreadPriority().
 *
 * Returns 0 on success, EINVAL when an output pointer is null, and ESRCH
 * when the priority query fails. */
int pthread_getschedparam(pthread_t thread, int *policy, struct sched_param *param)
{
	if (policy == nullptr || param == nullptr) {
		return EINVAL;
	}

	HANDLE target = handle_for_pthread(thread);
	const int win_priority = GetThreadPriority(target);

	/* The sentinel alone is not treated as failure; the last-error code
	 * must confirm it (only consulted when the sentinel was returned). */
	if (win_priority == THREAD_PRIORITY_ERROR_RETURN) {
		if (GetLastError() != ERROR_SUCCESS) {
			return ESRCH;
		}
	}

	*policy = SCHED_OTHER;
	param->sched_priority = win_priority;
	return 0;
}
/* Applies @p param->sched_priority to @p thread after snapping it to a
 * Win32 THREAD_PRIORITY_* level. The @p policy argument is ignored.
 *
 * Returns 0 on success, EINVAL when @p param is null, and ESRCH when
 * SetThreadPriority() fails. */
int pthread_setschedparam(pthread_t thread, int policy, const struct sched_param *param)
{
	(void)policy;

	if (param == nullptr) {
		return EINVAL;
	}

	HANDLE target = handle_for_pthread(thread);
	const int win_priority = windows_thread_priority(param->sched_priority);

	if (!SetThreadPriority(target, win_priority)) {
		return ESRCH;
	}

	return 0;
}
int pthread_cancel(pthread_t thread)
{
if (thread == 0) {
@@ -584,8 +726,6 @@ int pthread_kill(pthread_t thread, int sig)
int pthread_key_create(pthread_key_t *key, void (*destructor)(void *))
{
(void)destructor;
if (!key) {
return EINVAL;
}
@@ -596,12 +736,21 @@ int pthread_key_create(pthread_key_t *key, void (*destructor)(void *))
return EAGAIN;
}
if (destructor) {
std::lock_guard<std::mutex> guard(tls_destructor_mutex());
tls_destructors()[index] = destructor;
}
*key = index;
return 0;
}
/* Releases a TLS key: drops any destructor registered for it, then frees
 * the Win32 TLS index. Returns 0 on success or EINVAL if TlsFree() fails. */
int pthread_key_delete(pthread_key_t key)
{
	{
		/* Hold the registry mutex only for the erase itself. */
		std::lock_guard<std::mutex> guard(tls_destructor_mutex());
		auto &registry = tls_destructors();
		registry.erase(key);
	}

	if (!TlsFree(key)) {
		return EINVAL;
	}

	return 0;
}
@@ -61,16 +61,27 @@ int clock_gettime(clockid_t clk_id, struct timespec *tp)
if (clk_id == CLOCK_MONOTONIC) {
/* QPC is monotonic and high resolution, but relative to an arbitrary
* boot-time counter. That is exactly what CLOCK_MONOTONIC promises. */
LARGE_INTEGER frequency {};
* boot-time counter. That is exactly what CLOCK_MONOTONIC promises.
*
* Per Microsoft's QueryPerformanceCounter guidance, the QPC frequency
* is fixed at system boot and consistent across processors, so we
* only need to query it once. clock_gettime is on PX4's hot path
* (drv_hrt's hrt_absolute_time, lockstep_scheduler, every uORB
* publish/subscribe), and a syscall here adds up quickly.
*/
static const int64_t frequency = []() {
LARGE_INTEGER f {};
QueryPerformanceFrequency(&f);
return f.QuadPart;
}();
LARGE_INTEGER counter {};
QueryPerformanceFrequency(&frequency);
QueryPerformanceCounter(&counter);
const uint64_t seconds = static_cast<uint64_t>(counter.QuadPart / frequency.QuadPart);
const uint64_t remainder = static_cast<uint64_t>(counter.QuadPart % frequency.QuadPart);
const uint64_t seconds = static_cast<uint64_t>(counter.QuadPart / frequency);
const uint64_t remainder = static_cast<uint64_t>(counter.QuadPart % frequency);
tp->tv_sec = static_cast<time_t>(seconds);
tp->tv_nsec = static_cast<long>((remainder * 1000000000ULL) / static_cast<uint64_t>(frequency.QuadPart));
tp->tv_nsec = static_cast<long>((remainder * 1000000000ULL) / static_cast<uint64_t>(frequency));
return 0;
}
@@ -43,7 +43,35 @@
#include "px4_windows_internal.h"
#include <algorithm>
#include <array>
#include <mutex>
#include <string>
#include <vector>
// timeBeginPeriod / timeEndPeriod live in winmm. Without raising the
// system timer resolution, the Windows scheduler quantizes Sleep() to
// the default ~15.6 ms HPET tick, which throttles SITL sim time to
// ~40 % of wall time. Requesting 1 ms resolution drops the floor to
// the documented minimum.
#include <timeapi.h>
// CREATE_WAITABLE_TIMER_HIGH_RESOLUTION (Windows 10 1803+, build 17134)
// may not be defined in older SDK headers. Mirror the literal values
// documented by Microsoft - same fallback as windows_shim/unistd.h.
#ifndef CREATE_WAITABLE_TIMER_MANUAL_RESET
#define CREATE_WAITABLE_TIMER_MANUAL_RESET 0x00000001
#endif
#ifndef CREATE_WAITABLE_TIMER_HIGH_RESOLUTION
#define CREATE_WAITABLE_TIMER_HIGH_RESOLUTION 0x00000002
#endif
#if defined(_MSC_VER)
// MSVC CRT debug heap: dumps unfreed allocations to stderr.
// We invoke it explicitly from px4_windows_exit() because the daemon
// shuts down via ExitProcess(), which bypasses _CRTDBG_LEAK_CHECK_DF.
#include <crtdbg.h>
#endif
/* --------------------------------------------------------------------------
* One-time process-wide initialisation.
@@ -57,6 +85,181 @@
* the extern declaration in proc/ids.cpp resolves. */
volatile LONG g_px4_session_id = 0;
/* Runtime-tuned thresholds consumed by the inline usleep() shim in
* platforms/posix/include/windows_shim/unistd.h. The defaults are sized
* to give a high-resolution-timer-equipped host (Windows 10 1803+) a
* safe starting point: 5 ms pure-spin ceiling and 1 ms spin-tail. The
* tail is the *upper bound* the thread-local adaptive controller in
* usleep() may expand to; it shrinks toward the observed timer overshoot
* via an EWMA so a quiet host pays only ~p95-jitter of CPU spin per
* call. They live at file scope because every translation unit that
* includes <unistd.h> on Windows references them through the inline body
* of usleep(). */
extern "C" long g_usleep_pure_spin_us = 5000;
extern "C" long g_usleep_spin_tail_us = 1000;
/* Floor for the per-thread adaptive spin tail. Initialised by
* px4_windows_calibrate_usleep_threshold() to the host-measured P95
* waitable-timer jitter so the controller never collapses below the
* value we already know is needed to cover the observed long-tail wakes.
* Defaults to a conservative 700 us when calibration cannot run (e.g.
* pre-1803 Windows with no high-resolution timer). */
extern "C" long g_usleep_adaptive_min_tail_us = 700;
/**
* @brief Auto-tune g_usleep_pure_spin_us against the host's measured
* high-resolution waitable-timer jitter and apply an optional
* environment override.
*
* Must be invoked exactly once and BEFORE any thread starts calling
* usleep(). The constructor of PX4WindowsGlobalInit calls it directly
* after timeBeginPeriod(1) - the earliest hookable point in the PX4
* Windows startup sequence.
*
* Honors PX4_USLEEP_SPIN_US (microseconds, clamped to [0, 50000]). When
* unset, probes the *exact* primitive the inline usleep() shim uses for
* the bulk wait: a CREATE_WAITABLE_TIMER_HIGH_RESOLUTION waitable timer
* armed for 1 ms via SetWaitableTimer + WaitForSingleObject. The chosen
* threshold is g_usleep_spin_tail_us + p95_jitter + 500 us margin, so
* any wait above the threshold can be served by (timer + spin tail) and
* still hit the absolute QPC deadline. Floored to 500 us and capped at
* 5000 us.
*
* The previous heuristic measured Sleep(1) jitter, but Sleep is not on
* the hot path - usleep() uses the high-resolution waitable timer, which
* is far more accurate than Sleep on Win10 1803+. Probing the wrong
* primitive made the auto-tune saturate at 5000 us on quiet hosts, which
* forced every SIH 4 ms tick into pure-spin and pegged one full core.
*/
static void px4_windows_calibrate_usleep_threshold()
{
// 1. Honor an explicit env override first; most users / CI runs set
// this from the launcher script, so skip the probe entirely when present.
if (const char *env = std::getenv("PX4_USLEEP_SPIN_US")) {
char *end = nullptr;
long v = std::strtol(env, &end, 10);
if (end != env && v >= 0 && v <= 50000) {
g_usleep_pure_spin_us = v;
// Env override skips probing the host so we
// have no measured P95 - keep the conservative
// upper-bound default for the adaptive floor.
g_usleep_adaptive_min_tail_us = g_usleep_spin_tail_us;
std::printf("INFO [px4_windows] usleep spin threshold (env): %ld us "
"(adaptive tail floor: %ld us)\n",
v, g_usleep_adaptive_min_tail_us);
std::fflush(stdout);
return;
}
std::printf("WARN [px4_windows] PX4_USLEEP_SPIN_US=\"%s\" out of range [0, 50000], ignored\n",
env);
std::fflush(stdout);
}
// 2. Probe the actual primitive usleep() uses for the bulk wait: a
// CREATE_WAITABLE_TIMER_HIGH_RESOLUTION waitable timer armed via
// SetWaitableTimer + WaitForSingleObject. On Win10 1803+ this gives
// sub-millisecond accuracy; the residual is closed by the QPC spin
// tail (g_usleep_spin_tail_us). The threshold we want is the smallest
// value such that (waitable_timer + spin_tail) reliably hits the
// deadline.
HANDLE timer = CreateWaitableTimerExW(NULL, NULL,
CREATE_WAITABLE_TIMER_HIGH_RESOLUTION
| CREATE_WAITABLE_TIMER_MANUAL_RESET,
TIMER_ALL_ACCESS);
if (timer == NULL) {
// Older Windows (pre-1803) lacks the high-res flag. Keep the
// historical 5000 us default - on those hosts the legacy timer
// quantizes to ~1 ms tick and the wide spin band is the safest
// behavior available.
g_usleep_pure_spin_us = 5000;
// Without a high-res timer the legacy 1 ms tick dominates;
// lock the adaptive floor to spin_tail_us so the controller
// can't shrink the spin below the safe bound on this host.
g_usleep_adaptive_min_tail_us = g_usleep_spin_tail_us;
std::printf("INFO [px4_windows] usleep spin threshold (auto): 5000 us "
"(high-res waitable timer unavailable, using legacy default)\n");
std::fflush(stdout);
return;
}
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq);
// N=500 keeps ~25 samples in the p95 tail (vs 5 at N=100), which removes
// the intermittent low-p95 outlier that under-provisioned the spin tail
// and tripped the sim/wall ratio below 0.99 once every few cold boots.
// Probe cost is ~500 ms of one-time startup time (each iteration waits 1
// ms on the high-res timer); negligible vs the robustness gain.
constexpr int N = 500;
long jitter_us[N];
for (int i = 0; i < N; ++i) {
LARGE_INTEGER t0;
LARGE_INTEGER t1;
LARGE_INTEGER due;
// Ask for 1 ms (10 000 x 100 ns units, negative = relative).
due.QuadPart = -10000;
QueryPerformanceCounter(&t0);
if (SetWaitableTimer(timer, &due, 0, NULL, NULL, FALSE)) {
WaitForSingleObject(timer, INFINITE);
}
QueryPerformanceCounter(&t1);
const long actual_us = (long)(((t1.QuadPart - t0.QuadPart) * 1000000LL) / freq.QuadPart);
long delta = actual_us - 1000;
if (delta < 0) { delta = 0; }
jitter_us[i] = delta;
}
CloseHandle(timer);
std::sort(jitter_us, jitter_us + N);
const long p95 = jitter_us[(int)(0.95 * N)];
// 3. Choose threshold = spin_tail + p95_jitter + 500 us margin. Any
// wait above this is served by (waitable timer wakes ~p95 us late
// at most + spin tail closes the residual). Floor at 500 us so the
// short-sleep band never collapses (avoids degenerate 0-cost loops);
// cap at 5000 us so a freakishly noisy host still falls back to the
// legacy behavior.
long chosen = g_usleep_spin_tail_us + p95 + 500;
if (chosen < 500) { chosen = 500; }
if (chosen > 5000) { chosen = 5000; }
g_usleep_pure_spin_us = chosen;
// Right-size the spin-tail upper bound to (P95 + 500 us). This is
// the largest value the per-thread adaptive controller in usleep()
// is allowed to grow to, so we trade a couple hundred microseconds
// of CPU spin per call for a robust deadline guarantee. The +500
// (was +300) absorbs the residual P95 underestimate even when the
// N=500 probe still under-samples a freakishly quiet host. Floored
// at 700 us (so quiet hosts still cover the typical Win10 1803+
// jitter floor) and capped at 2000 us (the historical safe value).
long sized_tail = p95 + 500;
if (sized_tail < 700) { sized_tail = 700; }
if (sized_tail > 2000) { sized_tail = 2000; }
g_usleep_spin_tail_us = sized_tail;
// Set the adaptive tail floor to the host-measured P95 jitter
// (clamped to [200, sized_tail]) so the per-thread controller in
// usleep() can never trim the spin below the value we already know
// is needed to cover this host's observed long-tail wakes.
long adaptive_floor = p95;
if (adaptive_floor < 200) { adaptive_floor = 200; }
if (adaptive_floor > sized_tail) { adaptive_floor = sized_tail; }
g_usleep_adaptive_min_tail_us = adaptive_floor;
std::printf("INFO [px4_windows] usleep spin threshold (auto): %ld us "
"(p95 high-res timer jitter: %ld us [N=%d], spin tail: %ld us, "
"adaptive tail floor: %ld us)\n",
chosen, p95, N, sized_tail, adaptive_floor);
std::fflush(stdout);
}
namespace
{
@@ -134,6 +337,7 @@ struct PX4WindowsGlobalInit {
// Inline Linux syscall helpers (x86_64 ABI).
static long long linux_syscall1(long long num, long long a)
{
#if defined(__GNUC__) || defined(__clang__)
long long ret;
__asm__ volatile (
"syscall"
@@ -142,10 +346,16 @@ struct PX4WindowsGlobalInit {
: "rcx", "r11", "memory"
);
return ret;
#else
(void)num;
(void)a;
return -1;
#endif
}
static long long linux_syscall3(long long num, long long a, long long b, long long c)
{
#if defined(__GNUC__) || defined(__clang__)
long long ret;
__asm__ volatile (
"syscall"
@@ -154,6 +364,13 @@ struct PX4WindowsGlobalInit {
: "rcx", "r11", "memory"
);
return ret;
#else
(void)num;
(void)a;
(void)b;
(void)c;
return -1;
#endif
}
static long long open_host_tty()
@@ -260,6 +477,8 @@ struct PX4WindowsGlobalInit {
}
}
bool timer_resolution_raised = false;
PX4WindowsGlobalInit()
{
WSADATA wsaData;
@@ -268,6 +487,24 @@ struct PX4WindowsGlobalInit {
}
SetConsoleOutputCP(CP_UTF8);
// Raise the global timer resolution to 1 ms. The default
// (~15.6 ms) makes every usleep() round up to a full HPET
// tick, throttling SITL sim time to ~40 % of wall time. The
// matching timeEndPeriod(1) lives in the destructor; Windows
// also clears the request on process exit, so a hard
// ExitProcess() path is still safe.
if (timeBeginPeriod(1) == TIMERR_NOERROR) {
timer_resolution_raised = true;
}
// Tune g_usleep_pure_spin_us either from PX4_USLEEP_SPIN_US or
// by probing this host's high-resolution waitable-timer jitter.
// Must run AFTER timeBeginPeriod(1) so the probe sees the same
// scheduler behavior usleep() will see, and BEFORE any module
// thread has had a chance to start (we are in a static
// constructor, so PX4 main() has not yet been entered).
px4_windows_calibrate_usleep_threshold();
// PX4 stores binary data (parameters.bson, dataman) and expects
// read/write to preserve bytes exactly. MSVCRT's default text
// mode maps CRLF<->LF, which corrupts arbitrary binary content.
@@ -326,11 +563,101 @@ struct PX4WindowsGlobalInit {
{
restore_console_modes();
WSACleanup();
if (timer_resolution_raised) {
timeEndPeriod(1);
timer_resolution_raised = false;
}
}
};
static PX4WindowsGlobalInit _px4_win_init;
// Filesystem paths the process owns and must remove on any exit path.
// Used by px4_windows_exit() to undo the byte-range lock files that the
// daemon installs in %TEMP% via set_server_running(); the explicit unlink
// in main.cpp only runs when the pxh shell loop returns normally, but the
// `pxh shutdown` command leaves via px4_platform_exit() -> ExitProcess()
// and would otherwise leak the lock and PID-companion files.
std::mutex _px4_exit_unlink_mutex;
std::vector<std::string> _px4_exit_unlink_paths;
// File descriptors held open for the lifetime of the process (typically the
// byte-range lock fd installed by set_server_running). Windows refuses to
// unlink a file while any handle to it is open in the same process, so the
// exit path must close these BEFORE running the registered unlinks.
std::vector<int> _px4_exit_close_fds;
/* Closes every registered descriptor, then removes every registered path.
 * Both registries are emptied, so a second call is a no-op. The whole
 * operation runs under _px4_exit_unlink_mutex. */
void px4_run_exit_unlinks()
{
	std::lock_guard<std::mutex> lock(_px4_exit_unlink_mutex);

	// Descriptors go first so the unlink() calls below don't hit
	// ERROR_SHARING_VIOLATION on files this process still holds open.
	std::for_each(_px4_exit_close_fds.begin(), _px4_exit_close_fds.end(), [](int fd) {
		if (fd < 0) {
			return;
		}

		(void)::_close(fd);
	});
	_px4_exit_close_fds.clear();

	// Best effort: errors are ignored because the path may already be gone
	// if a different shutdown route ran the explicit cleanup first.
	std::for_each(_px4_exit_unlink_paths.begin(), _px4_exit_unlink_paths.end(), [](const std::string &path) {
		(void)::_unlink(path.c_str());
	});
	_px4_exit_unlink_paths.clear();
}
} // namespace
/* Registers @p path for removal by px4_run_exit_unlinks().
 * Null or empty paths, duplicates, and registrations beyond the 16-entry
 * cap are silently ignored. */
extern "C" void px4_windows_register_exit_unlink(const char *path)
{
	const bool usable = (path != nullptr) && (path[0] != '\0');

	if (!usable) {
		return;
	}

	std::lock_guard<std::mutex> lock(_px4_exit_unlink_mutex);

	const auto first = _px4_exit_unlink_paths.begin();
	const auto last = _px4_exit_unlink_paths.end();

	if (std::find(first, last, path) != last) {
		return; // already registered
	}

	// Hard cap on entries so a buggy caller can't grow this unboundedly;
	// the daemon only registers two paths (lock + .pid).
	constexpr std::size_t kMaxRegistered = 16;

	if (_px4_exit_unlink_paths.size() < kMaxRegistered) {
		_px4_exit_unlink_paths.emplace_back(path);
	}
}
/* Registers @p fd to be closed by px4_run_exit_unlinks() before any path is
 * unlinked. Negative descriptors, duplicates, and registrations beyond the
 * 16-entry cap are silently ignored. */
extern "C" void px4_windows_register_exit_close_fd(int fd)
{
	if (fd < 0) {
		return;
	}

	std::lock_guard<std::mutex> lock(_px4_exit_unlink_mutex);

	const auto first = _px4_exit_close_fds.begin();
	const auto last = _px4_exit_close_fds.end();

	if (std::find(first, last, fd) != last) {
		return; // already registered
	}

	// Same bound as the path registry: keeps a buggy caller from growing
	// the list without limit.
	constexpr std::size_t kMaxRegistered = 16;

	if (_px4_exit_close_fds.size() < kMaxRegistered) {
		_px4_exit_close_fds.push_back(fd);
	}
}
extern "C" void px4_windows_restore_console_modes()
{
_px4_win_init.restore_console_modes();
@@ -381,12 +708,33 @@ extern "C" void px4_windows_exit(int status)
{
fflush(stdout);
fflush(stderr);
// Drop server lock + PID-companion files before tearing down the
// console. Done early so a follow-up launch racing this process can
// re-acquire the byte-range lock without falling through to the
// stale-lock recovery path in get_server_running().
px4_run_exit_unlinks();
#if defined(_MSC_VER)
// ExitProcess()/TerminateProcess() skip the CRT exit chain, so
// _CRTDBG_LEAK_CHECK_DF never runs. Dump the leak report explicitly
// here, BEFORE FreeConsole() invalidates the stderr handle the CRT
// would write to.
_CrtDumpMemoryLeaks();
fflush(stderr);
#endif
_px4_win_init.restore_console_modes();
if (!_px4_win_init.running_under_wine) {
FreeConsole();
}
// Static dtors do not run under ExitProcess()/TerminateProcess().
// Match WSAStartup() from the constructor by calling WSACleanup()
// explicitly so a soft-exit path does not appear to leak winsock state.
WSACleanup();
if (_px4_win_init.running_under_wine) {
TerminateProcess(GetCurrentProcess(), static_cast<UINT>(status));
}