fix(lockstep): bound scheduler waits at high speed

Use wall-time watchdogs for lockstep component barriers and condition waits so one delayed participant cannot stall the simulator forever.

The HRT work queue gains an explicit semaphore wakeup path on Windows, work queues mark lockstep progress while active, and SIH uses wall-time sleeps/yields so speed factors above realtime are not limited by coarse sleeps.

Signed-off-by: Nuno Marques <n.marques21@hotmail.com>
This commit is contained in:
Nuno Marques
2026-04-27 15:26:22 -07:00
parent 1974fe5945
commit f8da039518
7 changed files with 180 additions and 12 deletions
@@ -192,6 +192,10 @@ void WorkQueue::Run()
#if defined(ENABLE_LOCKSTEP_SCHEDULER)
if (_lockstep_component != -1) {
px4_lockstep_progress(_lockstep_component);
}
if (_q.empty()) {
px4_lockstep_unregister_component(_lockstep_component);
_lockstep_component = -1;
+10 -1
View File
@@ -106,6 +106,7 @@
int hrt_work_queue(struct work_s *work, worker_t worker, void *arg, uint32_t delay)
{
struct wqueue_s *wqueue = &g_hrt_work;
const int wake_worker = (px4_getpid() != wqueue->pid);
/* First, initialize the work structure */
@@ -124,9 +125,17 @@ int hrt_work_queue(struct work_s *work, worker_t worker, void *arg, uint32_t del
dq_addlast(&work->dq, &wqueue->q);
if (px4_getpid() != wqueue->pid) { /* only need to wake up if called from a different thread */
if (wake_worker) { /* only need to wake up if called from a different thread */
//wqueue->pid == own task? -> don't signal
#if defined(__PX4_WINDOWS)
int sem_val;
if (px4_sem_getvalue(&_hrt_work_signal, &sem_val) == 0 && sem_val <= 0) {
px4_sem_post(&_hrt_work_signal);
}
#else
px4_task_kill(wqueue->pid, SIGCONT); /* Wake up the worker thread */
#endif
}
hrt_work_unlock();
+14 -3
View File
@@ -73,6 +73,9 @@ struct wqueue_s g_hrt_work;
* Private Variables
****************************************************************************/
px4_sem_t _hrt_work_lock;
#if defined(__PX4_WINDOWS)
px4_sem_t _hrt_work_signal;
#endif
/****************************************************************************
* Private Functions
@@ -223,13 +226,17 @@ static void hrt_work_process(void)
}
/* Wait awhile to check the work list. We will wait here until either
* the time elapses or until we are awakened by a signal.
* the time elapses or until a newly queued item wakes us.
*/
hrt_work_unlock();
/* might sleep less if a signal received and new item was queued */
//PX4_INFO("Sleeping for %u usec", next);
#if defined(__PX4_WINDOWS)
struct timespec ts;
abstime_to_ts(&ts, hrt_absolute_time() + next);
px4_sem_timedwait(&_hrt_work_signal, &ts);
#else
px4_usleep(next);
#endif
}
/****************************************************************************
@@ -286,6 +293,10 @@ static int work_hrtthread(int argc, char *argv[])
void hrt_work_queue_init(void)
{
px4_sem_init(&_hrt_work_lock, 0, 1);
#if defined(__PX4_WINDOWS)
px4_sem_init(&_hrt_work_signal, 0, 0);
px4_sem_setprotocol(&_hrt_work_signal, SEM_PRIO_NONE);
#endif
memset(&g_hrt_work, 0, sizeof(g_hrt_work));
// Create high priority worker thread
+3
View File
@@ -41,6 +41,9 @@
__BEGIN_DECLS
extern px4_sem_t _hrt_work_lock;
#if defined(__PX4_WINDOWS)
extern px4_sem_t _hrt_work_signal;
#endif
extern struct wqueue_s g_hrt_work;
void hrt_work_queue_init(void);
@@ -40,7 +40,36 @@
#include <drivers/drv_hrt.h>
#include <px4_platform_common/log.h>
#include <px4_platform_common/tasks.h>
#include <px4_platform_common/time.h>
#include <limits.h>
#include <stdlib.h>
static uint64_t lockstep_wall_time_us()
{
timespec ts{};
system_clock_gettime(CLOCK_MONOTONIC, &ts);
return (static_cast<uint64_t>(ts.tv_sec) * 1000000ULL) + (static_cast<uint64_t>(ts.tv_nsec) / 1000ULL);
}
static uint64_t lockstep_component_wait_budget_us()
{
static const uint64_t wait_budget_us = []() -> uint64_t {
const char *speed_factor_env = getenv("PX4_SIM_SPEED_FACTOR");
if (speed_factor_env != nullptr) {
const double speed_factor = strtod(speed_factor_env, nullptr);
if (speed_factor > 0.) {
const uint64_t budget = static_cast<uint64_t>(1000. / speed_factor);
return budget < 10ULL ? 10ULL : (budget > 100ULL ? 100ULL : budget);
}
}
return 100ULL;
}();
return wait_budget_us;
}
LockstepComponents::LockstepComponents(bool no_cleanup_on_destroy)
: _no_cleanup_on_destroy(no_cleanup_on_destroy)
@@ -128,5 +157,28 @@ void LockstepComponents::wait_for_components()
return;
}
#if defined(__PX4_WINDOWS) || defined(__PX4_LINUX)
// Do not let one temporarily blocked component stall the simulator forever.
// Keep the watchdog short: high lockstep speed factors need sub-millisecond
// barrier overhead, and active components normally release the latch first.
const uint64_t wait_start_us = lockstep_wall_time_us();
const uint64_t max_wait_us = lockstep_component_wait_budget_us();
unsigned spin_count = 0;
while (px4_sem_trywait(&_components_sem) != 0) {
if (_components_used_bitset == 0) {
return;
}
if ((lockstep_wall_time_us() - wait_start_us) >= max_wait_us) {
return;
}
if (++spin_count % 16 == 0) {
sched_yield();
}
}
#else
while (px4_sem_wait(&_components_sem) != 0) {}
#endif
}
@@ -124,7 +124,49 @@ int LockstepScheduler::cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *loc
}
}
int result = pthread_cond_wait(cond, lock);
// On most pthread implementations pthread_cond_wait here would
// suffice — set_absolute_time broadcasts when our target is reached.
// winpthreads (MinGW), however, occasionally drops the broadcast
// when it races the waiter entering the wait, which leaves the
// thread blocked forever. Use pthread_cond_timedwait with a long
// real-time timeout (500 ms) as a watchdog: under a correct
// implementation the timeout never fires; under winpthreads it
// caps any lost-broadcast hang at 500 ms instead of forever.
int result = 0;
while (!timed_wait.timeout) {
struct timespec poll_deadline;
clock_gettime(CLOCK_REALTIME, &poll_deadline);
long total_ns = poll_deadline.tv_nsec + 500L * 1000L * 1000L;
poll_deadline.tv_sec += total_ns / 1000000000L;
poll_deadline.tv_nsec = total_ns % 1000000000L;
result = pthread_cond_timedwait(cond, lock, &poll_deadline);
if (timed_wait.timeout) {
break;
}
if (result == ETIMEDOUT) {
// Watchdog fired; re-check _time_us in case we lost the
// broadcast. _timed_waits_mutex is taken here only to
// synchronise with set_absolute_time.
_timed_waits_mutex.lock();
if (time_us <= _time_us) {
timed_wait.timeout = true;
}
_timed_waits_mutex.unlock();
continue;
}
if (result != 0) {
break;
}
// Spurious wakeup or real signal not reflected in `timeout`
// (shouldn't happen normally). Loop to re-check.
}
const bool timeout = timed_wait.timeout;
@@ -132,6 +174,13 @@ int LockstepScheduler::cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *loc
result = ETIMEDOUT;
}
if (result == ETIMEDOUT && !timeout) {
// pthread_cond_timedwait returned ETIMEDOUT but broadcast also
// flipped `timeout` true between our check and this moment — we
// re-enter the loop above so this branch is effectively dead.
result = 0;
}
timed_wait.done = true;
if (!timeout && _setting_time) {
+47 -7
View File
@@ -45,6 +45,8 @@
#include <px4_platform_common/getopt.h>
#include <px4_platform_common/log.h>
#include <px4_platform_common/tasks.h>
#include <px4_platform_common/time.h>
#include <drivers/drv_pwm_output.h> // to get PWM flags
#include <lib/drivers/device/Device.hpp>
@@ -92,12 +94,50 @@ void Sih::run()
}
#if defined(ENABLE_LOCKSTEP_SCHEDULER)
// Get current timestamp in microseconds
static uint64_t micros()
static uint64_t wall_time_us()
{
#if defined(__PX4_WINDOWS)
struct timeval t;
gettimeofday(&t, nullptr);
return t.tv_sec * ((uint64_t)1000000) + t.tv_usec;
return t.tv_sec * static_cast<uint64_t>(1000000) + t.tv_usec;
#else
timespec ts{};
system_clock_gettime(CLOCK_MONOTONIC, &ts);
return (static_cast<uint64_t>(ts.tv_sec) * 1000000ULL) + (static_cast<uint64_t>(ts.tv_nsec) / 1000ULL);
#endif
}
static void lockstep_wall_sleep(int sleep_time_us)
{
if (sleep_time_us <= 0) {
return;
}
// Avoid coarse sub-millisecond sleeps limiting high lockstep speed factors.
if (sleep_time_us <= 1000) {
const uint64_t sleep_until_us = wall_time_us() + sleep_time_us;
#if defined(__PX4_WINDOWS)
while (wall_time_us() < sleep_until_us) {
sched_yield();
}
#else
while (true) {
const uint64_t now_us = wall_time_us();
if (now_us >= sleep_until_us) {
break;
}
if (sleep_until_us - now_us > 100) {
sched_yield();
}
}
#endif
} else {
system_usleep(sleep_time_us);
}
}
void Sih::lockstep_loop()
@@ -126,7 +166,7 @@ void Sih::lockstep_loop()
uint64_t pre_compute_wall_time_us;
while (!should_exit()) {
pre_compute_wall_time_us = micros();
pre_compute_wall_time_us = wall_time_us();
perf_count(_loop_interval_perf);
_current_simulation_time_us += sim_interval_us;
@@ -144,18 +184,18 @@ void Sih::lockstep_loop()
if (_last_actuator_output_time <= 0) {
PX4_DEBUG("SIH starting up - no lockstep yet");
current_wall_time_us = micros();
current_wall_time_us = wall_time_us();
sleep_time = math::max(0, sim_interval_us - (int)(current_wall_time_us - pre_compute_wall_time_us));
} else {
px4_lockstep_wait_for_components();
current_wall_time_us = micros();
current_wall_time_us = wall_time_us();
sleep_time = math::max(0, rt_interval_us - (int)(current_wall_time_us - pre_compute_wall_time_us));
}
_achieved_speedup = 0.99f * _achieved_speedup + 0.01f * ((float)sim_interval_us / (float)(
current_wall_time_us - pre_compute_wall_time_us + sleep_time));
usleep(sleep_time);
lockstep_wall_sleep(sleep_time);
}
}
#endif