mirror of
https://github.com/PX4/PX4-Autopilot.git
synced 2026-05-20 20:03:54 +08:00
fix(lockstep): bound scheduler waits at high speed
Use wall-time watchdogs for lockstep component barriers and condition waits so one delayed participant cannot stall the simulator forever. The HRT work queue gains an explicit semaphore wakeup path on Windows, work queues mark lockstep progress while active, and SIH uses wall-time sleeps/yields so speed factors above realtime are not limited by coarse sleeps. Signed-off-by: Nuno Marques <n.marques21@hotmail.com>
This commit is contained in:
@@ -192,6 +192,10 @@ void WorkQueue::Run()
|
||||
|
||||
#if defined(ENABLE_LOCKSTEP_SCHEDULER)
|
||||
|
||||
if (_lockstep_component != -1) {
|
||||
px4_lockstep_progress(_lockstep_component);
|
||||
}
|
||||
|
||||
if (_q.empty()) {
|
||||
px4_lockstep_unregister_component(_lockstep_component);
|
||||
_lockstep_component = -1;
|
||||
|
||||
@@ -106,6 +106,7 @@
|
||||
int hrt_work_queue(struct work_s *work, worker_t worker, void *arg, uint32_t delay)
|
||||
{
|
||||
struct wqueue_s *wqueue = &g_hrt_work;
|
||||
const int wake_worker = (px4_getpid() != wqueue->pid);
|
||||
|
||||
/* First, initialize the work structure */
|
||||
|
||||
@@ -124,9 +125,17 @@ int hrt_work_queue(struct work_s *work, worker_t worker, void *arg, uint32_t del
|
||||
|
||||
dq_addlast(&work->dq, &wqueue->q);
|
||||
|
||||
if (px4_getpid() != wqueue->pid) { /* only need to wake up if called from a different thread */
|
||||
if (wake_worker) { /* only need to wake up if called from a different thread */
|
||||
//wqueue->pid == own task? -> don't signal
|
||||
#if defined(__PX4_WINDOWS)
|
||||
int sem_val;
|
||||
|
||||
if (px4_sem_getvalue(&_hrt_work_signal, &sem_val) == 0 && sem_val <= 0) {
|
||||
px4_sem_post(&_hrt_work_signal);
|
||||
}
|
||||
#else
|
||||
px4_task_kill(wqueue->pid, SIGCONT); /* Wake up the worker thread */
|
||||
#endif
|
||||
}
|
||||
|
||||
hrt_work_unlock();
|
||||
|
||||
@@ -73,6 +73,9 @@ struct wqueue_s g_hrt_work;
|
||||
* Private Variables
|
||||
****************************************************************************/
|
||||
px4_sem_t _hrt_work_lock;
|
||||
#if defined(__PX4_WINDOWS)
|
||||
px4_sem_t _hrt_work_signal;
|
||||
#endif
|
||||
|
||||
/****************************************************************************
|
||||
* Private Functions
|
||||
@@ -223,13 +226,17 @@ static void hrt_work_process(void)
|
||||
}
|
||||
|
||||
/* Wait awhile to check the work list. We will wait here until either
|
||||
* the time elapses or until we are awakened by a signal.
|
||||
* the time elapses or until a newly queued item wakes us.
|
||||
*/
|
||||
hrt_work_unlock();
|
||||
|
||||
/* might sleep less if a signal received and new item was queued */
|
||||
//PX4_INFO("Sleeping for %u usec", next);
|
||||
#if defined(__PX4_WINDOWS)
|
||||
struct timespec ts;
|
||||
abstime_to_ts(&ts, hrt_absolute_time() + next);
|
||||
px4_sem_timedwait(&_hrt_work_signal, &ts);
|
||||
#else
|
||||
px4_usleep(next);
|
||||
#endif
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
@@ -286,6 +293,10 @@ static int work_hrtthread(int argc, char *argv[])
|
||||
void hrt_work_queue_init(void)
|
||||
{
|
||||
px4_sem_init(&_hrt_work_lock, 0, 1);
|
||||
#if defined(__PX4_WINDOWS)
|
||||
px4_sem_init(&_hrt_work_signal, 0, 0);
|
||||
px4_sem_setprotocol(&_hrt_work_signal, SEM_PRIO_NONE);
|
||||
#endif
|
||||
memset(&g_hrt_work, 0, sizeof(g_hrt_work));
|
||||
|
||||
// Create high priority worker thread
|
||||
|
||||
@@ -41,6 +41,9 @@
|
||||
__BEGIN_DECLS
|
||||
|
||||
extern px4_sem_t _hrt_work_lock;
|
||||
#if defined(__PX4_WINDOWS)
|
||||
extern px4_sem_t _hrt_work_signal;
|
||||
#endif
|
||||
extern struct wqueue_s g_hrt_work;
|
||||
|
||||
void hrt_work_queue_init(void);
|
||||
|
||||
@@ -40,7 +40,36 @@
|
||||
#include <drivers/drv_hrt.h>
|
||||
#include <px4_platform_common/log.h>
|
||||
#include <px4_platform_common/tasks.h>
|
||||
#include <px4_platform_common/time.h>
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
static uint64_t lockstep_wall_time_us()
|
||||
{
|
||||
timespec ts{};
|
||||
system_clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (static_cast<uint64_t>(ts.tv_sec) * 1000000ULL) + (static_cast<uint64_t>(ts.tv_nsec) / 1000ULL);
|
||||
}
|
||||
|
||||
static uint64_t lockstep_component_wait_budget_us()
|
||||
{
|
||||
static const uint64_t wait_budget_us = []() -> uint64_t {
|
||||
const char *speed_factor_env = getenv("PX4_SIM_SPEED_FACTOR");
|
||||
|
||||
if (speed_factor_env != nullptr) {
|
||||
const double speed_factor = strtod(speed_factor_env, nullptr);
|
||||
|
||||
if (speed_factor > 0.) {
|
||||
const uint64_t budget = static_cast<uint64_t>(1000. / speed_factor);
|
||||
return budget < 10ULL ? 10ULL : (budget > 100ULL ? 100ULL : budget);
|
||||
}
|
||||
}
|
||||
|
||||
return 100ULL;
|
||||
}();
|
||||
|
||||
return wait_budget_us;
|
||||
}
|
||||
|
||||
LockstepComponents::LockstepComponents(bool no_cleanup_on_destroy)
|
||||
: _no_cleanup_on_destroy(no_cleanup_on_destroy)
|
||||
@@ -128,5 +157,28 @@ void LockstepComponents::wait_for_components()
|
||||
return;
|
||||
}
|
||||
|
||||
#if defined(__PX4_WINDOWS) || defined(__PX4_LINUX)
|
||||
// Do not let one temporarily blocked component stall the simulator forever.
|
||||
// Keep the watchdog short: high lockstep speed factors need sub-millisecond
|
||||
// barrier overhead, and active components normally release the latch first.
|
||||
const uint64_t wait_start_us = lockstep_wall_time_us();
|
||||
const uint64_t max_wait_us = lockstep_component_wait_budget_us();
|
||||
unsigned spin_count = 0;
|
||||
|
||||
while (px4_sem_trywait(&_components_sem) != 0) {
|
||||
if (_components_used_bitset == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if ((lockstep_wall_time_us() - wait_start_us) >= max_wait_us) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (++spin_count % 16 == 0) {
|
||||
sched_yield();
|
||||
}
|
||||
}
|
||||
#else
|
||||
while (px4_sem_wait(&_components_sem) != 0) {}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -124,7 +124,49 @@ int LockstepScheduler::cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *loc
|
||||
}
|
||||
}
|
||||
|
||||
int result = pthread_cond_wait(cond, lock);
|
||||
// On most pthread implementations pthread_cond_wait here would
|
||||
// suffice — set_absolute_time broadcasts when our target is reached.
|
||||
// winpthreads (MinGW), however, occasionally drops the broadcast
|
||||
// when it races the waiter entering the wait, which leaves the
|
||||
// thread blocked forever. Use pthread_cond_timedwait with a long
|
||||
// real-time timeout (500 ms) as a watchdog: under a correct
|
||||
// implementation the timeout never fires; under winpthreads it
|
||||
// caps any lost-broadcast hang at 500 ms instead of forever.
|
||||
int result = 0;
|
||||
|
||||
while (!timed_wait.timeout) {
|
||||
struct timespec poll_deadline;
|
||||
clock_gettime(CLOCK_REALTIME, &poll_deadline);
|
||||
long total_ns = poll_deadline.tv_nsec + 500L * 1000L * 1000L;
|
||||
poll_deadline.tv_sec += total_ns / 1000000000L;
|
||||
poll_deadline.tv_nsec = total_ns % 1000000000L;
|
||||
|
||||
result = pthread_cond_timedwait(cond, lock, &poll_deadline);
|
||||
|
||||
if (timed_wait.timeout) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (result == ETIMEDOUT) {
|
||||
// Watchdog fired; re-check _time_us in case we lost the
|
||||
// broadcast. _timed_waits_mutex is taken here only to
|
||||
// synchronise with set_absolute_time.
|
||||
_timed_waits_mutex.lock();
|
||||
|
||||
if (time_us <= _time_us) {
|
||||
timed_wait.timeout = true;
|
||||
}
|
||||
|
||||
_timed_waits_mutex.unlock();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (result != 0) {
|
||||
break;
|
||||
}
|
||||
// Spurious wakeup or real signal not reflected in `timeout`
|
||||
// (shouldn't happen normally). Loop to re-check.
|
||||
}
|
||||
|
||||
const bool timeout = timed_wait.timeout;
|
||||
|
||||
@@ -132,6 +174,13 @@ int LockstepScheduler::cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *loc
|
||||
result = ETIMEDOUT;
|
||||
}
|
||||
|
||||
if (result == ETIMEDOUT && !timeout) {
|
||||
// pthread_cond_timedwait returned ETIMEDOUT but broadcast also
|
||||
// flipped `timeout` true between our check and this moment — we
|
||||
// re-enter the loop above so this branch is effectively dead.
|
||||
result = 0;
|
||||
}
|
||||
|
||||
timed_wait.done = true;
|
||||
|
||||
if (!timeout && _setting_time) {
|
||||
|
||||
@@ -45,6 +45,8 @@
|
||||
|
||||
#include <px4_platform_common/getopt.h>
|
||||
#include <px4_platform_common/log.h>
|
||||
#include <px4_platform_common/tasks.h>
|
||||
#include <px4_platform_common/time.h>
|
||||
|
||||
#include <drivers/drv_pwm_output.h> // to get PWM flags
|
||||
#include <lib/drivers/device/Device.hpp>
|
||||
@@ -92,12 +94,50 @@ void Sih::run()
|
||||
}
|
||||
|
||||
#if defined(ENABLE_LOCKSTEP_SCHEDULER)
|
||||
// Get current timestamp in microseconds
|
||||
static uint64_t micros()
|
||||
static uint64_t wall_time_us()
|
||||
{
|
||||
#if defined(__PX4_WINDOWS)
|
||||
struct timeval t;
|
||||
gettimeofday(&t, nullptr);
|
||||
return t.tv_sec * ((uint64_t)1000000) + t.tv_usec;
|
||||
return t.tv_sec * static_cast<uint64_t>(1000000) + t.tv_usec;
|
||||
#else
|
||||
timespec ts{};
|
||||
system_clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (static_cast<uint64_t>(ts.tv_sec) * 1000000ULL) + (static_cast<uint64_t>(ts.tv_nsec) / 1000ULL);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void lockstep_wall_sleep(int sleep_time_us)
|
||||
{
|
||||
if (sleep_time_us <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Avoid coarse sub-millisecond sleeps limiting high lockstep speed factors.
|
||||
if (sleep_time_us <= 1000) {
|
||||
const uint64_t sleep_until_us = wall_time_us() + sleep_time_us;
|
||||
|
||||
#if defined(__PX4_WINDOWS)
|
||||
while (wall_time_us() < sleep_until_us) {
|
||||
sched_yield();
|
||||
}
|
||||
#else
|
||||
while (true) {
|
||||
const uint64_t now_us = wall_time_us();
|
||||
|
||||
if (now_us >= sleep_until_us) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (sleep_until_us - now_us > 100) {
|
||||
sched_yield();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
} else {
|
||||
system_usleep(sleep_time_us);
|
||||
}
|
||||
}
|
||||
|
||||
void Sih::lockstep_loop()
|
||||
@@ -126,7 +166,7 @@ void Sih::lockstep_loop()
|
||||
uint64_t pre_compute_wall_time_us;
|
||||
|
||||
while (!should_exit()) {
|
||||
pre_compute_wall_time_us = micros();
|
||||
pre_compute_wall_time_us = wall_time_us();
|
||||
perf_count(_loop_interval_perf);
|
||||
|
||||
_current_simulation_time_us += sim_interval_us;
|
||||
@@ -144,18 +184,18 @@ void Sih::lockstep_loop()
|
||||
|
||||
if (_last_actuator_output_time <= 0) {
|
||||
PX4_DEBUG("SIH starting up - no lockstep yet");
|
||||
current_wall_time_us = micros();
|
||||
current_wall_time_us = wall_time_us();
|
||||
sleep_time = math::max(0, sim_interval_us - (int)(current_wall_time_us - pre_compute_wall_time_us));
|
||||
|
||||
} else {
|
||||
px4_lockstep_wait_for_components();
|
||||
current_wall_time_us = micros();
|
||||
current_wall_time_us = wall_time_us();
|
||||
sleep_time = math::max(0, rt_interval_us - (int)(current_wall_time_us - pre_compute_wall_time_us));
|
||||
}
|
||||
|
||||
_achieved_speedup = 0.99f * _achieved_speedup + 0.01f * ((float)sim_interval_us / (float)(
|
||||
current_wall_time_us - pre_compute_wall_time_us + sleep_time));
|
||||
usleep(sleep_time);
|
||||
lockstep_wall_sleep(sleep_time);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user