logger: update watchdog

- reduce boost priority to PX4_WQ_HP_BASE - 6
- add cli command 'trigger_watchdog' to manually trigger watchdog
- add perf counters when triggering watchdog
- reduce top measurement to 300ms
- restore priorities after 1.5s

The reduced boost priority and the priority restore are precautions in case the SD card code itself has a busy-loop.
This commit is contained in:
Beat Küng
2023-02-17 12:00:08 +01:00
committed by Daniel Agar
parent 015ba62727
commit e4cef9f303
5 changed files with 168 additions and 88 deletions
@@ -124,6 +124,13 @@ typedef struct {
// wait for the sensor hub if its data is coming from it.
#define SCHED_PRIORITY_ESTIMATOR (PX4_WQ_HP_BASE - 5)
// Logger watchdog priority, triggered when a task busy-loops (and
// restored after a short time).
// The priority is a trade-off between:
// - ability to capture any busy-looping task below this priority
// - not having a negative impact on the system itself
#define SCHED_PRIORITY_LOG_WATCHDOG (PX4_WQ_HP_BASE - 6)
// Position controllers typically are in a blocking wait on estimator data
// so when new sensor data is available they will run last. Keeping them
// on a high priority ensures that they are the first process to be run
+48 -29
View File
@@ -36,7 +36,6 @@
#include "logged_topics.h"
#include "logger.h"
#include "messages.h"
#include "watchdog.h"
#include <dirent.h>
#include <sys/stat.h>
@@ -84,19 +83,12 @@ using namespace px4::logger;
using namespace time_literals;
/**
 * Data handed to timer_callback() through its void* argument.
 */
struct timer_callback_data_s {
px4_sem_t semaphore; ///< Logger::run() blocks on this with px4_sem_wait() at the end of each loop iteration
watchdog_data_t watchdog_data; ///< watchdog state, passed to watchdog_update() from the timer callback
volatile bool watchdog_triggered = false; ///< set in the timer callback when watchdog_update() returns true, checked and cleared in Logger::run()
};
/* This is used to schedule work for the logger (periodic scan for updated topics) */
static void timer_callback(void *arg)
{
/* Note: we are in IRQ context here (on NuttX) */
timer_callback_data_s *data = (timer_callback_data_s *)arg;
Logger::timer_callback_data_s *data = (Logger::timer_callback_data_s *)arg;
int semaphore_value = 0;
@@ -112,7 +104,7 @@ static void timer_callback(void *arg)
bool semaphore_value_saturated = semaphore_value > 100;
if (watchdog_update(data->watchdog_data, semaphore_value_saturated)) {
data->watchdog_triggered = true;
data->watchdog_triggered.store(true);
}
if (semaphore_value_saturated) {
@@ -151,6 +143,15 @@ int Logger::custom_command(int argc, char *argv[])
return 1;
}
#ifdef __PX4_NUTTX
if (!strcmp(argv[0], "trigger_watchdog")) {
get_instance()->trigger_watchdog_now();
return 0;
}
#endif
if (!strcmp(argv[0], "on")) {
get_instance()->set_arm_override(true);
return 0;
@@ -650,10 +651,9 @@ void Logger::run()
/* init the update timer */
struct hrt_call timer_call {};
timer_callback_data_s timer_callback_data;
px4_sem_init(&timer_callback_data.semaphore, 0, 0);
px4_sem_init(&_timer_callback_data.semaphore, 0, 0);
/* timer_semaphore use case is a signal */
px4_sem_setprotocol(&timer_callback_data.semaphore, SEM_PRIO_NONE);
px4_sem_setprotocol(&_timer_callback_data.semaphore, SEM_PRIO_NONE);
int polling_topic_sub = -1;
@@ -673,10 +673,10 @@ void Logger::run()
// sched_note_start is already called from pthread_create and task_create,
// which means we can expect to find the tasks in system_load.tasks, as required in watchdog_initialize
watchdog_initialize(pid_self, writer_thread, timer_callback_data.watchdog_data);
watchdog_initialize(pid_self, writer_thread, _timer_callback_data.watchdog_data);
}
hrt_call_every(&timer_call, _log_interval, _log_interval, timer_callback, &timer_callback_data);
hrt_call_every(&timer_call, _log_interval, _log_interval, timer_callback, &_timer_callback_data);
}
// check for new subscription data
@@ -703,8 +703,8 @@ void Logger::run()
/* check for logging command from MAVLink (start/stop streaming) */
handle_vehicle_command_update();
if (timer_callback_data.watchdog_triggered) {
timer_callback_data.watchdog_triggered = false;
if (_timer_callback_data.watchdog_triggered.load()) {
_timer_callback_data.watchdog_triggered.store(false);
initialize_load_output(PrintLoadReason::Watchdog);
}
@@ -916,7 +916,7 @@ void Logger::run()
* And on linux this is quite accurate as well, but under NuttX it is not accurate,
* because usleep() has only a granularity of CONFIG_MSEC_PER_TICK (=1ms).
*/
while (px4_sem_wait(&timer_callback_data.semaphore) != 0) {}
while (px4_sem_wait(&_timer_callback_data.semaphore) != 0) {}
}
}
@@ -926,7 +926,7 @@ void Logger::run()
stop_log_file(LogType::Mission);
hrt_cancel(&timer_call);
px4_sem_destroy(&timer_callback_data.semaphore);
px4_sem_destroy(&_timer_callback_data.semaphore);
// stop the writer thread
_writer.thread_stop();
@@ -1420,7 +1420,7 @@ void Logger::start_log_file(LogType type)
if (type == LogType::Full) {
write_parameters(type);
write_parameter_defaults(type);
write_perf_data(true);
write_perf_data(PrintLoadReason::Preflight);
write_console_output();
write_events_file(LogType::Full);
write_excluded_optional_topics(type);
@@ -1448,7 +1448,7 @@ void Logger::stop_log_file(LogType type)
if (type == LogType::Full) {
_writer.set_need_reliable_transfer(true);
write_perf_data(false);
write_perf_data(PrintLoadReason::Postflight);
_writer.set_need_reliable_transfer(false);
}
@@ -1478,7 +1478,7 @@ void Logger::start_log_mavlink()
write_formats(LogType::Full);
write_parameters(LogType::Full);
write_parameter_defaults(LogType::Full);
write_perf_data(true);
write_perf_data(PrintLoadReason::Preflight);
write_console_output();
write_events_file(LogType::Full);
write_excluded_optional_topics(LogType::Full);
@@ -1498,7 +1498,7 @@ void Logger::stop_log_mavlink()
if (_writer.is_started(LogType::Full, LogWriter::BackendMavlink)) {
_writer.select_write_backend(LogWriter::BackendMavlink);
_writer.set_need_reliable_transfer(true);
write_perf_data(false);
write_perf_data(PrintLoadReason::Postflight);
_writer.set_need_reliable_transfer(false);
_writer.unselect_write_backend();
_writer.notify();
@@ -1509,7 +1509,7 @@ void Logger::stop_log_mavlink()
/**
 * User data that Logger::write_perf_data() passes through perf_iterate_all()
 * to Logger::perf_iterate_callback().
 */
struct perf_callback_data_t {
Logger *logger; ///< logger instance whose write_info_multiple() is called for each perf counter
int counter; ///< incremented after each counter is written; the first entry is written with counter == 0
// NOTE(review): both this flag and `reason` are present; `reason` looks like its replacement - confirm
bool preflight;
Logger::PrintLoadReason reason; ///< selects the info key: perf_counter_preflight, perf_counter_postflight or perf_counter_watchdog
char *buffer; ///< presumably the scratch buffer the counter text is printed into - confirm against the callback
};
@@ -1522,23 +1522,31 @@ void Logger::perf_iterate_callback(perf_counter_t handle, void *user)
perf_print_counter_buffer(buffer, buffer_length, handle);
if (callback_data->preflight) {
switch (callback_data->reason) {
case PrintLoadReason::Preflight:
default:
perf_name = "perf_counter_preflight";
break;
} else {
case PrintLoadReason::Postflight:
perf_name = "perf_counter_postflight";
break;
case PrintLoadReason::Watchdog:
perf_name = "perf_counter_watchdog";
break;
}
callback_data->logger->write_info_multiple(LogType::Full, perf_name, buffer, callback_data->counter != 0);
++callback_data->counter;
}
void Logger::write_perf_data(bool preflight)
void Logger::write_perf_data(PrintLoadReason reason)
{
perf_callback_data_t callback_data = {};
callback_data.logger = this;
callback_data.counter = 0;
callback_data.preflight = preflight;
callback_data.reason = reason;
// write the perf counters
perf_iterate_all(perf_iterate_callback, &callback_data);
@@ -1580,7 +1588,14 @@ void Logger::print_load_callback(void *user)
/**
 * Start a new load measurement and set the time at which its output is due.
 *
 * @param reason why the measurement is taken; it is stored in _print_load_reason.
 *               For PrintLoadReason::Watchdog, _next_load_print is set 300 ms
 *               from now, for every other reason 1 s from now.
 */
void Logger::initialize_load_output(PrintLoadReason reason)
{
init_print_load(&_load);
// NOTE(review): this unconditional assignment is overwritten by the if/else
// right below - it looks like a leftover of the pre-change code, confirm.
_next_load_print = hrt_absolute_time() + 1_s;
if (reason == PrintLoadReason::Watchdog) {
// shorter measurement window when the watchdog triggered
_next_load_print = hrt_absolute_time() + 300_ms;
} else {
_next_load_print = hrt_absolute_time() + 1_s;
}
_print_load_reason = reason;
}
@@ -1588,6 +1603,7 @@ void Logger::write_load_output()
{
if (_print_load_reason == PrintLoadReason::Watchdog) {
PX4_ERR("Writing watchdog data"); // logged as an error only so that it is easy to spot in the log
write_perf_data(PrintLoadReason::Watchdog);
}
perf_callback_data_t callback_data = {};
@@ -2421,6 +2437,9 @@ $ logger on
PRINT_MODULE_USAGE_PARAM_FLOAT('c', 1.0, 0.2, 2.0, "Log rate factor (higher is faster)", true);
PRINT_MODULE_USAGE_COMMAND_DESCR("on", "start logging now, override arming (logger must be running)");
PRINT_MODULE_USAGE_COMMAND_DESCR("off", "stop logging now, override arming (logger must be running)");
#ifdef __PX4_NUTTX
PRINT_MODULE_USAGE_COMMAND_DESCR("trigger_watchdog", "manually trigger the watchdog now");
#endif
PRINT_MODULE_USAGE_DEFAULT_COMMANDS();
return 0;
+25 -8
View File
@@ -36,6 +36,7 @@
#include "log_writer.h"
#include "logged_topics.h"
#include "messages.h"
#include "watchdog.h"
#include <containers/Array.hpp>
#include "util.h"
#include <px4_platform_common/defines.h>
@@ -91,6 +92,20 @@ public:
arm_until_shutdown,
};
/**
 * Reason for writing perf counter / load data to the log.
 */
enum class PrintLoadReason {
Preflight, ///< written when a log is started (info key perf_counter_preflight)
Postflight, ///< written when a log is stopped (info key perf_counter_postflight)
Watchdog ///< written after the watchdog triggered (info key perf_counter_watchdog)
};
/**
 * Data handed to the hrt timer callback through its void* argument
 * (see hrt_call_every() in Logger::run()).
 */
struct timer_callback_data_s {
px4_sem_t semaphore; ///< Logger::run() blocks on this with px4_sem_wait() at the end of each loop iteration
watchdog_data_t watchdog_data; ///< watchdog state, passed to watchdog_update() from the timer callback
px4::atomic_bool watchdog_triggered{false}; ///< stored true by the timer callback when watchdog_update() returns true, loaded and reset in Logger::run()
};
Logger(LogWriter::Backend backend, size_t buffer_size, uint32_t log_interval, const char *poll_topic_name,
LogMode log_mode, bool log_name_timestamp, float rate_factor);
@@ -131,13 +146,14 @@ public:
void set_arm_override(bool override) { _manually_logging_override = override; }
private:
/**
 * Request a manual watchdog trigger (used by the 'trigger_watchdog' CLI command).
 * Only sets the manual_watchdog_trigger flag in the watchdog data; the flag is
 * evaluated by watchdog_update(). Does nothing when not built for NuttX.
 */
void trigger_watchdog_now()
{
#ifdef __PX4_NUTTX
_timer_callback_data.watchdog_data.manual_watchdog_trigger = true;
#endif
}
// Reason for writing perf counter / load data to the log.
// NOTE(review): an identical enum is declared earlier in this class - this
// looks like the pre-change location of the definition, confirm.
enum class PrintLoadReason {
Preflight,
Postflight,
Watchdog
};
private:
static constexpr int MAX_MISSION_TOPICS_NUM = 5; /**< Maximum number of mission topics */
static constexpr unsigned MAX_NO_LOGFILE = 999; /**< Maximum number of log files */
@@ -229,9 +245,8 @@ private:
/**
* write performance counters
* @param preflight preflight if true, postflight otherwise
*/
void write_perf_data(bool preflight);
void write_perf_data(PrintLoadReason reason);
/**
* write bootup console output
@@ -371,6 +386,8 @@ private:
uint32_t _message_gaps{0};
timer_callback_data_s _timer_callback_data{};
uORB::Subscription _manual_control_setpoint_sub{ORB_ID(manual_control_setpoint)};
uORB::Subscription _vehicle_command_sub{ORB_ID(vehicle_command)};
uORB::Subscription _vehicle_status_sub{ORB_ID(vehicle_status)};
+84 -51
View File
@@ -34,6 +34,7 @@
#include "watchdog.h"
#include <px4_platform_common/log.h>
#include <px4_platform_common/tasks.h>
#if defined(__PX4_NUTTX) && !defined(CONFIG_SCHED_INSTRUMENTATION)
# error watchdog support requires CONFIG_SCHED_INSTRUMENTATION
@@ -57,72 +58,104 @@ bool watchdog_update(watchdog_data_t &watchdog_data, bool semaphore_value_satura
const system_load_taskinfo_s &log_writer_task = system_load.tasks[watchdog_data.logger_writer_task_index];
if (log_writer_task.valid) {
// Trigger the watchdog if the log writer task has been ready to run for a
// minimum duration and it has not been scheduled during that time.
// When the writer is waiting for an SD transfer, it is not in ready state, thus a long dropout
// will not trigger it. The longest period in ready state I measured was around 70ms,
// after a param change.
// Additionally we need to check the main thread as well, because if the main thread gets stalled as well
// while the writer is idle (no active write), it would not trigger.
// We do that by checking if the scheduling semaphore counter is saturated for a certain duration.
// No need to lock the tcb access, since we are in IRQ context
// update the timestamp if it has been scheduled recently
if (log_writer_task.curr_start_time > watchdog_data.ready_to_run_timestamp) {
watchdog_data.ready_to_run_timestamp = log_writer_task.curr_start_time;
}
// Was it already triggered?
if (watchdog_data.trigger_time != 0) {
// If so, restore the priority after 1.5s (enough time to flush the buffer and write the perf data)
if (now > watchdog_data.trigger_time + 1500_ms) {
// Restore priorities to ensure the logger threads cannot adversely affect the system
sched_param param{};
param.sched_priority = watchdog_data.logger_main_priority;
// update the timestamp if not ready to run or if transitioned into ready to run
uint8_t current_state = log_writer_task.tcb->task_state;
if (system_load.tasks[watchdog_data.logger_main_task_index].valid) {
sched_setparam(system_load.tasks[watchdog_data.logger_main_task_index].tcb->pid, &param);
}
if (current_state != TSTATE_TASK_READYTORUN
|| (watchdog_data.last_state != TSTATE_TASK_READYTORUN && current_state == TSTATE_TASK_READYTORUN)) {
watchdog_data.ready_to_run_timestamp = now;
}
param.sched_priority = watchdog_data.log_writer_priority;
sched_setparam(log_writer_task.tcb->pid, &param);
watchdog_data.last_state = current_state;
// Make sure we won't trigger again
watchdog_data.logger_main_task_index = -1;
}
} else {
// Trigger the watchdog if the log writer task has been ready to run for a
// minimum duration and it has not been scheduled during that time.
// When the writer is waiting for an SD transfer, it is not in ready state, thus a long dropout
// will not trigger it. The longest period in ready state I measured was around 70ms,
// after a param change.
// Additionally we need to check the main thread as well, because if the main thread gets stalled as well
// while the writer is idle (no active write), it would not trigger.
// We do that by checking if the scheduling semaphore counter is saturated for a certain duration.
// No need to lock the tcb access, since we are in IRQ context
// update the timestamp if it has been scheduled recently
if (log_writer_task.curr_start_time > watchdog_data.ready_to_run_timestamp) {
watchdog_data.ready_to_run_timestamp = log_writer_task.curr_start_time;
}
// update the timestamp if not ready to run or if transitioned into ready to run
uint8_t current_state = log_writer_task.tcb->task_state;
if (current_state != TSTATE_TASK_READYTORUN ||
(watchdog_data.last_state != TSTATE_TASK_READYTORUN && current_state == TSTATE_TASK_READYTORUN)) {
watchdog_data.ready_to_run_timestamp = now;
}
watchdog_data.last_state = current_state;
#if 0 // for debugging
// test code that prints the maximum time in ready state.
// Note: we are in IRQ context, and thus are strictly speaking not allowed to use PX4_ERR -
// we do it anyway since it's only used for debugging.
static uint64_t max_time = 0;
// test code that prints the maximum time in ready state.
// Note: we are in IRQ context, and thus are strictly speaking not allowed to use PX4_ERR -
// we do it anyway since it's only used for debugging.
static uint64_t max_time = 0;
if (now - watchdog_data.ready_to_run_timestamp > max_time) {
max_time = now - watchdog_data.ready_to_run_timestamp;
}
if (now - watchdog_data.ready_to_run_timestamp > max_time) {
max_time = now - watchdog_data.ready_to_run_timestamp;
}
static int counter = 0;
static int counter = 0;
if (++counter > 300) {
PX4_ERR("max time in ready: %i ms", (int)max_time / 1000);
counter = 0;
max_time = 0;
}
if (++counter > 300) {
PX4_ERR("max time in ready: %i ms", (int)max_time / 1000);
counter = 0;
max_time = 0;
}
#endif
if (!semaphore_value_saturated) {
watchdog_data.sem_counter_saturated_start = now;
}
if (now - watchdog_data.sem_counter_saturated_start > 3_s || now - watchdog_data.ready_to_run_timestamp > 1_s) {
// boost the priority to make sure the logger continues to write to the log.
// Note that we never restore the priority, to keep the logic simple and because it is
// an event that must not occur under normal circumstances (if it does, there's a bug
// somewhere)
sched_param param{};
param.sched_priority = SCHED_PRIORITY_MAX;
if (system_load.tasks[watchdog_data.logger_main_task_index].valid) {
sched_setparam(system_load.tasks[watchdog_data.logger_main_task_index].tcb->pid, &param);
if (!semaphore_value_saturated) {
watchdog_data.sem_counter_saturated_start = now;
}
sched_setparam(log_writer_task.tcb->pid, &param);
if (watchdog_data.manual_watchdog_trigger
|| now > watchdog_data.sem_counter_saturated_start + 3_s
|| now > watchdog_data.ready_to_run_timestamp + 1_s) {
// make sure we won't trigger again
watchdog_data.logger_main_task_index = -1;
return true;
sched_param param{};
// Get the current priorities
if (system_load.tasks[watchdog_data.logger_main_task_index].valid) {
sched_getparam(system_load.tasks[watchdog_data.logger_main_task_index].tcb->pid, &param);
watchdog_data.logger_main_priority = param.sched_priority;
}
sched_getparam(log_writer_task.tcb->pid, &param);
watchdog_data.log_writer_priority = param.sched_priority;
// Boost the priority to make sure the logger continues to write to the log.
param.sched_priority = SCHED_PRIORITY_LOG_WATCHDOG;
if (system_load.tasks[watchdog_data.logger_main_task_index].valid) {
sched_setparam(system_load.tasks[watchdog_data.logger_main_task_index].tcb->pid, &param);
}
sched_setparam(log_writer_task.tcb->pid, &param);
watchdog_data.trigger_time = now;
return true;
}
}
} else {
+4
View File
@@ -52,6 +52,10 @@ struct watchdog_data_t {
hrt_abstime ready_to_run_timestamp = hrt_absolute_time();
hrt_abstime sem_counter_saturated_start = hrt_absolute_time();
uint8_t last_state = TSTATE_TASK_INVALID;
int log_writer_priority = 0;
int logger_main_priority = 0;
hrt_abstime trigger_time = 0; ///< timestamp when it was triggered
bool manual_watchdog_trigger = false;
#endif /* __PX4_NUTTX */
};