mirror of
https://github.com/PX4/PX4-Autopilot.git
synced 2026-05-26 09:26:25 +08:00
logger: update watchdog
- reduce boost priority to PX4_WQ_HP_BASE - 6 - add cli command 'trigger_watchdog' to manually trigger watchdog - add perf counters when triggering watchdog - reduce top measurement to 300ms - restore priorities after 1.5s There are precautions in case the SD card code itself has a busy-loop.
This commit is contained in:
@@ -124,6 +124,13 @@ typedef struct {
|
||||
// wait for the sensor hub if its data is coming from it.
|
||||
#define SCHED_PRIORITY_ESTIMATOR (PX4_WQ_HP_BASE - 5)
|
||||
|
||||
// Logger watchdog priority, triggered when a task busy-loops (and
|
||||
// restored after a short time).
|
||||
// The priority is a trade-off between:
|
||||
// - ability to capture any busy-looping task below this priority
|
||||
// - not having a negative impact on the system itself
|
||||
#define SCHED_PRIORITY_LOG_WATCHDOG (PX4_WQ_HP_BASE - 6)
|
||||
|
||||
// Position controllers typically are in a blocking wait on estimator data
|
||||
// so when new sensor data is available they will run last. Keeping them
|
||||
// on a high priority ensures that they are the first process to be run
|
||||
|
||||
@@ -36,7 +36,6 @@
|
||||
#include "logged_topics.h"
|
||||
#include "logger.h"
|
||||
#include "messages.h"
|
||||
#include "watchdog.h"
|
||||
|
||||
#include <dirent.h>
|
||||
#include <sys/stat.h>
|
||||
@@ -84,19 +83,12 @@ using namespace px4::logger;
|
||||
using namespace time_literals;
|
||||
|
||||
|
||||
struct timer_callback_data_s {
|
||||
px4_sem_t semaphore;
|
||||
|
||||
watchdog_data_t watchdog_data;
|
||||
volatile bool watchdog_triggered = false;
|
||||
};
|
||||
|
||||
/* This is used to schedule work for the logger (periodic scan for updated topics) */
|
||||
static void timer_callback(void *arg)
|
||||
{
|
||||
/* Note: we are in IRQ context here (on NuttX) */
|
||||
|
||||
timer_callback_data_s *data = (timer_callback_data_s *)arg;
|
||||
Logger::timer_callback_data_s *data = (Logger::timer_callback_data_s *)arg;
|
||||
|
||||
int semaphore_value = 0;
|
||||
|
||||
@@ -112,7 +104,7 @@ static void timer_callback(void *arg)
|
||||
bool semaphore_value_saturated = semaphore_value > 100;
|
||||
|
||||
if (watchdog_update(data->watchdog_data, semaphore_value_saturated)) {
|
||||
data->watchdog_triggered = true;
|
||||
data->watchdog_triggered.store(true);
|
||||
}
|
||||
|
||||
if (semaphore_value_saturated) {
|
||||
@@ -151,6 +143,15 @@ int Logger::custom_command(int argc, char *argv[])
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifdef __PX4_NUTTX
|
||||
|
||||
if (!strcmp(argv[0], "trigger_watchdog")) {
|
||||
get_instance()->trigger_watchdog_now();
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
if (!strcmp(argv[0], "on")) {
|
||||
get_instance()->set_arm_override(true);
|
||||
return 0;
|
||||
@@ -650,10 +651,9 @@ void Logger::run()
|
||||
|
||||
/* init the update timer */
|
||||
struct hrt_call timer_call {};
|
||||
timer_callback_data_s timer_callback_data;
|
||||
px4_sem_init(&timer_callback_data.semaphore, 0, 0);
|
||||
px4_sem_init(&_timer_callback_data.semaphore, 0, 0);
|
||||
/* timer_semaphore use case is a signal */
|
||||
px4_sem_setprotocol(&timer_callback_data.semaphore, SEM_PRIO_NONE);
|
||||
px4_sem_setprotocol(&_timer_callback_data.semaphore, SEM_PRIO_NONE);
|
||||
|
||||
int polling_topic_sub = -1;
|
||||
|
||||
@@ -673,10 +673,10 @@ void Logger::run()
|
||||
|
||||
// sched_note_start is already called from pthread_create and task_create,
|
||||
// which means we can expect to find the tasks in system_load.tasks, as required in watchdog_initialize
|
||||
watchdog_initialize(pid_self, writer_thread, timer_callback_data.watchdog_data);
|
||||
watchdog_initialize(pid_self, writer_thread, _timer_callback_data.watchdog_data);
|
||||
}
|
||||
|
||||
hrt_call_every(&timer_call, _log_interval, _log_interval, timer_callback, &timer_callback_data);
|
||||
hrt_call_every(&timer_call, _log_interval, _log_interval, timer_callback, &_timer_callback_data);
|
||||
}
|
||||
|
||||
// check for new subscription data
|
||||
@@ -703,8 +703,8 @@ void Logger::run()
|
||||
/* check for logging command from MAVLink (start/stop streaming) */
|
||||
handle_vehicle_command_update();
|
||||
|
||||
if (timer_callback_data.watchdog_triggered) {
|
||||
timer_callback_data.watchdog_triggered = false;
|
||||
if (_timer_callback_data.watchdog_triggered.load()) {
|
||||
_timer_callback_data.watchdog_triggered.store(false);
|
||||
initialize_load_output(PrintLoadReason::Watchdog);
|
||||
}
|
||||
|
||||
@@ -916,7 +916,7 @@ void Logger::run()
|
||||
* And on linux this is quite accurate as well, but under NuttX it is not accurate,
|
||||
* because usleep() has only a granularity of CONFIG_MSEC_PER_TICK (=1ms).
|
||||
*/
|
||||
while (px4_sem_wait(&timer_callback_data.semaphore) != 0) {}
|
||||
while (px4_sem_wait(&_timer_callback_data.semaphore) != 0) {}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -926,7 +926,7 @@ void Logger::run()
|
||||
stop_log_file(LogType::Mission);
|
||||
|
||||
hrt_cancel(&timer_call);
|
||||
px4_sem_destroy(&timer_callback_data.semaphore);
|
||||
px4_sem_destroy(&_timer_callback_data.semaphore);
|
||||
|
||||
// stop the writer thread
|
||||
_writer.thread_stop();
|
||||
@@ -1420,7 +1420,7 @@ void Logger::start_log_file(LogType type)
|
||||
if (type == LogType::Full) {
|
||||
write_parameters(type);
|
||||
write_parameter_defaults(type);
|
||||
write_perf_data(true);
|
||||
write_perf_data(PrintLoadReason::Preflight);
|
||||
write_console_output();
|
||||
write_events_file(LogType::Full);
|
||||
write_excluded_optional_topics(type);
|
||||
@@ -1448,7 +1448,7 @@ void Logger::stop_log_file(LogType type)
|
||||
|
||||
if (type == LogType::Full) {
|
||||
_writer.set_need_reliable_transfer(true);
|
||||
write_perf_data(false);
|
||||
write_perf_data(PrintLoadReason::Postflight);
|
||||
_writer.set_need_reliable_transfer(false);
|
||||
}
|
||||
|
||||
@@ -1478,7 +1478,7 @@ void Logger::start_log_mavlink()
|
||||
write_formats(LogType::Full);
|
||||
write_parameters(LogType::Full);
|
||||
write_parameter_defaults(LogType::Full);
|
||||
write_perf_data(true);
|
||||
write_perf_data(PrintLoadReason::Preflight);
|
||||
write_console_output();
|
||||
write_events_file(LogType::Full);
|
||||
write_excluded_optional_topics(LogType::Full);
|
||||
@@ -1498,7 +1498,7 @@ void Logger::stop_log_mavlink()
|
||||
if (_writer.is_started(LogType::Full, LogWriter::BackendMavlink)) {
|
||||
_writer.select_write_backend(LogWriter::BackendMavlink);
|
||||
_writer.set_need_reliable_transfer(true);
|
||||
write_perf_data(false);
|
||||
write_perf_data(PrintLoadReason::Postflight);
|
||||
_writer.set_need_reliable_transfer(false);
|
||||
_writer.unselect_write_backend();
|
||||
_writer.notify();
|
||||
@@ -1509,7 +1509,7 @@ void Logger::stop_log_mavlink()
|
||||
struct perf_callback_data_t {
|
||||
Logger *logger;
|
||||
int counter;
|
||||
bool preflight;
|
||||
Logger::PrintLoadReason reason;
|
||||
char *buffer;
|
||||
};
|
||||
|
||||
@@ -1522,23 +1522,31 @@ void Logger::perf_iterate_callback(perf_counter_t handle, void *user)
|
||||
|
||||
perf_print_counter_buffer(buffer, buffer_length, handle);
|
||||
|
||||
if (callback_data->preflight) {
|
||||
switch (callback_data->reason) {
|
||||
case PrintLoadReason::Preflight:
|
||||
default:
|
||||
perf_name = "perf_counter_preflight";
|
||||
break;
|
||||
|
||||
} else {
|
||||
case PrintLoadReason::Postflight:
|
||||
perf_name = "perf_counter_postflight";
|
||||
break;
|
||||
|
||||
case PrintLoadReason::Watchdog:
|
||||
perf_name = "perf_counter_watchdog";
|
||||
break;
|
||||
}
|
||||
|
||||
callback_data->logger->write_info_multiple(LogType::Full, perf_name, buffer, callback_data->counter != 0);
|
||||
++callback_data->counter;
|
||||
}
|
||||
|
||||
void Logger::write_perf_data(bool preflight)
|
||||
void Logger::write_perf_data(PrintLoadReason reason)
|
||||
{
|
||||
perf_callback_data_t callback_data = {};
|
||||
callback_data.logger = this;
|
||||
callback_data.counter = 0;
|
||||
callback_data.preflight = preflight;
|
||||
callback_data.reason = reason;
|
||||
|
||||
// write the perf counters
|
||||
perf_iterate_all(perf_iterate_callback, &callback_data);
|
||||
@@ -1580,7 +1588,14 @@ void Logger::print_load_callback(void *user)
|
||||
void Logger::initialize_load_output(PrintLoadReason reason)
|
||||
{
|
||||
init_print_load(&_load);
|
||||
_next_load_print = hrt_absolute_time() + 1_s;
|
||||
|
||||
if (reason == PrintLoadReason::Watchdog) {
|
||||
_next_load_print = hrt_absolute_time() + 300_ms;
|
||||
|
||||
} else {
|
||||
_next_load_print = hrt_absolute_time() + 1_s;
|
||||
}
|
||||
|
||||
_print_load_reason = reason;
|
||||
}
|
||||
|
||||
@@ -1588,6 +1603,7 @@ void Logger::write_load_output()
|
||||
{
|
||||
if (_print_load_reason == PrintLoadReason::Watchdog) {
|
||||
PX4_ERR("Writing watchdog data"); // this is just that we see it easily in the log
|
||||
write_perf_data(PrintLoadReason::Watchdog);
|
||||
}
|
||||
|
||||
perf_callback_data_t callback_data = {};
|
||||
@@ -2421,6 +2437,9 @@ $ logger on
|
||||
PRINT_MODULE_USAGE_PARAM_FLOAT('c', 1.0, 0.2, 2.0, "Log rate factor (higher is faster)", true);
|
||||
PRINT_MODULE_USAGE_COMMAND_DESCR("on", "start logging now, override arming (logger must be running)");
|
||||
PRINT_MODULE_USAGE_COMMAND_DESCR("off", "stop logging now, override arming (logger must be running)");
|
||||
#ifdef __PX4_NUTTX
|
||||
PRINT_MODULE_USAGE_COMMAND_DESCR("trigger_watchdog", "manually trigger the watchdog now");
|
||||
#endif
|
||||
PRINT_MODULE_USAGE_DEFAULT_COMMANDS();
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -36,6 +36,7 @@
|
||||
#include "log_writer.h"
|
||||
#include "logged_topics.h"
|
||||
#include "messages.h"
|
||||
#include "watchdog.h"
|
||||
#include <containers/Array.hpp>
|
||||
#include "util.h"
|
||||
#include <px4_platform_common/defines.h>
|
||||
@@ -91,6 +92,20 @@ public:
|
||||
arm_until_shutdown,
|
||||
};
|
||||
|
||||
enum class PrintLoadReason {
|
||||
Preflight,
|
||||
Postflight,
|
||||
Watchdog
|
||||
};
|
||||
|
||||
struct timer_callback_data_s {
|
||||
px4_sem_t semaphore;
|
||||
|
||||
watchdog_data_t watchdog_data;
|
||||
px4::atomic_bool watchdog_triggered{false};
|
||||
};
|
||||
|
||||
|
||||
Logger(LogWriter::Backend backend, size_t buffer_size, uint32_t log_interval, const char *poll_topic_name,
|
||||
LogMode log_mode, bool log_name_timestamp, float rate_factor);
|
||||
|
||||
@@ -131,13 +146,14 @@ public:
|
||||
|
||||
void set_arm_override(bool override) { _manually_logging_override = override; }
|
||||
|
||||
private:
|
||||
void trigger_watchdog_now()
|
||||
{
|
||||
#ifdef __PX4_NUTTX
|
||||
_timer_callback_data.watchdog_data.manual_watchdog_trigger = true;
|
||||
#endif
|
||||
}
|
||||
|
||||
enum class PrintLoadReason {
|
||||
Preflight,
|
||||
Postflight,
|
||||
Watchdog
|
||||
};
|
||||
private:
|
||||
|
||||
static constexpr int MAX_MISSION_TOPICS_NUM = 5; /**< Maximum number of mission topics */
|
||||
static constexpr unsigned MAX_NO_LOGFILE = 999; /**< Maximum number of log files */
|
||||
@@ -229,9 +245,8 @@ private:
|
||||
|
||||
/**
|
||||
* write performance counters
|
||||
* @param preflight preflight if true, postflight otherwise
|
||||
*/
|
||||
void write_perf_data(bool preflight);
|
||||
void write_perf_data(PrintLoadReason reason);
|
||||
|
||||
/**
|
||||
* write bootup console output
|
||||
@@ -371,6 +386,8 @@ private:
|
||||
|
||||
uint32_t _message_gaps{0};
|
||||
|
||||
timer_callback_data_s _timer_callback_data{};
|
||||
|
||||
uORB::Subscription _manual_control_setpoint_sub{ORB_ID(manual_control_setpoint)};
|
||||
uORB::Subscription _vehicle_command_sub{ORB_ID(vehicle_command)};
|
||||
uORB::Subscription _vehicle_status_sub{ORB_ID(vehicle_status)};
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
#include "watchdog.h"
|
||||
|
||||
#include <px4_platform_common/log.h>
|
||||
#include <px4_platform_common/tasks.h>
|
||||
|
||||
#if defined(__PX4_NUTTX) && !defined(CONFIG_SCHED_INSTRUMENTATION)
|
||||
# error watchdog support requires CONFIG_SCHED_INSTRUMENTATION
|
||||
@@ -57,72 +58,104 @@ bool watchdog_update(watchdog_data_t &watchdog_data, bool semaphore_value_satura
|
||||
const system_load_taskinfo_s &log_writer_task = system_load.tasks[watchdog_data.logger_writer_task_index];
|
||||
|
||||
if (log_writer_task.valid) {
|
||||
// Trigger the watchdog if the log writer task has been ready to run for a
|
||||
// minimum duration and it has not been scheduled during that time.
|
||||
// When the writer is waiting for an SD transfer, it is not in ready state, thus a long dropout
|
||||
// will not trigger it. The longest period in ready state I measured was around 70ms,
|
||||
// after a param change.
|
||||
// Additionally we need to check the main thread as well, because if the main thread gets stalled as well
|
||||
// while the writer is idle (no active write), it would not trigger.
|
||||
// We do that by checking if the scheduling semaphore counter is saturated for a certain duration.
|
||||
// No need to lock the tcb access, since we are in IRQ context
|
||||
|
||||
// update the timestamp if it has been scheduled recently
|
||||
if (log_writer_task.curr_start_time > watchdog_data.ready_to_run_timestamp) {
|
||||
watchdog_data.ready_to_run_timestamp = log_writer_task.curr_start_time;
|
||||
}
|
||||
// Was it already triggered?
|
||||
if (watchdog_data.trigger_time != 0) {
|
||||
// If so, restore the priority after 1.5s (enough time to flush the buffer and write the perf data)
|
||||
if (now > watchdog_data.trigger_time + 1500_ms) {
|
||||
// Restore priorities to ensure the logger threads cannot adversely affect the system
|
||||
sched_param param{};
|
||||
param.sched_priority = watchdog_data.logger_main_priority;
|
||||
|
||||
// update the timestamp if not ready to run or if transitioned into ready to run
|
||||
uint8_t current_state = log_writer_task.tcb->task_state;
|
||||
if (system_load.tasks[watchdog_data.logger_main_task_index].valid) {
|
||||
sched_setparam(system_load.tasks[watchdog_data.logger_main_task_index].tcb->pid, ¶m);
|
||||
}
|
||||
|
||||
if (current_state != TSTATE_TASK_READYTORUN
|
||||
|| (watchdog_data.last_state != TSTATE_TASK_READYTORUN && current_state == TSTATE_TASK_READYTORUN)) {
|
||||
watchdog_data.ready_to_run_timestamp = now;
|
||||
}
|
||||
param.sched_priority = watchdog_data.log_writer_priority;
|
||||
sched_setparam(log_writer_task.tcb->pid, ¶m);
|
||||
|
||||
watchdog_data.last_state = current_state;
|
||||
// Make sure we won't trigger again
|
||||
watchdog_data.logger_main_task_index = -1;
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
// Trigger the watchdog if the log writer task has been ready to run for a
|
||||
// minimum duration and it has not been scheduled during that time.
|
||||
// When the writer is waiting for an SD transfer, it is not in ready state, thus a long dropout
|
||||
// will not trigger it. The longest period in ready state I measured was around 70ms,
|
||||
// after a param change.
|
||||
// Additionally we need to check the main thread as well, because if the main thread gets stalled as well
|
||||
// while the writer is idle (no active write), it would not trigger.
|
||||
// We do that by checking if the scheduling semaphore counter is saturated for a certain duration.
|
||||
// No need to lock the tcb access, since we are in IRQ context
|
||||
|
||||
// update the timestamp if it has been scheduled recently
|
||||
if (log_writer_task.curr_start_time > watchdog_data.ready_to_run_timestamp) {
|
||||
watchdog_data.ready_to_run_timestamp = log_writer_task.curr_start_time;
|
||||
}
|
||||
|
||||
// update the timestamp if not ready to run or if transitioned into ready to run
|
||||
uint8_t current_state = log_writer_task.tcb->task_state;
|
||||
|
||||
if (current_state != TSTATE_TASK_READYTORUN ||
|
||||
(watchdog_data.last_state != TSTATE_TASK_READYTORUN && current_state == TSTATE_TASK_READYTORUN)) {
|
||||
watchdog_data.ready_to_run_timestamp = now;
|
||||
}
|
||||
|
||||
watchdog_data.last_state = current_state;
|
||||
|
||||
#if 0 // for debugging
|
||||
// test code that prints the maximum time in ready state.
|
||||
// Note: we are in IRQ context, and thus are strictly speaking not allowed to use PX4_ERR -
|
||||
// we do it anyway since it's only used for debugging.
|
||||
static uint64_t max_time = 0;
|
||||
// test code that prints the maximum time in ready state.
|
||||
// Note: we are in IRQ context, and thus are strictly speaking not allowed to use PX4_ERR -
|
||||
// we do it anyway since it's only used for debugging.
|
||||
static uint64_t max_time = 0;
|
||||
|
||||
if (now - watchdog_data.ready_to_run_timestamp > max_time) {
|
||||
max_time = now - watchdog_data.ready_to_run_timestamp;
|
||||
}
|
||||
if (now - watchdog_data.ready_to_run_timestamp > max_time) {
|
||||
max_time = now - watchdog_data.ready_to_run_timestamp;
|
||||
}
|
||||
|
||||
static int counter = 0;
|
||||
static int counter = 0;
|
||||
|
||||
if (++counter > 300) {
|
||||
PX4_ERR("max time in ready: %i ms", (int)max_time / 1000);
|
||||
counter = 0;
|
||||
max_time = 0;
|
||||
}
|
||||
if (++counter > 300) {
|
||||
PX4_ERR("max time in ready: %i ms", (int)max_time / 1000);
|
||||
counter = 0;
|
||||
max_time = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
if (!semaphore_value_saturated) {
|
||||
watchdog_data.sem_counter_saturated_start = now;
|
||||
}
|
||||
|
||||
if (now - watchdog_data.sem_counter_saturated_start > 3_s || now - watchdog_data.ready_to_run_timestamp > 1_s) {
|
||||
// boost the priority to make sure the logger continues to write to the log.
|
||||
// Note that we never restore the priority, to keep the logic simple and because it is
|
||||
// an event that must not occur under normal circumstances (if it does, there's a bug
|
||||
// somewhere)
|
||||
sched_param param{};
|
||||
param.sched_priority = SCHED_PRIORITY_MAX;
|
||||
|
||||
if (system_load.tasks[watchdog_data.logger_main_task_index].valid) {
|
||||
sched_setparam(system_load.tasks[watchdog_data.logger_main_task_index].tcb->pid, ¶m);
|
||||
if (!semaphore_value_saturated) {
|
||||
watchdog_data.sem_counter_saturated_start = now;
|
||||
}
|
||||
|
||||
sched_setparam(log_writer_task.tcb->pid, ¶m);
|
||||
if (watchdog_data.manual_watchdog_trigger
|
||||
|| now > watchdog_data.sem_counter_saturated_start + 3_s
|
||||
|| now > watchdog_data.ready_to_run_timestamp + 1_s) {
|
||||
|
||||
// make sure we won't trigger again
|
||||
watchdog_data.logger_main_task_index = -1;
|
||||
return true;
|
||||
sched_param param{};
|
||||
|
||||
// Get the current priorities
|
||||
if (system_load.tasks[watchdog_data.logger_main_task_index].valid) {
|
||||
sched_getparam(system_load.tasks[watchdog_data.logger_main_task_index].tcb->pid, ¶m);
|
||||
watchdog_data.logger_main_priority = param.sched_priority;
|
||||
}
|
||||
|
||||
sched_getparam(log_writer_task.tcb->pid, ¶m);
|
||||
watchdog_data.log_writer_priority = param.sched_priority;
|
||||
|
||||
// Boost the priority to make sure the logger continues to write to the log.
|
||||
param.sched_priority = SCHED_PRIORITY_LOG_WATCHDOG;
|
||||
|
||||
if (system_load.tasks[watchdog_data.logger_main_task_index].valid) {
|
||||
sched_setparam(system_load.tasks[watchdog_data.logger_main_task_index].tcb->pid, ¶m);
|
||||
}
|
||||
|
||||
sched_setparam(log_writer_task.tcb->pid, ¶m);
|
||||
|
||||
watchdog_data.trigger_time = now;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
@@ -52,6 +52,10 @@ struct watchdog_data_t {
|
||||
hrt_abstime ready_to_run_timestamp = hrt_absolute_time();
|
||||
hrt_abstime sem_counter_saturated_start = hrt_absolute_time();
|
||||
uint8_t last_state = TSTATE_TASK_INVALID;
|
||||
int log_writer_priority = 0;
|
||||
int logger_main_priority = 0;
|
||||
hrt_abstime trigger_time = 0; ///< timestamp when it was triggered
|
||||
bool manual_watchdog_trigger = false;
|
||||
#endif /* __PX4_NUTTX */
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user