mirror of
https://github.com/PX4/PX4-Autopilot.git
synced 2026-05-28 19:32:36 +08:00
logger: add watchdog that boosts its priority when the writer is blocked for >1s
This is to guard against busy-looping tasks, so that we at least have a log. In case the watchdog is triggered, it also logs CPU load of all tasks.
This commit is contained in:
@@ -43,6 +43,7 @@ px4_add_module(
|
||||
log_writer.cpp
|
||||
log_writer_file.cpp
|
||||
log_writer_mavlink.cpp
|
||||
watchdog.cpp
|
||||
DEPENDS
|
||||
version
|
||||
)
|
||||
|
||||
@@ -137,6 +137,13 @@ public:
|
||||
return 0;
|
||||
}
|
||||
|
||||
pthread_t thread_id_file() const
|
||||
{
|
||||
if (_log_writer_file) { return _log_writer_file->thread_id(); }
|
||||
|
||||
return (pthread_t)0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Indicate to the underlying backend whether future write_message() calls need a reliable
|
||||
|
||||
@@ -113,6 +113,8 @@ public:
|
||||
return _need_reliable_transfer;
|
||||
}
|
||||
|
||||
pthread_t thread_id() const { return _thread; }
|
||||
|
||||
private:
|
||||
static void *run_helper(void *);
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
#include <px4_config.h>
|
||||
#include "logger.h"
|
||||
#include "messages.h"
|
||||
#include "watchdog.h"
|
||||
|
||||
#include <dirent.h>
|
||||
#include <sys/stat.h>
|
||||
@@ -92,6 +93,8 @@ using namespace px4::logger;
|
||||
struct timer_callback_data_s {
|
||||
px4_sem_t semaphore;
|
||||
|
||||
watchdog_data_t watchdog_data;
|
||||
volatile bool watchdog_triggered = false;
|
||||
};
|
||||
|
||||
/* This is used to schedule work for the logger (periodic scan for updated topics) */
|
||||
@@ -101,6 +104,9 @@ static void timer_callback(void *arg)
|
||||
|
||||
timer_callback_data_s *data = (timer_callback_data_s *)arg;
|
||||
|
||||
if (watchdog_update(data->watchdog_data)) {
|
||||
data->watchdog_triggered = true;
|
||||
}
|
||||
|
||||
/* check the value of the semaphore: if the logger cannot keep up with running it's main loop as fast
|
||||
* as the timer_callback here increases the semaphore count, the counter would increase unbounded,
|
||||
@@ -951,6 +957,17 @@ void Logger::run()
|
||||
|
||||
} else {
|
||||
|
||||
if (_writer.backend() & LogWriter::BackendFile) {
|
||||
|
||||
const pid_t pid_self = getpid();
|
||||
// The pthread_t ID is equal to the PID on NuttX
|
||||
const pid_t pid_writer = _writer.thread_id_file();
|
||||
|
||||
// sched_note_start is already called from pthread_create and task_create,
|
||||
// which means we can expect to find the tasks in system_load.tasks, as required in watchdog_initialize
|
||||
watchdog_initialize(pid_self, pid_writer, timer_callback_data.watchdog_data);
|
||||
}
|
||||
|
||||
hrt_call_every(&timer_call, _log_interval, _log_interval, timer_callback, &timer_callback_data);
|
||||
}
|
||||
|
||||
@@ -1027,6 +1044,11 @@ void Logger::run()
|
||||
}
|
||||
|
||||
|
||||
if (timer_callback_data.watchdog_triggered) {
|
||||
timer_callback_data.watchdog_triggered = false;
|
||||
initialize_load_output(PrintLoadReason::Watchdog);
|
||||
}
|
||||
|
||||
|
||||
const hrt_abstime loop_time = hrt_absolute_time();
|
||||
|
||||
@@ -1624,6 +1646,9 @@ void Logger::initialize_load_output(PrintLoadReason reason)
|
||||
|
||||
void Logger::write_load_output()
|
||||
{
|
||||
if (_print_load_reason == PrintLoadReason::Watchdog) {
|
||||
PX4_ERR("Writing watchdog data"); // this is just that we see it easily in the log
|
||||
}
|
||||
perf_callback_data_t callback_data = {};
|
||||
char buffer[140];
|
||||
callback_data.logger = this;
|
||||
|
||||
@@ -0,0 +1,168 @@
|
||||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2016 PX4 Development Team. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* 3. Neither the name PX4 nor the names of its contributors may be
|
||||
* used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
||||
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
****************************************************************************/
|
||||
|
||||
#include "watchdog.h"
|
||||
|
||||
#include <px4_log.h>
|
||||
|
||||
using namespace time_literals;
|
||||
|
||||
namespace px4
|
||||
{
|
||||
namespace logger
|
||||
{
|
||||
|
||||
bool watchdog_update(watchdog_data_t &watchdog_data)
|
||||
{
|
||||
|
||||
#ifdef __PX4_NUTTX
|
||||
|
||||
if (system_load.initialized && watchdog_data.logger_main_task_index >= 0
|
||||
&& watchdog_data.logger_writer_task_index >= 0) {
|
||||
const hrt_abstime now = hrt_absolute_time();
|
||||
const system_load_taskinfo_s &log_writer_task = system_load.tasks[watchdog_data.logger_writer_task_index];
|
||||
|
||||
if (log_writer_task.valid) {
|
||||
// Trigger the watchdog if the log writer task has been ready to run for a
|
||||
// minimum duration and it has not been scheduled during that time.
|
||||
// When the writer is waiting for an SD transfer, it is not in ready state, thus a long dropout
|
||||
// will not trigger it. The longest period in ready state I measured was around 70ms,
|
||||
// after a param change.
|
||||
// We only check the log writer because it runs at lower priority than the main thread.
|
||||
// No need to lock the tcb access, since we are in IRQ context
|
||||
|
||||
// update the timestamp if it has been scheduled recently
|
||||
if (log_writer_task.curr_start_time > watchdog_data.ready_to_run_timestamp) {
|
||||
watchdog_data.ready_to_run_timestamp = log_writer_task.curr_start_time;
|
||||
}
|
||||
|
||||
// update the timestamp if not ready to run or if transitioned into ready to run
|
||||
uint8_t current_state = log_writer_task.tcb->task_state;
|
||||
|
||||
if (current_state != TSTATE_TASK_READYTORUN
|
||||
|| (watchdog_data.last_state != TSTATE_TASK_READYTORUN && current_state == TSTATE_TASK_READYTORUN)) {
|
||||
watchdog_data.ready_to_run_timestamp = now;
|
||||
}
|
||||
|
||||
watchdog_data.last_state = current_state;
|
||||
|
||||
#if 0 // for debugging
|
||||
// test code that prints the maximum time in ready state
|
||||
static uint64_t max_time = 0;
|
||||
|
||||
if (now - watchdog_data.ready_to_run_timestamp > max_time) {
|
||||
max_time = now - watchdog_data.ready_to_run_timestamp;
|
||||
}
|
||||
|
||||
static int counter = 0;
|
||||
|
||||
if (++counter > 300) {
|
||||
PX4_ERR("max time in ready: %i ms", (int)max_time / 1000);
|
||||
counter = 0;
|
||||
max_time = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
if (now - watchdog_data.ready_to_run_timestamp > 1_s) {
|
||||
PX4_ERR("watchdog triggered!"); // this will most likely not be logged due to dropouts
|
||||
|
||||
// boost the priority to make sure the logger continues to write to the log
|
||||
sched_param param{};
|
||||
param.sched_priority = SCHED_PRIORITY_MAX;
|
||||
int ret;
|
||||
|
||||
if (system_load.tasks[watchdog_data.logger_main_task_index].valid) {
|
||||
ret = sched_setparam(system_load.tasks[watchdog_data.logger_main_task_index].tcb->pid, ¶m);
|
||||
|
||||
if (ret < 0) {
|
||||
PX4_ERR("sched_reprioritize failed (%i)", ret);
|
||||
}
|
||||
}
|
||||
|
||||
ret = sched_setparam(log_writer_task.tcb->pid, ¶m);
|
||||
|
||||
if (ret < 0) {
|
||||
PX4_ERR("sched_reprioritize failed (%i)", ret);
|
||||
}
|
||||
|
||||
// make sure we won't trigger again
|
||||
watchdog_data.logger_main_task_index = -1;
|
||||
return true;
|
||||
}
|
||||
|
||||
} else {
|
||||
// should never happen
|
||||
watchdog_data.logger_main_task_index = -1;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* __PX4_NUTTX */
|
||||
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
void watchdog_initialize(const pid_t pid_logger_main, const pid_t pid_logger_writer, watchdog_data_t &watchdog_data)
|
||||
{
|
||||
#ifdef __PX4_NUTTX
|
||||
|
||||
sched_lock(); // need to lock the tcb access
|
||||
|
||||
for (int i = 0; i < CONFIG_MAX_TASKS; i++) {
|
||||
if (system_load.tasks[i].valid) {
|
||||
if (system_load.tasks[i].tcb->pid == pid_logger_writer) {
|
||||
watchdog_data.logger_writer_task_index = i;
|
||||
}
|
||||
|
||||
if (system_load.tasks[i].tcb->pid == pid_logger_main) {
|
||||
watchdog_data.logger_main_task_index = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sched_unlock();
|
||||
|
||||
if (watchdog_data.logger_writer_task_index == -1 ||
|
||||
watchdog_data.logger_main_task_index == -1) {
|
||||
// If we land here it means the NuttX implementation changed
|
||||
// and one of our assumptions is not valid anymore
|
||||
PX4_ERR("watchdog init failed");
|
||||
}
|
||||
|
||||
#endif /* __PX4_NUTTX */
|
||||
}
|
||||
|
||||
|
||||
} // namespace logger
|
||||
} // namespace px4
|
||||
@@ -0,0 +1,78 @@
|
||||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2018 PX4 Development Team. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* 3. Neither the name PX4 nor the names of its contributors may be
|
||||
* used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
||||
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
****************************************************************************/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <drivers/drv_hrt.h>
|
||||
|
||||
#ifdef __PX4_NUTTX
|
||||
#include <sched.h>
|
||||
#include <systemlib/cpuload.h>
|
||||
#endif /* __PX4_NUTTX */
|
||||
|
||||
namespace px4
|
||||
{
|
||||
namespace logger
|
||||
{
|
||||
|
||||
struct watchdog_data_t {
|
||||
#ifdef __PX4_NUTTX
|
||||
int logger_main_task_index = -1;
|
||||
int logger_writer_task_index = -1;
|
||||
hrt_abstime ready_to_run_timestamp = hrt_absolute_time();
|
||||
uint8_t last_state = TSTATE_TASK_INVALID;
|
||||
#endif /* __PX4_NUTTX */
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Initialize the watchdog, fill in watchdog_data.
|
||||
*/
|
||||
void watchdog_initialize(const pid_t pid_logger_main, const pid_t pid_logger_writer, watchdog_data_t &watchdog_data);
|
||||
|
||||
/**
|
||||
* Update the watchdog and trigger it if necessary. It is triggered when the log writer task is in
|
||||
* ready state for a certain period of time, but did not get scheduled. It means that most likely
|
||||
* some other higher-prio task runs busy.
|
||||
* When the watchdog triggers, it boosts the priority of the logger's main & writer tasks to maximum, so
|
||||
* that they get scheduled again.
|
||||
*
|
||||
* Expected to be called from IRQ context.
|
||||
*
|
||||
* @param watchdog_data
|
||||
* @return true if watchdog is triggered, false otherwise
|
||||
*/
|
||||
bool watchdog_update(watchdog_data_t &watchdog_data);
|
||||
|
||||
} //namespace logger
|
||||
} //namespace px4
|
||||
@@ -167,7 +167,11 @@ void sched_note_resume(FAR struct tcb_s *tcb)
|
||||
|
||||
for (int i = 0; i < CONFIG_MAX_TASKS; i++) {
|
||||
if (system_load.tasks[i].valid && system_load.tasks[i].tcb->pid == tcb->pid) {
|
||||
// curr_start_time is accessed from an IRQ handler (in logger), so we need
|
||||
// to make the update atomic
|
||||
irqstate_t irq_state = px4_enter_critical_section();
|
||||
system_load.tasks[i].curr_start_time = new_time;
|
||||
px4_leave_critical_section(irq_state);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user