mirror of
https://github.com/esphome/esphome.git
synced 2026-05-28 21:59:59 +08:00
[esp32] Capture both cores' backtraces in crash handler (#15559)
Co-authored-by: J. Nick Koston <nick@home-assistant.io> Co-authored-by: J. Nick Koston <nick@koston.org>
This commit is contained in:
@@ -59,6 +59,59 @@ static inline bool is_return_addr(uint32_t addr) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// --- Architecture-specific backtrace helpers ---
|
||||||
|
// These run from IRAM during panic (no flash access).
|
||||||
|
|
||||||
|
#if CONFIG_IDF_TARGET_ARCH_XTENSA
|
||||||
|
// Walk Xtensa backtrace from an exception frame, writing PCs to out[].
|
||||||
|
// Returns number of entries written.
|
||||||
|
static uint8_t IRAM_ATTR walk_xtensa_backtrace(XtExcFrame *frame, uint32_t *out, uint8_t max) {
|
||||||
|
esp_backtrace_frame_t bt_frame = {
|
||||||
|
.pc = (uint32_t) frame->pc,
|
||||||
|
.sp = (uint32_t) frame->a1,
|
||||||
|
.next_pc = (uint32_t) frame->a0,
|
||||||
|
.exc_frame = frame,
|
||||||
|
};
|
||||||
|
uint8_t count = 0;
|
||||||
|
uint32_t first_pc = esp_cpu_process_stack_pc(bt_frame.pc);
|
||||||
|
if (is_code_addr(first_pc)) {
|
||||||
|
out[count++] = first_pc;
|
||||||
|
}
|
||||||
|
while (count < max && bt_frame.next_pc != 0) {
|
||||||
|
if (!esp_backtrace_get_next_frame(&bt_frame))
|
||||||
|
break;
|
||||||
|
uint32_t pc = esp_cpu_process_stack_pc(bt_frame.pc);
|
||||||
|
if (is_code_addr(pc)) {
|
||||||
|
out[count++] = pc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if CONFIG_IDF_TARGET_ARCH_RISCV
|
||||||
|
// Capture RISC-V backtrace: MEPC + RA from registers, then stack scan.
|
||||||
|
// Returns total count; *reg_count receives number of register-sourced entries.
|
||||||
|
static uint8_t IRAM_ATTR capture_riscv_backtrace(RvExcFrame *frame, uint32_t *out, uint8_t max, uint8_t *reg_count) {
|
||||||
|
uint8_t count = 0;
|
||||||
|
if (is_code_addr(frame->mepc)) {
|
||||||
|
out[count++] = frame->mepc;
|
||||||
|
}
|
||||||
|
if (is_code_addr(frame->ra) && frame->ra != frame->mepc) {
|
||||||
|
out[count++] = frame->ra;
|
||||||
|
}
|
||||||
|
*reg_count = count;
|
||||||
|
auto *scan_start = (uint32_t *) frame->sp;
|
||||||
|
for (uint32_t i = 0; i < 64 && count < max; i++) {
|
||||||
|
uint32_t val = scan_start[i];
|
||||||
|
if (is_code_addr(val) && val != frame->mepc && val != frame->ra) {
|
||||||
|
out[count++] = val;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Raw crash data written by the panic handler wrapper.
|
// Raw crash data written by the panic handler wrapper.
|
||||||
// Lives in .noinit so it survives software reset but contains garbage after power cycle.
|
// Lives in .noinit so it survives software reset but contains garbage after power cycle.
|
||||||
// Validated by magic marker. Static linkage since it's only used within this file.
|
// Validated by magic marker. Static linkage since it's only used within this file.
|
||||||
@@ -66,7 +119,7 @@ static inline bool is_return_addr(uint32_t addr) {
|
|||||||
// Magic is second to validate the data. Remaining fields can change between versions.
|
// Magic is second to validate the data. Remaining fields can change between versions.
|
||||||
// Version is uint32_t because it would be padded to 4 bytes anyway before the next
|
// Version is uint32_t because it would be padded to 4 bytes anyway before the next
|
||||||
// uint32_t field, so we use the full width rather than wasting 3 bytes of padding.
|
// uint32_t field, so we use the full width rather than wasting 3 bytes of padding.
|
||||||
static constexpr uint32_t CRASH_DATA_VERSION = 1;
|
static constexpr uint32_t CRASH_DATA_VERSION = 2;
|
||||||
struct RawCrashData {
|
struct RawCrashData {
|
||||||
uint32_t version;
|
uint32_t version;
|
||||||
uint32_t magic;
|
uint32_t magic;
|
||||||
@@ -77,6 +130,13 @@ struct RawCrashData {
|
|||||||
uint8_t pseudo_excause; // Whether cause is a pseudo exception (Xtensa SoC-level panic)
|
uint8_t pseudo_excause; // Whether cause is a pseudo exception (Xtensa SoC-level panic)
|
||||||
uint32_t backtrace[MAX_BACKTRACE];
|
uint32_t backtrace[MAX_BACKTRACE];
|
||||||
uint32_t cause; // Architecture-specific: exccause (Xtensa) or mcause (RISC-V)
|
uint32_t cause; // Architecture-specific: exccause (Xtensa) or mcause (RISC-V)
|
||||||
|
uint8_t crashed_core;
|
||||||
|
#if SOC_CPU_CORES_NUM > 1
|
||||||
|
static_assert(SOC_CPU_CORES_NUM == 2, "Dual-core logic assumes exactly 2 cores");
|
||||||
|
uint8_t other_backtrace_count;
|
||||||
|
uint8_t other_reg_frame_count;
|
||||||
|
uint32_t other_backtrace[MAX_BACKTRACE];
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
static RawCrashData __attribute__((section(".noinit")))
|
static RawCrashData __attribute__((section(".noinit")))
|
||||||
s_raw_crash_data; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
|
s_raw_crash_data; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
|
||||||
@@ -100,6 +160,14 @@ void crash_handler_read_and_clear() {
|
|||||||
s_raw_crash_data.exception = 4; // Default to PANIC_EXCEPTION_FAULT
|
s_raw_crash_data.exception = 4; // Default to PANIC_EXCEPTION_FAULT
|
||||||
if (s_raw_crash_data.pseudo_excause > 1)
|
if (s_raw_crash_data.pseudo_excause > 1)
|
||||||
s_raw_crash_data.pseudo_excause = 0;
|
s_raw_crash_data.pseudo_excause = 0;
|
||||||
|
if (s_raw_crash_data.crashed_core >= SOC_CPU_CORES_NUM)
|
||||||
|
s_raw_crash_data.crashed_core = 0;
|
||||||
|
#if SOC_CPU_CORES_NUM > 1
|
||||||
|
if (s_raw_crash_data.other_backtrace_count > MAX_BACKTRACE)
|
||||||
|
s_raw_crash_data.other_backtrace_count = MAX_BACKTRACE;
|
||||||
|
if (s_raw_crash_data.other_reg_frame_count > s_raw_crash_data.other_backtrace_count)
|
||||||
|
s_raw_crash_data.other_reg_frame_count = s_raw_crash_data.other_backtrace_count;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
// Don't clear magic here — crash data must survive OTA rollback reboots.
|
// Don't clear magic here — crash data must survive OTA rollback reboots.
|
||||||
// Magic is cleared by crash_handler_clear() after an API client receives the data.
|
// Magic is cleared by crash_handler_clear() after an API client receives the data.
|
||||||
@@ -219,6 +287,36 @@ static const char *get_exception_type() {
|
|||||||
return "Unknown";
|
return "Unknown";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Log backtrace entries, filtering stack-scanned addresses on RISC-V.
|
||||||
|
static void log_backtrace(const uint32_t *addrs, uint8_t count, uint8_t reg_frame_count) {
|
||||||
|
uint8_t bt_num = 0;
|
||||||
|
for (uint8_t i = 0; i < count; i++) {
|
||||||
|
uint32_t addr = addrs[i];
|
||||||
|
#if CONFIG_IDF_TARGET_ARCH_RISCV
|
||||||
|
if (i >= reg_frame_count && !is_return_addr(addr))
|
||||||
|
continue;
|
||||||
|
const char *source = (i < reg_frame_count) ? "backtrace" : "stack scan";
|
||||||
|
#else
|
||||||
|
const char *source = "backtrace";
|
||||||
|
#endif
|
||||||
|
ESP_LOGE(TAG, " BT%d: 0x%08" PRIX32 " (%s)", bt_num++, addr, source);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Append backtrace addresses, each formatted as " 0xXXXXXXXX", to the
// addr2line hint buffer.
//
// buf:             NUL-terminated hint text built so far.
// size:            total capacity of buf in bytes.
// pos:             offset in buf at which to continue writing.
// addrs / count:   addresses to append.
// reg_frame_count: number of leading entries that came from registers. Only
//                  consulted on RISC-V, where later (stack-scanned) entries
//                  are skipped unless is_return_addr() accepts them.
//
// Returns the new write offset. Appending stops early once fewer than
// ENTRY_RESERVE + 1 bytes remain, so an entry is never truncated. A negative
// pos (for example a failed snprintf upstream) or a null pointer is returned
// unchanged without touching buf.
static int append_addrs_to_hint(char *buf, int size, int pos, const uint32_t *addrs, uint8_t count,
                                uint8_t reg_frame_count) {
  // One entry is ' ' + "0x" + 8 hex digits = 11 characters, plus the NUL.
  static constexpr int ENTRY_RESERVE = 12;

  // A negative offset would make buf + pos point before the buffer.
  if (buf == nullptr || addrs == nullptr || pos < 0)
    return pos;

  for (uint8_t i = 0; i < count && pos < size - ENTRY_RESERVE; i++) {
    uint32_t addr = addrs[i];
#if CONFIG_IDF_TARGET_ARCH_RISCV
    if (i >= reg_frame_count && !is_return_addr(addr))
      continue;
#endif
    int written = snprintf(buf + pos, size - pos, " 0x%08" PRIX32, addr);
    // snprintf reports an output error as a negative value; never let that
    // move the cursor backwards.
    if (written < 0)
      break;
    pos += written;
  }
  return pos;
}
|
||||||
|
|
||||||
// Intentionally uses separate ESP_LOGE calls per line instead of combining into
|
// Intentionally uses separate ESP_LOGE calls per line instead of combining into
|
||||||
// one multi-line log message. This ensures each address appears as its own line
|
// one multi-line log message. This ensures each address appears as its own line
|
||||||
// on the serial console, making it possible to see partial output if the device
|
// on the serial console, making it possible to see partial output if the device
|
||||||
@@ -235,33 +333,28 @@ void crash_handler_log() {
|
|||||||
} else {
|
} else {
|
||||||
ESP_LOGE(TAG, " Reason: %s", get_exception_type());
|
ESP_LOGE(TAG, " Reason: %s", get_exception_type());
|
||||||
}
|
}
|
||||||
|
ESP_LOGE(TAG, " Crashed core: %d", s_raw_crash_data.crashed_core);
|
||||||
ESP_LOGE(TAG, " PC: 0x%08" PRIX32 " (fault location)", s_raw_crash_data.pc);
|
ESP_LOGE(TAG, " PC: 0x%08" PRIX32 " (fault location)", s_raw_crash_data.pc);
|
||||||
uint8_t bt_num = 0;
|
log_backtrace(s_raw_crash_data.backtrace, s_raw_crash_data.backtrace_count, s_raw_crash_data.reg_frame_count);
|
||||||
for (uint8_t i = 0; i < s_raw_crash_data.backtrace_count; i++) {
|
|
||||||
uint32_t addr = s_raw_crash_data.backtrace[i];
|
#if SOC_CPU_CORES_NUM > 1
|
||||||
#if CONFIG_IDF_TARGET_ARCH_RISCV
|
if (s_raw_crash_data.other_backtrace_count > 0) {
|
||||||
// Register-sourced entries (MEPC/RA) are trusted; only filter stack-scanned ones.
|
int other_core = 1 - s_raw_crash_data.crashed_core;
|
||||||
if (i >= s_raw_crash_data.reg_frame_count && !is_return_addr(addr))
|
ESP_LOGE(TAG, " Other core (%d) backtrace:", other_core);
|
||||||
continue;
|
log_backtrace(s_raw_crash_data.other_backtrace, s_raw_crash_data.other_backtrace_count,
|
||||||
#endif
|
s_raw_crash_data.other_reg_frame_count);
|
||||||
#if CONFIG_IDF_TARGET_ARCH_RISCV
|
|
||||||
const char *source = (i < s_raw_crash_data.reg_frame_count) ? "backtrace" : "stack scan";
|
|
||||||
#else
|
|
||||||
const char *source = "backtrace";
|
|
||||||
#endif
|
|
||||||
ESP_LOGE(TAG, " BT%d: 0x%08" PRIX32 " (%s)", bt_num++, addr, source);
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Build addr2line hint with all captured addresses for easy copy-paste
|
// Build addr2line hint with all captured addresses for easy copy-paste
|
||||||
char hint[256];
|
char hint[256];
|
||||||
int pos = snprintf(hint, sizeof(hint), "Use: addr2line -pfiaC -e firmware.elf 0x%08" PRIX32, s_raw_crash_data.pc);
|
int pos = snprintf(hint, sizeof(hint), "Use: addr2line -pfiaC -e firmware.elf 0x%08" PRIX32, s_raw_crash_data.pc);
|
||||||
for (uint8_t i = 0; i < s_raw_crash_data.backtrace_count && pos < (int) sizeof(hint) - 12; i++) {
|
pos = append_addrs_to_hint(hint, sizeof(hint), pos, s_raw_crash_data.backtrace, s_raw_crash_data.backtrace_count,
|
||||||
uint32_t addr = s_raw_crash_data.backtrace[i];
|
s_raw_crash_data.reg_frame_count);
|
||||||
#if CONFIG_IDF_TARGET_ARCH_RISCV
|
#if SOC_CPU_CORES_NUM > 1
|
||||||
if (i >= s_raw_crash_data.reg_frame_count && !is_return_addr(addr))
|
append_addrs_to_hint(hint, sizeof(hint), pos, s_raw_crash_data.other_backtrace,
|
||||||
continue;
|
s_raw_crash_data.other_backtrace_count, s_raw_crash_data.other_reg_frame_count);
|
||||||
#endif
|
#endif
|
||||||
pos += snprintf(hint + pos, sizeof(hint) - pos, " 0x%08" PRIX32, addr);
|
|
||||||
}
|
|
||||||
ESP_LOGE(TAG, "%s", hint);
|
ESP_LOGE(TAG, "%s", hint);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -283,68 +376,54 @@ void IRAM_ATTR __wrap_esp_panic_handler(panic_info_t *info) {
|
|||||||
s_raw_crash_data.reg_frame_count = 0;
|
s_raw_crash_data.reg_frame_count = 0;
|
||||||
s_raw_crash_data.exception = (uint8_t) info->exception;
|
s_raw_crash_data.exception = (uint8_t) info->exception;
|
||||||
s_raw_crash_data.pseudo_excause = info->pseudo_excause ? 1 : 0;
|
s_raw_crash_data.pseudo_excause = info->pseudo_excause ? 1 : 0;
|
||||||
|
s_raw_crash_data.crashed_core = (uint8_t) info->core;
|
||||||
|
#if SOC_CPU_CORES_NUM > 1
|
||||||
|
s_raw_crash_data.other_backtrace_count = 0;
|
||||||
|
s_raw_crash_data.other_reg_frame_count = 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
#if CONFIG_IDF_TARGET_ARCH_XTENSA
|
#if CONFIG_IDF_TARGET_ARCH_XTENSA
|
||||||
// Xtensa: walk the backtrace using the public API
|
// Xtensa: walk the backtrace using the public API
|
||||||
if (info->frame != nullptr) {
|
if (info->frame != nullptr) {
|
||||||
auto *xt_frame = (XtExcFrame *) info->frame;
|
auto *xt_frame = (XtExcFrame *) info->frame;
|
||||||
s_raw_crash_data.cause = xt_frame->exccause;
|
s_raw_crash_data.cause = xt_frame->exccause;
|
||||||
esp_backtrace_frame_t bt_frame = {
|
s_raw_crash_data.backtrace_count = walk_xtensa_backtrace(xt_frame, s_raw_crash_data.backtrace, MAX_BACKTRACE);
|
||||||
.pc = (uint32_t) xt_frame->pc,
|
|
||||||
.sp = (uint32_t) xt_frame->a1,
|
|
||||||
.next_pc = (uint32_t) xt_frame->a0,
|
|
||||||
.exc_frame = xt_frame,
|
|
||||||
};
|
|
||||||
|
|
||||||
uint8_t count = 0;
|
|
||||||
// First frame PC
|
|
||||||
uint32_t first_pc = esp_cpu_process_stack_pc(bt_frame.pc);
|
|
||||||
if (is_code_addr(first_pc)) {
|
|
||||||
s_raw_crash_data.backtrace[count++] = first_pc;
|
|
||||||
}
|
|
||||||
// Walk remaining frames
|
|
||||||
while (count < MAX_BACKTRACE && bt_frame.next_pc != 0) {
|
|
||||||
if (!esp_backtrace_get_next_frame(&bt_frame)) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
uint32_t pc = esp_cpu_process_stack_pc(bt_frame.pc);
|
|
||||||
if (is_code_addr(pc)) {
|
|
||||||
s_raw_crash_data.backtrace[count++] = pc;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
s_raw_crash_data.backtrace_count = count;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if SOC_CPU_CORES_NUM > 1
|
||||||
|
// Capture the other core's backtrace from the global frame array.
|
||||||
|
// Both cores save their frames to g_exc_frames[] before esp_panic_handler
|
||||||
|
// is called, so the other core's frame is available here.
|
||||||
|
if (info->core >= 0 && info->core < SOC_CPU_CORES_NUM) {
|
||||||
|
int other_core = 1 - info->core;
|
||||||
|
auto *other_frame = (XtExcFrame *) g_exc_frames[other_core];
|
||||||
|
if (other_frame != nullptr) {
|
||||||
|
s_raw_crash_data.other_backtrace_count =
|
||||||
|
walk_xtensa_backtrace(other_frame, s_raw_crash_data.other_backtrace, MAX_BACKTRACE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#elif CONFIG_IDF_TARGET_ARCH_RISCV
|
#elif CONFIG_IDF_TARGET_ARCH_RISCV
|
||||||
// RISC-V: capture MEPC + RA, then scan stack for code addresses
|
// RISC-V: capture MEPC + RA, then scan stack for code addresses
|
||||||
if (info->frame != nullptr) {
|
if (info->frame != nullptr) {
|
||||||
auto *rv_frame = (RvExcFrame *) info->frame;
|
auto *rv_frame = (RvExcFrame *) info->frame;
|
||||||
s_raw_crash_data.cause = rv_frame->mcause;
|
s_raw_crash_data.cause = rv_frame->mcause;
|
||||||
uint8_t count = 0;
|
s_raw_crash_data.backtrace_count =
|
||||||
|
capture_riscv_backtrace(rv_frame, s_raw_crash_data.backtrace, MAX_BACKTRACE, &s_raw_crash_data.reg_frame_count);
|
||||||
// Save MEPC (fault PC) and RA (return address)
|
|
||||||
if (is_code_addr(rv_frame->mepc)) {
|
|
||||||
s_raw_crash_data.backtrace[count++] = rv_frame->mepc;
|
|
||||||
}
|
|
||||||
if (is_code_addr(rv_frame->ra) && rv_frame->ra != rv_frame->mepc) {
|
|
||||||
s_raw_crash_data.backtrace[count++] = rv_frame->ra;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Track how many entries came from registers (MEPC/RA) so we can
|
|
||||||
// skip return-address validation for them at log time.
|
|
||||||
s_raw_crash_data.reg_frame_count = count;
|
|
||||||
|
|
||||||
// Scan stack for code addresses — captures broadly during panic,
|
|
||||||
// filtered by is_return_addr() at log time when flash is accessible.
|
|
||||||
auto *scan_start = (uint32_t *) rv_frame->sp;
|
|
||||||
for (uint32_t i = 0; i < 64 && count < MAX_BACKTRACE; i++) {
|
|
||||||
uint32_t val = scan_start[i];
|
|
||||||
if (is_code_addr(val) && val != rv_frame->mepc && val != rv_frame->ra) {
|
|
||||||
s_raw_crash_data.backtrace[count++] = val;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
s_raw_crash_data.backtrace_count = count;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if SOC_CPU_CORES_NUM > 1
|
||||||
|
// Capture the other core's backtrace from the global frame array.
|
||||||
|
if (info->core >= 0 && info->core < SOC_CPU_CORES_NUM) {
|
||||||
|
int other_core = 1 - info->core;
|
||||||
|
auto *other_frame = (RvExcFrame *) g_exc_frames[other_core];
|
||||||
|
if (other_frame != nullptr) {
|
||||||
|
s_raw_crash_data.other_backtrace_count = capture_riscv_backtrace(
|
||||||
|
other_frame, s_raw_crash_data.other_backtrace, MAX_BACKTRACE, &s_raw_crash_data.other_reg_frame_count);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Write version and magic last — ensures all data is written before we mark it valid
|
// Write version and magic last — ensures all data is written before we mark it valid
|
||||||
|
|||||||
Reference in New Issue
Block a user