[esp32] Capture both cores' backtraces in crash handler (#15559)

Co-authored-by: J. Nick Koston <nick@home-assistant.io>
Co-authored-by: J. Nick Koston <nick@koston.org>
This commit is contained in:
Jonathan Swoboda
2026-04-08 16:14:18 -04:00
committed by GitHub
parent 94f1e48d95
commit 2cd92a311b
+149 -70
View File
@@ -59,6 +59,59 @@ static inline bool is_return_addr(uint32_t addr) {
}
#endif
// --- Architecture-specific backtrace helpers ---
// These run from IRAM during panic (no flash access).
#if CONFIG_IDF_TARGET_ARCH_XTENSA
// Walk Xtensa backtrace from an exception frame, writing PCs to out[].
// Returns number of entries written.
static uint8_t IRAM_ATTR walk_xtensa_backtrace(XtExcFrame *frame, uint32_t *out, uint8_t max) {
esp_backtrace_frame_t bt_frame = {
.pc = (uint32_t) frame->pc,
.sp = (uint32_t) frame->a1,
.next_pc = (uint32_t) frame->a0,
.exc_frame = frame,
};
uint8_t count = 0;
uint32_t first_pc = esp_cpu_process_stack_pc(bt_frame.pc);
if (is_code_addr(first_pc)) {
out[count++] = first_pc;
}
while (count < max && bt_frame.next_pc != 0) {
if (!esp_backtrace_get_next_frame(&bt_frame))
break;
uint32_t pc = esp_cpu_process_stack_pc(bt_frame.pc);
if (is_code_addr(pc)) {
out[count++] = pc;
}
}
return count;
}
#endif
#if CONFIG_IDF_TARGET_ARCH_RISCV
// Capture RISC-V backtrace: MEPC + RA from registers, then stack scan.
// Returns total count; *reg_count receives number of register-sourced entries.
static uint8_t IRAM_ATTR capture_riscv_backtrace(RvExcFrame *frame, uint32_t *out, uint8_t max, uint8_t *reg_count) {
uint8_t count = 0;
if (is_code_addr(frame->mepc)) {
out[count++] = frame->mepc;
}
if (is_code_addr(frame->ra) && frame->ra != frame->mepc) {
out[count++] = frame->ra;
}
*reg_count = count;
auto *scan_start = (uint32_t *) frame->sp;
for (uint32_t i = 0; i < 64 && count < max; i++) {
uint32_t val = scan_start[i];
if (is_code_addr(val) && val != frame->mepc && val != frame->ra) {
out[count++] = val;
}
}
return count;
}
#endif
// Raw crash data written by the panic handler wrapper.
// Lives in .noinit so it survives software reset but contains garbage after power cycle.
// Validated by magic marker. Static linkage since it's only used within this file.
@@ -66,7 +119,7 @@ static inline bool is_return_addr(uint32_t addr) {
// Magic is second to validate the data. Remaining fields can change between versions.
// Version is uint32_t because it would be padded to 4 bytes anyway before the next
// uint32_t field, so we use the full width rather than wasting 3 bytes of padding.
static constexpr uint32_t CRASH_DATA_VERSION = 1;
static constexpr uint32_t CRASH_DATA_VERSION = 2;
struct RawCrashData {
uint32_t version;
uint32_t magic;
@@ -77,6 +130,13 @@ struct RawCrashData {
uint8_t pseudo_excause; // Whether cause is a pseudo exception (Xtensa SoC-level panic)
uint32_t backtrace[MAX_BACKTRACE];
uint32_t cause; // Architecture-specific: exccause (Xtensa) or mcause (RISC-V)
uint8_t crashed_core;
#if SOC_CPU_CORES_NUM > 1
static_assert(SOC_CPU_CORES_NUM == 2, "Dual-core logic assumes exactly 2 cores");
uint8_t other_backtrace_count;
uint8_t other_reg_frame_count;
uint32_t other_backtrace[MAX_BACKTRACE];
#endif
};
static RawCrashData __attribute__((section(".noinit")))
s_raw_crash_data; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
@@ -100,6 +160,14 @@ void crash_handler_read_and_clear() {
s_raw_crash_data.exception = 4; // Default to PANIC_EXCEPTION_FAULT
if (s_raw_crash_data.pseudo_excause > 1)
s_raw_crash_data.pseudo_excause = 0;
if (s_raw_crash_data.crashed_core >= SOC_CPU_CORES_NUM)
s_raw_crash_data.crashed_core = 0;
#if SOC_CPU_CORES_NUM > 1
if (s_raw_crash_data.other_backtrace_count > MAX_BACKTRACE)
s_raw_crash_data.other_backtrace_count = MAX_BACKTRACE;
if (s_raw_crash_data.other_reg_frame_count > s_raw_crash_data.other_backtrace_count)
s_raw_crash_data.other_reg_frame_count = s_raw_crash_data.other_backtrace_count;
#endif
}
// Don't clear magic here — crash data must survive OTA rollback reboots.
// Magic is cleared by crash_handler_clear() after an API client receives the data.
@@ -219,6 +287,36 @@ static const char *get_exception_type() {
return "Unknown";
}
// Log backtrace entries, filtering stack-scanned addresses on RISC-V.
static void log_backtrace(const uint32_t *addrs, uint8_t count, uint8_t reg_frame_count) {
uint8_t bt_num = 0;
for (uint8_t i = 0; i < count; i++) {
uint32_t addr = addrs[i];
#if CONFIG_IDF_TARGET_ARCH_RISCV
if (i >= reg_frame_count && !is_return_addr(addr))
continue;
const char *source = (i < reg_frame_count) ? "backtrace" : "stack scan";
#else
const char *source = "backtrace";
#endif
ESP_LOGE(TAG, " BT%d: 0x%08" PRIX32 " (%s)", bt_num++, addr, source);
}
}
// Append backtrace addresses to the addr2line hint buffer.
static int append_addrs_to_hint(char *buf, int size, int pos, const uint32_t *addrs, uint8_t count,
uint8_t reg_frame_count) {
for (uint8_t i = 0; i < count && pos < size - 12; i++) {
uint32_t addr = addrs[i];
#if CONFIG_IDF_TARGET_ARCH_RISCV
if (i >= reg_frame_count && !is_return_addr(addr))
continue;
#endif
pos += snprintf(buf + pos, size - pos, " 0x%08" PRIX32, addr);
}
return pos;
}
// Intentionally uses separate ESP_LOGE calls per line instead of combining into
// one multi-line log message. This ensures each address appears as its own line
// on the serial console, making it possible to see partial output if the device
@@ -235,33 +333,28 @@ void crash_handler_log() {
} else {
ESP_LOGE(TAG, " Reason: %s", get_exception_type());
}
ESP_LOGE(TAG, " Crashed core: %d", s_raw_crash_data.crashed_core);
ESP_LOGE(TAG, " PC: 0x%08" PRIX32 " (fault location)", s_raw_crash_data.pc);
uint8_t bt_num = 0;
for (uint8_t i = 0; i < s_raw_crash_data.backtrace_count; i++) {
uint32_t addr = s_raw_crash_data.backtrace[i];
#if CONFIG_IDF_TARGET_ARCH_RISCV
// Register-sourced entries (MEPC/RA) are trusted; only filter stack-scanned ones.
if (i >= s_raw_crash_data.reg_frame_count && !is_return_addr(addr))
continue;
#endif
#if CONFIG_IDF_TARGET_ARCH_RISCV
const char *source = (i < s_raw_crash_data.reg_frame_count) ? "backtrace" : "stack scan";
#else
const char *source = "backtrace";
#endif
ESP_LOGE(TAG, " BT%d: 0x%08" PRIX32 " (%s)", bt_num++, addr, source);
log_backtrace(s_raw_crash_data.backtrace, s_raw_crash_data.backtrace_count, s_raw_crash_data.reg_frame_count);
#if SOC_CPU_CORES_NUM > 1
if (s_raw_crash_data.other_backtrace_count > 0) {
int other_core = 1 - s_raw_crash_data.crashed_core;
ESP_LOGE(TAG, " Other core (%d) backtrace:", other_core);
log_backtrace(s_raw_crash_data.other_backtrace, s_raw_crash_data.other_backtrace_count,
s_raw_crash_data.other_reg_frame_count);
}
#endif
// Build addr2line hint with all captured addresses for easy copy-paste
char hint[256];
int pos = snprintf(hint, sizeof(hint), "Use: addr2line -pfiaC -e firmware.elf 0x%08" PRIX32, s_raw_crash_data.pc);
for (uint8_t i = 0; i < s_raw_crash_data.backtrace_count && pos < (int) sizeof(hint) - 12; i++) {
uint32_t addr = s_raw_crash_data.backtrace[i];
#if CONFIG_IDF_TARGET_ARCH_RISCV
if (i >= s_raw_crash_data.reg_frame_count && !is_return_addr(addr))
continue;
pos = append_addrs_to_hint(hint, sizeof(hint), pos, s_raw_crash_data.backtrace, s_raw_crash_data.backtrace_count,
s_raw_crash_data.reg_frame_count);
#if SOC_CPU_CORES_NUM > 1
append_addrs_to_hint(hint, sizeof(hint), pos, s_raw_crash_data.other_backtrace,
s_raw_crash_data.other_backtrace_count, s_raw_crash_data.other_reg_frame_count);
#endif
pos += snprintf(hint + pos, sizeof(hint) - pos, " 0x%08" PRIX32, addr);
}
ESP_LOGE(TAG, "%s", hint);
}
@@ -283,68 +376,54 @@ void IRAM_ATTR __wrap_esp_panic_handler(panic_info_t *info) {
s_raw_crash_data.reg_frame_count = 0;
s_raw_crash_data.exception = (uint8_t) info->exception;
s_raw_crash_data.pseudo_excause = info->pseudo_excause ? 1 : 0;
s_raw_crash_data.crashed_core = (uint8_t) info->core;
#if SOC_CPU_CORES_NUM > 1
s_raw_crash_data.other_backtrace_count = 0;
s_raw_crash_data.other_reg_frame_count = 0;
#endif
#if CONFIG_IDF_TARGET_ARCH_XTENSA
// Xtensa: walk the backtrace using the public API
if (info->frame != nullptr) {
auto *xt_frame = (XtExcFrame *) info->frame;
s_raw_crash_data.cause = xt_frame->exccause;
esp_backtrace_frame_t bt_frame = {
.pc = (uint32_t) xt_frame->pc,
.sp = (uint32_t) xt_frame->a1,
.next_pc = (uint32_t) xt_frame->a0,
.exc_frame = xt_frame,
};
uint8_t count = 0;
// First frame PC
uint32_t first_pc = esp_cpu_process_stack_pc(bt_frame.pc);
if (is_code_addr(first_pc)) {
s_raw_crash_data.backtrace[count++] = first_pc;
}
// Walk remaining frames
while (count < MAX_BACKTRACE && bt_frame.next_pc != 0) {
if (!esp_backtrace_get_next_frame(&bt_frame)) {
break;
}
uint32_t pc = esp_cpu_process_stack_pc(bt_frame.pc);
if (is_code_addr(pc)) {
s_raw_crash_data.backtrace[count++] = pc;
}
}
s_raw_crash_data.backtrace_count = count;
s_raw_crash_data.backtrace_count = walk_xtensa_backtrace(xt_frame, s_raw_crash_data.backtrace, MAX_BACKTRACE);
}
#if SOC_CPU_CORES_NUM > 1
// Capture the other core's backtrace from the global frame array.
// Both cores save their frames to g_exc_frames[] before esp_panic_handler
// is called, so the other core's frame is available here.
if (info->core >= 0 && info->core < SOC_CPU_CORES_NUM) {
int other_core = 1 - info->core;
auto *other_frame = (XtExcFrame *) g_exc_frames[other_core];
if (other_frame != nullptr) {
s_raw_crash_data.other_backtrace_count =
walk_xtensa_backtrace(other_frame, s_raw_crash_data.other_backtrace, MAX_BACKTRACE);
}
}
#endif
#elif CONFIG_IDF_TARGET_ARCH_RISCV
// RISC-V: capture MEPC + RA, then scan stack for code addresses
if (info->frame != nullptr) {
auto *rv_frame = (RvExcFrame *) info->frame;
s_raw_crash_data.cause = rv_frame->mcause;
uint8_t count = 0;
// Save MEPC (fault PC) and RA (return address)
if (is_code_addr(rv_frame->mepc)) {
s_raw_crash_data.backtrace[count++] = rv_frame->mepc;
}
if (is_code_addr(rv_frame->ra) && rv_frame->ra != rv_frame->mepc) {
s_raw_crash_data.backtrace[count++] = rv_frame->ra;
}
// Track how many entries came from registers (MEPC/RA) so we can
// skip return-address validation for them at log time.
s_raw_crash_data.reg_frame_count = count;
// Scan stack for code addresses — captures broadly during panic,
// filtered by is_return_addr() at log time when flash is accessible.
auto *scan_start = (uint32_t *) rv_frame->sp;
for (uint32_t i = 0; i < 64 && count < MAX_BACKTRACE; i++) {
uint32_t val = scan_start[i];
if (is_code_addr(val) && val != rv_frame->mepc && val != rv_frame->ra) {
s_raw_crash_data.backtrace[count++] = val;
}
}
s_raw_crash_data.backtrace_count = count;
s_raw_crash_data.backtrace_count =
capture_riscv_backtrace(rv_frame, s_raw_crash_data.backtrace, MAX_BACKTRACE, &s_raw_crash_data.reg_frame_count);
}
#if SOC_CPU_CORES_NUM > 1
// Capture the other core's backtrace from the global frame array.
if (info->core >= 0 && info->core < SOC_CPU_CORES_NUM) {
int other_core = 1 - info->core;
auto *other_frame = (RvExcFrame *) g_exc_frames[other_core];
if (other_frame != nullptr) {
s_raw_crash_data.other_backtrace_count = capture_riscv_backtrace(
other_frame, s_raw_crash_data.other_backtrace, MAX_BACKTRACE, &s_raw_crash_data.other_reg_frame_count);
}
}
#endif
#endif
// Write version and magic last — ensures all data is written before we mark it valid