ESPHome 2026.6.3
Loading...
Searching...
No Matches
crash_handler.cpp
Go to the documentation of this file.
1#ifdef USE_ESP32
2
4#ifdef USE_ESP32_CRASH_HANDLER
5
6#include "crash_handler.h"
7#include "esphome/core/log.h"
8
9#include <cinttypes>
10#include <cstring>
11#include <esp_attr.h>
12#include <esp_private/panic_internal.h>
13#include <soc/soc.h>
14
15#if CONFIG_IDF_TARGET_ARCH_XTENSA
16#include <esp_cpu_utils.h>
17#include <esp_debug_helpers.h>
18#include <xtensa_context.h>
19#elif CONFIG_IDF_TARGET_ARCH_RISCV
20#include <riscv/rvruntime-frames.h>
21#endif
22
// Marker stored in RawCrashData::magic to flag that the record was populated
// by the panic wrapper; compared against on the next boot.
static constexpr uint32_t CRASH_MAGIC = 0xDEADBEEF;
// Capacity, in entries, of each backtrace array in the crash record.
static constexpr size_t MAX_BACKTRACE = 16;
25
26// Check if an address looks like code (flash-mapped or IRAM).
27// Must be safe to call from panic context (no flash access needed).
28static inline bool IRAM_ATTR is_code_addr(uint32_t addr) {
29 return (addr >= SOC_IROM_LOW && addr < SOC_IROM_HIGH) || (addr >= SOC_IRAM_LOW && addr < SOC_IRAM_HIGH);
30}
31
#if CONFIG_IDF_TARGET_ARCH_RISCV
// Check if a code address is a real return address by verifying the preceding
// instruction is a call: a 4-byte JAL or JALR with rd=ra (x1), or a 2-byte
// compressed c.jalr. Called at log time (not during panic) so flash cache is
// available and both IRAM and IROM are safely readable.
//
// Returns false for anything is_code_addr() rejects. An instruction slot is
// only read when its own start address is also a code address, so a value
// sitting at the very beginning of a code range never triggers a read below it.
static inline bool is_return_addr(uint32_t addr) {
  if (!is_code_addr(addr) || addr < 4)
    return false;

  // A return address on the stack points to the instruction after a call.
  // Check for 4-byte JAL/JALR call instruction before this address.
  if (is_code_addr(addr - 4)) {
    // Use memcpy for alignment safety — RISC-V C extension means code addresses
    // are only 2-byte aligned, so addr-4 may not be 4-byte aligned.
    uint32_t inst;
    // NOLINTNEXTLINE(performance-no-int-to-ptr) - reading code memory at a raw address is the point
    memcpy(&inst, (const void *) (addr - 4), sizeof(inst));
    // RISC-V instruction encoding: bits [6:0] = opcode, bits [11:7] = rd
    uint32_t opcode = inst & 0x7f;  // Extract 7-bit opcode
    uint32_t rd = inst & 0xf80;     // Extract rd field (bits 11:7), left in place
    // Match JAL (0x6f) or JALR (0x67) with rd=ra (x1, encoded as 0x80 = 1<<7)
    if ((opcode == 0x6f || opcode == 0x67) && rd == 0x80)
      return true;
  }

  // Check for 2-byte compressed c.jalr before this address (C extension).
  // c.jalr saves to ra implicitly: funct4=1001, rs1!=0, rs2=0, op=10
  if (is_code_addr(addr - 2)) {
    // Read through memcpy, same as the 4-byte case, rather than through a
    // casted pointer dereference.
    uint16_t c_inst;
    // NOLINTNEXTLINE(performance-no-int-to-ptr) - reading code memory at a raw address is the point
    memcpy(&c_inst, (const void *) (addr - 2), sizeof(c_inst));
    if ((c_inst & 0xf07f) == 0x9002 && (c_inst & 0x0f80) != 0)
      return true;
  }
  return false;
}
#endif
63
// --- Architecture-specific backtrace helpers ---
// These run from IRAM during panic (no flash access).

#if CONFIG_IDF_TARGET_ARCH_XTENSA
// Walk Xtensa backtrace from an exception frame, writing PCs to out[].
//   frame: exception frame supplying the starting pc, a1 (used as sp) and a0 (used as next pc)
//   out:   destination array; must have room for at least `max` entries
//   max:   maximum number of entries to write
// Only PCs accepted by is_code_addr() are stored; others are skipped without
// ending the walk. The walk ends when `max` entries are stored, the next PC is
// zero, or esp_backtrace_get_next_frame() reports failure.
// Returns number of entries written (never more than `max`).
static uint8_t IRAM_ATTR walk_xtensa_backtrace(XtExcFrame *frame, uint32_t *out, uint8_t max) {
  esp_backtrace_frame_t bt_frame = {
      .pc = (uint32_t) frame->pc,
      .sp = (uint32_t) frame->a1,
      .next_pc = (uint32_t) frame->a0,
      .exc_frame = frame,
  };
  uint8_t count = 0;
  uint32_t first_pc = esp_cpu_process_stack_pc(bt_frame.pc);
  // Bound the first write too, so max == 0 cannot overflow out[].
  if (count < max && is_code_addr(first_pc)) {
    out[count++] = first_pc;
  }
  while (count < max && bt_frame.next_pc != 0) {
    if (!esp_backtrace_get_next_frame(&bt_frame))
      break;
    uint32_t pc = esp_cpu_process_stack_pc(bt_frame.pc);
    if (is_code_addr(pc)) {
      out[count++] = pc;
    }
  }
  return count;
}
#endif
93
#if CONFIG_IDF_TARGET_ARCH_RISCV
// Capture RISC-V backtrace: MEPC + RA from registers, then stack scan.
//   frame:     exception frame supplying mepc, ra and sp
//   out:       destination array; must have room for at least `max` entries
//   max:       maximum number of entries to write
//   reg_count: receives the number of register-sourced entries (0..2), which
//              are always the leading entries of out[]
// The stack scan reads up to 64 consecutive 32-bit words starting at frame->sp
// and keeps every word that passes is_code_addr() and differs from mepc and ra.
// Only range checks are done here; nothing in this function verifies that a
// scanned word is an actual return address.
// Returns total count (never more than `max`).
static uint8_t IRAM_ATTR capture_riscv_backtrace(RvExcFrame *frame, uint32_t *out, uint8_t max, uint8_t *reg_count) {
  // Number of stack words inspected after the register entries.
  constexpr uint32_t STACK_SCAN_WORDS = 64;

  uint8_t count = 0;
  // Register entries are bounded by max as well, so a small max cannot overflow out[].
  if (count < max && is_code_addr(frame->mepc)) {
    out[count++] = frame->mepc;
  }
  if (count < max && is_code_addr(frame->ra) && frame->ra != frame->mepc) {
    out[count++] = frame->ra;
  }
  *reg_count = count;
  // NOLINTNEXTLINE(performance-no-int-to-ptr) - walking the raw stack by address is the point
  auto *scan_start = (uint32_t *) frame->sp;
  for (uint32_t i = 0; i < STACK_SCAN_WORDS && count < max; i++) {
    uint32_t val = scan_start[i];
    if (is_code_addr(val) && val != frame->mepc && val != frame->ra) {
      out[count++] = val;
    }
  }
  return count;
}
#endif
117
118// Raw crash data written by the panic handler wrapper.
119// Lives in .noinit so it survives software reset but contains garbage after power cycle.
120// Validated by magic marker. Static linkage since it's only used within this file.
121// Version field is first so future firmware can always identify the struct layout.
122// Magic is second to validate the data. Remaining fields can change between versions.
123// Version is uint32_t because it would be padded to 4 bytes anyway before the next
124// uint32_t field, so we use the full width rather than wasting 3 bytes of padding.
125static constexpr uint32_t CRASH_DATA_VERSION = 2;
126struct RawCrashData {
127 uint32_t version;
128 uint32_t magic;
129 uint32_t pc;
130 uint8_t backtrace_count;
131 uint8_t reg_frame_count; // Number of entries from registers (not stack-scanned)
132 uint8_t exception; // panic_exception_t enum (FAULT/ABORT/IWDT/TWDT/DEBUG)
133 uint8_t pseudo_excause; // Whether cause is a pseudo exception (Xtensa SoC-level panic)
134 uint32_t backtrace[MAX_BACKTRACE];
135 uint32_t cause; // Architecture-specific: exccause (Xtensa) or mcause (RISC-V)
136 uint8_t crashed_core;
137#if SOC_CPU_CORES_NUM > 1
138 static_assert(SOC_CPU_CORES_NUM == 2, "Dual-core logic assumes exactly 2 cores");
139 uint8_t other_backtrace_count;
140 uint8_t other_reg_frame_count;
141 uint32_t other_backtrace[MAX_BACKTRACE];
142#endif
143};
144static RawCrashData __attribute__((section(".noinit")))
145s_raw_crash_data; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
146
147// Whether crash data was found and validated this boot.
148static bool s_crash_data_valid = false; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
149
150namespace esphome::esp32 {
151
// Log tag passed to every ESP_LOGE call in this file.
static const char *const TAG = "esp32.crash";
153
155 if (s_raw_crash_data.magic == CRASH_MAGIC && s_raw_crash_data.version == CRASH_DATA_VERSION) {
156 s_crash_data_valid = true;
157 // Clamp counts to prevent out-of-bounds reads from corrupt .noinit data
158 if (s_raw_crash_data.backtrace_count > MAX_BACKTRACE)
159 s_raw_crash_data.backtrace_count = MAX_BACKTRACE;
160 if (s_raw_crash_data.reg_frame_count > s_raw_crash_data.backtrace_count)
161 s_raw_crash_data.reg_frame_count = s_raw_crash_data.backtrace_count;
162 if (s_raw_crash_data.exception > 4) // panic_exception_t max value
163 s_raw_crash_data.exception = 4; // Default to PANIC_EXCEPTION_FAULT
164 if (s_raw_crash_data.pseudo_excause > 1)
165 s_raw_crash_data.pseudo_excause = 0;
166 if (s_raw_crash_data.crashed_core >= SOC_CPU_CORES_NUM)
167 s_raw_crash_data.crashed_core = 0;
168#if SOC_CPU_CORES_NUM > 1
169 if (s_raw_crash_data.other_backtrace_count > MAX_BACKTRACE)
170 s_raw_crash_data.other_backtrace_count = MAX_BACKTRACE;
171 if (s_raw_crash_data.other_reg_frame_count > s_raw_crash_data.other_backtrace_count)
172 s_raw_crash_data.other_reg_frame_count = s_raw_crash_data.other_backtrace_count;
173#endif
174 }
175 // Don't clear magic here — crash data must survive OTA rollback reboots.
176 // Magic is cleared by crash_handler_clear() after an API client receives the data.
177}
178
179bool crash_handler_has_data() { return s_crash_data_valid; }
180
182 // Only clear the magic so data doesn't survive the next reboot.
183 // Keep s_crash_data_valid so crash_handler_log() still works for
184 // additional API clients connecting during this boot session.
185 s_raw_crash_data.magic = 0;
186}
187
// Look up the exception cause as a human-readable string.
// Tables mirror ESP-IDF's panic_arch_fill_info() which uses local static arrays
// not exposed via any public API.
//
// Returns:
//   - Xtensa, pseudo exception: the PSEUDO_REASON entry, or "Unknown reason" when
//     the stored cause is outside that table.
//   - RISC-V, pseudo exception: nullptr, so the caller prints only the type name.
//   - Otherwise: the REASON entry for the stored cause, or "Unknown" when the
//     cause is out of range or maps to a reserved (nullptr) slot.
static const char *get_exception_reason() {
#if CONFIG_IDF_TARGET_ARCH_XTENSA
  if (s_raw_crash_data.pseudo_excause) {
    // SoC-level panic: watchdog, cache error, etc.
    // Keep in sync with ESP-IDF's PANIC_RSN_* defines
    static const char *const PSEUDO_REASON[] = {
        "Unknown reason",                 // 0
        "Unhandled debug exception",      // 1
        "Double exception",               // 2
        "Unhandled kernel exception",     // 3
        "Coprocessor exception",          // 4
        "Interrupt wdt timeout on CPU0",  // 5
        "Interrupt wdt timeout on CPU1",  // 6
        "Cache error",                    // 7
    };
    uint32_t cause = s_raw_crash_data.cause;
    if (cause < sizeof(PSEUDO_REASON) / sizeof(PSEUDO_REASON[0]))
      return PSEUDO_REASON[cause];
    return PSEUDO_REASON[0];
  }
  // Real Xtensa exception; the index is the exccause value, nullptr marks a reserved code.
  static const char *const REASON[] = {
      "IllegalInstruction",
      "Syscall",
      "InstructionFetchError",
      "LoadStoreError",
      "Level1Interrupt",
      "Alloca",
      "IntegerDivideByZero",
      "PCValue",
      "Privileged",
      "LoadStoreAlignment",
      nullptr,
      nullptr,
      "InstrPDAddrError",
      "LoadStorePIFDataError",
      "InstrPIFAddrError",
      "LoadStorePIFAddrError",
      "InstTLBMiss",
      "InstTLBMultiHit",
      "InstFetchPrivilege",
      nullptr,
      "InstrFetchProhibited",
      nullptr,
      nullptr,
      nullptr,
      "LoadStoreTLBMiss",
      "LoadStoreTLBMultihit",
      "LoadStorePrivilege",
      nullptr,
      "LoadProhibited",
      "StoreProhibited",
  };
  uint32_t cause = s_raw_crash_data.cause;
  if (cause < sizeof(REASON) / sizeof(REASON[0]) && REASON[cause] != nullptr)
    return REASON[cause];
#elif CONFIG_IDF_TARGET_ARCH_RISCV
  // For SoC-level panics (watchdog, cache error), mcause holds IDF-internal
  // interrupt numbers, not standard RISC-V cause codes. The exception type
  // field already identifies these, so just return null to use the type name.
  if (s_raw_crash_data.pseudo_excause)
    return nullptr;
  // The index is the mcause value, nullptr marks a reserved code.
  static const char *const REASON[] = {
      "Instruction address misaligned",
      "Instruction access fault",
      "Illegal instruction",
      "Breakpoint",
      "Load address misaligned",
      "Load access fault",
      "Store address misaligned",
      "Store access fault",
      "Environment call from U-mode",
      "Environment call from S-mode",
      nullptr,
      "Environment call from M-mode",
      "Instruction page fault",
      "Load page fault",
      nullptr,
      "Store page fault",
  };
  uint32_t cause = s_raw_crash_data.cause;
  if (cause < sizeof(REASON) / sizeof(REASON[0]) && REASON[cause] != nullptr)
    return REASON[cause];
#endif
  return "Unknown";
}
277
278// Exception type names matching panic_exception_t enum
279static const char *get_exception_type() {
280 static const char *const TYPES[] = {
281 "Debug exception", // PANIC_EXCEPTION_DEBUG
282 "Interrupt wdt", // PANIC_EXCEPTION_IWDT
283 "Task wdt", // PANIC_EXCEPTION_TWDT
284 "Abort", // PANIC_EXCEPTION_ABORT
285 "Fault", // PANIC_EXCEPTION_FAULT
286 };
287 uint8_t exc = s_raw_crash_data.exception;
288 if (exc < sizeof(TYPES) / sizeof(TYPES[0]))
289 return TYPES[exc];
290 return "Unknown";
291}
292
293// Log backtrace entries, filtering stack-scanned addresses on RISC-V.
294static void log_backtrace(const uint32_t *addrs, uint8_t count, uint8_t reg_frame_count) {
295 uint8_t bt_num = 0;
296 for (uint8_t i = 0; i < count; i++) {
297 uint32_t addr = addrs[i];
298#if CONFIG_IDF_TARGET_ARCH_RISCV
299 if (i >= reg_frame_count && !is_return_addr(addr))
300 continue;
301 const char *source = (i < reg_frame_count) ? "backtrace" : "stack scan";
302#else
303 const char *source = "backtrace";
304#endif
305 ESP_LOGE(TAG, " BT%d: 0x%08" PRIX32 " (%s)", bt_num++, addr, source);
306 }
307}
308
// Append backtrace addresses to the addr2line hint buffer.
//   buf:             destination buffer, already holding `pos` characters
//   size:            total capacity of buf in bytes
//   pos:             current write offset into buf
//   addrs / count:   addresses to append
//   reg_frame_count: how many leading entries came from registers
// Each address is written as " 0x" followed by 8 uppercase hex digits. An entry
// is only attempted while more than ENTRY_BYTES bytes remain, so output is never
// truncated mid-address. On RISC-V, entries at or beyond reg_frame_count are
// skipped unless is_return_addr() accepts them.
// Returns the updated write offset.
static int append_addrs_to_hint(char *buf, int size, int pos, const uint32_t *addrs, uint8_t count,
                                uint8_t reg_frame_count) {
  // " 0x" (3) + 8 hex digits + terminating NUL.
  constexpr int ENTRY_BYTES = 12;
  for (uint8_t i = 0; i < count && pos < size - ENTRY_BYTES; i++) {
    uint32_t addr = addrs[i];
#if CONFIG_IDF_TARGET_ARCH_RISCV
    if (i >= reg_frame_count && !is_return_addr(addr))
      continue;
#endif
    int written = snprintf(buf + pos, size - pos, " 0x%08" PRIX32, addr);
    // A negative return signals an output error; stop rather than move pos backwards.
    if (written < 0)
      break;
    pos += written;
  }
  return pos;
}
322
323// Intentionally uses separate ESP_LOGE calls per line instead of combining into
324// one multi-line log message. This ensures each address appears as its own line
325// on the serial console, making it possible to see partial output if the device
326// crashes again during boot, and allowing the CLI's process_stacktrace to match
327// and decode each address individually.
329 if (!s_crash_data_valid)
330 return;
331
332 ESP_LOGE(TAG, "*** CRASH DETECTED ON PREVIOUS BOOT ***");
333 const char *reason = get_exception_reason();
334 if (reason != nullptr) {
335 ESP_LOGE(TAG, " Reason: %s - %s", get_exception_type(), reason);
336 } else {
337 ESP_LOGE(TAG, " Reason: %s", get_exception_type());
338 }
339 ESP_LOGE(TAG, " Crashed core: %d", s_raw_crash_data.crashed_core);
340 ESP_LOGE(TAG, " PC: 0x%08" PRIX32 " (fault location)", s_raw_crash_data.pc);
341 log_backtrace(s_raw_crash_data.backtrace, s_raw_crash_data.backtrace_count, s_raw_crash_data.reg_frame_count);
342
343#if SOC_CPU_CORES_NUM > 1
344 if (s_raw_crash_data.other_backtrace_count > 0) {
345 int other_core = 1 - s_raw_crash_data.crashed_core;
346 ESP_LOGE(TAG, " Other core (%d) backtrace:", other_core);
347 log_backtrace(s_raw_crash_data.other_backtrace, s_raw_crash_data.other_backtrace_count,
348 s_raw_crash_data.other_reg_frame_count);
349 }
350#endif
351
352 // Build addr2line hint with all captured addresses for easy copy-paste
353 char hint[256];
354 int pos = snprintf(hint, sizeof(hint), "Use: addr2line -pfiaC -e firmware.elf 0x%08" PRIX32, s_raw_crash_data.pc);
355 pos = append_addrs_to_hint(hint, sizeof(hint), pos, s_raw_crash_data.backtrace, s_raw_crash_data.backtrace_count,
356 s_raw_crash_data.reg_frame_count);
357#if SOC_CPU_CORES_NUM > 1
358 append_addrs_to_hint(hint, sizeof(hint), pos, s_raw_crash_data.other_backtrace,
359 s_raw_crash_data.other_backtrace_count, s_raw_crash_data.other_reg_frame_count);
360#else
361 (void) pos; // There is no second-core append on single-core targets, so pos would otherwise be unread.
362#endif
363 ESP_LOGE(TAG, "%s", hint);
364}
365
366} // namespace esphome::esp32
367
368// --- Panic handler wrapper ---
369// Intercepts esp_panic_handler() via --wrap linker flag to capture crash data
370// into NOINIT memory before the normal panic handler runs.
371//
372extern "C" {
373// NOLINTBEGIN(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp,readability-identifier-naming)
374// Names are mandated by the --wrap linker mechanism
375extern void __real_esp_panic_handler(panic_info_t *info);
376
377void IRAM_ATTR __wrap_esp_panic_handler(panic_info_t *info) {
378 // Save the faulting PC and exception info
379 s_raw_crash_data.pc = (uint32_t) info->addr;
380 s_raw_crash_data.backtrace_count = 0;
381 s_raw_crash_data.reg_frame_count = 0;
382 s_raw_crash_data.exception = (uint8_t) info->exception;
383 s_raw_crash_data.pseudo_excause = info->pseudo_excause ? 1 : 0;
384 s_raw_crash_data.crashed_core = (uint8_t) info->core;
385#if SOC_CPU_CORES_NUM > 1
386 s_raw_crash_data.other_backtrace_count = 0;
387 s_raw_crash_data.other_reg_frame_count = 0;
388#endif
389
390#if CONFIG_IDF_TARGET_ARCH_XTENSA
391 // Xtensa: walk the backtrace using the public API
392 if (info->frame != nullptr) {
393 auto *xt_frame = (XtExcFrame *) info->frame;
394 s_raw_crash_data.cause = xt_frame->exccause;
395 s_raw_crash_data.backtrace_count = walk_xtensa_backtrace(xt_frame, s_raw_crash_data.backtrace, MAX_BACKTRACE);
396 }
397
398#if SOC_CPU_CORES_NUM > 1
399 // Capture the other core's backtrace from the global frame array.
400 // Both cores save their frames to g_exc_frames[] before esp_panic_handler
401 // is called, so the other core's frame is available here.
402 if (info->core >= 0 && info->core < SOC_CPU_CORES_NUM) {
403 int other_core = 1 - info->core;
404 auto *other_frame = (XtExcFrame *) g_exc_frames[other_core];
405 if (other_frame != nullptr) {
406 s_raw_crash_data.other_backtrace_count =
407 walk_xtensa_backtrace(other_frame, s_raw_crash_data.other_backtrace, MAX_BACKTRACE);
408 }
409 }
410#endif
411
412#elif CONFIG_IDF_TARGET_ARCH_RISCV
413 // RISC-V: capture MEPC + RA, then scan stack for code addresses
414 if (info->frame != nullptr) {
415 auto *rv_frame = (RvExcFrame *) info->frame;
416 s_raw_crash_data.cause = rv_frame->mcause;
417 s_raw_crash_data.backtrace_count =
418 capture_riscv_backtrace(rv_frame, s_raw_crash_data.backtrace, MAX_BACKTRACE, &s_raw_crash_data.reg_frame_count);
419 }
420
421#if SOC_CPU_CORES_NUM > 1
422 // Capture the other core's backtrace from the global frame array.
423 if (info->core >= 0 && info->core < SOC_CPU_CORES_NUM) {
424 int other_core = 1 - info->core;
425 auto *other_frame = (RvExcFrame *) g_exc_frames[other_core];
426 if (other_frame != nullptr) {
427 s_raw_crash_data.other_backtrace_count = capture_riscv_backtrace(
428 other_frame, s_raw_crash_data.other_backtrace, MAX_BACKTRACE, &s_raw_crash_data.other_reg_frame_count);
429 }
430 }
431#endif
432#endif
433
434 // Write version and magic last — ensures all data is written before we mark it valid
435 s_raw_crash_data.version = CRASH_DATA_VERSION;
436 s_raw_crash_data.magic = CRASH_MAGIC;
437
438 // Call the real panic handler (prints to UART, does core dump, reboots, etc.)
440}
441
442// NOLINTEND(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp,readability-identifier-naming)
443} // extern "C"
444
445#endif // USE_ESP32_CRASH_HANDLER
446#endif // USE_ESP32
struct @65::@66 __attribute__
Wake the main loop task from an ISR. ISR-safe.
Definition main_task.h:32
void __real_esp_panic_handler(panic_info_t *info)
void IRAM_ATTR __wrap_esp_panic_handler(panic_info_t *info)
mopeka_std_values val[3]
bool crash_handler_has_data()
Returns true if crash data was found this boot.
void crash_handler_log()
Log crash data if a crash was detected on previous boot.
void crash_handler_read_and_clear()
Read and validate crash data from NOINIT memory.
void crash_handler_clear()
Clear the magic marker and mark crash data as consumed.
uint16_t size
Definition helpers.cpp:25
size_t size_t pos
Definition helpers.h:1038
uint32_t * scan_start
static void uint32_t
uint32_t pc