From f4dcf4df44e20776b310f4b27a3233676d49951b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=B3r=C3=A1nt=20Pint=C3=A9r?= Date: Fri, 13 Dec 2024 17:53:52 +0100 Subject: [PATCH 1/9] Always put core dump on flash --- sdkconfig.debug.defaults | 5 ----- sdkconfig.defaults | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/sdkconfig.debug.defaults b/sdkconfig.debug.defaults index 3509855e..bcdec2ad 100644 --- a/sdkconfig.debug.defaults +++ b/sdkconfig.debug.defaults @@ -1,6 +1 @@ CONFIG_LOG_MAXIMUM_LEVEL_VERBOSE=y - -CONFIG_ESP_COREDUMP_ENABLE_TO_FLASH=n -CONFIG_ESP_COREDUMP_ENABLE_TO_UART=y -CONFIG_ESP_COREDUMP_DECODE=y -CONFIG_ESP_COREDUMP_LOGS=y diff --git a/sdkconfig.defaults b/sdkconfig.defaults index e259a7b4..a574dc11 100644 --- a/sdkconfig.defaults +++ b/sdkconfig.defaults @@ -88,4 +88,4 @@ CONFIG_PM_LIGHT_SLEEP_CALLBACKS=y CONFIG_ESP_COREDUMP_ENABLE_TO_FLASH=y CONFIG_ESP_COREDUMP_DATA_FORMAT_ELF=y CONFIG_ESP_COREDUMP_CHECKSUM_SHA256=y -CONFIG_ESP_COREDUMP_LOGS=n +CONFIG_ESP_COREDUMP_DECODE_INFO=y From cd73022552deffb2a1f24ba899b891ea86a1defd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=B3r=C3=A1nt=20Pint=C3=A9r?= Date: Fri, 13 Dec 2024 17:54:16 +0100 Subject: [PATCH 2/9] Some debug optimizations --- sdkconfig.debug.defaults | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sdkconfig.debug.defaults b/sdkconfig.debug.defaults index bcdec2ad..8343ecf4 100644 --- a/sdkconfig.debug.defaults +++ b/sdkconfig.debug.defaults @@ -1 +1,7 @@ CONFIG_LOG_MAXIMUM_LEVEL_VERBOSE=y + +CONFIG_ESP_SYSTEM_PANIC_GDBSTUB=y +CONFIG_ESP_DEBUG_OCDAWARE=y + +CONFIG_COMPILER_OPTIMIZATION_SIZE=n +CONFIG_COMPILER_OPTIMIZATION_DEBUG=y From aeb0d83833c62edd2dfecffeb8ef70184a48d3f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=B3r=C3=A1nt=20Pint=C3=A9r?= Date: Fri, 13 Dec 2024 19:23:13 +0100 Subject: [PATCH 3/9] No core dump logs in production --- sdkconfig.debug.defaults | 2 ++ sdkconfig.defaults | 1 + 2 files changed, 3 insertions(+) diff --git a/sdkconfig.debug.defaults b/sdkconfig.debug.defaults index 8343ecf4..98899899 100644 --- a/sdkconfig.debug.defaults +++ b/sdkconfig.debug.defaults @@ -1,5 +1,7 @@ CONFIG_LOG_MAXIMUM_LEVEL_VERBOSE=y +CONFIG_ESP_COREDUMP_LOGS=y + CONFIG_ESP_SYSTEM_PANIC_GDBSTUB=y CONFIG_ESP_DEBUG_OCDAWARE=y diff --git a/sdkconfig.defaults b/sdkconfig.defaults index a574dc11..81329f4e 100644 --- a/sdkconfig.defaults +++ b/sdkconfig.defaults @@ -89,3 +89,4 @@ CONFIG_ESP_COREDUMP_ENABLE_TO_FLASH=y CONFIG_ESP_COREDUMP_DATA_FORMAT_ELF=y CONFIG_ESP_COREDUMP_CHECKSUM_SHA256=y CONFIG_ESP_COREDUMP_DECODE_INFO=y +CONFIG_ESP_COREDUMP_LOGS=n From be54cdcd78605af09add9802cdb592fa2f0df84c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=B3r=C3=A1nt=20Pint=C3=A9r?= Date: Fri, 13 Dec 2024 21:49:46 +0100 Subject: [PATCH 4/9] Give a bit more space to core dumps Now we have 96 kB for data and 96 kB for core dumps. --- partitions.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/partitions.csv b/partitions.csv index 03a2a5c6..adce8e2c 100644 --- a/partitions.csv +++ b/partitions.csv @@ -3,5 +3,5 @@ nvs, data, nvs, 0x9000, 0x5000, otadata, data, ota, 0xe000, 0x2000, app0, app, ota_0, 0x10000, 0x1E0000, app1, app, ota_1, 0x1F0000,0x1E0000, -data, data, spiffs, 0x3D0000,0x20000, -coredump, data, coredump,0x3F0000,0x10000, +data, data, spiffs, 0x3D0000,0x18000, +coredump, data, coredump,0x3E8000,0x18000, From 1b7f676b66d8afe8f7d3b04f851d0aab1ee6f6c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=B3r=C3=A1nt=20Pint=C3=A9r?= Date: Fri, 13 Dec 2024 21:49:58 +0100 Subject: [PATCH 5/9] Do not report core dump logs even in debug mode --- sdkconfig.debug.defaults | 2 -- 1 file changed, 2 deletions(-) diff --git a/sdkconfig.debug.defaults b/sdkconfig.debug.defaults index 98899899..8343ecf4 100644 --- a/sdkconfig.debug.defaults +++ b/sdkconfig.debug.defaults @@ -1,7 +1,5 @@ CONFIG_LOG_MAXIMUM_LEVEL_VERBOSE=y -CONFIG_ESP_COREDUMP_LOGS=y - CONFIG_ESP_SYSTEM_PANIC_GDBSTUB=y CONFIG_ESP_DEBUG_OCDAWARE=y From 0d84c0ea7e86480cd93ff1fd6df8156597126fd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=B3r=C3=A1nt=20Pint=C3=A9r?= Date: Fri, 13 Dec 2024 21:50:29 +0100 Subject: [PATCH 6/9] Report panic location and some more metadata at next init --- main/devices/Device.hpp | 67 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/main/devices/Device.hpp b/main/devices/Device.hpp index 957552e8..58da8d88 100644 --- a/main/devices/Device.hpp +++ b/main/devices/Device.hpp @@ -5,10 +5,11 @@ #include -#include "esp_netif.h" -#include "esp_wifi.h" +#include +#include #include #include +#include #include @@ -495,6 +496,7 @@ class Device { json["state"] = static_cast(initState); json["peripherals"].to().set(peripheralsInitJson); json["sleepWhenIdle"] = kernel.powerManager.sleepWhenIdle; + addCoreDumpInfo(json); }, Retention::NoRetain, QoS::AtLeastOnce, 5s); @@ -542,6 +544,67 @@ class Device { } } + void addCoreDumpInfo(JsonObject& json) { + esp_err_t errCheck = esp_core_dump_image_check(); + if (errCheck == ESP_ERR_NOT_FOUND) { + LOGV("No core dump found"); + return; + } + + auto coreDumpJson = json["coreDump"].to(); + + if (errCheck != ESP_OK) { + LOGE("Failed to check for core dump: %s", esp_err_to_name(errCheck)); + coreDumpJson["error"] = esp_err_to_name(errCheck); + return; + } + + esp_core_dump_summary_t summary; + esp_err_t err = esp_core_dump_get_summary(&summary); + if (err != ESP_OK) { + LOGE("Failed to get core dump summary: %s", esp_err_to_name(err)); + coreDumpJson["error"] = esp_err_to_name(err); + return; + } + + auto excCause = +#if __XTENSA__ + summary.ex_info.exc_cause; +#else + summary.ex_info.mcause; +#endif + + LOGW("Core dump found: task: %s, cause: %ld", + summary.exc_task, excCause); + + coreDumpJson["version"] = summary.core_dump_version; + coreDumpJson["sha256"] = String((const char*) summary.app_elf_sha256, CONFIG_APP_RETRIEVE_LEN_ELF_SHA); + coreDumpJson["task"] = summary.exc_task; + coreDumpJson["cause"] = excCause; + + static constexpr size_t PANIC_REASON_SIZE = 256; + char panicReason[PANIC_REASON_SIZE]; + err = esp_core_dump_get_panic_reason(panicReason, PANIC_REASON_SIZE); + if (err == ESP_OK) { + LOGW("Panic reason: %s", panicReason); + coreDumpJson["panicReason"] = panicReason; + } + + auto backtraceJson = coreDumpJson["backtrace"].to(); + if (summary.exc_bt_info.corrupted) { + LOGE("Backtrace corrupted, depth %lu", summary.exc_bt_info.depth); + backtraceJson["corrupted"] = true; + } else { + auto framesJson = backtraceJson["frames"].to(); + for (int i = 0; i < summary.exc_bt_info.depth; i++) { + auto& frame = summary.exc_bt_info.bt[i]; + framesJson.add("0x" + String(frame, HEX)); + } + } + + ESP_ERROR_CHECK(esp_core_dump_image_erase()); + } + Queue logRecords { "logs", 32 }; ConfiguredKernel configuredKernel { logRecords }; Kernel& kernel = configuredKernel.kernel; From 18dde0ce60efdde3b78a84bf147146682f7cebf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=B3r=C3=A1nt=20Pint=C3=A9r?= Date: Fri, 13 Dec 2024 22:07:50 +0100 Subject: [PATCH 7/9] Report to separate 'crash' MQTT endpoint --- main/devices/Device.hpp | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/main/devices/Device.hpp b/main/devices/Device.hpp index 58da8d88..1b03eb38 100644 --- a/main/devices/Device.hpp +++ b/main/devices/Device.hpp @@ -496,10 +496,11 @@ class Device { json["state"] = static_cast(initState); json["peripherals"].to().set(peripheralsInitJson); json["sleepWhenIdle"] = kernel.powerManager.sleepWhenIdle; - addCoreDumpInfo(json); }, Retention::NoRetain, QoS::AtLeastOnce, 5s); + reportPreviousCrashIfAny(); + Task::loop("telemetry", 8192, [this](Task& task) { publishTelemetry(); // TODO Configure these telemetry intervals @@ -544,18 +545,14 @@ class Device { } } - void addCoreDumpInfo(JsonObject& json) { + void reportPreviousCrashIfAny() { esp_err_t errCheck = esp_core_dump_image_check(); if (errCheck == ESP_ERR_NOT_FOUND) { LOGV("No core dump found"); return; } - - auto coreDumpJson = json["coreDump"].to(); - if (errCheck != ESP_OK) { LOGE("Failed to check for core dump: %s", esp_err_to_name(errCheck)); - coreDumpJson["error"] = esp_err_to_name(errCheck); return; } @@ -563,10 +560,19 @@ class Device { esp_err_t err = esp_core_dump_get_summary(&summary); if (err != ESP_OK) { LOGE("Failed to get core dump summary: %s", esp_err_to_name(err)); - coreDumpJson["error"] = esp_err_to_name(err); - return; + } else { + mqttDeviceRoot->publish( + "crash", + [&](JsonObject& json) { + reportPreviousCrash(json, summary); + }, + Retention::NoRetain, QoS::AtLeastOnce, 5s); } + ESP_ERROR_CHECK_WITHOUT_ABORT(esp_core_dump_image_erase()); + } + + void reportPreviousCrash(JsonObject& json, const esp_core_dump_summary_t& summary) { auto excCause = #if __XTENSA__ summary.ex_info.exc_cause; @@ -577,20 +583,19 @@ class Device { LOGW("Core dump found: task: %s, cause: %ld", summary.exc_task, excCause); - coreDumpJson["version"] = summary.core_dump_version; - coreDumpJson["sha256"] = String((const char*) summary.app_elf_sha256, CONFIG_APP_RETRIEVE_LEN_ELF_SHA); - coreDumpJson["task"] = summary.exc_task; - coreDumpJson["cause"] = excCause; + json["version"] = summary.core_dump_version; + json["sha256"] = String((const char*) summary.app_elf_sha256, CONFIG_APP_RETRIEVE_LEN_ELF_SHA); + json["task"] = summary.exc_task; + json["cause"] = excCause; static constexpr size_t PANIC_REASON_SIZE = 256; char panicReason[PANIC_REASON_SIZE]; - err = esp_core_dump_get_panic_reason(panicReason, PANIC_REASON_SIZE); - if (err == ESP_OK) { + if (esp_core_dump_get_panic_reason(panicReason, PANIC_REASON_SIZE) == ESP_OK) { LOGW("Panic reason: %s", panicReason); - coreDumpJson["panicReason"] = panicReason; + json["panicReason"] = panicReason; } - auto backtraceJson = coreDumpJson["backtrace"].to(); + auto backtraceJson = json["backtrace"].to(); if (summary.exc_bt_info.corrupted) { LOGE("Backtrace corrupted, depth %lu", summary.exc_bt_info.depth); backtraceJson["corrupted"] = true; @@ -601,8 +606,6 @@ class Device { framesJson.add("0x" + String(frame, HEX)); } } - - ESP_ERROR_CHECK(esp_core_dump_image_erase()); } Queue logRecords { "logs", 32 }; From f301ad886038cd2befa517d47fc9dfcc0cf1092b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=B3r=C3=A1nt=20Pint=C3=A9r?= Date: Fri, 13 Dec 2024 22:10:23 +0100 Subject: [PATCH 8/9] Retain the crash report --- main/devices/Device.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/devices/Device.hpp b/main/devices/Device.hpp index 1b03eb38..ace90e8a 100644 --- a/main/devices/Device.hpp +++ b/main/devices/Device.hpp @@ -566,7 +566,7 @@ class Device { [&](JsonObject& json) { reportPreviousCrash(json, summary); }, - Retention::NoRetain, QoS::AtLeastOnce, 5s); + Retention::Retain, QoS::AtLeastOnce, 5s); } ESP_ERROR_CHECK_WITHOUT_ABORT(esp_core_dump_image_erase()); From 3c5a5719aa2fd2344cfe1c5bfc49ba9445f24dc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=B3r=C3=A1nt=20Pint=C3=A9r?= Date: Fri, 13 Dec 2024 22:33:43 +0100 Subject: [PATCH 9/9] Let's report crashes as part of the `init` message after all --- main/devices/Device.hpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/main/devices/Device.hpp b/main/devices/Device.hpp index ace90e8a..f6fe6f03 100644 --- a/main/devices/Device.hpp +++ b/main/devices/Device.hpp @@ -496,11 +496,11 @@ class Device { json["state"] = static_cast(initState); json["peripherals"].to().set(peripheralsInitJson); json["sleepWhenIdle"] = kernel.powerManager.sleepWhenIdle; + + reportPreviousCrashIfAny(json); }, Retention::NoRetain, QoS::AtLeastOnce, 5s); - reportPreviousCrashIfAny(); - Task::loop("telemetry", 8192, [this](Task& task) { publishTelemetry(); // TODO Configure these telemetry intervals @@ -545,7 +545,7 @@ class Device { } } - void reportPreviousCrashIfAny() { + void reportPreviousCrashIfAny(JsonObject& json) { esp_err_t errCheck = esp_core_dump_image_check(); if (errCheck == ESP_ERR_NOT_FOUND) { LOGV("No core dump found"); @@ -561,12 +561,8 @@ class Device { if (err != ESP_OK) { LOGE("Failed to get core dump summary: %s", esp_err_to_name(err)); } else { - mqttDeviceRoot->publish( - "crash", - [&](JsonObject& json) { - reportPreviousCrash(json, summary); - }, - Retention::Retain, QoS::AtLeastOnce, 5s); + auto crashJson = json["crash"].to(); + reportPreviousCrash(crashJson, summary); } ESP_ERROR_CHECK_WITHOUT_ABORT(esp_core_dump_image_erase());