From 990649d8101952bebb9c3bc11eed62d05e534027 Mon Sep 17 00:00:00 2001 From: James L Date: Wed, 25 Feb 2026 13:48:10 -0500 Subject: [PATCH] Fix heap exhaustion: enable PSRAM allocator + bound all tables + auto-reboot watchdog Root cause: heltec_V4_boundary build was missing -DRNS_USE_TLSF=1 and -DRNS_USE_ALLOCATOR=1 flags, causing ALL C++ new/delete to use internal SRAM (239KB) instead of the PSRAM-backed TLSF pool (~1.6MB). Transport data structures consumed internal heap until WiFi driver could not allocate RX buffers (ESP_ERR_NO_MEM). Changes: - platformio.ini: Add TLSF/allocator flags to heltec_V4_boundary env, re-enable NDEBUG - Transport.cpp: Add periodic culling of _path_requests (was unbounded, grew one entry per unique destination forever). Cull entries older than DESTINATION_TIMEOUT. Also cull _pending_local_path_requests for removed interfaces, and fix missing .erase() (Python .pop() equivalent). - RNode_Firmware.ino: Replace WiFi watchdog halt-serial with auto-reboot. Add heap pressure check (reboot if free heap < 20KB). Increase WiFi grace period from 5s to 15s. Remove orphaned boundary_done label. --- RNode_Firmware.ino | 63 +++++++++++++--------------- lib/microReticulum/src/Transport.cpp | 27 ++++++++++++ platformio.ini | 4 +- 3 files changed, 58 insertions(+), 36 deletions(-) diff --git a/RNode_Firmware.ino b/RNode_Firmware.ino index 8bcec68..ae070db 100755 --- a/RNode_Firmware.ino +++ b/RNode_Firmware.ino @@ -2229,19 +2229,26 @@ void loop() { } #ifdef BOUNDARY_MODE - // ── WiFi disconnect watchdog ────────────────────────────────────────────── - // When WiFi drops, dump diagnostic info and halt serial output so the - // operator can read the last log lines over USB. The device keeps running - // (LoRa repeater still works) but serial is frozen until reboot. + // ── Heap + WiFi watchdog ─────────────────────────────────────────────────── + // Monitor heap and WiFi health. 
Auto-reboot on critical conditions: + // 1) Internal heap drops below 20KB (WiFi needs ~16KB for RX buffers) + // 2) WiFi down for >15s after having been connected (unrecoverable) { static bool _wifi_watchdog_armed = false; // armed once WiFi first connects - static bool _wifi_watchdog_halted = false; // true = serial frozen static uint32_t _wifi_lost_at = 0; // millis() when WiFi first lost - static const uint32_t WIFI_GRACE_MS = 5000; // 5 s grace before halting + static const uint32_t WIFI_GRACE_MS = 15000; // 15s grace before reboot + static const uint32_t HEAP_CRITICAL = 20000; // 20KB minimum internal heap - if (_wifi_watchdog_halted) { - // Frozen — skip all boundary work, just keep LoRa running - goto boundary_done; + // ── Heap pressure check (runs always) ───────────────────────────────── + uint32_t free_heap = ESP.getFreeHeap(); + if (free_heap < HEAP_CRITICAL) { + Serial.printf("\r\n[WATCHDOG] CRITICAL: Free heap %u < %u — REBOOTING\r\n", + free_heap, HEAP_CRITICAL); + Serial.printf("[WATCHDOG] Min free: %u Max alloc: %u\r\n", + ESP.getMinFreeHeap(), ESP.getMaxAllocHeap()); + Serial.flush(); + delay(100); + ESP.restart(); } bool wifi_now = wifi_is_connected(); @@ -2255,41 +2262,28 @@ void loop() { if (_wifi_watchdog_armed && !wifi_now) { if (_wifi_lost_at == 0) { _wifi_lost_at = millis(); - Serial.printf("\r\n[WATCHDOG] WiFi connection LOST at %lu ms — grace period %lu ms\r\n", + Serial.printf("\r\n[WATCHDOG] WiFi lost at %lu ms (grace %lu ms)\r\n", _wifi_lost_at, WIFI_GRACE_MS); - Serial.printf("[WATCHDOG] WiFi.status() = %d RSSI = %d\r\n", - (int)WiFi.status(), (int)WiFi.RSSI()); - Serial.printf("[WATCHDOG] Free heap: %u Min free heap: %u\r\n", - ESP.getFreeHeap(), ESP.getMinFreeHeap()); - Serial.printf("[WATCHDOG] TCP backbone connected: %s clients: %d\r\n", - (tcp_interface_ptr && tcp_interface_ptr->isConnected()) ? "yes" : "no", - tcp_interface_ptr ? 
tcp_interface_ptr->clientCount() : 0); - if (local_tcp_interface_ptr) { - Serial.printf("[WATCHDOG] Local TCP connected: %s clients: %d\r\n", - local_tcp_interface_ptr->isConnected() ? "yes" : "no", - local_tcp_interface_ptr->clientCount()); - } + Serial.printf("[WATCHDOG] WiFi.status()=%d heap=%u min_heap=%u\r\n", + (int)WiFi.status(), free_heap, ESP.getMinFreeHeap()); Serial.flush(); } - // Check if grace period expired + // Check if grace period expired — unrecoverable, reboot if ((millis() - _wifi_lost_at) >= WIFI_GRACE_MS) { - Serial.printf("\r\n[WATCHDOG] *** WiFi still down after %lu ms — HALTING SERIAL OUTPUT ***\r\n", + Serial.printf("\r\n[WATCHDOG] WiFi down %lu ms — REBOOTING\r\n", millis() - _wifi_lost_at); - Serial.printf("[WATCHDOG] WiFi.status() = %d RSSI = %d\r\n", - (int)WiFi.status(), (int)WiFi.RSSI()); - Serial.printf("[WATCHDOG] Last boundary activity: %lu ms ago\r\n", - millis() - boundary_state.last_bridge_activity); - Serial.printf("[WATCHDOG] Packets bridged: LoRa→TCP=%lu TCP→LoRa=%lu\r\n", + Serial.printf("[WATCHDOG] WiFi.status()=%d heap=%u\r\n", + (int)WiFi.status(), ESP.getFreeHeap()); + Serial.printf("[WATCHDOG] Bridged: L→T=%lu T→L=%lu\r\n", boundary_state.packets_bridged_lora_to_tcp, boundary_state.packets_bridged_tcp_to_lora); - Serial.println("[WATCHDOG] Device still running (LoRa repeater active). 
Reboot to resume."); Serial.flush(); - _wifi_watchdog_halted = true; - goto boundary_done; + delay(100); + ESP.restart(); } } else if (_wifi_watchdog_armed && wifi_now && _wifi_lost_at != 0) { - // WiFi came back within grace period - Serial.printf("[WATCHDOG] WiFi reconnected after %lu ms\r\n", millis() - _wifi_lost_at); + // WiFi recovered within grace period + Serial.printf("[WATCHDOG] WiFi back after %lu ms\r\n", millis() - _wifi_lost_at); _wifi_lost_at = 0; } } @@ -2318,7 +2312,6 @@ void loop() { boundary_state.wifi_connected = wifi_is_connected(); } -boundary_done: #endif #endif diff --git a/lib/microReticulum/src/Transport.cpp b/lib/microReticulum/src/Transport.cpp index 668309c..f9e0db6 100755 --- a/lib/microReticulum/src/Transport.cpp +++ b/lib/microReticulum/src/Transport.cpp @@ -571,6 +571,32 @@ static bool is_backbone_interface(const Interface& iface) { } } + // Cull the path requests table (entries older than destination timeout) + { + std::vector<Bytes> stale_path_requests; + for (const auto& [destination_hash, timestamp] : _path_requests) { + if (OS::time() > (timestamp + DESTINATION_TIMEOUT)) { + stale_path_requests.push_back(destination_hash); + } + } + for (const Bytes& destination_hash : stale_path_requests) { + _path_requests.erase(destination_hash); + } + } + + // Cull pending local path requests for interfaces that no longer exist + { + std::vector<Bytes> stale_plpr; + for (const auto& [destination_hash, iface] : _pending_local_path_requests) { + if (_interfaces.count(iface.get_hash()) == 0) { + stale_plpr.push_back(destination_hash); + } + } + for (const Bytes& destination_hash : stale_plpr) { + _pending_local_path_requests.erase(destination_hash); + } + } + // Cull the tunnel table count = 0; std::vector stale_tunnels; @@ -2144,6 +2170,7 @@ static bool is_backbone_interface(const Interface& iface) { if (iter != _pending_local_path_requests.end()) { //p desiring_interface = Transport.pending_local_path_requests.pop(packet.destination_hash) //const
Interface& desiring_interface = (*iter).second; + _pending_local_path_requests.erase(iter); // CBA FIX: pop() equivalent retransmit_timeout = now; retries = PATHFINDER_R; diff --git a/platformio.ini b/platformio.ini index 0f2d1e1..fc6b28e 100755 --- a/platformio.ini +++ b/platformio.ini @@ -342,7 +342,9 @@ build_flags = -DARDUINO_USB_CDC_ON_BOOT=1 -DBOARD_HAS_PSRAM=1 -DBOUNDARY_MODE - ;-DNDEBUG ; re-enable debug output for WiFi disconnect investigation + -DNDEBUG + -DRNS_USE_TLSF=1 + -DRNS_USE_ALLOCATOR=1 ; --- Boundary mode defaults (override via EEPROM at runtime) --- ; TCP server mode (0=server, 1=client) -DBOUNDARY_TCP_MODE=0