Fix heap exhaustion: enable PSRAM allocator + bound all tables + auto-reboot watchdog
Root cause: the heltec_V4_boundary build was missing the -DRNS_USE_TLSF=1 and -DRNS_USE_ALLOCATOR=1 flags, causing ALL C++ new/delete allocations to use internal SRAM (239KB) instead of the PSRAM-backed TLSF pool (~1.6MB). Transport data structures consumed the internal heap until the WiFi driver could no longer allocate RX buffers (ESP_ERR_NO_MEM).

Changes:
- platformio.ini: Add the TLSF/allocator flags to the heltec_V4_boundary env and re-enable NDEBUG.
- Transport.cpp: Add periodic culling of _path_requests (it was unbounded and grew by one entry per unique destination forever); entries older than DESTINATION_TIMEOUT are culled. Also cull _pending_local_path_requests entries for removed interfaces, and add the missing .erase() (the equivalent of Python's .pop()).
- RNode_Firmware.ino: Replace the WiFi watchdog's halt-serial behavior with an auto-reboot. Add a heap pressure check (reboot if free heap < 20KB). Increase the WiFi grace period from 5s to 15s. Remove the orphaned boundary_done label.
This commit is contained in:
@@ -2229,19 +2229,26 @@ void loop() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifdef BOUNDARY_MODE
|
#ifdef BOUNDARY_MODE
|
||||||
// ── WiFi disconnect watchdog ──────────────────────────────────────────────
|
// ── Heap + WiFi watchdog ───────────────────────────────────────────────────
|
||||||
// When WiFi drops, dump diagnostic info and halt serial output so the
|
// Monitor heap and WiFi health. Auto-reboot on critical conditions:
|
||||||
// operator can read the last log lines over USB. The device keeps running
|
// 1) Internal heap drops below 20KB (WiFi needs ~16KB for RX buffers)
|
||||||
// (LoRa repeater still works) but serial is frozen until reboot.
|
// 2) WiFi down for >15s after having been connected (unrecoverable)
|
||||||
{
|
{
|
||||||
static bool _wifi_watchdog_armed = false; // armed once WiFi first connects
|
static bool _wifi_watchdog_armed = false; // armed once WiFi first connects
|
||||||
static bool _wifi_watchdog_halted = false; // true = serial frozen
|
|
||||||
static uint32_t _wifi_lost_at = 0; // millis() when WiFi first lost
|
static uint32_t _wifi_lost_at = 0; // millis() when WiFi first lost
|
||||||
static const uint32_t WIFI_GRACE_MS = 5000; // 5 s grace before halting
|
static const uint32_t WIFI_GRACE_MS = 15000; // 15s grace before reboot
|
||||||
|
static const uint32_t HEAP_CRITICAL = 20000; // 20KB minimum internal heap
|
||||||
|
|
||||||
if (_wifi_watchdog_halted) {
|
// ── Heap pressure check (runs always) ─────────────────────────────────
|
||||||
// Frozen — skip all boundary work, just keep LoRa running
|
uint32_t free_heap = ESP.getFreeHeap();
|
||||||
goto boundary_done;
|
if (free_heap < HEAP_CRITICAL) {
|
||||||
|
Serial.printf("\r\n[WATCHDOG] CRITICAL: Free heap %u < %u — REBOOTING\r\n",
|
||||||
|
free_heap, HEAP_CRITICAL);
|
||||||
|
Serial.printf("[WATCHDOG] Min free: %u Max alloc: %u\r\n",
|
||||||
|
ESP.getMinFreeHeap(), ESP.getMaxAllocHeap());
|
||||||
|
Serial.flush();
|
||||||
|
delay(100);
|
||||||
|
ESP.restart();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool wifi_now = wifi_is_connected();
|
bool wifi_now = wifi_is_connected();
|
||||||
@@ -2255,41 +2262,28 @@ void loop() {
|
|||||||
if (_wifi_watchdog_armed && !wifi_now) {
|
if (_wifi_watchdog_armed && !wifi_now) {
|
||||||
if (_wifi_lost_at == 0) {
|
if (_wifi_lost_at == 0) {
|
||||||
_wifi_lost_at = millis();
|
_wifi_lost_at = millis();
|
||||||
Serial.printf("\r\n[WATCHDOG] WiFi connection LOST at %lu ms — grace period %lu ms\r\n",
|
Serial.printf("\r\n[WATCHDOG] WiFi lost at %lu ms (grace %lu ms)\r\n",
|
||||||
_wifi_lost_at, WIFI_GRACE_MS);
|
_wifi_lost_at, WIFI_GRACE_MS);
|
||||||
Serial.printf("[WATCHDOG] WiFi.status() = %d RSSI = %d\r\n",
|
Serial.printf("[WATCHDOG] WiFi.status()=%d heap=%u min_heap=%u\r\n",
|
||||||
(int)WiFi.status(), (int)WiFi.RSSI());
|
(int)WiFi.status(), free_heap, ESP.getMinFreeHeap());
|
||||||
Serial.printf("[WATCHDOG] Free heap: %u Min free heap: %u\r\n",
|
|
||||||
ESP.getFreeHeap(), ESP.getMinFreeHeap());
|
|
||||||
Serial.printf("[WATCHDOG] TCP backbone connected: %s clients: %d\r\n",
|
|
||||||
(tcp_interface_ptr && tcp_interface_ptr->isConnected()) ? "yes" : "no",
|
|
||||||
tcp_interface_ptr ? tcp_interface_ptr->clientCount() : 0);
|
|
||||||
if (local_tcp_interface_ptr) {
|
|
||||||
Serial.printf("[WATCHDOG] Local TCP connected: %s clients: %d\r\n",
|
|
||||||
local_tcp_interface_ptr->isConnected() ? "yes" : "no",
|
|
||||||
local_tcp_interface_ptr->clientCount());
|
|
||||||
}
|
|
||||||
Serial.flush();
|
Serial.flush();
|
||||||
}
|
}
|
||||||
// Check if grace period expired
|
// Check if grace period expired — unrecoverable, reboot
|
||||||
if ((millis() - _wifi_lost_at) >= WIFI_GRACE_MS) {
|
if ((millis() - _wifi_lost_at) >= WIFI_GRACE_MS) {
|
||||||
Serial.printf("\r\n[WATCHDOG] *** WiFi still down after %lu ms — HALTING SERIAL OUTPUT ***\r\n",
|
Serial.printf("\r\n[WATCHDOG] WiFi down %lu ms — REBOOTING\r\n",
|
||||||
millis() - _wifi_lost_at);
|
millis() - _wifi_lost_at);
|
||||||
Serial.printf("[WATCHDOG] WiFi.status() = %d RSSI = %d\r\n",
|
Serial.printf("[WATCHDOG] WiFi.status()=%d heap=%u\r\n",
|
||||||
(int)WiFi.status(), (int)WiFi.RSSI());
|
(int)WiFi.status(), ESP.getFreeHeap());
|
||||||
Serial.printf("[WATCHDOG] Last boundary activity: %lu ms ago\r\n",
|
Serial.printf("[WATCHDOG] Bridged: L→T=%lu T→L=%lu\r\n",
|
||||||
millis() - boundary_state.last_bridge_activity);
|
|
||||||
Serial.printf("[WATCHDOG] Packets bridged: LoRa→TCP=%lu TCP→LoRa=%lu\r\n",
|
|
||||||
boundary_state.packets_bridged_lora_to_tcp,
|
boundary_state.packets_bridged_lora_to_tcp,
|
||||||
boundary_state.packets_bridged_tcp_to_lora);
|
boundary_state.packets_bridged_tcp_to_lora);
|
||||||
Serial.println("[WATCHDOG] Device still running (LoRa repeater active). Reboot to resume.");
|
|
||||||
Serial.flush();
|
Serial.flush();
|
||||||
_wifi_watchdog_halted = true;
|
delay(100);
|
||||||
goto boundary_done;
|
ESP.restart();
|
||||||
}
|
}
|
||||||
} else if (_wifi_watchdog_armed && wifi_now && _wifi_lost_at != 0) {
|
} else if (_wifi_watchdog_armed && wifi_now && _wifi_lost_at != 0) {
|
||||||
// WiFi came back within grace period
|
// WiFi recovered within grace period
|
||||||
Serial.printf("[WATCHDOG] WiFi reconnected after %lu ms\r\n", millis() - _wifi_lost_at);
|
Serial.printf("[WATCHDOG] WiFi back after %lu ms\r\n", millis() - _wifi_lost_at);
|
||||||
_wifi_lost_at = 0;
|
_wifi_lost_at = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2318,7 +2312,6 @@ void loop() {
|
|||||||
boundary_state.wifi_connected = wifi_is_connected();
|
boundary_state.wifi_connected = wifi_is_connected();
|
||||||
}
|
}
|
||||||
|
|
||||||
boundary_done:
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -571,6 +571,32 @@ static bool is_backbone_interface(const Interface& iface) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Cull the path requests table (entries older than destination timeout)
|
||||||
|
{
|
||||||
|
std::vector<Bytes> stale_path_requests;
|
||||||
|
for (const auto& [destination_hash, timestamp] : _path_requests) {
|
||||||
|
if (OS::time() > (timestamp + DESTINATION_TIMEOUT)) {
|
||||||
|
stale_path_requests.push_back(destination_hash);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (const Bytes& destination_hash : stale_path_requests) {
|
||||||
|
_path_requests.erase(destination_hash);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cull pending local path requests for interfaces that no longer exist
|
||||||
|
{
|
||||||
|
std::vector<Bytes> stale_plpr;
|
||||||
|
for (const auto& [destination_hash, iface] : _pending_local_path_requests) {
|
||||||
|
if (_interfaces.count(iface.get_hash()) == 0) {
|
||||||
|
stale_plpr.push_back(destination_hash);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (const Bytes& destination_hash : stale_plpr) {
|
||||||
|
_pending_local_path_requests.erase(destination_hash);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Cull the tunnel table
|
// Cull the tunnel table
|
||||||
count = 0;
|
count = 0;
|
||||||
std::vector<Bytes> stale_tunnels;
|
std::vector<Bytes> stale_tunnels;
|
||||||
@@ -2144,6 +2170,7 @@ static bool is_backbone_interface(const Interface& iface) {
|
|||||||
if (iter != _pending_local_path_requests.end()) {
|
if (iter != _pending_local_path_requests.end()) {
|
||||||
//p desiring_interface = Transport.pending_local_path_requests.pop(packet.destination_hash)
|
//p desiring_interface = Transport.pending_local_path_requests.pop(packet.destination_hash)
|
||||||
//const Interface& desiring_interface = (*iter).second;
|
//const Interface& desiring_interface = (*iter).second;
|
||||||
|
_pending_local_path_requests.erase(iter); // CBA FIX: pop() equivalent
|
||||||
retransmit_timeout = now;
|
retransmit_timeout = now;
|
||||||
retries = PATHFINDER_R;
|
retries = PATHFINDER_R;
|
||||||
|
|
||||||
|
|||||||
@@ -342,7 +342,9 @@ build_flags =
|
|||||||
-DARDUINO_USB_CDC_ON_BOOT=1
|
-DARDUINO_USB_CDC_ON_BOOT=1
|
||||||
-DBOARD_HAS_PSRAM=1
|
-DBOARD_HAS_PSRAM=1
|
||||||
-DBOUNDARY_MODE
|
-DBOUNDARY_MODE
|
||||||
;-DNDEBUG ; re-enable debug output for WiFi disconnect investigation
|
-DNDEBUG
|
||||||
|
-DRNS_USE_TLSF=1
|
||||||
|
-DRNS_USE_ALLOCATOR=1
|
||||||
; --- Boundary mode defaults (override via EEPROM at runtime) ---
|
; --- Boundary mode defaults (override via EEPROM at runtime) ---
|
||||||
; TCP server mode (0=server, 1=client)
|
; TCP server mode (0=server, 1=client)
|
||||||
-DBOUNDARY_TCP_MODE=0
|
-DBOUNDARY_TCP_MODE=0
|
||||||
|
|||||||
Reference in New Issue
Block a user