Fix heap exhaustion: enable PSRAM allocator + bound all tables + auto-reboot watchdog

Root cause: the heltec_V4_boundary build was missing the -DRNS_USE_TLSF=1
and -DRNS_USE_ALLOCATOR=1 flags, causing ALL C++ new/delete allocations to
use internal SRAM (239KB) instead of the PSRAM-backed TLSF pool (~1.6MB).
Transport data structures consumed the internal heap until the WiFi driver
could no longer allocate RX buffers (ESP_ERR_NO_MEM).

Changes:
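- platformio.ini: Add the TLSF/allocator flags to the heltec_V4_boundary
  env, and re-enable NDEBUG (it had been commented out to get debug output
  during the WiFi disconnect investigation).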
- Transport.cpp: Add periodic culling of _path_requests (it was unbounded
  and grew by one entry per unique destination forever); entries older
  than DESTINATION_TIMEOUT are now removed. Also cull
  _pending_local_path_requests entries whose interface no longer exists,
  and add the missing .erase() where a pending local path request is
  consumed (the equivalent of Python's .pop()).
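- RNode_Firmware.ino: Replace the WiFi watchdog's halt-serial behavior
  with an auto-reboot. Add a heap pressure check (reboot if free heap
  drops below 20KB). Increase the WiFi grace period from 5s to 15s.
  Remove the boundary_done label, which is orphaned now that the
  goto statements that jumped to it are gone.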
This commit is contained in:
James L
2026-02-25 13:48:10 -05:00
parent 8ee8e86563
commit 990649d810
3 changed files with 58 additions and 36 deletions

View File

@@ -2229,19 +2229,26 @@ void loop() {
} }
#ifdef BOUNDARY_MODE #ifdef BOUNDARY_MODE
// ── WiFi disconnect watchdog ────────────────────────────────────────────── // ── Heap + WiFi watchdog ───────────────────────────────────────────────────
// When WiFi drops, dump diagnostic info and halt serial output so the // Monitor heap and WiFi health. Auto-reboot on critical conditions:
// operator can read the last log lines over USB. The device keeps running // 1) Internal heap drops below 20KB (WiFi needs ~16KB for RX buffers)
// (LoRa repeater still works) but serial is frozen until reboot. // 2) WiFi down for >15s after having been connected (unrecoverable)
{ {
static bool _wifi_watchdog_armed = false; // armed once WiFi first connects static bool _wifi_watchdog_armed = false; // armed once WiFi first connects
static bool _wifi_watchdog_halted = false; // true = serial frozen
static uint32_t _wifi_lost_at = 0; // millis() when WiFi first lost static uint32_t _wifi_lost_at = 0; // millis() when WiFi first lost
static const uint32_t WIFI_GRACE_MS = 5000; // 5 s grace before halting static const uint32_t WIFI_GRACE_MS = 15000; // 15s grace before reboot
static const uint32_t HEAP_CRITICAL = 20000; // 20KB minimum internal heap
if (_wifi_watchdog_halted) { // ── Heap pressure check (runs always) ─────────────────────────────────
// Frozen — skip all boundary work, just keep LoRa running uint32_t free_heap = ESP.getFreeHeap();
goto boundary_done; if (free_heap < HEAP_CRITICAL) {
Serial.printf("\r\n[WATCHDOG] CRITICAL: Free heap %u < %u — REBOOTING\r\n",
free_heap, HEAP_CRITICAL);
Serial.printf("[WATCHDOG] Min free: %u Max alloc: %u\r\n",
ESP.getMinFreeHeap(), ESP.getMaxAllocHeap());
Serial.flush();
delay(100);
ESP.restart();
} }
bool wifi_now = wifi_is_connected(); bool wifi_now = wifi_is_connected();
@@ -2255,41 +2262,28 @@ void loop() {
if (_wifi_watchdog_armed && !wifi_now) { if (_wifi_watchdog_armed && !wifi_now) {
if (_wifi_lost_at == 0) { if (_wifi_lost_at == 0) {
_wifi_lost_at = millis(); _wifi_lost_at = millis();
Serial.printf("\r\n[WATCHDOG] WiFi connection LOST at %lu ms grace period %lu ms\r\n", Serial.printf("\r\n[WATCHDOG] WiFi lost at %lu ms (grace %lu ms)\r\n",
_wifi_lost_at, WIFI_GRACE_MS); _wifi_lost_at, WIFI_GRACE_MS);
Serial.printf("[WATCHDOG] WiFi.status() = %d RSSI = %d\r\n", Serial.printf("[WATCHDOG] WiFi.status()=%d heap=%u min_heap=%u\r\n",
(int)WiFi.status(), (int)WiFi.RSSI()); (int)WiFi.status(), free_heap, ESP.getMinFreeHeap());
Serial.printf("[WATCHDOG] Free heap: %u Min free heap: %u\r\n",
ESP.getFreeHeap(), ESP.getMinFreeHeap());
Serial.printf("[WATCHDOG] TCP backbone connected: %s clients: %d\r\n",
(tcp_interface_ptr && tcp_interface_ptr->isConnected()) ? "yes" : "no",
tcp_interface_ptr ? tcp_interface_ptr->clientCount() : 0);
if (local_tcp_interface_ptr) {
Serial.printf("[WATCHDOG] Local TCP connected: %s clients: %d\r\n",
local_tcp_interface_ptr->isConnected() ? "yes" : "no",
local_tcp_interface_ptr->clientCount());
}
Serial.flush(); Serial.flush();
} }
// Check if grace period expired // Check if grace period expired — unrecoverable, reboot
if ((millis() - _wifi_lost_at) >= WIFI_GRACE_MS) { if ((millis() - _wifi_lost_at) >= WIFI_GRACE_MS) {
Serial.printf("\r\n[WATCHDOG] *** WiFi still down after %lu ms — HALTING SERIAL OUTPUT ***\r\n", Serial.printf("\r\n[WATCHDOG] WiFi down %lu ms — REBOOTING\r\n",
millis() - _wifi_lost_at); millis() - _wifi_lost_at);
Serial.printf("[WATCHDOG] WiFi.status() = %d RSSI = %d\r\n", Serial.printf("[WATCHDOG] WiFi.status()=%d heap=%u\r\n",
(int)WiFi.status(), (int)WiFi.RSSI()); (int)WiFi.status(), ESP.getFreeHeap());
Serial.printf("[WATCHDOG] Last boundary activity: %lu ms ago\r\n", Serial.printf("[WATCHDOG] Bridged: L→T=%lu T→L=%lu\r\n",
millis() - boundary_state.last_bridge_activity);
Serial.printf("[WATCHDOG] Packets bridged: LoRa→TCP=%lu TCP→LoRa=%lu\r\n",
boundary_state.packets_bridged_lora_to_tcp, boundary_state.packets_bridged_lora_to_tcp,
boundary_state.packets_bridged_tcp_to_lora); boundary_state.packets_bridged_tcp_to_lora);
Serial.println("[WATCHDOG] Device still running (LoRa repeater active). Reboot to resume.");
Serial.flush(); Serial.flush();
_wifi_watchdog_halted = true; delay(100);
goto boundary_done; ESP.restart();
} }
} else if (_wifi_watchdog_armed && wifi_now && _wifi_lost_at != 0) { } else if (_wifi_watchdog_armed && wifi_now && _wifi_lost_at != 0) {
// WiFi came back within grace period // WiFi recovered within grace period
Serial.printf("[WATCHDOG] WiFi reconnected after %lu ms\r\n", millis() - _wifi_lost_at); Serial.printf("[WATCHDOG] WiFi back after %lu ms\r\n", millis() - _wifi_lost_at);
_wifi_lost_at = 0; _wifi_lost_at = 0;
} }
} }
@@ -2318,7 +2312,6 @@ void loop() {
boundary_state.wifi_connected = wifi_is_connected(); boundary_state.wifi_connected = wifi_is_connected();
} }
boundary_done:
#endif #endif
#endif #endif

View File

@@ -571,6 +571,32 @@ static bool is_backbone_interface(const Interface& iface) {
} }
} }
// Cull the path requests table (entries older than destination timeout)
{
std::vector<Bytes> stale_path_requests;
for (const auto& [destination_hash, timestamp] : _path_requests) {
if (OS::time() > (timestamp + DESTINATION_TIMEOUT)) {
stale_path_requests.push_back(destination_hash);
}
}
for (const Bytes& destination_hash : stale_path_requests) {
_path_requests.erase(destination_hash);
}
}
// Cull pending local path requests for interfaces that no longer exist
{
std::vector<Bytes> stale_plpr;
for (const auto& [destination_hash, iface] : _pending_local_path_requests) {
if (_interfaces.count(iface.get_hash()) == 0) {
stale_plpr.push_back(destination_hash);
}
}
for (const Bytes& destination_hash : stale_plpr) {
_pending_local_path_requests.erase(destination_hash);
}
}
// Cull the tunnel table // Cull the tunnel table
count = 0; count = 0;
std::vector<Bytes> stale_tunnels; std::vector<Bytes> stale_tunnels;
@@ -2144,6 +2170,7 @@ static bool is_backbone_interface(const Interface& iface) {
if (iter != _pending_local_path_requests.end()) { if (iter != _pending_local_path_requests.end()) {
//p desiring_interface = Transport.pending_local_path_requests.pop(packet.destination_hash) //p desiring_interface = Transport.pending_local_path_requests.pop(packet.destination_hash)
//const Interface& desiring_interface = (*iter).second; //const Interface& desiring_interface = (*iter).second;
_pending_local_path_requests.erase(iter); // CBA FIX: pop() equivalent
retransmit_timeout = now; retransmit_timeout = now;
retries = PATHFINDER_R; retries = PATHFINDER_R;

View File

@@ -342,7 +342,9 @@ build_flags =
-DARDUINO_USB_CDC_ON_BOOT=1 -DARDUINO_USB_CDC_ON_BOOT=1
-DBOARD_HAS_PSRAM=1 -DBOARD_HAS_PSRAM=1
-DBOUNDARY_MODE -DBOUNDARY_MODE
;-DNDEBUG ; re-enable debug output for WiFi disconnect investigation -DNDEBUG
-DRNS_USE_TLSF=1
-DRNS_USE_ALLOCATOR=1
; --- Boundary mode defaults (override via EEPROM at runtime) --- ; --- Boundary mode defaults (override via EEPROM at runtime) ---
; TCP server mode (0=server, 1=client) ; TCP server mode (0=server, 1=client)
-DBOUNDARY_TCP_MODE=0 -DBOUNDARY_TCP_MODE=0