Fix heap exhaustion: enable PSRAM allocator + bound all tables + auto-reboot watchdog

Root cause: the heltec_V4_boundary build was missing the -DRNS_USE_TLSF=1
and -DRNS_USE_ALLOCATOR=1 flags, so ALL C++ new/delete allocations used
internal SRAM (239KB) instead of the PSRAM-backed TLSF pool (~1.6MB).
Transport data structures consumed the internal heap until the WiFi
driver could no longer allocate RX buffers (ESP_ERR_NO_MEM).

Changes:
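- platformio.ini: Add the TLSF/allocator flags to the heltec_V4_boundary
  env and re-enable -DNDEBUG (it had been commented out to get debug
  output during the WiFi disconnect investigation)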
- Transport.cpp: Add periodic culling of _path_requests (it was unbounded
  and grew by one entry per unique destination forever); entries older
  than DESTINATION_TIMEOUT are now removed. Also cull
  _pending_local_path_requests entries whose interface no longer exists,
  and add the missing .erase() after a pending local path request is
  looked up (the equivalent of Python's .pop()).
- RNode_Firmware.ino: Replace the WiFi watchdog's halt-serial behavior
  with an automatic reboot. Add a heap pressure check (reboot if free
  internal heap < 20KB). Increase the WiFi grace period from 5s to 15s.
  Remove the orphaned boundary_done label.
This commit is contained in:
James L
2026-02-25 13:48:10 -05:00
parent 8ee8e86563
commit 990649d810
3 changed files with 58 additions and 36 deletions

View File

@@ -2229,19 +2229,26 @@ void loop() {
}
#ifdef BOUNDARY_MODE
// ── WiFi disconnect watchdog ──────────────────────────────────────────────
// When WiFi drops, dump diagnostic info and halt serial output so the
// operator can read the last log lines over USB. The device keeps running
// (LoRa repeater still works) but serial is frozen until reboot.
// ── Heap + WiFi watchdog ───────────────────────────────────────────────────
// Monitor heap and WiFi health. Auto-reboot on critical conditions:
// 1) Internal heap drops below 20KB (WiFi needs ~16KB for RX buffers)
// 2) WiFi down for >15s after having been connected (unrecoverable)
{
static bool _wifi_watchdog_armed = false; // armed once WiFi first connects
static bool _wifi_watchdog_halted = false; // true = serial frozen
static uint32_t _wifi_lost_at = 0; // millis() when WiFi first lost
static const uint32_t WIFI_GRACE_MS = 5000; // 5 s grace before halting
static const uint32_t WIFI_GRACE_MS = 15000; // 15s grace before reboot
static const uint32_t HEAP_CRITICAL = 20000; // 20KB minimum internal heap
if (_wifi_watchdog_halted) {
// Frozen — skip all boundary work, just keep LoRa running
goto boundary_done;
// ── Heap pressure check (runs always) ─────────────────────────────────
uint32_t free_heap = ESP.getFreeHeap();
if (free_heap < HEAP_CRITICAL) {
Serial.printf("\r\n[WATCHDOG] CRITICAL: Free heap %u < %u — REBOOTING\r\n",
free_heap, HEAP_CRITICAL);
Serial.printf("[WATCHDOG] Min free: %u Max alloc: %u\r\n",
ESP.getMinFreeHeap(), ESP.getMaxAllocHeap());
Serial.flush();
delay(100);
ESP.restart();
}
bool wifi_now = wifi_is_connected();
@@ -2255,41 +2262,28 @@ void loop() {
if (_wifi_watchdog_armed && !wifi_now) {
if (_wifi_lost_at == 0) {
_wifi_lost_at = millis();
Serial.printf("\r\n[WATCHDOG] WiFi connection LOST at %lu ms grace period %lu ms\r\n",
Serial.printf("\r\n[WATCHDOG] WiFi lost at %lu ms (grace %lu ms)\r\n",
_wifi_lost_at, WIFI_GRACE_MS);
Serial.printf("[WATCHDOG] WiFi.status() = %d RSSI = %d\r\n",
(int)WiFi.status(), (int)WiFi.RSSI());
Serial.printf("[WATCHDOG] Free heap: %u Min free heap: %u\r\n",
ESP.getFreeHeap(), ESP.getMinFreeHeap());
Serial.printf("[WATCHDOG] TCP backbone connected: %s clients: %d\r\n",
(tcp_interface_ptr && tcp_interface_ptr->isConnected()) ? "yes" : "no",
tcp_interface_ptr ? tcp_interface_ptr->clientCount() : 0);
if (local_tcp_interface_ptr) {
Serial.printf("[WATCHDOG] Local TCP connected: %s clients: %d\r\n",
local_tcp_interface_ptr->isConnected() ? "yes" : "no",
local_tcp_interface_ptr->clientCount());
}
Serial.printf("[WATCHDOG] WiFi.status()=%d heap=%u min_heap=%u\r\n",
(int)WiFi.status(), free_heap, ESP.getMinFreeHeap());
Serial.flush();
}
// Check if grace period expired
// Check if grace period expired — unrecoverable, reboot
if ((millis() - _wifi_lost_at) >= WIFI_GRACE_MS) {
Serial.printf("\r\n[WATCHDOG] *** WiFi still down after %lu ms — HALTING SERIAL OUTPUT ***\r\n",
Serial.printf("\r\n[WATCHDOG] WiFi down %lu ms — REBOOTING\r\n",
millis() - _wifi_lost_at);
Serial.printf("[WATCHDOG] WiFi.status() = %d RSSI = %d\r\n",
(int)WiFi.status(), (int)WiFi.RSSI());
Serial.printf("[WATCHDOG] Last boundary activity: %lu ms ago\r\n",
millis() - boundary_state.last_bridge_activity);
Serial.printf("[WATCHDOG] Packets bridged: LoRa→TCP=%lu TCP→LoRa=%lu\r\n",
Serial.printf("[WATCHDOG] WiFi.status()=%d heap=%u\r\n",
(int)WiFi.status(), ESP.getFreeHeap());
Serial.printf("[WATCHDOG] Bridged: L→T=%lu T→L=%lu\r\n",
boundary_state.packets_bridged_lora_to_tcp,
boundary_state.packets_bridged_tcp_to_lora);
Serial.println("[WATCHDOG] Device still running (LoRa repeater active). Reboot to resume.");
Serial.flush();
_wifi_watchdog_halted = true;
goto boundary_done;
delay(100);
ESP.restart();
}
} else if (_wifi_watchdog_armed && wifi_now && _wifi_lost_at != 0) {
// WiFi came back within grace period
Serial.printf("[WATCHDOG] WiFi reconnected after %lu ms\r\n", millis() - _wifi_lost_at);
// WiFi recovered within grace period
Serial.printf("[WATCHDOG] WiFi back after %lu ms\r\n", millis() - _wifi_lost_at);
_wifi_lost_at = 0;
}
}
@@ -2318,7 +2312,6 @@ void loop() {
boundary_state.wifi_connected = wifi_is_connected();
}
boundary_done:
#endif
#endif

View File

@@ -571,6 +571,32 @@ static bool is_backbone_interface(const Interface& iface) {
}
}
// Cull the path requests table (entries older than destination timeout)
{
std::vector<Bytes> stale_path_requests;
for (const auto& [destination_hash, timestamp] : _path_requests) {
if (OS::time() > (timestamp + DESTINATION_TIMEOUT)) {
stale_path_requests.push_back(destination_hash);
}
}
for (const Bytes& destination_hash : stale_path_requests) {
_path_requests.erase(destination_hash);
}
}
// Cull pending local path requests for interfaces that no longer exist
{
std::vector<Bytes> stale_plpr;
for (const auto& [destination_hash, iface] : _pending_local_path_requests) {
if (_interfaces.count(iface.get_hash()) == 0) {
stale_plpr.push_back(destination_hash);
}
}
for (const Bytes& destination_hash : stale_plpr) {
_pending_local_path_requests.erase(destination_hash);
}
}
// Cull the tunnel table
count = 0;
std::vector<Bytes> stale_tunnels;
@@ -2144,6 +2170,7 @@ static bool is_backbone_interface(const Interface& iface) {
if (iter != _pending_local_path_requests.end()) {
//p desiring_interface = Transport.pending_local_path_requests.pop(packet.destination_hash)
//const Interface& desiring_interface = (*iter).second;
_pending_local_path_requests.erase(iter); // CBA FIX: pop() equivalent
retransmit_timeout = now;
retries = PATHFINDER_R;

View File

@@ -342,7 +342,9 @@ build_flags =
-DARDUINO_USB_CDC_ON_BOOT=1
-DBOARD_HAS_PSRAM=1
-DBOUNDARY_MODE
;-DNDEBUG ; re-enable debug output for WiFi disconnect investigation
-DNDEBUG
-DRNS_USE_TLSF=1
-DRNS_USE_ALLOCATOR=1
; --- Boundary mode defaults (override via EEPROM at runtime) ---
; TCP server mode (0=server, 1=client)
-DBOUNDARY_TCP_MODE=0