Fix heap exhaustion: enable PSRAM allocator + bound all tables + auto-reboot watchdog
Root cause: the heltec_V4_boundary build was missing the -DRNS_USE_TLSF=1 and -DRNS_USE_ALLOCATOR=1 flags, causing ALL C++ new/delete to use internal SRAM (239KB) instead of the PSRAM-backed TLSF pool (~1.6MB). Transport data structures consumed internal heap until the WiFi driver could not allocate RX buffers (ESP_ERR_NO_MEM).

Changes:
- platformio.ini: Add TLSF/allocator flags to heltec_V4_boundary env, re-enable NDEBUG
- Transport.cpp: Add periodic culling of _path_requests (was unbounded, grew one entry per unique destination forever). Cull entries older than DESTINATION_TIMEOUT. Also cull _pending_local_path_requests for removed interfaces, and fix missing .erase() (Python .pop() equivalent).
- RNode_Firmware.ino: Replace WiFi watchdog halt-serial with auto-reboot. Add heap pressure check (reboot if free heap < 20KB). Increase WiFi grace period from 5s to 15s. Remove orphaned boundary_done label.
This commit is contained in:
@@ -2229,19 +2229,26 @@ void loop() {
|
||||
}
|
||||
|
||||
#ifdef BOUNDARY_MODE
|
||||
// ── WiFi disconnect watchdog ──────────────────────────────────────────────
|
||||
// When WiFi drops, dump diagnostic info and halt serial output so the
|
||||
// operator can read the last log lines over USB. The device keeps running
|
||||
// (LoRa repeater still works) but serial is frozen until reboot.
|
||||
// ── Heap + WiFi watchdog ───────────────────────────────────────────────────
|
||||
// Monitor heap and WiFi health. Auto-reboot on critical conditions:
|
||||
// 1) Internal heap drops below 20KB (WiFi needs ~16KB for RX buffers)
|
||||
// 2) WiFi down for >15s after having been connected (unrecoverable)
|
||||
{
|
||||
static bool _wifi_watchdog_armed = false; // armed once WiFi first connects
|
||||
static bool _wifi_watchdog_halted = false; // true = serial frozen
|
||||
static uint32_t _wifi_lost_at = 0; // millis() when WiFi first lost
|
||||
static const uint32_t WIFI_GRACE_MS = 5000; // 5 s grace before halting
|
||||
static const uint32_t WIFI_GRACE_MS = 15000; // 15s grace before reboot
|
||||
static const uint32_t HEAP_CRITICAL = 20000; // 20KB minimum internal heap
|
||||
|
||||
if (_wifi_watchdog_halted) {
|
||||
// Frozen — skip all boundary work, just keep LoRa running
|
||||
goto boundary_done;
|
||||
// ── Heap pressure check (runs always) ─────────────────────────────────
|
||||
uint32_t free_heap = ESP.getFreeHeap();
|
||||
if (free_heap < HEAP_CRITICAL) {
|
||||
Serial.printf("\r\n[WATCHDOG] CRITICAL: Free heap %u < %u — REBOOTING\r\n",
|
||||
free_heap, HEAP_CRITICAL);
|
||||
Serial.printf("[WATCHDOG] Min free: %u Max alloc: %u\r\n",
|
||||
ESP.getMinFreeHeap(), ESP.getMaxAllocHeap());
|
||||
Serial.flush();
|
||||
delay(100);
|
||||
ESP.restart();
|
||||
}
|
||||
|
||||
bool wifi_now = wifi_is_connected();
|
||||
@@ -2255,41 +2262,28 @@ void loop() {
|
||||
if (_wifi_watchdog_armed && !wifi_now) {
|
||||
if (_wifi_lost_at == 0) {
|
||||
_wifi_lost_at = millis();
|
||||
Serial.printf("\r\n[WATCHDOG] WiFi connection LOST at %lu ms — grace period %lu ms\r\n",
|
||||
Serial.printf("\r\n[WATCHDOG] WiFi lost at %lu ms (grace %lu ms)\r\n",
|
||||
_wifi_lost_at, WIFI_GRACE_MS);
|
||||
Serial.printf("[WATCHDOG] WiFi.status() = %d RSSI = %d\r\n",
|
||||
(int)WiFi.status(), (int)WiFi.RSSI());
|
||||
Serial.printf("[WATCHDOG] Free heap: %u Min free heap: %u\r\n",
|
||||
ESP.getFreeHeap(), ESP.getMinFreeHeap());
|
||||
Serial.printf("[WATCHDOG] TCP backbone connected: %s clients: %d\r\n",
|
||||
(tcp_interface_ptr && tcp_interface_ptr->isConnected()) ? "yes" : "no",
|
||||
tcp_interface_ptr ? tcp_interface_ptr->clientCount() : 0);
|
||||
if (local_tcp_interface_ptr) {
|
||||
Serial.printf("[WATCHDOG] Local TCP connected: %s clients: %d\r\n",
|
||||
local_tcp_interface_ptr->isConnected() ? "yes" : "no",
|
||||
local_tcp_interface_ptr->clientCount());
|
||||
}
|
||||
Serial.printf("[WATCHDOG] WiFi.status()=%d heap=%u min_heap=%u\r\n",
|
||||
(int)WiFi.status(), free_heap, ESP.getMinFreeHeap());
|
||||
Serial.flush();
|
||||
}
|
||||
// Check if grace period expired
|
||||
// Check if grace period expired — unrecoverable, reboot
|
||||
if ((millis() - _wifi_lost_at) >= WIFI_GRACE_MS) {
|
||||
Serial.printf("\r\n[WATCHDOG] *** WiFi still down after %lu ms — HALTING SERIAL OUTPUT ***\r\n",
|
||||
Serial.printf("\r\n[WATCHDOG] WiFi down %lu ms — REBOOTING\r\n",
|
||||
millis() - _wifi_lost_at);
|
||||
Serial.printf("[WATCHDOG] WiFi.status() = %d RSSI = %d\r\n",
|
||||
(int)WiFi.status(), (int)WiFi.RSSI());
|
||||
Serial.printf("[WATCHDOG] Last boundary activity: %lu ms ago\r\n",
|
||||
millis() - boundary_state.last_bridge_activity);
|
||||
Serial.printf("[WATCHDOG] Packets bridged: LoRa→TCP=%lu TCP→LoRa=%lu\r\n",
|
||||
Serial.printf("[WATCHDOG] WiFi.status()=%d heap=%u\r\n",
|
||||
(int)WiFi.status(), ESP.getFreeHeap());
|
||||
Serial.printf("[WATCHDOG] Bridged: L→T=%lu T→L=%lu\r\n",
|
||||
boundary_state.packets_bridged_lora_to_tcp,
|
||||
boundary_state.packets_bridged_tcp_to_lora);
|
||||
Serial.println("[WATCHDOG] Device still running (LoRa repeater active). Reboot to resume.");
|
||||
Serial.flush();
|
||||
_wifi_watchdog_halted = true;
|
||||
goto boundary_done;
|
||||
delay(100);
|
||||
ESP.restart();
|
||||
}
|
||||
} else if (_wifi_watchdog_armed && wifi_now && _wifi_lost_at != 0) {
|
||||
// WiFi came back within grace period
|
||||
Serial.printf("[WATCHDOG] WiFi reconnected after %lu ms\r\n", millis() - _wifi_lost_at);
|
||||
// WiFi recovered within grace period
|
||||
Serial.printf("[WATCHDOG] WiFi back after %lu ms\r\n", millis() - _wifi_lost_at);
|
||||
_wifi_lost_at = 0;
|
||||
}
|
||||
}
|
||||
@@ -2318,7 +2312,6 @@ void loop() {
|
||||
boundary_state.wifi_connected = wifi_is_connected();
|
||||
}
|
||||
|
||||
boundary_done:
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -571,6 +571,32 @@ static bool is_backbone_interface(const Interface& iface) {
|
||||
}
|
||||
}
|
||||
|
||||
// Cull the path requests table (entries older than destination timeout)
|
||||
{
|
||||
std::vector<Bytes> stale_path_requests;
|
||||
for (const auto& [destination_hash, timestamp] : _path_requests) {
|
||||
if (OS::time() > (timestamp + DESTINATION_TIMEOUT)) {
|
||||
stale_path_requests.push_back(destination_hash);
|
||||
}
|
||||
}
|
||||
for (const Bytes& destination_hash : stale_path_requests) {
|
||||
_path_requests.erase(destination_hash);
|
||||
}
|
||||
}
|
||||
|
||||
// Cull pending local path requests for interfaces that no longer exist
|
||||
{
|
||||
std::vector<Bytes> stale_plpr;
|
||||
for (const auto& [destination_hash, iface] : _pending_local_path_requests) {
|
||||
if (_interfaces.count(iface.get_hash()) == 0) {
|
||||
stale_plpr.push_back(destination_hash);
|
||||
}
|
||||
}
|
||||
for (const Bytes& destination_hash : stale_plpr) {
|
||||
_pending_local_path_requests.erase(destination_hash);
|
||||
}
|
||||
}
|
||||
|
||||
// Cull the tunnel table
|
||||
count = 0;
|
||||
std::vector<Bytes> stale_tunnels;
|
||||
@@ -2144,6 +2170,7 @@ static bool is_backbone_interface(const Interface& iface) {
|
||||
if (iter != _pending_local_path_requests.end()) {
|
||||
//p desiring_interface = Transport.pending_local_path_requests.pop(packet.destination_hash)
|
||||
//const Interface& desiring_interface = (*iter).second;
|
||||
_pending_local_path_requests.erase(iter); // CBA FIX: pop() equivalent
|
||||
retransmit_timeout = now;
|
||||
retries = PATHFINDER_R;
|
||||
|
||||
|
||||
@@ -342,7 +342,9 @@ build_flags =
|
||||
-DARDUINO_USB_CDC_ON_BOOT=1
|
||||
-DBOARD_HAS_PSRAM=1
|
||||
-DBOUNDARY_MODE
|
||||
;-DNDEBUG ; re-enable debug output for WiFi disconnect investigation
|
||||
-DNDEBUG
|
||||
-DRNS_USE_TLSF=1
|
||||
-DRNS_USE_ALLOCATOR=1
|
||||
; --- Boundary mode defaults (override via EEPROM at runtime) ---
|
||||
; TCP server mode (0=server, 1=client)
|
||||
-DBOUNDARY_TCP_MODE=0
|
||||
|
||||
Reference in New Issue
Block a user