[ota][socket] Fix ESP8266/RP2040 OTA timeout by using SO_RCVTIMEO instead of polling (#14675)

This commit is contained in:
J. Nick Koston
2026-03-12 07:15:48 -10:00
committed by GitHub
parent fd1d016795
commit 4a21afe7ce
4 changed files with 123 additions and 12 deletions
+43 -3
View File
@@ -18,6 +18,7 @@
#include <cerrno>
#include <cstdio>
#include <sys/time.h>
namespace esphome {
@@ -238,6 +239,31 @@ void ESPHomeOTAComponent::handle_data_() {
/// and reboots on success.
///
/// Authentication has already been handled in the non-blocking states AUTH_SEND/AUTH_READ.
///
/// Socket I/O strategy:
///
/// Before this function, the handshake states use non-blocking I/O:
/// read()/write() return immediately with EWOULDBLOCK if no data
/// loop() retries on next iteration (~16ms), no delay needed
///
/// This function switches to blocking mode with SO_RCVTIMEO/SO_SNDTIMEO:
///
/// Path | Wait mechanism | WDT strategy
/// --------------|------------------------|---------------------------
/// Main read | SO_RCVTIMEO (2s block) | feed_wdt() only, no delay
/// readall_() | SO_RCVTIMEO (2s block) | feed_wdt() + delay(0)
/// writeall_() | SO_SNDTIMEO (2s block) | feed_wdt() + delay(1)
///
/// readall_() uses delay(0) because SO_RCVTIMEO already waited — just yield.
/// writeall_() uses delay(1) because on raw TCP (ESP8266, RP2040) writes
/// never block (tcp_write returns immediately), so delay(1) prevents spinning.
///
/// Platform details:
/// BSD sockets (ESP32): setblocking(true) makes read/write block
/// lwip sockets (LT): setblocking(true) makes read/write block
/// Raw TCP (8266, RP2040): setblocking is no-op; SO_RCVTIMEO uses
/// socket_delay()/socket_wake() in read();
/// write() always returns immediately
ota::OTAResponseTypes error_code = ota::OTA_RESPONSE_ERROR_UNKNOWN;
bool update_started = false;
size_t total = 0;
@@ -249,6 +275,14 @@ void ESPHomeOTAComponent::handle_data_() {
size_t size_acknowledged = 0;
#endif
// Set socket timeouts and blocking mode (see strategy table above)
struct timeval tv;
tv.tv_sec = 2;
tv.tv_usec = 0;
this->client_->setsockopt(SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
this->client_->setsockopt(SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));
this->client_->setblocking(true);
// Acknowledge auth OK - 1 byte
this->write_byte_(ota::OTA_RESPONSE_AUTH_OK);
@@ -299,7 +333,8 @@ void ESPHomeOTAComponent::handle_data_() {
ssize_t read = this->client_->read(buf, requested);
if (read == -1) {
if (this->would_block_(errno)) {
this->yield_and_feed_watchdog_();
// read() already waited up to SO_RCVTIMEO for data, just feed WDT
App.feed_wdt();
continue;
}
ESP_LOGW(TAG, "Read err %d", errno);
@@ -401,7 +436,9 @@ bool ESPHomeOTAComponent::readall_(uint8_t *buf, size_t len) {
} else {
at += read;
}
this->yield_and_feed_watchdog_();
// read() already waited via SO_RCVTIMEO, just yield without 1ms stall
App.feed_wdt();
delay(0);
}
return true;
@@ -422,10 +459,13 @@ bool ESPHomeOTAComponent::writeall_(const uint8_t *buf, size_t len) {
ESP_LOGW(TAG, "Write err %zu bytes, errno %d", len, errno);
return false;
}
// EWOULDBLOCK: on raw TCP writes never block, delay(1) prevents spinning
this->yield_and_feed_watchdog_();
} else {
at += written;
// write() may block up to SO_SNDTIMEO on BSD/lwip sockets, feed WDT
App.feed_wdt();
}
this->yield_and_feed_watchdog_();
}
return true;
}
+2
View File
@@ -51,6 +51,8 @@
#define SO_REUSEADDR 0x0004 /* Allow local address reuse */
#define SO_KEEPALIVE 0x0008 /* keep connections alive */
#define SO_BROADCAST 0x0020 /* permit to send and to receive broadcast messages (see IP_SOF_BROADCAST option) */
#define SO_RCVTIMEO 0x1006 /* receive timeout */
#define SO_SNDTIMEO 0x1005 /* send timeout */
#define SOL_SOCKET 0xfff /* options for socket level */
@@ -5,6 +5,7 @@
#include <cerrno>
#include <cstring>
#include <sys/time.h>
#include "esphome/core/helpers.h"
#include "esphome/core/log.h"
@@ -81,7 +82,9 @@ void socket_delay(uint32_t ms) {
s_socket_woke = false;
return;
}
s_socket_woke = false;
// Don't clear s_socket_woke here — if an IRQ fires between the check above
// and the while loop below, the while condition sees it immediately. Clearing
// here would lose that wake and sleep until the timer fires.
s_delay_expired = false;
// Set a one-shot timer to wake us after the timeout.
// add_alarm_in_ms returns >0 on success, 0 if time already passed, <0 on error.
@@ -99,6 +102,7 @@ void socket_delay(uint32_t ms) {
// Cancel timer if we woke early (socket data arrived before timeout)
if (!s_delay_expired)
cancel_alarm(alarm);
s_socket_woke = false; // consume the wake for next call
}
// No IRAM_ATTR equivalent needed: on RP2040, CYW43 async_context runs LWIP
@@ -359,6 +363,18 @@ int LWIPRawCommon::getsockopt(int level, int optname, void *optval, socklen_t *o
*optlen = 4;
return 0;
}
if (level == SOL_SOCKET && optname == SO_RCVTIMEO) {
if (*optlen < sizeof(struct timeval)) {
errno = EINVAL;
return -1;
}
uint32_t ms = this->recv_timeout_cs_ * 10;
auto *tv = reinterpret_cast<struct timeval *>(optval);
tv->tv_sec = ms / 1000;
tv->tv_usec = (ms % 1000) * 1000;
*optlen = sizeof(struct timeval);
return 0;
}
if (level == IPPROTO_TCP && optname == TCP_NODELAY) {
if (*optlen < 4) {
errno = EINVAL;
@@ -388,6 +404,21 @@ int LWIPRawCommon::setsockopt(int level, int optname, const void *optval, sockle
// to prevent warnings
return 0;
}
if (level == SOL_SOCKET && optname == SO_RCVTIMEO) {
if (optlen < sizeof(struct timeval)) {
errno = EINVAL;
return -1;
}
const auto *tv = reinterpret_cast<const struct timeval *>(optval);
uint32_t ms = tv->tv_sec * 1000 + tv->tv_usec / 1000;
uint32_t cs = (ms + 9) / 10; // round up to nearest centisecond
this->recv_timeout_cs_ = cs > 255 ? 255 : static_cast<uint8_t>(cs);
return 0;
}
if (level == SOL_SOCKET && optname == SO_SNDTIMEO) {
// Raw TCP writes are non-blocking (tcp_write), so send timeout is a no-op.
return 0;
}
if (level == IPPROTO_TCP && optname == TCP_NODELAY) {
if (optlen != 4) {
errno = EINVAL;
@@ -518,8 +549,25 @@ err_t LWIPRawImpl::recv_fn(struct pbuf *pb, err_t err) {
return ERR_OK;
}
ssize_t LWIPRawImpl::read(void *buf, size_t len) {
LWIP_LOCK();
void LWIPRawImpl::wait_for_data_() {
// Wait for data without holding LWIP_LOCK so recv_fn() can run on RP2040
// (needs async_context lock).
//
// Loop until data arrives, connection closes, or the full timeout elapses.
// socket_delay() may return early due to other sockets waking the global
// socket_wake() flag, so we re-enter for the remaining time.
uint32_t timeout_ms = this->recv_timeout_cs_ * 10;
uint32_t start = millis();
while (this->waiting_for_data_()) {
uint32_t elapsed = millis() - start;
if (elapsed >= timeout_ms)
break;
socket_delay(timeout_ms - elapsed);
}
}
ssize_t LWIPRawImpl::read_locked_(void *buf, size_t len) {
// Caller must hold LWIP_LOCK. Copies available data from rx_buf_ into buf.
if (this->pcb_ == nullptr) {
errno = ECONNRESET;
return -1;
@@ -578,11 +626,26 @@ ssize_t LWIPRawImpl::read(void *buf, size_t len) {
return read;
}
ssize_t LWIPRawImpl::read(void *buf, size_t len) {
// See waiting_for_data_() for safety of unlocked reads.
if (this->recv_timeout_cs_ > 0 && this->waiting_for_data_()) {
this->wait_for_data_();
}
LWIP_LOCK();
return this->read_locked_(buf, len);
}
ssize_t LWIPRawImpl::readv(const struct iovec *iov, int iovcnt) {
// See waiting_for_data_() for safety of unlocked reads.
if (this->recv_timeout_cs_ > 0 && this->waiting_for_data_()) {
this->wait_for_data_();
}
LWIP_LOCK(); // Hold for entire scatter-gather operation
ssize_t ret = 0;
for (int i = 0; i < iovcnt; i++) {
ssize_t err = this->read(reinterpret_cast<uint8_t *>(iov[i].iov_base), iov[i].iov_len);
ssize_t err = this->read_locked_(reinterpret_cast<uint8_t *>(iov[i].iov_base), iov[i].iov_len);
if (err == -1) {
if (ret != 0) {
// if we already read some don't return an error
+11 -5
View File
@@ -57,6 +57,7 @@ class LWIPRawCommon {
// instead use it for determining whether to call lwip_output
bool nodelay_ = false;
sa_family_t family_ = 0;
uint8_t recv_timeout_cs_ = 0; // SO_RCVTIMEO in centiseconds (0 = no timeout, max 2.55s)
};
/// Connected socket implementation for LWIP raw TCP.
@@ -107,11 +108,8 @@ class LWIPRawImpl : public LWIPRawCommon {
errno = ECONNRESET;
return -1;
}
if (blocking) {
// blocking operation not supported
errno = EINVAL;
return -1;
}
// Raw TCP doesn't use a blocking flag directly. Blocking behavior
// is provided by SO_RCVTIMEO which makes read() wait via socket_delay().
return 0;
}
int loop() { return 0; }
@@ -122,6 +120,14 @@ class LWIPRawImpl : public LWIPRawCommon {
static err_t s_recv_fn(void *arg, struct tcp_pcb *pcb, struct pbuf *pb, err_t err);
protected:
// True when the socket could receive data but none has arrived yet.
// Safe to call without LWIP_LOCK — only null-checks pointers and reads a bool,
// all atomic on ARM/Xtensa. A stale value is harmless: the caller either does
// an unnecessary wait (stale true) or skips it (stale false), and the
// authoritative recheck happens under LWIP_LOCK afterward.
bool waiting_for_data_() const { return this->rx_buf_ == nullptr && !this->rx_closed_ && this->pcb_ != nullptr; }
void wait_for_data_();
ssize_t read_locked_(void *buf, size_t len);
ssize_t internal_write_(const void *buf, size_t len);
int internal_output_();