mirror of
https://github.com/esphome/esphome.git
synced 2026-05-22 01:42:49 +08:00
[ota][socket] Fix ESP8266/RP2040 OTA timeout by using SO_RCVTIMEO instead of polling (#14675)
This commit is contained in:
@@ -18,6 +18,7 @@
|
||||
|
||||
#include <cerrno>
|
||||
#include <cstdio>
|
||||
#include <sys/time.h>
|
||||
|
||||
namespace esphome {
|
||||
|
||||
@@ -238,6 +239,31 @@ void ESPHomeOTAComponent::handle_data_() {
|
||||
/// and reboots on success.
|
||||
///
|
||||
/// Authentication has already been handled in the non-blocking states AUTH_SEND/AUTH_READ.
|
||||
///
|
||||
/// Socket I/O strategy:
|
||||
///
|
||||
/// Before this function, the handshake states use non-blocking I/O:
|
||||
/// read()/write() return immediately with EWOULDBLOCK if no data
|
||||
/// loop() retries on next iteration (~16ms), no delay needed
|
||||
///
|
||||
/// This function switches to blocking mode with SO_RCVTIMEO/SO_SNDTIMEO:
|
||||
///
|
||||
/// Path | Wait mechanism | WDT strategy
|
||||
/// --------------|------------------------|---------------------------
|
||||
/// Main read | SO_RCVTIMEO (2s block) | feed_wdt() only, no delay
|
||||
/// readall_() | SO_RCVTIMEO (2s block) | feed_wdt() + delay(0)
|
||||
/// writeall_() | SO_SNDTIMEO (2s block) | feed_wdt() + delay(1)
|
||||
///
|
||||
/// readall_() uses delay(0) because SO_RCVTIMEO already waited — just yield.
|
||||
/// writeall_() uses delay(1) because on raw TCP (ESP8266, RP2040) writes
|
||||
/// never block (tcp_write returns immediately), so delay(1) prevents spinning.
|
||||
///
|
||||
/// Platform details:
|
||||
/// BSD sockets (ESP32): setblocking(true) makes read/write block
|
||||
/// lwip sockets (LT): setblocking(true) makes read/write block
|
||||
/// Raw TCP (8266, RP2040): setblocking is no-op; SO_RCVTIMEO uses
|
||||
/// socket_delay()/socket_wake() in read();
|
||||
/// write() always returns immediately
|
||||
ota::OTAResponseTypes error_code = ota::OTA_RESPONSE_ERROR_UNKNOWN;
|
||||
bool update_started = false;
|
||||
size_t total = 0;
|
||||
@@ -249,6 +275,14 @@ void ESPHomeOTAComponent::handle_data_() {
|
||||
size_t size_acknowledged = 0;
|
||||
#endif
|
||||
|
||||
// Set socket timeouts and blocking mode (see strategy table above)
|
||||
struct timeval tv;
|
||||
tv.tv_sec = 2;
|
||||
tv.tv_usec = 0;
|
||||
this->client_->setsockopt(SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
|
||||
this->client_->setsockopt(SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));
|
||||
this->client_->setblocking(true);
|
||||
|
||||
// Acknowledge auth OK - 1 byte
|
||||
this->write_byte_(ota::OTA_RESPONSE_AUTH_OK);
|
||||
|
||||
@@ -299,7 +333,8 @@ void ESPHomeOTAComponent::handle_data_() {
|
||||
ssize_t read = this->client_->read(buf, requested);
|
||||
if (read == -1) {
|
||||
if (this->would_block_(errno)) {
|
||||
this->yield_and_feed_watchdog_();
|
||||
// read() already waited up to SO_RCVTIMEO for data, just feed WDT
|
||||
App.feed_wdt();
|
||||
continue;
|
||||
}
|
||||
ESP_LOGW(TAG, "Read err %d", errno);
|
||||
@@ -401,7 +436,9 @@ bool ESPHomeOTAComponent::readall_(uint8_t *buf, size_t len) {
|
||||
} else {
|
||||
at += read;
|
||||
}
|
||||
this->yield_and_feed_watchdog_();
|
||||
// read() already waited via SO_RCVTIMEO, just yield without 1ms stall
|
||||
App.feed_wdt();
|
||||
delay(0);
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -422,10 +459,13 @@ bool ESPHomeOTAComponent::writeall_(const uint8_t *buf, size_t len) {
|
||||
ESP_LOGW(TAG, "Write err %zu bytes, errno %d", len, errno);
|
||||
return false;
|
||||
}
|
||||
// EWOULDBLOCK: on raw TCP writes never block, delay(1) prevents spinning
|
||||
this->yield_and_feed_watchdog_();
|
||||
} else {
|
||||
at += written;
|
||||
// write() may block up to SO_SNDTIMEO on BSD/lwip sockets, feed WDT
|
||||
App.feed_wdt();
|
||||
}
|
||||
this->yield_and_feed_watchdog_();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -51,6 +51,8 @@
|
||||
#define SO_REUSEADDR 0x0004 /* Allow local address reuse */
|
||||
#define SO_KEEPALIVE 0x0008 /* keep connections alive */
|
||||
#define SO_BROADCAST 0x0020 /* permit to send and to receive broadcast messages (see IP_SOF_BROADCAST option) */
|
||||
#define SO_RCVTIMEO 0x1006 /* receive timeout */
|
||||
#define SO_SNDTIMEO 0x1005 /* send timeout */
|
||||
|
||||
#define SOL_SOCKET 0xfff /* options for socket level */
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
|
||||
#include <cerrno>
|
||||
#include <cstring>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "esphome/core/helpers.h"
|
||||
#include "esphome/core/log.h"
|
||||
@@ -81,7 +82,9 @@ void socket_delay(uint32_t ms) {
|
||||
s_socket_woke = false;
|
||||
return;
|
||||
}
|
||||
s_socket_woke = false;
|
||||
// Don't clear s_socket_woke here — if an IRQ fires between the check above
|
||||
// and the while loop below, the while condition sees it immediately. Clearing
|
||||
// here would lose that wake and sleep until the timer fires.
|
||||
s_delay_expired = false;
|
||||
// Set a one-shot timer to wake us after the timeout.
|
||||
// add_alarm_in_ms returns >0 on success, 0 if time already passed, <0 on error.
|
||||
@@ -99,6 +102,7 @@ void socket_delay(uint32_t ms) {
|
||||
// Cancel timer if we woke early (socket data arrived before timeout)
|
||||
if (!s_delay_expired)
|
||||
cancel_alarm(alarm);
|
||||
s_socket_woke = false; // consume the wake for next call
|
||||
}
|
||||
|
||||
// No IRAM_ATTR equivalent needed: on RP2040, CYW43 async_context runs LWIP
|
||||
@@ -359,6 +363,18 @@ int LWIPRawCommon::getsockopt(int level, int optname, void *optval, socklen_t *o
|
||||
*optlen = 4;
|
||||
return 0;
|
||||
}
|
||||
if (level == SOL_SOCKET && optname == SO_RCVTIMEO) {
|
||||
if (*optlen < sizeof(struct timeval)) {
|
||||
errno = EINVAL;
|
||||
return -1;
|
||||
}
|
||||
uint32_t ms = this->recv_timeout_cs_ * 10;
|
||||
auto *tv = reinterpret_cast<struct timeval *>(optval);
|
||||
tv->tv_sec = ms / 1000;
|
||||
tv->tv_usec = (ms % 1000) * 1000;
|
||||
*optlen = sizeof(struct timeval);
|
||||
return 0;
|
||||
}
|
||||
if (level == IPPROTO_TCP && optname == TCP_NODELAY) {
|
||||
if (*optlen < 4) {
|
||||
errno = EINVAL;
|
||||
@@ -388,6 +404,21 @@ int LWIPRawCommon::setsockopt(int level, int optname, const void *optval, sockle
|
||||
// to prevent warnings
|
||||
return 0;
|
||||
}
|
||||
if (level == SOL_SOCKET && optname == SO_RCVTIMEO) {
|
||||
if (optlen < sizeof(struct timeval)) {
|
||||
errno = EINVAL;
|
||||
return -1;
|
||||
}
|
||||
const auto *tv = reinterpret_cast<const struct timeval *>(optval);
|
||||
uint32_t ms = tv->tv_sec * 1000 + tv->tv_usec / 1000;
|
||||
uint32_t cs = (ms + 9) / 10; // round up to nearest centisecond
|
||||
this->recv_timeout_cs_ = cs > 255 ? 255 : static_cast<uint8_t>(cs);
|
||||
return 0;
|
||||
}
|
||||
if (level == SOL_SOCKET && optname == SO_SNDTIMEO) {
|
||||
// Raw TCP writes are non-blocking (tcp_write), so send timeout is a no-op.
|
||||
return 0;
|
||||
}
|
||||
if (level == IPPROTO_TCP && optname == TCP_NODELAY) {
|
||||
if (optlen != 4) {
|
||||
errno = EINVAL;
|
||||
@@ -518,8 +549,25 @@ err_t LWIPRawImpl::recv_fn(struct pbuf *pb, err_t err) {
|
||||
return ERR_OK;
|
||||
}
|
||||
|
||||
ssize_t LWIPRawImpl::read(void *buf, size_t len) {
|
||||
LWIP_LOCK();
|
||||
void LWIPRawImpl::wait_for_data_() {
|
||||
// Wait for data without holding LWIP_LOCK so recv_fn() can run on RP2040
|
||||
// (needs async_context lock).
|
||||
//
|
||||
// Loop until data arrives, connection closes, or the full timeout elapses.
|
||||
// socket_delay() may return early due to other sockets waking the global
|
||||
// socket_wake() flag, so we re-enter for the remaining time.
|
||||
uint32_t timeout_ms = this->recv_timeout_cs_ * 10;
|
||||
uint32_t start = millis();
|
||||
while (this->waiting_for_data_()) {
|
||||
uint32_t elapsed = millis() - start;
|
||||
if (elapsed >= timeout_ms)
|
||||
break;
|
||||
socket_delay(timeout_ms - elapsed);
|
||||
}
|
||||
}
|
||||
|
||||
ssize_t LWIPRawImpl::read_locked_(void *buf, size_t len) {
|
||||
// Caller must hold LWIP_LOCK. Copies available data from rx_buf_ into buf.
|
||||
if (this->pcb_ == nullptr) {
|
||||
errno = ECONNRESET;
|
||||
return -1;
|
||||
@@ -578,11 +626,26 @@ ssize_t LWIPRawImpl::read(void *buf, size_t len) {
|
||||
return read;
|
||||
}
|
||||
|
||||
ssize_t LWIPRawImpl::read(void *buf, size_t len) {
|
||||
// See waiting_for_data_() for safety of unlocked reads.
|
||||
if (this->recv_timeout_cs_ > 0 && this->waiting_for_data_()) {
|
||||
this->wait_for_data_();
|
||||
}
|
||||
|
||||
LWIP_LOCK();
|
||||
return this->read_locked_(buf, len);
|
||||
}
|
||||
|
||||
ssize_t LWIPRawImpl::readv(const struct iovec *iov, int iovcnt) {
|
||||
// See waiting_for_data_() for safety of unlocked reads.
|
||||
if (this->recv_timeout_cs_ > 0 && this->waiting_for_data_()) {
|
||||
this->wait_for_data_();
|
||||
}
|
||||
|
||||
LWIP_LOCK(); // Hold for entire scatter-gather operation
|
||||
ssize_t ret = 0;
|
||||
for (int i = 0; i < iovcnt; i++) {
|
||||
ssize_t err = this->read(reinterpret_cast<uint8_t *>(iov[i].iov_base), iov[i].iov_len);
|
||||
ssize_t err = this->read_locked_(reinterpret_cast<uint8_t *>(iov[i].iov_base), iov[i].iov_len);
|
||||
if (err == -1) {
|
||||
if (ret != 0) {
|
||||
// if we already read some don't return an error
|
||||
|
||||
@@ -57,6 +57,7 @@ class LWIPRawCommon {
|
||||
// instead use it for determining whether to call lwip_output
|
||||
bool nodelay_ = false;
|
||||
sa_family_t family_ = 0;
|
||||
uint8_t recv_timeout_cs_ = 0; // SO_RCVTIMEO in centiseconds (0 = no timeout, max 2.55s)
|
||||
};
|
||||
|
||||
/// Connected socket implementation for LWIP raw TCP.
|
||||
@@ -107,11 +108,8 @@ class LWIPRawImpl : public LWIPRawCommon {
|
||||
errno = ECONNRESET;
|
||||
return -1;
|
||||
}
|
||||
if (blocking) {
|
||||
// blocking operation not supported
|
||||
errno = EINVAL;
|
||||
return -1;
|
||||
}
|
||||
// Raw TCP doesn't use a blocking flag directly. Blocking behavior
|
||||
// is provided by SO_RCVTIMEO which makes read() wait via socket_delay().
|
||||
return 0;
|
||||
}
|
||||
int loop() { return 0; }
|
||||
@@ -122,6 +120,14 @@ class LWIPRawImpl : public LWIPRawCommon {
|
||||
static err_t s_recv_fn(void *arg, struct tcp_pcb *pcb, struct pbuf *pb, err_t err);
|
||||
|
||||
protected:
|
||||
// True when the socket could receive data but none has arrived yet.
|
||||
// Safe to call without LWIP_LOCK — only null-checks pointers and reads a bool,
|
||||
// all atomic on ARM/Xtensa. A stale value is harmless: the caller either does
|
||||
// an unnecessary wait (stale true) or skips it (stale false), and the
|
||||
// authoritative recheck happens under LWIP_LOCK afterward.
|
||||
bool waiting_for_data_() const { return this->rx_buf_ == nullptr && !this->rx_closed_ && this->pcb_ != nullptr; }
|
||||
void wait_for_data_();
|
||||
ssize_t read_locked_(void *buf, size_t len);
|
||||
ssize_t internal_write_(const void *buf, size_t len);
|
||||
int internal_output_();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user