diff --git a/esphome/components/esphome/ota/ota_esphome.cpp b/esphome/components/esphome/ota/ota_esphome.cpp index a1cdf59d2b7..d8dbe2dee2d 100644 --- a/esphome/components/esphome/ota/ota_esphome.cpp +++ b/esphome/components/esphome/ota/ota_esphome.cpp @@ -18,6 +18,7 @@ #include #include +#include namespace esphome { @@ -238,6 +239,31 @@ void ESPHomeOTAComponent::handle_data_() { /// and reboots on success. /// /// Authentication has already been handled in the non-blocking states AUTH_SEND/AUTH_READ. + /// + /// Socket I/O strategy: + /// + /// Before this function, the handshake states use non-blocking I/O: + /// read()/write() return immediately with EWOULDBLOCK if no data + /// loop() retries on next iteration (~16ms), no delay needed + /// + /// This function switches to blocking mode with SO_RCVTIMEO/SO_SNDTIMEO: + /// + /// Path | Wait mechanism | WDT strategy + /// --------------|------------------------|--------------------------- + /// Main read | SO_RCVTIMEO (2s block) | feed_wdt() only, no delay + /// readall_() | SO_RCVTIMEO (2s block) | feed_wdt() + delay(0) + /// writeall_() | SO_SNDTIMEO (2s block) | feed_wdt() + delay(1) + /// + /// readall_() uses delay(0) because SO_RCVTIMEO already waited — just yield. + /// writeall_() uses delay(1) because on raw TCP (ESP8266, RP2040) writes + /// never block (tcp_write returns immediately), so delay(1) prevents spinning. + /// + /// Platform details: + /// BSD sockets (ESP32): setblocking(true) makes read/write block + /// lwip sockets (LT): setblocking(true) makes read/write block + /// Raw TCP (8266, RP2040): setblocking is no-op; SO_RCVTIMEO uses + /// socket_delay()/socket_wake() in read(); + /// write() always returns immediately ota::OTAResponseTypes error_code = ota::OTA_RESPONSE_ERROR_UNKNOWN; bool update_started = false; size_t total = 0; @@ -249,6 +275,14 @@ void ESPHomeOTAComponent::handle_data_() { size_t size_acknowledged = 0; #endif + // Set socket timeouts and blocking mode (see strategy table above) + struct timeval tv; + tv.tv_sec = 2; + tv.tv_usec = 0; + this->client_->setsockopt(SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + this->client_->setsockopt(SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)); + this->client_->setblocking(true); + // Acknowledge auth OK - 1 byte this->write_byte_(ota::OTA_RESPONSE_AUTH_OK); @@ -299,7 +333,8 @@ void ESPHomeOTAComponent::handle_data_() { ssize_t read = this->client_->read(buf, requested); if (read == -1) { if (this->would_block_(errno)) { - this->yield_and_feed_watchdog_(); + // read() already waited up to SO_RCVTIMEO for data, just feed WDT + App.feed_wdt(); continue; } ESP_LOGW(TAG, "Read err %d", errno); @@ -401,7 +436,9 @@ bool ESPHomeOTAComponent::readall_(uint8_t *buf, size_t len) { } else { at += read; } - this->yield_and_feed_watchdog_(); + // read() already waited via SO_RCVTIMEO, just yield without 1ms stall + App.feed_wdt(); + delay(0); } return true; @@ -422,10 +459,13 @@ bool ESPHomeOTAComponent::writeall_(const uint8_t *buf, size_t len) { ESP_LOGW(TAG, "Write err %zu bytes, errno %d", len, errno); return false; } + // EWOULDBLOCK: on raw TCP writes never block, delay(1) prevents spinning + this->yield_and_feed_watchdog_(); } else { at += written; + // write() may block up to SO_SNDTIMEO on BSD/lwip sockets, feed WDT + App.feed_wdt(); } - this->yield_and_feed_watchdog_(); } return true; } diff --git a/esphome/components/socket/headers.h b/esphome/components/socket/headers.h index 16e4d23d3ba..0eece6480f6 100644 --- a/esphome/components/socket/headers.h +++ b/esphome/components/socket/headers.h @@ -51,6 +51,8 @@ #define SO_REUSEADDR 0x0004 /* Allow local address reuse */ #define SO_KEEPALIVE 0x0008 /* keep connections alive */ #define SO_BROADCAST 0x0020 /* permit to send and to receive broadcast messages (see IP_SOF_BROADCAST option) */ +#define SO_RCVTIMEO 0x1006 /* receive timeout */ +#define SO_SNDTIMEO 0x1005 /* send timeout */ #define SOL_SOCKET 0xfff /* options for socket level */ diff --git a/esphome/components/socket/lwip_raw_tcp_impl.cpp b/esphome/components/socket/lwip_raw_tcp_impl.cpp index 1e03a4935c2..96328e68c73 100644 --- a/esphome/components/socket/lwip_raw_tcp_impl.cpp +++ b/esphome/components/socket/lwip_raw_tcp_impl.cpp @@ -5,6 +5,7 @@ #include #include +#include #include "esphome/core/helpers.h" #include "esphome/core/log.h" @@ -81,7 +82,9 @@ void socket_delay(uint32_t ms) { s_socket_woke = false; return; } - s_socket_woke = false; + // Don't clear s_socket_woke here — if an IRQ fires between the check above + // and the while loop below, the while condition sees it immediately. Clearing + // here would lose that wake and sleep until the timer fires. s_delay_expired = false; // Set a one-shot timer to wake us after the timeout. // add_alarm_in_ms returns >0 on success, 0 if time already passed, <0 on error. @@ -99,6 +102,7 @@ void socket_delay(uint32_t ms) { // Cancel timer if we woke early (socket data arrived before timeout) if (!s_delay_expired) cancel_alarm(alarm); + s_socket_woke = false; // consume the wake for next call } // No IRAM_ATTR equivalent needed: on RP2040, CYW43 async_context runs LWIP @@ -359,6 +363,18 @@ int LWIPRawCommon::getsockopt(int level, int optname, void *optval, socklen_t *o *optlen = 4; return 0; } + if (level == SOL_SOCKET && optname == SO_RCVTIMEO) { + if (*optlen < sizeof(struct timeval)) { + errno = EINVAL; + return -1; + } + uint32_t ms = this->recv_timeout_cs_ * 10; + auto *tv = reinterpret_cast(optval); + tv->tv_sec = ms / 1000; + tv->tv_usec = (ms % 1000) * 1000; + *optlen = sizeof(struct timeval); + return 0; + } if (level == IPPROTO_TCP && optname == TCP_NODELAY) { if (*optlen < 4) { errno = EINVAL; @@ -388,6 +404,21 @@ int LWIPRawCommon::setsockopt(int level, int optname, const void *optval, sockle // to prevent warnings return 0; } + if (level == SOL_SOCKET && optname == SO_RCVTIMEO) { + if (optlen < sizeof(struct timeval)) { + errno = EINVAL; + return -1; + } + const auto *tv = reinterpret_cast(optval); + uint32_t ms = tv->tv_sec * 1000 + tv->tv_usec / 1000; + uint32_t cs = (ms + 9) / 10; // round up to nearest centisecond + this->recv_timeout_cs_ = cs > 255 ? 255 : static_cast(cs); + return 0; + } + if (level == SOL_SOCKET && optname == SO_SNDTIMEO) { + // Raw TCP writes are non-blocking (tcp_write), so send timeout is a no-op. + return 0; + } if (level == IPPROTO_TCP && optname == TCP_NODELAY) { if (optlen != 4) { errno = EINVAL; @@ -518,8 +549,25 @@ err_t LWIPRawImpl::recv_fn(struct pbuf *pb, err_t err) { return ERR_OK; } -ssize_t LWIPRawImpl::read(void *buf, size_t len) { - LWIP_LOCK(); +void LWIPRawImpl::wait_for_data_() { + // Wait for data without holding LWIP_LOCK so recv_fn() can run on RP2040 + // (needs async_context lock). + // + // Loop until data arrives, connection closes, or the full timeout elapses. + // socket_delay() may return early due to other sockets waking the global + // socket_wake() flag, so we re-enter for the remaining time. + uint32_t timeout_ms = this->recv_timeout_cs_ * 10; + uint32_t start = millis(); + while (this->waiting_for_data_()) { + uint32_t elapsed = millis() - start; + if (elapsed >= timeout_ms) + break; + socket_delay(timeout_ms - elapsed); + } +} + +ssize_t LWIPRawImpl::read_locked_(void *buf, size_t len) { + // Caller must hold LWIP_LOCK. Copies available data from rx_buf_ into buf. if (this->pcb_ == nullptr) { errno = ECONNRESET; return -1; @@ -578,11 +626,26 @@ ssize_t LWIPRawImpl::read(void *buf, size_t len) { return read; } +ssize_t LWIPRawImpl::read(void *buf, size_t len) { + // See waiting_for_data_() for safety of unlocked reads. + if (this->recv_timeout_cs_ > 0 && this->waiting_for_data_()) { + this->wait_for_data_(); + } + + LWIP_LOCK(); + return this->read_locked_(buf, len); +} + ssize_t LWIPRawImpl::readv(const struct iovec *iov, int iovcnt) { + // See waiting_for_data_() for safety of unlocked reads. + if (this->recv_timeout_cs_ > 0 && this->waiting_for_data_()) { + this->wait_for_data_(); + } + LWIP_LOCK(); // Hold for entire scatter-gather operation ssize_t ret = 0; for (int i = 0; i < iovcnt; i++) { - ssize_t err = this->read(reinterpret_cast(iov[i].iov_base), iov[i].iov_len); + ssize_t err = this->read_locked_(reinterpret_cast(iov[i].iov_base), iov[i].iov_len); if (err == -1) { if (ret != 0) { // if we already read some don't return an error diff --git a/esphome/components/socket/lwip_raw_tcp_impl.h b/esphome/components/socket/lwip_raw_tcp_impl.h index 95931afcf3f..3c27d71062f 100644 --- a/esphome/components/socket/lwip_raw_tcp_impl.h +++ b/esphome/components/socket/lwip_raw_tcp_impl.h @@ -57,6 +57,7 @@ class LWIPRawCommon { // instead use it for determining whether to call lwip_output bool nodelay_ = false; sa_family_t family_ = 0; + uint8_t recv_timeout_cs_ = 0; // SO_RCVTIMEO in centiseconds (0 = no timeout, max 2.55s) }; /// Connected socket implementation for LWIP raw TCP. @@ -107,11 +108,8 @@ class LWIPRawImpl : public LWIPRawCommon { errno = ECONNRESET; return -1; } - if (blocking) { - // blocking operation not supported - errno = EINVAL; - return -1; - } + // Raw TCP doesn't use a blocking flag directly. Blocking behavior + // is provided by SO_RCVTIMEO which makes read() wait via socket_delay(). return 0; } int loop() { return 0; } @@ -122,6 +120,14 @@ class LWIPRawImpl : public LWIPRawCommon { static err_t s_recv_fn(void *arg, struct tcp_pcb *pcb, struct pbuf *pb, err_t err); protected: + // True when the socket could receive data but none has arrived yet. + // Safe to call without LWIP_LOCK — only null-checks pointers and reads a bool, + // all atomic on ARM/Xtensa. A stale value is harmless: the caller either does + // an unnecessary wait (stale true) or skips it (stale false), and the + // authoritative recheck happens under LWIP_LOCK afterward. + bool waiting_for_data_() const { return this->rx_buf_ == nullptr && !this->rx_closed_ && this->pcb_ != nullptr; } + void wait_for_data_(); + ssize_t read_locked_(void *buf, size_t len); ssize_t internal_write_(const void *buf, size_t len); int internal_output_();