[core] decouple main loop cadence from scheduler wake timing (#15792)

2026-05-25 18:47:56 +08:00 · 2026-04-21 14:48:21 +02:00
parent 1504ac3d19
commit e4f413adad
15 changed files with 644 additions and 62 deletions
@@ -0,0 +1,19 @@
+import esphome.codegen as cg
+import esphome.config_validation as cv
+from esphome.const import CONF_ID
+
+CODEOWNERS = ["@esphome/tests"]
+
+wake_test_component_ns = cg.esphome_ns.namespace("wake_test_component")
+WakeTestComponent = wake_test_component_ns.class_("WakeTestComponent", cg.Component)
+
+CONFIG_SCHEMA = cv.Schema(
+    {
+        cv.GenerateID(): cv.declare_id(WakeTestComponent),
+    }
+).extend(cv.COMPONENT_SCHEMA)
+
+
+async def to_code(config):
+    var = cg.new_Pvariable(config[CONF_ID])
+    await cg.register_component(var, config)
@@ -0,0 +1,19 @@
+#include "wake_test_component.h"
+#include "esphome/core/application.h"
+#include "esphome/core/log.h"
+#include <chrono>
+#include <thread>
+
+namespace esphome::wake_test_component {
+
+static const char *const TAG = "wake_test_component";
+
+void WakeTestComponent::start_async_wake() {
+  ESP_LOGI(TAG, "Spawning async wake thread (50ms delay)");
+  std::thread([] {
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    App.wake_loop_threadsafe();
+  }).detach();
+}
+
+}  // namespace esphome::wake_test_component
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "esphome/core/component.h"
+#include <atomic>
+
+namespace esphome::wake_test_component {
+
+class WakeTestComponent : public Component {
+ public:
+  void setup() override {}
+  void loop() override { this->loop_count_.fetch_add(1, std::memory_order_relaxed); }
+
+  int get_loop_count() const { return this->loop_count_.load(std::memory_order_relaxed); }
+
+  // Spawn a detached thread that sleeps briefly then calls
+  // App.wake_loop_threadsafe(). Used by the integration test to verify a
+  // cross-thread wake forces a component-phase iteration even when
+  // loop_interval_ has been raised high enough to gate it off otherwise.
+  void start_async_wake();
+
+  float get_setup_priority() const override { return setup_priority::DATA; }
+
+ protected:
+  std::atomic<int> loop_count_{0};
+};
+
+}  // namespace esphome::wake_test_component
@@ -0,0 +1,60 @@
+esphome:
+  name: loop-interval-decouple
+  on_boot:
+    priority: -100
+    then:
+      - lambda: |-
+          // Raise loop_interval_ to 500ms. With the decoupling fix the
+          // component phase should run ~twice per second while the 50ms
+          // scheduler interval below still fires at its requested cadence.
+          App.set_loop_interval(500);
+      # Start measurement after 1s so boot transients settle.
+      - delay: 1000ms
+      - lambda: |-
+          id(loop_at_start) = id(loop_counter)->get_loop_count();
+          id(sched_at_start) = id(sched_count);
+          ESP_LOGI("test", "MEASUREMENT_STARTED loop=%d sched=%d",
+                   id(loop_at_start), id(sched_at_start));
+      # Observe for 2s.
+      - delay: 2000ms
+      - lambda: |-
+          int loop_delta = id(loop_counter)->get_loop_count() - id(loop_at_start);
+          int sched_delta = id(sched_count) - id(sched_at_start);
+          ESP_LOGI("test", "MEASUREMENT_DONE loop_delta=%d sched_delta=%d",
+                   loop_delta, sched_delta);
+
+host:
+api:
+logger:
+  level: INFO
+  logs:
+    loop_test_component: WARN   # Silence per-loop log spam
+
+external_components:
+  - source:
+      type: local
+      path: EXTERNAL_COMPONENT_PATH
+
+globals:
+  - id: sched_count
+    type: int
+    initial_value: "0"
+  - id: loop_at_start
+    type: int
+    initial_value: "0"
+  - id: sched_at_start
+    type: int
+    initial_value: "0"
+
+loop_test_component:
+  components:
+    - id: loop_counter
+      name: loop_counter
+
+interval:
+  # Fast scheduler interval — with the decoupling fix this should fire at
+  # its requested 50ms cadence regardless of loop_interval_.
+  - interval: 50ms
+    then:
+      - lambda: |-
+          id(sched_count) += 1;
@@ -0,0 +1,51 @@
+esphome:
+  name: loop-default-not-pulled
+  on_boot:
+    priority: -100
+    then:
+      # Leave loop_interval_ at its default (16 ms → ~62 Hz). Do NOT call
+      # set_loop_interval here. The fast scheduler interval below used to
+      # pull the component phase forward to ~128 Hz via the old
+      # std::max(next_schedule, delay_time / 2) floor.
+      # Start measurement after 1s so boot transients settle.
+      - delay: 1000ms
+      - lambda: |-
+          id(loop_at_start) = id(loop_counter)->get_loop_count();
+          ESP_LOGI("test", "MEASUREMENT_STARTED loop=%d", id(loop_at_start));
+      # Observe for 2s.
+      - delay: 2000ms
+      - lambda: |-
+          int loop_delta = id(loop_counter)->get_loop_count() - id(loop_at_start);
+          ESP_LOGI("test", "MEASUREMENT_DONE loop_delta=%d", loop_delta);
+
+host:
+api:
+logger:
+  level: INFO
+  logs:
+    loop_test_component: WARN   # Silence per-loop log spam
+
+external_components:
+  - source:
+      type: local
+      path: EXTERNAL_COMPONENT_PATH
+
+globals:
+  - id: loop_at_start
+    type: int
+    initial_value: "0"
+
+loop_test_component:
+  components:
+    - id: loop_counter
+      name: loop_counter
+
+interval:
+  # Fast scheduler interval (well under loop_interval_/2 = 8ms). In the
+  # pre-decoupling code this would have pulled the component phase forward
+  # to ~128 Hz. After the decoupling fix the component phase stays at
+  # ~62 Hz regardless.
+  - interval: 5ms
+    then:
+      - lambda: |-
+          // No-op; the presence of a due scheduler item is what matters.
@@ -0,0 +1,52 @@
+esphome:
+  name: wake-loop-phase-b
+  on_boot:
+    priority: -100
+    then:
+      - lambda: |-
+          // Raise loop_interval_ to 2000ms. Without the wake-request flag,
+          // a wake_loop_threadsafe() call would only run Phase A (scheduler)
+          // and leave the component phase gated for ~2s.
+          App.set_loop_interval(2000);
+      # Let boot transients settle.
+      - delay: 1000ms
+      - lambda: |-
+          // Snapshot the loop counter, then ask the component to spawn a
+          // background thread that calls App.wake_loop_threadsafe() after
+          // ~50ms. With the fix, that wake forces Phase B on the next tick
+          // and the counter increments well within the 500ms observation
+          // window below.
+          id(count_at_start) = id(wake_counter)->get_loop_count();
+          id(start_time) = millis();
+          id(wake_counter)->start_async_wake();
+          ESP_LOGI("test", "WAKE_STARTED count=%d", id(count_at_start));
+      # Observation window must be much shorter than loop_interval_ (2000ms)
+      # so a "false pass" isn't possible by simply waiting out the gate.
+      - delay: 500ms
+      - lambda: |-
+          int count_now = id(wake_counter)->get_loop_count();
+          int delta = count_now - id(count_at_start);
+          uint32_t elapsed = millis() - id(start_time);
+          ESP_LOGI("test", "WAKE_RESULT delta=%d elapsed=%u", delta, elapsed);
+
+host:
+api:
+logger:
+  level: INFO
+
+external_components:
+  - source:
+      type: local
+      path: EXTERNAL_COMPONENT_PATH
+    components: [wake_test_component]
+
+globals:
+  - id: count_at_start
+    type: int
+    initial_value: "0"
+  - id: start_time
+    type: uint32_t
+    initial_value: "0"
+
+wake_test_component:
+  id: wake_counter
@@ -0,0 +1,75 @@
+"""Test that loop_interval_ no longer clamps scheduler cadence.
+
+Regression test for the decoupling of Application::loop() component-phase
+cadence from scheduler wake timing.
+
+Setup:
+- App.set_loop_interval(500) — raised for power-savings style cadence
+- Scheduler interval at 50ms — should fire at 50ms regardless of loop_interval_
+- Component loop (LoopTestComponent) — should run at 500ms cadence
+
+Before the decoupling fix the old `std::max(next_schedule, delay_time / 2)`
+floor clamped the sleep to ~250ms, so the 50ms scheduler only fired ~8 times
+per 2s (vs the ~40 expected). After the fix the scheduler fires close to its
+requested cadence while the component phase stays gated at loop_interval_.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import re
+
+import pytest
+
+from .types import APIClientConnectedFactory, RunCompiledFunction
+
+
+@pytest.mark.asyncio
+async def test_loop_interval_decoupling(
+    yaml_config: str,
+    run_compiled: RunCompiledFunction,
+    api_client_connected: APIClientConnectedFactory,
+) -> None:
+    """Raised loop_interval_ must not clamp scheduler item cadence."""
+    loop = asyncio.get_running_loop()
+    measurement_done: asyncio.Future[tuple[int, int]] = loop.create_future()
+
+    def on_log_line(line: str) -> None:
+        match = re.search(r"MEASUREMENT_DONE loop_delta=(\d+) sched_delta=(\d+)", line)
+        if match and not measurement_done.done():
+            measurement_done.set_result((int(match.group(1)), int(match.group(2))))
+
+    async with (
+        run_compiled(yaml_config, line_callback=on_log_line),
+        api_client_connected() as client,
+    ):
+        device_info = await client.device_info()
+        assert device_info is not None
+        assert device_info.name == "loop-interval-decouple"
+
+        try:
+            loop_delta, sched_delta = await asyncio.wait_for(
+                measurement_done, timeout=10.0
+            )
+        except TimeoutError:
+            pytest.fail("MEASUREMENT_DONE marker never appeared")
+
+        # Observation window = 2s, loop_interval_ = 500ms.
+        # Component phase should fire ~4 times in 2s. The upper bound must be
+        # less than 8: the pre-decoupling behavior clamped to ~250ms cadence
+        # giving ~8 loops/2s, so allowing 8 would let the old behavior pass.
+        # Lower bound 3 (not 2) keeps the test honest: a >30% slowdown from
+        # the ~4 nominal is not normal CI jitter and should fail.
+        assert 3 <= loop_delta <= 6, (
+            f"Component loop should fire ~4 times in 2s at loop_interval=500ms, "
+            f"got {loop_delta}"
+        )
+
+        # Scheduler interval = 50ms → ~40 fires in 2s. Before the decoupling
+        # fix this clamped to ~8 fires. Assert >= 20 to catch the old clamped
+        # behavior with comfortable jitter headroom for slow CI hosts.
+        assert sched_delta >= 20, (
+            f"50ms scheduler interval should fire ~40 times in 2s but only "
+            f"fired {sched_delta}. This indicates loop_interval_ is still "
+            f"clamping scheduler cadence."
+        )
@@ -0,0 +1,67 @@
+"""Test that a fast scheduler item does not pull the component phase forward.
+
+Regression test for the original ~128 Hz → ~62 Hz bug fixed by decoupling
+Application::loop() component-phase cadence from scheduler wake timing.
+
+Setup:
+- loop_interval_ left at its default (16 ms → ~62 Hz component phase).
+- Scheduler interval at 5 ms (well under the old loop_interval_/2 = 8 ms floor).
+
+Before the decoupling fix the ``std::max(next_schedule, delay_time / 2)`` floor
+clamped the sleep to ~8 ms whenever any scheduler item was due sooner than
+loop_interval_/2. That pulled the component phase forward to ~128 Hz — twice
+what the documented ~62 Hz default promised. After the fix the component
+phase stays at ~62 Hz regardless of scheduler activity.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import re
+
+import pytest
+
+from .types import APIClientConnectedFactory, RunCompiledFunction
+
+
+@pytest.mark.asyncio
+async def test_loop_interval_default_not_pulled_forward(
+    yaml_config: str,
+    run_compiled: RunCompiledFunction,
+    api_client_connected: APIClientConnectedFactory,
+) -> None:
+    """Fast scheduler item must not pull component phase past default ~62 Hz."""
+    loop = asyncio.get_running_loop()
+    measurement_done: asyncio.Future[int] = loop.create_future()
+
+    def on_log_line(line: str) -> None:
+        match = re.search(r"MEASUREMENT_DONE loop_delta=(\d+)", line)
+        if match and not measurement_done.done():
+            measurement_done.set_result(int(match.group(1)))
+
+    async with (
+        run_compiled(yaml_config, line_callback=on_log_line),
+        api_client_connected() as client,
+    ):
+        device_info = await client.device_info()
+        assert device_info is not None
+        assert device_info.name == "loop-default-not-pulled"
+
+        try:
+            loop_delta = await asyncio.wait_for(measurement_done, timeout=10.0)
+        except TimeoutError:
+            pytest.fail("MEASUREMENT_DONE marker never appeared")
+
+        # Observation window = 2s, loop_interval_ default = 16ms → ~62 Hz →
+        # ~125 component-phase iterations expected.
+        # Pre-fix behavior: the 5 ms scheduler interval tripped the old
+        # delay_time/2 = 8 ms floor, pulling the phase to ~128 Hz → ~256.
+        # Upper bound 180 is comfortably below the ~256 pre-fix rate but
+        # above the ~125 nominal with CI jitter.
+        # Lower bound 80 covers very slow CI hosts without permitting a
+        # complete regression.
+        assert 80 <= loop_delta <= 180, (
+            f"Component loop at default loop_interval_ should fire ~125 times "
+            f"in 2s (≈62 Hz × 2s); got {loop_delta}. Values >200 indicate the "
+            f"scheduler is again pulling the component phase forward."
+        )
@@ -0,0 +1,76 @@
+"""Test that wake_loop_threadsafe() forces a component-phase iteration.
+
+Regression test for the wake-request flag added to Application::loop()'s
+Phase A / Phase B gate. Background producers (MQTT RX, USB RX, BLE event,
+etc.) call App.wake_loop_threadsafe() expecting their component's loop()
+to drain queued work; if the component phase stays gated by loop_interval_,
+the work waits up to loop_interval_ ms instead of running on the next tick.
+
+Setup:
+- App.set_loop_interval(2000) — a wide gate that would clearly mask the bug.
+- A test component spawns a detached std::thread that sleeps 50 ms and then
+  calls App.wake_loop_threadsafe() from a non-main thread.
+- The on_boot block snapshots the component's loop counter before/after a
+  500 ms observation window.
+
+Without the fix, delta=0 (the gate holds Phase B for ~2 s).
+With the fix, delta>=1 (the wake forces Phase B within one tick of the wake).
+"""
+
+from __future__ import annotations
+
+import asyncio
+from pathlib import Path
+import re
+
+import pytest
+
+from .types import APIClientConnectedFactory, RunCompiledFunction
+
+
+@pytest.mark.asyncio
+async def test_wake_loop_forces_phase_b(
+    yaml_config: str,
+    run_compiled: RunCompiledFunction,
+    api_client_connected: APIClientConnectedFactory,
+) -> None:
+    """A wake_loop_threadsafe() call from a background thread must trigger the
+    component phase within the next tick, even when loop_interval_ is raised
+    well above the observation window."""
+    external_components_path = str(
+        Path(__file__).parent / "fixtures" / "external_components"
+    )
+    yaml_config = yaml_config.replace(
+        "EXTERNAL_COMPONENT_PATH", external_components_path
+    )
+
+    loop = asyncio.get_running_loop()
+    result: asyncio.Future[tuple[int, int]] = loop.create_future()
+
+    def on_log_line(line: str) -> None:
+        match = re.search(r"WAKE_RESULT delta=(\d+) elapsed=(\d+)", line)
+        if match and not result.done():
+            result.set_result((int(match.group(1)), int(match.group(2))))
+
+    async with (
+        run_compiled(yaml_config, line_callback=on_log_line),
+        api_client_connected() as client,
+    ):
+        device_info = await client.device_info()
+        assert device_info is not None
+        assert device_info.name == "wake-loop-phase-b"
+
+        try:
+            delta, elapsed = await asyncio.wait_for(result, timeout=15.0)
+        except TimeoutError:
+            pytest.fail("WAKE_RESULT marker never appeared")
+
+        # Without the fix, delta would be 0 — loop_interval_=2000ms held
+        # Phase B off for the full 500ms observation window. With the fix
+        # the wake from the background thread (~50ms after start) forces
+        # Phase B on the next tick, so the counter increments at least once.
+        assert delta >= 1, (
+            f"wake_loop_threadsafe() from a background thread should force "
+            f"Phase B within the next tick; observed delta={delta} after "
+            f"{elapsed}ms with loop_interval_=2000ms"
+        )