[core] Replace scheduler pool vector with unbounded intrusive freelist

The fixed MAX_POOL_SIZE=5 cap was the source of the heap churn the pool was meant to prevent: any device with more than 5 concurrent timers (e.g. a board with 30+ LD2450 sensors) hit a steady-state oscillation of recycle->delete and acquire->new on every loop iteration. Replace std::vector<SchedulerItem*> with a singly-linked freelist threaded through SchedulerItem::next_free, which shares storage with `component` via an anonymous union (zero per-item overhead -- the component pointer is dead while pooled). Drop the cap entirely: the freelist quiesces at the application's natural concurrent-timer high-water mark, which is the working set the device already needs while those timers are active. No std::vector means no growth-doubling slack and no realloc copies during warm-up. The caller of get_item_from_pool_locked_() must overwrite item->component before releasing the lock (already true at the sole call site), because a popped item still holds the freelist link in that slot. nullptr remains a valid live `component` value for SELF_POINTER items, so pre-clearing the slot to nullptr would not distinguish a stale item from a live one.
2026-05-19 03:01:49 +08:00 · 2026-04-30 10:42:46 -05:00
parent 2758aa5517
commit f9b87d0ede
5 changed files with 43 additions and 73 deletions
@@ -14,18 +14,8 @@ namespace esphome {

 static const char *const TAG = "scheduler";

-// Memory pool configuration constants
-// Pool size of 5 matches typical usage patterns (2-4 active timers)
-// - Minimal memory overhead (~250 bytes on ESP32)
-// - Sufficient for most configs with a couple sensors/components
-// - Still prevents heap fragmentation and allocation stalls
-// - Complex setups with many timers will just allocate beyond the pool
-// See https://github.com/esphome/backlog/issues/52
-static constexpr size_t MAX_POOL_SIZE = 5;
-
 // Maximum number of logically deleted (cancelled) items before forcing cleanup.
-// Set to 5 to match the pool size - when we have as many cancelled items as our
-// pool can hold, it's time to clean up and recycle them.
+// Value kept from when it matched the old pool cap; trades cleanup overhead against tombstone accumulation in items_.
 static constexpr uint32_t MAX_LOGICALLY_DELETED_ITEMS = 5;
 // max delay to start an interval sequence
 static constexpr uint32_t MAX_INTERVAL_DELAY = 5000;
@@ -165,7 +155,7 @@ void HOT Scheduler::set_timer_common_(Component *component, SchedulerItem::Type
    delay = 1;
  }

-  // Take lock early to protect scheduler_item_pool_ access and retry-cancelled check
+  // Take lock early to protect scheduler_item_pool_head_ access and retry-cancelled check
  LockGuard guard{this->lock_};

  // For retries, check if there's a cancelled timeout first - before allocating an item.
@@ -599,7 +589,7 @@ uint32_t HOT Scheduler::call(uint32_t now) {
  if (now_64 - last_print > 2000) {
    last_print = now_64;
    std::vector<SchedulerItem *> old_items;
-    ESP_LOGD(TAG, "Items: count=%zu, pool=%zu, now=%" PRIu64, this->items_.size(), this->scheduler_item_pool_.size(),
+    ESP_LOGD(TAG, "Items: count=%zu, pool=%zu, now=%" PRIu64, this->items_.size(), this->scheduler_item_pool_size_,
             now_64);
    // Cleanup before debug output
    this->cleanup_();
@@ -894,30 +884,19 @@ bool HOT Scheduler::SchedulerItem::cmp(SchedulerItem *a, SchedulerItem *b) {
                                                              : (a->next_execution_high_ > b->next_execution_high_);
 }

-// Recycle a SchedulerItem back to the pool for reuse.
-// IMPORTANT: Caller must hold the scheduler lock before calling this function.
-// This protects scheduler_item_pool_ from concurrent access by other threads
-// that may be acquiring items from the pool in set_timer_common_().
+// Recycle a SchedulerItem back to the freelist for reuse.
+// IMPORTANT: Caller must hold the scheduler lock.
 void Scheduler::recycle_item_main_loop_(SchedulerItem *item) {
  if (item == nullptr)
    return;

-  if (this->scheduler_item_pool_.size() < MAX_POOL_SIZE) {
-    // Clear callback to release captured resources
-    item->callback = nullptr;
-    this->scheduler_item_pool_.push_back(item);
+  item->callback = nullptr;  // release captured resources
+  item->next_free = this->scheduler_item_pool_head_;
+  this->scheduler_item_pool_head_ = item;
+  this->scheduler_item_pool_size_++;
 #ifdef ESPHOME_DEBUG_SCHEDULER
-    ESP_LOGD(TAG, "Recycled item to pool (pool size now: %zu)", this->scheduler_item_pool_.size());
+  ESP_LOGD(TAG, "Recycled item to pool (pool size now: %zu)", this->scheduler_item_pool_size_);
 #endif
-  } else {
-#ifdef ESPHOME_DEBUG_SCHEDULER
-    ESP_LOGD(TAG, "Pool full (size: %zu), deleting item", this->scheduler_item_pool_.size());
-#endif
-    delete item;
-#ifdef ESPHOME_DEBUG_SCHEDULER
-    this->debug_live_items_--;
-#endif
-  }
 }

 #ifdef ESPHOME_DEBUG_SCHEDULER
@@ -942,14 +921,15 @@ void Scheduler::debug_log_timer_(const SchedulerItem *item, NameType name_type,
 }
 #endif /* ESPHOME_DEBUG_SCHEDULER */

-// Helper to get or create a scheduler item from the pool
-// IMPORTANT: Caller must hold the scheduler lock before calling this function.
+// Pop from freelist or allocate. IMPORTANT: caller must hold the lock and must overwrite
+// `item->component` before releasing the lock -- the popped slot still holds the freelist link.
 Scheduler::SchedulerItem *Scheduler::get_item_from_pool_locked_() {
-  if (!this->scheduler_item_pool_.empty()) {
-    SchedulerItem *item = this->scheduler_item_pool_.back();
-    this->scheduler_item_pool_.pop_back();
+  if (this->scheduler_item_pool_head_ != nullptr) {
+    SchedulerItem *item = this->scheduler_item_pool_head_;
+    this->scheduler_item_pool_head_ = item->next_free;
+    this->scheduler_item_pool_size_--;
 #ifdef ESPHOME_DEBUG_SCHEDULER
-    ESP_LOGD(TAG, "Reused item from pool (pool size now: %zu)", this->scheduler_item_pool_.size());
+    ESP_LOGD(TAG, "Reused item from pool (pool size now: %zu)", this->scheduler_item_pool_size_);
 #endif
    return item;
  }
@@ -967,7 +947,7 @@ Scheduler::SchedulerItem *Scheduler::get_item_from_pool_locked_() {
 bool Scheduler::debug_verify_no_leak_() const {
  // Invariant: every live SchedulerItem must be in exactly one container.
  // debug_live_items_ tracks allocations minus deletions.
-  size_t accounted = this->items_.size() + this->to_add_.size() + this->scheduler_item_pool_.size();
+  size_t accounted = this->items_.size() + this->to_add_.size() + this->scheduler_item_pool_size_;
 #ifndef ESPHOME_THREAD_SINGLE
  accounted += this->defer_queue_.size();
 #endif
@@ -981,7 +961,7 @@ bool Scheduler::debug_verify_no_leak_() const {
             ")",
             static_cast<uint32_t>(this->debug_live_items_), static_cast<uint32_t>(accounted),
             static_cast<uint32_t>(this->items_.size()), static_cast<uint32_t>(this->to_add_.size()),
-             static_cast<uint32_t>(this->scheduler_item_pool_.size())
+             static_cast<uint32_t>(this->scheduler_item_pool_size_)
 #ifndef ESPHOME_THREAD_SINGLE
                 ,
             static_cast<uint32_t>(this->defer_queue_.size())
@@ -177,8 +177,12 @@ class Scheduler {

 protected:
  struct SchedulerItem {
-    // Ordered by size to minimize padding
-    Component *component;
+    // Ordered by size to minimize padding.
+    // `component` while live; `next_free` while on the scheduler_item_pool_head_ freelist (mutually exclusive).
+    union {
+      Component *component;
+      SchedulerItem *next_free;
+    };
    // Optimized name storage using tagged union - zero heap allocation
    union {
      const char *static_name;  // For STATIC_STRING (string literals) and SELF_POINTER (caller's `this`)
@@ -713,19 +717,15 @@ class Scheduler {
 #endif
  }

-  // Memory pool for recycling SchedulerItem objects to reduce heap churn.
-  // Design decisions:
-  // - std::vector is used instead of a fixed array because many systems only need 1-2 scheduler items
-  // - The vector grows dynamically up to MAX_POOL_SIZE (5) only when needed, saving memory on simple setups
-  // - Pool size of 5 matches typical usage (2-4 timers) while keeping memory overhead low (~250 bytes on ESP32)
-  // - The pool significantly reduces heap fragmentation which is critical because heap allocation/deallocation
-  //   can stall the entire system, causing timing issues and dropped events for any components that need
-  //   to synchronize between tasks (see https://github.com/esphome/backlog/issues/52)
-  std::vector<SchedulerItem *> scheduler_item_pool_;
+  // Intrusive freelist threaded through SchedulerItem::next_free. Unbounded so it quiesces at the
+  // app's concurrent-timer high-water mark; the previous fixed cap caused steady-state new/delete
+  // churn on devices with many timers (see https://github.com/esphome/backlog/issues/52).
+  SchedulerItem *scheduler_item_pool_head_{nullptr};
+  size_t scheduler_item_pool_size_{0};

 #ifdef ESPHOME_DEBUG_SCHEDULER
  // Leak detection: tracks total live SchedulerItem allocations.
-  // Invariant: debug_live_items_ == items_.size() + to_add_.size() + defer_queue_.size() + scheduler_item_pool_.size()
+  // Invariant: debug_live_items_ == items_.size() + to_add_.size() + defer_queue_.size() + scheduler_item_pool_size_
  // Verified periodically in call() to catch leaks early.
  size_t debug_live_items_{0};

@@ -101,8 +101,8 @@ static void Scheduler_SetTimeout(benchmark::State &state) {
  Component dummy_component;

  // Register 3 timeouts then call() — realistic worst case where multiple
-  // components schedule in the same loop iteration. Keeps item count within
-  // the recycling pool (MAX_POOL_SIZE=5) to avoid spurious malloc/free.
+  // components schedule in the same loop iteration. warm_pool fills the
+  // freelist so acquire/recycle never falls back to malloc.
  static constexpr int kBatchSize = 3;
  static_assert(kInnerIterations % kBatchSize == 0, "kInnerIterations must be divisible by kBatchSize");
  warm_pool(scheduler, &dummy_component, kBatchSize, 1000);
@@ -209,9 +209,9 @@ static void Scheduler_SetTimeout_ExceedPool(benchmark::State &state) {
  Scheduler scheduler;
  Component dummy_component;

-  // Register 10 timeouts then call() — exceeds MAX_POOL_SIZE=5 to measure
-  // the performance cliff when the recycling pool is exhausted and items
-  // must be malloc'd/freed.
+  // Register 10 timeouts then call() — larger working set than the 3-item
+  // batches above. With the unbounded freelist, warm_pool preallocates 10
+  // items so this measures steady-state reuse, not a malloc cliff.
  static constexpr int kBatchSize = 10;
  static_assert(kInnerIterations % kBatchSize == 0, "kInnerIterations must be divisible by kBatchSize");
  warm_pool(scheduler, &dummy_component, kBatchSize, 1000);
@@ -221,14 +221,10 @@ script:
  - id: test_full_pool_reuse
    then:
      - lambda: |-
-          ESP_LOGI("test", "Phase 6: Testing pool size limits after Phase 5 items complete");
+          ESP_LOGI("test", "Phase 6: Testing pool reuse after Phase 5 items complete");

-          // At this point, all Phase 5 timeouts should have completed and been recycled.
-          // The pool should be at its maximum size (5).
-          // Creating 10 new items tests that:
-          // - First 5 items reuse from the pool
-          // - Remaining 5 items allocate new (pool empty)
-          // - Pool doesn't grow beyond MAX_POOL_SIZE of 5
+          // Phase 5 timeouts have completed and been recycled. The freelist is unbounded;
+          // creating 10 new items reuses from it and only allocates fresh when empty.

          auto *component = id(test_sensor);
          int full_reuse_count = 10;
@@ -180,16 +180,10 @@ async def test_scheduler_pool(
    # Verify pool behavior
    assert pool_recycle_count > 0, "Should have recycled items to pool"

-    # Check pool metrics
-    if pool_recycle_count > 0:
-        max_pool_size = 0
-        for line in log_lines:
-            if match := recycle_pattern.search(line):
-                size = int(match.group(1))
-                max_pool_size = max(max_pool_size, size)
-
-        # Pool can grow up to its maximum of 5
-        assert max_pool_size <= 5, f"Pool grew beyond maximum ({max_pool_size})"
+    # Pool is unbounded; the cap was the source of the churn it was meant to prevent.
+    assert pool_full_count == 0, (
+        f"Pool should never report full (got {pool_full_count})"
+    )

    # Log summary for debugging
    print("\nScheduler Pool Test Summary (Python Orchestrated):")