[core] Replace scheduler pool vector with unbounded intrusive freelist

The fixed MAX_POOL_SIZE=5 cap was the source of the heap churn the pool was
meant to prevent: any device with more than 5 concurrent timers (e.g. a board
with 30+ LD2450 sensors) hit a steady-state oscillation of recycle->delete and
acquire->new on every loop iteration.

Replace std::vector<SchedulerItem*> with a singly-linked freelist threaded
through SchedulerItem::next_free, which shares storage with `component` via an
anonymous union (zero per-item overhead -- the component pointer is dead while
pooled). Drop the cap entirely: the freelist quiesces at the application's
natural concurrent-timer high-water mark, which is the working set the device
already needs while those timers are active.

No std::vector means no growth-doubling slack and no realloc copies during
warm-up. The caller of get_item_from_pool_locked_() must overwrite
item->component before unlocking (already true at the sole call site), because
a popped item still holds the freelist link in that slot. nullptr remains a
valid live `component` value for SELF_POINTER items, so it cannot serve as a
"cleared" sentinel and the pop path does not pre-clear the field.
This commit is contained in:
J. Nick Koston
2026-04-30 10:42:46 -05:00
parent 2758aa5517
commit f9b87d0ede
5 changed files with 43 additions and 73 deletions
+19 -39
View File
@@ -14,18 +14,8 @@ namespace esphome {
static const char *const TAG = "scheduler";
// Memory pool configuration constants
// Pool size of 5 matches typical usage patterns (2-4 active timers)
// - Minimal memory overhead (~250 bytes on ESP32)
// - Sufficient for most configs with a couple sensors/components
// - Still prevents heap fragmentation and allocation stalls
// - Complex setups with many timers will just allocate beyond the pool
// See https://github.com/esphome/backlog/issues/52
static constexpr size_t MAX_POOL_SIZE = 5;
// Maximum number of logically deleted (cancelled) items before forcing cleanup.
// Set to 5 to match the pool size - when we have as many cancelled items as our
// pool can hold, it's time to clean up and recycle them.
// Empirically chosen to balance cleanup overhead against tombstone accumulation in items_.
static constexpr uint32_t MAX_LOGICALLY_DELETED_ITEMS = 5;
// max delay to start an interval sequence
static constexpr uint32_t MAX_INTERVAL_DELAY = 5000;
@@ -165,7 +155,7 @@ void HOT Scheduler::set_timer_common_(Component *component, SchedulerItem::Type
delay = 1;
}
// Take lock early to protect scheduler_item_pool_ access and retry-cancelled check
// Take lock early to protect scheduler_item_pool_head_ access and retry-cancelled check
LockGuard guard{this->lock_};
// For retries, check if there's a cancelled timeout first - before allocating an item.
@@ -599,7 +589,7 @@ uint32_t HOT Scheduler::call(uint32_t now) {
if (now_64 - last_print > 2000) {
last_print = now_64;
std::vector<SchedulerItem *> old_items;
ESP_LOGD(TAG, "Items: count=%zu, pool=%zu, now=%" PRIu64, this->items_.size(), this->scheduler_item_pool_.size(),
ESP_LOGD(TAG, "Items: count=%zu, pool=%zu, now=%" PRIu64, this->items_.size(), this->scheduler_item_pool_size_,
now_64);
// Cleanup before debug output
this->cleanup_();
@@ -894,30 +884,19 @@ bool HOT Scheduler::SchedulerItem::cmp(SchedulerItem *a, SchedulerItem *b) {
: (a->next_execution_high_ > b->next_execution_high_);
}
// Recycle a SchedulerItem back to the pool for reuse.
// IMPORTANT: Caller must hold the scheduler lock before calling this function.
// This protects scheduler_item_pool_ from concurrent access by other threads
// that may be acquiring items from the pool in set_timer_common_().
// Recycle a SchedulerItem back to the freelist for reuse.
// IMPORTANT: Caller must hold the scheduler lock.
void Scheduler::recycle_item_main_loop_(SchedulerItem *item) {
if (item == nullptr)
return;
if (this->scheduler_item_pool_.size() < MAX_POOL_SIZE) {
// Clear callback to release captured resources
item->callback = nullptr;
this->scheduler_item_pool_.push_back(item);
item->callback = nullptr; // release captured resources
item->next_free = this->scheduler_item_pool_head_;
this->scheduler_item_pool_head_ = item;
this->scheduler_item_pool_size_++;
#ifdef ESPHOME_DEBUG_SCHEDULER
ESP_LOGD(TAG, "Recycled item to pool (pool size now: %zu)", this->scheduler_item_pool_.size());
ESP_LOGD(TAG, "Recycled item to pool (pool size now: %zu)", this->scheduler_item_pool_size_);
#endif
} else {
#ifdef ESPHOME_DEBUG_SCHEDULER
ESP_LOGD(TAG, "Pool full (size: %zu), deleting item", this->scheduler_item_pool_.size());
#endif
delete item;
#ifdef ESPHOME_DEBUG_SCHEDULER
this->debug_live_items_--;
#endif
}
}
#ifdef ESPHOME_DEBUG_SCHEDULER
@@ -942,14 +921,15 @@ void Scheduler::debug_log_timer_(const SchedulerItem *item, NameType name_type,
}
#endif /* ESPHOME_DEBUG_SCHEDULER */
// Helper to get or create a scheduler item from the pool
// IMPORTANT: Caller must hold the scheduler lock before calling this function.
// Pop an item from the freelist, or allocate a new one if the freelist is empty.
// IMPORTANT: the caller must hold the scheduler lock and must overwrite `item->component`
// before releasing the lock -- a popped item's slot still holds the freelist link.
Scheduler::SchedulerItem *Scheduler::get_item_from_pool_locked_() {
if (!this->scheduler_item_pool_.empty()) {
SchedulerItem *item = this->scheduler_item_pool_.back();
this->scheduler_item_pool_.pop_back();
if (this->scheduler_item_pool_head_ != nullptr) {
SchedulerItem *item = this->scheduler_item_pool_head_;
this->scheduler_item_pool_head_ = item->next_free;
this->scheduler_item_pool_size_--;
#ifdef ESPHOME_DEBUG_SCHEDULER
ESP_LOGD(TAG, "Reused item from pool (pool size now: %zu)", this->scheduler_item_pool_.size());
ESP_LOGD(TAG, "Reused item from pool (pool size now: %zu)", this->scheduler_item_pool_size_);
#endif
return item;
}
@@ -967,7 +947,7 @@ Scheduler::SchedulerItem *Scheduler::get_item_from_pool_locked_() {
bool Scheduler::debug_verify_no_leak_() const {
// Invariant: every live SchedulerItem must be in exactly one container.
// debug_live_items_ tracks allocations minus deletions.
size_t accounted = this->items_.size() + this->to_add_.size() + this->scheduler_item_pool_.size();
size_t accounted = this->items_.size() + this->to_add_.size() + this->scheduler_item_pool_size_;
#ifndef ESPHOME_THREAD_SINGLE
accounted += this->defer_queue_.size();
#endif
@@ -981,7 +961,7 @@ bool Scheduler::debug_verify_no_leak_() const {
")",
static_cast<uint32_t>(this->debug_live_items_), static_cast<uint32_t>(accounted),
static_cast<uint32_t>(this->items_.size()), static_cast<uint32_t>(this->to_add_.size()),
static_cast<uint32_t>(this->scheduler_item_pool_.size())
static_cast<uint32_t>(this->scheduler_item_pool_size_)
#ifndef ESPHOME_THREAD_SINGLE
,
static_cast<uint32_t>(this->defer_queue_.size())
+12 -12
View File
@@ -177,8 +177,12 @@ class Scheduler {
protected:
struct SchedulerItem {
// Ordered by size to minimize padding
Component *component;
// Ordered by size to minimize padding.
// Holds `component` while the item is live and `next_free` while it sits on the freelist
// headed by scheduler_item_pool_head_ (the two uses are mutually exclusive).
union {
Component *component;
SchedulerItem *next_free;
};
// Optimized name storage using tagged union - zero heap allocation
union {
const char *static_name; // For STATIC_STRING (string literals) and SELF_POINTER (caller's `this`)
@@ -713,19 +717,15 @@ class Scheduler {
#endif
}
// Memory pool for recycling SchedulerItem objects to reduce heap churn.
// Design decisions:
// - std::vector is used instead of a fixed array because many systems only need 1-2 scheduler items
// - The vector grows dynamically up to MAX_POOL_SIZE (5) only when needed, saving memory on simple setups
// - Pool size of 5 matches typical usage (2-4 timers) while keeping memory overhead low (~250 bytes on ESP32)
// - The pool significantly reduces heap fragmentation which is critical because heap allocation/deallocation
// can stall the entire system, causing timing issues and dropped events for any components that need
// to synchronize between tasks (see https://github.com/esphome/backlog/issues/52)
std::vector<SchedulerItem *> scheduler_item_pool_;
// Intrusive freelist threaded through SchedulerItem::next_free. Unbounded so it quiesces at the
// app's concurrent-timer high-water mark; the previous fixed cap caused steady-state new/delete
// churn on devices with many timers (see https://github.com/esphome/backlog/issues/52).
SchedulerItem *scheduler_item_pool_head_{nullptr};
size_t scheduler_item_pool_size_{0};
#ifdef ESPHOME_DEBUG_SCHEDULER
// Leak detection: tracks total live SchedulerItem allocations.
// Invariant: debug_live_items_ == items_.size() + to_add_.size() + defer_queue_.size() + scheduler_item_pool_.size()
// Invariant: debug_live_items_ == items_.size() + to_add_.size() + defer_queue_.size() + scheduler_item_pool_size_
// Verified periodically in call() to catch leaks early.
size_t debug_live_items_{0};
+5 -5
View File
@@ -101,8 +101,8 @@ static void Scheduler_SetTimeout(benchmark::State &state) {
Component dummy_component;
// Register 3 timeouts then call() — realistic worst case where multiple
// components schedule in the same loop iteration. Keeps item count within
// the recycling pool (MAX_POOL_SIZE=5) to avoid spurious malloc/free.
// components schedule in the same loop iteration. warm_pool fills the
// freelist so acquire/recycle never falls back to malloc.
static constexpr int kBatchSize = 3;
static_assert(kInnerIterations % kBatchSize == 0, "kInnerIterations must be divisible by kBatchSize");
warm_pool(scheduler, &dummy_component, kBatchSize, 1000);
@@ -209,9 +209,9 @@ static void Scheduler_SetTimeout_ExceedPool(benchmark::State &state) {
Scheduler scheduler;
Component dummy_component;
// Register 10 timeouts then call() — exceeds MAX_POOL_SIZE=5 to measure
// the performance cliff when the recycling pool is exhausted and items
// must be malloc'd/freed.
// Register 10 timeouts then call() — a larger working set than the 3-item
// batches above. With the unbounded freelist, warm_pool preallocates 10
// items, so this measures steady-state reuse rather than a malloc cliff.
static constexpr int kBatchSize = 10;
static_assert(kInnerIterations % kBatchSize == 0, "kInnerIterations must be divisible by kBatchSize");
warm_pool(scheduler, &dummy_component, kBatchSize, 1000);
@@ -221,14 +221,10 @@ script:
- id: test_full_pool_reuse
then:
- lambda: |-
ESP_LOGI("test", "Phase 6: Testing pool size limits after Phase 5 items complete");
ESP_LOGI("test", "Phase 6: Testing pool reuse after Phase 5 items complete");
// At this point, all Phase 5 timeouts should have completed and been recycled.
// The pool should be at its maximum size (5).
// Creating 10 new items tests that:
// - First 5 items reuse from the pool
// - Remaining 5 items allocate new (pool empty)
// - Pool doesn't grow beyond MAX_POOL_SIZE of 5
// Phase 5 timeouts have completed and been recycled. The freelist is unbounded;
// creating 10 new items reuses entries from it and allocates fresh ones only once it is empty.
auto *component = id(test_sensor);
int full_reuse_count = 10;
+4 -10
View File
@@ -180,16 +180,10 @@ async def test_scheduler_pool(
# Verify pool behavior
assert pool_recycle_count > 0, "Should have recycled items to pool"
# Check pool metrics
if pool_recycle_count > 0:
max_pool_size = 0
for line in log_lines:
if match := recycle_pattern.search(line):
size = int(match.group(1))
max_pool_size = max(max_pool_size, size)
# Pool can grow up to its maximum of 5
assert max_pool_size <= 5, f"Pool grew beyond maximum ({max_pool_size})"
# Pool is unbounded; the cap was the source of the churn it was meant to prevent.
assert pool_full_count == 0, (
f"Pool should never report full (got {pool_full_count})"
)
# Log summary for debugging
print("\nScheduler Pool Test Summary (Python Orchestrated):")