From 8ca6ee4349acb28878d4c15ef32d67999b96f139 Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Tue, 10 Mar 2026 15:25:26 -0500 Subject: [PATCH] [speaker_source] Add new media player (#14649) Co-authored-by: J. Nick Koston --- CODEOWNERS | 1 + esphome/components/speaker_source/__init__.py | 0 .../components/speaker_source/media_player.py | 212 +++++++ .../speaker_source_media_player.cpp | 546 ++++++++++++++++++ .../speaker_source_media_player.h | 217 +++++++ tests/components/speaker_source/common.yaml | 43 ++ .../speaker_source/test.esp32-idf.yaml | 9 + tests/components/speaker_source/test.wav | Bin 0 -> 46 bytes 8 files changed, 1028 insertions(+) create mode 100644 esphome/components/speaker_source/__init__.py create mode 100644 esphome/components/speaker_source/media_player.py create mode 100644 esphome/components/speaker_source/speaker_source_media_player.cpp create mode 100644 esphome/components/speaker_source/speaker_source_media_player.h create mode 100644 tests/components/speaker_source/common.yaml create mode 100644 tests/components/speaker_source/test.esp32-idf.yaml create mode 100644 tests/components/speaker_source/test.wav diff --git a/CODEOWNERS b/CODEOWNERS index a95e100cbf..12aff01e73 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -459,6 +459,7 @@ esphome/components/sonoff_d1/* @anatoly-savchenkov esphome/components/sound_level/* @kahrendt esphome/components/speaker/* @jesserockz @kahrendt esphome/components/speaker/media_player/* @kahrendt @synesthesiam +esphome/components/speaker_source/* @kahrendt esphome/components/spi/* @clydebarrow @esphome/core esphome/components/spi_device/* @clydebarrow esphome/components/spi_led_strip/* @clydebarrow diff --git a/esphome/components/speaker_source/__init__.py b/esphome/components/speaker_source/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/esphome/components/speaker_source/media_player.py b/esphome/components/speaker_source/media_player.py new file mode 100644 index 
0000000000..a44cdcbf01 --- /dev/null +++ b/esphome/components/speaker_source/media_player.py @@ -0,0 +1,212 @@ +from esphome import automation +import esphome.codegen as cg +from esphome.components import audio, media_player, media_source, speaker +import esphome.config_validation as cv +from esphome.const import ( + CONF_FORMAT, + CONF_ID, + CONF_NUM_CHANNELS, + CONF_SAMPLE_RATE, + CONF_SPEAKER, +) +from esphome.core.entity_helpers import inherit_property_from +from esphome.types import ConfigType + +AUTO_LOAD = ["audio"] +DEPENDENCIES = ["media_source", "speaker"] + +CODEOWNERS = ["@kahrendt"] + +CONF_MEDIA_PIPELINE = "media_pipeline" +CONF_ON_MUTE = "on_mute" +CONF_ON_UNMUTE = "on_unmute" +CONF_ON_VOLUME = "on_volume" +CONF_SOURCES = "sources" +CONF_VOLUME_INCREMENT = "volume_increment" +CONF_VOLUME_INITIAL = "volume_initial" +CONF_VOLUME_MAX = "volume_max" +CONF_VOLUME_MIN = "volume_min" + +speaker_source_ns = cg.esphome_ns.namespace("speaker_source") + +SpeakerSourceMediaPlayer = speaker_source_ns.class_( + "SpeakerSourceMediaPlayer", cg.Component, media_player.MediaPlayer +) + +PipelineContext = speaker_source_ns.struct("PipelineContext") + +Pipeline = speaker_source_ns.enum("Pipeline") + + +FORMAT_MAPPING = { + "FLAC": "flac", + "MP3": "mp3", + "OPUS": "opus", + "WAV": "wav", +} + + +# Returns a media_player.MediaPlayerSupportedFormat struct with the configured +# format, sample rate, number of channels, purpose, and bytes per sample +def _get_supported_format_struct(pipeline: ConfigType): + args = [ + media_player.MediaPlayerSupportedFormat, + ] + + args.append(("format", FORMAT_MAPPING[pipeline[CONF_FORMAT]])) + + args.append(("sample_rate", pipeline[CONF_SAMPLE_RATE])) + args.append(("num_channels", pipeline[CONF_NUM_CHANNELS])) + args.append(("purpose", media_player.MEDIA_PLAYER_FORMAT_PURPOSE_ENUM["default"])) + + # Omit sample_bytes for MP3: ffmpeg transcoding in Home Assistant fails + # if the number of bytes per sample is specified for MP3. 
+ if pipeline[CONF_FORMAT] != "MP3": + args.append(("sample_bytes", 2)) + + return cg.StructInitializer(*args) + + +def _validate_pipeline(config: ConfigType) -> ConfigType: + # Inherit settings from speaker if not manually set + inherit_property_from(CONF_NUM_CHANNELS, CONF_SPEAKER)(config) + inherit_property_from(CONF_SAMPLE_RATE, CONF_SPEAKER)(config) + + # Opus only supports 48 kHz + if config.get(CONF_FORMAT) == "OPUS" and config.get(CONF_SAMPLE_RATE) != 48000: + raise cv.Invalid("Opus only supports a sample rate of 48000 Hz") + + audio.final_validate_audio_schema( + "speaker_source media_player", + audio_device=CONF_SPEAKER, + bits_per_sample=16, + channels=config.get(CONF_NUM_CHANNELS), + sample_rate=config.get(CONF_SAMPLE_RATE), + )(config) + + return config + + +PIPELINE_SCHEMA = cv.Schema( + { + cv.GenerateID(): cv.declare_id( + PipelineContext + ), # Needed to inherit audio settings from the speaker + cv.Required(CONF_SPEAKER): cv.use_id(speaker.Speaker), + cv.Required(CONF_SOURCES): cv.All( + cv.ensure_list(cv.use_id(media_source.MediaSource)), + cv.Length(min=1), + ), + cv.Optional(CONF_FORMAT, default="FLAC"): cv.enum(audio.AUDIO_FILE_TYPE_ENUM), + cv.Optional(CONF_SAMPLE_RATE): cv.int_range(min=1), + cv.Optional(CONF_NUM_CHANNELS): cv.int_range(1, 2), + } +) + + +def _validate_volume_settings(config: ConfigType) -> ConfigType: + # CONF_VOLUME_INITIAL is in the scaled volume domain (0.0-1.0) and doesn't need to be validated + if config[CONF_VOLUME_MIN] > config[CONF_VOLUME_MAX]: + raise cv.Invalid( + f"{CONF_VOLUME_MIN} ({config[CONF_VOLUME_MIN]}) must be less than or equal to {CONF_VOLUME_MAX} ({config[CONF_VOLUME_MAX]})" + ) + return config + + +CONFIG_SCHEMA = cv.All( + cv.Schema( + { + cv.Optional(CONF_VOLUME_INCREMENT, default=0.05): cv.percentage, + cv.Optional(CONF_VOLUME_INITIAL, default=0.5): cv.percentage, + cv.Optional(CONF_VOLUME_MAX, default=1.0): cv.percentage, + cv.Optional(CONF_VOLUME_MIN, default=0.0): cv.percentage, + 
cv.Required(CONF_MEDIA_PIPELINE): PIPELINE_SCHEMA, + cv.Optional(CONF_ON_MUTE): automation.validate_automation(single=True), + cv.Optional(CONF_ON_UNMUTE): automation.validate_automation(single=True), + cv.Optional(CONF_ON_VOLUME): automation.validate_automation(single=True), + } + ) + .extend(cv.COMPONENT_SCHEMA) + .extend(media_player.media_player_schema(SpeakerSourceMediaPlayer)), + cv.only_on_esp32, + _validate_volume_settings, +) + + +def _final_validate_codecs(config: ConfigType) -> ConfigType: + pipeline = config[CONF_MEDIA_PIPELINE] + fmt = pipeline[CONF_FORMAT] + if fmt == "NONE": + audio.request_flac_support() + audio.request_mp3_support() + audio.request_opus_support() + elif fmt == "FLAC": + audio.request_flac_support() + elif fmt == "MP3": + audio.request_mp3_support() + elif fmt == "OPUS": + audio.request_opus_support() + + return config + + +FINAL_VALIDATE_SCHEMA = cv.All( + cv.Schema( + { + cv.Required(CONF_MEDIA_PIPELINE): _validate_pipeline, + }, + extra=cv.ALLOW_EXTRA, + ), + _final_validate_codecs, +) + + +async def to_code(config: ConfigType) -> None: + var = cg.new_Pvariable(config[CONF_ID]) + await cg.register_component(var, config) + await media_player.register_media_player(var, config) + + cg.add(var.set_volume_increment(config[CONF_VOLUME_INCREMENT])) + cg.add(var.set_volume_initial(config[CONF_VOLUME_INITIAL])) + cg.add(var.set_volume_max(config[CONF_VOLUME_MAX])) + cg.add(var.set_volume_min(config[CONF_VOLUME_MIN])) + + pipeline_config = config[CONF_MEDIA_PIPELINE] + pipeline_enum = Pipeline.MEDIA_PIPELINE + + for source in pipeline_config[CONF_SOURCES]: + src = await cg.get_variable(source) + cg.add(var.add_media_source(pipeline_enum, src)) + + cg.add( + var.set_speaker( + pipeline_enum, + await cg.get_variable(pipeline_config[CONF_SPEAKER]), + ) + ) + if pipeline_config[CONF_FORMAT] != "NONE": + cg.add( + var.set_format( + pipeline_enum, + _get_supported_format_struct(pipeline_config), + ) + ) + + if on_mute := 
config.get(CONF_ON_MUTE): + await automation.build_automation( + var.get_mute_trigger(), + [], + on_mute, + ) + if on_unmute := config.get(CONF_ON_UNMUTE): + await automation.build_automation( + var.get_unmute_trigger(), + [], + on_unmute, + ) + if on_volume := config.get(CONF_ON_VOLUME): + await automation.build_automation( + var.get_volume_trigger(), + [(cg.float_, "x")], + on_volume, + ) diff --git a/esphome/components/speaker_source/speaker_source_media_player.cpp b/esphome/components/speaker_source/speaker_source_media_player.cpp new file mode 100644 index 0000000000..a3679891d2 --- /dev/null +++ b/esphome/components/speaker_source/speaker_source_media_player.cpp @@ -0,0 +1,546 @@ +#include "speaker_source_media_player.h" + +#ifdef USE_ESP32 + +#include "esphome/core/helpers.h" +#include "esphome/core/log.h" + +namespace esphome::speaker_source { + +static constexpr uint32_t MEDIA_CONTROLS_QUEUE_LENGTH = 20; + +static const char *const TAG = "speaker_source_media_player"; + +// SourceBinding method implementations (defined here because SpeakerSourceMediaPlayer is forward-declared in the +// header) + +// THREAD CONTEXT: Called from media source decode task thread +size_t SourceBinding::write_audio(const uint8_t *data, size_t length, uint32_t timeout_ms, + const audio::AudioStreamInfo &stream_info) { + return this->player->handle_media_output_(this->pipeline, this->source, data, length, timeout_ms, stream_info); +} + +// THREAD CONTEXT: Called from main loop (media source's loop() calls set_state_ which calls report_state) +void SourceBinding::report_state(media_source::MediaSourceState state) { + this->player->handle_media_state_changed_(this->pipeline, this->source, state); +} + +// THREAD CONTEXT: Called from media source task thread; uses defer() to marshal to main loop +void SourceBinding::request_volume(float volume) { + this->player->defer([this, volume]() { this->player->handle_volume_request_(volume); }); +} + +// THREAD CONTEXT: Called from media 
source task thread; uses defer() to marshal to main loop +void SourceBinding::request_mute(bool is_muted) { + this->player->defer([this, is_muted]() { this->player->handle_mute_request_(is_muted); }); +} + +// THREAD CONTEXT: Called from media source task thread; uses defer() to marshal to main loop +void SourceBinding::request_play_uri(const std::string &uri) { + this->player->defer([this, uri]() { this->player->handle_play_uri_request_(this->pipeline, uri); }); +} + +// THREAD CONTEXT: Called during code generation setup (main loop) +void SpeakerSourceMediaPlayer::add_media_source(uint8_t pipeline, media_source::MediaSource *media_source) { + auto &binding = + this->pipelines_[pipeline].sources.emplace_back(std::make_unique(this, media_source, pipeline)); + media_source->set_listener(binding.get()); +} + +void SpeakerSourceMediaPlayer::dump_config() { + ESP_LOGCONFIG(TAG, + "Speaker Source Media Player:\n" + " Volume Increment: %.2f\n" + " Volume Min: %.2f\n" + " Volume Max: %.2f", + this->volume_increment_, this->volume_min_, this->volume_max_); +} + +void SpeakerSourceMediaPlayer::setup() { + this->state = media_player::MEDIA_PLAYER_STATE_IDLE; + + this->media_control_command_queue_ = xQueueCreate(MEDIA_CONTROLS_QUEUE_LENGTH, sizeof(MediaPlayerControlCommand)); + + this->pref_ = this->make_entity_preference(); + + VolumeRestoreState volume_restore_state; + if (this->pref_.load(&volume_restore_state)) { + this->set_volume_(volume_restore_state.volume); + this->set_mute_state_(volume_restore_state.is_muted); + } else { + this->set_volume_(this->volume_initial_); + this->set_mute_state_(false); + } + + // Register callbacks to receive playback notifications from speakers + for (size_t i = 0; i < this->pipelines_.size(); i++) { + if (this->pipelines_[i].is_configured()) { + this->pipelines_[i].speaker->add_audio_output_callback([this, i](uint32_t frames, int64_t timestamp) { + this->handle_speaker_playback_callback_(frames, timestamp, i); + }); + } + } +} + +// 
THREAD CONTEXT: Called from the speaker's playback callback task (not main loop) +void SpeakerSourceMediaPlayer::handle_speaker_playback_callback_(uint32_t frames, int64_t timestamp, uint8_t pipeline) { + PipelineContext &ps = this->pipelines_[pipeline]; + + // Load once so the null check and use below are consistent + media_source::MediaSource *active_source = ps.active_source.load(std::memory_order_relaxed); + if (active_source == nullptr) { + return; + } + + // CAS loop to safely subtract frames without underflow. If pending_frames is reset to 0 (new source + // starting) between the load and the subtract, compare_exchange_weak will fail and reload the current value. + uint32_t current = ps.pending_frames.load(std::memory_order_relaxed); + uint32_t source_frames; + do { + source_frames = std::min(frames, current); + } while (source_frames > 0 && + !ps.pending_frames.compare_exchange_weak(current, current - source_frames, std::memory_order_relaxed)); + + if (source_frames > 0) { + // Notify the source about the played audio + active_source->notify_audio_played(source_frames, timestamp); + } +} + +// THREAD CONTEXT: Called from main loop via defer() +void SpeakerSourceMediaPlayer::handle_volume_request_(float volume) { + // Update the media player's volume + this->set_volume_(volume); + this->publish_state(); +} + +// THREAD CONTEXT: Called from main loop via defer() +void SpeakerSourceMediaPlayer::handle_mute_request_(bool is_muted) { + // Update the media player's mute state + this->set_mute_state_(is_muted); + this->publish_state(); +} + +// THREAD CONTEXT: Called from main loop via defer() +void SpeakerSourceMediaPlayer::handle_play_uri_request_(uint8_t pipeline, const std::string &uri) { + // Smart source is requesting the player to play a different URI + auto call = this->make_call(); + call.set_media_url(uri); + call.perform(); +} + +// THREAD CONTEXT: Called from main loop (media source's loop() calls set_state_ which calls report_state) +void 
SpeakerSourceMediaPlayer::handle_media_state_changed_(uint8_t pipeline, media_source::MediaSource *source, + media_source::MediaSourceState state) { + PipelineContext &ps = this->pipelines_[pipeline]; + + if (state == media_source::MediaSourceState::IDLE) { + // Source went idle - clear stopping flag if this was the source we asked to stop + if (ps.stopping_source == source) { + ps.stopping_source = nullptr; + } + + // Clear pending flag if this was the source we asked to play + if (ps.pending_source == source) { + ps.pending_source = nullptr; + } + + // Source went idle - clear it if it's the active source + if (ps.active_source == source) { + ps.last_source = ps.active_source; + ps.active_source = nullptr; + + // Finish the speaker to ensure it's ready for the next playback + ps.speaker->finish(); + } + } else if (state == media_source::MediaSourceState::PLAYING) { + // Source started playing - make it the active source if no one else is active + if (ps.active_source == nullptr) { + ps.active_source = source; + ps.last_source = nullptr; + + // Clear pending flag now that the source is active + if (ps.pending_source == source) { + ps.pending_source = nullptr; + } + } + } +} + +// THREAD CONTEXT: Called from media source decode task thread (not main loop). +// Reads ps.active_source (atomic), writes ps.pending_frames (atomic), and calls +// ps.speaker methods (speaker pointer is immutable after setup). +size_t SpeakerSourceMediaPlayer::handle_media_output_(uint8_t pipeline, media_source::MediaSource *source, + const uint8_t *data, size_t length, uint32_t timeout_ms, + const audio::AudioStreamInfo &stream_info) { + PipelineContext &ps = this->pipelines_[pipeline]; + + // Single read; the if-body only uses ps.speaker (immutable after setup) and the source parameter. 
+ if (ps.active_source == source) { + // This source is active - play the audio + if (ps.speaker->get_audio_stream_info() != stream_info) { + // Setup the speaker to play this stream + ps.speaker->set_audio_stream_info(stream_info); + vTaskDelay(pdMS_TO_TICKS(timeout_ms)); + return 0; + } + size_t bytes_written = ps.speaker->play(data, length, pdMS_TO_TICKS(timeout_ms)); + if (bytes_written > 0) { + // Track frames sent to speaker for this source + ps.pending_frames.fetch_add(stream_info.bytes_to_frames(bytes_written), std::memory_order_relaxed); + } + return bytes_written; + } + + // Not the active source - wait for state callback to set us as active when we transition to PLAYING + vTaskDelay(pdMS_TO_TICKS(timeout_ms)); + return 0; +} + +media_player::MediaPlayerState SpeakerSourceMediaPlayer::get_media_pipeline_state_( + media_source::MediaSource *source) const { + if (source != nullptr) { + switch (source->get_state()) { + case media_source::MediaSourceState::PLAYING: + return media_player::MEDIA_PLAYER_STATE_PLAYING; + case media_source::MediaSourceState::PAUSED: + return media_player::MEDIA_PLAYER_STATE_PAUSED; + case media_source::MediaSourceState::ERROR: + ESP_LOGE(TAG, "Source error"); + return media_player::MEDIA_PLAYER_STATE_IDLE; + case media_source::MediaSourceState::IDLE: + default: + return media_player::MEDIA_PLAYER_STATE_IDLE; + } + } + + return media_player::MEDIA_PLAYER_STATE_IDLE; +} + +void SpeakerSourceMediaPlayer::loop() { + // Process queued control commands + MediaPlayerControlCommand control_command; + + // Use peek to check command without removing it + if (xQueuePeek(this->media_control_command_queue_, &control_command, 0) == pdTRUE) { + bool command_executed = false; + uint8_t pipeline = control_command.pipeline; + + switch (control_command.type) { + case MediaPlayerControlCommand::PLAY_URI: { + command_executed = this->try_execute_play_uri_(*control_command.data.uri, pipeline); + break; + } + + case 
MediaPlayerControlCommand::SEND_COMMAND: { + PipelineContext &ps = this->pipelines_[pipeline]; + + // Determine target source: prefer active, fall back to last + media_source::MediaSource *target_source = nullptr; + if (ps.active_source != nullptr) { + target_source = ps.active_source; + } else if (ps.last_source != nullptr) { + target_source = ps.last_source; + } + + media_player::MediaPlayerCommand player_command = control_command.data.command; + switch (player_command) { + case media_player::MEDIA_PLAYER_COMMAND_TOGGLE: { + media_source::MediaSource *active_source = ps.active_source; + if ((active_source != nullptr) && (active_source->get_state() == media_source::MediaSourceState::PLAYING)) { + if (target_source != nullptr) { + target_source->handle_command(media_source::MediaSourceCommand::PAUSE); + } + } else { + if (target_source != nullptr) { + target_source->handle_command(media_source::MediaSourceCommand::PLAY); + } + } + break; + } + + case media_player::MEDIA_PLAYER_COMMAND_PLAY: { + if (target_source != nullptr) { + target_source->handle_command(media_source::MediaSourceCommand::PLAY); + } + break; + } + + case media_player::MEDIA_PLAYER_COMMAND_PAUSE: { + if (target_source != nullptr) { + target_source->handle_command(media_source::MediaSourceCommand::PAUSE); + } + break; + } + + case media_player::MEDIA_PLAYER_COMMAND_STOP: { + if (target_source != nullptr) { + target_source->handle_command(media_source::MediaSourceCommand::STOP); + } + break; + } + + default: + break; + } + + command_executed = true; + break; + } + } + + // Only remove from queue if successfully executed + if (command_executed) { + xQueueReceive(this->media_control_command_queue_, &control_command, 0); + + // Delete the allocated string for PLAY_URI commands + if (control_command.type == MediaPlayerControlCommand::PLAY_URI) { + delete control_command.data.uri; + } + } + } + + // Update state based on active sources + media_player::MediaPlayerState old_state = this->state; + + 
PipelineContext &media_ps = this->pipelines_[MEDIA_PIPELINE]; + this->state = this->get_media_pipeline_state_(media_ps.active_source); + + if (this->state != old_state) { + this->publish_state(); + ESP_LOGD(TAG, "State changed to %s", media_player::media_player_state_to_string(this->state)); + } +} + +media_source::MediaSource *SpeakerSourceMediaPlayer::find_source_for_uri_(const std::string &uri, uint8_t pipeline) { + PipelineContext &ps = this->pipelines_[pipeline]; + media_source::MediaSource *first_match = nullptr; + for (auto &binding : ps.sources) { + if (binding->source->can_handle(uri)) { + // Prefer an idle source; otherwise remember the first match (will be stopped by try_execute_play_uri_) + if (binding->source->get_state() == media_source::MediaSourceState::IDLE) { + return binding->source; + } + if (first_match == nullptr) { + first_match = binding->source; + } + } + } + return first_match; +} + +bool SpeakerSourceMediaPlayer::try_execute_play_uri_(const std::string &uri, uint8_t pipeline) { + // Find target source + media_source::MediaSource *target_source = this->find_source_for_uri_(uri, pipeline); + if (target_source == nullptr) { + ESP_LOGW(TAG, "No source for URI"); + ESP_LOGV(TAG, "URI: %s", uri.c_str()); + return true; // Remove from queue (unrecoverable) + } + + PipelineContext &ps = this->pipelines_[pipeline]; + + media_source::MediaSource *active_source = ps.active_source; + + // If active source exists and is not IDLE, stop it and wait + if (active_source != nullptr) { + media_source::MediaSourceState active_state = active_source->get_state(); + if (active_state != media_source::MediaSourceState::IDLE) { + // Only send STOP command once per source - check if we've already asked this source to stop + if (ps.stopping_source != active_source) { + ESP_LOGV(TAG, "Pipeline %u: stopping active source", pipeline); + active_source->handle_command(media_source::MediaSourceCommand::STOP); + ps.speaker->stop(); + ps.stopping_source = active_source; + } 
+ return false; // Leave in queue, retry next loop + } + } + + // Also check target source directly - handles case where source errored before PLAYING state + media_source::MediaSourceState target_state = target_source->get_state(); + if (target_state != media_source::MediaSourceState::IDLE) { + // Only send STOP command once per source + if (ps.stopping_source != target_source) { + ESP_LOGV(TAG, "Pipeline %u: target source busy, stopping", pipeline); + target_source->handle_command(media_source::MediaSourceCommand::STOP); + ps.speaker->stop(); + ps.stopping_source = target_source; + } + return false; // Leave in queue, retry next loop + } + + // Clear stopping flag since we're past the stopping phase + ps.stopping_source = nullptr; + + // Check if speaker is ready + if (!ps.speaker->is_stopped()) { + return false; // Speaker not ready yet, retry later + } + + // Set pending source so handle_media_state_changed_ can recognize it when the source transitions to PLAYING + ps.pending_source = target_source; + + // Speaker is ready, try to play + if (!target_source->play_uri(uri)) { + ESP_LOGE(TAG, "Pipeline %u: Failed to play URI: %s", pipeline, uri.c_str()); + ps.pending_source = nullptr; + } + + // Reset pending frame counter for this pipeline since we're starting a new source + ps.pending_frames.store(0, std::memory_order_relaxed); + + return true; // Remove from queue +} + +// THREAD CONTEXT: Called from main loop only. 
Entry points: +// - HA/automation commands (direct) +// - handle_play_uri_request_() via make_call().perform() (deferred from source tasks) +void SpeakerSourceMediaPlayer::control(const media_player::MediaPlayerCall &call) { + if (!this->is_ready()) { + return; + } + + MediaPlayerControlCommand control_command; + control_command.pipeline = MEDIA_PIPELINE; + + auto media_url = call.get_media_url(); + if (media_url.has_value()) { + control_command.type = MediaPlayerControlCommand::PLAY_URI; + // Heap allocation is unavoidable: URIs from Home Assistant are arbitrary-length (media URLs with tokens + // can easily exceed 500 bytes). Deleted after the command is consumed. FreeRTOS queues require items to be + // copyable, so we store a pointer to the string in the queue rather than the string itself. + control_command.data.uri = new std::string(media_url.value()); + if (xQueueSend(this->media_control_command_queue_, &control_command, 0) != pdTRUE) { + delete control_command.data.uri; + ESP_LOGE(TAG, "Queue full, URI dropped"); + } + return; + } + + auto volume = call.get_volume(); + if (volume.has_value()) { + this->set_volume_(volume.value()); + this->publish_state(); + return; + } + + auto cmd = call.get_command(); + if (cmd.has_value()) { + switch (cmd.value()) { + case media_player::MEDIA_PLAYER_COMMAND_MUTE: + this->set_mute_state_(true); + break; + case media_player::MEDIA_PLAYER_COMMAND_UNMUTE: + this->set_mute_state_(false); + break; + case media_player::MEDIA_PLAYER_COMMAND_VOLUME_UP: + this->set_volume_(std::min(1.0f, this->volume + this->volume_increment_)); + break; + case media_player::MEDIA_PLAYER_COMMAND_VOLUME_DOWN: + this->set_volume_(std::max(0.0f, this->volume - this->volume_increment_)); + break; + default: + // Queue command for processing in loop() + control_command.type = MediaPlayerControlCommand::SEND_COMMAND; + control_command.data.command = cmd.value(); + if (xQueueSend(this->media_control_command_queue_, &control_command, 0) != pdTRUE) { + 
ESP_LOGE(TAG, "Queue full, command dropped"); + } + return; + } + this->publish_state(); + } +} + +media_player::MediaPlayerTraits SpeakerSourceMediaPlayer::get_traits() { + auto traits = media_player::MediaPlayerTraits(); + traits.set_supports_pause(true); + + for (const auto &ps : this->pipelines_) { + if (ps.format.has_value()) { + traits.get_supported_formats().push_back(ps.format.value()); + } + } + + return traits; +} + +void SpeakerSourceMediaPlayer::save_volume_restore_state_() { + VolumeRestoreState volume_restore_state; + volume_restore_state.volume = this->volume; + volume_restore_state.is_muted = this->is_muted_; + this->pref_.save(&volume_restore_state); +} + +void SpeakerSourceMediaPlayer::set_mute_state_(bool mute_state, bool publish) { + if (this->is_muted_ == mute_state) { + return; + } + + for (auto &ps : this->pipelines_) { + if (ps.is_configured()) { + ps.speaker->set_mute_state(mute_state); + } + } + + this->is_muted_ = mute_state; + + if (publish) { + this->save_volume_restore_state_(); + } + + // Notify all media sources about the mute state change + for (auto &ps : this->pipelines_) { + for (auto &binding : ps.sources) { + binding->source->notify_mute_changed(mute_state); + } + } + + if (mute_state) { + this->defer([this]() { this->mute_trigger_.trigger(); }); + } else { + this->defer([this]() { this->unmute_trigger_.trigger(); }); + } +} + +void SpeakerSourceMediaPlayer::set_volume_(float volume, bool publish) { + // Remap the volume to fit within the configured limits + float bounded_volume = remap(volume, 0.0f, 1.0f, this->volume_min_, this->volume_max_); + + for (auto &ps : this->pipelines_) { + if (ps.is_configured()) { + ps.speaker->set_volume(bounded_volume); + } + } + + if (publish) { + this->volume = volume; + } + + // Notify all media sources about the volume change + for (auto &ps : this->pipelines_) { + for (auto &binding : ps.sources) { + binding->source->notify_volume_changed(volume); + } + } + + // Turn on the mute state if 
the volume is effectively zero, off otherwise. + // Pass publish=false to avoid saving twice. + if (volume < 0.001) { + this->set_mute_state_(true, false); + } else { + this->set_mute_state_(false, false); + } + + // Save after mute mutation so the restored state has the correct is_muted_ value + if (publish) { + this->save_volume_restore_state_(); + } + + this->defer([this, volume]() { this->volume_trigger_.trigger(volume); }); +} + +} // namespace esphome::speaker_source + +#endif // USE_ESP32 diff --git a/esphome/components/speaker_source/speaker_source_media_player.h b/esphome/components/speaker_source/speaker_source_media_player.h new file mode 100644 index 0000000000..7896fef295 --- /dev/null +++ b/esphome/components/speaker_source/speaker_source_media_player.h @@ -0,0 +1,217 @@ +#pragma once + +#include "esphome/core/defines.h" + +#ifdef USE_ESP32 + +#include "esphome/components/audio/audio.h" +#include "esphome/components/media_source/media_source.h" +#include "esphome/components/media_player/media_player.h" +#include "esphome/components/speaker/speaker.h" + +#include "esphome/core/automation.h" +#include "esphome/core/component.h" +#include "esphome/core/preferences.h" + +#include +#include +#include +#include +#include +#include + +namespace esphome::speaker_source { + +// THREADING MODEL: +// This component coordinates media sources that run their own decode tasks with speakers +// that have their own playback callback tasks. Three thread contexts exist: +// +// - Main loop task: setup(), loop(), dump_config(), handle_media_state_changed_(), +// handle_volume_request_(), handle_mute_request_(), handle_play_uri_request_(), +// set_volume_(), set_mute_state_(), control(), get_media_pipeline_state_(), +// find_source_for_uri_(), try_execute_play_uri_(), save_volume_restore_state_() +// +// - Media source task(s): handle_media_output_() via SourceBinding::write_audio(). +// Called from each source's decode task thread when streaming audio data. 
+// Reads ps.active_source (atomic), writes ps.pending_frames (atomic), and calls +// ps.speaker methods (speaker pointer is immutable after setup). +// +// - Speaker callback task: handle_speaker_playback_callback_() via speaker's +// add_audio_output_callback(). Called when the speaker finishes writing frames to the DAC. +// Reads ps.active_source (atomic), writes ps.pending_frames (atomic), and calls +// active_source->notify_audio_played(). +// +// control() is only called from the main loop (HA/automation commands). +// Source tasks use defer() for all requests (volume, mute, play_uri). +// +// Thread-safe communication: +// - FreeRTOS queue (media_control_command_queue_): control() -> loop() for play/command dispatch +// - defer(): SourceBinding::request_volume/request_mute/request_play_uri -> main loop +// - Atomic fields (active_source, pending_frames): shared between all three thread contexts +// +// Non-atomic pipeline fields (last_source, stopping_source, pending_source) are only accessed +// from the main loop thread. + +enum Pipeline : uint8_t { + MEDIA_PIPELINE = 0, +}; + +// Forward declaration +class SpeakerSourceMediaPlayer; + +/// @brief Per-source listener binding that captures the source pointer at registration time. +/// Each binding implements MediaSourceListener and forwards callbacks to the player with the source identified. +/// Defined before PipelineContext so pipelines can own their bindings directly. 
+struct SourceBinding : public media_source::MediaSourceListener { + SourceBinding(SpeakerSourceMediaPlayer *player, media_source::MediaSource *source, uint8_t pipeline) + : player(player), source(source), pipeline(pipeline) {} + SpeakerSourceMediaPlayer *player; + media_source::MediaSource *source; + uint8_t pipeline; + + // Implementations are in the .cpp file because SpeakerSourceMediaPlayer is only forward-declared here + size_t write_audio(const uint8_t *data, size_t length, uint32_t timeout_ms, + const audio::AudioStreamInfo &stream_info) override; + void report_state(media_source::MediaSourceState state) override; + void request_volume(float volume) override; + void request_mute(bool is_muted) override; + void request_play_uri(const std::string &uri) override; +}; + +struct PipelineContext { + speaker::Speaker *speaker{nullptr}; + optional format; + + std::atomic active_source{nullptr}; + media_source::MediaSource *last_source{nullptr}; + media_source::MediaSource *stopping_source{nullptr}; // Source we've asked to stop, awaiting IDLE + media_source::MediaSource *pending_source{nullptr}; // Source we've asked to play, awaiting PLAYING + + // Each SourceBinding pairs a MediaSource* with its listener implementation. + // Uses unique_ptr so binding addresses are stable and set_listener() can be called in add_media_source(). + // Uses std::vector because the count varies across instances (multiple speaker_source media players may exist). + std::vector> sources; + + // Track frames sent to speaker to correlate with playback callbacks. + // Atomic because it is written from the main loop/source tasks and read/decremented from the speaker playback + // callback. 
+ std::atomic pending_frames{0}; + + /// @brief Check if this pipeline is configured (has a speaker assigned) + bool is_configured() const { return this->speaker != nullptr; } +}; + +struct MediaPlayerControlCommand { + enum Type : uint8_t { + PLAY_URI, // Find a source that can handle this URI and play it + SEND_COMMAND, // Send command to active source + }; + Type type; + uint8_t pipeline; + + union { + std::string *uri; // Owned pointer, must delete after xQueueReceive (for PLAY_URI) + media_player::MediaPlayerCommand command; + } data; +}; + +struct VolumeRestoreState { + float volume; + bool is_muted; +}; + +class SpeakerSourceMediaPlayer : public Component, public media_player::MediaPlayer { + friend struct SourceBinding; + + public: + float get_setup_priority() const override { return esphome::setup_priority::PROCESSOR; } + void setup() override; + void loop() override; + void dump_config() override; + + // MediaPlayer implementations + media_player::MediaPlayerTraits get_traits() override; + bool is_muted() const override { return this->is_muted_; } + + // Percentage to increase or decrease the volume for volume up or volume down commands + void set_volume_increment(float volume_increment) { this->volume_increment_ = volume_increment; } + + // Volume used initially on first boot when no volume had been previously saved + void set_volume_initial(float volume_initial) { this->volume_initial_ = volume_initial; } + + void set_volume_max(float volume_max) { this->volume_max_ = volume_max; } + void set_volume_min(float volume_min) { this->volume_min_ = volume_min; } + + /// @brief Adds a media source to a pipeline and registers this player as its listener + void add_media_source(uint8_t pipeline, media_source::MediaSource *media_source); + + void set_speaker(uint8_t pipeline, speaker::Speaker *speaker) { this->pipelines_[pipeline].speaker = speaker; } + void set_format(uint8_t pipeline, const media_player::MediaPlayerSupportedFormat &format) { + 
this->pipelines_[pipeline].format = format; + } + + Trigger<> *get_mute_trigger() { return &this->mute_trigger_; } + Trigger<> *get_unmute_trigger() { return &this->unmute_trigger_; } + Trigger *get_volume_trigger() { return &this->volume_trigger_; } + + protected: + // Callbacks from source bindings (pipeline index is captured at binding creation time) + size_t handle_media_output_(uint8_t pipeline, media_source::MediaSource *source, const uint8_t *data, size_t length, + uint32_t timeout_ms, const audio::AudioStreamInfo &stream_info); + void handle_media_state_changed_(uint8_t pipeline, media_source::MediaSource *source, + media_source::MediaSourceState state); + void handle_volume_request_(float volume); + void handle_mute_request_(bool is_muted); + void handle_play_uri_request_(uint8_t pipeline, const std::string &uri); + + void handle_speaker_playback_callback_(uint32_t frames, int64_t timestamp, uint8_t pipeline); + + // Receives commands from HA or from the voice assistant component + // Sends commands to the media_control_command_queue_ + void control(const media_player::MediaPlayerCall &call) override; + + /// @brief Updates this->volume and saves volume/mute state to flash for restoration if publish is true. + void set_volume_(float volume, bool publish = true); + + /// @brief Sets the mute state. + /// @param mute_state If true, audio will be muted. If false, audio will be unmuted + /// @param publish If true, saves volume/mute state to flash for restoration + void set_mute_state_(bool mute_state, bool publish = true); + + /// @brief Saves the current volume and mute state to the flash for restoration. 
+ void save_volume_restore_state_(); + + /// @brief Determine media player state from the media pipeline's active source + /// @param media_source Active source for the media pipeline (may be nullptr) + /// @return The appropriate MediaPlayerState + media_player::MediaPlayerState get_media_pipeline_state_(media_source::MediaSource *media_source) const; + + bool try_execute_play_uri_(const std::string &uri, uint8_t pipeline); + media_source::MediaSource *find_source_for_uri_(const std::string &uri, uint8_t pipeline); + QueueHandle_t media_control_command_queue_; + + // Pipeline context for media pipeline. See THREADING MODEL at top of namespace for access rules. + std::array pipelines_; + + // Used to save volume/mute state for restoration on reboot + ESPPreferenceObject pref_; + + Trigger<> mute_trigger_; + Trigger<> unmute_trigger_; + Trigger volume_trigger_; + + // The amount to change the volume on volume up/down commands + float volume_increment_; + + // The initial volume used by Setup when no previous volume was saved + float volume_initial_; + + float volume_max_; + float volume_min_; + + bool is_muted_{false}; +}; + +} // namespace esphome::speaker_source + +#endif // USE_ESP32 diff --git a/tests/components/speaker_source/common.yaml b/tests/components/speaker_source/common.yaml new file mode 100644 index 0000000000..cfcb065f57 --- /dev/null +++ b/tests/components/speaker_source/common.yaml @@ -0,0 +1,43 @@ +i2s_audio: + i2s_lrclk_pin: ${i2s_bclk_pin} + i2s_bclk_pin: ${i2s_lrclk_pin} + i2s_mclk_pin: ${i2s_mclk_pin} + +speaker: + - platform: i2s_audio + id: speaker_id + dac_type: external + i2s_dout_pin: ${i2s_dout_pin} + sample_rate: 48000 + num_channels: 2 + +audio_file: + - id: test_audio + file: + type: local + path: $component_dir/test.wav + +media_source: + - platform: audio_file + id: audio_file_source + +media_player: + - platform: speaker_source + id: media_player_id + name: Media Player + volume_increment: 0.02 + volume_initial: 0.75 + volume_max: 
0.95 + volume_min: 0.0 + media_pipeline: + speaker: speaker_id + format: FLAC + num_channels: 1 + sources: + - audio_file_source + on_mute: + - media_player.pause: + id: media_player_id + on_unmute: + - media_player.play: + id: media_player_id diff --git a/tests/components/speaker_source/test.esp32-idf.yaml b/tests/components/speaker_source/test.esp32-idf.yaml new file mode 100644 index 0000000000..e2439ebdf2 --- /dev/null +++ b/tests/components/speaker_source/test.esp32-idf.yaml @@ -0,0 +1,9 @@ +substitutions: + scl_pin: GPIO16 + sda_pin: GPIO17 + i2s_bclk_pin: GPIO27 + i2s_lrclk_pin: GPIO26 + i2s_mclk_pin: GPIO25 + i2s_dout_pin: GPIO23 + +<<: !include common.yaml diff --git a/tests/components/speaker_source/test.wav b/tests/components/speaker_source/test.wav new file mode 100644 index 0000000000000000000000000000000000000000..f9d07ef2238eb2fcb355055466d3789ee1a1fe0b GIT binary patch literal 46 vcmWIYbaPW