[speaker_source] Add new media player (#14649)

Co-authored-by: J. Nick Koston <nick@home-assistant.io>
2026-03-23 22:37:31 +08:00 · 2026-03-10 15:25:26 -05:00
parent 780e009bf4
commit 8ca6ee4349
8 changed files with 1028 additions and 0 deletions
--- a/1
+++ b/1
@@ -459,6 +459,7 @@ esphome/components/sonoff_d1/* @anatoly-savchenkov
 esphome/components/sound_level/* @kahrendt
 esphome/components/speaker/* @jesserockz @kahrendt
 esphome/components/speaker/media_player/* @kahrendt @synesthesiam
+esphome/components/speaker_source/* @kahrendt
 esphome/components/spi/* @clydebarrow @esphome/core
 esphome/components/spi_device/* @clydebarrow
 esphome/components/spi_led_strip/* @clydebarrow
--- a/esphome/components/speaker_source/init.py
+++ b/esphome/components/speaker_source/init.py
--- a/esphome/components/speaker_source/media_player.py
+++ b/esphome/components/speaker_source/media_player.py
@@ -0,0 +1,212 @@
+from esphome import automation
+import esphome.codegen as cg
+from esphome.components import audio, media_player, media_source, speaker
+import esphome.config_validation as cv
+from esphome.const import (
+    CONF_FORMAT,
+    CONF_ID,
+    CONF_NUM_CHANNELS,
+    CONF_SAMPLE_RATE,
+    CONF_SPEAKER,
+)
+from esphome.core.entity_helpers import inherit_property_from
+from esphome.types import ConfigType
+
+AUTO_LOAD = ["audio"]  # the audio module supplies the format enum and final-validation helpers used below
+DEPENDENCIES = ["media_source", "speaker"]  # the pipeline schema references IDs of both component types
+
+CODEOWNERS = ["@kahrendt"]
+
+CONF_MEDIA_PIPELINE = "media_pipeline"  # required block validated by PIPELINE_SCHEMA
+CONF_ON_MUTE = "on_mute"  # automation built on the player's mute trigger
+CONF_ON_UNMUTE = "on_unmute"  # automation built on the player's unmute trigger
+CONF_ON_VOLUME = "on_volume"  # automation built on the volume trigger; receives a float named "x"
+CONF_SOURCES = "sources"  # non-empty list of media_source IDs inside the pipeline block
+CONF_VOLUME_INCREMENT = "volume_increment"  # percentage, default 0.05
+CONF_VOLUME_INITIAL = "volume_initial"  # percentage, default 0.5
+CONF_VOLUME_MAX = "volume_max"  # percentage, default 1.0
+CONF_VOLUME_MIN = "volume_min"  # percentage, default 0.0; must not exceed volume_max
+
+speaker_source_ns = cg.esphome_ns.namespace("speaker_source")  # codegen handle for the C++ speaker_source namespace
+
+SpeakerSourceMediaPlayer = speaker_source_ns.class_(  # class handle passed to media_player_schema() in CONFIG_SCHEMA
+    "SpeakerSourceMediaPlayer", cg.Component, media_player.MediaPlayer
+)
+
+PipelineContext = speaker_source_ns.struct("PipelineContext")  # ID type declared by PIPELINE_SCHEMA
+
+Pipeline = speaker_source_ns.enum("Pipeline")  # to_code() passes Pipeline.MEDIA_PIPELINE to the C++ setters
+
+
+FORMAT_MAPPING = {  # configured format name -> string written to the "format" field of the supported-format struct
+    "FLAC": "flac",
+    "MP3": "mp3",
+    "OPUS": "opus",
+    "WAV": "wav",
+}  # there is no "NONE" entry: to_code() skips set_format() for that value
+
+
+# Returns a media_player.MediaPlayerSupportedFormat struct with the configured
+# format, sample rate, number of channels, purpose, and bytes per sample
+def _get_supported_format_struct(pipeline: ConfigType):
+    args = [
+        media_player.MediaPlayerSupportedFormat,
+    ]
+
+    args.append(("format", FORMAT_MAPPING[pipeline[CONF_FORMAT]]))
+
+    args.append(("sample_rate", pipeline[CONF_SAMPLE_RATE]))
+    args.append(("num_channels", pipeline[CONF_NUM_CHANNELS]))
+    args.append(("purpose", media_player.MEDIA_PLAYER_FORMAT_PURPOSE_ENUM["default"]))
+
+    # Omit sample_bytes for MP3: ffmpeg transcoding in Home Assistant fails
+    # if the number of bytes per sample is specified for MP3.
+    if pipeline[CONF_FORMAT] != "MP3":
+        args.append(("sample_bytes", 2))
+
+    return cg.StructInitializer(*args)
+
+
+def _validate_pipeline(config: ConfigType) -> ConfigType:
+    # Inherit settings from speaker if not manually set
+    inherit_property_from(CONF_NUM_CHANNELS, CONF_SPEAKER)(config)
+    inherit_property_from(CONF_SAMPLE_RATE, CONF_SPEAKER)(config)
+
+    # Opus only supports 48 kHz
+    if config.get(CONF_FORMAT) == "OPUS" and config.get(CONF_SAMPLE_RATE) != 48000:
+        raise cv.Invalid("Opus only supports a sample rate of 48000 Hz")
+
+    audio.final_validate_audio_schema(
+        "speaker_source media_player",
+        audio_device=CONF_SPEAKER,
+        bits_per_sample=16,
+        channels=config.get(CONF_NUM_CHANNELS),
+        sample_rate=config.get(CONF_SAMPLE_RATE),
+    )(config)
+
+    return config
+
+
+PIPELINE_SCHEMA = cv.Schema(  # shape of the media_pipeline block
+    {
+        cv.GenerateID(): cv.declare_id(
+            PipelineContext
+        ),  # Needed to inherit audio settings from the speaker
+        cv.Required(CONF_SPEAKER): cv.use_id(speaker.Speaker),  # speaker the pipeline outputs to
+        cv.Required(CONF_SOURCES): cv.All(  # one or more media_source IDs
+            cv.ensure_list(cv.use_id(media_source.MediaSource)),
+            cv.Length(min=1),
+        ),
+        cv.Optional(CONF_FORMAT, default="FLAC"): cv.enum(audio.AUDIO_FILE_TYPE_ENUM),
+        cv.Optional(CONF_SAMPLE_RATE): cv.int_range(min=1),  # if omitted, _validate_pipeline inherits it from the speaker
+        cv.Optional(CONF_NUM_CHANNELS): cv.int_range(1, 2),  # if omitted, _validate_pipeline inherits it from the speaker
+    }
+)
+
+
+def _validate_volume_settings(config: ConfigType) -> ConfigType:
+    # CONF_VOLUME_INITIAL is in the scaled volume domain (0.0-1.0) and doesn't need to be validated
+    if config[CONF_VOLUME_MIN] > config[CONF_VOLUME_MAX]:
+        raise cv.Invalid(
+            f"{CONF_VOLUME_MIN} ({config[CONF_VOLUME_MIN]}) must be less than or equal to {CONF_VOLUME_MAX} ({config[CONF_VOLUME_MAX]})"
+        )
+    return config
+
+
+CONFIG_SCHEMA = cv.All(  # validators run in order: schema, ESP32-only check, volume range check
+    cv.Schema(
+        {
+            cv.Optional(CONF_VOLUME_INCREMENT, default=0.05): cv.percentage,
+            cv.Optional(CONF_VOLUME_INITIAL, default=0.5): cv.percentage,
+            cv.Optional(CONF_VOLUME_MAX, default=1.0): cv.percentage,
+            cv.Optional(CONF_VOLUME_MIN, default=0.0): cv.percentage,
+            cv.Required(CONF_MEDIA_PIPELINE): PIPELINE_SCHEMA,
+            cv.Optional(CONF_ON_MUTE): automation.validate_automation(single=True),
+            cv.Optional(CONF_ON_UNMUTE): automation.validate_automation(single=True),
+            cv.Optional(CONF_ON_VOLUME): automation.validate_automation(single=True),
+        }
+    )
+    .extend(cv.COMPONENT_SCHEMA)
+    .extend(media_player.media_player_schema(SpeakerSourceMediaPlayer)),
+    cv.only_on_esp32,  # matches the USE_ESP32 guard around the C++ header
+    _validate_volume_settings,  # rejects volume_min greater than volume_max
+)
+
+
+def _final_validate_codecs(config: ConfigType) -> ConfigType:
+    pipeline = config[CONF_MEDIA_PIPELINE]
+    fmt = pipeline[CONF_FORMAT]
+    if fmt == "NONE":
+        audio.request_flac_support()
+        audio.request_mp3_support()
+        audio.request_opus_support()
+    elif fmt == "FLAC":
+        audio.request_flac_support()
+    elif fmt == "MP3":
+        audio.request_mp3_support()
+    elif fmt == "OPUS":
+        audio.request_opus_support()
+
+    return config
+
+
+FINAL_VALIDATE_SCHEMA = cv.All(  # runs the pipeline checks first, then the codec requests
+    cv.Schema(
+        {
+            cv.Required(CONF_MEDIA_PIPELINE): _validate_pipeline,  # inherits speaker settings and checks the Opus rate
+        },
+        extra=cv.ALLOW_EXTRA,  # every other key of the player config passes through untouched
+    ),
+    _final_validate_codecs,  # calls the audio request_*_support() hooks for the chosen format
+)
+
+
+async def to_code(config: ConfigType) -> None:
+    var = cg.new_Pvariable(config[CONF_ID])
+    await cg.register_component(var, config)
+    await media_player.register_media_player(var, config)
+
+    cg.add(var.set_volume_increment(config[CONF_VOLUME_INCREMENT]))
+    cg.add(var.set_volume_initial(config[CONF_VOLUME_INITIAL]))
+    cg.add(var.set_volume_max(config[CONF_VOLUME_MAX]))
+    cg.add(var.set_volume_min(config[CONF_VOLUME_MIN]))
+
+    pipeline_config = config[CONF_MEDIA_PIPELINE]
+    pipeline_enum = Pipeline.MEDIA_PIPELINE
+
+    for source in pipeline_config[CONF_SOURCES]:
+        src = await cg.get_variable(source)
+        cg.add(var.add_media_source(pipeline_enum, src))
+
+    cg.add(
+        var.set_speaker(
+            pipeline_enum,
+            await cg.get_variable(pipeline_config[CONF_SPEAKER]),
+        )
+    )
+    if pipeline_config[CONF_FORMAT] != "NONE":
+        cg.add(
+            var.set_format(
+                pipeline_enum,
+                _get_supported_format_struct(pipeline_config),
+            )
+        )
+
+    if on_mute := config.get(CONF_ON_MUTE):
+        await automation.build_automation(
+            var.get_mute_trigger(),
+            [],
+            on_mute,
+        )
+    if on_unmute := config.get(CONF_ON_UNMUTE):
+        await automation.build_automation(
+            var.get_unmute_trigger(),
+            [],
+            on_unmute,
+        )
+    if on_volume := config.get(CONF_ON_VOLUME):
+        await automation.build_automation(
+            var.get_volume_trigger(),
+            [(cg.float_, "x")],
+            on_volume,
+        )
--- a/esphome/components/speaker_source/speaker_source_media_player.cpp
+++ b/esphome/components/speaker_source/speaker_source_media_player.cpp
--- a/esphome/components/speaker_source/speaker_source_media_player.h
+++ b/esphome/components/speaker_source/speaker_source_media_player.h
@@ -0,0 +1,217 @@
+#pragma once
+
+#include "esphome/core/defines.h"
+
+#ifdef USE_ESP32
+
+#include "esphome/components/audio/audio.h"
+#include "esphome/components/media_source/media_source.h"
+#include "esphome/components/media_player/media_player.h"
+#include "esphome/components/speaker/speaker.h"
+
+#include "esphome/core/automation.h"
+#include "esphome/core/component.h"
+#include "esphome/core/preferences.h"
+
+#include <array>
+#include <atomic>
+#include <memory>
+#include <vector>
+#include <freertos/FreeRTOS.h>
+#include <freertos/queue.h>
+
+namespace esphome::speaker_source {
+
+// THREADING MODEL:
+// This component coordinates media sources that run their own decode tasks with speakers
+// that have their own playback callback tasks. Three thread contexts exist:
+//
+// - Main loop task: setup(), loop(), dump_config(), handle_media_state_changed_(),
+//   handle_volume_request_(), handle_mute_request_(), handle_play_uri_request_(),
+//   set_volume_(), set_mute_state_(), control(), get_media_pipeline_state_(),
+//   find_source_for_uri_(), try_execute_play_uri_(), save_volume_restore_state_()
+//
+// - Media source task(s): handle_media_output_() via SourceBinding::write_audio().
+//   Called from each source's decode task thread when streaming audio data.
+//   Reads ps.active_source (atomic), writes ps.pending_frames (atomic), and calls
+//   ps.speaker methods (speaker pointer is immutable after setup).
+//
+// - Speaker callback task: handle_speaker_playback_callback_() via speaker's
+//   add_audio_output_callback(). Called when the speaker finishes writing frames to the DAC.
+//   Reads ps.active_source (atomic), writes ps.pending_frames (atomic), and calls
+//   active_source->notify_audio_played().
+//
+// control() is only called from the main loop (HA/automation commands).
+// Source tasks use defer() for all requests (volume, mute, play_uri).
+//
+// Thread-safe communication:
+// - FreeRTOS queue (media_control_command_queue_): control() -> loop() for play/command dispatch
+// - defer(): SourceBinding::request_volume/request_mute/request_play_uri -> main loop
+// - Atomic fields (active_source, pending_frames): shared between all three thread contexts
+//
+// Non-atomic pipeline fields (last_source, stopping_source, pending_source) are only accessed
+// from the main loop thread.
+
+enum Pipeline : uint8_t {  // Values are used as indices into SpeakerSourceMediaPlayer::pipelines_
+  MEDIA_PIPELINE = 0,  // The only pipeline; pipelines_ holds exactly one PipelineContext
+};
+
+// Forward declaration
+class SpeakerSourceMediaPlayer;
+
+/// @brief Per-source listener binding that captures the source pointer at registration time.
+/// Each binding implements MediaSourceListener and forwards callbacks to the player with the source identified.
+/// Defined before PipelineContext so pipelines can own their bindings directly.
+struct SourceBinding : public media_source::MediaSourceListener {
+  SourceBinding(SpeakerSourceMediaPlayer *player, media_source::MediaSource *source, uint8_t pipeline)
+      : player(player), source(source), pipeline(pipeline) {}
+  SpeakerSourceMediaPlayer *player;   // Player that receives the forwarded callbacks
+  media_source::MediaSource *source;  // Source this binding was created for
+  uint8_t pipeline;                   // Pipeline index captured at binding creation time
+
+  // Implementations are in the .cpp file because SpeakerSourceMediaPlayer is only forward-declared here
+  size_t write_audio(const uint8_t *data, size_t length, uint32_t timeout_ms,
+                     const audio::AudioStreamInfo &stream_info) override;  // Reaches handle_media_output_() on the source's task
+  void report_state(media_source::MediaSourceState state) override;  // NOTE(review): presumably feeds handle_media_state_changed_() - confirm in .cpp
+  void request_volume(float volume) override;             // Handed to the main loop via defer()
+  void request_mute(bool is_muted) override;              // Handed to the main loop via defer()
+  void request_play_uri(const std::string &uri) override;  // Handed to the main loop via defer()
+};
+
+struct PipelineContext {
+  speaker::Speaker *speaker{nullptr};  // Assigned by set_speaker(); nullptr means the pipeline is not configured
+  optional<media_player::MediaPlayerSupportedFormat> format;  // Only populated when set_format() is called
+
+  std::atomic<media_source::MediaSource *> active_source{nullptr};  // Atomic: shared between all three thread contexts
+  media_source::MediaSource *last_source{nullptr};  // Main loop only, like the two pointers below
+  media_source::MediaSource *stopping_source{nullptr};  // Source we've asked to stop, awaiting IDLE
+  media_source::MediaSource *pending_source{nullptr};   // Source we've asked to play, awaiting PLAYING
+
+  // Each SourceBinding pairs a MediaSource* with its listener implementation.
+  // Uses unique_ptr so binding addresses are stable and set_listener() can be called in add_media_source().
+  // Uses std::vector because the count varies across instances (multiple speaker_source media players may exist).
+  std::vector<std::unique_ptr<SourceBinding>> sources;
+
+  // Track frames sent to speaker to correlate with playback callbacks.
+  // Atomic because it is written from the main loop/source tasks and read/decremented from the speaker playback
+  // callback.
+  std::atomic<uint32_t> pending_frames{0};
+
+  /// @brief Check if this pipeline is configured (has a speaker assigned)
+  bool is_configured() const { return this->speaker != nullptr; }
+};
+
+struct MediaPlayerControlCommand {  // Item carried by media_control_command_queue_ from control() to loop()
+  enum Type : uint8_t {
+    PLAY_URI,      // Find a source that can handle this URI and play it
+    SEND_COMMAND,  // Send command to active source
+  };
+  Type type;  // NOTE(review): presumably selects which member of data is valid - confirm in .cpp
+  uint8_t pipeline;  // Index of the pipeline the command applies to
+
+  union {
+    std::string *uri;  // Owned pointer, must delete after xQueueReceive (for PLAY_URI)
+    media_player::MediaPlayerCommand command;  // Presumably the payload for SEND_COMMAND
+  } data;
+};
+
+struct VolumeRestoreState {  // Volume/mute pair; NOTE(review): presumably the payload saved through pref_ - confirm in .cpp
+  float volume;
+  bool is_muted;
+};
+
+class SpeakerSourceMediaPlayer : public Component, public media_player::MediaPlayer {
+  friend struct SourceBinding;  // Bindings forward source callbacks into the protected handlers below
+
+ public:
+  float get_setup_priority() const override { return esphome::setup_priority::PROCESSOR; }
+  void setup() override;
+  void loop() override;
+  void dump_config() override;
+
+  // MediaPlayer implementations
+  media_player::MediaPlayerTraits get_traits() override;
+  bool is_muted() const override { return this->is_muted_; }
+
+  // Percentage to increase or decrease the volume for volume up or volume down commands
+  void set_volume_increment(float volume_increment) { this->volume_increment_ = volume_increment; }
+
+  // Volume used initially on first boot when no volume had been previously saved
+  void set_volume_initial(float volume_initial) { this->volume_initial_ = volume_initial; }
+
+  void set_volume_max(float volume_max) { this->volume_max_ = volume_max; }  // From the volume_max option
+  void set_volume_min(float volume_min) { this->volume_min_ = volume_min; }  // From the volume_min option
+
+  /// @brief Adds a media source to a pipeline and registers this player as its listener
+  void add_media_source(uint8_t pipeline, media_source::MediaSource *media_source);
+
+  void set_speaker(uint8_t pipeline, speaker::Speaker *speaker) { this->pipelines_[pipeline].speaker = speaker; }  // pipeline indexes pipelines_ unchecked
+  void set_format(uint8_t pipeline, const media_player::MediaPlayerSupportedFormat &format) {
+    this->pipelines_[pipeline].format = format;  // pipeline indexes pipelines_ unchecked
+  }
+
+  Trigger<> *get_mute_trigger() { return &this->mute_trigger_; }  // Used by codegen for on_mute
+  Trigger<> *get_unmute_trigger() { return &this->unmute_trigger_; }  // Used by codegen for on_unmute
+  Trigger<float> *get_volume_trigger() { return &this->volume_trigger_; }  // Used by codegen for on_volume
+
+ protected:
+  // Callbacks from source bindings (pipeline index is captured at binding creation time)
+  size_t handle_media_output_(uint8_t pipeline, media_source::MediaSource *source, const uint8_t *data, size_t length,
+                              uint32_t timeout_ms, const audio::AudioStreamInfo &stream_info);  // Runs on media source task(s)
+  void handle_media_state_changed_(uint8_t pipeline, media_source::MediaSource *source,
+                                   media_source::MediaSourceState state);  // Runs on the main loop task
+  void handle_volume_request_(float volume);
+  void handle_mute_request_(bool is_muted);
+  void handle_play_uri_request_(uint8_t pipeline, const std::string &uri);
+
+  void handle_speaker_playback_callback_(uint32_t frames, int64_t timestamp, uint8_t pipeline);  // Runs on the speaker callback task
+
+  // Receives commands from HA or from the voice assistant component
+  // Sends commands to the media_control_command_queue_
+  void control(const media_player::MediaPlayerCall &call) override;
+
+  /// @brief Updates this->volume and saves volume/mute state to flash for restoration if publish is true.
+  void set_volume_(float volume, bool publish = true);
+
+  /// @brief Sets the mute state.
+  /// @param mute_state If true, audio will be muted. If false, audio will be unmuted
+  /// @param publish If true, saves volume/mute state to flash for restoration
+  void set_mute_state_(bool mute_state, bool publish = true);
+
+  /// @brief Saves the current volume and mute state to the flash for restoration.
+  void save_volume_restore_state_();
+
+  /// @brief Determine media player state from the media pipeline's active source
+  /// @param media_source Active source for the media pipeline (may be nullptr)
+  /// @return The appropriate MediaPlayerState
+  media_player::MediaPlayerState get_media_pipeline_state_(media_source::MediaSource *media_source) const;
+
+  bool try_execute_play_uri_(const std::string &uri, uint8_t pipeline);
+  media_source::MediaSource *find_source_for_uri_(const std::string &uri, uint8_t pipeline);
+  QueueHandle_t media_control_command_queue_;  // control() -> loop() command queue; no in-class initializer
+
+  // Pipeline context for media pipeline. See THREADING MODEL at top of namespace for access rules.
+  std::array<PipelineContext, 1> pipelines_;
+
+  // Used to save volume/mute state for restoration on reboot
+  ESPPreferenceObject pref_;
+
+  Trigger<> mute_trigger_;
+  Trigger<> unmute_trigger_;
+  Trigger<float> volume_trigger_;
+
+  // The amount to change the volume on volume up/down commands
+  float volume_increment_;  // No in-class default; assigned through set_volume_increment()
+
+  // The initial volume used by Setup when no previous volume was saved
+  float volume_initial_;  // No in-class default; assigned through set_volume_initial()
+
+  float volume_max_;  // No in-class default; assigned through set_volume_max()
+  float volume_min_;  // No in-class default; assigned through set_volume_min()
+
+  bool is_muted_{false};
+};
+
+}  // namespace esphome::speaker_source
+
+#endif  // USE_ESP32
--- a/tests/components/speaker_source/common.yaml
+++ b/tests/components/speaker_source/common.yaml
@@ -0,0 +1,43 @@
+i2s_audio:  # each pin option takes the substitution of the same name from the per-target test file
+  i2s_lrclk_pin: ${i2s_lrclk_pin}
+  i2s_bclk_pin: ${i2s_bclk_pin}
+  i2s_mclk_pin: ${i2s_mclk_pin}
+
+speaker:
+  - platform: i2s_audio
+    id: speaker_id  # referenced by media_pipeline.speaker below
+    dac_type: external
+    i2s_dout_pin: ${i2s_dout_pin}
+    sample_rate: 48000  # media_pipeline sets no sample_rate, so this is the value it can inherit
+    num_channels: 2
+
+audio_file:
+  - id: test_audio
+    file:
+      type: local
+      path: $component_dir/test.wav  # test.wav is added next to this file by the same commit
+
+media_source:
+  - platform: audio_file
+    id: audio_file_source  # listed under media_pipeline.sources below
+
+media_player:  # covers the volume options, the pipeline block and on_mute/on_unmute; on_volume is not exercised
+  - platform: speaker_source
+    id: media_player_id
+    name: Media Player
+    volume_increment: 0.02
+    volume_initial: 0.75
+    volume_max: 0.95
+    volume_min: 0.0  # must not exceed volume_max
+    media_pipeline:
+      speaker: speaker_id
+      format: FLAC  # same as the schema default
+      num_channels: 1  # explicit value; the speaker above is configured with 2
+      sources:
+        - audio_file_source
+    on_mute:
+      - media_player.pause:
+          id: media_player_id
+    on_unmute:
+      - media_player.play:
+          id: media_player_id
--- a/tests/components/speaker_source/test.esp32-idf.yaml
+++ b/tests/components/speaker_source/test.esp32-idf.yaml
@@ -0,0 +1,9 @@
+substitutions:  # pin assignments consumed by common.yaml through ${...}
+  scl_pin: GPIO16  # not referenced by common.yaml
+  sda_pin: GPIO17  # not referenced by common.yaml
+  i2s_bclk_pin: GPIO27
+  i2s_lrclk_pin: GPIO26
+  i2s_mclk_pin: GPIO25
+  i2s_dout_pin: GPIO23
+
+<<: !include common.yaml  # merges the shared test configuration into this file
--- a/tests/components/speaker_source/test.wav
+++ b/tests/components/speaker_source/test.wav