[api][voice_assistant] Add second audio channel for voice_assistant (#16265)

Co-authored-by: Kevin Ahrendt <kevin.ahrendt@openhomefoundation.org> Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
2026-06-02 11:08:06 +08:00 · 2026-05-12 20:38:39 -05:00
parent 65b53692bd
commit f94735dc62
9 changed files with 179 additions and 37 deletions
@@ -2026,6 +2026,7 @@ message VoiceAssistantAudio {
  bytes data = 1 [(pointer_to_buffer) = true];
  bool end = 2;
  bytes data2 = 3 [(pointer_to_buffer) = true];
 }
 enum VoiceAssistantTimerEvent {
@@ -2893,6 +2893,11 @@ bool VoiceAssistantAudio::decode_length(uint32_t field_id, ProtoLengthDelimited
      this->data_len = value.size();
      break;
    }
    case 3: {
      this->data2 = value.data();
      this->data2_len = value.size();
      break;
    }
    default:
      return false;
  }
@@ -2902,12 +2907,14 @@ uint8_t *VoiceAssistantAudio::encode(ProtoWriteBuffer &buffer PROTO_ENCODE_DEBUG
  uint8_t *__restrict__ pos = buffer.get_pos();
  ProtoEncode::encode_bytes(pos PROTO_ENCODE_DEBUG_ARG, 1, this->data, this->data_len);
  ProtoEncode::encode_bool(pos PROTO_ENCODE_DEBUG_ARG, 2, this->end);
  ProtoEncode::encode_bytes(pos PROTO_ENCODE_DEBUG_ARG, 3, this->data2, this->data2_len);
  return pos;
 }
 uint32_t VoiceAssistantAudio::calculate_size() const {
  // Total is the sum of the per-field sizes reported by the ProtoSize helpers:
  // the primary payload length, the end flag, and the second payload length.
  // NOTE(review): the leading `1` passed to each helper is presumably the
  // encoded tag size rather than the field number — confirm against ProtoSize.
  const uint32_t primary_size = ProtoSize::calc_length(1, this->data_len);
  const uint32_t end_flag_size = ProtoSize::calc_bool(1, this->end);
  const uint32_t secondary_size = ProtoSize::calc_length(1, this->data2_len);
  return primary_size + end_flag_size + secondary_size;
 }
 bool VoiceAssistantTimerEventResponse::decode_varint(uint32_t field_id, proto_varint_value_t value) {
@@ -2436,13 +2436,15 @@ class VoiceAssistantEventResponse final : public ProtoDecodableMessage {
 class VoiceAssistantAudio final : public ProtoDecodableMessage {
 public:
  static constexpr uint8_t MESSAGE_TYPE = 106;
-  static constexpr uint8_t ESTIMATED_SIZE = 21;
+  static constexpr uint8_t ESTIMATED_SIZE = 40;
 #ifdef HAS_PROTO_MESSAGE_DUMP
  const LogString *message_name() const override { return LOG_STR("voice_assistant_audio"); }
 #endif
  const uint8_t *data{nullptr};
  uint16_t data_len{0};
  bool end{false};
  const uint8_t *data2{nullptr};
  uint16_t data2_len{0};
  uint8_t *encode(ProtoWriteBuffer &buffer PROTO_ENCODE_DEBUG_PARAM) const;
  uint32_t calculate_size() const;
 #ifdef HAS_PROTO_MESSAGE_DUMP
@@ -2174,6 +2174,7 @@ const char *VoiceAssistantAudio::dump_to(DumpBuffer &out) const {
  // Hands each field of the message (data, end, data2) to the dump helpers
  // together with its name, then returns the buffer contents via out.c_str().
  MessageDumpHelper helper(out, ESPHOME_PSTR("VoiceAssistantAudio"));
  dump_bytes_field(out, ESPHOME_PSTR("data"), this->data, this->data_len);
  dump_field(out, ESPHOME_PSTR("end"), this->end);
  // Payload of the second audio channel.
  dump_bytes_field(out, ESPHOME_PSTR("data2"), this->data2, this->data2_len);
  return out.c_str();
 }
 const char *VoiceAssistantTimerEventResponse::dump_to(DumpBuffer &out) const {
@@ -53,6 +53,8 @@ CONF_ON_TIMER_CANCELLED = "on_timer_cancelled"
 CONF_ON_TIMER_FINISHED = "on_timer_finished"
 CONF_ON_TIMER_TICK = "on_timer_tick"
 MAX_MICROPHONE_SOURCES = 2
 voice_assistant_ns = cg.esphome_ns.namespace("voice_assistant")
 VoiceAssistant = voice_assistant_ns.class_("VoiceAssistant", cg.Component)
@@ -90,13 +92,20 @@ CONFIG_SCHEMA = cv.All(
    cv.Schema(
        {
            cv.GenerateID(): cv.declare_id(VoiceAssistant),
-            cv.Optional(
+            cv.Optional(CONF_MICROPHONE, default=[{}]): cv.All(
-                CONF_MICROPHONE, default={}
+                cv.ensure_list(
-            ): microphone.microphone_source_schema(
+                    microphone.microphone_source_schema(
-                min_bits_per_sample=16,
+                        min_bits_per_sample=16,
-                max_bits_per_sample=16,
+                        max_bits_per_sample=16,
-                min_channels=1,
+                        min_channels=1,
-                max_channels=1,
+                        max_channels=1,
                    )
                ),
                # Validate the two bounds separately: a single
                # Length(min=..., max=..., msg=...) raises the same custom message
                # for both violations, so an empty list would be reported with the
                # misleading "at most" wording.
                cv.Length(
                    min=1,
                    msg="Voice Assistant requires at least one microphone source",
                ),
                cv.Length(
                    max=MAX_MICROPHONE_SOURCES,
                    msg=f"Voice Assistant supports at most {MAX_MICROPHONE_SOURCES} microphone sources",
                ),
            ),
            cv.Exclusive(CONF_MEDIA_PLAYER, "output"): cv.use_id(
                media_player.MediaPlayer
@@ -179,10 +188,10 @@ CONFIG_SCHEMA = cv.All(
 FINAL_VALIDATE_SCHEMA = cv.All(
    cv.Schema(
        {
-            cv.Optional(
+            cv.Optional(CONF_MICROPHONE): cv.ensure_list(
-                CONF_MICROPHONE
+                microphone.final_validate_microphone_source_schema(
-            ): microphone.final_validate_microphone_source_schema(
+                    "voice_assistant", sample_rate=16000
-                "voice_assistant", sample_rate=16000
+                )
            ),
        },
        extra=cv.ALLOW_EXTRA,
@@ -194,9 +203,14 @@ async def to_code(config):
    var = cg.new_Pvariable(config[CONF_ID])
    await cg.register_component(var, config)
-    mic_source = await microphone.microphone_source_to_code(config[CONF_MICROPHONE])
+    mic_sources = config[CONF_MICROPHONE]
    mic_source = await microphone.microphone_source_to_code(mic_sources[0])
    cg.add(var.set_microphone_source(mic_source))
    if len(mic_sources) > 1:
        mic_source2 = await microphone.microphone_source_to_code(mic_sources[1])
        cg.add(var.set_microphone_source2(mic_source2))
    if CONF_MICRO_WAKE_WORD in config:
        mww = await cg.get_variable(config[CONF_MICRO_WAKE_WORD])
        cg.add(var.set_micro_wake_word(mww))
@@ -31,11 +31,21 @@ VoiceAssistant::VoiceAssistant() { global_voice_assistant = this; }
 void VoiceAssistant::setup() {
  this->mic_source_->add_data_callback([this](const std::vector<uint8_t> &data) {
    std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = this->ring_buffer_;
-    if (this->ring_buffer_.use_count() > 1) {
+    if (temp_ring_buffer != nullptr) {
      temp_ring_buffer->write((void *) data.data(), data.size());
    }
  });
  // Second microphone channel
  if (this->mic_source2_ != nullptr) {
    this->mic_source2_->add_data_callback([this](const std::vector<uint8_t> &data) {
      std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = this->ring_buffer2_;
      if (temp_ring_buffer != nullptr) {
        temp_ring_buffer->write((void *) data.data(), data.size());
      }
    });
  }
 #ifdef USE_MEDIA_PLAYER
  if (this->media_player_ != nullptr) {
    this->media_player_->add_on_state_callback([this](media_player::MediaPlayerState state) {
@@ -115,9 +125,9 @@ bool VoiceAssistant::allocate_buffers_() {
  }
 #endif
-  if (this->ring_buffer_.use_count() == 0) {
+  if (this->ring_buffer_ == nullptr) {
    this->ring_buffer_ = ring_buffer::RingBuffer::create(RING_BUFFER_SIZE);
-    if (this->ring_buffer_.use_count() == 0) {
+    if (this->ring_buffer_ == nullptr) {
      ESP_LOGE(TAG, "Could not allocate ring buffer");
      return false;
    }
@@ -132,6 +142,26 @@ bool VoiceAssistant::allocate_buffers_() {
    }
  }
  // Second microphone channel
  if (this->mic_source2_ != nullptr) {
    if (this->ring_buffer2_ == nullptr) {
      this->ring_buffer2_ = ring_buffer::RingBuffer::create(RING_BUFFER_SIZE);
      if (this->ring_buffer2_ == nullptr) {
        ESP_LOGE(TAG, "Could not allocate second ring buffer");
        return false;
      }
    }
    if (this->send_buffer2_ == nullptr) {
      RAMAllocator<uint8_t> send_allocator;
      this->send_buffer2_ = send_allocator.allocate(SEND_BUFFER_SIZE);
      if (this->send_buffer2_ == nullptr) {
        ESP_LOGW(TAG, "Could not allocate second send buffer");
        return false;
      }
    }
  }
  return true;
 }
@@ -144,6 +174,15 @@ void VoiceAssistant::clear_buffers_() {
    this->ring_buffer_->reset();
  }
  // Second microphone channel
  if (this->send_buffer2_ != nullptr) {
    memset(this->send_buffer2_, 0, SEND_BUFFER_SIZE);
  }
  if (this->ring_buffer2_ != nullptr) {
    this->ring_buffer2_->reset();
  }
 #ifdef USE_SPEAKER
  if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
    memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE);
@@ -162,10 +201,17 @@ void VoiceAssistant::deallocate_buffers_() {
    this->send_buffer_ = nullptr;
  }
-  if (this->ring_buffer_.use_count() > 0) {
+  this->ring_buffer_.reset();
-    this->ring_buffer_.reset();
+
  // Second microphone channel
  if (this->send_buffer2_ != nullptr) {
    RAMAllocator<uint8_t> send_deallocator;
    send_deallocator.deallocate(this->send_buffer2_, SEND_BUFFER_SIZE);
    this->send_buffer2_ = nullptr;
  }
  this->ring_buffer2_.reset();
 #ifdef USE_SPEAKER
  if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
    RAMAllocator<uint8_t> speaker_deallocator;
@@ -183,7 +229,8 @@ void VoiceAssistant::reset_conversation_id() {
 void VoiceAssistant::loop() {
  if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
      this->state_ != State::STOPPING_MICROPHONE) {
-    if (this->mic_source_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
+    if (this->mic_source_->is_running() || (this->mic_source2_ && this->mic_source2_->is_running()) ||
        this->state_ == State::STARTING_MICROPHONE) {
      this->set_state_(State::STOP_MICROPHONE, State::IDLE);
    } else {
      this->set_state_(State::IDLE, State::IDLE);
@@ -215,11 +262,14 @@ void VoiceAssistant::loop() {
      this->clear_buffers_();
      this->mic_source_->start();
      if (this->mic_source2_) {
        this->mic_source2_->start();
      }
      this->set_state_(State::STARTING_MICROPHONE);
      break;
    }
    case State::STARTING_MICROPHONE: {
-      if (this->mic_source_->is_running()) {
+      if (this->mic_source_->is_running() && (!this->mic_source2_ || this->mic_source2_->is_running())) {
        this->set_state_(this->desired_state_);
      }
      break;
@@ -266,15 +316,44 @@ void VoiceAssistant::loop() {
      break;  // State changed when udp server port received
    }
    case State::STREAMING_MICROPHONE: {
-      size_t available = this->ring_buffer_->available();
+      if (this->audio_mode_ == AUDIO_MODE_API) {
-      while (available >= SEND_BUFFER_SIZE) {
+        // API audio
-        size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
+        // Both microphone channels are sent, if configured
-        if (this->audio_mode_ == AUDIO_MODE_API) {
+        bool is_available = this->ring_buffer_->available() >= SEND_BUFFER_SIZE;
        bool is_available2 = false;
        if (this->mic_source2_) {
          is_available2 = this->ring_buffer2_->available() >= SEND_BUFFER_SIZE;
        }
        while (is_available || is_available2) {
          api::VoiceAssistantAudio msg;
-          msg.data = this->send_buffer_;
+
-          msg.data_len = read_bytes;
+          if (is_available) {
            size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
            msg.data = this->send_buffer_;
            msg.data_len = read_bytes;
          }
          // Second microphone channel
          if (is_available2) {
            size_t read_bytes = this->ring_buffer2_->read((void *) this->send_buffer2_, SEND_BUFFER_SIZE, 0);
            msg.data2 = this->send_buffer2_;
            msg.data2_len = read_bytes;
          }
          this->api_client_->send_message(msg);
-        } else {
+          is_available = this->ring_buffer_->available() >= SEND_BUFFER_SIZE;
          if (this->mic_source2_) {
            is_available2 = this->ring_buffer2_->available() >= SEND_BUFFER_SIZE;
          } else {
            is_available2 = false;
          }
        }
      } else {
        // UDP (will eventually be deprecated)
        // Only the primary microphone channel is used
        while (this->ring_buffer_->available() >= SEND_BUFFER_SIZE) {
          size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
          if (!this->udp_socket_running_) {
            if (!this->start_udp_socket_()) {
              this->set_state_(State::STOP_MICROPHONE, State::IDLE);
@@ -284,14 +363,23 @@ void VoiceAssistant::loop() {
          this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
                                sizeof(this->dest_addr_));
        }
-        available = this->ring_buffer_->available();
+      }  // audio mode
      }
      break;
    }
    case State::STOP_MICROPHONE: {
-      if (this->mic_source_->is_running()) {
+      // Check both microphone channels
-        this->mic_source_->stop();
+      bool is_running = this->mic_source_->is_running();
      bool is_running2 = false;
      if (this->mic_source2_) {
        is_running2 = this->mic_source2_->is_running();
      }
      if (is_running || is_running2) {
        if (is_running) {
          this->mic_source_->stop();
        }
        if (is_running2) {
          this->mic_source2_->stop();
        }
        this->set_state_(State::STOPPING_MICROPHONE);
      } else {
        this->set_state_(this->desired_state_);
@@ -299,7 +387,13 @@ void VoiceAssistant::loop() {
      break;
    }
    case State::STOPPING_MICROPHONE: {
-      if (this->mic_source_->is_stopped()) {
+      // Check both microphone channels
      bool is_stopped = this->mic_source_->is_stopped();
      bool is_stopped2 = true;
      if (this->mic_source2_) {
        is_stopped2 = this->mic_source2_->is_stopped();
      }
      if (is_stopped && is_stopped2) {
        this->set_state_(this->desired_state_);
      }
      break;
@@ -504,7 +598,8 @@ void VoiceAssistant::start_streaming() {
  ESP_LOGD(TAG, "Client started, streaming microphone");
  this->audio_mode_ = AUDIO_MODE_API;
-  if (this->mic_source_->is_running()) {
+  // Both microphone channels
  if (this->mic_source_->is_running() && (!this->mic_source2_ || this->mic_source2_->is_running())) {
    this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
  } else {
    this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE);
@@ -520,6 +615,10 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por
  ESP_LOGD(TAG, "Client started, streaming microphone");
  this->audio_mode_ = AUDIO_MODE_UDP;
  if (this->mic_source2_ != nullptr) {
    ESP_LOGW(TAG, "UDP audio mode does not support a second microphone channel; only the primary will be streamed");
  }
  memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
  if (this->dest_addr_.ss_family == AF_INET) {
    ((struct sockaddr_in *) &this->dest_addr_)->sin_port = htons(port);
@@ -534,6 +633,7 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por
    return;
  }
  // Only primary microphone channel over UDP
  if (this->mic_source_->is_running()) {
    this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
  } else {
@@ -40,6 +40,7 @@ enum VoiceAssistantFeature : uint32_t {
  FEATURE_TIMERS = 1 << 3,
  FEATURE_ANNOUNCE = 1 << 4,
  FEATURE_START_CONVERSATION = 1 << 5,
  FEATURE_MULTI_CHANNEL_AUDIO = 1 << 6,
 };
 enum class State {
@@ -120,6 +121,7 @@ class VoiceAssistant : public Component {
  void failed_to_start();
  /// Stores the primary microphone source; the pointer is kept as-is, not copied.
  void set_microphone_source(microphone::MicrophoneSource *mic_source) { this->mic_source_ = mic_source; }
  /// Stores the optional second microphone source (second audio channel).
  /// The member defaults to nullptr, which the component treats as "no second channel configured".
  void set_microphone_source2(microphone::MicrophoneSource *mic_source2) { this->mic_source2_ = mic_source2; }
 #ifdef USE_MICRO_WAKE_WORD
  void set_micro_wake_word(micro_wake_word::MicroWakeWord *mww) { this->micro_wake_word_ = mww; }
 #endif
@@ -149,6 +151,9 @@ class VoiceAssistant : public Component {
    uint32_t flags = 0;
    flags |= VoiceAssistantFeature::FEATURE_VOICE_ASSISTANT;
    flags |= VoiceAssistantFeature::FEATURE_API_AUDIO;
    if (this->mic_source2_ != nullptr) {
      flags |= VoiceAssistantFeature::FEATURE_MULTI_CHANNEL_AUDIO;
    }
 #ifdef USE_SPEAKER
    if (this->speaker_ != nullptr) {
      flags |= VoiceAssistantFeature::FEATURE_SPEAKER;
@@ -276,6 +281,7 @@ class VoiceAssistant : public Component {
  bool timer_tick_running_{false};
  microphone::MicrophoneSource *mic_source_{nullptr};
  microphone::MicrophoneSource *mic_source2_{nullptr};
 #ifdef USE_SPEAKER
  void write_speaker_();
  speaker::Speaker *speaker_{nullptr};
@@ -301,6 +307,7 @@ class VoiceAssistant : public Component {
  std::string wake_word_;
  std::shared_ptr<ring_buffer::RingBuffer> ring_buffer_;
  std::shared_ptr<ring_buffer::RingBuffer> ring_buffer2_;
  bool use_wake_word_;
  uint8_t noise_suppression_level_;
@@ -309,6 +316,7 @@ class VoiceAssistant : public Component {
  uint32_t conversation_timeout_;
  uint8_t *send_buffer_{nullptr};
  uint8_t *send_buffer2_{nullptr};
  bool continuous_{false};
  bool silence_detection_;
@@ -31,6 +31,11 @@ microphone:
    i2s_din_pin: ${i2s_din_pin}
    adc_type: external
    pdm: false
  - platform: i2s_audio
    id: mic_id_external2
    i2s_din_pin: ${i2s_din_pin2}
    adc_type: external
    pdm: false
 speaker:
  - platform: i2s_audio
@@ -40,9 +45,12 @@ speaker:
 voice_assistant:
  microphone:
-    microphone: mic_id_external
+    - microphone: mic_id_external
-    gain_factor: 4
+      gain_factor: 4
-    channels: 0
+      channels: 0
    - microphone: mic_id_external2
      gain_factor: 4
      channels: 0
  speaker: speaker_id
  micro_wake_word: mww_id
  conversation_timeout: 60s
@@ -3,6 +3,7 @@ substitutions:
  i2s_bclk_pin: GPIO5
  i2s_mclk_pin: GPIO15
  i2s_din_pin: GPIO13
  i2s_din_pin2: GPIO14
  i2s_dout_pin: GPIO12
 <<: !include common-idf.yaml