[api][voice_assistant] Add second audio channel for voice_assistant (#16265)

Co-authored-by: Kevin Ahrendt <kevin.ahrendt@openhomefoundation.org>
Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
This commit is contained in:
Michael Hansen
2026-05-12 20:38:39 -05:00
committed by GitHub
parent 65b53692bd
commit f94735dc62
9 changed files with 179 additions and 37 deletions
+1
View File
@@ -2026,6 +2026,7 @@ message VoiceAssistantAudio {
bytes data = 1 [(pointer_to_buffer) = true];
bool end = 2;
bytes data2 = 3 [(pointer_to_buffer) = true];
}
enum VoiceAssistantTimerEvent {
+7
View File
@@ -2893,6 +2893,11 @@ bool VoiceAssistantAudio::decode_length(uint32_t field_id, ProtoLengthDelimited
this->data_len = value.size();
break;
}
case 3: {
this->data2 = value.data();
this->data2_len = value.size();
break;
}
default:
return false;
}
@@ -2902,12 +2907,14 @@ uint8_t *VoiceAssistantAudio::encode(ProtoWriteBuffer &buffer PROTO_ENCODE_DEBUG
uint8_t *__restrict__ pos = buffer.get_pos();
ProtoEncode::encode_bytes(pos PROTO_ENCODE_DEBUG_ARG, 1, this->data, this->data_len);
ProtoEncode::encode_bool(pos PROTO_ENCODE_DEBUG_ARG, 2, this->end);
ProtoEncode::encode_bytes(pos PROTO_ENCODE_DEBUG_ARG, 3, this->data2, this->data2_len);
return pos;
}
// Sums the per-field size contributions reported by ProtoSize for the three
// fields of VoiceAssistantAudio, in the same order encode() writes them:
// data (bytes), end (bool), data2 (bytes).
uint32_t VoiceAssistantAudio::calculate_size() const {
uint32_t size = 0;
// NOTE(review): the first argument is 1 for every field here, whereas encode()
// passes field ids 1/2/3 — presumably this argument is the encoded tag width in
// bytes rather than the field id; confirm against ProtoSize before changing it.
size += ProtoSize::calc_length(1, this->data_len);
size += ProtoSize::calc_bool(1, this->end);
// Second audio channel payload (field 3, data2).
size += ProtoSize::calc_length(1, this->data2_len);
return size;
}
bool VoiceAssistantTimerEventResponse::decode_varint(uint32_t field_id, proto_varint_value_t value) {
+3 -1
View File
@@ -2436,13 +2436,15 @@ class VoiceAssistantEventResponse final : public ProtoDecodableMessage {
class VoiceAssistantAudio final : public ProtoDecodableMessage {
public:
static constexpr uint8_t MESSAGE_TYPE = 106;
static constexpr uint8_t ESTIMATED_SIZE = 21;
static constexpr uint8_t ESTIMATED_SIZE = 40;
#ifdef HAS_PROTO_MESSAGE_DUMP
const LogString *message_name() const override { return LOG_STR("voice_assistant_audio"); }
#endif
const uint8_t *data{nullptr};
uint16_t data_len{0};
bool end{false};
const uint8_t *data2{nullptr};
uint16_t data2_len{0};
uint8_t *encode(ProtoWriteBuffer &buffer PROTO_ENCODE_DEBUG_PARAM) const;
uint32_t calculate_size() const;
#ifdef HAS_PROTO_MESSAGE_DUMP
+1
View File
@@ -2174,6 +2174,7 @@ const char *VoiceAssistantAudio::dump_to(DumpBuffer &out) const {
MessageDumpHelper helper(out, ESPHOME_PSTR("VoiceAssistantAudio"));
dump_bytes_field(out, ESPHOME_PSTR("data"), this->data, this->data_len);
dump_field(out, ESPHOME_PSTR("end"), this->end);
dump_bytes_field(out, ESPHOME_PSTR("data2"), this->data2, this->data2_len);
return out.c_str();
}
const char *VoiceAssistantTimerEventResponse::dump_to(DumpBuffer &out) const {
+26 -12
View File
@@ -53,6 +53,8 @@ CONF_ON_TIMER_CANCELLED = "on_timer_cancelled"
CONF_ON_TIMER_FINISHED = "on_timer_finished"
CONF_ON_TIMER_TICK = "on_timer_tick"
# Upper bound on the number of entries accepted in the `microphone` list.
# Enforced through cv.Length in CONFIG_SCHEMA; to_code only wires up list
# indices 0 and 1, so raising this value alone does not add more channels.
MAX_MICROPHONE_SOURCES = 2
voice_assistant_ns = cg.esphome_ns.namespace("voice_assistant")
VoiceAssistant = voice_assistant_ns.class_("VoiceAssistant", cg.Component)
@@ -90,13 +92,20 @@ CONFIG_SCHEMA = cv.All(
cv.Schema(
{
cv.GenerateID(): cv.declare_id(VoiceAssistant),
cv.Optional(
CONF_MICROPHONE, default={}
): microphone.microphone_source_schema(
min_bits_per_sample=16,
max_bits_per_sample=16,
min_channels=1,
max_channels=1,
cv.Optional(CONF_MICROPHONE, default=[{}]): cv.All(
cv.ensure_list(
microphone.microphone_source_schema(
min_bits_per_sample=16,
max_bits_per_sample=16,
min_channels=1,
max_channels=1,
)
),
cv.Length(
min=1,
max=MAX_MICROPHONE_SOURCES,
msg=f"Voice Assistant supports at most {MAX_MICROPHONE_SOURCES} microphone sources",
),
),
cv.Exclusive(CONF_MEDIA_PLAYER, "output"): cv.use_id(
media_player.MediaPlayer
@@ -179,10 +188,10 @@ CONFIG_SCHEMA = cv.All(
FINAL_VALIDATE_SCHEMA = cv.All(
cv.Schema(
{
cv.Optional(
CONF_MICROPHONE
): microphone.final_validate_microphone_source_schema(
"voice_assistant", sample_rate=16000
cv.Optional(CONF_MICROPHONE): cv.ensure_list(
microphone.final_validate_microphone_source_schema(
"voice_assistant", sample_rate=16000
)
),
},
extra=cv.ALLOW_EXTRA,
@@ -194,9 +203,14 @@ async def to_code(config):
var = cg.new_Pvariable(config[CONF_ID])
await cg.register_component(var, config)
mic_source = await microphone.microphone_source_to_code(config[CONF_MICROPHONE])
mic_sources = config[CONF_MICROPHONE]
mic_source = await microphone.microphone_source_to_code(mic_sources[0])
cg.add(var.set_microphone_source(mic_source))
if len(mic_sources) > 1:
mic_source2 = await microphone.microphone_source_to_code(mic_sources[1])
cg.add(var.set_microphone_source2(mic_source2))
if CONF_MICRO_WAKE_WORD in config:
mww = await cg.get_variable(config[CONF_MICRO_WAKE_WORD])
cg.add(var.set_micro_wake_word(mww))
@@ -31,11 +31,21 @@ VoiceAssistant::VoiceAssistant() { global_voice_assistant = this; }
void VoiceAssistant::setup() {
this->mic_source_->add_data_callback([this](const std::vector<uint8_t> &data) {
std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = this->ring_buffer_;
if (this->ring_buffer_.use_count() > 1) {
if (temp_ring_buffer != nullptr) {
temp_ring_buffer->write((void *) data.data(), data.size());
}
});
// Second microphone channel
if (this->mic_source2_ != nullptr) {
this->mic_source2_->add_data_callback([this](const std::vector<uint8_t> &data) {
std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = this->ring_buffer2_;
if (temp_ring_buffer != nullptr) {
temp_ring_buffer->write((void *) data.data(), data.size());
}
});
}
#ifdef USE_MEDIA_PLAYER
if (this->media_player_ != nullptr) {
this->media_player_->add_on_state_callback([this](media_player::MediaPlayerState state) {
@@ -115,9 +125,9 @@ bool VoiceAssistant::allocate_buffers_() {
}
#endif
if (this->ring_buffer_.use_count() == 0) {
if (this->ring_buffer_ == nullptr) {
this->ring_buffer_ = ring_buffer::RingBuffer::create(RING_BUFFER_SIZE);
if (this->ring_buffer_.use_count() == 0) {
if (this->ring_buffer_ == nullptr) {
ESP_LOGE(TAG, "Could not allocate ring buffer");
return false;
}
@@ -132,6 +142,26 @@ bool VoiceAssistant::allocate_buffers_() {
}
}
// Second microphone channel
if (this->mic_source2_ != nullptr) {
if (this->ring_buffer2_ == nullptr) {
this->ring_buffer2_ = ring_buffer::RingBuffer::create(RING_BUFFER_SIZE);
if (this->ring_buffer2_ == nullptr) {
ESP_LOGE(TAG, "Could not allocate second ring buffer");
return false;
}
}
if (this->send_buffer2_ == nullptr) {
RAMAllocator<uint8_t> send_allocator;
this->send_buffer2_ = send_allocator.allocate(SEND_BUFFER_SIZE);
if (this->send_buffer2_ == nullptr) {
ESP_LOGW(TAG, "Could not allocate second send buffer");
return false;
}
}
}
return true;
}
@@ -144,6 +174,15 @@ void VoiceAssistant::clear_buffers_() {
this->ring_buffer_->reset();
}
// Second microphone channel
if (this->send_buffer2_ != nullptr) {
memset(this->send_buffer2_, 0, SEND_BUFFER_SIZE);
}
if (this->ring_buffer2_ != nullptr) {
this->ring_buffer2_->reset();
}
#ifdef USE_SPEAKER
if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE);
@@ -162,10 +201,17 @@ void VoiceAssistant::deallocate_buffers_() {
this->send_buffer_ = nullptr;
}
if (this->ring_buffer_.use_count() > 0) {
this->ring_buffer_.reset();
this->ring_buffer_.reset();
// Second microphone channel
if (this->send_buffer2_ != nullptr) {
RAMAllocator<uint8_t> send_deallocator;
send_deallocator.deallocate(this->send_buffer2_, SEND_BUFFER_SIZE);
this->send_buffer2_ = nullptr;
}
this->ring_buffer2_.reset();
#ifdef USE_SPEAKER
if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
RAMAllocator<uint8_t> speaker_deallocator;
@@ -183,7 +229,8 @@ void VoiceAssistant::reset_conversation_id() {
void VoiceAssistant::loop() {
if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
this->state_ != State::STOPPING_MICROPHONE) {
if (this->mic_source_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
if (this->mic_source_->is_running() || (this->mic_source2_ && this->mic_source2_->is_running()) ||
this->state_ == State::STARTING_MICROPHONE) {
this->set_state_(State::STOP_MICROPHONE, State::IDLE);
} else {
this->set_state_(State::IDLE, State::IDLE);
@@ -215,11 +262,14 @@ void VoiceAssistant::loop() {
this->clear_buffers_();
this->mic_source_->start();
if (this->mic_source2_) {
this->mic_source2_->start();
}
this->set_state_(State::STARTING_MICROPHONE);
break;
}
case State::STARTING_MICROPHONE: {
if (this->mic_source_->is_running()) {
if (this->mic_source_->is_running() && (!this->mic_source2_ || this->mic_source2_->is_running())) {
this->set_state_(this->desired_state_);
}
break;
@@ -266,15 +316,44 @@ void VoiceAssistant::loop() {
break; // State changed when udp server port received
}
case State::STREAMING_MICROPHONE: {
size_t available = this->ring_buffer_->available();
while (available >= SEND_BUFFER_SIZE) {
size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
if (this->audio_mode_ == AUDIO_MODE_API) {
if (this->audio_mode_ == AUDIO_MODE_API) {
// API audio
// Both microphone channels are sent, if configured
bool is_available = this->ring_buffer_->available() >= SEND_BUFFER_SIZE;
bool is_available2 = false;
if (this->mic_source2_) {
is_available2 = this->ring_buffer2_->available() >= SEND_BUFFER_SIZE;
}
while (is_available || is_available2) {
api::VoiceAssistantAudio msg;
msg.data = this->send_buffer_;
msg.data_len = read_bytes;
if (is_available) {
size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
msg.data = this->send_buffer_;
msg.data_len = read_bytes;
}
// Second microphone channel
if (is_available2) {
size_t read_bytes = this->ring_buffer2_->read((void *) this->send_buffer2_, SEND_BUFFER_SIZE, 0);
msg.data2 = this->send_buffer2_;
msg.data2_len = read_bytes;
}
this->api_client_->send_message(msg);
} else {
is_available = this->ring_buffer_->available() >= SEND_BUFFER_SIZE;
if (this->mic_source2_) {
is_available2 = this->ring_buffer2_->available() >= SEND_BUFFER_SIZE;
} else {
is_available2 = false;
}
}
} else {
// UDP (will eventually be deprecated)
// Only the primary microphone channel is used
while (this->ring_buffer_->available() >= SEND_BUFFER_SIZE) {
size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
if (!this->udp_socket_running_) {
if (!this->start_udp_socket_()) {
this->set_state_(State::STOP_MICROPHONE, State::IDLE);
@@ -284,14 +363,23 @@ void VoiceAssistant::loop() {
this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
sizeof(this->dest_addr_));
}
available = this->ring_buffer_->available();
}
} // audio mode
break;
}
case State::STOP_MICROPHONE: {
if (this->mic_source_->is_running()) {
this->mic_source_->stop();
// Check both microphone channels
bool is_running = this->mic_source_->is_running();
bool is_running2 = false;
if (this->mic_source2_) {
is_running2 = this->mic_source2_->is_running();
}
if (is_running || is_running2) {
if (is_running) {
this->mic_source_->stop();
}
if (is_running2) {
this->mic_source2_->stop();
}
this->set_state_(State::STOPPING_MICROPHONE);
} else {
this->set_state_(this->desired_state_);
@@ -299,7 +387,13 @@ void VoiceAssistant::loop() {
break;
}
case State::STOPPING_MICROPHONE: {
if (this->mic_source_->is_stopped()) {
// Check both microphone channels
bool is_stopped = this->mic_source_->is_stopped();
bool is_stopped2 = true;
if (this->mic_source2_) {
is_stopped2 = this->mic_source2_->is_stopped();
}
if (is_stopped && is_stopped2) {
this->set_state_(this->desired_state_);
}
break;
@@ -504,7 +598,8 @@ void VoiceAssistant::start_streaming() {
ESP_LOGD(TAG, "Client started, streaming microphone");
this->audio_mode_ = AUDIO_MODE_API;
if (this->mic_source_->is_running()) {
// Both microphone channels
if (this->mic_source_->is_running() && (!this->mic_source2_ || this->mic_source2_->is_running())) {
this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
} else {
this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE);
@@ -520,6 +615,10 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por
ESP_LOGD(TAG, "Client started, streaming microphone");
this->audio_mode_ = AUDIO_MODE_UDP;
if (this->mic_source2_ != nullptr) {
ESP_LOGW(TAG, "UDP audio mode does not support a second microphone channel; only the primary will be streamed");
}
memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
if (this->dest_addr_.ss_family == AF_INET) {
((struct sockaddr_in *) &this->dest_addr_)->sin_port = htons(port);
@@ -534,6 +633,7 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por
return;
}
// Only primary microphone channel over UDP
if (this->mic_source_->is_running()) {
this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
} else {
@@ -40,6 +40,7 @@ enum VoiceAssistantFeature : uint32_t {
FEATURE_TIMERS = 1 << 3,
FEATURE_ANNOUNCE = 1 << 4,
FEATURE_START_CONVERSATION = 1 << 5,
FEATURE_MULTI_CHANNEL_AUDIO = 1 << 6,
};
enum class State {
@@ -120,6 +121,7 @@ class VoiceAssistant : public Component {
void failed_to_start();
// Stores the primary microphone source. The component dereferences this
// pointer without a null check (e.g. when registering the data callback and
// when starting/stopping the microphone), so it is expected to be set.
void set_microphone_source(microphone::MicrophoneSource *mic_source) { this->mic_source_ = mic_source; }
// Stores the optional second microphone source. It defaults to nullptr; the
// component null-checks it before use, and FEATURE_MULTI_CHANNEL_AUDIO is only
// reported when it is non-null.
void set_microphone_source2(microphone::MicrophoneSource *mic_source2) { this->mic_source2_ = mic_source2; }
#ifdef USE_MICRO_WAKE_WORD
void set_micro_wake_word(micro_wake_word::MicroWakeWord *mww) { this->micro_wake_word_ = mww; }
#endif
@@ -149,6 +151,9 @@ class VoiceAssistant : public Component {
uint32_t flags = 0;
flags |= VoiceAssistantFeature::FEATURE_VOICE_ASSISTANT;
flags |= VoiceAssistantFeature::FEATURE_API_AUDIO;
if (this->mic_source2_ != nullptr) {
flags |= VoiceAssistantFeature::FEATURE_MULTI_CHANNEL_AUDIO;
}
#ifdef USE_SPEAKER
if (this->speaker_ != nullptr) {
flags |= VoiceAssistantFeature::FEATURE_SPEAKER;
@@ -276,6 +281,7 @@ class VoiceAssistant : public Component {
bool timer_tick_running_{false};
microphone::MicrophoneSource *mic_source_{nullptr};
microphone::MicrophoneSource *mic_source2_{nullptr};
#ifdef USE_SPEAKER
void write_speaker_();
speaker::Speaker *speaker_{nullptr};
@@ -301,6 +307,7 @@ class VoiceAssistant : public Component {
std::string wake_word_;
std::shared_ptr<ring_buffer::RingBuffer> ring_buffer_;
std::shared_ptr<ring_buffer::RingBuffer> ring_buffer2_;
bool use_wake_word_;
uint8_t noise_suppression_level_;
@@ -309,6 +316,7 @@ class VoiceAssistant : public Component {
uint32_t conversation_timeout_;
uint8_t *send_buffer_{nullptr};
uint8_t *send_buffer2_{nullptr};
bool continuous_{false};
bool silence_detection_;
@@ -31,6 +31,11 @@ microphone:
i2s_din_pin: ${i2s_din_pin}
adc_type: external
pdm: false
- platform: i2s_audio
id: mic_id_external2
i2s_din_pin: ${i2s_din_pin2}
adc_type: external
pdm: false
speaker:
- platform: i2s_audio
@@ -40,9 +45,12 @@ speaker:
voice_assistant:
microphone:
microphone: mic_id_external
gain_factor: 4
channels: 0
- microphone: mic_id_external
gain_factor: 4
channels: 0
- microphone: mic_id_external2
gain_factor: 4
channels: 0
speaker: speaker_id
micro_wake_word: mww_id
conversation_timeout: 60s
@@ -3,6 +3,7 @@ substitutions:
i2s_bclk_pin: GPIO5
i2s_mclk_pin: GPIO15
i2s_din_pin: GPIO13
i2s_din_pin2: GPIO14
i2s_dout_pin: GPIO12
<<: !include common-idf.yaml