ESPHome 2026.2.2
Loading...
Searching...
No Matches
voice_assistant.cpp
Go to the documentation of this file.
1#include "voice_assistant.h"
3
4#ifdef USE_VOICE_ASSISTANT
5
7#include "esphome/core/log.h"
8
9#include <cinttypes>
10#include <cstdio>
11
12namespace esphome {
13namespace voice_assistant {
14
// Log tag passed as the first argument to the ESP_LOG* macros in this file.
15static const char *const TAG = "voice_assistant";
16
17#ifdef SAMPLE_RATE_HZ
18#undef SAMPLE_RATE_HZ
19#endif
20
// Audio sample rate, in samples per second, that every buffer size below is derived from.
static constexpr size_t SAMPLE_RATE_HZ = 16000;

// Ring buffer: 512 ms of audio (512 ms * 16 kHz / 1000 ms), one int16_t per sample.
static constexpr size_t RING_BUFFER_SAMPLES = 512 * SAMPLE_RATE_HZ / 1000;
static constexpr size_t RING_BUFFER_SIZE = RING_BUFFER_SAMPLES * sizeof(int16_t);

// Send buffer: 32 ms of audio (32 ms * 16 kHz / 1000 ms), one int16_t per sample.
static constexpr size_t SEND_BUFFER_SAMPLES = 32 * SAMPLE_RATE_HZ / 1000;
static constexpr size_t SEND_BUFFER_SIZE = SEND_BUFFER_SAMPLES * sizeof(int16_t);

// Bytes requested per socket read; the speaker buffer is sized to 16 such reads.
static constexpr size_t RECEIVE_SIZE = 1024;
static constexpr size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
29
31
33 this->mic_source_->add_data_callback([this](const std::vector<uint8_t> &data) {
34 std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_;
35 if (this->ring_buffer_.use_count() > 1) {
36 temp_ring_buffer->write((void *) data.data(), data.size());
37 }
38 });
39
40#ifdef USE_MEDIA_PLAYER
41 if (this->media_player_ != nullptr) {
42 this->media_player_->add_on_state_callback([this]() {
43 switch (this->media_player_->state) {
46 // State changed to announcing after receiving the url
48 }
49 break;
50 default:
52 // No longer announcing the TTS response
54 }
55 break;
56 }
57 });
58 }
59#endif
60}
61
63
65 this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
66 if (this->socket_ == nullptr) {
67 ESP_LOGE(TAG, "Could not create socket");
68 this->mark_failed();
69 return false;
70 }
71 int enable = 1;
72 int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
73 if (err != 0) {
74 ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err);
75 // we can still continue
76 }
77 err = this->socket_->setblocking(false);
78 if (err != 0) {
79 ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", err);
80 this->mark_failed();
81 return false;
82 }
83
84#ifdef USE_SPEAKER
85 if (this->speaker_ != nullptr) {
86 struct sockaddr_storage server;
87
88 socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055);
89 if (sl == 0) {
90 ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno);
91 this->mark_failed();
92 return false;
93 }
94
95 err = this->socket_->bind((struct sockaddr *) &server, sizeof(server));
96 if (err != 0) {
97 ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno);
98 this->mark_failed();
99 return false;
100 }
101 }
102#endif
103 this->udp_socket_running_ = true;
104 return true;
105}
106
108#ifdef USE_SPEAKER
109 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ == nullptr)) {
110 RAMAllocator<uint8_t> speaker_allocator;
111 this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
112 if (this->speaker_buffer_ == nullptr) {
113 ESP_LOGW(TAG, "Could not allocate speaker buffer");
114 return false;
115 }
116 }
117#endif
118
119 if (this->ring_buffer_.use_count() == 0) {
120 this->ring_buffer_ = RingBuffer::create(RING_BUFFER_SIZE);
121 if (this->ring_buffer_.use_count() == 0) {
122 ESP_LOGE(TAG, "Could not allocate ring buffer");
123 return false;
124 }
125 }
126
127 if (this->send_buffer_ == nullptr) {
128 RAMAllocator<uint8_t> send_allocator;
129 this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE);
130 if (send_buffer_ == nullptr) {
131 ESP_LOGW(TAG, "Could not allocate send buffer");
132 return false;
133 }
134 }
135
136 return true;
137}
138
140 if (this->send_buffer_ != nullptr) {
141 memset(this->send_buffer_, 0, SEND_BUFFER_SIZE);
142 }
143
144 if (this->ring_buffer_ != nullptr) {
145 this->ring_buffer_->reset();
146 }
147
148#ifdef USE_SPEAKER
149 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
150 memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE);
151
152 this->speaker_buffer_size_ = 0;
153 this->speaker_buffer_index_ = 0;
154 this->speaker_bytes_received_ = 0;
155 }
156#endif
157}
158
160 if (this->send_buffer_ != nullptr) {
161 RAMAllocator<uint8_t> send_deallocator;
162 send_deallocator.deallocate(this->send_buffer_, SEND_BUFFER_SIZE);
163 this->send_buffer_ = nullptr;
164 }
165
166 if (this->ring_buffer_.use_count() > 0) {
167 this->ring_buffer_.reset();
168 }
169
170#ifdef USE_SPEAKER
171 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
172 RAMAllocator<uint8_t> speaker_deallocator;
173 speaker_deallocator.deallocate(this->speaker_buffer_, SPEAKER_BUFFER_SIZE);
174 this->speaker_buffer_ = nullptr;
175 }
176#endif
177}
178
180 this->conversation_id_ = "";
181 ESP_LOGD(TAG, "reset conversation ID");
182}
183
185 if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
187 if (this->mic_source_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
189 } else {
191 }
192 this->continuous_ = false;
193 this->signal_stop_();
194 this->clear_buffers_();
195 return;
196 }
197 switch (this->state_) {
198 case State::IDLE: {
199 if (this->continuous_ && this->desired_state_ == State::IDLE) {
200 this->idle_trigger_.trigger();
202 } else {
203 this->deallocate_buffers_();
204 }
205 break;
206 }
208 ESP_LOGD(TAG, "Starting Microphone");
209 if (!this->allocate_buffers_()) {
210 this->status_set_error(LOG_STR("Failed to allocate buffers"));
211 return;
212 }
213 if (this->status_has_error()) {
214 this->status_clear_error();
215 }
216 this->clear_buffers_();
217
218 this->mic_source_->start();
220 break;
221 }
223 if (this->mic_source_->is_running()) {
224 this->set_state_(this->desired_state_);
225 }
226 break;
227 }
229 ESP_LOGD(TAG, "Requesting start");
230 uint32_t flags = 0;
231 if (!this->continue_conversation_ && this->use_wake_word_)
233 if (this->silence_detection_)
237 audio_settings.auto_gain = this->auto_gain_;
238 audio_settings.volume_multiplier = this->volume_multiplier_;
239
241 msg.start = true;
243 msg.flags = flags;
244 msg.audio_settings = audio_settings;
246
247 // Reset media player state tracking
248#ifdef USE_MEDIA_PLAYER
249 if (this->media_player_ != nullptr) {
251 }
252#endif
253
254 if (this->api_client_ == nullptr ||
256 ESP_LOGW(TAG, "Could not request start");
257 this->error_trigger_.trigger("not-connected", "Could not request start");
258 this->continuous_ = false;
260 break;
261 }
263 this->set_timeout("reset-conversation_id", this->conversation_timeout_,
264 [this]() { this->reset_conversation_id(); });
265 break;
266 }
268 break; // State changed when udp server port received
269 }
271 size_t available = this->ring_buffer_->available();
272 while (available >= SEND_BUFFER_SIZE) {
273 size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
274 if (this->audio_mode_ == AUDIO_MODE_API) {
276 msg.data = this->send_buffer_;
277 msg.data_len = read_bytes;
279 } else {
280 if (!this->udp_socket_running_) {
281 if (!this->start_udp_socket_()) {
283 break;
284 }
285 }
286 this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
287 sizeof(this->dest_addr_));
288 }
289 available = this->ring_buffer_->available();
290 }
291
292 break;
293 }
295 if (this->mic_source_->is_running()) {
296 this->mic_source_->stop();
298 } else {
299 this->set_state_(this->desired_state_);
300 }
301 break;
302 }
304 if (this->mic_source_->is_stopped()) {
305 this->set_state_(this->desired_state_);
306 }
307 break;
308 }
310 break; // State changed by events
311 }
313 bool playing = false;
314#ifdef USE_SPEAKER
315 if (this->speaker_ != nullptr) {
316 ssize_t received_len = 0;
317 if (this->audio_mode_ == AUDIO_MODE_UDP) {
318 if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
319 received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
320 if (received_len > 0) {
321 this->speaker_buffer_index_ += received_len;
322 this->speaker_buffer_size_ += received_len;
323 this->speaker_bytes_received_ += received_len;
324 }
325 } else {
326 ESP_LOGD(TAG, "Receive buffer full");
327 }
328 }
329 // Build a small buffer of audio before sending to the speaker
330 bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0);
331 if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream)
332 this->write_speaker_();
333 if (this->wait_for_stream_end_) {
334 this->cancel_timeout("playing");
335 if (end_of_stream) {
336 ESP_LOGD(TAG, "End of audio stream received");
337 this->cancel_timeout("speaker-timeout");
339 }
340 break; // We dont want to timeout here as the STREAM_END event will take care of that.
341 }
342 playing = this->speaker_->is_running();
343 }
344#endif
345#ifdef USE_MEDIA_PLAYER
346 if (this->media_player_ != nullptr) {
348
351 this->cancel_timeout("playing");
352 ESP_LOGD(TAG, "Announcement finished playing");
354
356 msg.success = true;
358 break;
359 }
360 }
361#endif
362 if (playing) {
364 }
365 break;
366 }
368#ifdef USE_SPEAKER
369 if (this->speaker_ != nullptr) {
370 if (this->speaker_buffer_size_ > 0) {
371 this->write_speaker_();
372 break;
373 }
374 if (this->speaker_->has_buffered_data() || this->speaker_->is_running()) {
375 break;
376 }
377 ESP_LOGD(TAG, "Speaker has finished outputting all audio");
378 this->speaker_->stop();
379 this->cancel_timeout("speaker-timeout");
380 this->cancel_timeout("playing");
381
382 this->clear_buffers_();
383
384 this->wait_for_stream_end_ = false;
385 this->stream_ended_ = false;
386
388 }
389#endif
390 if (this->continue_conversation_) {
392 } else {
394 }
395 break;
396 }
397 default:
398 break;
399 }
400}
401
402#ifdef USE_SPEAKER
404 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
405 if (this->speaker_buffer_size_ > 0) {
406 size_t write_chunk = std::min<size_t>(this->speaker_buffer_size_, 4 * 1024);
407 size_t written = this->speaker_->play(this->speaker_buffer_, write_chunk);
408 if (written > 0) {
409 memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written);
412 this->set_timeout("speaker-timeout", 5000, [this]() { this->speaker_->stop(); });
413 } else {
414 ESP_LOGV(TAG, "Speaker buffer full, trying again next loop");
415 }
416 }
417 }
418}
419#endif
420
422 if (!subscribe) {
423 if (this->api_client_ == nullptr || client != this->api_client_) {
424 ESP_LOGE(TAG, "Client attempting to unsubscribe that is not the current API Client");
425 return;
426 }
427 this->api_client_ = nullptr;
429 return;
430 }
431
432 if (this->api_client_ != nullptr) {
433 char current_peername[socket::SOCKADDR_STR_LEN];
434 char new_peername[socket::SOCKADDR_STR_LEN];
435 ESP_LOGE(TAG,
436 "Multiple API Clients attempting to connect to Voice Assistant\n"
437 "Current client: %s (%s)\n"
438 "New client: %s (%s)",
439 this->api_client_->get_name(), this->api_client_->get_peername_to(current_peername), client->get_name(),
440 client->get_peername_to(new_peername));
441 return;
442 }
443
444 this->api_client_ = client;
446}
447
// Maps a State value to a LogString holding its name, for use in log output.
// Any value that matches no case falls through to the default and yields "UNKNOWN".
// NOTE(review): only the IDLE case label is visible in this listing; the labels that
// precede the remaining return statements are not shown here — confirm against upstream.
448static const LogString *voice_assistant_state_to_string(State state) {
449 switch (state) {
450 case State::IDLE:
451 return LOG_STR("IDLE");
453 return LOG_STR("START_MICROPHONE");
455 return LOG_STR("STARTING_MICROPHONE");
457 return LOG_STR("WAIT_FOR_VAD");
459 return LOG_STR("WAITING_FOR_VAD");
461 return LOG_STR("START_PIPELINE");
463 return LOG_STR("STARTING_PIPELINE");
465 return LOG_STR("STREAMING_MICROPHONE");
467 return LOG_STR("STOP_MICROPHONE");
469 return LOG_STR("STOPPING_MICROPHONE");
471 return LOG_STR("AWAITING_RESPONSE");
473 return LOG_STR("STREAMING_RESPONSE");
475 return LOG_STR("RESPONSE_FINISHED");
476 default:
477 return LOG_STR("UNKNOWN");
478 }
479};
480
482 State old_state = this->state_;
483 this->state_ = state;
484 ESP_LOGD(TAG, "State changed from %s to %s", LOG_STR_ARG(voice_assistant_state_to_string(old_state)),
485 LOG_STR_ARG(voice_assistant_state_to_string(state)));
486}
487
489 this->set_state_(state);
490 this->desired_state_ = desired_state;
491 ESP_LOGD(TAG, "Desired state set to %s", LOG_STR_ARG(voice_assistant_state_to_string(desired_state)));
492}
493
495 ESP_LOGE(TAG, "Failed to start server. See Home Assistant logs for more details.");
496 this->error_trigger_.trigger("failed-to-start", "Failed to start server. See Home Assistant logs for more details.");
498}
499
501 if (this->state_ != State::STARTING_PIPELINE) {
502 this->signal_stop_();
503 return;
504 }
505
506 ESP_LOGD(TAG, "Client started, streaming microphone");
508
509 if (this->mic_source_->is_running()) {
511 } else {
513 }
514}
515
// Handles the client's request to start streaming to a given address and port.
//
// addr: address copied into dest_addr_ (sizeof(dest_addr_) bytes are copied).
// port: port written into the copied address, converted with htons().
//
// If the component is not in STARTING_PIPELINE, a stop is signalled and nothing else happens.
516void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) {
517 if (this->state_ != State::STARTING_PIPELINE) {
518 this->signal_stop_();
519 return;
520 }
521
522 ESP_LOGD(TAG, "Client started, streaming microphone");
524
// Take a private copy of the address, then patch the port field that matches its family.
525 memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
526 if (this->dest_addr_.ss_family == AF_INET) {
527 ((struct sockaddr_in *) &this->dest_addr_)->sin_port = htons(port);
528 }
// The AF_INET6 branch is only compiled when LWIP_IPV6 is enabled.
529#if LWIP_IPV6
530 else if (this->dest_addr_.ss_family == AF_INET6) {
531 ((struct sockaddr_in6 *) &this->dest_addr_)->sin6_port = htons(port);
532 }
533#endif
534 else {
// Unrecognised family: warn and return early, skipping the microphone check below.
535 ESP_LOGW(TAG, "Unknown address family: %d", this->dest_addr_.ss_family);
536 return;
537 }
538
// NOTE(review): the statements inside both branches below are not visible in this
// listing — confirm against the upstream file.
539 if (this->mic_source_->is_running()) {
541 } else {
543 }
544}
545
// Requests the start of a voice assistant run.
//
// continuous:        stored in continuous_ when the request is accepted.
// silence_detection: stored in silence_detection_ when the request is accepted.
//
// Without a connected API client the request is rejected: an error is logged and
// continuous_ is cleared. The flags are only stored while the component is in State::IDLE;
// in any other state the visible code does nothing.
546void VoiceAssistant::request_start(bool continuous, bool silence_detection) {
547 if (this->api_client_ == nullptr) {
548 ESP_LOGE(TAG, "No API client connected");
550 this->continuous_ = false;
551 return;
552 }
553 if (this->state_ == State::IDLE) {
554 this->continuous_ = continuous;
555 this->silence_detection_ = silence_detection;
556
// NOTE(review): a statement following the two assignments above is not visible in this
// listing — confirm against the upstream file.
558 }
559}
560
562 this->continuous_ = false;
563 this->continue_conversation_ = false;
564
565 switch (this->state_) {
566 case State::IDLE:
567 break;
574 break;
577 this->signal_stop_();
579 break;
583 break;
585 this->signal_stop_();
586 break;
588#ifdef USE_MEDIA_PLAYER
589 // Stop any ongoing media player announcement
590 if (this->media_player_ != nullptr) {
591 this->media_player_->make_call()
593 .set_announcement(true)
594 .perform();
595 }
596 if (this->started_streaming_tts_) {
597 // Haven't reached the TTS_END stage, so send the stop signal to HA.
598 this->signal_stop_();
599 }
600#endif
601 break;
603 break; // Let the incoming audio stream finish then it will go to idle.
604 }
605}
606
608 memset(&this->dest_addr_, 0, sizeof(this->dest_addr_));
609 if (this->api_client_ == nullptr) {
610 return;
611 }
612 ESP_LOGD(TAG, "Signaling stop");
614 msg.start = false;
616}
617
619 this->set_timeout("playing", 2000, [this]() {
620 this->cancel_timeout("speaker-timeout");
622
624 msg.success = true;
626 });
627}
628
630 ESP_LOGD(TAG, "Event Type: %" PRId32, msg.event_type);
631 switch (msg.event_type) {
633 ESP_LOGD(TAG, "Assist Pipeline running");
634#ifdef USE_MEDIA_PLAYER
635 this->started_streaming_tts_ = false;
636 for (const auto &arg : msg.data) {
637 if (arg.name == "url") {
638 this->tts_response_url_ = arg.value;
639 }
640 }
641#endif
642 this->defer([this]() { this->start_trigger_.trigger(); });
643 break;
645 break;
647 ESP_LOGD(TAG, "Wake word detected");
648 this->defer([this]() { this->wake_word_detected_trigger_.trigger(); });
649 break;
650 }
652 ESP_LOGD(TAG, "STT started");
653 this->defer([this]() { this->listening_trigger_.trigger(); });
654 break;
656 std::string text;
657 for (const auto &arg : msg.data) {
658 if (arg.name == "text") {
659 text = arg.value;
660 }
661 }
662 if (text.empty()) {
663 ESP_LOGW(TAG, "No text in STT_END event");
664 return;
665 } else if (text.length() > 500) {
666 text.resize(497);
667 text += "...";
668 }
669 ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str());
670 this->defer([this, text]() { this->stt_end_trigger_.trigger(text); });
671 break;
672 }
674 ESP_LOGD(TAG, "Intent started");
675 this->defer([this]() { this->intent_start_trigger_.trigger(); });
676 break;
678 ESP_LOGD(TAG, "Intent progress");
679 std::string tts_url_for_trigger = "";
680#ifdef USE_MEDIA_PLAYER
681 if (this->media_player_ != nullptr) {
682 for (const auto &arg : msg.data) {
683 if ((arg.name == "tts_start_streaming") && (arg.value == "1") && !this->tts_response_url_.empty()) {
685
687
688 this->started_streaming_tts_ = true;
690
691 tts_url_for_trigger = this->tts_response_url_;
692 this->tts_response_url_.clear(); // Reset streaming URL
694 }
695 }
696 }
697#endif
698 this->defer([this, tts_url_for_trigger]() { this->intent_progress_trigger_.trigger(tts_url_for_trigger); });
699 break;
700 }
702 for (const auto &arg : msg.data) {
703 if (arg.name == "conversation_id") {
704 this->conversation_id_ = arg.value;
705 } else if (arg.name == "continue_conversation") {
706 this->continue_conversation_ = (arg.value == "1");
707 }
708 }
709 this->defer([this]() { this->intent_end_trigger_.trigger(); });
710 break;
711 }
713 std::string text;
714 for (const auto &arg : msg.data) {
715 if (arg.name == "text") {
716 text = arg.value;
717 }
718 }
719 if (text.empty()) {
720 ESP_LOGW(TAG, "No text in TTS_START event");
721 return;
722 }
723 if (text.length() > 500) {
724 text.resize(497);
725 text += "...";
726 }
727 ESP_LOGD(TAG, "Response: \"%s\"", text.c_str());
728 this->defer([this, text]() {
729 this->tts_start_trigger_.trigger(text);
730#ifdef USE_SPEAKER
731 if (this->speaker_ != nullptr) {
732 this->speaker_->start();
733 }
734#endif
735 });
736 break;
737 }
739 std::string url;
740 for (const auto &arg : msg.data) {
741 if (arg.name == "url") {
742 url = arg.value;
743 }
744 }
745 if (url.empty()) {
746 ESP_LOGW(TAG, "No url in TTS_END event");
747 return;
748 }
749 ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str());
750 this->defer([this, url]() {
751#ifdef USE_MEDIA_PLAYER
752 if ((this->media_player_ != nullptr) && (!this->started_streaming_tts_)) {
754
756
758 }
759 this->started_streaming_tts_ = false; // Helps indicate reaching the TTS_END stage
760#endif
761 this->tts_end_trigger_.trigger(url);
762 });
764 if (new_state != this->state_) {
765 // Don't needlessly change the state. The intent progress stage may have already changed the state to streaming
766 // response.
767 this->set_state_(new_state, new_state);
768 }
769 break;
770 }
772 ESP_LOGD(TAG, "Assist Pipeline ended");
773 if ((this->state_ == State::START_PIPELINE) || (this->state_ == State::STARTING_PIPELINE) ||
775 // Microphone is running, stop it
777 } else if (this->state_ == State::AWAITING_RESPONSE) {
778 // No TTS start event ("nevermind")
780 }
781 this->defer([this]() { this->end_trigger_.trigger(); });
782 break;
783 }
785 std::string code = "";
786 std::string message = "";
787 for (const auto &arg : msg.data) {
788 if (arg.name == "code") {
789 code = arg.value;
790 } else if (arg.name == "message") {
791 message = arg.value;
792 }
793 }
794 if (code == "wake-word-timeout" || code == "wake_word_detection_aborted" || code == "no_wake_word") {
795 // Don't change state here since either the "tts-end" or "run-end" events will do it.
796 return;
797 } else if (code == "wake-provider-missing" || code == "wake-engine-missing") {
798 // Wake word is not set up or not ready on Home Assistant so stop and do not retry until user starts again.
799 this->defer([this, code, message]() {
800 this->request_stop();
801 this->error_trigger_.trigger(code, message);
802 });
803 return;
804 }
805 ESP_LOGE(TAG, "Error: %s - %s", code.c_str(), message.c_str());
806 if (this->state_ != State::IDLE) {
807 this->signal_stop_();
809 }
810 this->defer([this, code, message]() { this->error_trigger_.trigger(code, message); });
811 break;
812 }
814#ifdef USE_SPEAKER
815 if (this->speaker_ != nullptr) {
816 this->wait_for_stream_end_ = true;
817 ESP_LOGD(TAG, "TTS stream start");
818 this->defer([this] { this->tts_stream_start_trigger_.trigger(); });
819 }
820#endif
821 break;
822 }
824#ifdef USE_SPEAKER
825 if (this->speaker_ != nullptr) {
826 this->stream_ended_ = true;
827 ESP_LOGD(TAG, "TTS stream end");
828 }
829#endif
830 break;
831 }
833 ESP_LOGD(TAG, "Starting STT by VAD");
834 this->defer([this]() { this->stt_vad_start_trigger_.trigger(); });
835 break;
837 ESP_LOGD(TAG, "STT by VAD end");
839 this->defer([this]() { this->stt_vad_end_trigger_.trigger(); });
840 break;
841 default:
842 ESP_LOGD(TAG, "Unhandled event type: %" PRId32, msg.event_type);
843 break;
844 }
845}
846
848#ifdef USE_SPEAKER // We should never get to this function if there is no speaker anyway
849 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
850 if (this->speaker_buffer_index_ + msg.data_len < SPEAKER_BUFFER_SIZE) {
851 memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data, msg.data_len);
852 this->speaker_buffer_index_ += msg.data_len;
853 this->speaker_buffer_size_ += msg.data_len;
855 ESP_LOGV(TAG, "Received audio: %u bytes from API", msg.data_len);
856 } else {
857 ESP_LOGE(TAG, "Cannot receive audio, buffer is full");
858 }
859 }
860#endif
861}
862
864 // Find existing timer or add a new one
865 auto it = this->timers_.begin();
866 for (; it != this->timers_.end(); ++it) {
867 if (it->id == msg.timer_id)
868 break;
869 }
870 if (it == this->timers_.end()) {
871 this->timers_.push_back({});
872 it = this->timers_.end() - 1;
873 }
874 it->id = msg.timer_id;
875 it->name = msg.name;
876 it->total_seconds = msg.total_seconds;
877 it->seconds_left = msg.seconds_left;
878 it->is_active = msg.is_active;
879
880 char timer_buf[Timer::TO_STR_BUFFER_SIZE];
881 ESP_LOGD(TAG,
882 "Timer Event\n"
883 " Type: %" PRId32 "\n"
884 " %s",
885 msg.event_type, it->to_str(timer_buf));
886
887 switch (msg.event_type) {
889 this->timer_started_trigger_.trigger(*it);
890 break;
892 this->timer_updated_trigger_.trigger(*it);
893 break;
895 this->timer_cancelled_trigger_.trigger(*it);
896 this->timers_.erase(it);
897 break;
899 this->timer_finished_trigger_.trigger(*it);
900 this->timers_.erase(it);
901 break;
902 }
903
904 if (this->timers_.empty()) {
905 this->cancel_interval("timer-event");
906 this->timer_tick_running_ = false;
907 } else if (!this->timer_tick_running_) {
908 this->set_interval("timer-event", 1000, [this]() { this->timer_tick_(); });
909 this->timer_tick_running_ = true;
910 }
911}
912
914 for (auto &timer : this->timers_) {
915 if (timer.is_active && timer.seconds_left > 0) {
916 timer.seconds_left--;
917 }
918 }
919 this->timer_tick_trigger_.trigger(this->timers_);
920}
921
923#ifdef USE_MEDIA_PLAYER
924 if (this->media_player_ != nullptr) {
926
928
929 if (!msg.preannounce_media_id.empty()) {
931 }
932 // Enqueueing a URL with an empty playlist will still play the file immediately
933 this->media_player_->make_call()
936 .set_announcement(true)
937 .perform();
939
941
942 if (this->continuous_) {
944 } else {
946 }
947
949 this->end_trigger_.trigger();
950 }
951#endif
952}
953
954void VoiceAssistant::on_set_configuration(const std::vector<std::string> &active_wake_words) {
955#ifdef USE_MICRO_WAKE_WORD
956 if (this->micro_wake_word_) {
957 // Disable all wake words first
958 for (auto &model : this->micro_wake_word_->get_wake_words()) {
959 model->disable();
960 }
961
962 // Enable only active wake words
963 for (const auto &ww_id : active_wake_words) {
964 for (auto &model : this->micro_wake_word_->get_wake_words()) {
965 if (model->get_id() == ww_id) {
966 model->enable();
967 ESP_LOGD(TAG, "Enabled wake word: %s (id=%s)", model->get_wake_word().c_str(), model->get_id().c_str());
968 }
969 }
970 }
971 }
972#endif
973};
974
976 this->config_.available_wake_words.clear();
977 this->config_.active_wake_words.clear();
978
979#ifdef USE_MICRO_WAKE_WORD
980 if (this->micro_wake_word_) {
982
983 for (auto &model : this->micro_wake_word_->get_wake_words()) {
984 if (model->is_enabled()) {
985 this->config_.active_wake_words.push_back(model->get_id());
986 }
987
988 WakeWord wake_word;
989 wake_word.id = model->get_id();
990 wake_word.wake_word = model->get_wake_word();
991 for (const auto &lang : model->get_trained_languages()) {
992 wake_word.trained_languages.push_back(lang);
993 }
994 this->config_.available_wake_words.push_back(std::move(wake_word));
995 }
996 } else {
997#endif
998 // No microWakeWord
1000#ifdef USE_MICRO_WAKE_WORD
1001 }
1002#endif
1003
1004 return this->config_;
1005};
1006
// Global pointer to a VoiceAssistant instance; defined here with an initial value of nullptr.
1007VoiceAssistant *global_voice_assistant = nullptr; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
1008
1009} // namespace voice_assistant
1010} // namespace esphome
1011
1012#endif // USE_VOICE_ASSISTANT
virtual void mark_failed()
Mark this component as failed.
ESPDEPRECATED("Use const char* overload instead. Removed in 2026.7.0", "2026.1.0") void defer(const std voi defer)(const char *name, std::function< void()> &&f)
Defer a callback to the next loop() call.
Definition component.h:479
ESPDEPRECATED("Use const char* or uint32_t overload instead. Removed in 2026.7.0", "2026.1.0") void set_timeout(const std voi set_timeout)(const char *name, uint32_t timeout, std::function< void()> &&f)
Set a timeout function with a unique name.
Definition component.h:429
ESPDEPRECATED("Use const char* or uint32_t overload instead. Removed in 2026.7.0", "2026.1.0") void set_interval(const std voi set_interval)(const char *name, uint32_t interval, std::function< void()> &&f)
Set an interval function with a unique name.
Definition component.h:336
bool status_has_error() const
ESPDEPRECATED("Use const char* or uint32_t overload instead. Removed in 2026.7.0", "2026.1.0") bool cancel_timeout(const std boo cancel_timeout)(const char *name)
Cancel a timeout function.
Definition component.h:451
ESPDEPRECATED("Use const char* or uint32_t overload instead. Removed in 2026.7.0", "2026.1.0") bool cancel_interval(const std boo cancel_interval)(const char *name)
Cancel an interval function.
Definition component.h:358
An STL allocator that uses SPI or internal RAM.
Definition helpers.h:1647
void deallocate(T *p, size_t n)
Definition helpers.h:1705
T * allocate(size_t n)
Definition helpers.h:1667
static std::unique_ptr< RingBuffer > create(size_t len)
StringRef is a reference to a string owned by something else.
Definition string_ref.h:26
constexpr bool empty() const
Definition string_ref.h:76
void trigger(const Ts &...x)
Inform the parent automation that the event has triggered.
Definition automation.h:279
const char * get_peername_to(std::span< char, socket::SOCKADDR_STR_LEN > buf) const
Get peer name (IP address) into caller-provided buffer, returns buf for convenience.
const char * get_name() const
bool send_message(const ProtoMessage &msg, uint8_t message_type)
static constexpr uint8_t MESSAGE_TYPE
Definition api_pb2.h:2414
static constexpr uint8_t MESSAGE_TYPE
Definition api_pb2.h:2354
enums::VoiceAssistantEvent event_type
Definition api_pb2.h:2342
std::vector< VoiceAssistantEventData > data
Definition api_pb2.h:2343
static constexpr uint8_t MESSAGE_TYPE
Definition api_pb2.h:2290
VoiceAssistantAudioSettings audio_settings
Definition api_pb2.h:2298
enums::VoiceAssistantTimerEvent event_type
Definition api_pb2.h:2379
MediaPlayerCall & set_media_url(const std::string &url)
MediaPlayerCall & set_announcement(bool announce)
MediaPlayerCall & set_command(MediaPlayerCommand command)
void add_on_state_callback(std::function< void()> &&callback)
std::vector< WakeWordModel * > get_wake_words()
void add_data_callback(std::function< void(const std::vector< uint8_t > &)> &&data_callback)
virtual size_t play(const uint8_t *data, size_t length)=0
Plays the provided audio data.
bool is_running() const
Definition speaker.h:66
virtual bool has_buffered_data() const =0
virtual void start()=0
virtual void stop()=0
std::unique_ptr< socket::Socket > socket_
void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg)
void on_audio(const api::VoiceAssistantAudio &msg)
media_player::MediaPlayer * media_player_
void client_subscription(api::APIConnection *client, bool subscribe)
MediaPlayerResponseState media_player_response_state_
std::shared_ptr< RingBuffer > ring_buffer_
void on_event(const api::VoiceAssistantEventResponse &msg)
Trigger< std::string, std::string > error_trigger_
Trigger< const std::vector< Timer > & > timer_tick_trigger_
Trigger< std::string > intent_progress_trigger_
void on_announce(const api::VoiceAssistantAnnounceRequest &msg)
void request_start(bool continuous, bool silence_detection)
microphone::MicrophoneSource * mic_source_
micro_wake_word::MicroWakeWord * micro_wake_word_
void on_set_configuration(const std::vector< std::string > &active_wake_words)
const char * message
Definition component.cpp:38
uint16_t flags
bool state
Definition fan.h:2
uint32_t socklen_t
Definition headers.h:97
__int64 ssize_t
Definition httplib.h:178
@ VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD
Definition api_pb2.h:236
@ VOICE_ASSISTANT_REQUEST_USE_VAD
Definition api_pb2.h:235
@ VOICE_ASSISTANT_TIMER_UPDATED
Definition api_pb2.h:259
@ VOICE_ASSISTANT_TIMER_STARTED
Definition api_pb2.h:258
@ VOICE_ASSISTANT_TIMER_FINISHED
Definition api_pb2.h:261
@ VOICE_ASSISTANT_TIMER_CANCELLED
Definition api_pb2.h:260
@ VOICE_ASSISTANT_WAKE_WORD_START
Definition api_pb2.h:249
@ VOICE_ASSISTANT_TTS_STREAM_END
Definition api_pb2.h:254
@ VOICE_ASSISTANT_STT_VAD_START
Definition api_pb2.h:251
@ VOICE_ASSISTANT_INTENT_PROGRESS
Definition api_pb2.h:255
@ VOICE_ASSISTANT_TTS_STREAM_START
Definition api_pb2.h:253
@ VOICE_ASSISTANT_WAKE_WORD_END
Definition api_pb2.h:250
const float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
Definition component.cpp:92
std::unique_ptr< Socket > socket(int domain, int type, int protocol)
Create a socket of the given domain, type and protocol.
socklen_t set_sockaddr_any(struct sockaddr *addr, socklen_t addrlen, uint16_t port)
Set a sockaddr to the any address and specified port for the IP version used by socket_ip().
Definition socket.cpp:159
VoiceAssistant * global_voice_assistant
Providing packet encoding functions for exchanging data with a remote host.
Definition a01nyub.cpp:7
int written
Definition helpers.h:736
std::vector< WakeWord > available_wake_words
std::vector< std::string > active_wake_words
static constexpr size_t TO_STR_BUFFER_SIZE
Buffer size for to_str() - sufficient for typical timer names.
std::vector< std::string > trained_languages
sa_family_t ss_family
Definition headers.h:92