ESPHome 2025.6.3
Loading...
Searching...
No Matches
voice_assistant.cpp
Go to the documentation of this file.
1#include "voice_assistant.h"
3
4#ifdef USE_VOICE_ASSISTANT
5
6#include "esphome/core/log.h"
7
8#include <cinttypes>
9#include <cstdio>
10
11namespace esphome {
12namespace voice_assistant {
13
14static const char *const TAG = "voice_assistant";
15
// Remove any SAMPLE_RATE_HZ macro that is already defined so the name can be
// declared as a typed constant below without being macro-expanded.
16#ifdef SAMPLE_RATE_HZ
17#undef SAMPLE_RATE_HZ
18#endif
19
// Sample rate, in Hz, from which the sample counts below are derived.
20static const size_t SAMPLE_RATE_HZ = 16000;
21
// Ring buffer capacity: 512 ms of audio = 8192 samples; each sample is counted
// as one int16_t, giving 16384 bytes. RING_BUFFER_SIZE is the length passed to
// RingBuffer::create().
22static const size_t RING_BUFFER_SAMPLES = 512 * SAMPLE_RATE_HZ / 1000; // 512 ms * 16 kHz/ 1000 ms
23static const size_t RING_BUFFER_SIZE = RING_BUFFER_SAMPLES * sizeof(int16_t);
// Send chunk: 32 ms of audio = 512 samples = 1024 bytes. The streaming loop only
// reads from the ring buffer once at least SEND_BUFFER_SIZE bytes are available.
24static const size_t SEND_BUFFER_SAMPLES = 32 * SAMPLE_RATE_HZ / 1000; // 32ms * 16kHz / 1000ms
25static const size_t SEND_BUFFER_SIZE = SEND_BUFFER_SAMPLES * sizeof(int16_t);
// Maximum number of bytes requested from the socket per read while receiving response audio.
26static const size_t RECEIVE_SIZE = 1024;
// Size of the speaker buffer allocation: 16 * 1024 = 16384 (passed to the allocator as the element count).
27static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
28
30
32 this->mic_source_->add_data_callback([this](const std::vector<uint8_t> &data) {
33 std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_;
34 if (this->ring_buffer_.use_count() > 1) {
35 temp_ring_buffer->write((void *) data.data(), data.size());
36 }
37 });
38}
39
41
43 this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
44 if (this->socket_ == nullptr) {
45 ESP_LOGE(TAG, "Could not create socket");
46 this->mark_failed();
47 return false;
48 }
49 int enable = 1;
50 int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
51 if (err != 0) {
52 ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err);
53 // we can still continue
54 }
55 err = this->socket_->setblocking(false);
56 if (err != 0) {
57 ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", err);
58 this->mark_failed();
59 return false;
60 }
61
62#ifdef USE_SPEAKER
63 if (this->speaker_ != nullptr) {
64 struct sockaddr_storage server;
65
66 socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055);
67 if (sl == 0) {
68 ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno);
69 this->mark_failed();
70 return false;
71 }
72
73 err = this->socket_->bind((struct sockaddr *) &server, sizeof(server));
74 if (err != 0) {
75 ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno);
76 this->mark_failed();
77 return false;
78 }
79 }
80#endif
81 this->udp_socket_running_ = true;
82 return true;
83}
84
86#ifdef USE_SPEAKER
87 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ == nullptr)) {
89 this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
90 if (this->speaker_buffer_ == nullptr) {
91 ESP_LOGW(TAG, "Could not allocate speaker buffer");
92 return false;
93 }
94 }
95#endif
96
97 if (this->ring_buffer_.use_count() == 0) {
98 this->ring_buffer_ = RingBuffer::create(RING_BUFFER_SIZE);
99 if (this->ring_buffer_.use_count() == 0) {
100 ESP_LOGE(TAG, "Could not allocate ring buffer");
101 return false;
102 }
103 }
104
105 if (this->send_buffer_ == nullptr) {
107 this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE);
108 if (send_buffer_ == nullptr) {
109 ESP_LOGW(TAG, "Could not allocate send buffer");
110 return false;
111 }
112 }
113
114 return true;
115}
116
118 if (this->send_buffer_ != nullptr) {
119 memset(this->send_buffer_, 0, SEND_BUFFER_SIZE);
120 }
121
122 if (this->ring_buffer_ != nullptr) {
123 this->ring_buffer_->reset();
124 }
125
126#ifdef USE_SPEAKER
127 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
128 memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE);
129
130 this->speaker_buffer_size_ = 0;
131 this->speaker_buffer_index_ = 0;
132 this->speaker_bytes_received_ = 0;
133 }
134#endif
135}
136
138 if (this->send_buffer_ != nullptr) {
140 send_deallocator.deallocate(this->send_buffer_, SEND_BUFFER_SIZE);
141 this->send_buffer_ = nullptr;
142 }
143
144 if (this->ring_buffer_.use_count() > 0) {
145 this->ring_buffer_.reset();
146 }
147
148#ifdef USE_SPEAKER
149 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
151 speaker_deallocator.deallocate(this->speaker_buffer_, SPEAKER_BUFFER_SIZE);
152 this->speaker_buffer_ = nullptr;
153 }
154#endif
155}
156
158 this->conversation_id_ = "";
159 ESP_LOGD(TAG, "reset conversation ID");
160}
161
163 if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
165 if (this->mic_source_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
167 } else {
169 }
170 this->continuous_ = false;
171 this->signal_stop_();
172 this->clear_buffers_();
173 return;
174 }
175 switch (this->state_) {
176 case State::IDLE: {
177 if (this->continuous_ && this->desired_state_ == State::IDLE) {
178 this->idle_trigger_->trigger();
180 } else {
181 this->deallocate_buffers_();
182 }
183 break;
184 }
186 ESP_LOGD(TAG, "Starting Microphone");
187 if (!this->allocate_buffers_()) {
188 this->status_set_error("Failed to allocate buffers");
189 return;
190 }
191 if (this->status_has_error()) {
192 this->status_clear_error();
193 }
194 this->clear_buffers_();
195
196 this->mic_source_->start();
198 break;
199 }
201 if (this->mic_source_->is_running()) {
202 this->set_state_(this->desired_state_);
203 }
204 break;
205 }
207 ESP_LOGD(TAG, "Requesting start");
208 uint32_t flags = 0;
209 if (!this->continue_conversation_ && this->use_wake_word_)
211 if (this->silence_detection_)
215 audio_settings.auto_gain = this->auto_gain_;
216 audio_settings.volume_multiplier = this->volume_multiplier_;
217
219 msg.start = true;
221 msg.flags = flags;
222 msg.audio_settings = audio_settings;
223 msg.wake_word_phrase = this->wake_word_;
224 this->wake_word_ = "";
225
226 if (this->api_client_ == nullptr || !this->api_client_->send_message(msg)) {
227 ESP_LOGW(TAG, "Could not request start");
228 this->error_trigger_->trigger("not-connected", "Could not request start");
229 this->continuous_ = false;
231 break;
232 }
234 this->set_timeout("reset-conversation_id", this->conversation_timeout_,
235 [this]() { this->reset_conversation_id(); });
236 break;
237 }
239 break; // State changed when udp server port received
240 }
242 size_t available = this->ring_buffer_->available();
243 while (available >= SEND_BUFFER_SIZE) {
244 size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
245 if (this->audio_mode_ == AUDIO_MODE_API) {
247 msg.data.assign((char *) this->send_buffer_, read_bytes);
248 this->api_client_->send_message(msg);
249 } else {
250 if (!this->udp_socket_running_) {
251 if (!this->start_udp_socket_()) {
253 break;
254 }
255 }
256 this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
257 sizeof(this->dest_addr_));
258 }
259 available = this->ring_buffer_->available();
260 }
261
262 break;
263 }
265 if (this->mic_source_->is_running()) {
266 this->mic_source_->stop();
268 } else {
269 this->set_state_(this->desired_state_);
270 }
271 break;
272 }
274 if (this->mic_source_->is_stopped()) {
275 this->set_state_(this->desired_state_);
276 }
277 break;
278 }
280 break; // State changed by events
281 }
283 bool playing = false;
284#ifdef USE_SPEAKER
285 if (this->speaker_ != nullptr) {
286 ssize_t received_len = 0;
287 if (this->audio_mode_ == AUDIO_MODE_UDP) {
288 if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
289 received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
290 if (received_len > 0) {
291 this->speaker_buffer_index_ += received_len;
292 this->speaker_buffer_size_ += received_len;
293 this->speaker_bytes_received_ += received_len;
294 }
295 } else {
296 ESP_LOGD(TAG, "Receive buffer full");
297 }
298 }
299 // Build a small buffer of audio before sending to the speaker
300 bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0);
301 if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream)
302 this->write_speaker_();
303 if (this->wait_for_stream_end_) {
304 this->cancel_timeout("playing");
305 if (end_of_stream) {
306 ESP_LOGD(TAG, "End of audio stream received");
307 this->cancel_timeout("speaker-timeout");
309 }
310 break; // We dont want to timeout here as the STREAM_END event will take care of that.
311 }
312 playing = this->speaker_->is_running();
313 }
314#endif
315#ifdef USE_MEDIA_PLAYER
316 if (this->media_player_ != nullptr) {
318
319 if (playing && this->media_player_wait_for_announcement_start_) {
320 // Announcement has started playing, wait for it to finish
323 }
324
325 if (!playing && this->media_player_wait_for_announcement_end_) {
326 // Announcement has finished playing
328 this->cancel_timeout("playing");
329 ESP_LOGD(TAG, "Announcement finished playing");
331
333 msg.success = true;
334 this->api_client_->send_message(msg);
335 break;
336 }
337 }
338#endif
339 if (playing) {
341 }
342 break;
343 }
345#ifdef USE_SPEAKER
346 if (this->speaker_ != nullptr) {
347 if (this->speaker_buffer_size_ > 0) {
348 this->write_speaker_();
349 break;
350 }
351 if (this->speaker_->has_buffered_data() || this->speaker_->is_running()) {
352 break;
353 }
354 ESP_LOGD(TAG, "Speaker has finished outputting all audio");
355 this->speaker_->stop();
356 this->cancel_timeout("speaker-timeout");
357 this->cancel_timeout("playing");
358
359 this->clear_buffers_();
360
361 this->wait_for_stream_end_ = false;
362 this->stream_ended_ = false;
363
365 }
366#endif
367 if (this->continue_conversation_) {
369 } else {
371 }
372 break;
373 }
374 default:
375 break;
376 }
377}
378
379#ifdef USE_SPEAKER
381 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
382 if (this->speaker_buffer_size_ > 0) {
383 size_t write_chunk = std::min<size_t>(this->speaker_buffer_size_, 4 * 1024);
384 size_t written = this->speaker_->play(this->speaker_buffer_, write_chunk);
385 if (written > 0) {
386 memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written);
387 this->speaker_buffer_size_ -= written;
388 this->speaker_buffer_index_ -= written;
389 this->set_timeout("speaker-timeout", 5000, [this]() { this->speaker_->stop(); });
390 } else {
391 ESP_LOGV(TAG, "Speaker buffer full, trying again next loop");
392 }
393 }
394 }
395}
396#endif
397
399 if (!subscribe) {
400 if (this->api_client_ == nullptr || client != this->api_client_) {
401 ESP_LOGE(TAG, "Client attempting to unsubscribe that is not the current API Client");
402 return;
403 }
404 this->api_client_ = nullptr;
406 return;
407 }
408
409 if (this->api_client_ != nullptr) {
410 ESP_LOGE(TAG, "Multiple API Clients attempting to connect to Voice Assistant");
411 ESP_LOGE(TAG, "Current client: %s", this->api_client_->get_client_combined_info().c_str());
412 ESP_LOGE(TAG, "New client: %s", client->get_client_combined_info().c_str());
413 return;
414 }
415
416 this->api_client_ = client;
418}
419
// Returns the log string naming a voice assistant State; used by the state-change
// log messages. Any value without a matching case yields "UNKNOWN".
//
// NOTE(review): in this listing every `case State::...:` label other than
// State::IDLE is absent, leaving consecutive `return` statements. The labels
// were presumably dropped when the page was extracted; restore them from the
// upstream file before compiling. Each label is expected to match the string
// returned directly beneath it - TODO confirm against upstream.
420static const LogString *voice_assistant_state_to_string(State state) {
421 switch (state) {
422 case State::IDLE:
423 return LOG_STR("IDLE");
425 return LOG_STR("START_MICROPHONE");
427 return LOG_STR("STARTING_MICROPHONE");
429 return LOG_STR("WAIT_FOR_VAD");
431 return LOG_STR("WAITING_FOR_VAD");
433 return LOG_STR("START_PIPELINE");
435 return LOG_STR("STARTING_PIPELINE");
437 return LOG_STR("STREAMING_MICROPHONE");
439 return LOG_STR("STOP_MICROPHONE");
441 return LOG_STR("STOPPING_MICROPHONE");
443 return LOG_STR("AWAITING_RESPONSE");
445 return LOG_STR("STREAMING_RESPONSE");
447 return LOG_STR("RESPONSE_FINISHED");
448 default:
449 return LOG_STR("UNKNOWN");
450 }
451};
452
454 State old_state = this->state_;
455 this->state_ = state;
456 ESP_LOGD(TAG, "State changed from %s to %s", LOG_STR_ARG(voice_assistant_state_to_string(old_state)),
457 LOG_STR_ARG(voice_assistant_state_to_string(state)));
458}
459
461 this->set_state_(state);
462 this->desired_state_ = desired_state;
463 ESP_LOGD(TAG, "Desired state set to %s", LOG_STR_ARG(voice_assistant_state_to_string(desired_state)));
464}
465
467 ESP_LOGE(TAG, "Failed to start server. See Home Assistant logs for more details.");
468 this->error_trigger_->trigger("failed-to-start", "Failed to start server. See Home Assistant logs for more details.");
470}
471
473 if (this->state_ != State::STARTING_PIPELINE) {
474 this->signal_stop_();
475 return;
476 }
477
478 ESP_LOGD(TAG, "Client started, streaming microphone");
480
481 if (this->mic_source_->is_running()) {
483 } else {
485 }
486}
487
// Records the destination that microphone audio is sent to over the datagram socket.
//
// addr: address to copy into dest_addr_; sizeof(dest_addr_) bytes are read from
//       it, so it must point to a complete sockaddr_storage.
// port: destination port in host byte order; converted with htons() before
//       being stored in the address.
//
// If the component is not in STARTING_PIPELINE the call is treated as
// unexpected: a stop is signalled and nothing else happens.
488void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) {
489 if (this->state_ != State::STARTING_PIPELINE) {
490 this->signal_stop_();
491 return;
492 }
493
494 ESP_LOGD(TAG, "Client started, streaming microphone");
496
// Copy the whole address first, then patch in the port for the address family in use.
497 memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
498 if (this->dest_addr_.ss_family == AF_INET) {
499 ((struct sockaddr_in *) &this->dest_addr_)->sin_port = htons(port);
500 }
// The IPv6 branch is only compiled when LWIP_IPV6 is enabled.
501#if LWIP_IPV6
502 else if (this->dest_addr_.ss_family == AF_INET6) {
503 ((struct sockaddr_in6 *) &this->dest_addr_)->sin6_port = htons(port);
504 }
505#endif
506 else {
// Unsupported family: only a warning is logged; no stop is signalled on this path.
507 ESP_LOGW(TAG, "Unknown address family: %d", this->dest_addr_.ss_family);
508 return;
509 }
510
// NOTE(review): both branches below are empty in this listing; the statements
// they contained were presumably dropped during extraction (as was the line
// after the log message above). Restore from upstream - TODO confirm.
511 if (this->mic_source_->is_running()) {
513 } else {
515 }
516}
517
// Requests the start of a voice assistant session.
//
// continuous:        stored in continuous_ when the request is accepted.
// silence_detection: stored in silence_detection_ when the request is accepted.
//
// The request is rejected with an error log (and continuous_ is cleared) when no
// API client is subscribed. It is ignored without any log when the component is
// not in State::IDLE.
518void VoiceAssistant::request_start(bool continuous, bool silence_detection) {
519 if (this->api_client_ == nullptr) {
520 ESP_LOGE(TAG, "No API client connected");
// NOTE(review): a statement between the log call and the assignment below is
// missing from this listing - TODO restore from upstream.
522 this->continuous_ = false;
523 return;
524 }
525 if (this->state_ == State::IDLE) {
526 this->continuous_ = continuous;
527 this->silence_detection_ = silence_detection;
528
// NOTE(review): the statement that followed here (presumably the state change
// that actually starts the session) is missing from this listing - TODO confirm.
530 }
531}
532
534 this->continuous_ = false;
535 this->continue_conversation_ = false;
536
537 switch (this->state_) {
538 case State::IDLE:
539 break;
546 break;
549 this->signal_stop_();
551 break;
555 break;
557 this->signal_stop_();
558 // Fallthrough intended to stop a streaming TTS announcement that has potentially started
560#ifdef USE_MEDIA_PLAYER
561 // Stop any ongoing media player announcement
562 if (this->media_player_ != nullptr) {
563 this->media_player_->make_call()
565 .set_announcement(true)
566 .perform();
567 }
568#endif
569 break;
571 break; // Let the incoming audio stream finish then it will go to idle.
572 }
573}
574
576 memset(&this->dest_addr_, 0, sizeof(this->dest_addr_));
577 if (this->api_client_ == nullptr) {
578 return;
579 }
580 ESP_LOGD(TAG, "Signaling stop");
582 msg.start = false;
583 this->api_client_->send_message(msg);
584}
585
587 this->set_timeout("playing", 2000, [this]() {
588 this->cancel_timeout("speaker-timeout");
590
592 msg.success = true;
593 this->api_client_->send_message(msg);
594 });
595}
596
598 ESP_LOGD(TAG, "Event Type: %" PRId32, msg.event_type);
599 switch (msg.event_type) {
601 ESP_LOGD(TAG, "Assist Pipeline running");
602#ifdef USE_MEDIA_PLAYER
603 this->started_streaming_tts_ = false;
604 for (auto arg : msg.data) {
605 if (arg.name == "url") {
606 this->tts_response_url_ = std::move(arg.value);
607 }
608 }
609#endif
610 this->defer([this]() { this->start_trigger_->trigger(); });
611 break;
613 break;
615 ESP_LOGD(TAG, "Wake word detected");
616 this->defer([this]() { this->wake_word_detected_trigger_->trigger(); });
617 break;
618 }
620 ESP_LOGD(TAG, "STT started");
621 this->defer([this]() { this->listening_trigger_->trigger(); });
622 break;
624 std::string text;
625 for (auto arg : msg.data) {
626 if (arg.name == "text") {
627 text = std::move(arg.value);
628 }
629 }
630 if (text.empty()) {
631 ESP_LOGW(TAG, "No text in STT_END event");
632 return;
633 } else if (text.length() > 500) {
634 text = text.substr(0, 497) + "...";
635 }
636 ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str());
637 this->defer([this, text]() { this->stt_end_trigger_->trigger(text); });
638 break;
639 }
641 ESP_LOGD(TAG, "Intent started");
642 this->defer([this]() { this->intent_start_trigger_->trigger(); });
643 break;
645 ESP_LOGD(TAG, "Intent progress");
646 std::string tts_url_for_trigger = "";
647#ifdef USE_MEDIA_PLAYER
648 if (this->media_player_ != nullptr) {
649 for (const auto &arg : msg.data) {
650 if ((arg.name == "tts_start_streaming") && (arg.value == "1") && !this->tts_response_url_.empty()) {
652
655 this->started_streaming_tts_ = true;
656 tts_url_for_trigger = this->tts_response_url_;
657 this->tts_response_url_.clear(); // Reset streaming URL
658 }
659 }
660 }
661#endif
662 this->defer([this, tts_url_for_trigger]() { this->intent_progress_trigger_->trigger(tts_url_for_trigger); });
663 break;
664 }
666 for (auto arg : msg.data) {
667 if (arg.name == "conversation_id") {
668 this->conversation_id_ = std::move(arg.value);
669 } else if (arg.name == "continue_conversation") {
670 this->continue_conversation_ = (arg.value == "1");
671 }
672 }
673 this->defer([this]() { this->intent_end_trigger_->trigger(); });
674 break;
675 }
677 std::string text;
678 for (auto arg : msg.data) {
679 if (arg.name == "text") {
680 text = std::move(arg.value);
681 }
682 }
683 if (text.empty()) {
684 ESP_LOGW(TAG, "No text in TTS_START event");
685 return;
686 }
687 if (text.length() > 500) {
688 text = text.substr(0, 497) + "...";
689 }
690 ESP_LOGD(TAG, "Response: \"%s\"", text.c_str());
691 this->defer([this, text]() {
692 this->tts_start_trigger_->trigger(text);
693#ifdef USE_SPEAKER
694 if (this->speaker_ != nullptr) {
695 this->speaker_->start();
696 }
697#endif
698 });
699 break;
700 }
702 std::string url;
703 for (auto arg : msg.data) {
704 if (arg.name == "url") {
705 url = std::move(arg.value);
706 }
707 }
708 if (url.empty()) {
709 ESP_LOGW(TAG, "No url in TTS_END event");
710 return;
711 }
712 ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str());
713 this->defer([this, url]() {
714#ifdef USE_MEDIA_PLAYER
715 if ((this->media_player_ != nullptr) && (!this->started_streaming_tts_)) {
717
720 // Start the playback timeout, as the media player state isn't immediately updated
722 }
723#endif
724 this->tts_end_trigger_->trigger(url);
725 });
727 this->set_state_(new_state, new_state);
728 break;
729 }
731 ESP_LOGD(TAG, "Assist Pipeline ended");
732 if ((this->state_ == State::START_PIPELINE) || (this->state_ == State::STARTING_PIPELINE) ||
734 // Microphone is running, stop it
736 } else if (this->state_ == State::AWAITING_RESPONSE) {
737 // No TTS start event ("nevermind")
739 }
740 this->defer([this]() { this->end_trigger_->trigger(); });
741 break;
742 }
744 std::string code = "";
745 std::string message = "";
746 for (auto arg : msg.data) {
747 if (arg.name == "code") {
748 code = std::move(arg.value);
749 } else if (arg.name == "message") {
750 message = std::move(arg.value);
751 }
752 }
753 if (code == "wake-word-timeout" || code == "wake_word_detection_aborted" || code == "no_wake_word") {
754 // Don't change state here since either the "tts-end" or "run-end" events will do it.
755 return;
756 } else if (code == "wake-provider-missing" || code == "wake-engine-missing") {
757 // Wake word is not set up or not ready on Home Assistant so stop and do not retry until user starts again.
758 this->defer([this, code, message]() {
759 this->request_stop();
760 this->error_trigger_->trigger(code, message);
761 });
762 return;
763 }
764 ESP_LOGE(TAG, "Error: %s - %s", code.c_str(), message.c_str());
765 if (this->state_ != State::IDLE) {
766 this->signal_stop_();
768 }
769 this->defer([this, code, message]() { this->error_trigger_->trigger(code, message); });
770 break;
771 }
773#ifdef USE_SPEAKER
774 if (this->speaker_ != nullptr) {
775 this->wait_for_stream_end_ = true;
776 ESP_LOGD(TAG, "TTS stream start");
777 this->defer([this] { this->tts_stream_start_trigger_->trigger(); });
778 }
779#endif
780 break;
781 }
783#ifdef USE_SPEAKER
784 if (this->speaker_ != nullptr) {
785 this->stream_ended_ = true;
786 ESP_LOGD(TAG, "TTS stream end");
787 }
788#endif
789 break;
790 }
792 ESP_LOGD(TAG, "Starting STT by VAD");
793 this->defer([this]() { this->stt_vad_start_trigger_->trigger(); });
794 break;
796 ESP_LOGD(TAG, "STT by VAD end");
798 this->defer([this]() { this->stt_vad_end_trigger_->trigger(); });
799 break;
800 default:
801 ESP_LOGD(TAG, "Unhandled event type: %" PRId32, msg.event_type);
802 break;
803 }
804}
805
807#ifdef USE_SPEAKER // We should never get to this function if there is no speaker anyway
808 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
809 if (this->speaker_buffer_index_ + msg.data.length() < SPEAKER_BUFFER_SIZE) {
810 memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), msg.data.length());
811 this->speaker_buffer_index_ += msg.data.length();
812 this->speaker_buffer_size_ += msg.data.length();
813 this->speaker_bytes_received_ += msg.data.length();
814 ESP_LOGV(TAG, "Received audio: %u bytes from API", msg.data.length());
815 } else {
816 ESP_LOGE(TAG, "Cannot receive audio, buffer is full");
817 }
818 }
819#endif
820}
821
823 Timer timer = {
824 .id = msg.timer_id,
825 .name = msg.name,
826 .total_seconds = msg.total_seconds,
827 .seconds_left = msg.seconds_left,
828 .is_active = msg.is_active,
829 };
830 this->timers_[timer.id] = timer;
831 ESP_LOGD(TAG, "Timer Event");
832 ESP_LOGD(TAG, " Type: %" PRId32, msg.event_type);
833 ESP_LOGD(TAG, " %s", timer.to_string().c_str());
834
835 switch (msg.event_type) {
837 this->timer_started_trigger_->trigger(timer);
838 break;
840 this->timer_updated_trigger_->trigger(timer);
841 break;
843 this->timer_cancelled_trigger_->trigger(timer);
844 this->timers_.erase(timer.id);
845 break;
847 this->timer_finished_trigger_->trigger(timer);
848 this->timers_.erase(timer.id);
849 break;
850 }
851
852 if (this->timers_.empty()) {
853 this->cancel_interval("timer-event");
854 this->timer_tick_running_ = false;
855 } else if (!this->timer_tick_running_) {
856 this->set_interval("timer-event", 1000, [this]() { this->timer_tick_(); });
857 this->timer_tick_running_ = true;
858 }
859}
860
862 std::vector<Timer> res;
863 res.reserve(this->timers_.size());
864 for (auto &pair : this->timers_) {
865 auto &timer = pair.second;
866 if (timer.is_active && timer.seconds_left > 0) {
867 timer.seconds_left--;
868 }
869 res.push_back(timer);
870 }
871 this->timer_tick_trigger_->trigger(res);
872}
873
875#ifdef USE_MEDIA_PLAYER
876 if (this->media_player_ != nullptr) {
877 this->tts_start_trigger_->trigger(msg.text);
878 if (!msg.preannounce_media_id.empty()) {
880 }
881 // Enqueueing a URL with an empty playlist will still play the file immediately
882 this->media_player_->make_call()
885 .set_announcement(true)
886 .perform();
888
891 // Start the playback timeout, as the media player state isn't immediately updated
893
894 if (this->continuous_) {
896 } else {
898 }
899
901 this->end_trigger_->trigger();
902 }
903#endif
904}
905
906void VoiceAssistant::on_set_configuration(const std::vector<std::string> &active_wake_words) {
907#ifdef USE_MICRO_WAKE_WORD
908 if (this->micro_wake_word_) {
909 // Disable all wake words first
910 for (auto &model : this->micro_wake_word_->get_wake_words()) {
911 model->disable();
912 }
913
914 // Enable only active wake words
915 for (auto ww_id : active_wake_words) {
916 for (auto &model : this->micro_wake_word_->get_wake_words()) {
917 if (model->get_id() == ww_id) {
918 model->enable();
919 ESP_LOGD(TAG, "Enabled wake word: %s (id=%s)", model->get_wake_word().c_str(), model->get_id().c_str());
920 }
921 }
922 }
923 }
924#endif
925};
926
928 this->config_.available_wake_words.clear();
929 this->config_.active_wake_words.clear();
930
931#ifdef USE_MICRO_WAKE_WORD
932 if (this->micro_wake_word_) {
934
935 for (auto &model : this->micro_wake_word_->get_wake_words()) {
936 if (model->is_enabled()) {
937 this->config_.active_wake_words.push_back(model->get_id());
938 }
939
940 WakeWord wake_word;
941 wake_word.id = model->get_id();
942 wake_word.wake_word = model->get_wake_word();
943 for (const auto &lang : model->get_trained_languages()) {
944 wake_word.trained_languages.push_back(lang);
945 }
946 this->config_.available_wake_words.push_back(std::move(wake_word));
947 }
948 } else {
949#endif
950 // No microWakeWord
952#ifdef USE_MICRO_WAKE_WORD
953 }
954#endif
955
956 return this->config_;
957};
958
// Global pointer to a VoiceAssistant instance. It starts out null; none of the
// code shown in this file assigns it.
959VoiceAssistant *global_voice_assistant = nullptr; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
960
961} // namespace voice_assistant
962} // namespace esphome
963
964#endif // USE_VOICE_ASSISTANT
virtual void mark_failed()
Mark this component as failed.
void set_interval(const std::string &name, uint32_t interval, std::function< void()> &&f)
Set an interval function with a unique name.
Definition component.cpp:58
bool cancel_timeout(const std::string &name)
Cancel a timeout function.
Definition component.cpp:79
bool status_has_error() const
bool cancel_interval(const std::string &name)
Cancel an interval function.
Definition component.cpp:62
void status_set_error(const char *message="unspecified")
void defer(const std::string &name, std::function< void()> &&f)
Defer a callback to the next loop() call.
void set_timeout(const std::string &name, uint32_t timeout, std::function< void()> &&f)
Set a timeout function with a unique name.
Definition component.cpp:75
An STL allocator that uses SPI or internal RAM.
Definition helpers.h:684
void deallocate(T *p, size_t n)
Definition helpers.h:742
T * allocate(size_t n)
Definition helpers.h:704
static std::unique_ptr< RingBuffer > create(size_t len)
void trigger(Ts... x)
Inform the parent automation that the event has triggered.
Definition automation.h:96
std::string get_client_combined_info() const
enums::VoiceAssistantEvent event_type
Definition api_pb2.h:2402
std::vector< VoiceAssistantEventData > data
Definition api_pb2.h:2403
VoiceAssistantAudioSettings audio_settings
Definition api_pb2.h:2352
enums::VoiceAssistantTimerEvent event_type
Definition api_pb2.h:2440
MediaPlayerCall & set_media_url(const std::string &url)
MediaPlayerCall & set_announcement(bool announce)
MediaPlayerCall & set_command(MediaPlayerCommand command)
std::vector< WakeWordModel * > get_wake_words()
void add_data_callback(std::function< void(const std::vector< uint8_t > &)> &&data_callback)
virtual size_t play(const uint8_t *data, size_t length)=0
Plays the provided audio data.
bool is_running() const
Definition speaker.h:66
virtual bool has_buffered_data() const =0
virtual void start()=0
virtual void stop()=0
std::unique_ptr< socket::Socket > socket_
std::unordered_map< std::string, Timer > timers_
void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg)
void on_audio(const api::VoiceAssistantAudio &msg)
media_player::MediaPlayer * media_player_
Trigger< std::string, std::string > * error_trigger_
void client_subscription(api::APIConnection *client, bool subscribe)
Trigger< std::vector< Timer > > * timer_tick_trigger_
std::shared_ptr< RingBuffer > ring_buffer_
void on_event(const api::VoiceAssistantEventResponse &msg)
Trigger< std::string > * tts_start_trigger_
void on_announce(const api::VoiceAssistantAnnounceRequest &msg)
void request_start(bool continuous, bool silence_detection)
Trigger< std::string > * intent_progress_trigger_
microphone::MicrophoneSource * mic_source_
micro_wake_word::MicroWakeWord * micro_wake_word_
void on_set_configuration(const std::vector< std::string > &active_wake_words)
bool state
Definition fan.h:0
uint32_t socklen_t
Definition headers.h:97
__int64 ssize_t
Definition httplib.h:175
@ VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD
Definition api_pb2.h:193
@ VOICE_ASSISTANT_REQUEST_USE_VAD
Definition api_pb2.h:192
@ VOICE_ASSISTANT_TIMER_UPDATED
Definition api_pb2.h:215
@ VOICE_ASSISTANT_TIMER_STARTED
Definition api_pb2.h:214
@ VOICE_ASSISTANT_TIMER_FINISHED
Definition api_pb2.h:217
@ VOICE_ASSISTANT_TIMER_CANCELLED
Definition api_pb2.h:216
@ VOICE_ASSISTANT_WAKE_WORD_START
Definition api_pb2.h:205
@ VOICE_ASSISTANT_TTS_STREAM_END
Definition api_pb2.h:210
@ VOICE_ASSISTANT_STT_VAD_START
Definition api_pb2.h:207
@ VOICE_ASSISTANT_INTENT_PROGRESS
Definition api_pb2.h:211
@ VOICE_ASSISTANT_TTS_STREAM_START
Definition api_pb2.h:209
@ VOICE_ASSISTANT_WAKE_WORD_END
Definition api_pb2.h:206
const float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
Definition component.cpp:28
std::unique_ptr< Socket > socket(int domain, int type, int protocol)
Create a socket of the given domain, type and protocol.
socklen_t set_sockaddr_any(struct sockaddr *addr, socklen_t addrlen, uint16_t port)
Set a sockaddr to the any address and specified port for the IP version used by socket_ip().
Definition socket.cpp:82
VoiceAssistant * global_voice_assistant
Providing packet encoding functions for exchanging data with a remote host.
Definition a01nyub.cpp:7
std::vector< WakeWord > available_wake_words
std::vector< std::string > active_wake_words
std::vector< std::string > trained_languages
sa_family_t ss_family
Definition headers.h:92