ESPHome 2026.5.1
Loading...
Searching...
No Matches
i2s_audio_speaker_standard.cpp
Go to the documentation of this file.
2
3#ifdef USE_ESP32
4
5#include <driver/i2s_std.h>
6
9
10#include "esphome/core/hal.h"
11#include "esphome/core/log.h"
12
13#include "esp_timer.h"
14
15namespace esphome::i2s_audio {
16
// Log tag passed as the first argument to every ESP_LOG* call in this file.
17static const char *const TAG = "i2s_audio.speaker.std";
18
// Duration of audio, in milliseconds, held by one DMA buffer. It is converted with ms_to_frames() to
// obtain both the channel's dma_frame_num and the number of frames the task writes per loop iteration.
19static constexpr uint32_t DMA_BUFFER_DURATION_MS = 15;
// Number of DMA buffers: used as the channel's dma_desc_num and as the number of silence preloads
// performed before the channel is enabled.
20static constexpr size_t DMA_BUFFERS_COUNT = 4;
21// Sized to comfortably absorb scheduling jitter: at most DMA_BUFFERS_COUNT events can be in flight,
22// doubled so that a transient backlog never overruns the queue (which would desync the lockstep
23// invariant between i2s_event_queue_ and write_records_queue_).
24static constexpr size_t I2S_EVENT_QUEUE_COUNT = DMA_BUFFERS_COUNT * 2;
25// Generous timeout for ``i2s_channel_write`` blocking. A buffer frees roughly every
26// DMA_BUFFER_DURATION_MS, so a multiple of that gives plenty of slack against scheduling jitter
27// without masking real failures.
28static constexpr TickType_t WRITE_TIMEOUT_TICKS = pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * (DMA_BUFFERS_COUNT + 1));
29
// NOTE(review): the function header (listing lines 30-31) is missing from this extract. The ESP_LOGCONFIG
// call suggests this is the tail of the component's dump_config() — confirm against the original file.
//
// Maps the configured communication format to a short label and logs it at config level.
32 const char *fmt_str;
33 switch (this->i2s_comm_fmt_) {
34 case I2SCommFmt::PCM:
35 fmt_str = "pcm";
36 break;
37 case I2SCommFmt::MSB:
38 fmt_str = "msb";
39 break;
40 default:
// Every value other than PCM and MSB is reported as "std".
41 fmt_str = "std";
42 break;
43 }
44 ESP_LOGCONFIG(TAG, " Communication format: %s", fmt_str);
45}
46
// NOTE(review): the function header (listing line 47) is missing from this extract. The in-body comments
// call this the speaker task, so this is presumably the task entry point — confirm the signature.
//
// Task body, in order:
//   1. Sets TASK_STARTING and derives ring buffer / DMA buffer sizes from current_stream_info_.
//   2. Allocates a zero-filled silence buffer and a ring-buffer-backed audio source.
//   3. Preloads every DMA buffer with silence, registers the on_sent callback, and enables the channel.
//   4. Runs the write loop until a stop command, a stream info change, the data timeout, or a
//      write/queue failure.
//   5. Sets TASK_STOPPING, releases the audio source and silence buffer, sets TASK_STOPPED, and then
//      delays forever.
48 xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::TASK_STARTING);
49
50 const uint32_t dma_buffers_duration_ms = DMA_BUFFER_DURATION_MS * DMA_BUFFERS_COUNT;
51 // Ensure ring buffer duration is at least the duration of all DMA buffers
52 const uint32_t ring_buffer_duration = std::max(dma_buffers_duration_ms, this->buffer_duration_ms_);
53
54 // The DMA buffers may have more bits per sample, so calculate buffer sizes based on the input audio stream info
55 const size_t bytes_per_frame = this->current_stream_info_.frames_to_bytes(1);
56 // Round the ring buffer size down to a multiple of bytes_per_frame so the wrap boundary stays frame-aligned and
57 // avoids unnecessary single-frame splices.
58 const size_t ring_buffer_size =
59 (this->current_stream_info_.ms_to_bytes(ring_buffer_duration) / bytes_per_frame) * bytes_per_frame;
60 const uint32_t frames_per_dma_buffer = this->current_stream_info_.ms_to_frames(DMA_BUFFER_DURATION_MS);
61 const size_t dma_buffer_bytes = this->current_stream_info_.frames_to_bytes(frames_per_dma_buffer);
62
// Stays false unless every setup step below succeeds; any later failure flips it back to false.
63 bool successful_setup = false;
64
65 std::unique_ptr<audio::RingBufferAudioSource> audio_source;
66
67 // Pre-zeroed buffer used to silence-pad each DMA descriptor whenever real audio doesn't fully fill it.
68 RAMAllocator<uint8_t> silence_allocator;
69 uint8_t *silence_buffer = silence_allocator.allocate(dma_buffer_bytes);
70
71 if (silence_buffer != nullptr) {
72 memset(silence_buffer, 0, dma_buffer_bytes);
73
74 std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = ring_buffer::RingBuffer::create(ring_buffer_size);
75 audio_source =
76 audio::RingBufferAudioSource::create(temp_ring_buffer, dma_buffer_bytes, static_cast<uint8_t>(bytes_per_frame));
77
78 if (audio_source != nullptr) {
79 // audio_source is nullptr if the ring buffer fails to allocate
80 this->audio_ring_buffer_ = temp_ring_buffer;
81 successful_setup = true;
82 }
83 }
84
85 if (successful_setup) {
86 // Preload every DMA descriptor with silence and push a matching zero-real-frames record per buffer.
87 // This guarantees that every on_sent event has a corresponding write record from the start, so
88 // ``i2s_event_queue_`` and ``write_records_queue_`` stay in lockstep for the entire task lifetime.
89 for (size_t i = 0; i < DMA_BUFFERS_COUNT; i++) {
90 size_t bytes_loaded = 0;
91 esp_err_t err = i2s_channel_preload_data(this->tx_handle_, silence_buffer, dma_buffer_bytes, &bytes_loaded);
// A short preload is treated the same as an error return: setup is abandoned.
92 if (err != ESP_OK || bytes_loaded != dma_buffer_bytes) {
93 ESP_LOGV(TAG, "Failed to preload silence into DMA buffer %u (err=%d, loaded=%u)", (unsigned) i, (int) err,
94 (unsigned) bytes_loaded);
95 successful_setup = false;
96 break;
97 }
98 uint32_t zero_real_frames = 0;
99 if (xQueueSend(this->write_records_queue_, &zero_real_frames, 0) != pdTRUE) {
100 // Should never happen: the queue was just reset and is sized for DMA_BUFFERS_COUNT * 2 entries.
101 ESP_LOGV(TAG, "Failed to push preload write record");
102 successful_setup = false;
103 break;
104 }
105 }
106 }
107
108 if (successful_setup) {
109 // Register the on_sent callback BEFORE enabling the channel so the very first transmitted buffer
110 // generates a queued event that pairs with the first preloaded silence record.
111 const i2s_event_callbacks_t callbacks = {.on_sent = i2s_on_sent_cb};
// NOTE(review): the return value of the callback registration is not checked.
112 i2s_channel_register_event_callback(this->tx_handle_, &callbacks, this);
113
114 if (i2s_channel_enable(this->tx_handle_) != ESP_OK) {
115 ESP_LOGV(TAG, "Failed to enable I2S channel");
116 successful_setup = false;
117 }
118 }
119
// NOTE(review): every setup failure (allocation, preload, record push, channel enable) is reported with
// the same ERR_ESP_NO_MEM bit, so the bit does not always mean an allocation failed.
120 if (!successful_setup) {
121 xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
122 } else {
123 bool stop_gracefully = false;
124 // Number of records currently in ``write_records_queue_`` that carry real audio. Used by graceful
125 // stop to wait until every real-audio buffer has been confirmed played by an ISR event.
126 uint32_t pending_real_buffers = 0;
127 uint32_t last_data_received_time = millis();
128
129 xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::TASK_RUNNING);
130
131 // Main speaker task loop. Continues while:
132 // - Paused, OR
133 // - No timeout configured, OR
134 // - Timeout hasn't elapsed since last data
135 //
136 // Always-fill model: every iteration writes exactly one DMA buffer's worth, mixing real audio
137 // and silence padding as needed. The blocking ``i2s_channel_write`` paces the loop at the DMA
138 // consumption rate, and every buffer write is matched 1:1 with a record on ``write_records_queue_``.
139 //
140 // While paused, the real-audio fill is skipped and the entire DMA buffer is filled with silence;
141 // the same blocking ``i2s_channel_write`` provides natural pacing (one buffer per ~DMA_BUFFER_DURATION_MS),
142 // so the lockstep invariant is preserved without burning CPU.
143 while (this->pause_state_ || !this->timeout_.has_value() ||
144 (millis() - last_data_received_time) <= this->timeout_.value()) {
145 uint32_t event_group_bits = xEventGroupGetBits(this->event_group_);
146
147 if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP) {
148 // COMMAND_STOP is set both by user-initiated stop() and by the ISR when it drops a completion
149 // event (paired with ERR_DROPPED_EVENT so loop() can distinguish the two cases).
150 xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::COMMAND_STOP);
151 ESP_LOGV(TAG, "Exiting: COMMAND_STOP received");
152 break;
153 }
// A graceful stop only latches a flag here; the actual exit happens further down once playback has drained.
154 if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY) {
155 xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY);
156 stop_gracefully = true;
157 }
158
159 if (this->audio_stream_info_ != this->current_stream_info_) {
160 // Audio stream info changed, stop the speaker task so it will restart with the proper settings.
161 ESP_LOGV(TAG, "Exiting: stream info changed");
162 break;
163 }
164
165 // Drain ISR-stamped completion events. Each event corresponds 1:1 with a write_records_queue_
166 // entry by construction (preloaded records at startup, plus exactly one record pushed per
167 // iteration alongside exactly one DMA-buffer-sized write).
168 int64_t write_timestamp;
169 bool lockstep_broken = false;
170 while (xQueueReceive(this->i2s_event_queue_, &write_timestamp, 0)) {
171 uint32_t real_frames = 0;
172 if (xQueueReceive(this->write_records_queue_, &real_frames, 0) != pdTRUE) {
173 // Should never happen: would indicate the lockstep invariant is broken.
174 ESP_LOGV(TAG, "Event without matching write record");
// NOTE(review): listing line 175 is missing from this extract (numbering jumps 174 -> 176); its content is unknown.
176 lockstep_broken = true;
177 break;
178 }
179 if (real_frames > 0) {
180 pending_real_buffers--;
181 // Real audio is packed at the start of each DMA buffer with any silence padding on the
182 // tail, so the real audio finished playing earlier than the buffer-completion timestamp
183 // by the duration of the trailing zeros.
184 const uint32_t silence_frames = frames_per_dma_buffer - real_frames;
185 const int64_t adjusted_ts =
186 write_timestamp - this->current_stream_info_.frames_to_microseconds(silence_frames);
187 this->audio_output_callback_(real_frames, adjusted_ts);
188 }
189 }
// The inner break only leaves the drain loop; this second break leaves the main task loop.
190 if (lockstep_broken) {
191 break;
192 }
193
194 // Graceful stop: exit only after the source's exposed chunk is drained, the underlying ring
195 // buffer has nothing left to hand over, and every real-audio buffer we submitted has been
196 // confirmed played. ``has_buffered_data()`` returns bytes still sitting in the ring buffer
197 // awaiting fill().
198 if (stop_gracefully && audio_source->available() == 0 && !this->has_buffered_data() &&
199 pending_real_buffers == 0) {
200 ESP_LOGV(TAG, "Exiting: graceful stop complete");
201 break;
202 }
203
204 // Compose exactly one DMA buffer's worth: drain as much real audio as the source currently
205 // exposes (may take multiple fill() calls when crossing a ring buffer wrap), then pad any
206 // remainder with silence. All writes pack into the next free DMA descriptor in order, so the
207 // descriptor ends up holding [real audio][silence padding].
208 size_t bytes_written_total = 0;
209 size_t real_bytes_total = 0;
210 bool partial_write_failure = false;
211
212 if (!this->pause_state_) {
213 while (bytes_written_total < dma_buffer_bytes) {
// NOTE(review): the first fill() argument is presumably a wait timeout in ticks (half a DMA buffer
// duration); the meaning of the second argument is not visible here — confirm against fill()'s declaration.
214 size_t bytes_read = audio_source->fill(pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS) / 2, false);
215 if (bytes_read > 0) {
// Address the last bytes_read bytes of the exposed chunk so volume scaling and the mono swap are
// applied only to the newly filled data, not to bytes processed on an earlier pass.
216 uint8_t *new_data = audio_source->mutable_data() + audio_source->available() - bytes_read;
217 this->apply_software_volume_(new_data, bytes_read);
218 this->swap_esp32_mono_samples_(new_data, bytes_read);
219 }
220
221 const size_t to_write = std::min(audio_source->available(), dma_buffer_bytes - bytes_written_total);
222 if (to_write == 0) {
223 // Ring buffer has nothing more to hand over right now; pad the rest of this DMA buffer
224 // with silence so the lockstep invariant (one write per iteration) is preserved.
225 break;
226 }
227
228 size_t bw = 0;
// The esp_err_t return value is not inspected; an error or timeout is detected through bw != to_write.
229 i2s_channel_write(this->tx_handle_, audio_source->data(), to_write, &bw, WRITE_TIMEOUT_TICKS);
230 if (bw != to_write) {
231 // A short real-audio write breaks DMA descriptor alignment for every subsequent event;
232 // the only safe recovery is to restart the task.
233 ESP_LOGV(TAG, "Partial real audio write: %u of %u bytes", (unsigned) bw, (unsigned) to_write);
// NOTE(review): listing line 234 is missing from this extract (numbering jumps 233 -> 235); its content is unknown.
235 partial_write_failure = true;
236 break;
237 }
238 audio_source->consume(bw);
239 bytes_written_total += bw;
240 real_bytes_total += bw;
241 }
// Only iterations that actually wrote real audio refresh the timeout reference used by the loop condition.
242 if (real_bytes_total > 0) {
243 last_data_received_time = millis();
244 }
245 }
246
247 if (partial_write_failure) {
248 break;
249 }
250
// Whatever the real audio did not fill (the whole buffer when paused) is topped up from the zeroed buffer.
251 const size_t silence_bytes = dma_buffer_bytes - bytes_written_total;
252 if (silence_bytes > 0) {
253 size_t bw = 0;
254 i2s_channel_write(this->tx_handle_, silence_buffer, silence_bytes, &bw, WRITE_TIMEOUT_TICKS);
255 if (bw != silence_bytes) {
256 // Same descriptor-alignment hazard as a partial real-audio write.
257 ESP_LOGV(TAG, "Partial silence write: %u of %u bytes", (unsigned) bw, (unsigned) silence_bytes);
// NOTE(review): listing line 258 is missing from this extract (numbering jumps 257 -> 259); its content is unknown.
259 break;
260 }
261 }
262
263 const uint32_t real_frames_in_buffer = this->current_stream_info_.bytes_to_frames(real_bytes_total);
264 // Push the matching write record. Capacity headroom in I2S_EVENT_QUEUE_COUNT guarantees this
265 // succeeds even with a transient backlog of unprocessed events; if it ever fails the lockstep
266 // invariant is broken and every subsequent timestamp would be silently wrong, so bail.
267 if (xQueueSend(this->write_records_queue_, &real_frames_in_buffer, 0) != pdTRUE) {
268 ESP_LOGV(TAG, "Exiting: write records queue full");
// NOTE(review): listing line 269 is missing from this extract (numbering jumps 268 -> 270); its content is unknown.
270 break;
271 }
272 if (real_frames_in_buffer > 0) {
273 pending_real_buffers++;
274 }
275 }
276 }
277
// Teardown runs on both the setup-failure path and the normal loop-exit path.
278 xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::TASK_STOPPING);
279
280 audio_source.reset();
281
// silence_buffer is nullptr when its allocation failed, in which case there is nothing to free.
282 if (silence_buffer != nullptr) {
283 silence_allocator.deallocate(silence_buffer, dma_buffer_bytes);
284 silence_buffer = nullptr;
285 }
286
287 xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::TASK_STOPPED);
288
289 while (true) {
290 // Continuously delay until the loop method deletes the task
291 vTaskDelay(pdMS_TO_TICKS(10));
292 }
293}
294
// NOTE(review): the function header (listing line 295) is missing from this extract. The reference list at
// the end of this listing gives the declaration as
// `esp_err_t start_i2s_driver(audio::AudioStreamInfo &audio_stream_info) override` — confirm.
//
// Validates the requested stream against the configured role and slot bit width, takes the parent bus
// lock, builds the channel / clock / slot / GPIO configuration, and hands it to init_i2s_channel_().
//
// Returns:
//   ESP_ERR_NOT_SUPPORTED  - slave role with a mismatched sample rate, or stream bits per sample
//                            larger than the configured slot bit width
//   ESP_ERR_INVALID_STATE  - the parent bus lock could not be taken
//   init_i2s_channel_() error code when channel initialisation fails
//   ESP_OK                 - otherwise; the channel is left for the speaker task to enable
// Stored before the validation checks, so current_stream_info_ is updated even on the early error returns.
296 this->current_stream_info_ = audio_stream_info;
297
298 if ((this->i2s_role_ & I2S_ROLE_SLAVE) && (this->sample_rate_ != audio_stream_info.get_sample_rate())) { // NOLINT
299 // Can't reconfigure I2S bus, so the sample rate must match the configured value
300 ESP_LOGE(TAG, "Incompatible stream settings");
301 return ESP_ERR_NOT_SUPPORTED;
302 }
303
304 if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_AUTO &&
305 (i2s_slot_bit_width_t) audio_stream_info.get_bits_per_sample() > this->slot_bit_width_) {
306 // Currently can't handle the case when the incoming audio has more bits per sample than the configured value
307 ESP_LOGE(TAG, "Stream bits per sample must be less than or equal to the speaker's configuration");
308 return ESP_ERR_NOT_SUPPORTED;
309 }
310
// NOTE(review): no unlock of the parent bus appears in this function, including on the
// init_i2s_channel_() failure path — confirm where the lock is released.
311 if (!this->parent_->try_lock()) {
312 ESP_LOGE(TAG, "Parent bus is busy");
313 return ESP_ERR_INVALID_STATE;
314 }
315
// Frames per DMA buffer; computed with the same ms_to_frames(DMA_BUFFER_DURATION_MS) conversion the
// speaker task uses for its per-iteration write size.
316 uint32_t dma_buffer_length = audio_stream_info.ms_to_frames(DMA_BUFFER_DURATION_MS);
317
318 i2s_role_t i2s_role = this->i2s_role_;
319 i2s_clock_src_t clk_src = I2S_CLK_SRC_DEFAULT;
320
321#if SOC_CLK_APLL_SUPPORTED
322 if (this->use_apll_) {
323 clk_src = i2s_clock_src_t::I2S_CLK_SRC_APLL;
324 }
325#endif // SOC_CLK_APLL_SUPPORTED
326
327 // Log DMA configuration for debugging
328 ESP_LOGV(TAG, "I2S DMA config: %zu buffers x %lu frames", (size_t) DMA_BUFFERS_COUNT,
329 (unsigned long) dma_buffer_length);
330
331 i2s_chan_config_t chan_cfg = {
332 .id = this->parent_->get_port(),
333 .role = i2s_role,
334 .dma_desc_num = DMA_BUFFERS_COUNT,
335 .dma_frame_num = dma_buffer_length,
336 .auto_clear = true,
337 .intr_priority = 3,
338 };
339
340 // Build standard I2S clock/slot/gpio configuration
341 i2s_std_clk_config_t clk_cfg = {
342 .sample_rate_hz = audio_stream_info.get_sample_rate(),
343 .clk_src = clk_src,
344 .mclk_multiple = this->mclk_multiple_,
345 };
346
// One channel forces mono; two channels force stereo on both slots; any other channel count keeps the
// configured slot_mode_ and std_slot_mask_.
347 i2s_slot_mode_t slot_mode = this->slot_mode_;
348 i2s_std_slot_mask_t slot_mask = this->std_slot_mask_;
349 if (audio_stream_info.get_channels() == 1) {
350 slot_mode = I2S_SLOT_MODE_MONO;
351 } else if (audio_stream_info.get_channels() == 2) {
352 slot_mode = I2S_SLOT_MODE_STEREO;
353 slot_mask = I2S_STD_SLOT_BOTH;
354 }
355
// Start from the default slot configuration matching the communication format; anything other than
// PCM or MSB uses the Philips defaults.
356 i2s_std_slot_config_t slot_cfg;
357 switch (this->i2s_comm_fmt_) {
358 case I2SCommFmt::PCM:
359 slot_cfg =
360 I2S_STD_PCM_SLOT_DEFAULT_CONFIG((i2s_data_bit_width_t) audio_stream_info.get_bits_per_sample(), slot_mode);
361 break;
362 case I2SCommFmt::MSB:
363 slot_cfg =
364 I2S_STD_MSB_SLOT_DEFAULT_CONFIG((i2s_data_bit_width_t) audio_stream_info.get_bits_per_sample(), slot_mode);
365 break;
366 default:
367 slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG((i2s_data_bit_width_t) audio_stream_info.get_bits_per_sample(),
368 slot_mode);
369 break;
370 }
371
372#ifdef USE_ESP32_VARIANT_ESP32
373 // There seems to be a bug on the ESP32 (non-variant) platform where setting the slot bit width higher than the
374 // bits per sample causes the audio to play too fast. Setting the ws_width to the configured slot bit width seems
375 // to make it play at the correct speed while sending more bits per slot.
376 if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_AUTO) {
377 uint32_t configured_bit_width = static_cast<uint32_t>(this->slot_bit_width_);
378 slot_cfg.ws_width = configured_bit_width;
379 if (configured_bit_width > 16) {
380 slot_cfg.msb_right = false;
381 }
382 }
383#else
// Other targets: apply the configured slot bit width directly and, unless it is AUTO, match ws_width to it.
384 slot_cfg.slot_bit_width = this->slot_bit_width_;
385 if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_AUTO) {
386 slot_cfg.ws_width = static_cast<uint32_t>(this->slot_bit_width_);
387 }
388#endif // USE_ESP32_VARIANT_ESP32
389 slot_cfg.slot_mask = slot_mask;
390
// Take the shared pin assignment from the parent bus and override only the data-out pin for this speaker.
391 i2s_std_gpio_config_t gpio_cfg = this->parent_->get_pin_config();
392 gpio_cfg.dout = this->dout_pin_;
393
394 i2s_std_config_t std_cfg = {
395 .clk_cfg = clk_cfg,
396 .slot_cfg = slot_cfg,
397 .gpio_cfg = gpio_cfg,
398 };
399
400 esp_err_t err = this->init_i2s_channel_(chan_cfg, std_cfg, I2S_EVENT_QUEUE_COUNT);
401 if (err != ESP_OK) {
402 return err;
403 }
404
405 // The speaker task will enable the channel after preloading.
406
407 return ESP_OK;
408}
409
410} // namespace esphome::i2s_audio
411
412#endif // USE_ESP32
An STL allocator that uses SPI or internal RAM.
Definition helpers.h:2053
void deallocate(T *p, size_t n)
Definition helpers.h:2110
T * allocate(size_t n)
Definition helpers.h:2080
size_t ms_to_bytes(uint32_t ms) const
Converts duration to bytes.
Definition audio.h:72
size_t frames_to_bytes(uint32_t frames) const
Converts frames to bytes.
Definition audio.h:52
uint8_t get_bits_per_sample() const
Definition audio.h:27
uint32_t frames_to_microseconds(uint32_t frames) const
Computes the duration, in microseconds, the given amount of frames represents.
Definition audio.cpp:25
uint32_t bytes_to_frames(size_t bytes) const
Convert bytes to frames.
Definition audio.h:42
uint8_t get_channels() const
Definition audio.h:28
uint32_t ms_to_frames(uint32_t ms) const
Converts duration to frames.
Definition audio.h:62
uint32_t get_sample_rate() const
Definition audio.h:29
static std::unique_ptr< RingBufferAudioSource > create(std::shared_ptr< ring_buffer::RingBuffer > ring_buffer, size_t max_fill_bytes, uint8_t alignment_bytes=1)
Creates a new ring-buffer-backed audio source after validating its parameters.
i2s_std_slot_mask_t std_slot_mask_
Definition i2s_audio.h:28
i2s_slot_bit_width_t slot_bit_width_
Definition i2s_audio.h:29
i2s_mclk_multiple_t mclk_multiple_
Definition i2s_audio.h:32
static bool i2s_on_sent_cb(i2s_chan_handle_t handle, i2s_event_data_t *event, void *user_ctx)
Callback function used to send playback timestamps to the speaker task.
void apply_software_volume_(uint8_t *data, size_t bytes_read)
Apply software volume control using Q15 fixed-point scaling.
std::weak_ptr< ring_buffer::RingBuffer > audio_ring_buffer_
void swap_esp32_mono_samples_(uint8_t *data, size_t bytes_read)
Swap adjacent 16-bit mono samples for ESP32 (non-variant) hardware quirk.
esp_err_t init_i2s_channel_(const i2s_chan_config_t &chan_cfg, const i2s_std_config_t &std_cfg, size_t event_queue_size)
Shared I2S channel allocation, initialization, and event queue setup.
esp_err_t start_i2s_driver(audio::AudioStreamInfo &audio_stream_info) override
static std::unique_ptr< RingBuffer > create(size_t len, MemoryPreference preference=MemoryPreference::EXTERNAL_FIRST)
CallbackManager< void(uint32_t, int64_t)> audio_output_callback_
Definition speaker.h:122
audio::AudioStreamInfo audio_stream_info_
Definition speaker.h:114
auto * new_data
Definition helpers.cpp:29
uint32_t IRAM_ATTR HOT millis()
Definition hal.cpp:28
static void uint32_t