From 97d5b1e9239da6a2186b1e15a6eb4b267c411aa5 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Mon, 22 Jul 2024 15:11:01 -0500 Subject: [PATCH] Add WAV decoder (#14) * Add WAV decoder * external component uses dev branch --- esphome/components/nabu/decode_streamer.cpp | 159 +++++++++----------- esphome/components/nabu/wav_decoder.cpp | 125 +++++++++++++++ esphome/components/nabu/wav_decoder.h | 102 +++++++++++++ voice-kit.yaml | 2 +- 4 files changed, 300 insertions(+), 88 deletions(-) create mode 100644 esphome/components/nabu/wav_decoder.cpp create mode 100644 esphome/components/nabu/wav_decoder.h diff --git a/esphome/components/nabu/decode_streamer.cpp b/esphome/components/nabu/decode_streamer.cpp index 4915aef..3d87632 100644 --- a/esphome/components/nabu/decode_streamer.cpp +++ b/esphome/components/nabu/decode_streamer.cpp @@ -4,6 +4,7 @@ #include "flac_decoder.h" #include "mp3_decoder.h" +#include "wav_decoder.h" #include "streamer.h" #include "esphome/components/media_player/media_player.h" @@ -82,9 +83,11 @@ void DecodeStreamer::decode_task_(void *params) { media_player::MediaFileType media_file_type = media_player::MediaFileType::NONE; - size_t wav_header_bytes_to_read = 4 * 5; // enough to get fmt size - size_t wav_header_bytes_read = 0; - bool wav_have_fmt_size = false; + wav_decoder::WAVDecoder wav_decoder(input_buffer); + size_t wav_header_bytes_to_read = wav_decoder.bytes_needed(); + size_t wav_buffer_offset = 0; + size_t wav_bytes_to_skip = wav_decoder.bytes_to_skip(); + size_t wav_sample_bytes_to_read = 0; // TODO: only initialize if needed HMP3Decoder mp3_decoder = MP3InitDecoder(); @@ -123,9 +126,11 @@ void DecodeStreamer::decode_task_(void *params) { stopping = false; header_parsed = false; - wav_header_bytes_to_read = 4 * 5; - wav_header_bytes_read = 0; - wav_have_fmt_size = false; + wav_decoder.reset(); + wav_header_bytes_to_read = wav_decoder.bytes_needed(); + wav_buffer_offset = 0; + wav_bytes_to_skip = wav_decoder.bytes_to_skip(); + 
wav_sample_bytes_to_read = 0; if (media_file_type == media_player::MediaFileType::MP3) { mp3_decoder = MP3InitDecoder(); @@ -163,88 +168,68 @@ void DecodeStreamer::decode_task_(void *params) { size_t bytes_read = 0; - if (!header_parsed) { - if (max_bytes_to_read > 0) { - bytes_read = this_streamer->input_ring_buffer_->read((void *) (input_buffer + wav_header_bytes_read), - wav_header_bytes_to_read - wav_header_bytes_read); - } - max_bytes_to_read -= bytes_read; - wav_header_bytes_read += bytes_read; - - if (wav_header_bytes_read == wav_header_bytes_to_read) { - if (!wav_have_fmt_size) { - // We should have: - // 'RIFF' (4 bytes) - // chunk size (4 bytes) - // 'WAVE' (4 bytes) - // 'fmt ' (4 bytes) - // format size (4 bytes) - if (strncmp((char *) input_buffer, "RIFF", 4) != 0) { - printf("Missing RIFF header: %.*s\n", 4, (char *) input_buffer); - break; - } - - if (strncmp((char *) (input_buffer + 8), "WAVE", 4) != 0) { - printf("Missing WAVE header: %.*s\n", 4, (char *) (input_buffer + 8)); + if (!header_parsed && (bytes_available > 0)) { + if (wav_bytes_to_skip > 0) { + // Skip unneeded data + bytes_read = this_streamer->input_ring_buffer_->read((void *) input_buffer, + std::min(wav_bytes_to_skip, max_bytes_to_read)); + wav_bytes_to_skip -= bytes_read; + } else if (wav_header_bytes_to_read > 0) { + // Read needed header data + bytes_read = this_streamer->input_ring_buffer_->read((void *) (input_buffer + wav_buffer_offset), + wav_header_bytes_to_read); + wav_header_bytes_to_read -= bytes_read; + wav_buffer_offset += bytes_read; + + if (wav_header_bytes_to_read == 0) { + // Process header data in buffer + wav_decoder::WAVDecoderResult result = wav_decoder.next(); + if (result == wav_decoder::WAV_DECODER_SUCCESS_IN_DATA) { + // Header parsing is complete + header_parsed = true; + wav_sample_bytes_to_read = wav_decoder.chunk_bytes_left(); + + StreamInfo old_stream_info = stream_info; + + // Assume PCM and 16-bits per sample + stream_info.channels = 
wav_decoder.num_channels(); + stream_info.sample_rate = wav_decoder.sample_rate(); + + printf("sample channels: %d\n", stream_info.channels); + printf("sample rate: %d\n", stream_info.sample_rate); + printf("number of samples: %d\n", wav_sample_bytes_to_read / + (wav_decoder.num_channels() * (wav_decoder.bits_per_sample() / 8))); + + if (stream_info != old_stream_info) { + this_streamer->output_ring_buffer_->reset(); + + event.type = EventType::STARTED; + event.media_file_type = media_file_type; + event.stream_info = stream_info; + xQueueSend(this_streamer->event_queue_, &event, portMAX_DELAY); + } + } else if (result == wav_decoder::WAV_DECODER_SUCCESS_NEXT) { + // Continue parsing header + wav_bytes_to_skip = wav_decoder.bytes_to_skip(); + wav_header_bytes_to_read = wav_decoder.bytes_needed(); + wav_buffer_offset = 0; + } else { + printf("Unexpected error while parsing WAV header: %d\n", result); break; - } - - if (strncmp((char *) (input_buffer + 12), "fmt ", 4) != 0) { - printf("Missing fmt header: %.*s\n", 4, (char *) (input_buffer + 12)); - break; - } - - // Should be 16, but can vary - uint32_t fmt_size = *((uint32_t *) (input_buffer + 16)); - - // Read rest of fmt chunk + 'data' + data size - wav_header_bytes_to_read = fmt_size + 4 + 4; - wav_header_bytes_read = 0; - wav_have_fmt_size = true; - } else { - // We are just past the fmt chunk size in the header now. 
- // Next up is: - // audio format (2 bytes, PCM = 1) - // channels (2 bytes) - // sample rate (4 bytes) - // bytes per second (4 bytes) - // block align (2 bytes) - // bits per sample (2 bytes) - // 'data' (4 bytes) - // data size (4 bytes) - header_parsed = true; - StreamInfo old_stream_info = stream_info; - - // Assume PCM and 16-bits per sample - stream_info.channels = *((uint16_t *) (input_buffer + 2)); - stream_info.sample_rate = *((uint32_t *) (input_buffer + 4)); - - printf("sample channels: %d\n", stream_info.channels); - printf("sample rate: %d\n", stream_info.sample_rate); - - if (stream_info != old_stream_info) { - this_streamer->output_ring_buffer_->reset(); - - event.type = EventType::STARTED; - event.media_file_type = media_file_type; - event.stream_info = stream_info; - xQueueSend(this_streamer->event_queue_, &event, portMAX_DELAY); - } - } - } - - if (!header_parsed) { - // Need more data to parse header - continue; - } - } - - size_t bytes_to_read = std::min(max_bytes_to_read, BUFFER_SIZE); - if (bytes_to_read > 0) { - bytes_read = + } // parsing state + } // if header bytes available + } // if header bytes needed + } // if header parsed + + if (header_parsed && (wav_sample_bytes_to_read > 0)) { + size_t bytes_to_read = std::min(max_bytes_to_read, BUFFER_SIZE); + if (bytes_to_read > 0) { + bytes_read = this_streamer->input_ring_buffer_->read((void *) output_buffer, bytes_to_read, (10 / portTICK_PERIOD_MS)); - output_buffer_current = output_buffer; - output_buffer_length += bytes_read; + output_buffer_current = output_buffer; + output_buffer_length += bytes_read; + wav_sample_bytes_to_read -= bytes_read; + } } } else if (media_file_type == media_player::MediaFileType::MP3) { // Shift unread data in buffer to start @@ -411,4 +396,4 @@ void DecodeStreamer::reset_ring_buffers() { } // namespace nabu } // namespace esphome -#endif \ No newline at end of file +#endif diff --git a/esphome/components/nabu/wav_decoder.cpp 
b/esphome/components/nabu/wav_decoder.cpp new file mode 100644 index 0000000..32effbd --- /dev/null +++ b/esphome/components/nabu/wav_decoder.cpp @@ -0,0 +1,125 @@ +#include "wav_decoder.h" + +namespace wav_decoder { + +WAVDecoderResult WAVDecoder::next() { + this->bytes_to_skip_ = 0; + + switch (this->state_) { + case WAV_DECODER_BEFORE_RIFF: { + this->chunk_name_ = std::string((const char *)this->buffer_, 4); + if (this->chunk_name_ != "RIFF") { + return WAV_DECODER_ERROR_NO_RIFF; + } + + this->chunk_bytes_left_ = *((uint32_t *)(this->buffer_ + 4)); + if ((this->chunk_bytes_left_ % 2) != 0) { + // Pad byte + this->chunk_bytes_left_++; + } + + // WAVE sub-chunk header should follow + this->state_ = WAV_DECODER_BEFORE_WAVE; + this->bytes_needed_ = 4; // WAVE + break; + } + + case WAV_DECODER_BEFORE_WAVE: { + this->chunk_name_ = std::string((const char *)this->buffer_, 4); + if (this->chunk_name_ != "WAVE") { + return WAV_DECODER_ERROR_NO_WAVE; + } + + // Next chunk header + this->state_ = WAV_DECODER_BEFORE_FMT; + this->bytes_needed_ = 8; // chunk name + size + break; + } + + case WAV_DECODER_BEFORE_FMT: { + this->chunk_name_ = std::string((const char *)this->buffer_, 4); + this->chunk_bytes_left_ = *((uint32_t *)(this->buffer_ + 4)); + if ((this->chunk_bytes_left_ % 2) != 0) { + // Pad byte + this->chunk_bytes_left_++; + } + + if (this->chunk_name_ == "fmt ") { + // Read rest of fmt chunk + this->state_ = WAV_DECODER_IN_FMT; + this->bytes_needed_ = this->chunk_bytes_left_; + } else { + // Skip over chunk + // this->state_ = WAV_DECODER_BEFORE_FMT_SKIP_CHUNK; + this->bytes_to_skip_ = this->chunk_bytes_left_; + this->bytes_needed_ = 8; + } + break; + } + + // case WAV_DECODER_BEFORE_FMT_SKIP_CHUNK: { + // // Next chunk header + // this->state_ = WAV_DECODER_BEFORE_FMT; + // this->bytes_needed_ = 8; // chunk name + size + // break; + // } + + case WAV_DECODER_IN_FMT: { + /** + * audio format (uint16_t) + * number of channels (uint16_t) + * sample rate (uint32_t) + * 
bytes per second (uint32_t) + * block align (uint16_t) + * bits per sample (uint16_t) + * [rest of format chunk] + */ + this->num_channels_ = *((uint16_t *)(this->buffer_ + 2)); + this->sample_rate_ = *((uint32_t *)(this->buffer_ + 4)); + this->bits_per_sample_ = *((uint16_t *)(this->buffer_ + 14)); + + // Next chunk + this->state_ = WAV_DECODER_BEFORE_DATA; + this->bytes_needed_ = 8; // chunk name + size + break; + } + + case WAV_DECODER_BEFORE_DATA: { + this->chunk_name_ = std::string((const char *)this->buffer_, 4); + this->chunk_bytes_left_ = *((uint32_t *)(this->buffer_ + 4)); + if ((this->chunk_bytes_left_ % 2) != 0) { + // Pad byte + this->chunk_bytes_left_++; + } + + if (this->chunk_name_ == "data") { + // Complete + this->state_ = WAV_DECODER_IN_DATA; + this->bytes_needed_ = 0; + return WAV_DECODER_SUCCESS_IN_DATA; + } + + // Skip over chunk + // this->state_ = WAV_DECODER_BEFORE_DATA_SKIP_CHUNK; + this->bytes_to_skip_ = this->chunk_bytes_left_; + this->bytes_needed_ = 8; + break; + } + + // case WAV_DECODER_BEFORE_DATA_SKIP_CHUNK: { + // // Next chunk header + // this->state_ = WAV_DECODER_BEFORE_DATA; + // this->bytes_needed_ = 8; // chunk name + size + // break; + // } + + case WAV_DECODER_IN_DATA: { + return WAV_DECODER_SUCCESS_IN_DATA; + break; + } + } + + return WAV_DECODER_SUCCESS_NEXT; +} + +} // namespace wav_decoder diff --git a/esphome/components/nabu/wav_decoder.h b/esphome/components/nabu/wav_decoder.h new file mode 100644 index 0000000..13bcba1 --- /dev/null +++ b/esphome/components/nabu/wav_decoder.h @@ -0,0 +1,102 @@ +// Very basic WAV file decoder that parses format information and gets to the +// data portion of the file. +// Skips over extraneous chunks like LIST and INFO. 
#ifndef WAV_DECODER_H_
#define WAV_DECODER_H_

#include <cstddef>
#include <cstdint>
#include <string>

/* WAV header:
 * 'RIFF' (4 bytes, ASCII)
 * RIFF chunk size (uint32_t)
 * 'WAVE' (4 bytes, ASCII)
 * (optional RIFF chunks)
 * 'fmt ' (4 bytes, ASCII)
 * format chunk size (uint32_t)
 * audio format (uint16_t, PCM = 1)
 * number of channels (uint16_t)
 * sample rate (uint32_t)
 * bytes per second (uint32_t)
 * block align (uint16_t)
 * bits per sample (uint16_t)
 * [rest of format chunk]
 * (optional RIFF chunks)
 * 'data' (4 bytes, ASCII)
 * data chunk size (uint32_t)
 * [rest of data chunk]
 * (optional RIFF chunks)
 * */

namespace wav_decoder {

// NOTE(review): presumably the smallest buffer a caller may hand to
// WAVDecoder -- confirm against the largest bytes_needed() value next() can
// produce.
const std::size_t min_buffer_size = 24;

// Position of the decoder within the WAV header.
enum WAVDecoderState {

  WAV_DECODER_BEFORE_RIFF = 0,
  WAV_DECODER_BEFORE_WAVE = 1,
  WAV_DECODER_BEFORE_FMT = 2,
  WAV_DECODER_IN_FMT = 3,
  WAV_DECODER_BEFORE_DATA = 4,
  WAV_DECODER_IN_DATA = 5,

};

// Outcome of a single WAVDecoder::next() step.
enum WAVDecoderResult {
  WAV_DECODER_SUCCESS_NEXT = 0,
  WAV_DECODER_SUCCESS_IN_DATA = 1,
  WAV_DECODER_ERROR_NO_RIFF = 2,
  WAV_DECODER_ERROR_NO_WAVE = 3,
};

// Incremental WAV header parser.
//
// The decoder does no I/O itself: it only inspects the caller-owned buffer
// passed to the constructor and tells the caller how many input bytes to skip
// and how many to load before the next step.
class WAVDecoder {
 public:
  // `buffer` is not owned and must stay valid for as long as next() is called.
  WAVDecoder(uint8_t *buffer) : buffer_(buffer) {}

  WAVDecoderState state() const { return this->state_; }
  std::size_t bytes_to_skip() const { return this->bytes_to_skip_; }
  std::size_t bytes_needed() const { return this->bytes_needed_; }
  std::string chunk_name() const { return this->chunk_name_; }
  std::size_t chunk_bytes_left() const { return this->chunk_bytes_left_; }
  uint32_t sample_rate() const { return this->sample_rate_; }
  uint16_t num_channels() const { return this->num_channels_; }
  uint16_t bits_per_sample() const { return this->bits_per_sample_; }

  // Advance decoding:
  // 1. Check bytes_to_skip() first, and skip that many bytes.
  // 2. Read exactly bytes_needed() into the start of the buffer.
  // 3. Run next() and loop to 1 until the result is
  //    WAV_DECODER_SUCCESS_IN_DATA.
  // 4. Use chunk_bytes_left() to read the data samples.
  WAVDecoderResult next();

  // Returns the decoder to its freshly-constructed state so that it can parse
  // another stream using the same buffer.
  void reset() {
    this->state_ = WAV_DECODER_BEFORE_RIFF;
    // Must be restored together with the state: once a header has been
    // parsed bytes_needed_ is 0, and a caller that re-reads bytes_needed()
    // after reset() would otherwise never load the next RIFF header.
    this->bytes_needed_ = 8;  // chunk name + size
    this->bytes_to_skip_ = 0;
    this->chunk_name_.clear();
    this->chunk_bytes_left_ = 0;

    this->sample_rate_ = 0;
    this->num_channels_ = 0;
    this->bits_per_sample_ = 0;
  }

 protected:
  uint8_t *buffer_;
  WAVDecoderState state_ = WAV_DECODER_BEFORE_RIFF;
  std::size_t bytes_needed_ = 8;  // chunk name + size
  std::size_t bytes_to_skip_ = 0;
  std::string chunk_name_;
  std::size_t chunk_bytes_left_ = 0;

  uint32_t sample_rate_ = 0;
  uint16_t num_channels_ = 0;
  uint16_t bits_per_sample_ = 0;
};
}  // namespace wav_decoder

#endif  // WAV_DECODER_H_