From 97d5b1e9239da6a2186b1e15a6eb4b267c411aa5 Mon Sep 17 00:00:00 2001
From: Michael Hansen <hansen.mike@gmail.com>
Date: Mon, 22 Jul 2024 15:11:01 -0500
Subject: [PATCH] Add WAV decoder (#14)

* Add WAV decoder

* external component uses dev branch
---
 esphome/components/nabu/decode_streamer.cpp | 159 +++++++++-----------
 esphome/components/nabu/wav_decoder.cpp     | 125 +++++++++++++++
 esphome/components/nabu/wav_decoder.h       | 102 +++++++++++++
 voice-kit.yaml                              |   2 +-
 4 files changed, 300 insertions(+), 88 deletions(-)
 create mode 100644 esphome/components/nabu/wav_decoder.cpp
 create mode 100644 esphome/components/nabu/wav_decoder.h

diff --git a/esphome/components/nabu/decode_streamer.cpp b/esphome/components/nabu/decode_streamer.cpp
index 4915aef..3d87632 100644
--- a/esphome/components/nabu/decode_streamer.cpp
+++ b/esphome/components/nabu/decode_streamer.cpp
@@ -4,6 +4,7 @@
 
 #include "flac_decoder.h"
 #include "mp3_decoder.h"
+#include "wav_decoder.h"
 #include "streamer.h"
 
 #include "esphome/components/media_player/media_player.h"
@@ -82,9 +83,11 @@ void DecodeStreamer::decode_task_(void *params) {
 
   media_player::MediaFileType media_file_type = media_player::MediaFileType::NONE;
 
-  size_t wav_header_bytes_to_read = 4 * 5;  // enough to get fmt size
-  size_t wav_header_bytes_read = 0;
-  bool wav_have_fmt_size = false;
+  wav_decoder::WAVDecoder wav_decoder(input_buffer);
+  size_t wav_header_bytes_to_read = wav_decoder.bytes_needed();
+  size_t wav_buffer_offset = 0;
+  size_t wav_bytes_to_skip = wav_decoder.bytes_to_skip();
+  size_t wav_sample_bytes_to_read  = 0;
 
   // TODO: only initialize if needed
   HMP3Decoder mp3_decoder = MP3InitDecoder();
@@ -123,9 +126,11 @@ void DecodeStreamer::decode_task_(void *params) {
         stopping = false;
         header_parsed = false;
 
-        wav_header_bytes_to_read = 4 * 5;
-        wav_header_bytes_read = 0;
-        wav_have_fmt_size = false;
+        wav_decoder.reset();
+        wav_header_bytes_to_read = wav_decoder.bytes_needed();
+        wav_buffer_offset = 0;
+        wav_bytes_to_skip = wav_decoder.bytes_to_skip();
+        wav_sample_bytes_to_read  = 0;
 
         if (media_file_type == media_player::MediaFileType::MP3) {
           mp3_decoder = MP3InitDecoder();
@@ -163,88 +168,68 @@ void DecodeStreamer::decode_task_(void *params) {
 
         size_t bytes_read = 0;
 
-        if (!header_parsed) {
-          if (max_bytes_to_read > 0) {
-            bytes_read = this_streamer->input_ring_buffer_->read((void *) (input_buffer + wav_header_bytes_read),
-                                                                 wav_header_bytes_to_read - wav_header_bytes_read);
-          }
-          max_bytes_to_read -= bytes_read;
-          wav_header_bytes_read += bytes_read;
-
-          if (wav_header_bytes_read == wav_header_bytes_to_read) {
-            if (!wav_have_fmt_size) {
-              // We should have:
-              // 'RIFF' (4 bytes)
-              // chunk size (4 bytes)
-              // 'WAVE' (4 bytes)
-              // 'fmt ' (4 bytes)
-              // format size (4 bytes)
-              if (strncmp((char *) input_buffer, "RIFF", 4) != 0) {
-                printf("Missing RIFF header: %.*s\n", 4, (char *) input_buffer);
-                break;
-              }
-
-              if (strncmp((char *) (input_buffer + 8), "WAVE", 4) != 0) {
-                printf("Missing WAVE header: %.*s\n", 4, (char *) (input_buffer + 8));
+        if (!header_parsed && (bytes_available > 0)) {
+          if (wav_bytes_to_skip > 0) {
+            // Skip unneeded data
+            bytes_read = this_streamer->input_ring_buffer_->read((void *) input_buffer,
+                                                                 std::min(wav_bytes_to_skip, max_bytes_to_read));
+            wav_bytes_to_skip -= bytes_read;
+          } else if (wav_header_bytes_to_read > 0) {
+            // Read needed header data
+            bytes_read = this_streamer->input_ring_buffer_->read((void *) (input_buffer + wav_buffer_offset),
+                                                                 wav_header_bytes_to_read);
+            wav_header_bytes_to_read -= bytes_read;
+            wav_buffer_offset += bytes_read;
+
+            if (wav_header_bytes_to_read == 0) {
+              // Process header data in buffer
+              wav_decoder::WAVDecoderResult result = wav_decoder.next();
+              if (result == wav_decoder::WAV_DECODER_SUCCESS_IN_DATA) {
+                // Header parsing is complete
+                header_parsed = true;
+                wav_sample_bytes_to_read = wav_decoder.chunk_bytes_left();
+
+                StreamInfo old_stream_info = stream_info;
+
+                // Assume PCM and 16-bits per sample
+                stream_info.channels = wav_decoder.num_channels();
+                stream_info.sample_rate = wav_decoder.sample_rate();
+
+                printf("sample channels: %d\n", stream_info.channels);
+                printf("sample rate: %d\n", stream_info.sample_rate);
+                printf("number of samples: %d\n", wav_sample_bytes_to_read /
+                       (wav_decoder.num_channels() * (wav_decoder.bits_per_sample() / 8)));
+
+                if (stream_info != old_stream_info) {
+                  this_streamer->output_ring_buffer_->reset();
+
+                  event.type = EventType::STARTED;
+                  event.media_file_type = media_file_type;
+                  event.stream_info = stream_info;
+                  xQueueSend(this_streamer->event_queue_, &event, portMAX_DELAY);
+                }
+              } else if (result == wav_decoder::WAV_DECODER_SUCCESS_NEXT) {
+                // Continue parsing header
+                wav_bytes_to_skip = wav_decoder.bytes_to_skip();
+                wav_header_bytes_to_read = wav_decoder.bytes_needed();
+                wav_buffer_offset = 0;
+              } else {
+                printf("Unexpected error while parsing WAV header: %d\n", result);
                 break;
-              }
-
-              if (strncmp((char *) (input_buffer + 12), "fmt ", 4) != 0) {
-                printf("Missing fmt header: %.*s\n", 4, (char *) (input_buffer + 12));
-                break;
-              }
-
-              // Should be 16, but can vary
-              uint32_t fmt_size = *((uint32_t *) (input_buffer + 16));
-
-              // Read rest of fmt chunk + 'data' + data size
-              wav_header_bytes_to_read = fmt_size + 4 + 4;
-              wav_header_bytes_read = 0;
-              wav_have_fmt_size = true;
-            } else {
-              // We are just past the fmt chunk size in the header now.
-              // Next up is:
-              // audio format (2 bytes, PCM = 1)
-              // channels (2 bytes)
-              // sample rate (4 bytes)
-              // bytes per second (4 bytes)
-              // block align (2 bytes)
-              // bits per sample (2 bytes)
-              // 'data' (4 bytes)
-              // data size (4 bytes)
-              header_parsed = true;
-              StreamInfo old_stream_info = stream_info;
-
-              // Assume PCM and 16-bits per sample
-              stream_info.channels = *((uint16_t *) (input_buffer + 2));
-              stream_info.sample_rate = *((uint32_t *) (input_buffer + 4));
-
-              printf("sample channels: %d\n", stream_info.channels);
-              printf("sample rate: %d\n", stream_info.sample_rate);
-
-              if (stream_info != old_stream_info) {
-                this_streamer->output_ring_buffer_->reset();
-
-                event.type = EventType::STARTED;
-                event.media_file_type = media_file_type;
-                event.stream_info = stream_info;
-                xQueueSend(this_streamer->event_queue_, &event, portMAX_DELAY);
-              }
-            }
-          }
-
-          if (!header_parsed) {
-            // Need more data to parse header
-            continue;
-          }
-        }
-
-        size_t bytes_to_read = std::min(max_bytes_to_read, BUFFER_SIZE);
-        if (bytes_to_read > 0) {
-          bytes_read =
+              }  // parsing state
+            }  // if header bytes available
+          }  // if header bytes needed
+        }  // if header parsed
+
+        if (header_parsed && (wav_sample_bytes_to_read > 0)) {
+          size_t bytes_to_read = std::min(max_bytes_to_read, BUFFER_SIZE);
+          if (bytes_to_read > 0) {
+            bytes_read =
               this_streamer->input_ring_buffer_->read((void *) output_buffer, bytes_to_read, (10 / portTICK_PERIOD_MS));
-          output_buffer_current = output_buffer;
-          output_buffer_length += bytes_read;
+            output_buffer_current = output_buffer;
+            output_buffer_length += bytes_read;
+            wav_sample_bytes_to_read -= bytes_read;
+          }
         }
       } else if (media_file_type == media_player::MediaFileType::MP3) {
         // Shift unread data in buffer to start
@@ -411,4 +396,4 @@ void DecodeStreamer::reset_ring_buffers() {
 
 }  // namespace nabu
 }  // namespace esphome
-#endif
\ No newline at end of file
+#endif
diff --git a/esphome/components/nabu/wav_decoder.cpp b/esphome/components/nabu/wav_decoder.cpp
new file mode 100644
index 0000000..32effbd
--- /dev/null
+++ b/esphome/components/nabu/wav_decoder.cpp
@@ -0,0 +1,125 @@
+#include "wav_decoder.h"
+
+namespace wav_decoder {
+
+// Number of leading fmt chunk bytes that next() parses (up to bits per sample).
+static const std::size_t fmt_fields_size = 16;
+
+// Little-endian loads from the byte buffer that do not assume alignment.
+static inline uint16_t read_le16(const uint8_t *p) { return static_cast<uint16_t>(p[0] | (p[1] << 8)); }
+static inline uint32_t read_le32(const uint8_t *p) {
+  return static_cast<uint32_t>(p[0]) | (static_cast<uint32_t>(p[1]) << 8) |
+         (static_cast<uint32_t>(p[2]) << 16) | (static_cast<uint32_t>(p[3]) << 24);
+}
+
+// Bytes a chunk occupies in the stream: its size rounded up to an even count
+// (RIFF pad byte). UINT32_MAX is left alone so the result never wraps to zero.
+static inline std::size_t padded_chunk_size(uint32_t size) {
+  return ((size % 2) != 0 && size != UINT32_MAX) ? size + 1 : size;
+}
+
+// Parses the bytes at the start of the buffer for the current state and
+// advances the state machine. The caller must first skip bytes_to_skip() bytes
+// and read exactly bytes_needed() bytes into the start of the buffer. Returns
+// WAV_DECODER_SUCCESS_NEXT while more header data is required, an error code,
+// or WAV_DECODER_SUCCESS_IN_DATA with chunk_bytes_left() = sample byte count.
+WAVDecoderResult WAVDecoder::next() {
+  this->bytes_to_skip_ = 0;
+
+  switch (this->state_) {
+  case WAV_DECODER_BEFORE_RIFF: {
+    this->chunk_name_ = std::string((const char *)this->buffer_, 4);
+    if (this->chunk_name_ != "RIFF") {
+      return WAV_DECODER_ERROR_NO_RIFF;
+    }
+    this->chunk_bytes_left_ = padded_chunk_size(read_le32(this->buffer_ + 4));
+
+    // WAVE sub-chunk header should follow
+    this->state_ = WAV_DECODER_BEFORE_WAVE;
+    this->bytes_needed_ = 4; // WAVE
+    break;
+  }
+
+  case WAV_DECODER_BEFORE_WAVE: {
+    this->chunk_name_ = std::string((const char *)this->buffer_, 4);
+    if (this->chunk_name_ != "WAVE") {
+      return WAV_DECODER_ERROR_NO_WAVE;
+    }
+
+    // Next chunk header
+    this->state_ = WAV_DECODER_BEFORE_FMT;
+    this->bytes_needed_ = 8; // chunk name + size
+    break;
+  }
+
+  case WAV_DECODER_BEFORE_FMT: {
+    this->chunk_name_ = std::string((const char *)this->buffer_, 4);
+    this->chunk_bytes_left_ = padded_chunk_size(read_le32(this->buffer_ + 4));
+    if (this->chunk_name_ != "fmt ") {
+      // Skip over chunk, then read the next chunk header
+      this->bytes_to_skip_ = this->chunk_bytes_left_;
+      this->bytes_needed_ = 8;
+      break;
+    }
+
+    if (this->chunk_bytes_left_ < fmt_fields_size) {
+      // Too short to hold the mandatory fields. There is no dedicated error
+      // code for this, so report the closest one: not a usable WAVE file.
+      return WAV_DECODER_ERROR_NO_WAVE;
+    }
+
+    // Only the fixed fields are parsed, so request just those. This keeps the
+    // read bounded no matter what size the file claims for the chunk.
+    this->state_ = WAV_DECODER_IN_FMT;
+    this->bytes_needed_ = fmt_fields_size;
+    break;
+  }
+
+  case WAV_DECODER_IN_FMT: {
+    /**
+     * audio format (uint16_t)
+     * number of channels (uint16_t)
+     * sample rate (uint32_t)
+     * bytes per second (uint32_t)
+     * block align (uint16_t)
+     * bits per sample (uint16_t)
+     * [rest of format chunk, skipped]
+     */
+    this->num_channels_ = read_le16(this->buffer_ + 2);
+    this->sample_rate_ = read_le32(this->buffer_ + 4);
+    this->bits_per_sample_ = read_le16(this->buffer_ + 14);
+
+    // Skip the rest of the fmt chunk, then read the next chunk header
+    this->bytes_to_skip_ = this->chunk_bytes_left_ - fmt_fields_size;
+    this->state_ = WAV_DECODER_BEFORE_DATA;
+    this->bytes_needed_ = 8; // chunk name + size
+    break;
+  }
+
+  case WAV_DECODER_BEFORE_DATA: {
+    this->chunk_name_ = std::string((const char *)this->buffer_, 4);
+    const uint32_t chunk_size = read_le32(this->buffer_ + 4);
+    if (this->chunk_name_ == "data") {
+      // Complete. Report the unpadded size: the pad byte is not sample data,
+      // and padding a 0xFFFFFFFF size would wrap to zero with a 32-bit size_t.
+      this->chunk_bytes_left_ = chunk_size;
+      this->state_ = WAV_DECODER_IN_DATA;
+      this->bytes_needed_ = 0;
+      return WAV_DECODER_SUCCESS_IN_DATA;
+    }
+
+    // Skip over chunk, then read the next chunk header
+    this->chunk_bytes_left_ = padded_chunk_size(chunk_size);
+    this->bytes_to_skip_ = this->chunk_bytes_left_;
+    this->bytes_needed_ = 8;
+    break;
+  }
+
+  case WAV_DECODER_IN_DATA:
+    return WAV_DECODER_SUCCESS_IN_DATA;
+  }
+
+  return WAV_DECODER_SUCCESS_NEXT;
+}
+
+} // namespace wav_decoder
diff --git a/esphome/components/nabu/wav_decoder.h b/esphome/components/nabu/wav_decoder.h
new file mode 100644
index 0000000..13bcba1
--- /dev/null
+++ b/esphome/components/nabu/wav_decoder.h
@@ -0,0 +1,102 @@
+// Very basic WAV file decoder that parses format information and gets to the
+// data portion of the file.
+// Skips over extraneous chunks like LIST and INFO.
+
+#ifndef WAV_DECODER_H_
+#define WAV_DECODER_H_
+
+#include <cstdint>
+#include <string>
+
+/* WAV header:
+ * 'RIFF' (4 bytes, ASCII)
+ * RIFF chunk size (uint32_t)
+ * 'WAVE' (4 bytes, ASCII)
+ * (optional RIFF chunks)
+ * 'fmt ' (4 bytes, ASCII)
+ * format chunk size (uint32_t)
+ * audio format (uint16_t, PCM = 1)
+ * number of channels (uint16_t)
+ * sample rate (uint32_t)
+ * bytes per second (uint32_t)
+ * block align (uint16_t)
+ * bits per sample (uint16_t)
+ * [rest of format chunk]
+ * (optional RIFF chunks)
+ * 'data' (4 bytes, ASCII)
+ * data chunk size (uint32_t)
+ * [rest of data chunk]
+ * (optional RIFF chunks)
+ * */
+
+namespace wav_decoder {
+
+const std::size_t min_buffer_size = 24;  // NOTE(review): unused in this patch; presumably the minimum decoder buffer size — confirm
+
+enum WAVDecoderState {
+  // Header parsing position; each value names what next() expects in the buffer.
+  WAV_DECODER_BEFORE_RIFF = 0,  // 'RIFF' + RIFF chunk size
+  WAV_DECODER_BEFORE_WAVE = 1,  // 'WAVE'
+  WAV_DECODER_BEFORE_FMT = 2,   // a chunk header; chunks other than 'fmt ' are skipped
+  WAV_DECODER_IN_FMT = 3,       // the fmt chunk fields
+  WAV_DECODER_BEFORE_DATA = 4,  // a chunk header; chunks other than 'data' are skipped
+  WAV_DECODER_IN_DATA = 5,      // header fully parsed; next() keeps returning WAV_DECODER_SUCCESS_IN_DATA
+
+};
+
+enum WAVDecoderResult {  // Result of WAVDecoder::next()
+  WAV_DECODER_SUCCESS_NEXT = 0,     // step parsed; skip/read again as requested, then call next()
+  WAV_DECODER_SUCCESS_IN_DATA = 1,  // 'data' chunk header parsed; header parsing is complete
+  WAV_DECODER_ERROR_NO_RIFF = 2,    // buffer did not start with 'RIFF'
+  WAV_DECODER_ERROR_NO_WAVE = 3,    // 'WAVE' identifier missing after the RIFF header
+};
+
+// Incremental RIFF/WAVE header parser; see next() for the calling protocol.
+class WAVDecoder {
+public:
+  WAVDecoder(uint8_t *buffer) : buffer_(buffer){};
+  ~WAVDecoder(){};
+
+  WAVDecoderState state() const { return this->state_; }
+  std::size_t bytes_to_skip() const { return this->bytes_to_skip_; }
+  std::size_t bytes_needed() const { return this->bytes_needed_; }
+  std::string chunk_name() const { return this->chunk_name_; }
+  std::size_t chunk_bytes_left() const { return this->chunk_bytes_left_; }
+  uint32_t sample_rate() const { return this->sample_rate_; }
+  uint16_t num_channels() const { return this->num_channels_; }
+  uint16_t bits_per_sample() const { return this->bits_per_sample_; }
+
+  // Advance decoding:
+  // 1. Check bytes_to_skip() first, and skip that many bytes.
+  // 2. Read exactly bytes_needed() into the start of the buffer.
+  // 3. Run next() and loop to 1 until it returns WAV_DECODER_SUCCESS_IN_DATA.
+  // 4. Use chunk_bytes_left() to read the data samples.
+  WAVDecoderResult next();
+
+  // Returns the decoder to its initial state so a new file can be parsed.
+  void reset() {
+    this->state_ = WAV_DECODER_BEFORE_RIFF;
+    // next() leaves bytes_needed_ at 0 in the data chunk, so restore it here.
+    this->bytes_needed_ = 8; // chunk name + size
+    this->bytes_to_skip_ = 0;
+    this->chunk_name_ = "";
+    this->chunk_bytes_left_ = 0;
+    this->sample_rate_ = 0;
+    this->num_channels_ = 0;
+    this->bits_per_sample_ = 0;
+  }
+
+protected:
+  uint8_t *buffer_; // not owned; the caller fills it before each next()
+  WAVDecoderState state_ = WAV_DECODER_BEFORE_RIFF;
+  std::size_t bytes_needed_ = 8; // chunk name + size
+  std::size_t bytes_to_skip_ = 0;
+  std::string chunk_name_;
+  std::size_t chunk_bytes_left_ = 0;
+  uint32_t sample_rate_ = 0;
+  uint16_t num_channels_ = 0;
+  uint16_t bits_per_sample_ = 0;
+};
+} // namespace wav_decoder
+
+#endif // WAV_DECODER_H_
diff --git a/voice-kit.yaml b/voice-kit.yaml
index 0c34d4d..4a02d8e 100644
--- a/voice-kit.yaml
+++ b/voice-kit.yaml
@@ -668,7 +668,7 @@ external_components:
   - source:
       type: git
       url: https://github.com/esphome/voice-kit
-      ref: kahrendt-2024-2
+      ref: dev
     components: [i2s_audio, nabu, voice_assistant, media_player, micro_wake_word]
     refresh: 0s