arduino-audio-tools/_wake_word_detector_8h_source.html

#pragma once


#include <algorithm>

#include <cmath>


#include "AudioTools/CoreAudio/AudioOutput.h"

#include "AudioTools/CoreAudio/AudioBasic/Collections/Vector.h"

#include "AudioTools/CoreAudio/Buffers.h"

#include "AudioTools/AudioLibs/AudioFFT.h"


namespace audio_tools {


/**
 * @brief Frame holding the top N frequencies found in one FFT window.
 *
 * Used as a compact representation of the dominant frequency content in a frame
 * of audio.
 *
 * @tparam N Number of frequency entries stored per frame.
 */

template <size_t N>


struct FrequencyFrame {

  /// One entry per dominant frequency. WakeWordDetector fills these from
  /// AudioFFTResult::frequency (converted to uint16_t).
  /// NOTE(review): earlier documentation called these "indices" -- confirm
  /// whether FFT bin indices or frequency values are intended.
  uint16_t top_freqs[N];

};


/**
 * @brief Template-based wake word detector using dominant frequency patterns.
 *
 * The detector installs itself as the result callback of the FFT object passed
 * to the constructor. Audio written to this output is forwarded to that FFT;
 * for every FFT result the N entries reported by AudioFFTBase::resultArray()
 * are stored as one FrequencyFrame. The most recent frames are then compared
 * with all registered templates and the wake word callback is invoked for
 * each template whose match percentage reaches its threshold.
 *
 * Templates are usually created by calling startRecording(), feeding audio,
 * calling stopRecording() and passing the returned frames to addTemplate().
 *
 * @tparam T Sample type; in this class it is only used for the _buffer member.
 * @tparam N Number of frequencies stored per frame (must be greater than 0).
 */
template <typename T = int16_t, size_t N = 3>
class WakeWordDetector : public AudioOutput {
  // A frame without entries would need a zero-length array in fftResult().
  static_assert(N > 0,
                "WakeWordDetector needs at least one frequency per frame");

 public:
  /// A named reference pattern together with its detection threshold.
  struct Template {
    /// Sequence of frequency frames describing the wake word.
    Vector<FrequencyFrame<N>> frames;
    /// Minimum match percentage (0-100) which triggers the callback.
    float threshold_percent = 0.0f;
    /// Name/label reported to the callback; the pointer is stored, not copied.
    const char* name = nullptr;
    /// Result of the most recent comparison against this template.
    float last_match_percent = 0.0f;
  };

  /// Signature of the function which is called when a wake word was detected.
  using WakeWordCallback = void (*)(const char* name);

  /**
   * @brief Creates the detector and hooks it into the FFT.
   *
   * Overwrites the `ref` and `callback` entries of the FFT configuration so
   * that fftResult() is invoked with this object as context.
   *
   * @param fft FFT object which receives the audio data; it is stored by
   *            pointer and therefore must outlive the detector.
   */
  WakeWordDetector(AudioFFTBase& fft) : p_fft(&fft) {
    auto& fft_cfg = fft.config();
    fft_cfg.ref = this;
    fft_cfg.callback = fftResult;
  }

  /// Discards the collected frames and starts recording a new template.
  /// While recording, no template matching takes place.
  void startRecording() {
    _recent_frames.clear();
    _is_recording = true;
  }

  /**
   * @brief Ends the recording.
   * @return Copy of all frames collected since startRecording().
   */
  Vector<FrequencyFrame<N>> stopRecording() {
    _is_recording = false;
    return _recent_frames;
  }

  /// Returns true while a template is being recorded.
  bool isRecording() const { return _is_recording; }

  /**
   * @brief Registers a template which is checked on every new FFT result.
   * @param frames            Frame sequence describing the wake word (copied).
   * @param threshold_percent Match percentage (0-100) required to trigger.
   * @param name              Label passed to the callback; only the pointer is
   *                          stored, so the string must stay valid.
   */
  void addTemplate(const Vector<FrequencyFrame<N>>& frames,
                   float threshold_percent, const char* name) {
    Template t;
    t.frames = frames;
    t.threshold_percent = threshold_percent;
    t.name = name;
    t.last_match_percent = 0.0f;
    _templates.push_back(t);
    // The window of recent frames must be able to hold the longest template.
    if (frames.size() > _max_template_len) _max_template_len = frames.size();
  }

  /// Defines the function which is called with the name of a detected word.
  void setWakeWordCallback(WakeWordCallback cb) { _callback = cb; }

  /// Forwards the audio data to the FFT and returns the result of its write().
  size_t write(const uint8_t* buf, size_t size) override {
    return p_fft->write(buf, size);
  }

  /**
   * @brief FFT result callback: records one frame and runs the matching.
   *
   * Being static, it obtains the detector instance from `fft.config().ref`.
   *
   * @param fft FFT object which produced the result.
   */
  static void fftResult(AudioFFTBase& fft) {
    auto* self = static_cast<WakeWordDetector<T, N>*>(fft.config().ref);
    if (!self) return;

    // Convert the FFT result into a compact frame.
    AudioFFTResult result[N];
    fft.resultArray(result);
    FrequencyFrame<N> frame;
    for (size_t j = 0; j < N; j++) {
      frame.top_freqs[j] = static_cast<uint16_t>(result[j].frequency);
    }
    self->_recent_frames.push_back(frame);

    // During a recording the frames are only collected (unbounded).
    if (self->_is_recording) {
      return;
    }

    // Shrink the window to the longest template. More than one frame can be
    // in excess right after a recording, so all surplus frames are dropped
    // instead of just a single one.
    if (self->_recent_frames.size() > self->_max_template_len) {
      const size_t excess =
          self->_recent_frames.size() - self->_max_template_len;
      for (size_t k = 0; k < excess; ++k) {
        self->_recent_frames.erase(self->_recent_frames.begin());
      }
    }

    for (size_t i = 0; i < self->_templates.size(); ++i) {
      Template& tmpl = self->_templates[i];
      // Only compare once enough frames are available.
      if (self->_recent_frames.size() < tmpl.frames.size()) continue;
      const float percent = self->matchTemplate(tmpl);
      if (percent >= tmpl.threshold_percent && self->_callback) {
        self->_callback(tmpl.name);
      }
    }
  }

 protected:
  Vector<Template> _templates;               ///< Registered wake word templates
  Vector<FrequencyFrame<N>> _recent_frames;  ///< Recent frames / recording
  Vector<T> _buffer;                ///< Not used by the methods of this class
  AudioFFTBase* p_fft = nullptr;    ///< FFT which receives the audio data
  bool _is_recording = false;       ///< True while recording a template
  size_t _frame_pos = 0;            ///< Not used by the methods of this class
  size_t _max_template_len = 0;     ///< Length of the longest template
  WakeWordCallback _callback = nullptr;  ///< Called on detection (optional)

  /**
   * @brief Compares the newest frames with the indicated template.
   *
   * A frame counts as matching when at least N-1 of its N entries (at least 1
   * for N == 1) are equal to the entries at the same position of the template
   * frame. The result is also stored in `tmpl.last_match_percent`.
   *
   * @param tmpl Template to compare against.
   * @return Percentage (0-100) of matching frames; 0 if the template is empty
   *         or longer than the number of available frames.
   */
  float matchTemplate(Template& tmpl) {
    const size_t total = tmpl.frames.size();
    // Guard against the size_t underflow of the offset calculation below.
    if (total == 0 || total > _recent_frames.size()) {
      tmpl.last_match_percent = 0.0f;
      return 0.0f;
    }

    const size_t required = (N >= 2) ? N - 1 : 1;
    // Align the template with the end of the window of recent frames.
    const size_t offset = _recent_frames.size() - total;
    size_t matches = 0;
    for (size_t i = 0; i < total; ++i) {
      size_t frame_matches = 0;
      for (size_t j = 0; j < N; ++j) {
        if (tmpl.frames[i].top_freqs[j] ==
            _recent_frames[offset + i].top_freqs[j]) {
          frame_matches++;
        }
      }
      if (frame_matches >= required) matches++;
    }

    const float percent =
        100.0f * static_cast<float>(matches) / static_cast<float>(total);
    tmpl.last_match_percent = percent;
    return percent;
  }
};


}  // namespace audio_tools

audio_tools::AudioFFTBase
Executes FFT using audio data provided by write() and/or an inverse FFT where the samples are made ava...
Definition AudioFFT.h:191

audio_tools::AudioFFTBase::config
AudioFFTConfig & config()
Provides the actual configuration.
Definition AudioFFT.h:639

audio_tools::AudioFFTBase::write
size_t write(const uint8_t *data, size_t len) override
Provide the audio data as FFT input.
Definition AudioFFT.h:294

audio_tools::AudioOutput
Abstract Audio Output class.
Definition AudioOutput.h:25

audio_tools::Vector
Vector implementation which provides the most important methods as defined by std::vector....
Definition Vector.h:21

audio_tools::WakeWordDetector
Template-based wake word detector for microcontrollers using dominant frequency patterns.
Definition WakeWordDetector.h:51

audio_tools::WakeWordDetector::_is_recording
bool _is_recording
True if currently recording a template.
Definition WakeWordDetector.h:136

audio_tools::WakeWordDetector::_max_template_len
size_t _max_template_len
Length of the longest template.
Definition WakeWordDetector.h:138

audio_tools::WakeWordDetector::_frame_pos
size_t _frame_pos
Current position in frame buffer.
Definition WakeWordDetector.h:137

audio_tools::WakeWordDetector::_templates
Vector< Template > _templates
List of wake word templates.
Definition WakeWordDetector.h:132

audio_tools::WakeWordDetector::_recent_frames
Vector< FrequencyFrame< N > > _recent_frames
Recent frames for comparison.
Definition WakeWordDetector.h:133

audio_tools::WakeWordDetector::_buffer
Vector< T > _buffer
Buffer for incoming PCM samples.
Definition WakeWordDetector.h:134

audio_tools
Generic Implementation of sound input and output for desktop environments using portaudio.
Definition AudioCodecsBase.h:10

audio_tools::AudioFFTConfig::ref
void * ref
caller
Definition AudioFFT.h:61

audio_tools::FrequencyFrame
Definition WakeWordDetector.h:20

audio_tools::FrequencyFrame::top_freqs
uint16_t top_freqs[N]
Indices of the top N frequencies in the FFT.
Definition WakeWordDetector.h:21

audio_tools::WakeWordDetector::Template
Definition WakeWordDetector.h:53

audio_tools::WakeWordDetector::Template::name
const char * name
Name/label of the wake word.
Definition WakeWordDetector.h:58

audio_tools::WakeWordDetector::Template::frames
Vector< FrequencyFrame< N > > frames
Sequence of frequency frames for the wake word.
Definition WakeWordDetector.h:55

audio_tools::WakeWordDetector::Template::last_match_percent
float last_match_percent
Last computed match percent for this template.
Definition WakeWordDetector.h:59

audio_tools::WakeWordDetector::Template::threshold_percent
float threshold_percent
Definition WakeWordDetector.h:56