7#include <TensorFlowLite.h>
10#include "AudioTools/CoreAudio/BaseStream.h"
11#include "AudioTools/CoreAudio/AudioOutput.h"
12#include "AudioTools/CoreAudio/Buffers.h"
13#include "tensorflow/lite/c/common.h"
14#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
15#include "tensorflow/lite/experimental/microfrontend/lib/frontend_util.h"
16#include "tensorflow/lite/micro/all_ops_resolver.h"
17#include "tensorflow/lite/micro/kernels/micro_ops.h"
18#include "tensorflow/lite/micro/micro_interpreter.h"
19#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"
20#include "tensorflow/lite/micro/system_setup.h"
21#include "tensorflow/lite/schema/schema_generated.h"
33class TfLiteAudioStreamBase;
34class TfLiteAbstractRecognizeCommands;
/// Pure virtual read hook: implementations receive a buffer for int16_t
/// samples together with a length and return an int.
/// NOTE(review): the stream's readBytes() multiplies the result by
/// sizeof(int16_t), so the return value is presumably a sample count - confirm.
45 virtual int read(int16_t*data,
int len) = 0;
/// Pure virtual sink for one 16 bit audio sample; the stream calls it once per
/// sample of the data it receives. The meaning of the bool result is defined
/// by the implementation.
57 virtual bool write(
const int16_t sample) = 0;
/// Model data that is passed to setModel().
69 const unsigned char* model =
nullptr;
/// When true the interpreter is created with tflite::AllOpsResolver; otherwise
/// a MicroMutableOpResolver with DepthwiseConv2D, FullyConnected, Softmax and
/// Reshape is used.
73 bool useAllOpsResolver =
false;
/// Optional callback that receives the found command, its score and the
/// is_new_command flag; it is only invoked when it is not null.
75 void (*respondToCommand)(
const char* found_command, uint8_t score,
76 bool is_new_command) =
nullptr;
/// Size in bytes of the tensor arena that is allocated for the interpreter.
81 size_t kTensorArenaSize = 10 * 1024;
/// Audio sample rate; audioSampleSize() and strideSampleSize() use it to
/// convert the slice duration and stride from ms into samples.
91 int sample_rate = 16000;
/// Number of values in one feature slice (also used as the number of
/// filterbank channels of the frontend).
99 int kFeatureSliceSize = 40;
/// Number of feature slices that are kept in the feature buffer.
100 int kFeatureSliceCount = 49;
/// Step between two slices in ms (frontend window step size).
101 int kFeatureSliceStrideMs = 20;
/// Length of one slice in ms (frontend window size).
102 int kFeatureSliceDurationMs = 30;
/// Number of new slices that are collected before the slices are processed.
105 int kSlicesToProcess = 2;
/// NOTE(review): presumably the time window over which results are averaged -
/// its use is not visible here, confirm.
108 int32_t average_window_duration_ms = 1000;
/// A command is only reported as new when its score is above this value.
109 uint8_t detection_threshold = 50;
/// A command is only reported as new when more than this many ms have passed
/// since the previous new command.
110 int32_t suppression_ms = 1500;
/// NOTE(review): presumably the minimum number of results needed for an
/// evaluation - its use is not visible here, confirm.
111 int32_t minimum_count = 3;
/// The values below are copied unchanged into the FrontendConfig by
/// initializeMicroFeatures().
114 float filterbank_lower_band_limit = 125.0;
115 float filterbank_upper_band_limit = 7500.0;
116 float noise_reduction_smoothing_bits = 10;
117 float noise_reduction_even_smoothing = 0.025;
118 float noise_reduction_odd_smoothing = 0.06;
119 float noise_reduction_min_signal_remaining = 0.05;
120 bool pcan_gain_control_enable_pcan = 1;
121 float pcan_gain_control_strength = 0.95;
122 float pcan_gain_control_offset = 80.0;
123 float pcan_gain_control_gain_bits = 21;
124 bool log_scale_enable_log = 1;
125 uint8_t log_scale_scale_shift = 6;
134 int categoryCount() {
135 return kCategoryCount;
138 int featureElementCount() {
139 return kFeatureSliceSize * kFeatureSliceCount;
142 int audioSampleSize() {
143 return kFeatureSliceDurationMs * (sample_rate / 1000);
146 int strideSampleSize() {
147 return kFeatureSliceStrideMs * (sample_rate / 1000);
/// Number of categories; this is the value returned by categoryCount().
151 int kCategoryCount = 0;
/// Category labels: the entry at the index of the winning category is reported
/// as the found command. The recognizer logs an error when this is null.
152 const char** labels =
nullptr;
/// Converts a float into a quantized int8 value: value / scale + zero_point.
/// @param value      the float value to convert
/// @param scale      quantization scale; 0 means "no quantization" and the
///                   value is passed through (this avoids a division by zero)
/// @param zero_point quantization zero point
/// @return the quantized value, truncated toward zero and saturated to the
///         int8 range [-128, 127]
static int8_t quantize(float value, float scale, float zero_point) {
  const float scaled = (scale == 0.0f) ? value : value / scale + zero_point;
  // Converting an out of range float to int8_t is undefined behaviour, so the
  // result is saturated explicitly before the conversion.
  if (scaled >= 127.0f) return 127;
  if (scaled > -128.0f) return static_cast<int8_t>(scaled);
  return -128;  // below the int8 range (a NaN input also ends up here)
}
/// Converts a quantized int8 value back into a float.
/// When both scale and zero_point are 0 the value is returned unchanged,
/// otherwise the result is (value - zero_point) * scale.
static float dequantize(int8_t value, float scale, float zero_point) {
  const float raw = static_cast<float>(value);
  const bool unquantized = (scale == 0.0f) && (zero_point == 0.0f);
  if (unquantized) {
    return raw;
  }
  const float shifted = raw - zero_point;
  return shifted * scale;
}
174 static float dequantizeToNewRange(int8_t value,
float scale,
float zero_point,
float new_range){
175 float deq = (
static_cast<float>(value) - zero_point) * scale;
176 return clip(deq * new_range, new_range);
/// Limits value to the interval [-range, range].
/// @param value the value to limit
/// @param range the maximum magnitude; expected to be non-negative
/// @return value, or +range / -range when value lies outside of the interval
static float clip(float value, float range) {
  if (value >= 0.0f) {
    return value > range ? range : value;
  }
  // Compare the value itself with the lower bound: the former test
  // (-value < -range) could never be true for a negative value, so negative
  // values were passed through unclipped.
  return value < -range ? -range : value;
}
/// Pure virtual evaluation hook: receives the latest result tensor and the
/// current time in ms and reports the outcome through the found_command,
/// score and is_new_command output pointers. The caller treats any status
/// other than kTfLiteOk as a failure.
197 virtual TfLiteStatus getCommand(
const TfLiteTensor* latest_results,
const int32_t current_time_ms,
198 const char** found_command,uint8_t* score,
bool* is_new_command) = 0;
225 if (cfg.labels ==
nullptr) {
226 LOGE(
"config.labels not defined");
233 virtual TfLiteStatus getCommand(
const TfLiteTensor* latest_results,
234 const int32_t current_time_ms,
235 const char** found_command,
237 bool* is_new_command)
override {
240 this->current_time_ms = current_time_ms;
241 this->time_since_last_top = current_time_ms - previous_time_ms;
245 Result row(current_time_ms, idx, latest_results->data.int8[idx]);
246 result_queue.push_back(row);
248 TfLiteStatus result =
validate(latest_results);
249 if (result!=kTfLiteOk){
252 return evaluate(found_command, score, is_new_command);
262 Result(int32_t time_ms,
int category, int8_t score){
263 this->time_ms = time_ms;
264 this->category = category;
270 Vector <Result> result_queue;
271 int previous_cateogory=-1;
272 int32_t current_time_ms=0;
273 int32_t previous_time_ms=0;
274 int32_t time_since_last_top=0;
279 uint8_t top_score = std::numeric_limits<uint8_t>::min();
281 if (score[j]>top_score){
290 return cfg.categoryCount();
295 if (result_queue.empty())
return;
296 while (result_queue[0].time_ms<limit){
297 result_queue.pop_front();
302 TfLiteStatus
evaluate(
const char** found_command, uint8_t* result_score,
bool* is_new_command) {
307 for (
int j=0;j<result_queue.size();j++){
308 int idx = result_queue[j].category;
309 totals[idx] += result_queue[j].score;
324 LOGE(
"Could not find max category")
329 *result_score = totals[maxIdx] / count[maxIdx];
330 *found_command = cfg.labels[maxIdx];
332 if (previous_cateogory!=maxIdx
333 && *result_score > cfg.detection_threshold
334 && time_since_last_top > cfg.suppression_ms){
335 previous_time_ms = current_time_ms;
336 previous_cateogory = maxIdx;
337 *is_new_command =
true;
339 *is_new_command =
false;
342 LOGD(
"Category: %s, score: %d, is_new: %d",*found_command, *result_score, *is_new_command);
302 TfLiteStatus
evaluate(
const char** found_command, uint8_t* result_score,
bool* is_new_command) {
…}
348 TfLiteStatus
validate(
const TfLiteTensor* latest_results) {
349 if ((latest_results->dims->size != 2) ||
350 (latest_results->dims->data[0] != 1) ||
353 "The results for recognition should contain %d "
354 "elements, but there are "
355 "%d in an %d-dimensional shape",
357 (
int)latest_results->dims->size);
361 if (latest_results->type != kTfLiteInt8) {
362 LOGE(
"The results for recognition should be int8 elements, but are %d",
363 (
int)latest_results->type);
367 if ((!result_queue.empty()) &&
368 (current_time_ms < result_queue[0].time_ms)) {
369 LOGE(
"Results must be in increasing time order: timestamp %d < %d",
370 (
int)current_time_ms, (
int)result_queue[0].time_ms);
348 TfLiteStatus
validate(
const TfLiteTensor* latest_results) {
…}
/// Hands over the interpreter that the implementation should use.
388 virtual void setInterpreter(tflite::MicroInterpreter* p_interpreter) = 0;
/// Pure virtual; NOTE(review): presumably the number of bytes that can be
/// written - no implementation is visible here, confirm.
391 virtual int availableToWrite() = 0;
/// Writes len bytes of audio data and returns a size_t.
/// NOTE(review): presumably the number of bytes that were processed - confirm.
394 virtual size_t write(
const uint8_t* data,
size_t len)= 0;
/// Provides access to the interpreter, e.g. to invoke the model and to read
/// its input and output tensors.
395 virtual tflite::MicroInterpreter& interpreter()= 0;
415 if (p_buffer !=
nullptr)
delete p_buffer;
416 if (p_audio_samples !=
nullptr)
delete p_audio_samples;
422 this->parent = parent;
425 kMaxAudioSampleSize = cfg.audioSampleSize();
426 kStrideSampleSize = cfg.strideSampleSize();
427 kKeepSampleSize = kMaxAudioSampleSize - kStrideSampleSize;
429 if (!setup_recognizer()) {
430 LOGE(
"setup_recognizer");
435 TfLiteStatus init_status = initializeMicroFeatures();
436 if (init_status != kTfLiteOk) {
441 if (p_buffer ==
nullptr) {
443 LOGD(
"Allocating buffer for %d samples", kMaxAudioSampleSize);
447 if (p_feature_data ==
nullptr) {
448 p_feature_data =
new int8_t[cfg.featureElementCount()];
449 memset(p_feature_data, 0, cfg.featureElementCount());
453 if (p_audio_samples ==
nullptr) {
454 p_audio_samples =
new int16_t[kMaxAudioSampleSize];
455 memset(p_audio_samples, 0, kMaxAudioSampleSize *
sizeof(int16_t));
461 virtual bool write(int16_t sample) {
465 current_time += cfg.kFeatureSliceStrideMs;
469 int8_t* feature_buffer = addSlice();
470 if (total_slice_count >= cfg.kSlicesToProcess) {
471 processSlices(feature_buffer);
473 total_slice_count = 0;
481 TfLiteAudioStreamBase *parent=
nullptr;
482 int8_t* p_feature_data =
nullptr;
483 int16_t* p_audio_samples =
nullptr;
485 FrontendState g_micro_features_state;
486 FrontendConfig config;
487 int kMaxAudioSampleSize;
488 int kStrideSampleSize;
492 int32_t current_time = 0;
493 int16_t total_slice_count = 0;
495 virtual bool setup_recognizer() {
497 if (cfg.recognizeCommands ==
nullptr) {
498 static TfLiteMicroSpeechRecognizeCommands static_recognizer;
499 cfg.recognizeCommands = &static_recognizer;
501 return cfg.recognizeCommands->begin(cfg);
505 virtual bool write1(
const int16_t sample) {
506 if (cfg.channels == 1) {
507 p_buffer->
write(sample);
514 p_buffer->
write(((sample / 2) + (last_value / 2)));
532 virtual int8_t* addSlice() {
535 memmove(p_feature_data, p_feature_data + cfg.kFeatureSliceSize,
536 (cfg.kFeatureSliceCount - 1) * cfg.kFeatureSliceSize);
539 int audio_samples_size =
540 p_buffer->
readArray(p_audio_samples, kMaxAudioSampleSize);
543 if (audio_samples_size != kMaxAudioSampleSize) {
544 LOGE(
"audio_samples_size=%d != kMaxAudioSampleSize=%d",
545 audio_samples_size, kMaxAudioSampleSize);
549 p_buffer->
writeArray(p_audio_samples + kStrideSampleSize, kKeepSampleSize);
552 int8_t* new_slice_data =
553 p_feature_data + ((cfg.kFeatureSliceCount - 1) * cfg.kFeatureSliceSize);
554 size_t num_samples_read = 0;
555 if (generateMicroFeatures(p_audio_samples, audio_samples_size,
556 new_slice_data, cfg.kFeatureSliceSize,
557 &num_samples_read) != kTfLiteOk) {
558 LOGE(
"Error generateMicroFeatures");
561 return p_feature_data;
565 virtual bool processSlices(int8_t* feature_buffer) {
566 LOGI(
"->slices: %d", total_slice_count);
568 memcpy(parent->modelInputBuffer(), feature_buffer, cfg.featureElementCount());
571 TfLiteStatus invoke_status = parent->interpreter().Invoke();
572 if (invoke_status != kTfLiteOk) {
573 LOGE(
"Invoke failed");
578 TfLiteTensor* output = parent->interpreter().output(0);
581 const char* found_command =
nullptr;
583 bool is_new_command =
false;
585 TfLiteStatus process_status = cfg.recognizeCommands->getCommand(
586 output, current_time, &found_command, &score, &is_new_command);
587 if (process_status != kTfLiteOk) {
588 LOGE(
"TfLiteMicroSpeechRecognizeCommands::getCommand() failed");
600 for (
int i = 0; i < cfg.kFeatureSliceCount; i++) {
601 for (
int j = 0; j < cfg.kFeatureSliceSize; j++) {
602 Serial.print(p_feature_data[(i * cfg.kFeatureSliceSize) + j]);
607 Serial.println(
"------------");
610 virtual TfLiteStatus initializeMicroFeatures() {
612 config.window.size_ms = cfg.kFeatureSliceDurationMs;
613 config.window.step_size_ms = cfg.kFeatureSliceStrideMs;
614 config.filterbank.num_channels = cfg.kFeatureSliceSize;
615 config.filterbank.lower_band_limit = cfg.filterbank_lower_band_limit;
616 config.filterbank.upper_band_limit = cfg.filterbank_upper_band_limit;
617 config.noise_reduction.smoothing_bits = cfg.noise_reduction_smoothing_bits;
618 config.noise_reduction.even_smoothing = cfg.noise_reduction_even_smoothing;
619 config.noise_reduction.odd_smoothing = cfg.noise_reduction_odd_smoothing;
620 config.noise_reduction.min_signal_remaining = cfg.noise_reduction_min_signal_remaining;
621 config.pcan_gain_control.enable_pcan = cfg.pcan_gain_control_enable_pcan;
622 config.pcan_gain_control.strength = cfg.pcan_gain_control_strength;
623 config.pcan_gain_control.offset = cfg.pcan_gain_control_offset ;
624 config.pcan_gain_control.gain_bits = cfg.pcan_gain_control_gain_bits;
625 config.log_scale.enable_log = cfg.log_scale_enable_log;
626 config.log_scale.scale_shift = cfg.log_scale_scale_shift;
627 if (!FrontendPopulateState(&config, &g_micro_features_state,
629 LOGE(
"frontendPopulateState() failed");
635 virtual TfLiteStatus generateMicroFeatures(
const int16_t* input,
636 int input_size, int8_t* output,
638 size_t* num_samples_read) {
640 const int16_t* frontend_input = input;
643 FrontendOutput frontend_output = FrontendProcessSamples(
644 &g_micro_features_state, frontend_input, input_size, num_samples_read);
647 if (output_size != frontend_output.size) {
648 LOGE(
"output_size=%d, frontend_output.size=%d", output_size,
649 frontend_output.size);
662 for (
size_t i = 0; i < frontend_output.size; ++i) {
676 constexpr int32_t value_scale = 256;
677 constexpr int32_t value_div =
678 static_cast<int32_t
>((25.6f * 26.0f) + 0.5f);
680 ((frontend_output.values[i] * value_scale) + (value_div / 2)) /
697 bool is_new_command) {
698 if (cfg.respondToCommand !=
nullptr) {
699 cfg.respondToCommand(found_command, score, is_new_command);
702 if (is_new_command) {
704 snprintf(buffer, 80,
"Result: %s, score: %d, is_new: %s", found_command,
705 score, is_new_command ?
"true" :
"false");
706 Serial.println(buffer);
721 this->increment = increment;
727 p_interpreter = &parent->interpreter();
728 input = p_interpreter->input(0);
729 output = p_interpreter->output(0);
730 channels = parent->
config().channels;
734 virtual int read(int16_t*data,
int sampleCount)
override {
736 float two_pi = 2 * PI;
737 for (
int j=0; j<sampleCount; j+=channels){
739 input->data.int8[0] = TfLiteQuantizer::quantize(actX,input->params.scale, input->params.zero_point);
742 TfLiteStatus invoke_status = p_interpreter->Invoke();
745 if(kTfLiteOk!= invoke_status){
746 LOGE(
"invoke_status not ok");
749 if(kTfLiteInt8 != output->type){
750 LOGE(
"Output type is not kTfLiteInt8");
755 data[j] = TfLiteQuantizer::dequantizeToNewRange(output->data.int8[0], output->params.scale, output->params.zero_point, range);
757 LOGD(
"%f->%d / %d->%d",actX, input->data.int8[0], output->data.int8[0], data[j]);
758 for (
int i=1;i<channels;i++){
760 LOGD(
"generate data for channels");
776 TfLiteTensor* input =
nullptr;
777 TfLiteTensor* output =
nullptr;
778 tflite::MicroInterpreter* p_interpreter =
nullptr;
791 if (p_tensor_arena !=
nullptr)
delete[] p_tensor_arena;
798 this->p_interpreter = p_interpreter;
813 p_tensor_arena =
new uint8_t[cfg.kTensorArenaSize];
815 if (cfg.categoryCount()>0){
818 if (!setupWriter()) {
823 LOGW(
"categoryCount=%d", cfg.categoryCount());
828 if (!setModel(cfg.model)) {
832 if (!setupInterpreter()) {
837 LOGI(
"AllocateTensors");
838 TfLiteStatus allocate_status = p_interpreter->AllocateTensors();
839 if (allocate_status != kTfLiteOk) {
840 LOGE(
"AllocateTensors() failed");
846 p_tensor = p_interpreter->input(0);
847 if (cfg.categoryCount()>0){
848 if ((p_tensor->dims->size != 2) || (p_tensor->dims->data[0] != 1) ||
849 (p_tensor->dims->data[1] !=
850 (cfg.kFeatureSliceCount * cfg.kFeatureSliceSize)) ||
851 (p_tensor->type != kTfLiteInt8)) {
852 LOGE(
"Bad input tensor parameters in model");
858 p_tensor_buffer = p_tensor->data.int8;
859 if (p_tensor_buffer ==
nullptr) {
860 LOGE(
"p_tensor_buffer is null");
865 if (cfg.reader!=
nullptr){
866 cfg.reader->begin(
this);
879 virtual size_t write(
const uint8_t* data,
size_t len)
override {
881 if (cfg.writer==
nullptr){
882 LOGE(
"cfg.output is null");
885 int16_t* samples = (int16_t*)data;
886 int16_t sample_count = len / 2;
887 for (
int j = 0; j < sample_count; j++) {
888 cfg.writer->write(samples[j]);
879 virtual size_t write(
const uint8_t* data,
size_t len)
override {
…}
894 virtual int available()
override {
return cfg.reader !=
nullptr ? DEFAULT_BUFFER_SIZE : 0; }
897 virtual size_t readBytes(uint8_t *data,
size_t len)
override {
899 if (cfg.reader!=
nullptr){
900 return cfg.reader->read((int16_t*)data, (
int) len/
sizeof(int16_t)) *
sizeof(int16_t);
897 virtual size_t readBytes(uint8_t *data,
size_t len)
override {
…}
908 return *p_interpreter;
918 return p_tensor_buffer;
922 const tflite::Model* p_model =
nullptr;
923 tflite::MicroInterpreter* p_interpreter =
nullptr;
924 TfLiteTensor* p_tensor =
nullptr;
925 bool is_setup =
false;
930 uint8_t* p_tensor_arena =
nullptr;
931 int8_t* p_tensor_buffer =
nullptr;
933 virtual bool setModel(
const unsigned char* model) {
935 p_model = tflite::GetModel(model);
936 if (p_model->version() != TFLITE_SCHEMA_VERSION) {
938 "Model provided is schema version %d not equal "
939 "to supported version %d.",
940 p_model->version(), TFLITE_SCHEMA_VERSION);
946 virtual bool setupWriter() {
947 if (cfg.writer ==
nullptr) {
948 static TfLiteMicroSpeachWriter writer;
949 cfg.writer = &writer;
951 return cfg.writer->begin(
this);
960 virtual bool setupInterpreter() {
961 if (p_interpreter ==
nullptr) {
963 if (cfg.useAllOpsResolver) {
964 tflite::AllOpsResolver resolver;
965 static tflite::MicroInterpreter static_interpreter{
966 p_model, resolver, p_tensor_arena, cfg.kTensorArenaSize};
967 p_interpreter = &static_interpreter;
970 static tflite::MicroMutableOpResolver<4> micro_op_resolver{};
971 if (micro_op_resolver.AddDepthwiseConv2D() != kTfLiteOk) {
974 if (micro_op_resolver.AddFullyConnected() != kTfLiteOk) {
977 if (micro_op_resolver.AddSoftmax() != kTfLiteOk) {
980 if (micro_op_resolver.AddReshape() != kTfLiteOk) {
984 static tflite::MicroInterpreter static_interpreter{
985 p_model, micro_op_resolver, p_tensor_arena, cfg.kTensorArenaSize};
986 p_interpreter = &static_interpreter;