6 #include <TensorFlowLite.h>
9 #include "AudioTools/AudioOutput.h"
10 #include "AudioTools/Buffers.h"
11 #include "tensorflow/lite/c/common.h"
12 #include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
13 #include "tensorflow/lite/experimental/microfrontend/lib/frontend_util.h"
14 #include "tensorflow/lite/micro/all_ops_resolver.h"
15 #include "tensorflow/lite/micro/kernels/micro_ops.h"
16 #include "tensorflow/lite/micro/micro_error_reporter.h"
17 #include "tensorflow/lite/micro/micro_interpreter.h"
18 #include "tensorflow/lite/micro/micro_mutable_op_resolver.h"
19 #include "tensorflow/lite/micro/system_setup.h"
20 #include "tensorflow/lite/schema/schema_generated.h"
32 class TfLiteAudioStreamBase;
33 class TfLiteAbstractRecognizeCommands;
// Pure virtual reader hook: fills `data` with 16-bit samples and returns an int.
// NOTE(review): the override further down in this file names the second
// parameter `sampleCount`, so `len` is presumably a sample count — confirm.
44 virtual int read(int16_t*data,
int len) = 0;
// Pure virtual writer hook: consumes one 16-bit sample.
// NOTE(review): the meaning of the bool result is not visible here (presumably success) — confirm.
56 virtual bool write(
const int16_t sample) = 0;
67 virtual int Report(
const char* format, va_list args)
override {
68 int result = snprintf(msg, 200, format, args);
76 tflite::ErrorReporter* error_reporter = &my_error_reporter;
87 const unsigned char* model =
nullptr;
91 bool useAllOpsResolver =
false;
93 void (*respondToCommand)(
const char* found_command, uint8_t score,
94 bool is_new_command) =
nullptr;
99 int kTensorArenaSize = 10 * 1024;
109 int sample_rate = 16000;
117 int kFeatureSliceSize = 40;
118 int kFeatureSliceCount = 49;
119 int kFeatureSliceStrideMs = 20;
120 int kFeatureSliceDurationMs = 30;
123 int kSlicesToProcess = 2;
126 int32_t average_window_duration_ms = 1000;
127 uint8_t detection_threshold = 50;
128 int32_t suppression_ms = 1500;
129 int32_t minimum_count = 3;
132 float filterbank_lower_band_limit = 125.0;
133 float filterbank_upper_band_limit = 7500.0;
134 float noise_reduction_smoothing_bits = 10;
135 float noise_reduction_even_smoothing = 0.025;
136 float noise_reduction_odd_smoothing = 0.06;
137 float noise_reduction_min_signal_remaining = 0.05;
138 bool pcan_gain_control_enable_pcan = 1;
139 float pcan_gain_control_strength = 0.95;
140 float pcan_gain_control_offset = 80.0;
141 float pcan_gain_control_gain_bits = 21;
142 bool log_scale_enable_log = 1;
143 uint8_t log_scale_scale_shift = 6;
// Returns the number of categories stored in kCategoryCount.
152 int categoryCount() {
153 return kCategoryCount;
// Returns the total number of feature elements: elements per slice multiplied
// by the number of slices.
156 int featureElementCount() {
157 return kFeatureSliceSize * kFeatureSliceCount;
160 int audioSampleSize() {
161 return kFeatureSliceDurationMs * (sample_rate / 1000);
164 int strideSampleSize() {
165 return kFeatureSliceStrideMs * (sample_rate / 1000);
169 int kCategoryCount = 0;
170 const char** labels =
nullptr;
/// Converts a float into a quantized int8 value: value / scale + zero_point,
/// truncated toward zero.
/// @param value      real value to convert
/// @param scale      quantization scale; 0 together with a zero_point of 0
///                   means "not quantized" and the value is used unchanged
/// @param zero_point quantization zero point
/// @return the quantized value, saturated to the int8 range [-128, 127]
static int8_t quantize(float value, float scale, float zero_point) {
  const float raw = (scale == 0.0 && zero_point == 0)
                        ? value
                        : value / scale + zero_point;
  // Converting an out-of-range float to an integer type is undefined
  // behaviour, so saturate explicitly before the cast.
  if (raw >= 127.0f) {
    return 127;
  }
  // Written as a negated comparison so that NaN also ends up here.
  if (!(raw > -128.0f)) {
    return -128;
  }
  return static_cast<int8_t>(raw);
}
/// Converts a quantized int8 value back into a float.
/// @param value      quantized value
/// @param scale      quantization scale
/// @param zero_point quantization zero point
/// @return (value - zero_point) * scale, or the value itself when both scale
///         and zero_point are 0
static float dequantize(int8_t value, float scale, float zero_point) {
  const bool has_quantization = (scale != 0.0) || (zero_point != 0);
  if (!has_quantization) {
    return value;
  }
  const float centered = value - zero_point;
  return centered * scale;
}
192 static float dequantizeToNewRange(int8_t value,
float scale,
float zero_point,
float new_range){
193 float deq = (
static_cast<float>(value) - zero_point) * scale;
194 return clip(deq * new_range, new_range);
/// Limits a value to the symmetric interval [-range, range].
/// @param value value to limit
/// @param range positive limit
/// @return range if value > range, -range if value < -range, otherwise value
static float clip(float value, float range) {
  if (value > range) {
    return range;
  }
  // The former test `-value < -range` is equivalent to `value > range` and
  // therefore never limited negative values; compare against -range directly.
  if (value < -range) {
    return -range;
  }
  return value;
}
// Pure virtual: takes the latest result tensor and the current time in ms and
// reports the outcome through the three out parameters (command text, score,
// new-command flag). Returns a TfLiteStatus.
215 virtual TfLiteStatus getCommand(
const TfLiteTensor* latest_results,
const int32_t current_time_ms,
216 const char** found_command,uint8_t* score,
bool* is_new_command) = 0;
243 if (cfg.labels ==
nullptr) {
244 LOGE(
"config.labels not defined");
// Override of getCommand: records the call time, queues one Result row for the
// latest tensor, validates the tensor and finally returns the result of evaluate().
251 virtual TfLiteStatus getCommand(
const TfLiteTensor* latest_results,
252 const int32_t current_time_ms,
253 const char** found_command,
255 bool* is_new_command)
override {
// remember the call time and the time that passed since previous_time_ms
258 this->current_time_ms = current_time_ms;
259 this->time_since_last_top = current_time_ms - previous_time_ms;
// queue the time stamp, the category index and the int8 score found at that index
263 Result row(current_time_ms, idx, latest_results->data.int8[idx]);
264 result_queue.push_back(row);
// NOTE(review): validate() only runs after data.int8 has already been read
// above — confirm that this order is intended.
266 TfLiteStatus result =
validate(latest_results);
267 if (result!=kTfLiteOk){
270 return evaluate(found_command, score, is_new_command);
// Constructor of a result row: stores the time stamp and the category.
// NOTE(review): the handling of the `score` argument is not visible in this view.
280 Result(int32_t time_ms,
int category, int8_t score){
281 this->time_ms = time_ms;
282 this->category = category;
289 int previous_cateogory=-1;
290 int32_t current_time_ms=0;
291 int32_t previous_time_ms=0;
292 int32_t time_since_last_top=0;
297 uint8_t top_score = std::numeric_limits<uint8_t>::min();
299 if (score[j]>top_score){
308 return cfg.categoryCount();
313 while (result_queue[0].time_ms<limit){
314 result_queue.pop_front();
// Aggregates the queued results and reports the winning category through the
// out parameters: label text, score and whether it counts as a new command.
319 TfLiteStatus
evaluate(
const char** found_command, uint8_t* result_score,
bool* is_new_command) {
// add the score of every queued row to the total of the row's category
324 for (
int j=0;j<result_queue.size();j++){
325 int idx = result_queue[j].category;
326 totals[idx] += result_queue[j].score;
341 LOGE(
"Could not find max category")
// reported score: total of the winning category divided by its count;
// the label is looked up in cfg.labels
346 *result_score = totals[maxIdx] / count[maxIdx];
347 *found_command = cfg.labels[maxIdx];
// a command is flagged as new only if the category changed, the score is above
// cfg.detection_threshold and more than cfg.suppression_ms passed (time_since_last_top)
349 if (previous_cateogory!=maxIdx
350 && *result_score > cfg.detection_threshold
351 && time_since_last_top > cfg.suppression_ms){
// remember time and category of this detection
352 previous_time_ms = current_time_ms;
353 previous_cateogory = maxIdx;
354 *is_new_command =
true;
// NOTE(review): the branching around the following assignment is not visible
// in this view (presumably the else case).
356 *is_new_command =
false;
359 LOGD(
"Category: %s, score: %d, is_new: %d",*found_command, *result_score, *is_new_command);
// Checks the result tensor: shape, element type and time ordering relative to
// the queued results. Each failed check is logged.
365 TfLiteStatus
validate(
const TfLiteTensor* latest_results) {
// shape check: 2 dimensions with a first dimension of 1 (further conditions of
// this expression are not visible in this view)
366 if ((latest_results->dims->size != 2) ||
367 (latest_results->dims->data[0] != 1) ||
370 "The results for recognition should contain %d "
371 "elements, but there are "
372 "%d in an %d-dimensional shape",
374 (
int)latest_results->dims->size);
// only int8 tensors are accepted
378 if (latest_results->type != kTfLiteInt8) {
379 LOGE(
"The results for recognition should be int8 elements, but are %d",
380 (
int)latest_results->type);
// the current time stamp must not be older than the first entry in the queue
384 if ((!result_queue.empty()) &&
385 (current_time_ms < result_queue[0].time_ms)) {
386 LOGE(
"Results must be in increasing time order: timestamp %d < %d",
387 (
int)current_time_ms, (
int)result_queue[0].time_ms);
// Pure virtual interface methods.
// Assigns the interpreter to be used.
405 virtual void setInterpreter(tflite::MicroInterpreter* p_interpreter) = 0;
// Returns an int; NOTE(review): presumably the number of bytes that can be written — confirm.
408 virtual int availableToWrite() = 0;
// Writes `bytes` bytes of audio data starting at `audio`; returns a size_t.
411 virtual size_t write(
const uint8_t* audio,
size_t bytes)= 0;
// Gives access to the interpreter by reference.
412 virtual tflite::MicroInterpreter& interpreter()= 0;
432 if (p_buffer !=
nullptr)
delete p_buffer;
433 if (p_audio_samples !=
nullptr)
delete p_audio_samples;
439 this->parent = parent;
442 kMaxAudioSampleSize = cfg.audioSampleSize();
443 kStrideSampleSize = cfg.strideSampleSize();
444 kKeepSampleSize = kMaxAudioSampleSize - kStrideSampleSize;
446 if (!setup_recognizer()) {
447 LOGE(
"setup_recognizer");
452 TfLiteStatus init_status = initializeMicroFeatures();
453 if (init_status != kTfLiteOk) {
458 if (p_buffer ==
nullptr) {
460 LOGD(
"Allocating buffer for %d samples", kMaxAudioSampleSize);
464 if (p_feature_data ==
nullptr) {
465 p_feature_data =
new int8_t[cfg.featureElementCount()];
466 memset(p_feature_data, 0, cfg.featureElementCount());
470 if (p_audio_samples ==
nullptr) {
471 p_audio_samples =
new int16_t[kMaxAudioSampleSize];
472 memset(p_audio_samples, 0, kMaxAudioSampleSize *
sizeof(int16_t));
// Processes one sample. The conditions guarding the visible statements are not
// all shown in this view.
478 virtual bool write(int16_t sample) {
// advance the time by one slice stride
482 current_time += cfg.kFeatureSliceStrideMs;
// generate the features for the newest slice
486 int8_t* feature_buffer = addSlice();
// run the model once cfg.kSlicesToProcess slices were collected, then restart counting
487 if (total_slice_count >= cfg.kSlicesToProcess) {
488 processSlices(feature_buffer);
490 total_slice_count = 0;
498 TfLiteAudioStreamBase *parent=
nullptr;
499 int8_t* p_feature_data =
nullptr;
500 int16_t* p_audio_samples =
nullptr;
502 FrontendState g_micro_features_state;
503 FrontendConfig config;
504 int kMaxAudioSampleSize;
505 int kStrideSampleSize;
509 int32_t current_time = 0;
510 int16_t total_slice_count = 0;
512 virtual bool setup_recognizer() {
514 if (cfg.recognizeCommands ==
nullptr) {
515 static TfLiteMicroSpeechRecognizeCommands static_recognizer;
516 cfg.recognizeCommands = &static_recognizer;
518 return cfg.recognizeCommands->begin(cfg);
// Stores one sample in p_buffer, depending on the number of channels.
522 virtual bool write1(
const int16_t sample) {
// single channel: the sample is stored unchanged
523 if (cfg.channels == 1) {
524 p_buffer->
write(sample);
// NOTE(review): the condition for this path is not visible here (presumably
// more than one channel): half of the sample plus half of last_value is stored.
531 p_buffer->
write(((sample / 2) + (last_value / 2)));
// Shifts the feature buffer by one slice, reads one window of audio samples and
// generates the features for the newest slice. The visible return statement
// returns p_feature_data.
549 virtual int8_t* addSlice() {
// drop the oldest slice: move the remaining (kFeatureSliceCount - 1) slices to the start
552 memmove(p_feature_data, p_feature_data + cfg.kFeatureSliceSize,
553 (cfg.kFeatureSliceCount - 1) * cfg.kFeatureSliceSize);
// read one full window of samples from p_buffer
556 int audio_samples_size =
557 p_buffer->
readArray(p_audio_samples, kMaxAudioSampleSize);
// anything other than a full window is logged as an error
560 if (audio_samples_size != kMaxAudioSampleSize) {
561 LOGE(
"audio_samples_size=%d != kMaxAudioSampleSize=%d",
562 audio_samples_size, kMaxAudioSampleSize);
// put the samples behind the stride back into p_buffer (kKeepSampleSize of
// them) — presumably so that consecutive windows overlap; confirm
566 p_buffer->
writeArray(p_audio_samples + kStrideSampleSize, kKeepSampleSize);
// the newest slice occupies the last kFeatureSliceSize elements of the feature buffer
569 int8_t* new_slice_data =
570 p_feature_data + ((cfg.kFeatureSliceCount - 1) * cfg.kFeatureSliceSize);
571 size_t num_samples_read = 0;
572 if (generateMicroFeatures(p_audio_samples, audio_samples_size,
573 new_slice_data, cfg.kFeatureSliceSize,
574 &num_samples_read) != kTfLiteOk) {
575 LOGE(
"Error generateMicroFeatures");
578 return p_feature_data;
// Copies the features into the model input buffer of the parent, invokes the
// interpreter and hands the output tensor to the configured recognizer.
582 virtual bool processSlices(int8_t* feature_buffer) {
583 LOGI(
"->slices: %d", total_slice_count);
// featureElementCount() int8 values, i.e. that many bytes, are copied
585 memcpy(parent->modelInputBuffer(), feature_buffer, cfg.featureElementCount());
// run the model
588 TfLiteStatus invoke_status = parent->interpreter().Invoke();
589 if (invoke_status != kTfLiteOk) {
590 LOGE(
"Invoke failed");
// first output tensor of the model
595 TfLiteTensor* output = parent->interpreter().output(0);
// out parameters filled by getCommand()
598 const char* found_command =
nullptr;
600 bool is_new_command =
false;
602 TfLiteStatus process_status = cfg.recognizeCommands->getCommand(
603 output, current_time, &found_command, &score, &is_new_command);
604 if (process_status != kTfLiteOk) {
605 LOGE(
"TfLiteMicroSpeechRecognizeCommands::getCommand() failed");
617 for (
int i = 0; i < cfg.kFeatureSliceCount; i++) {
618 for (
int j = 0; j < cfg.kFeatureSliceSize; j++) {
619 Serial.print(p_feature_data[(i * cfg.kFeatureSliceSize) + j]);
624 Serial.println(
"------------");
// Copies the audio frontend settings from cfg into the FrontendConfig member
// and populates the frontend state from it; a failure is logged.
627 virtual TfLiteStatus initializeMicroFeatures() {
// window: duration and stride of one slice in ms
629 config.window.size_ms = cfg.kFeatureSliceDurationMs;
630 config.window.step_size_ms = cfg.kFeatureSliceStrideMs;
// filterbank: one channel per feature element of a slice
631 config.filterbank.num_channels = cfg.kFeatureSliceSize;
632 config.filterbank.lower_band_limit = cfg.filterbank_lower_band_limit;
633 config.filterbank.upper_band_limit = cfg.filterbank_upper_band_limit;
// noise reduction
634 config.noise_reduction.smoothing_bits = cfg.noise_reduction_smoothing_bits;
635 config.noise_reduction.even_smoothing = cfg.noise_reduction_even_smoothing;
636 config.noise_reduction.odd_smoothing = cfg.noise_reduction_odd_smoothing;
637 config.noise_reduction.min_signal_remaining = cfg.noise_reduction_min_signal_remaining;
// PCAN gain control
638 config.pcan_gain_control.enable_pcan = cfg.pcan_gain_control_enable_pcan;
639 config.pcan_gain_control.strength = cfg.pcan_gain_control_strength;
640 config.pcan_gain_control.offset = cfg.pcan_gain_control_offset ;
641 config.pcan_gain_control.gain_bits = cfg.pcan_gain_control_gain_bits;
// log scaling
642 config.log_scale.enable_log = cfg.log_scale_enable_log;
643 config.log_scale.scale_shift = cfg.log_scale_scale_shift;
// initialize g_micro_features_state from the configuration
644 if (!FrontendPopulateState(&config, &g_micro_features_state,
646 LOGE(
"frontendPopulateState() failed");
// Runs the audio frontend on `input` (input_size samples) and converts the
// frontend output values for `output`. The number of consumed samples is
// reported through num_samples_read by FrontendProcessSamples.
652 virtual TfLiteStatus generateMicroFeatures(
const int16_t* input,
653 int input_size, int8_t* output,
655 size_t* num_samples_read) {
657 const int16_t* frontend_input = input;
// process the samples with the state set up from the frontend configuration
660 FrontendOutput frontend_output = FrontendProcessSamples(
661 &g_micro_features_state, frontend_input, input_size, num_samples_read);
// the frontend must deliver exactly output_size values
664 if (output_size != frontend_output.size) {
665 LOGE(
"output_size=%d, frontend_output.size=%d", output_size,
666 frontend_output.size);
// rescale every frontend value
679 for (
size_t i = 0; i < frontend_output.size; ++i) {
// value_div evaluates to 666: 25.6f * 26.0f plus 0.5f, truncated to int32_t
693 constexpr int32_t value_scale = 256;
694 constexpr int32_t value_div =
695 static_cast<int32_t
>((25.6f * 26.0f) + 0.5f);
// multiply by value_scale and add half of value_div before the division
// (the divisor line is not visible in this view; presumably value_div, for rounding)
697 ((frontend_output.values[i] * value_scale) + (value_div / 2)) /
714 bool is_new_command) {
715 if (cfg.respondToCommand !=
nullptr) {
716 cfg.respondToCommand(found_command, score, is_new_command);
719 if (is_new_command) {
721 snprintf(buffer, 80,
"Result: %s, score: %d, is_new: %s", found_command,
722 score, is_new_command ?
"true" :
"false");
723 Serial.println(buffer);
738 this->increment = increment;
744 p_interpreter = &parent->interpreter();
745 input = p_interpreter->input(0);
746 output = p_interpreter->output(0);
747 channels = parent->
config().channels;
// Fills `data` with samples generated by the model: one model invocation per
// frame, stepping through the buffer in units of `channels`.
751 virtual int read(int16_t*data,
int sampleCount)
override {
753 float two_pi = 2 * PI;
754 for (
int j=0; j<sampleCount; j+=channels){
// quantize the current input value into the first element of the input tensor
756 input->data.int8[0] = TfLiteQuantizer::quantize(actX,input->params.scale, input->params.zero_point);
// run the model
759 TfLiteStatus invoke_status = p_interpreter->Invoke();
762 if(kTfLiteOk!= invoke_status){
763 LOGE(
"invoke_status not ok");
// only int8 output tensors are supported
766 if(kTfLiteInt8 != output->type){
767 LOGE(
"Output type is not kTfLiteInt8");
// dequantize the first output element, scaled to `range`, into the first channel of the frame
772 data[j] = TfLiteQuantizer::dequantizeToNewRange(output->data.int8[0], output->params.scale, output->params.zero_point, range);
774 LOGD(
"%f->%d / %d->%d",actX, input->data.int8[0], output->data.int8[0], data[j]);
// remaining channels of the frame (loop body is not visible in this view)
775 for (
int i=1;i<channels;i++){
777 LOGD(
"generate data for channels");
793 TfLiteTensor* input =
nullptr;
794 TfLiteTensor* output =
nullptr;
795 tflite::MicroInterpreter* p_interpreter =
nullptr;
808 if (p_tensor_arena !=
nullptr)
delete[] p_tensor_arena;
815 this->p_interpreter = p_interpreter;
830 p_tensor_arena =
new uint8_t[cfg.kTensorArenaSize];
832 if (cfg.categoryCount()>0){
835 if (!setupWriter()) {
840 LOGW(
"categoryCount=%d", cfg.categoryCount());
845 if (!setModel(cfg.model)) {
849 if (!setupInterpreter()) {
854 LOGI(
"AllocateTensors");
855 TfLiteStatus allocate_status = p_interpreter->AllocateTensors();
856 if (allocate_status != kTfLiteOk) {
857 LOGE(
"AllocateTensors() failed");
863 p_tensor = p_interpreter->input(0);
864 if (cfg.categoryCount()>0){
865 if ((p_tensor->dims->size != 2) || (p_tensor->dims->data[0] != 1) ||
866 (p_tensor->dims->data[1] !=
867 (cfg.kFeatureSliceCount * cfg.kFeatureSliceSize)) ||
868 (p_tensor->type != kTfLiteInt8)) {
869 LOGE(
"Bad input tensor parameters in model");
875 p_tensor_buffer = p_tensor->data.int8;
876 if (p_tensor_buffer ==
nullptr) {
877 LOGE(
"p_tensor_buffer is null");
882 if (cfg.reader!=
nullptr){
883 cfg.reader->begin(
this);
896 virtual size_t write(
const uint8_t* audio,
size_t bytes)
override {
898 if (cfg.writer==
nullptr){
899 LOGE(
"cfg.output is null");
902 int16_t* samples = (int16_t*)audio;
903 int16_t sample_count = bytes / 2;
904 for (
int j = 0; j < sample_count; j++) {
905 cfg.writer->write(samples[j]);
911 virtual int available()
override {
return cfg.reader !=
nullptr ? DEFAULT_BUFFER_SIZE : 0; }
// Reads audio data from cfg.reader if one is configured. The byte length is
// converted into a number of int16_t samples for the reader and the reader's
// result is converted back by multiplying with sizeof(int16_t).
914 virtual size_t readBytes(uint8_t *data,
size_t len)
override {
916 if (cfg.reader!=
nullptr){
// note: the (int) cast binds to `len` only; the division by sizeof happens afterwards
917 return cfg.reader->read((int16_t*)data, (
int) len/
sizeof(int16_t)) *
sizeof(int16_t);
925 return *p_interpreter;
935 return p_tensor_buffer;
939 const tflite::Model* p_model =
nullptr;
940 tflite::MicroInterpreter* p_interpreter =
nullptr;
941 TfLiteTensor* p_tensor =
nullptr;
942 bool is_setup =
false;
947 uint8_t* p_tensor_arena =
nullptr;
948 int8_t* p_tensor_buffer =
nullptr;
// Obtains the model from the given buffer via tflite::GetModel and compares its
// schema version with TFLITE_SCHEMA_VERSION; a mismatch is reported with both versions.
950 virtual bool setModel(
const unsigned char* model) {
952 p_model = tflite::GetModel(model);
953 if (p_model->version() != TFLITE_SCHEMA_VERSION) {
955 "Model provided is schema version %d not equal "
956 "to supported version %d.",
957 p_model->version(), TFLITE_SCHEMA_VERSION);
963 virtual bool setupWriter() {
964 if (cfg.writer ==
nullptr) {
965 static TfLiteMicroSpeachWriter writer;
966 cfg.writer = &writer;
968 return cfg.writer->begin(
this);
977 virtual bool setupInterpreter() {
978 if (p_interpreter ==
nullptr) {
980 if (cfg.useAllOpsResolver) {
981 tflite::AllOpsResolver resolver;
982 static tflite::MicroInterpreter static_interpreter(
983 p_model, resolver, p_tensor_arena, cfg.kTensorArenaSize,
985 p_interpreter = &static_interpreter;
988 static tflite::MicroMutableOpResolver<4> micro_op_resolver(
990 if (micro_op_resolver.AddDepthwiseConv2D() != kTfLiteOk) {
993 if (micro_op_resolver.AddFullyConnected() != kTfLiteOk) {
996 if (micro_op_resolver.AddSoftmax() != kTfLiteOk) {
999 if (micro_op_resolver.AddReshape() != kTfLiteOk) {
1003 static tflite::MicroInterpreter static_interpreter(
1004 p_model, micro_op_resolver, p_tensor_arena, cfg.kTensorArenaSize,
1006 p_interpreter = &static_interpreter;