|
|
|
|
|
|
|
|
|
|
|
#include <stdio.h> |
|
#include <stdint.h> |
|
#include <stdlib.h> |
|
#include <time.h> |
|
#include <inttypes.h> |
|
#include <string.h> |
|
#ifdef _WIN32 |
|
#include <windows.h> |
|
#endif |
|
|
|
#include "ten_vad.h" |
|
|
|
#if defined(__APPLE__) |
|
#include <TargetConditionals.h> |
|
#if TARGET_OS_IPHONE |
|
#include "sample_array.h" |
|
#endif |
|
#endif |
|
|
|
/* Number of 16-bit samples in one VAD frame. The same value is used as the
 * stride between consecutive frames and as the frame length handed to the
 * detector. */
const int hop_size = 256;
|
|
|
/**
 * Read a monotonic clock and return its value in milliseconds.
 *
 * The result is only meaningful as the difference between two calls. Windows
 * builds read the performance counter; every other build reads
 * CLOCK_MONOTONIC.
 *
 * @return Timestamp in milliseconds, or 0 if the underlying clock could not
 *         be read.
 */
uint64_t get_timestamp_ms(void)
{
#ifdef _WIN32
    LARGE_INTEGER frequency;
    LARGE_INTEGER counter;

    if (!QueryPerformanceFrequency(&frequency) ||
        !QueryPerformanceCounter(&counter) ||
        frequency.QuadPart <= 0)
    {
        return 0;
    }

    /* Convert whole seconds and the sub-second remainder separately so the
     * multiplication by 1000 cannot overflow for large counter values. The
     * result equals counter * 1000 / frequency. */
    const uint64_t ticks = (uint64_t)counter.QuadPart;
    const uint64_t ticks_per_sec = (uint64_t)frequency.QuadPart;
    return (ticks / ticks_per_sec) * 1000u +
           (ticks % ticks_per_sec) * 1000u / ticks_per_sec;
#else
    struct timespec ts;

    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
    {
        return 0;
    }

    /* Widen before multiplying: where time_t is 32 bits wide, tv_sec * 1000
     * would overflow after roughly 24 days. */
    return (uint64_t)ts.tv_sec * 1000u + (uint64_t)ts.tv_nsec / 1000000u;
#endif
}
|
|
|
|
|
/* File-layout structures for RIFF parsing. Packing is forced to one byte so
 * each struct can be filled with a single fread() in read_wav_file(). */
#pragma pack(push, 1)

/* The 12 bytes that open a RIFF container. */
typedef struct {
    char     chunk_id[4];   /* container tag; checked against "RIFF" */
    uint32_t chunk_size;    /* size field following the tag (read, not used) */
    char     format[4];     /* form type; checked against "WAVE" */
} riff_header_t;

/* The 8-byte header in front of every sub-chunk. */
typedef struct {
    char     id[4];         /* chunk tag, e.g. "fmt " or "data" */
    uint32_t size;          /* payload length in bytes */
} chunk_header_t;

#pragma pack(pop)
|
|
|
|
|
/* Result of read_wav_file(): the six leading fields of the "fmt " chunk plus
 * the location of the "data" chunk payload. */
typedef struct {
    uint16_t audio_format;     /* fmt chunk, bytes 0-1 */
    uint16_t num_channels;     /* fmt chunk, bytes 2-3 */
    uint32_t sample_rate;      /* fmt chunk, bytes 4-7 */
    uint32_t byte_rate;        /* fmt chunk, bytes 8-11 */
    uint16_t block_align;      /* fmt chunk, bytes 12-13 */
    uint16_t bits_per_sample;  /* fmt chunk, bytes 14-15 */
    uint32_t data_size;        /* size field of the "data" chunk header */
    long     data_offset;      /* file position of the first payload byte */
} wav_info_t;
|
|
|
/* Parse the RIFF/WAVE chunks of fp into *info. Returns 0 on success and -1 on
 * failure; defined near the end of this file. */
int read_wav_file(FILE *fp, wav_info_t *info);
|
|
|
int vad_process(int16_t *input_buf, uint32_t frame_num, |
|
float *out_probs, int32_t *out_flags, |
|
float *use_time) |
|
{ |
|
printf("tenvadsrc version: %s\n", ten_vad_get_version()); |
|
void *ten_vad_handle = NULL; |
|
float voice_threshold = 0.5f; |
|
ten_vad_create(&ten_vad_handle, hop_size, voice_threshold); |
|
|
|
uint64_t start = get_timestamp_ms(); |
|
for (int i = 0; i < frame_num; ++i) |
|
{ |
|
int16_t *audio_data = input_buf + i * hop_size; |
|
ten_vad_process(ten_vad_handle, audio_data, hop_size, |
|
&out_probs[i], &out_flags[i]); |
|
printf("[%d] %0.6f, %d\n", i, out_probs[i], out_flags[i]); |
|
} |
|
uint64_t end = get_timestamp_ms(); |
|
*use_time = (float)(end - start); |
|
|
|
ten_vad_destroy(&ten_vad_handle); |
|
ten_vad_handle = NULL; |
|
return 0; |
|
} |
|
|
|
int test_with_wav(int argc, char *argv[]) |
|
{ |
|
if (argc < 3) |
|
{ |
|
printf("Warning: Test.exe input.wav output.txt\n"); |
|
return 0; |
|
} |
|
char *input_file = argv[1]; |
|
char *out_file = argv[2]; |
|
|
|
FILE *fp = fopen(input_file, "rb"); |
|
if (fp == NULL) |
|
{ |
|
printf("Failed to open input file: %s\n", input_file); |
|
return 1; |
|
} |
|
fseek(fp, 0, SEEK_SET); |
|
wav_info_t info; |
|
if (read_wav_file(fp, &info) != 0) |
|
{ |
|
printf("Failed to read WAV file header\n"); |
|
fclose(fp); |
|
return 1; |
|
} |
|
|
|
uint32_t byte_num = info.data_size; |
|
printf("WAV file byte num: %d\n", byte_num); |
|
char *input_buf = (char *)malloc(byte_num); |
|
fseek(fp, info.data_offset, SEEK_SET); |
|
fread(input_buf, 1, byte_num, fp); |
|
fclose(fp); |
|
fp = NULL; |
|
|
|
uint32_t sample_num = byte_num / sizeof(int16_t); |
|
float total_audio_time = (float)sample_num / 16.0; |
|
printf("total_audio_time: %.2f(ms)\n", total_audio_time); |
|
uint32_t frame_num = sample_num / hop_size; |
|
printf("Audio frame Num: %d\n", frame_num); |
|
float *out_probs = (float *)malloc(frame_num * sizeof(float)); |
|
int32_t *out_flags = (int32_t *)malloc(frame_num * sizeof(int32_t)); |
|
float use_time = .0; |
|
vad_process((int16_t *)input_buf, frame_num, |
|
out_probs, out_flags, |
|
&use_time); |
|
float rtf = use_time / total_audio_time; |
|
printf("Consuming time: %f(ms), audio-time: %.2f(ms), =====> RTF: %0.6f\n", |
|
use_time, total_audio_time, rtf); |
|
|
|
FILE *fout = fopen(out_file, "w"); |
|
if (fout != NULL) |
|
{ |
|
for (int i = 0; i < frame_num; i++) |
|
{ |
|
fprintf(fout, "[%d] %0.6f, %d\n", i, out_probs[i], out_flags[i]); |
|
} |
|
fclose(fout); |
|
fout = NULL; |
|
} |
|
|
|
free(input_buf); |
|
free(out_probs); |
|
free(out_flags); |
|
return 0; |
|
} |
|
|
|
#if TARGET_OS_IPHONE |
|
|
|
int test_with_array() |
|
{ |
|
char *input_buf = (char *)sample_array; |
|
uint32_t byte_num = sizeof(sample_array) / sizeof(sample_array[0]); |
|
printf("WAV file byte num: %d\n", byte_num); |
|
|
|
uint32_t sample_num = byte_num / sizeof(int16_t); |
|
float total_audio_time = (float)sample_num / 16.0; |
|
printf("total_audio_time: %.2f(ms)\n", total_audio_time); |
|
uint32_t frame_num = sample_num / hop_size; |
|
printf("Audio frame Num: %d\n", frame_num); |
|
float *out_probs = (float *)malloc(frame_num * sizeof(float)); |
|
int32_t *out_flags = (int32_t *)malloc(frame_num * sizeof(int32_t)); |
|
float use_time = .0; |
|
vad_process((int16_t *)input_buf, frame_num, |
|
out_probs, out_flags, |
|
&use_time); |
|
float rtf = use_time / total_audio_time; |
|
printf("Consuming time: %f(ms), audio-time: %.2f(ms), =====> RTF: %0.6f\n", |
|
use_time, total_audio_time, rtf); |
|
|
|
return 0; |
|
} |
|
#endif |
|
|
|
/* Program entry point. When TARGET_OS_IPHONE is set the embedded sample
 * array is processed; otherwise the WAV file named on the command line is.
 * The selected test's return value becomes the exit status. */
int main(int argc, char *argv[])
{
    int status;

#if TARGET_OS_IPHONE
    /* The command line is not consulted in this configuration. */
    (void)argc;
    (void)argv;
    status = test_with_array();
#else
    status = test_with_wav(argc, argv);
#endif

    return status;
}
|
|
|
|
|
/* Advance fp by count bytes. The distance is split into steps that fit in a
 * 32-bit long, so a large chunk size can never turn into a negative (that
 * is, backwards) seek. Returns 0 on success, -1 if a seek fails. */
static int wav_skip_bytes(FILE *fp, uint64_t count)
{
    const uint64_t max_step = 0x40000000u; /* 1 GiB */

    while (count > 0)
    {
        const uint64_t step = count < max_step ? count : max_step;
        if (fseek(fp, (long)step, SEEK_CUR) != 0)
        {
            return -1;
        }
        count -= step;
    }
    return 0;
}

/**
 * Locate the "fmt " and "data" chunks of a RIFF/WAVE stream.
 *
 * The stream is scanned from offset 0. The first 16 bytes of the fmt chunk
 * are copied into *info field by field, and the size and payload offset of
 * the data chunk are recorded; scanning stops at the data chunk. Header
 * fields are read in host byte order, so the file's little-endian layout is
 * only decoded correctly on little-endian hosts. Diagnostics go to stderr.
 *
 * @param fp    Seekable stream opened in binary mode. Its position on return
 *              is the one it had on entry.
 * @param info  Receives the parsed values. It is zeroed once the RIFF header
 *              has been accepted and may be partially filled on failure.
 * @return 0 when both chunks were found, -1 otherwise.
 */
int read_wav_file(FILE *fp, wav_info_t *info)
{
    if (fp == NULL || info == NULL)
    {
        return -1;
    }

    const long orig_pos = ftell(fp);
    int rc = -1;
    int fmt_found = 0;
    int data_found = 0;
    riff_header_t riff;

    if (fseek(fp, 0, SEEK_SET) != 0 ||
        fread(&riff, sizeof(riff_header_t), 1, fp) != 1)
    {
        fprintf(stderr, "Can not read RIFF head\n");
        goto restore;
    }

    if (memcmp(riff.chunk_id, "RIFF", 4) != 0 ||
        memcmp(riff.format, "WAVE", 4) != 0)
    {
        fprintf(stderr, "not a valid RIFF/WAVE file\n");
        goto restore;
    }

    memset(info, 0, sizeof(wav_info_t));

    /* Walk the chunk list until the data chunk or the end of the file. */
    for (;;)
    {
        chunk_header_t chunk;
        if (fread(&chunk, sizeof(chunk_header_t), 1, fp) != 1)
        {
            break;
        }

        if (memcmp(chunk.id, "data", 4) == 0)
        {
            const long payload_pos = ftell(fp);
            if (payload_pos < 0)
            {
                break; /* reported below as "data chunk not found" */
            }
            info->data_size = chunk.size;
            info->data_offset = payload_pos;
            data_found = 1;
            break;
        }

        /* Chunks are padded to an even length. Widen first so that a size
         * of 0xFFFFFFFF does not wrap to zero. */
        uint64_t to_skip = (uint64_t)chunk.size + (chunk.size & 1u);

        if (memcmp(chunk.id, "fmt ", 4) == 0)
        {
            fmt_found = 1;
            if (chunk.size < 16)
            {
                fprintf(stderr, "fmt chunk size is abnormal\n");
                goto restore;
            }

            if (fread(&info->audio_format, 2, 1, fp) != 1 ||
                fread(&info->num_channels, 2, 1, fp) != 1 ||
                fread(&info->sample_rate, 4, 1, fp) != 1 ||
                fread(&info->byte_rate, 4, 1, fp) != 1 ||
                fread(&info->block_align, 2, 1, fp) != 1 ||
                fread(&info->bits_per_sample, 2, 1, fp) != 1)
            {
                fprintf(stderr, "failed to read fmt data\n");
                goto restore;
            }

            /* 16 bytes were consumed above; skip any extension and padding. */
            to_skip -= 16;
        }

        if (wav_skip_bytes(fp, to_skip) != 0)
        {
            break;
        }
    }

    if (!fmt_found)
    {
        fprintf(stderr, "fmt chunk not found\n");
    }
    else if (!data_found)
    {
        fprintf(stderr, "data chunk not found\n");
    }
    else
    {
        rc = 0;
    }

restore:
    /* Put the stream back where the caller left it. */
    if (orig_pos >= 0 && fseek(fp, orig_pos, SEEK_SET) != 0)
    {
        rc = -1;
    }
    return rc;
}