Ziyi Lin committed on
Commit
1e05caf
·
1 Parent(s): 39ef55d

Upload examples

Browse files
examples/.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.wav filter=lfs diff=lfs merge=lfs -text
examples/CMakeLists.txt ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
cmake_minimum_required(VERSION 3.10)

# Repository root: this file lives in <root>/examples, so go one level up.
get_filename_component(ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../" ABSOLUTE)

project(ten_vad_demo)

# Command-line demo that runs TEN VAD on a WAV file.
add_executable(ten_vad_demo "${ROOT}/examples/main.c")
target_include_directories(ten_vad_demo PRIVATE "${ROOT}/include")

# Link the prebuilt ten_vad binary that matches the target platform.
# Every target_link_libraries() call uses the PRIVATE keyword so the legacy
# keyword-less signature is never mixed in.
if(WIN32)
  # Pointer size distinguishes a 64-bit build from a 32-bit one.
  if(CMAKE_SIZEOF_VOID_P EQUAL 8)
    target_link_libraries(ten_vad_demo PRIVATE "${ROOT}/lib/Windows/x64/ten_vad.lib")
  else()
    target_link_libraries(ten_vad_demo PRIVATE "${ROOT}/lib/Windows/x86/ten_vad.lib")
  endif()
elseif(ANDROID)
  # Quoted so an empty or oddly named processor string is compared literally.
  if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")
    target_link_libraries(ten_vad_demo PRIVATE "${ROOT}/lib/Android/arm64-v8a/libten_vad.so")
  else()
    target_link_libraries(ten_vad_demo PRIVATE "${ROOT}/lib/Android/armeabi-v7a/libten_vad.so")
  endif()
elseif(IOS)
  target_link_libraries(ten_vad_demo PRIVATE "${ROOT}/lib/iOS/ten_vad.framework")
  set_target_properties(ten_vad_demo PROPERTIES
    XCODE_ATTRIBUTE_FRAMEWORK_SEARCH_PATHS "${ROOT}/lib/iOS"
    XCODE_ATTRIBUTE_LD_RUNPATH_SEARCH_PATHS "@executable_path/Frameworks"
    XCODE_ATTRIBUTE_CODE_SIGN_STYLE "Manual"
    XCODE_ATTRIBUTE_DEVELOPMENT_TEAM "${CMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM}"
    XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER "com.yourcompany.ten_vad_demo"
    XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS "iphoneos"
    XCODE_ATTRIBUTE_ARCHS "arm64"
  )
elseif(APPLE)
  target_link_libraries(ten_vad_demo PRIVATE "${ROOT}/lib/macOS/ten_vad.framework")
  set_target_properties(ten_vad_demo PROPERTIES
    INSTALL_RPATH "@loader_path"
    BUILD_WITH_INSTALL_RPATH TRUE
  )
elseif(UNIX)
  # Only an x64 Linux library is referenced here.
  target_link_libraries(ten_vad_demo PRIVATE "${ROOT}/lib/Linux/x64/libten_vad.so")
endif()
examples/build-and-deploy-android.sh ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Builds ten_vad_demo with the Android NDK CMake toolchain, pushes the binary,
# the shared library and the sample WAV to a device over adb, runs the demo
# there and pulls out.txt back into the build directory.
# Paths are relative, so run this from the examples directory.
set -eo pipefail

# Customize the arch and toolchain
arch=arm64-v8a
toolchain=aarch64-linux-android-clang

# arch=armeabi-v7a
# toolchain=arm-linux-android-clang

# Fail early with a readable message instead of handing cmake a toolchain
# path that starts at "/build/cmake/...".
if [ -z "${ANDROID_NDK:-}" ]; then
  echo "[Error] ANDROID_NDK is not set" >&2
  exit 1
fi

build_dir="build-android/$arch"
rm -rf "$build_dir"
mkdir -p "$build_dir"
cd "$build_dir"

# Step 1: Build the demo
# ANDROID_ABI is passed explicitly so the ABI being built always matches the
# library directory pushed below.
cmake ../../ \
  -DANDROID_ABI="$arch" \
  -DANDROID_TOOLCHAIN_NAME="$toolchain" \
  -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \
  -G "Unix Makefiles"

cmake --build . --config Release


# Step 2: Run the demo
# Each adb command is a separate statement: with `set -e`, a failure inside an
# `a && b && c` list does not stop the script unless it is the last command,
# so a chained push failure would have gone on to pull a stale out.txt.
adb push ../../s0724-s0730.wav /data/local/tmp/
adb push "../../../lib/Android/${arch}/libten_vad.so" /data/local/tmp/libten_vad.so
adb push ten_vad_demo /data/local/tmp/
adb shell "cd /data/local/tmp && chmod +x ten_vad_demo && \
LD_LIBRARY_PATH=/data/local/tmp ./ten_vad_demo ./s0724-s0730.wav ./out.txt && \
exit 0"

adb pull /data/local/tmp/out.txt ./
cd ../../
examples/build-and-deploy-ios.sh ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Generates an Xcode project for the iOS device build of ten_vad_demo inside
# <script directory>/build-ios. Building and running happen in the Xcode IDE.
set -euo pipefail

script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
out_dir="${script_dir}/build-ios"

mkdir -p "${out_dir}"
cd "${out_dir}"

# Step 1: Generate Xcode project for iOS device
# The configure options are collected in an array so each one stays a single
# argument when expanded.
cmake_args=(
  -DCMAKE_SYSTEM_NAME=iOS
  -DCMAKE_OSX_SYSROOT="iphoneos"
  -DCMAKE_OSX_ARCHITECTURES="arm64"
  -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY="Apple Development"
  -DCMAKE_OSX_DEPLOYMENT_TARGET=12.1
  -DCMAKE_INSTALL_RPATH="@executable_path/Frameworks"
  -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
  -G Xcode
)

echo "[Info] Generating Xcode project"
cmake "${script_dir}" "${cmake_args[@]}"


# Step 2: Use Xcode to open the project in build-ios directory
# Step 3: Build and run the project in Xcode IDE
examples/build-and-deploy-linux.sh ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Builds ten_vad_demo for Linux in a fresh build-linux/<arch> directory and
# runs it on the sample WAV. Paths are relative, so run it from examples/.
set -euo pipefail

arch=x64
out_dir="build-linux/${arch}"

# Always start from an empty build tree.
rm -rf "${out_dir}"
mkdir -p "${out_dir}"
cd "${out_dir}"

# Step 1: Build the demo
cmake ../../
cmake --build . --config Release


# Step 2: Run the demo
# The loader needs to find libten_vad.so, which sits three levels above the
# build directory.
LD_LIBRARY_PATH="../../../lib/Linux/${arch}"
export LD_LIBRARY_PATH
./ten_vad_demo ../../s0724-s0730.wav out.txt

cd ../../
examples/build-and-deploy-mac.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Generates an Xcode project for the macOS demo in a fresh build-mac/<arch>
# directory, builds the Release configuration and runs it on the sample WAV.
# Paths are relative, so run it from examples/.
set -euo pipefail

# Customize the arch
arch=arm64
# arch=x86_64

out_dir="build-mac/${arch}"
rm -rf "${out_dir}"
mkdir -p "${out_dir}"
cd "${out_dir}"

# Step 1: Build the demo
configure_args=(
  -DCMAKE_CXX_COMPILER=/usr/bin/clang++
  -DCMAKE_C_COMPILER=/usr/bin/clang
  -DCMAKE_OSX_ARCHITECTURES="${arch}"
  -G Xcode
)
cmake ../../ "${configure_args[@]}"

# Everything after the bare "--" is handed to the native build tool.
cmake --build . --config Release -- -UseModernBuildSystem=NO


# Step 2: Run the demo
DYLD_FRAMEWORK_PATH="../../../lib/macOS/"
export DYLD_FRAMEWORK_PATH
Release/ten_vad_demo ../../s0724-s0730.wav out.txt
cd ../../
examples/build-and-deploy-windows.bat ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
@echo off
setlocal

@REM Builds ten_vad_demo with a Visual Studio generator, copies the sample
@REM WAV and ten_vad.dll next to the executable and runs the demo.
@REM Every external step is checked so a failure is reported where it happens
@REM rather than later as a missing-file error.

@REM Customize the arch
set arch=x64
@REM set arch=x86

@REM step 1: Build the demo
set "build_dir=%~dp0\build-windows"
if exist "%build_dir%" rmdir /s /q "%build_dir%"
mkdir "%build_dir%"
cd /d "%build_dir%"

@REM Customize the Visual Studio version
@REM REM VS 2017
@REM if %arch% == x64 (
@REM     cmake .. -G "Visual Studio 15 2017" -A x64
@REM ) else if %arch% == x86 (
@REM     cmake .. -G "Visual Studio 15 2017" -A Win32
@REM )

REM VS 2019
if %arch% == x64 (
    cmake .. -G "Visual Studio 16 2019" -A x64
) else if %arch% == x86 (
    cmake .. -G "Visual Studio 16 2019" -A Win32
)
if errorlevel 1 (
    echo [Error] cmake configure failed
    exit /b 1
)

@REM REM VS 2022
@REM if %arch% == x64 (
@REM     cmake .. -G "Visual Studio 17 2022" -A x64
@REM ) else if %arch% == x86 (
@REM     cmake .. -G "Visual Studio 17 2022" -A Win32
@REM )

cmake --build . --config Release
if errorlevel 1 (
    echo [Error] cmake build failed
    exit /b 1
)
cd ..


@REM step 2: Run the demo
pushd "%~dp0"
copy /Y "s0724-s0730.wav" "%build_dir%\Release"
if errorlevel 1 (
    echo [Error] copy file failed
    popd
    exit /b 1
)
copy /Y "..\lib\Windows\%arch%\ten_vad.dll" "%build_dir%\Release"
if errorlevel 1 (
    echo [Error] copy file failed
    popd
    exit /b 1
)
cd /d "%build_dir%\Release"
if not exist "ten_vad_demo.exe" (
    echo Error: ten_vad_demo.exe not found
    exit /b 1
)
if not exist "s0724-s0730.wav" (
    echo Error: s0724-s0730.wav not found
    exit /b 1
)

ten_vad_demo.exe "s0724-s0730.wav" out.txt
if errorlevel 1 (
    echo Error: ten_vad_demo.exe failed
    exit /b 1
)

cd /d "%~dp0"
popd
exit /b 0
examples/main.c ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // This file is part of TEN Framework, an open source project.
3
+ // Licensed under the Apache License, Version 2.0.
4
+ // See the LICENSE file for more information.
5
+ //
6
+ #include <stdio.h>
7
+ #include <stdint.h>
8
+ #include <stdlib.h>
9
+ #include <time.h>
10
+ #include <inttypes.h>
11
+ #include <string.h> // memcmp
12
+ #ifdef _WIN32
13
+ #include <windows.h>
14
+ #endif
15
+
16
+ #include "ten_vad.h"
17
+
18
+ #if defined(__APPLE__)
19
+ #include <TargetConditionals.h>
20
+ #if TARGET_OS_IPHONE
21
+ #include "sample_array.h"
22
+ #endif
23
+ #endif
24
+
25
+ const int hop_size = 256; // 16 ms per frame
26
+
27
+ uint64_t get_timestamp_ms()
28
+ {
29
+ #ifdef _WIN32
30
+ LARGE_INTEGER frequency;
31
+ LARGE_INTEGER counter;
32
+ QueryPerformanceFrequency(&frequency);
33
+ QueryPerformanceCounter(&counter);
34
+ return (uint64_t)(counter.QuadPart * 1000 / frequency.QuadPart);
35
+ #else
36
+ struct timespec ts;
37
+ uint64_t millis;
38
+ clock_gettime(CLOCK_MONOTONIC, &ts);
39
+ millis = ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
40
+ return millis;
41
+ #endif
42
+ }
43
+
44
// On-disk RIFF layouts. Packing is forced to one byte so no padding is
// inserted and each struct can be filled by a single fread of its size.
#pragma pack(push, 1)
// The 12 bytes that open a RIFF/WAVE file.
typedef struct riff_header
{
  char chunk_id[4];    // expected to be "RIFF"
  uint32_t chunk_size; // total file size minus 8
  char format[4];      // expected to be "WAVE"
} riff_header_t;

// The 8-byte header placed in front of every sub chunk.
typedef struct chunk_header
{
  char id[4];    // chunk tag, e.g. "fmt " or "data"
  uint32_t size; // size of the chunk data that follows
} chunk_header_t;
#pragma pack(pop)
60
+
61
// The subset of WAV metadata the demo uses: the "fmt " fields plus where the
// sample data sits in the file.
typedef struct wav_info
{
  uint16_t audio_format;    // format tag (e.g. PCM = 1)
  uint16_t num_channels;    // channel count
  uint32_t sample_rate;     // samples per second
  uint32_t byte_rate;       // bytes per second
  uint16_t block_align;     // block alignment
  uint16_t bits_per_sample; // bits per sample
  uint32_t data_size;       // size of the "data" chunk in bytes
  long data_offset;         // file offset at which the "data" chunk payload starts
} wav_info_t;
73
+
74
+ int read_wav_file(FILE *fp, wav_info_t *info);
75
+
76
+ int vad_process(int16_t *input_buf, uint32_t frame_num,
77
+ float *out_probs, int32_t *out_flags,
78
+ float *use_time)
79
+ {
80
+ printf("tenvadsrc version: %s\n", ten_vad_get_version());
81
+ void *ten_vad_handle = NULL;
82
+ float voice_threshold = 0.5f;
83
+ ten_vad_create(&ten_vad_handle, hop_size, voice_threshold);
84
+
85
+ uint64_t start = get_timestamp_ms();
86
+ for (int i = 0; i < frame_num; ++i)
87
+ {
88
+ int16_t *audio_data = input_buf + i * hop_size;
89
+ ten_vad_process(ten_vad_handle, audio_data, hop_size,
90
+ &out_probs[i], &out_flags[i]);
91
+ printf("[%d] %0.6f, %d\n", i, out_probs[i], out_flags[i]);
92
+ }
93
+ uint64_t end = get_timestamp_ms();
94
+ *use_time = (float)(end - start);
95
+
96
+ ten_vad_destroy(&ten_vad_handle);
97
+ ten_vad_handle = NULL;
98
+ return 0;
99
+ }
100
+
101
+ int test_with_wav(int argc, char *argv[])
102
+ {
103
+ if (argc < 3)
104
+ {
105
+ printf("Warning: Test.exe input.wav output.txt\n");
106
+ return 0;
107
+ }
108
+ char *input_file = argv[1];
109
+ char *out_file = argv[2];
110
+
111
+ FILE *fp = fopen(input_file, "rb");
112
+ if (fp == NULL)
113
+ {
114
+ printf("Failed to open input file: %s\n", input_file);
115
+ return 1;
116
+ }
117
+ fseek(fp, 0, SEEK_SET);
118
+ wav_info_t info;
119
+ if (read_wav_file(fp, &info) != 0)
120
+ {
121
+ printf("Failed to read WAV file header\n");
122
+ fclose(fp);
123
+ return 1;
124
+ }
125
+
126
+ uint32_t byte_num = info.data_size;
127
+ printf("WAV file byte num: %d\n", byte_num);
128
+ char *input_buf = (char *)malloc(byte_num);
129
+ fseek(fp, info.data_offset, SEEK_SET);
130
+ fread(input_buf, 1, byte_num, fp);
131
+ fclose(fp);
132
+ fp = NULL;
133
+
134
+ uint32_t sample_num = byte_num / sizeof(int16_t);
135
+ float total_audio_time = (float)sample_num / 16.0;
136
+ printf("total_audio_time: %.2f(ms)\n", total_audio_time);
137
+ uint32_t frame_num = sample_num / hop_size;
138
+ printf("Audio frame Num: %d\n", frame_num);
139
+ float *out_probs = (float *)malloc(frame_num * sizeof(float));
140
+ int32_t *out_flags = (int32_t *)malloc(frame_num * sizeof(int32_t));
141
+ float use_time = .0;
142
+ vad_process((int16_t *)input_buf, frame_num,
143
+ out_probs, out_flags,
144
+ &use_time);
145
+ float rtf = use_time / total_audio_time;
146
+ printf("Consuming time: %f(ms), audio-time: %.2f(ms), =====> RTF: %0.6f\n",
147
+ use_time, total_audio_time, rtf);
148
+
149
+ FILE *fout = fopen(out_file, "w");
150
+ if (fout != NULL)
151
+ {
152
+ for (int i = 0; i < frame_num; i++)
153
+ {
154
+ fprintf(fout, "[%d] %0.6f, %d\n", i, out_probs[i], out_flags[i]);
155
+ }
156
+ fclose(fout);
157
+ fout = NULL;
158
+ }
159
+
160
+ free(input_buf);
161
+ free(out_probs);
162
+ free(out_flags);
163
+ return 0;
164
+ }
165
+
166
#if TARGET_OS_IPHONE
// Used for iOS APP demo: runs the VAD on the audio compiled in as
// `sample_array` instead of reading a WAV file.
// Returns 0 on success and 1 when allocation or VAD processing fails.
int test_with_array()
{
  char *input_buf = (char *)sample_array;
  // NOTE(review): this is the element count of sample_array; it equals the
  // byte count only if the elements are one byte wide - confirm against
  // sample_array.h.
  uint32_t byte_num = sizeof(sample_array) / sizeof(sample_array[0]);
  printf("WAV file byte num: %" PRIu32 "\n", byte_num);

  uint32_t sample_num = byte_num / sizeof(int16_t);
  float total_audio_time = (float)sample_num / 16.0;
  printf("total_audio_time: %.2f(ms)\n", total_audio_time);
  uint32_t frame_num = sample_num / hop_size;
  printf("Audio frame Num: %" PRIu32 "\n", frame_num);

  // Allocate at least one element: malloc(0) may legitimately return NULL.
  size_t out_count = frame_num > 0 ? frame_num : 1;
  float *out_probs = (float *)malloc(out_count * sizeof(float));
  int32_t *out_flags = (int32_t *)malloc(out_count * sizeof(int32_t));
  if (out_probs == NULL || out_flags == NULL)
  {
    fprintf(stderr, "Failed to allocate result buffers\n");
    free(out_probs);
    free(out_flags);
    return 1;
  }

  int status = 0;
  float use_time = .0;
  if (vad_process((int16_t *)input_buf, frame_num,
                  out_probs, out_flags,
                  &use_time) != 0)
  {
    fprintf(stderr, "VAD processing failed\n");
    status = 1;
  }
  else
  {
    // Guard the ratio so empty audio does not print inf/nan.
    float rtf = total_audio_time > 0.0f ? use_time / total_audio_time : 0.0f;
    printf("Consuming time: %f(ms), audio-time: %.2f(ms), =====> RTF: %0.6f\n",
           use_time, total_audio_time, rtf);
  }

  // The result buffers were previously never released.
  free(out_probs);
  free(out_flags);
  return status;
}
#endif
192
+
193
// Program entry point. iOS builds run the demo on the embedded sample array;
// every other build runs it on the WAV file named on the command line.
int main(int argc, char *argv[])
{
#if !TARGET_OS_IPHONE
  return test_with_wav(argc, argv);
#else
  // The embedded-array demo takes no command-line input.
  (void)argc;
  (void)argv;
  return test_with_array();
#endif
}
201
+
202
+ // function to read WAV file info
203
+ int read_wav_file(FILE *fp, wav_info_t *info)
204
+ {
205
+ if (fp == NULL || info == NULL)
206
+ return -1;
207
+ // save current file position
208
+ long orig_pos = ftell(fp);
209
+ fseek(fp, 0, SEEK_SET);
210
+ // read RIFF header
211
+ riff_header_t riff;
212
+ if (fread(&riff, sizeof(riff_header_t), 1, fp) != 1)
213
+ {
214
+ fprintf(stderr, "Can not read RIFF head\n");
215
+ fseek(fp, orig_pos, SEEK_SET);
216
+ return -1;
217
+ }
218
+ // verify RIFF/WAVE format
219
+ if (memcmp(riff.chunk_id, "RIFF", 4) != 0 ||
220
+ memcmp(riff.format, "WAVE", 4) != 0)
221
+ {
222
+ fprintf(stderr, "not a valid RIFF/WAVE file\n");
223
+ fseek(fp, orig_pos, SEEK_SET);
224
+ return -1;
225
+ }
226
+ // initialize, mark chunks not found yet
227
+ int fmt_found = 0, data_found = 0;
228
+ memset(info, 0, sizeof(wav_info_t));
229
+
230
+ // iterate all chunks
231
+ while (!feof(fp))
232
+ {
233
+ chunk_header_t chunk;
234
+ if (fread(&chunk, sizeof(chunk_header_t), 1, fp) != 1)
235
+ {
236
+ break; // read failed, maybe end of file
237
+ }
238
+ // check if it's fmt chunk
239
+ if (memcmp(chunk.id, "fmt ", 4) == 0)
240
+ {
241
+ // read fmt data
242
+ fmt_found = 1;
243
+ if (chunk.size < 16)
244
+ {
245
+ fprintf(stderr, "fmt chunk size is abnormal\n");
246
+ fseek(fp, orig_pos, SEEK_SET);
247
+ return -1;
248
+ }
249
+ // read fmt parameters
250
+ if (fread(&info->audio_format, 2, 1, fp) != 1 ||
251
+ fread(&info->num_channels, 2, 1, fp) != 1 ||
252
+ fread(&info->sample_rate, 4, 1, fp) != 1 ||
253
+ fread(&info->byte_rate, 4, 1, fp) != 1 ||
254
+ fread(&info->block_align, 2, 1, fp) != 1 ||
255
+ fread(&info->bits_per_sample, 2, 1, fp) != 1)
256
+ {
257
+ fprintf(stderr, "failed to read fmt data\n");
258
+ fseek(fp, orig_pos, SEEK_SET);
259
+ return -1;
260
+ }
261
+ // skip fmt extension data
262
+ if (chunk.size > 16)
263
+ {
264
+ fseek(fp, chunk.size - 16, SEEK_CUR);
265
+ }
266
+ }
267
+ // check if it's data chunk
268
+ else if (memcmp(chunk.id, "data", 4) == 0)
269
+ {
270
+ data_found = 1;
271
+ info->data_size = chunk.size;
272
+ info->data_offset = ftell(fp); // record data start position
273
+ break; // found data chunk, can exit loop
274
+ }
275
+ // other chunks, skip
276
+ else
277
+ {
278
+ // consider byte alignment, pad odd size
279
+ fseek(fp, (chunk.size + (chunk.size % 2)), SEEK_CUR);
280
+ }
281
+ }
282
+ // check if necessary chunks are found
283
+ if (!fmt_found)
284
+ {
285
+ fprintf(stderr, "fmt chunk not found\n");
286
+ fseek(fp, orig_pos, SEEK_SET);
287
+ return -1;
288
+ }
289
+ if (!data_found)
290
+ {
291
+ fprintf(stderr, "data chunk not found\n");
292
+ fseek(fp, orig_pos, SEEK_SET);
293
+ return -1;
294
+ }
295
+ // restore original file position
296
+ fseek(fp, orig_pos, SEEK_SET);
297
+ return 0;
298
+ }
examples/plot_pr_curves.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # This file is part of TEN Framework, an open source project.
3
+ # Licensed under the Apache License, Version 2.0.
4
+ # See the LICENSE file for more information.
5
+ #
6
+ import os, glob, sys, torchaudio
7
+ import numpy as np
8
+ import scipy.io.wavfile as Wavfile
9
+ import matplotlib.pyplot as plt
10
+ from sklearn.metrics import confusion_matrix
11
+
12
+ os.system('git clone https://github.com/snakers4/silero-vad.git') # Clone the silero-vad repo, using Silero V5
13
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "./silero-vad/src")))
14
+ from silero_vad.utils_vad import VADIterator, init_jit_model
15
+
16
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../include")))
17
+ from ten_vad import TenVad
18
+
19
def convert_label_to_framewise(label_file, hop_size):
    """Expand a segment-level VAD label file into one label per frame.

    Only the first line of ``label_file`` is read. It is split on commas, the
    first field is dropped, and the rest is taken as repeating
    ``start, end, label`` triples, where ``label`` is 0 (non-speech) or
    1 (speech). Each segment contributes ``round((end - start) / frame_duration)``
    frames, with ``frame_duration = hop_size / 16000``.

    Args:
        label_file: Path of the label file.
        hop_size: Frame hop in samples.

    Returns:
        1-D float64 array of 0.0/1.0 labels, truncated to at most
        ``int((end[-1] - start[0]) / frame_duration)`` frames. Empty when the
        line holds no segments.

    Raises:
        ValueError: If the file is empty, the fields do not form complete
            triples, or a label is neither 0 nor 1.
    """
    frame_duration = hop_size / 16000
    with open(label_file, "r") as f:
        lines = f.readlines()
    if not lines:
        raise ValueError(f"label file is empty: {label_file}")

    content = lines[0].strip().split(",")[1:]
    start = np.array(content[::3], dtype=float)  # Start point of each audio segment
    end = np.array(content[1::3], dtype=float)  # End point of each audio segment
    lab_manual = np.array(
        content[2::3], dtype=int
    )  # label, 0/1 stands for non-speech or speech, respectively
    if not (len(start) == len(end) == len(lab_manual)):
        raise ValueError(f"incomplete start/end/label triple in: {label_file}")
    if len(start) == 0:
        return np.array([])
    # A label outside {0, 1} used to silently reuse the previous segment's
    # frames (or fail with NameError on the first segment).
    if not np.all((lab_manual == 0) | (lab_manual == 1)):
        raise ValueError(f"labels must be 0 or 1 in: {label_file}")

    num = np.array(
        np.round((end - start) / frame_duration), dtype=np.int32
    )  # get number of frames of each audio segment
    # One vectorised repeat replaces the per-segment np.append, which copied
    # the whole array on every iteration.
    label_framewise = np.repeat(lab_manual.astype(float), num)
    frame_num = min(
        len(label_framewise), int((end[-1] - start[0]) / frame_duration)
    )
    return label_framewise[:frame_num]
58
+
59
+
60
def read_file(file_path):
    """Read a text file holding one number per line.

    Args:
        file_path: Path of the file to read.

    Returns:
        1-D float64 array with one entry per line, in file order; empty for
        an empty file.

    Raises:
        ValueError: If a line (after stripping whitespace) is not a valid
            float, which includes blank lines.
    """
    with open(file_path, "r") as f:
        # Build the array once; np.append inside a loop re-copies the whole
        # array for every line.
        return np.array([float(line.strip()) for line in f], dtype=float)
68
+
69
def get_precision_recall(VAD_result, label, threshold):
    """Compute precision, recall, FPR and FNR at one decision threshold.

    A frame is predicted as speech when its score is ``>= threshold``.

    The four confusion counts are taken directly with numpy, so the function
    also works when only one class occurs in the data; unpacking a flattened
    confusion matrix into four values fails in that case.

    Args:
        VAD_result: Array-like of per-frame scores.
        label: Array-like of per-frame ground-truth labels, 1 for speech and
            0 for non-speech, with the same length as ``VAD_result``.
        threshold: Decision threshold applied to ``VAD_result``.

    Returns:
        Tuple ``(precision, recall, FPR, FNR)``. A ratio whose denominator is
        zero is reported as 0.
    """
    predicted_speech = np.asarray(VAD_result) >= threshold
    actual_speech = np.asarray(label) == 1

    # Confusion counts
    TP = int(np.count_nonzero(predicted_speech & actual_speech))
    FP = int(np.count_nonzero(predicted_speech & ~actual_speech))
    FN = int(np.count_nonzero(~predicted_speech & actual_speech))
    TN = int(np.count_nonzero(~predicted_speech & ~actual_speech))

    # Compute precision, recall, false positive rate and false negative rate
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    FPR = FP / (FP + TN) if (FP + TN) > 0 else 0
    FNR = FN / (TP + FN) if (TP + FN) > 0 else 0

    return precision, recall, FPR, FNR
82
+
83
def silero_vad_inference_single_file(wav_path):
    """Run the Silero VAD model over one audio file in fixed-size windows.

    The JIT model is loaded from the ``silero-vad`` checkout next to this
    script on every call. The audio is read with ``torchaudio.load`` and fed
    to the model in windows of 512 samples; a trailing window shorter than
    that is dropped.

    Args:
        wav_path: Path of the audio file.

    Returns:
        Tuple ``(speech_probs, window_size_samples)``: a 1-D array with one
        model output per full window, and the window size used (512).
    """
    here = os.path.dirname(os.path.abspath(__file__))
    model = init_jit_model(f'{here}/silero-vad/src/silero_vad/data/silero_vad.jit')
    vad_iterator = VADIterator(model)
    window_size_samples = 512

    wav, sr = torchaudio.load(wav_path)
    wav = wav.squeeze(0)

    window_probs = []
    for offset in range(0, len(wav), window_size_samples):
        chunk = wav[offset: offset + window_size_samples]
        if len(chunk) < window_size_samples:
            break  # incomplete final window
        window_probs.append(model(chunk, sr).item())
    # reset model states after each audio
    # NOTE(review): presumably clears the recurrent state held by `model` - confirm.
    vad_iterator.reset_states()

    return np.array(window_probs, dtype=float), window_size_samples
101
+
102
def ten_vad_process_wav(ten_vad_instance, wav_path, hop_size=256):
    """Run a TEN VAD instance over a WAV file frame by frame.

    The file is read with ``scipy.io.wavfile`` and split into consecutive
    frames of ``hop_size`` samples; trailing samples that do not fill a frame
    are ignored.

    Args:
        ten_vad_instance: Object whose ``process(frame)`` returns a
            ``(probability, flag)`` pair.
        wav_path: Path of the WAV file.
        hop_size: Number of samples per frame.

    Returns:
        1-D array holding the probability returned for each frame.
    """
    _, samples = Wavfile.read(wav_path)
    frame_count = samples.shape[0] // hop_size

    frame_probs = []
    for frame_idx in range(frame_count):
        begin = frame_idx * hop_size
        frame = samples[begin: begin + hop_size]
        prob, _ = ten_vad_instance.process(frame)
        frame_probs.append(prob)

    return np.array(frame_probs, dtype=float)
112
+
113
if __name__ == "__main__":
    # Get the directory of the script
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # TEN-VAD-TestSet dir
    test_dir = f"{script_dir}/../TEN-VAD-TestSet"

    # Initialization
    hop_size = 256
    threshold = 0.5
    label_all, vad_result_ten_vad_all = np.array([]), np.array([])
    label_hop_512_all, vad_result_silero_vad_all = np.array([]), np.array([])
    wav_list = glob.glob(f"{test_dir}/*.wav")

    # Run both detectors over every file and pool the frame-level results
    print("Start processing")
    for wav_path in wav_list:
        label_file = wav_path.replace(".wav", ".scv")

        # --- TEN VAD: a fresh instance per file
        ten_vad_instance = TenVad(hop_size, threshold)
        # Convert the VAD label to frame-wise one
        label = convert_label_to_framewise(label_file, hop_size=hop_size)
        vad_result_ten_vad = ten_vad_process_wav(
            ten_vad_instance, wav_path, hop_size=hop_size
        )
        frame_num = min(len(label), len(vad_result_ten_vad))
        # Predictions skip their first frame and labels drop their last one,
        # so prediction i + 1 is scored against label i.
        vad_result_ten_vad_all = np.append(
            vad_result_ten_vad_all, vad_result_ten_vad[1:frame_num]
        )
        label_all = np.append(label_all, label[:frame_num - 1])
        del ten_vad_instance  # To prevent getting different results of each run

        # --- Silero VAD: labels at hop 512
        # Convert the VAD label to frame-wise one for Silero VAD
        label_hop_512 = convert_label_to_framewise(label_file, hop_size=512)
        vad_result_silero_vad, _ = silero_vad_inference_single_file(wav_path)
        frame_num_silero_vad = min(len(label_hop_512), len(vad_result_silero_vad))
        vad_result_silero_vad_all = np.append(
            vad_result_silero_vad_all, vad_result_silero_vad[:frame_num_silero_vad]
        )
        label_hop_512_all = np.append(
            label_hop_512_all, label_hop_512[:frame_num_silero_vad]
        )

    # Compute Precision and Recall for thresholds 0.00, 0.01, ..., 1.00
    threshold_arr = np.arange(0, 1.01, 0.01)
    pr_data_arr = np.zeros((len(threshold_arr), 3))
    pr_data_silero_vad_arr = np.zeros((len(threshold_arr), 3))

    for ind, threshold in enumerate(threshold_arr):
        ten_metrics = get_precision_recall(vad_result_ten_vad_all, label_all, threshold)
        pr_data_arr[ind] = ten_metrics[0], ten_metrics[1], threshold

        silero_metrics = get_precision_recall(
            vad_result_silero_vad_all, label_hop_512_all, threshold
        )
        pr_data_silero_vad_arr[ind] = silero_metrics[0], silero_metrics[1], threshold

    # Plot PR Curve; the row for the last threshold is left out of both curves
    print("Plotting PR Curve")
    curves = (
        (pr_data_arr[:-1], "red", "TEN VAD"),
        (pr_data_silero_vad_arr[:-1], "blue", "Silero VAD"),
    )
    for curve_data, curve_color, curve_label in curves:
        plt.plot(
            curve_data[:, 1],  # Recall (x-axis)
            curve_data[:, 0],  # Precision (y-axis)
            color=curve_color,
            label=curve_label,
        )

    plt.xlabel("Recall", fontsize=14, fontweight="bold", color="black")
    plt.ylabel("Precision", fontsize=14, fontweight="bold", color="black")
    legend = plt.legend()
    for text_idx in (0, 1):
        legend.get_texts()[text_idx].set_fontweight("bold")
    plt.grid(True)
    plt.xlim(0.65, 1)
    plt.ylim(0.7, 1)
    plt.title(
        "Precision-Recall Curve of TEN VAD on TEN-VAD-TestSet",
        fontsize=12,
        color="black",
        fontweight="bold",
    )
    save_path = f"{script_dir}/PR_Curves.png"
    plt.savefig(save_path, dpi=300, bbox_inches="tight")
    print(f"PR Curves png file saved, save path: {save_path}")

    # Save the PR data to txt file, one "threshold precision recall" row per line
    pr_data_save_path = f"{script_dir}/PR_data_TEN_VAD.txt"
    with open(pr_data_save_path, "w") as f:
        for row in pr_data_arr:
            precision, recall, threshold = row[0], row[1], row[2]
            f.write(f"{threshold:.2f} {precision:.4f} {recall:.4f}\n")
    print("Processing done!")
211
+
212
+
examples/s0724-s0730.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d45912ad2eb5ce25e65a69ed9f110d0c9a382b2644e38341d8fecbcc6900d59a
3
+ size 249732
examples/sample_array.h ADDED
The diff for this file is too large to render. See raw diff
 
examples/test.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # This file is part of TEN Framework, an open source project.
3
+ # Licensed under the Apache License, Version 2.0.
4
+ # See the LICENSE file for more information.
5
+ #
6
+ import sys, os
7
+
8
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../include")))
9
+ from ten_vad import TenVad
10
+ import scipy.io.wavfile as Wavfile
11
+
12
+
13
if __name__ == "__main__":
    # Usage: python test.py input.wav output.txt
    # Runs TEN VAD over the input WAV frame by frame, prints one
    # "[index] probability, flag" line per frame and writes the same lines to
    # the output file.
    if len(sys.argv) < 3:
        # Report the expected arguments instead of failing with IndexError.
        print("Usage: python %s input.wav output.txt" % sys.argv[0], file=sys.stderr)
        sys.exit(1)
    input_file, out_path = sys.argv[1], sys.argv[2]
    sr, data = Wavfile.read(input_file)
    hop_size = 256  # 16 ms per frame
    if sr != 16000:
        # 256 samples only last 16 ms at 16 kHz; warn rather than reject so
        # existing invocations keep running.
        print(
            "Warning: sample rate is %d Hz, the 16 ms frame size assumes 16000 Hz" % sr,
            file=sys.stderr,
        )
    threshold = 0.5
    ten_vad_instance = TenVad(hop_size, threshold)  # Create a TenVad instance
    num_frames = data.shape[0] // hop_size
    # Streaming inference
    with open(out_path, "w") as f:
        for i in range(num_frames):
            audio_data = data[i * hop_size: (i + 1) * hop_size]
            out_probability, out_flags = ten_vad_instance.process(audio_data)
            print("[%d] %0.6f, %d" % (i, out_probability, out_flags))
            f.write("[%d] %0.6f, %d\n" % (i, out_probability, out_flags))