#!/bin/bash
#
# Prepare the SNIPS corpus (text and audio) inside a corpora directory.
#
# Usage: <this script> <corpora dir>
#
# Everything below runs with <corpora dir> as the working directory:
#   1. Text:  unless SNIPS/all.iob.snips.txt already exists and is non-empty,
#             clone aws-lex-noisy-spoken-language-understanding (if that
#             directory is not present yet), run snips_text_norm.py and
#             "snips_preprocess.py text", then delete SNIPS/single*.
#   2. Audio: unless SNIPS/valid/Salli-snips-valid-168.wav already exists and
#             is non-empty, download and unzip audio_slu_v3.zip (if ./audio_slu
#             is not present yet) and run "snips_preprocess.py audio".
#
# Both Python helpers are looked up in the directory containing this script.
# Requires: git, wget, unzip, python3, and a readlink that supports -f.
#
# Exit status: 1 on wrong argument count; otherwise the status of the first
# failing command (set -e), or 0 on success.

set -euo pipefail
set -x

if [[ $# -ne 1 ]]; then
  # Usage text is a diagnostic, so it goes to stderr.
  {
    echo "Usage: $0 <corpora dir>"
    echo "eg: $0 /all/my/datasets/"
  } >&2
  exit 1
fi

# Resolve this script's directory to an absolute path *before* changing
# directory, so the Python helpers can still be found after the cd below.
script_dir=$(readlink -f -- "$(dirname -- "$0")")
corpora_root=$1

readonly text_repo=aws-lex-noisy-spoken-language-understanding

cd -- "$corpora_root"

# ---------------------------------------------------------------------------
# Text preparation
# ---------------------------------------------------------------------------
if [[ -s SNIPS/all.iob.snips.txt ]]; then
  echo 'Preprocessed text file exist, skip!'
else
  if [[ ! -d "$text_repo" ]]; then
    echo 'Start downloading text files...'
    git clone "https://github.com/aws-samples/${text_repo}.git"
  fi

  echo 'Start preparing text files...'
  mkdir -p SNIPS
  # snips_text_norm.py takes no arguments here; it runs with the corpora
  # directory as its working directory.
  python3 "$script_dir/snips_text_norm.py"
  python3 "$script_dir/snips_preprocess.py" text "$text_repo" SNIPS
  # -f: do not abort the whole run (set -e) when no SNIPS/single* file exists.
  rm -f -- SNIPS/single*
fi

# ---------------------------------------------------------------------------
# Audio preparation
# ---------------------------------------------------------------------------
if [[ -s SNIPS/valid/Salli-snips-valid-168.wav ]]; then
  echo 'Preprocessed audio file exist, skip!'
else
  # NOTE(review): presumably the archive unpacks into ./audio_slu, which is
  # what this check and the conversion step below rely on - confirm.
  if [[ ! -d audio_slu ]]; then
    echo 'Start downloading audio files...'
    wget https://shangwel-asr-evaluation.s3-us-west-2.amazonaws.com/audio_slu_v3.zip
    echo 'Start unzipping audio files...'
    # unzip's listing is redirected to tmp.log in the corpora directory.
    unzip audio_slu_v3.zip > tmp.log
  fi

  echo 'Start converting audio files...'
  # Use python3 here too, matching the text step that runs the same helper.
  python3 "$script_dir/snips_preprocess.py" audio audio_slu SNIPS
fi