lmzjms's picture
Upload 1162 files
0b32ad6 verified
raw
history blame
1.25 kB
#!/bin/bash
#
# Prepare the SNIPS text and audio data under a corpora directory.
#
# Usage: <this script> <corpora dir>
#
# Steps (each one is skipped when its marker file already exists and is
# non-empty):
#   1. Text:  clone aws-lex-noisy-spoken-language-understanding if it is not
#             present, then run snips_text_norm.py and
#             `snips_preprocess.py text` from this script's directory.
#             Marker: SNIPS/all.iob.snips.txt
#   2. Audio: download and unzip audio_slu_v3.zip if audio_slu/ is not
#             present, then run `snips_preprocess.py audio`.
#             Marker: SNIPS/valid/Salli-snips-valid-168.wav
#
# Requires: git, wget, unzip, python3, readlink -f.

set -euo pipefail
set -x

if [[ $# -ne 1 ]]; then
  # Diagnostics belong on stderr so they never pollute captured stdout.
  echo "Usage: $0 <corpora dir>" >&2
  echo "eg: $0 /all/my/datasets/" >&2
  exit 1
fi

# Resolve the script directory to an absolute path *before* changing
# directory, so the helper Python scripts can still be located afterwards.
script_dir=$(dirname -- "$0")
script_dir=$(readlink -f -- "$script_dir")
corpora_root=$1

# All relative paths below are relative to the corpora root.
cd -- "$corpora_root"

# ---------------------------------------------------------------- text ---
if [[ -s SNIPS/all.iob.snips.txt ]]; then
  echo 'Preprocessed text file exist, skip!'
else
  if [[ ! -d aws-lex-noisy-spoken-language-understanding ]]; then
    echo 'Start downloading text files...'
    git clone https://github.com/aws-samples/aws-lex-noisy-spoken-language-understanding.git
  fi

  echo 'Start preparing text files...'
  mkdir -p SNIPS
  python3 "$script_dir/snips_text_norm.py"
  python3 "$script_dir/snips_preprocess.py" text aws-lex-noisy-spoken-language-understanding SNIPS
  # Remove SNIPS/single* files (presumably intermediates left by the steps
  # above -- TODO confirm). -f keeps an unmatched glob from aborting the
  # script under `set -e` once the real outputs have been written.
  rm -f -- SNIPS/single*
fi

# --------------------------------------------------------------- audio ---
if [[ -s SNIPS/valid/Salli-snips-valid-168.wav ]]; then
  echo 'Preprocessed audio file exist, skip!'
else
  if [[ ! -d audio_slu ]]; then
    echo 'Start downloading audio files...'
    # -c resumes a partially downloaded archive instead of saving a second
    # copy as audio_slu_v3.zip.1 and leaving the broken one to be unzipped.
    wget -c https://shangwel-asr-evaluation.s3-us-west-2.amazonaws.com/audio_slu_v3.zip
    echo 'Start unzipping audio files...'
    # unzip's file listing is verbose; keep it out of the console.
    unzip audio_slu_v3.zip > tmp.log
  fi

  echo 'Start converting audio files...'
  # Use python3 here as well: the text step already requires it, and a bare
  # `python` may be missing or resolve to Python 2.
  python3 "$script_dir/snips_preprocess.py" audio audio_slu SNIPS
fi