brdhaker3
/

TunASR

Arabic

Model card Files Files and versions Community

brdhaker3 committed 12 days ago

Commit

636df13

verified ·

1 Parent(s): 75a78b0

Update train.yaml

Browse files

Files changed (1) hide show

train.yaml +179 -174

train.yaml CHANGED Viewed

@@ -1,174 +1,179 @@
-# ################################
-# Model: wav2vec2 + DNN + CTC
-# Augmentation: SpecAugment
-# Authors: Titouan Parcollet 2021
-# ################################
-# Seed needs to be set at top of yaml, before objects with parameters are made
-seed: 1234
-__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
-output_folder: !ref model/<seed>
-wer_file: !ref <output_folder>/wer.txt
-save_folder: !ref <output_folder>/save
-train_log: !ref <output_folder>/train_log.txt
-# URL for the biggest LeBenchmark wav2vec french.
-wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint
-# Data files
-data_folder: /path/to/data  # e.g, /localscratch/cv-corpus-5.1-2020-06-22/fr
-train_tsv_file: !ref <data_folder>/train.tsv  # Standard CommonVoice .tsv files
-dev_tsv_file: !ref <data_folder>/dev.tsv  # Standard CommonVoice .tsv files
-test_tsv_file: !ref <data_folder>/test.tsv  # Standard CommonVoice .tsv files
-accented_letters: True
-language: fr # use 'it' for Italian, 'rw' for Kinyarwanda, 'en' for english
-train_csv: Data/train_wavs/train.csv
-valid_csv: Data/dev_wavs/dev.csv
-test_csv:
-    - Data/test_wavs/test.csv
-skip_prep: True # Skip data preparation
-use_language_modelling: True
-ngram_lm_path: languageModel.arpa
-# We remove utterances longer than 10s in the train/dev/test sets as
-# longer sentences certainly correspond to "open microphones".
-avoid_if_longer_than: 10.0
-avoid_if_shorter_than: 1.2
-# Training parameters
-number_of_epochs: 12
-lr: 1.0
-lr_wav2vec: 0.0001
-sorting: ascending
-auto_mix_prec: False
-sample_rate: 16000
-ckpt_interval_minutes: 30 # save checkpoint every N min
-# With data_parallel batch_size is split into N jobs
-# With DDP batch_size is multiplied by N jobs
-# Must be 6 per GPU to fit 16GB of VRAM
-batch_size: 10
-test_batch_size: 4
-dataloader_options:
-    batch_size: !ref <batch_size>
-    num_workers: 6
-test_dataloader_options:
-    batch_size: !ref <test_batch_size>
-    num_workers: 6
-# BPE parameters
-token_type: char  # ["unigram", "bpe", "char"]
-character_coverage: 1.0
-# Model parameters
-# activation: !name:torch.nn.LeakyReLU
-wav2vec_output_dim: 1024
-dnn_neurons: 1024
-freeze_wav2vec: False
-freeze_feature_extractor: True
-dropout: 0.15
-warmup_steps: 500 # The wav2vec 2 model isn't updated for this amount of steps
-# Outputs
-output_neurons: 40  # BPE size, index(blank/eos/bos) = 0
-# Decoding parameters
-# Be sure that the bos and eos index match with the BPEs ones
-blank_index: 0
-unk_index: 1
-#
-# Functions and classes
-#
-epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
-    limit: !ref <number_of_epochs>
-enc: !new:speechbrain.nnet.containers.Sequential
-    input_shape: [null, null, !ref <wav2vec_output_dim>]
-    linear1: !name:speechbrain.nnet.linear.Linear
-        n_neurons: !ref <dnn_neurons>
-        bias: True
-    bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
-    activation: !new:torch.nn.LeakyReLU
-    drop: !new:torch.nn.Dropout
-        p: !ref <dropout>
-    linear2: !name:speechbrain.nnet.linear.Linear
-        n_neurons: !ref <dnn_neurons>
-        bias: True
-    bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
-    activation2: !new:torch.nn.LeakyReLU
-    drop2: !new:torch.nn.Dropout
-        p: !ref <dropout>
-    linear3: !name:speechbrain.nnet.linear.Linear
-        n_neurons: !ref <dnn_neurons>
-        bias: True
-    bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
-    activation3: !new:torch.nn.LeakyReLU
-wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
-    source: wavlm-large/
-    output_norm: False
-    freeze: !ref <freeze_wav2vec>
-    freeze_feature_extractor: !ref <freeze_feature_extractor>
-    save_path: !ref <wav2vec2_folder>
-ctc_lin: !new:speechbrain.nnet.linear.Linear
-    input_size: !ref <dnn_neurons>
-    n_neurons: !ref <output_neurons>
-log_softmax: !new:speechbrain.nnet.activations.Softmax
-    apply_log: True
-ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
-    blank_index: !ref <blank_index>
-modules:
-    wav2vec2: !ref <wav2vec2>
-    enc: !ref <enc>
-    ctc_lin: !ref <ctc_lin>
-model: !new:torch.nn.ModuleList
-    - [!ref <enc>, !ref <ctc_lin>]
-model_opt_class: !name:torch.optim.Adadelta
-    lr: !ref <lr>
-    rho: 0.95
-    eps: 1.e-8
-wav2vec_opt_class: !name:torch.optim.Adam
-    lr: !ref <lr_wav2vec>
-lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler
-    initial_value: !ref <lr>
-    improvement_threshold: 0.0025
-    annealing_factor: 0.8
-    patient: 0
-lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler
-    initial_value: !ref <lr_wav2vec>
-    improvement_threshold: 0.0025
-    annealing_factor: 0.9
-    patient: 0
-checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
-    checkpoints_dir: !ref <save_folder>
-    recoverables:
-        wav2vec2: !ref <wav2vec2>
-        model: !ref <model>
-        scheduler_model: !ref <lr_annealing_model>
-        scheduler_wav2vec: !ref <lr_annealing_wav2vec>
-        counter: !ref <epoch_counter>
-train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
-    save_file: !ref <train_log>
-error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
-cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
-    split_tokens: True

+# ################################
+# Model: wav2vec2 + DNN + CTC
+# Augmentation: SpecAugment
+# Authors: Titouan Parcollet 2021
+# ################################
+# Seed needs to be set at top of yaml, before objects with parameters are made
+seed: 1234
+__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
+output_folder: !ref model/<seed>
+wer_file: !ref <output_folder>/wer.txt
+save_folder: !ref <output_folder>/save
+train_log: !ref <output_folder>/train_log.txt
+# Local folder where the pretrained encoder checkpoint is stored (passed as save_path below).
+wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint
+# Data files
+data_folder: /path/to/data  # e.g, /localscratch/cv-corpus-5.1-2020-06-22/fr
+train_tsv_file: !ref <data_folder>/train.tsv  # Standard CommonVoice .tsv files
+dev_tsv_file: !ref <data_folder>/dev.tsv  # Standard CommonVoice .tsv files
+test_tsv_file: !ref <data_folder>/test.tsv  # Standard CommonVoice .tsv files
+accented_letters: True
+language: fr # use 'it' for Italian, 'rw' for Kinyarwanda, 'en' for english
+train_csv: Data/train_wavs/train.csv
+valid_csv: Data/dev_wavs/dev.csv
+test_csv:
+    - Data/test_wavs/test.csv
+skip_prep: True # Skip data preparation
+use_language_modelling: True
+ngram_lm_path: languageModel.arpa
+# We remove utterances longer than 10s in the train/dev/test sets as
+# longer sentences certainly correspond to "open microphones".
+avoid_if_longer_than: 10.0
+avoid_if_shorter_than: 1.2
+# Training parameters
+number_of_epochs: 12
+lr: 1.0
+lr_wav2vec: 0.0001
+sorting: ascending
+auto_mix_prec: False
+sample_rate: 16000
+ckpt_interval_minutes: 30 # save checkpoint every N min
+# With data_parallel batch_size is split into N jobs
+# With DDP batch_size is multiplied by N jobs
+# Must be 6 per GPU to fit 16GB of VRAM
+batch_size: 10
+test_batch_size: 4
+dataloader_options:
+    batch_size: !ref <batch_size>
+    num_workers: 6
+test_dataloader_options:
+    batch_size: !ref <test_batch_size>
+    num_workers: 6
+# BPE parameters
+token_type: char  # ["unigram", "bpe", "char"]
+character_coverage: 1.0
+# Model parameters
+# activation: !name:torch.nn.LeakyReLU
+wav2vec_output_dim: 1024
+dnn_neurons: 1024
+freeze_wav2vec: False
+freeze_feature_extractor: True
+dropout: 0.15
+warmup_steps: 500 # The wav2vec 2 model isn't updated for this number of steps
+# Outputs
+output_neurons: 40  # BPE size, index(blank/eos/bos) = 0
+# Decoding parameters
+# Be sure that the bos and eos index match with the BPEs ones
+blank_index: 0
+unk_index: 1
+#
+# Functions and classes
+#
+epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
+    limit: !ref <number_of_epochs>
+enc: !new:speechbrain.nnet.containers.Sequential
+    input_shape: [null, null, !ref <wav2vec_output_dim>]
+    linear1: !name:speechbrain.nnet.linear.Linear
+        n_neurons: !ref <dnn_neurons>
+        bias: True
+    bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
+    activation: !new:torch.nn.LeakyReLU
+    drop: !new:torch.nn.Dropout
+        p: !ref <dropout>
+    linear2: !name:speechbrain.nnet.linear.Linear
+        n_neurons: !ref <dnn_neurons>
+        bias: True
+    bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
+    activation2: !new:torch.nn.LeakyReLU
+    drop2: !new:torch.nn.Dropout
+        p: !ref <dropout>
+    linear3: !name:speechbrain.nnet.linear.Linear
+        n_neurons: !ref <dnn_neurons>
+        bias: True
+    bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
+    activation3: !new:torch.nn.LeakyReLU
+wav2vec2_hub: microsoft/wavlm-large
+wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
+    source: !ref <wav2vec2_hub>
+    output_norm: False
+    freeze: !ref <freeze_wav2vec>
+    freeze_feature_extractor: !ref <freeze_feature_extractor>
+    save_path: !ref <wav2vec2_folder>
+ctc_lin: !new:speechbrain.nnet.linear.Linear
+    input_size: !ref <dnn_neurons>
+    n_neurons: !ref <output_neurons>
+log_softmax: !new:speechbrain.nnet.activations.Softmax
+    apply_log: True
+ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
+    blank_index: !ref <blank_index>
+modules:
+    wav2vec2: !ref <wav2vec2>
+    enc: !ref <enc>
+    ctc_lin: !ref <ctc_lin>
+model: !new:torch.nn.ModuleList
+    - [!ref <enc>, !ref <ctc_lin>]
+model_opt_class: !name:torch.optim.Adadelta
+    lr: !ref <lr>
+    rho: 0.95
+    eps: 1.e-8
+wav2vec_opt_class: !name:torch.optim.Adam
+    lr: !ref <lr_wav2vec>
+lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler
+    initial_value: !ref <lr>
+    improvement_threshold: 0.0025
+    annealing_factor: 0.8
+    patient: 0
+lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler
+    initial_value: !ref <lr_wav2vec>
+    improvement_threshold: 0.0025
+    annealing_factor: 0.9
+    patient: 0
+checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+    checkpoints_dir: !ref <save_folder>
+    recoverables:
+        wav2vec2: !ref <wav2vec2>
+        model: !ref <model>
+        scheduler_model: !ref <lr_annealing_model>
+        scheduler_wav2vec: !ref <lr_annealing_wav2vec>
+        counter: !ref <epoch_counter>
+train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+    save_file: !ref <train_log>
+error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
+cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
+    split_tokens: True