# Flow model configuration. Each `!new:` tag names the class that is built
# from the mapping nested beneath it (HyperPyYAML-style object construction;
# confirm against the loader actually used).
#
# NOTE(review): this nesting was reconstructed from a flattened copy in which
# every line carried a stray trailing `|` and all indentation was lost. The
# repeated `input_size` / `output_size` keys show that the encoder settings
# must form their own mapping; verify the placement of `decoder` and
# `estimator` against the constructor signatures of the tagged classes.
flow: !new:cosyvoice2.flow.flow.CausalMaskedDiffWithXvec
  input_size: 512
  output_size: 80
  spk_embed_dim: 192
  output_type: 'mel'
  vocab_size: 6561

  # Arguments for the encoder object.
  encoder: !new:cosyvoice2.transformer.upsample_encoder_v2.UpsampleConformerEncoderV2
    input_size: 512
    output_size: 512
    input_layer: 'linear'
    pre_lookahead_len: 3
    num_blocks: 6
    num_up_blocks: 4
    up_stride: 2
    up_scale_factor: 2
    attention_heads: 8
    pos_enc_layer_type: 'rel_pos_espnet'
    selfattention_layer_type: 'rel_selfattn'
    key_bias: true
    linear_units: 2048
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    normalize_before: true

  # Arguments for the decoder object.
  decoder: !new:cosyvoice2.flow.flow_matching.CausalConditionalCFM
    inference_cfg_rate: 0.7

    # Arguments for the estimator object handed to the decoder.
    estimator: !new:cosyvoice2.flow.decoder_dit.DiT
      in_channels: 320
      out_channels: 80
      mlp_ratio: 4.0
      depth: 16
      num_heads: 8
      head_dim: 64
      hidden_size: 512
|