# Configuration for the `flow` object.
# Each `!new:<dotted.path>` tag names a class; the keys nested beneath it are
# presumably passed as constructor keyword arguments (HyperPyYAML convention —
# confirm against the loader used by this project).
#
# NOTE(review): the nesting below was reconstructed from a single-line form.
# The second input_size/output_size pair must belong to `encoder` (otherwise
# the keys would be duplicated); placing `decoder` beside `encoder` and
# `estimator` inside `decoder` follows key order and class names — verify
# against the constructor signatures.
flow: !new:cosyvoice2.flow.flow.CausalMaskedDiffWithXvec
  input_size: 512
  output_size: 80
  spk_embed_dim: 192
  output_type: 'mel'
  vocab_size: 6561

  # Encoder sub-object; its output_size equals the outer input_size (512).
  encoder: !new:cosyvoice2.transformer.upsample_encoder_v2.UpsampleConformerEncoderV2
    input_size: 512
    output_size: 512
    input_layer: 'linear'
    pre_lookahead_len: 3
    num_blocks: 6
    num_up_blocks: 4
    up_stride: 2
    up_scale_factor: 2
    attention_heads: 8
    pos_enc_layer_type: 'rel_pos_espnet'
    selfattention_layer_type: 'rel_selfattn'
    key_bias: true
    linear_units: 2048
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    # Canonical lowercase boolean, consistent with key_bias above.
    normalize_before: true

  # Decoder sub-object wrapping the estimator network.
  decoder: !new:cosyvoice2.flow.flow_matching.CausalConditionalCFM
    inference_cfg_rate: 0.7

    # Estimator sub-object; out_channels equals the outer output_size (80).
    estimator: !new:cosyvoice2.flow.decoder_dit.DiT
      in_channels: 320
      out_channels: 80
      mlp_ratio: 4.0
      depth: 16
      num_heads: 8
      head_dim: 64
      hidden_size: 512