danhtran2mind/Vi-F5-TTS

Download Model Checkpoints

from huggingface_hub import snapshot_download

# Fetch the model repository into the local directory 'Vi-F5-TTS'
hub_repo_id = "danhtran2mind/Vi-F5-TTS"
target_dir = "Vi-F5-TTS"
snapshot_download(repo_id=hub_repo_id, local_dir=target_dir)

cd Vi-F5-TTS

Install Dependencies

pip install git+https://github.com/danhtran2mind/F5-TTS.git

Inference

# Replace the two <...> placeholders with your own reference clip and its transcript.
# They are quoted so the shell does not parse "<" and ">" as redirections.
f5-tts_infer-cli \
    --model_cfg "vi-fine-tuned-f5-tts.yaml" \
    --ckpt_file "model_last.pt" \
    --vocab_file "vocab.txt" \
    --ref_audio "<path_to_your_reference_audio>" \
    --ref_text "<text_of_your_reference_audio>" \
    --gen_text "Theo đơn vị này, hiện nay do chịu ảnh hưởng của cơn bão số một, lượng rác từ đầu nguồn tấp vào bờ biển rất nhiều. Để giữ cho bãi biển luôn xanh, sạch, đẹp, ban quản lý xin kêu gọi các bạn đoàn viên, tình nguyện viên và bà con nhân dân hãy chung tay cùng ban quản lý dọn vệ sinh môi trường tại tuyến biển Hoàng Sa - Võ Nguyên Giáp - Trường Sa và tuyến Nguyễn Tất Thành."

Clear Inference code

from f5_tts.infer.utils_infer import (
    cfg_strength,
    cross_fade_duration,
    device,
    fix_duration,
    infer_process,
    load_model,
    load_vocoder,
    mel_spec_type,
    nfe_step,
    preprocess_ref_audio_text,
    remove_silence_for_generated_wav,
    speed,
    sway_sampling_coef,
    target_rms,
)
from omegaconf import OmegaConf
from hydra.utils import get_class
import torch

import re
import os
import soundfile as sf
from pathlib import Path
import numpy as np
import tomli
from importlib.resources import files
from unidecode import unidecode

# --- Inference configuration -------------------------------------------------
# NOTE(review): the CLI example in this document passes "model_last.pt" (no
# "ckpts/" prefix) — confirm which location matches your checkout.
ckpt_file = "ckpts/model_last.pt"
vocoder_name = "vocos"
vocab_file = "vocab.txt"
# Rebinds the `device` name imported from f5_tts.infer.utils_infer.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load TTS model
model_cfg = OmegaConf.load("vi-fine-tuned-f5-tts.yaml")
model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
model_arc = model_cfg.model.arch

ema_model = load_model(
    model_cls, model_arc, ckpt_file, mel_spec_type=vocoder_name, vocab_file=vocab_file, device=device
)

# Replace both placeholders before running: the path of the reference clip and
# the transcript of what is spoken in it.
ref_audio = "<path_to_your_reference_audio>"
ref_text = "<text_of_your_reference_audio>"
gen_text = "Theo đơn vị này, hiện nay do chịu ảnh hưởng của cơn bão số một, lượng rác từ đầu nguồn tấp vào bờ biển rất nhiều. Để giữ cho bãi biển luôn xanh, sạch, đẹp, ban quản lý xin kêu gọi các bạn đoàn viên, tình nguyện viên và bà con nhân dân hãy chung tay cùng ban quản lý dọn vệ sinh môi trường tại tuyến biển Hoàng Sa - Võ Nguyên Giáp - Trường Sa và tuyến Nguyễn Tất Thành."

# Extra voices may be registered here; infer() always adds the "main" voice.
voices = {}
save_chunk = True
output_dir = "test"
output_file = "basic_test.wav"
wave_path = Path(output_dir) / output_file
remove_silence = True

if vocoder_name == "vocos":
    vocoder_local_path = "ckpts/vocos-mel-24khz"
elif vocoder_name == "bigvgan":
    vocoder_local_path = "ckpts/bigvgan_v2_24khz_100band_256x"
else:
    # Fail early instead of hitting a NameError on vocoder_local_path below.
    raise ValueError(f"Unsupported vocoder_name: {vocoder_name!r}")

# NOTE(review): with is_local=False the local_path is presumably unused and the
# vocoder weights are fetched remotely — confirm against load_vocoder.
vocoder = load_vocoder(
    vocoder_name=vocoder_name,
    is_local=False,
    local_path=vocoder_local_path,
    device=device,
)

if save_chunk:
    # Per-chunk WAV files are written here by infer().
    output_chunk_dir = os.path.join(output_dir, f"{Path(output_file).stem}_chunks")
    os.makedirs(output_chunk_dir, exist_ok=True)

def infer():
    """Synthesize ``gen_text`` with the configured reference voice(s) and save it.

    Uses the module-level settings (``ref_audio``, ``ref_text``, ``gen_text``,
    ``voices``, ``ema_model``, ``vocoder``, ``output_dir``, ``wave_path``,
    ``save_chunk``, ``remove_silence`` ...). ``gen_text`` is split in front of
    every ``[name]`` tag; each chunk is synthesized with the tagged voice and
    falls back to ``"main"`` when the tag is missing or unknown. The segments
    are concatenated and written to ``wave_path``.

    Returns:
        tuple: ``(final_sample_rate, final_wave, wave_path)``. ``final_wave``
        is the concatenated waveform as produced here; it is not re-read from
        disk after the optional silence removal.

    Raises:
        ValueError: If ``gen_text`` contains no non-blank chunk, so there is
            nothing to synthesize.
    """
    # The module-level reference clip is always registered as the "main" voice.
    voices["main"] = {"ref_audio": ref_audio, "ref_text": ref_text}

    for voice in voices:
        print("Voice:", voice)
        print("ref_audio ", voices[voice]["ref_audio"])
        voices[voice]["ref_audio"], voices[voice]["ref_text"] = preprocess_ref_audio_text(
            voices[voice]["ref_audio"], voices[voice]["ref_text"]
        )
        print("ref_audio_", voices[voice]["ref_audio"], "\n\n")

    generated_audio_segments = []
    final_sample_rate = None
    # Zero-width split: cut right before each "[voice]" tag, keeping the tag.
    reg1 = r"(?=\[\w+\])"
    chunks = re.split(reg1, gen_text)
    print("chunks chunks ", chunks)
    reg2 = r"\[(\w+)\]"
    for text in chunks:
        if not text.strip():
            continue
        match = re.match(reg2, text)
        if match:
            voice = match[1]
        else:
            print("No voice tag found, using main.")
            voice = "main"
        if voice not in voices:
            print(f"Voice {voice} not found, using main.")
            voice = "main"
        text = re.sub(reg2, "", text)
        ref_audio_ = voices[voice]["ref_audio"]
        ref_text_ = voices[voice]["ref_text"]
        gen_text_ = text.strip()
        print(f"Voice: {voice}")
        audio_segment, final_sample_rate, spectrogram = infer_process(
            ref_audio_,
            ref_text_,
            gen_text_,
            ema_model,
            vocoder,
            mel_spec_type=vocoder_name,
            target_rms=target_rms,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            cfg_strength=cfg_strength,
            sway_sampling_coef=sway_sampling_coef,
            speed=speed,
            fix_duration=fix_duration,
            device=device,
        )
        generated_audio_segments.append(audio_segment)

        if save_chunk:
            chunk_label = gen_text_
            if len(chunk_label) > 200:
                chunk_label = chunk_label[:200] + " ... "
            # Path separators in the (transliterated) text would otherwise be
            # interpreted as sub-directories that do not exist.
            chunk_label = re.sub(r"[\\/]", "_", unidecode(chunk_label))
            sf.write(
                os.path.join(output_chunk_dir, f"{len(generated_audio_segments) - 1}_{chunk_label}.wav"),
                audio_segment,
                final_sample_rate,
            )

    if not generated_audio_segments:
        # Previously this fell through to the return and raised UnboundLocalError.
        raise ValueError("gen_text contains no text to synthesize.")

    final_wave = np.concatenate(generated_audio_segments)

    os.makedirs(output_dir, exist_ok=True)

    # Write by path directly; no separate open() handle is kept on the file
    # while it is written and post-processed.
    out_path = str(wave_path)
    sf.write(out_path, final_wave, final_sample_rate)
    # Remove silence
    if remove_silence:
        remove_silence_for_generated_wav(out_path)
    print(out_path)
    return final_sample_rate, final_wave, wave_path

# Run the synthesis; infer() also writes the result to `wave_path`.
final_sample_rate, final_wave, wave_path = infer()

# Wrap the in-memory waveform in an IPython Audio object; it is left as the
# final bare expression — presumably so a notebook cell renders a player.
from IPython.display import Audio
Audio(data=final_wave, rate=final_sample_rate)

Dependencies Version

Python Version

Python v3.11.11

danhtran2mind
/

Vi-F5-TTS

Download Model Checkpoints

Install Dependencies

Inference

Clear Inference code

Dependencies Version

Python Version

Model tree for danhtran2mind/Vi-F5-TTS

Dataset used to train danhtran2mind/Vi-F5-TTS

Space using danhtran2mind/Vi-F5-TTS 1

Collection including danhtran2mind/Vi-F5-TTS

DanhTran2Mind's TTS