!pip install -U scipy
!git clone https://github.com/neonbjb/tortoise-tts.git
%cd tortoise-tts
!pip install -r requirements.txt
!python setup.py install
!pip install gradio

import os
import time
from datetime import datetime

import gradio as gr
import torchaudio

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices

# Set the Gradio queue flag to disabled

# Exported through the process environment; it is not read back anywhere in
# the visible code.
os.environ["COMMANDLINE_ARGS"] = "--no-gradio-queue"

# Pseudo-voices offered in the dropdowns in addition to the voices on disk.
VOICE_OPTIONS = [
    "random",  # special option for random voice
    "custom_voice",  # special option for custom voice
    "disabled",  # special option for disabled voice
]

def inference(text, emotion, prompt, voice, mic_audio, voice_b, voice_c, preset, seed):
    """Synthesize three speech candidates for ``text`` and log the run.

    Args:
        text: The text to speak.
        emotion: Emotion choice; anything other than ``"None/Custom"`` is
            prepended to the text as ``[I am really <emotion>,]``.
        prompt: Free-form prefix, used only when ``emotion`` is
            ``"None/Custom"`` and the prompt is not blank.
        voice: Primary voice name, or ``"custom_voice"`` to use ``mic_audio``.
        mic_audio: Path to the recorded audio; required for ``"custom_voice"``.
        voice_b: Optional second voice; ``"disabled"`` skips it.
        voice_c: Optional third voice; ``"disabled"`` skips it.
        preset: Forwarded to ``tts.tts_with_preset`` as ``preset``.
        seed: Forwarded to ``tts.tts_with_preset`` as ``use_deterministic_seed``.

    Returns:
        A 4-tuple for the Gradio audio outputs: the first voice sample as
        ``(22050, ndarray)`` (or ``None`` when no raw sample is available),
        followed by three generated candidates as ``(24000, ndarray)``.

    Raises:
        gr.Error: If ``"custom_voice"`` is selected without recorded audio.
    """
    is_custom = voice == "custom_voice"

    # The custom voice has no on-disk name, so it is not part of the lookup list.
    voices = [] if is_custom else [voice]
    voices.extend(extra for extra in (voice_b, voice_c) if extra != "disabled")

    if emotion != "None/Custom":
        text = f"[I am really {emotion.lower()},] {text}"
    elif prompt.strip() != "":
        text = f"[{prompt},] {text}"

    c = None
    if is_custom:
        if mic_audio is None:
            raise gr.Error("Please provide audio from mic when choosing custom voice")
        c = load_audio(mic_audio, 22050)

    if len(voices) <= 1:
        if is_custom:
            voice_samples, conditioning_latents = [c], None
        else:
            voice_samples, conditioning_latents = load_voice(voice)
    else:
        voice_samples, conditioning_latents = load_voices(voices)
        if is_custom:
            voice_samples.append(c)

    # The loaders may hand back no raw samples at all (NOTE(review): upstream
    # appears to return None for the "random" voice and for latent-only
    # voices - confirm). A truthiness test covers both None and an empty
    # list, where the previous len() call raised TypeError on None.
    sample_voice = voice_samples[0] if voice_samples else None

    start_time = time.time()
    gen, _ = tts.tts_with_preset(
        text,
        voice_samples=voice_samples,
        conditioning_latents=conditioning_latents,
        preset=preset,
        use_deterministic_seed=seed,
        return_deterministic_state=True,
        k=3,
    )

    # The logged text is arbitrary user input, so pin the encoding rather
    # than rely on the platform default.
    with open("Tortoise_TTS_Runs.log", "a", encoding="utf-8") as f:
        f.write(
            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
        )

    # Leave the sample output empty instead of crashing on None.squeeze().
    sample_audio = (
        None
        if sample_voice is None
        else (22050, sample_voice.squeeze().cpu().numpy())
    )

    return (
        sample_audio,
        (24000, gen[0].squeeze().cpu().numpy()),
        (24000, gen[1].squeeze().cpu().numpy()),
        (24000, gen[2].squeeze().cpu().numpy()),
    )

def main():
    """Build the Gradio interface around ``inference`` and launch it.

    The interface is launched with ``share=True``.
    """
    # Custom HTML for the title
    # NOTE(review): the original markup around this text appears to have been
    # stripped when the file was captured; only the visible text survives.
    # Restore the intended HTML tags if they are known.
    title_html = "\n\nRJ VOICE CLONING\n\n"

    # List the voices directory once and reuse it for all three dropdowns.
    # Sorting gives a stable order; os.listdir() does not guarantee one.
    voice_choices = sorted(os.listdir(os.path.join("tortoise", "voices"))) + VOICE_OPTIONS

    # Interface components
    text = gr.Textbox(lines=4, label="Text:")
    emotion = gr.Radio(
        ["None/Custom", "Happy", "Sad", "Angry", "Disgusted", "Arrogant"],
        value="None/Custom",
        label="Select emotion:",
        type="value",
    )
    prompt = gr.Textbox(lines=1, label="Enter prompt if [Custom] emotion:")
    preset = gr.Radio(
        ["ultra_fast", "fast", "standard", "high_quality"],
        value="fast",
        label="Preset mode (determines quality with tradeoff over speed):",
        type="value",
    )
    voice = gr.Dropdown(
        list(voice_choices),
        value="angie",  # Default voice
        label="Select voice:",
        type="value",
    )
    mic_audio = gr.Audio(
        label="Record voice (when selected custom_voice):",
        type="filepath",
    )
    voice_b = gr.Dropdown(
        list(voice_choices),
        value="disabled",
        label="(Optional) Select second voice:",
        type="value",
    )
    voice_c = gr.Dropdown(
        list(voice_choices),
        value="disabled",
        label="(Optional) Select third voice:",
        type="value",
    )
    seed = gr.Number(value=0, precision=0, label="Seed (for reproducibility):")

    selected_voice = gr.Audio(label="Sample of selected voice (first):")
    output_audio_1 = gr.Audio(label="Output [Candidate 1]:")
    output_audio_2 = gr.Audio(label="Output [Candidate 2]:")
    output_audio_3 = gr.Audio(label="Output [Candidate 3]:")

    # Create the Gradio interface; input order must match inference()'s
    # positional parameters.
    interface = gr.Interface(
        fn=inference,
        inputs=[text, emotion, prompt, voice, mic_audio, voice_b, voice_c, preset, seed],
        outputs=[selected_voice, output_audio_1, output_audio_2, output_audio_3],
        title="RJ VOICE CLONING",
        description=title_html,
        css=".gradio-container { background-color: black; color: orange; }",
    )

    # Launch the interface
    interface.launch(share=True)

if __name__ == "__main__":
    # inference() reads this module-level name, so it has to exist before the
    # interface starts serving requests.
    tts = TextToSpeech()

    # Write a header so each session's entries are grouped in the shared log.
    with open("Tortoise_TTS_Runs.log", "a", encoding="utf-8") as f:
        f.write(
            f"\n\n-------------------------Tortoise TTS Logs, {datetime.now()}-------------------------\n"
        )

    main()
Downloads last month

-

Downloads are not tracked for this model. How to track
Inference Providers NEW
This model is not currently available via any of the supported Inference Providers.
The model cannot be deployed to the HF Inference API: The model has no library tag.

Model tree for midhyaraj/voiceclone

Base model

nvidia/NVLM-D-72B
Finetuned
(11)
this model