|
--- |
|
license: mit |
|
datasets: |
|
- OpenSpeechHub/mls_eng_10k_snac_qwen |
|
language: |
|
- en |
|
base_model: |
|
- Qwen/Qwen3-0.6B |
|
pipeline_tag: text-to-speech |
|
library_name: transformers |
|
--- |
|
## Overview |
|
VyvoTTS-v0-Qwen3-0.6B is a Text-to-Speech model based on Qwen3-0.6B, trained to produce natural-sounding English speech. |
|
|
|
- **Type:** Text-to-Speech |
|
- **Language:** English |
|
- **License:** MIT |
|
- **Params:** ~810M |
|
|
|
> **Note:** This model has a relatively high word error rate (WER) because it was trained on only a ~10,000-hour dataset. To improve accuracy, use it as a pretrained base and continue training on more data. |
|
> The Emilia dataset is a good choice for continued pretraining. Once that is complete, fine-tune on single-speaker speech; a rough sketch of this workflow follows below. |
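
The sketch below outlines what continued training on top of this checkpoint *could* look like. It is an assumption-laden outline rather than a tested recipe: the dataset name is a placeholder, the data is assumed to already be tokenized into this model's text-plus-audio token format (as in `OpenSpeechHub/mls_eng_10k_snac_qwen`), and the hyperparameters are illustrative only.

```python
# Hypothetical sketch: continued pretraining / fine-tuning on top of this checkpoint.
# Assumes a dataset that already contains an "input_ids" column in the model's
# text + audio token format; names and hyperparameters are placeholders.
from unsloth import FastLanguageModel
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Vyvo/VyvoTTS-v0-Qwen3-0.6B",
    max_seq_length = 8192,
    load_in_4bit = False,
)

# LoRA adapters keep the memory footprint small; full fine-tuning is also possible.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64,
    lora_alpha = 64,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
)

dataset = load_dataset("your-org/your_snac_tokenized_dataset", split="train")  # placeholder

trainer = Trainer(
    model = model,
    train_dataset = dataset,
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False),
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 8,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        bf16 = True,
        output_dir = "outputs",
    ),
)
trainer.train()
```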
|
|
|
## Usage |
|
Below is an example of using the model with `unsloth` and `SNAC` for speech generation: |
|
|
|
```python
from unsloth import FastLanguageModel
import torch
from snac import SNAC

# Load the TTS model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Vyvo/VyvoTTS-v0-Qwen3-0.6B",
    max_seq_length = 8192,
    dtype = None,
    load_in_4bit = False,
)

# Load the SNAC codec used to decode audio tokens back into a waveform
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()

# Special-token IDs: control and audio tokens sit after the text vocabulary
tokeniser_length = 151669
start_of_text = 151643
end_of_text = 151645

start_of_speech = tokeniser_length + 1
end_of_speech = tokeniser_length + 2
start_of_human = tokeniser_length + 3
end_of_human = tokeniser_length + 4
pad_token = tokeniser_length + 7

audio_tokens_start = tokeniser_length + 10

prompts = ["Hey there my name is Elise, and I'm a speech generation model that can sound like a person."]
chosen_voice = None  # set to a speaker name to prefix the prompt, or None for single-speaker use

FastLanguageModel.for_inference(model)
snac_model.to("cpu")

prompts_ = [(f"{chosen_voice}: " + p) if chosen_voice else p for p in prompts]

# Tokenize each prompt
all_input_ids = []
for prompt in prompts_:
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    all_input_ids.append(input_ids)

# Wrap each prompt as: <start_of_human> text <end_of_text> <end_of_human>
start_token = torch.tensor([[start_of_human]], dtype=torch.int64)
end_tokens = torch.tensor([[end_of_text, end_of_human]], dtype=torch.int64)

all_modified_input_ids = []
for input_ids in all_input_ids:
    modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
    all_modified_input_ids.append(modified_input_ids)

# Left-pad all prompts to the same length and build attention masks
all_padded_tensors, all_attention_masks = [], []
max_length = max([m.shape[1] for m in all_modified_input_ids])
for m in all_modified_input_ids:
    padding = max_length - m.shape[1]
    padded_tensor = torch.cat([torch.full((1, padding), pad_token, dtype=torch.int64), m], dim=1)
    attention_mask = torch.cat([torch.zeros((1, padding), dtype=torch.int64), torch.ones((1, m.shape[1]), dtype=torch.int64)], dim=1)
    all_padded_tensors.append(padded_tensor)
    all_attention_masks.append(attention_mask)

input_ids = torch.cat(all_padded_tensors, dim=0).to("cuda")
attention_mask = torch.cat(all_attention_masks, dim=0).to("cuda")

# Generate audio tokens until the end-of-speech token is produced
generated_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=1200,
    do_sample=True,
    temperature=0.6,
    top_p=0.95,
    repetition_penalty=1.1,
    num_return_sequences=1,
    eos_token_id=end_of_speech,
    use_cache=True,
)

# Keep only the tokens generated after the last <start_of_speech> marker
token_to_find = start_of_speech
token_to_remove = end_of_speech
token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)

if len(token_indices[1]) > 0:
    last_occurrence_idx = token_indices[1][-1].item()
    cropped_tensor = generated_ids[:, last_occurrence_idx + 1:]
else:
    cropped_tensor = generated_ids

# Drop any <end_of_speech> tokens from each row
processed_rows = []
for row in cropped_tensor:
    masked_row = row[row != token_to_remove]
    processed_rows.append(masked_row)

# Convert token IDs to SNAC codes: trim each row to a multiple of 7
# (one SNAC frame = 7 tokens) and shift IDs back into the codec's code range
code_lists = []
for row in processed_rows:
    row_length = row.size(0)
    new_length = (row_length // 7) * 7
    trimmed_row = row[:new_length]
    trimmed_row = [t.item() - audio_tokens_start for t in trimmed_row]
    code_lists.append(trimmed_row)

# Split each 7-token frame across SNAC's three codebook layers (1 + 2 + 4 codes per frame)
def redistribute_codes(code_list):
    layer_1, layer_2, layer_3 = [], [], []
    for i in range(len(code_list) // 7):
        layer_1.append(code_list[7 * i])
        layer_2.append(code_list[7 * i + 1] - 4096)
        layer_3.append(code_list[7 * i + 2] - (2 * 4096))
        layer_3.append(code_list[7 * i + 3] - (3 * 4096))
        layer_2.append(code_list[7 * i + 4] - (4 * 4096))
        layer_3.append(code_list[7 * i + 5] - (5 * 4096))
        layer_3.append(code_list[7 * i + 6] - (6 * 4096))
    codes = [
        torch.tensor(layer_1).unsqueeze(0),
        torch.tensor(layer_2).unsqueeze(0),
        torch.tensor(layer_3).unsqueeze(0),
    ]
    audio_hat = snac_model.decode(codes)
    return audio_hat

# Decode each code list to a 24 kHz waveform
my_samples = []
for code_list in code_lists:
    samples = redistribute_codes(code_list)
    my_samples.append(samples)

# Play the results in a notebook
from IPython.display import display, Audio
if len(prompts) != len(my_samples):
    raise Exception("Number of prompts and samples do not match")
else:
    for i in range(len(my_samples)):
        print(prompts[i])
        samples = my_samples[i]
        display(Audio(samples.detach().squeeze().to("cpu").numpy(), rate=24000))

del my_samples, samples
```
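
Outside of a notebook, the decoded waveforms can be written to disk instead of played inline. A minimal sketch using the third-party `soundfile` package (not a dependency of this model) is shown below; run it before the final `del my_samples, samples` line in the snippet above.

```python
# Optional: save each decoded waveform as a 24 kHz WAV file.
# Requires `pip install soundfile`; run before `my_samples` is deleted.
import soundfile as sf

for i, sample in enumerate(my_samples):
    audio = sample.detach().squeeze().to("cpu").numpy()
    sf.write(f"vyvotts_output_{i}.wav", audio, samplerate=24000)
```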
|
|
|
## Citation |
|
|
|
If you use this model, please cite: |
|
|
|
```bibtex |
|
@misc{VyvoTTS-v0-Qwen3-0.6B, |
|
title={VyvoTTS-v0-Qwen3-0.6B}, |
|
author={Vyvo}, |
|
year={2025}, |
|
howpublished={\url{https://huggingface.co/Vyvo/VyvoTTS-v0-Qwen3-0.6B}} |
|
} |
|
``` |
|
|