"""Tokenization classes for vibevoice."""

from typing import List, Optional, Union

from transformers.utils import logging
from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast

logger = logging.get_logger(__name__)
|
|
class VibeVoiceTextTokenizer(Qwen2Tokenizer):
    """
    Construct a VibeVoice tokenizer. Based on the Qwen2 tokenizer with additional special tokens for speech.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not used for VibeVoice.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input so that the leading word is treated like any other word.
        add_special_tokens (`bool`, *optional*, defaults to `True`):
            Whether or not to add special tokens when encoding.
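
    Example (a minimal usage sketch; the file paths are placeholders for Qwen2-style
    `vocab.json`/`merges.txt` files that already contain the `<|vision_*|>` tokens):

        >>> tokenizer = VibeVoiceTextTokenizer("vocab.json", "merges.txt")
        >>> text = "Speaker 1: Hello. <|vision_start|><|vision_pad|><|vision_end|>"
        >>> tokenizer.speech_diffusion_id in tokenizer(text)["input_ids"]
        True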
|
""" |
|
|
|
model_input_names = ["input_ids", "attention_mask"] |
|
|
|
def __init__( |
|
self, |
|
vocab_file, |
|
merges_file, |
|
errors="replace", |
|
unk_token="<|endoftext|>", |
|
bos_token=None, |
|
eos_token="<|endoftext|>", |
|
pad_token="<|endoftext|>", |
|
add_prefix_space=False, |
|
add_special_tokens=True, |
|
**kwargs, |
|
): |
|
super().__init__( |
|
vocab_file=vocab_file, |
|
merges_file=merges_file, |
|
errors=errors, |
|
unk_token=unk_token, |
|
bos_token=bos_token, |
|
eos_token=eos_token, |
|
pad_token=pad_token, |
|
add_prefix_space=add_prefix_space, |
|
add_special_tokens=add_special_tokens, |
|
**kwargs, |
|
) |
|
|
|
|
|
self._add_vibevoice_special_tokens() |
|
|
|
    def _add_vibevoice_special_tokens(self) -> int:
        """Add VibeVoice-specific special tokens."""
        special_tokens = {
            "additional_special_tokens": [
                "<|vision_start|>",  # speech segment start
                "<|vision_end|>",  # speech segment end
                "<|vision_pad|>",  # speech diffusion placeholder
            ]
        }
        num_added = self.add_special_tokens(special_tokens)

        # Cache the ids of the speech tokens (these strings already exist in the Qwen2
        # vocabulary, so registering them usually adds no new entries).
        self._speech_start_id = self.convert_tokens_to_ids("<|vision_start|>")
        self._speech_end_id = self.convert_tokens_to_ids("<|vision_end|>")
        self._speech_diffusion_id = self.convert_tokens_to_ids("<|vision_pad|>")

        self._eos_id = self.convert_tokens_to_ids("<|endoftext|>")

        return num_added
|
    @property
    def eos_id(self) -> int:
        """Id of the end of sequence token."""
        return self._eos_id

    @property
    def speech_start_id(self) -> int:
        """Id of the speech start token."""
        return self._speech_start_id

    @property
    def speech_end_id(self) -> int:
        """Id of the speech end token."""
        return self._speech_end_id

    @property
    def speech_diffusion_id(self) -> int:
        """Id of the speech diffusion token."""
        return self._speech_diffusion_id

    @property
    def pad_id(self) -> int:
        """Id used for padding label sequences (-100, so padded positions are ignored by the loss)."""
        return -100
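
    # Note: `pad_id` (-100) is deliberately different from `pad_token_id`. A hypothetical
    # label-collation step could pad target sequences with it so that padded positions are
    # ignored by `torch.nn.CrossEntropyLoss` (which skips targets equal to -100), e.g.:
    #
    #     labels = labels + [tokenizer.pad_id] * (max_length - len(labels))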
|
|
|
|
|
class VibeVoiceTextTokenizerFast(Qwen2TokenizerFast):
    """
    Construct a "fast" VibeVoice tokenizer (backed by HuggingFace's *tokenizers* library).
    Based on the Qwen2 tokenizer with additional special tokens for speech.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to a [tokenizers](https://github.com/huggingface/tokenizers) file.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not used for VibeVoice.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input so that the leading word is treated like any other word.
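
    Example (a minimal usage sketch; `tokenizer.json` is a placeholder for a Qwen2-style
    tokenizer file that already contains the `<|vision_*|>` tokens):

        >>> tokenizer = VibeVoiceTextTokenizerFast(tokenizer_file="tokenizer.json")
        >>> tokenizer.speech_start_id == tokenizer.convert_tokens_to_ids("<|vision_start|>")
        True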
|
""" |
|
|
|
model_input_names = ["input_ids", "attention_mask"] |
|
|
|
def __init__( |
|
self, |
|
vocab_file=None, |
|
merges_file=None, |
|
tokenizer_file=None, |
|
unk_token="<|endoftext|>", |
|
bos_token=None, |
|
eos_token="<|endoftext|>", |
|
pad_token="<|endoftext|>", |
|
add_prefix_space=False, |
|
**kwargs, |
|
): |
|
super().__init__( |
|
vocab_file=vocab_file, |
|
merges_file=merges_file, |
|
tokenizer_file=tokenizer_file, |
|
unk_token=unk_token, |
|
bos_token=bos_token, |
|
eos_token=eos_token, |
|
pad_token=pad_token, |
|
add_prefix_space=add_prefix_space, |
|
**kwargs, |
|
) |
|
|
|
|
|
self._add_vibevoice_special_tokens() |
|
|
|
    def _add_vibevoice_special_tokens(self) -> int:
        """Add VibeVoice-specific special tokens."""
        special_tokens = {
            "additional_special_tokens": [
                "<|vision_start|>",  # speech segment start
                "<|vision_end|>",  # speech segment end
                "<|vision_pad|>",  # speech diffusion placeholder
            ]
        }
        num_added = self.add_special_tokens(special_tokens)

        # Cache the ids of the speech tokens (these strings already exist in the Qwen2
        # vocabulary, so registering them usually adds no new entries).
        self._speech_start_id = self.convert_tokens_to_ids("<|vision_start|>")
        self._speech_end_id = self.convert_tokens_to_ids("<|vision_end|>")
        self._speech_diffusion_id = self.convert_tokens_to_ids("<|vision_pad|>")

        self._eos_id = self.eos_token_id
        self._pad_id = self.convert_tokens_to_ids("<|image_pad|>")

        return num_added
|
    @property
    def eos_id(self) -> int:
        """Id of the end of sequence token."""
        return self._eos_id

    @property
    def speech_start_id(self) -> int:
        """Id of the speech start token."""
        return self._speech_start_id

    @property
    def speech_end_id(self) -> int:
        """Id of the speech end token."""
        return self._speech_end_id

    @property
    def speech_diffusion_id(self) -> int:
        """Id of the speech diffusion token."""
        return self._speech_diffusion_id

    @property
    def pad_id(self) -> int:
        """Id used for padding (the `<|image_pad|>` token; note this differs from the slow tokenizer, which returns -100)."""
        return self._pad_id
|
|
__all__ = [
    "VibeVoiceTextTokenizer",
    "VibeVoiceTextTokenizerFast",
]
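
# Usage sketch (illustrative only, not part of the public API): locating the speech
# placeholder positions in an encoded sequence, assuming a tokenizer built from
# Qwen2-style files ("tokenizer.json" below is a placeholder path).
#
#     tok = VibeVoiceTextTokenizerFast(tokenizer_file="tokenizer.json")
#     prompt = "Speaker 0: Hi!" + "<|vision_start|>" + "<|vision_pad|>" * 8 + "<|vision_end|>"
#     ids = tok(prompt)["input_ids"]
#     speech_positions = [i for i, t in enumerate(ids) if t == tok.speech_diffusion_id]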