|
|
|
|
|
|
|
import argparse
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import torch
from transformers.utils import logging

from vibevoice.modular.configuration_vibevoice import (
    VibeVoiceConfig
)
from vibevoice.modular.modeling_vibevoice import VibeVoiceForConditionalGeneration
|
|
|
# Module-level logger, following the transformers logging convention.
logger = logging.get_logger(__name__)
|
|
|
def convert_vibevoice_nnscaler_checkpoint_to_hf(
    checkpoint_path: str,
    pytorch_dump_folder_path: str,
    config_path: Optional[str] = None,
):
    """
    Convert a nnscaler VibeVoice checkpoint to HuggingFace format.

    Args:
        checkpoint_path: Path to the nnscaler checkpoint (``.pt`` file).
        pytorch_dump_folder_path: Output directory for the converted HF model,
            its config, and the processor config (created if missing).
        config_path: Optional path to a config JSON file that overrides the
            config extracted from the checkpoint.

    Raises:
        FileNotFoundError: If the initial config referenced by the checkpoint
            is not found under the sibling ``configs`` directory.

    NOTE(review): the original docstring also claimed tensor-parallel
    checkpoint support, but no part-merging logic is visible here — this
    function loads a single checkpoint file. Confirm against the caller.
    """
    logger.info(f"Loading regular checkpoint from {checkpoint_path}")
    # weights_only=False is required: the checkpoint stores arbitrary Python
    # objects (the 'train_args' dicts read below), and torch >= 2.6 defaults
    # to weights_only=True, which would reject them. Only load checkpoints
    # from trusted sources — this is a full pickle load.
    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)

    # Paths to the original model config and tokenizer recorded by training.
    init_config_name = checkpoint['train_args']['vars']['model_args']['config_path']['relative_path']
    pretrained_name = checkpoint['train_args']['vars']['data_args']['tokenizer_path']

    init_config = _load_init_config(init_config_name)

    tie_word_embeddings = init_config['decoder_config'].get('tie_word_embeddings', True)
    logger.info(f"Tie word embeddings: {tie_word_embeddings}")

    # Enable KV caching for inference in the exported config.
    init_config['decoder_config']['use_cache'] = True
    config = VibeVoiceConfig(**init_config, tie_word_embeddings=tie_word_embeddings)

    # Strip the nnscaler 'model.model.' prefix down to the HF 'model.' prefix;
    # keys outside that namespace are dropped (lm_head is handled separately).
    model_state_dict = {
        k.replace('model.model.', 'model.'): v
        for k, v in checkpoint["model"].items()
        if k.startswith('model.model.')
    }
    if not tie_word_embeddings and 'model.lm_head.weight' in checkpoint["model"]:
        # An untied LM head keeps its own output projection weights.
        model_state_dict['lm_head.weight'] = checkpoint["model"]['model.lm_head.weight']

    if config_path:
        logger.info(f"Loading config from {config_path}")
        with open(config_path, 'r') as f:
            config_dict = json.load(f)
        config = VibeVoiceConfig.from_dict(config_dict)

    # Instantiate the model with bfloat16 parameters; restore the process-wide
    # default dtype even if construction raises.
    original_dtype = torch.get_default_dtype()
    torch.set_default_dtype(torch.bfloat16)
    try:
        logger.info("Creating HuggingFace VibeVoiceForConditionalGeneration model")
        model = VibeVoiceForConditionalGeneration(config)
    finally:
        torch.set_default_dtype(original_dtype)

    logger.info("Loading weights into model")
    # strict=False: tied embeddings / dropped prefixes legitimately produce
    # key mismatches, which are surfaced as warnings below.
    missing_keys, unexpected_keys = model.load_state_dict(model_state_dict, strict=False)
    if missing_keys:
        logger.warning(f"Missing keys: {missing_keys}")
    if unexpected_keys:
        logger.warning(f"Unexpected keys: {unexpected_keys}")

    os.makedirs(pytorch_dump_folder_path, exist_ok=True)

    logger.info(f"Saving model to {pytorch_dump_folder_path}")
    config.save_pretrained(pytorch_dump_folder_path)

    _save_processor_config(pytorch_dump_folder_path, pretrained_name)

    logger.info("Saving model weights with sharding...")
    model.save_pretrained(
        pytorch_dump_folder_path,
        max_shard_size="2GB",
        safe_serialization=True
    )
    logger.info(f"Model weights saved to {pytorch_dump_folder_path}")

    logger.info("Conversion complete!")

    # Round-trip check: reload the export to make sure it is actually usable.
    logger.info("Verifying saved model...")
    VibeVoiceForConditionalGeneration.from_pretrained(pytorch_dump_folder_path)
    logger.info("Model successfully loaded from saved checkpoint!")


def _load_init_config(init_config_name: str) -> dict:
    """Load the initial model config JSON from the sibling ``configs`` directory.

    Only the basename of ``init_config_name`` is used; the file is resolved
    relative to this script's grandparent directory.
    """
    init_config_path = Path(__file__).parent.parent / 'configs' / init_config_name.split('/')[-1]
    if not init_config_path.exists():
        raise FileNotFoundError(f"Initial config file {init_config_path} not found. Please provide a valid path.")
    logger.info(f"Loading initial config from {init_config_path}")
    with open(init_config_path, 'r') as f:
        return json.load(f)


def _save_processor_config(pytorch_dump_folder_path: str, pretrained_name: str) -> None:
    """Write the VibeVoiceProcessor ``preprocessor_config.json`` into the dump folder."""
    logger.info("Saving VibeVoiceProcessor configuration")
    processor_config = {
        "processor_class": "VibeVoiceProcessor",
        "speech_tok_compress_ratio": 3200,
        "db_normalize": True,
        "audio_processor": {
            "feature_extractor_type": "VibeVoiceTokenizerProcessor",
            "sampling_rate": 24000,
            "normalize_audio": True,
            "target_dB_FS": -25,
            "eps": 1e-6,
        },
        "language_model_pretrained_name": pretrained_name,
    }
    processor_config_path = os.path.join(pytorch_dump_folder_path, "preprocessor_config.json")
    with open(processor_config_path, 'w') as f:
        json.dump(processor_config, f, indent=2)
    logger.info(f"Saved processor config to {processor_config_path}")
|
|
|
def main():
    """Parse command-line arguments and run the checkpoint conversion."""
    cli_args = _build_arg_parser().parse_args()
    convert_vibevoice_nnscaler_checkpoint_to_hf(
        cli_args.nnscaler_checkpoint_path,
        cli_args.pytorch_dump_folder_path,
        cli_args.config_path,
    )


def _build_arg_parser():
    """Construct the argument parser for this conversion script."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--nnscaler_checkpoint_path",
        type=str,
        required=True,
        help=(
            "Path to the fairseq checkpoint (.pt file). For tensor parallel checkpoints, "
            "provide any one of the part files (e.g., checkpoint_1_5000-model_part-0.pt), "
            "and the script will automatically detect and merge all parts."
        ),
    )
    parser.add_argument(
        "--pytorch_dump_folder_path",
        type=str,
        required=True,
        help="Path to the output PyTorch model directory",
    )
    parser.add_argument(
        "--config_path",
        type=str,
        default=None,
        help="Optional path to a config JSON file to override extracted config",
    )
    return parser
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the CLI conversion when executed directly.
    main()