|
from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video, download_models |
|
import torch |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Weights this demo needs: the SD base model, the AnimateDiff v2 motion
# module, two ControlNets (lineart + tile), and a negative embedding.
REQUIRED_MODELS = [
    "Flat2DAnimerge_v45Sharp",
    "AnimateDiff_v2",
    "ControlNet_v11p_sd15_lineart",
    "ControlNet_v11f1e_sd15_tile",
    "TextualInversion_VeryBadImageNegative_v1.3",
]
download_models(REQUIRED_MODELS)
|
|
|
|
|
# Load every checkpoint in fp16 on the GPU.
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
checkpoint_paths = [
    "models/stable_diffusion/flat2DAnimerge_v45Sharp.safetensors",
    "models/AnimateDiff/mm_sd_v15_v2.ckpt",
    "models/ControlNet/control_v11p_sd15_lineart.pth",
    "models/ControlNet/control_v11f1e_sd15_tile.pth",
]
model_manager.load_models(checkpoint_paths)

# Two ControlNets steer the generation, each at half strength:
# lineart preserves outlines, tile keeps the output close to the source frames.
lineart_unit = ControlNetConfigUnit(
    processor_id="lineart",
    model_path="models/ControlNet/control_v11p_sd15_lineart.pth",
    scale=0.5,
)
tile_unit = ControlNetConfigUnit(
    processor_id="tile",
    model_path="models/ControlNet/control_v11f1e_sd15_tile.pth",
    scale=0.5,
)
pipe = SDVideoPipeline.from_model_manager(model_manager, [lineart_unit, tile_unit])

# Negative textual-inversion embedding; referenced by name in negative_prompt below.
pipe.prompter.load_textual_inversions(["models/textual_inversion/verybadimagenegative_v1.3.pt"])
|
|
|
|
|
|
|
# Open the source clip, resized to 1024x1024, and pull out frames
# 2400-2459 (presumably one second of a 60 fps source — confirm).
video = VideoData(
    video_file="data/examples/bilibili/BV19w411A7YJ.mp4",
    height=1024,
    width=1024,
)
frame_range = range(40 * 60, 41 * 60)
input_video = [video[frame_index] for frame_index in frame_range]
|
|
|
|
|
# Fix the RNG seed so the run is reproducible, then stylize the clip.
torch.manual_seed(0)
generation_kwargs = dict(
    prompt="best quality, perfect anime illustration, light, a girl is dancing, smile, solo",
    # Name of the textual-inversion embedding loaded above.
    negative_prompt="verybadimagenegative_v1.3",
    cfg_scale=3,
    clip_skip=2,
    # The same frames drive both ControlNets; output length matches input length.
    controlnet_frames=input_video,
    num_frames=len(input_video),
    num_inference_steps=10,
    height=1024,
    width=1024,
    # AnimateDiff sliding-window settings: 32-frame windows, stride 16.
    animatediff_batch_size=32,
    animatediff_stride=16,
)
output_video = pipe(**generation_kwargs)

save_video(output_video, "output_video.mp4", fps=60)
|
|