finally, everything works locally

- README.md +42 -17
- convert_mvdream_to_diffusers.py +2 -2
- mvdream/adaptor.py +0 -28
- mvdream/attention.py +2 -4
- mvdream/models.py +6 -8
- mvdream/pipeline_mvdream.py +19 -22
- requirements.lock.txt +6 -0
- requirements.txt +6 -0
- run_imagedream.py +3 -4
- run_mvdream.py +3 -4
    	
README.md
CHANGED

@@ -1,15 +1,27 @@
-# MVDream-…
-
-
-
+# MVDream-diffusers
+
+A **unified** diffusers implementation of [MVDream](https://github.com/bytedance/MVDream) and [ImageDream](https://github.com/bytedance/ImageDream).
+
+We provide converted `fp16` weights on [huggingface](TODO).
+
+### Usage
+
+```bash
+python run_mvdream.py "a cute owl"
+python run_imagedream.py data/anya_rgba.png
+```
+
+### Install
 ```bash
 # dependency
-pip install -…
-
+pip install -r requirements.txt
+```
+
+### Convert weights
+
+MVDream:
+```bash
+# download original ckpt (we only support the SD 2.1 version)
 cd models
 wget https://huggingface.co/MVDream/MVDream/resolve/main/sd-v2.1-base-4view.pt
 wget https://raw.githubusercontent.com/bytedance/MVDream/main/mvdream/configs/sd-v2-base.yaml
@@ -21,18 +33,31 @@ python convert_mvdream_to_diffusers.py --checkpoint_path models/sd-v2.1-base-4vi

 ImageDream:
 ```bash
-# download original ckpt
-
-wget https://…
+# download original ckpt (we only support the pixel-controller version)
+cd models
+wget https://huggingface.co/Peng-Wang/ImageDream/resolve/main/sd-v2.1-base-4view-ipmv.pt
+wget https://raw.githubusercontent.com/bytedance/ImageDream/main/extern/ImageDream/imagedream/configs/sd_v2_base_ipmv.yaml
+cd ..

 # convert
-python convert_mvdream_to_diffusers.py --checkpoint_path models/sd-v2.1-base-4view-ipmv…
+python convert_mvdream_to_diffusers.py --checkpoint_path models/sd-v2.1-base-4view-ipmv.pt --dump_path ./weights_imagedream --original_config_file models/sd_v2_base_ipmv.yaml --half --to_safetensors --test
 ```

-### …
-
-
-```
-
-
-
+### Acknowledgement
+
+* The original papers:
+    ```bibtex
+    @article{shi2023MVDream,
+        author = {Shi, Yichun and Wang, Peng and Ye, Jianglong and Mai, Long and Li, Kejie and Yang, Xiao},
+        title = {MVDream: Multi-view Diffusion for 3D Generation},
+        journal = {arXiv:2308.16512},
+        year = {2023},
+    }
+    @article{wang2023imagedream,
+        title={ImageDream: Image-Prompt Multi-view Diffusion for 3D Generation},
+        author={Wang, Peng and Shi, Yichun},
+        journal={arXiv preprint arXiv:2312.02201},
+        year={2023}
+    }
+    ```
+* This codebase is modified from [mvdream-hf](https://github.com/KokeCacao/mvdream-hf).
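
Tying the new Usage and Convert sections together: a minimal Python sketch of loading the converted weights through the diffusers-style pipeline, mirroring `run_mvdream.py` from this commit. The local `./weights_mvdream` path and the hub id in the comment are taken from that script; treat them as assumptions if your conversion used a different `--dump_path`.

```python
# Minimal sketch: load converted fp16 weights and save a 2x2 grid of the 4 views.
import numpy as np
import torch
import kiui
from mvdream.pipeline_mvdream import MVDreamPipeline

pipe = MVDreamPipeline.from_pretrained(
    "./weights_mvdream",            # or "ashawkey/mvdream-sd2.1-diffusers"
    torch_dtype=torch.float16,
)
# note: the pipeline moves its submodules to the target device inside __call__
# (see pipeline_mvdream.py below), so no explicit .to("cuda") is shown here.

image = pipe("a cute owl 3d model")  # 4 views as float image arrays
grid = np.concatenate(
    [
        np.concatenate([image[0], image[2]], axis=0),
        np.concatenate([image[1], image[3]], axis=0),
    ],
    axis=1,
)
kiui.write_image("owl_4view.jpg", grid)
```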
    	
convert_mvdream_to_diffusers.py
CHANGED

@@ -568,7 +568,7 @@ if __name__ == "__main__":
             images = pipe(
                 image=input_image,
                 prompt="",
-                negative_prompt="…
+                negative_prompt="",
                 output_type="pil",
                 guidance_scale=5.0,
                 num_inference_steps=50,
@@ -582,7 +582,7 @@ if __name__ == "__main__":
             images = loaded_pipe(
                 image=input_image,
                 prompt="",
-                negative_prompt="…
+                negative_prompt="",
                 output_type="pil",
                 guidance_scale=5.0,
                 num_inference_steps=50,
    	
mvdream/adaptor.py
CHANGED

@@ -73,34 +73,6 @@ class PerceiverAttention(nn.Module):
         return self.to_out(out)


-class ImageProjModel(torch.nn.Module):
-    """Projection Model"""
-
-    def __init__(
-        self,
-        cross_attention_dim=1024,
-        clip_embeddings_dim=1024,
-        clip_extra_context_tokens=4,
-    ):
-        super().__init__()
-        self.cross_attention_dim = cross_attention_dim
-        self.clip_extra_context_tokens = clip_extra_context_tokens
-
-        # from 1024 -> 4 * 1024
-        self.proj = torch.nn.Linear(
-            clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim
-        )
-        self.norm = torch.nn.LayerNorm(cross_attention_dim)
-
-    def forward(self, image_embeds):
-        embeds = image_embeds
-        clip_extra_context_tokens = self.proj(embeds).reshape(
-            -1, self.clip_extra_context_tokens, self.cross_attention_dim
-        )
-        clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
-        return clip_extra_context_tokens
-
-
 class Resampler(nn.Module):
     def __init__(
         self,
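
The deleted `ImageProjModel` was a plain IP-Adapter-style linear projector; `models.py` now imports only `Resampler` from this module. For reference, a standalone copy of the removed class with a quick shape check, kept only to document what the deletion drops:

```python
# Standalone copy of the removed ImageProjModel: a single linear layer mapping
# one CLIP image embedding to `clip_extra_context_tokens` extra context tokens.
import torch

class ImageProjModel(torch.nn.Module):
    def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4):
        super().__init__()
        self.cross_attention_dim = cross_attention_dim
        self.clip_extra_context_tokens = clip_extra_context_tokens
        # from 1024 -> 4 * 1024
        self.proj = torch.nn.Linear(clip_embeddings_dim, clip_extra_context_tokens * cross_attention_dim)
        self.norm = torch.nn.LayerNorm(cross_attention_dim)

    def forward(self, image_embeds):
        tokens = self.proj(image_embeds).reshape(-1, self.clip_extra_context_tokens, self.cross_attention_dim)
        return self.norm(tokens)

proj = ImageProjModel()
out = proj(torch.randn(2, 1024))  # [2, 1024] CLIP image embeddings
print(out.shape)                  # torch.Size([2, 4, 1024])
```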
    	
mvdream/attention.py
CHANGED

@@ -88,7 +88,7 @@ class MemoryEfficientCrossAttention(nn.Module):
         context = default(context, x)

         if self.ip_dim > 0:
-            # context …
+            # context: [B, 77 + 16(ip), 1024]
             token_len = context.shape[1]
             context_ip = context[:, -self.ip_dim :, :]
             k_ip = self.to_k_ip(context_ip)
@@ -212,9 +212,7 @@ class SpatialTransformer3D(nn.Module):
         self.in_channels = in_channels

         inner_dim = n_heads * d_head
-        self.norm = nn.GroupNorm(
-            num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
-        )
+        self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
         self.proj_in = nn.Linear(in_channels, inner_dim)

         self.transformer_blocks = nn.ModuleList(
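
The new comment pins down the context layout for the image-prompt ("ip") branch: 77 CLIP text tokens followed by 16 ip tokens, each 1024-dimensional. Below is an illustrative sketch of that split, with a separate key projection in the spirit of `to_k_ip` above; this is shape bookkeeping only, not the repo's exact attention code.

```python
# Illustrative only: split a [B, 77 + 16, 1024] context into its text and
# image-prompt parts and project separate keys for the ip tokens.
import torch

B, D, ip_dim = 2, 1024, 16
context = torch.randn(B, 77 + ip_dim, D)

context_txt = context[:, : context.shape[1] - ip_dim, :]  # [B, 77, D] text tokens
context_ip = context[:, -ip_dim:, :]                      # [B, 16, D] ip tokens

to_k_ip = torch.nn.Linear(D, D, bias=False)               # analogous to self.to_k_ip
k_ip = to_k_ip(context_ip)

print(context_txt.shape, context_ip.shape, k_ip.shape)
# torch.Size([2, 77, 1024]) torch.Size([2, 16, 1024]) torch.Size([2, 16, 1024])
```

In the full module the text and ip tokens are attended to separately and then recombined, with the image branch presumably scaled by `ip_weight` (a constructor argument visible in `models.py` below).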
    	
mvdream/models.py
CHANGED

@@ -14,7 +14,7 @@ from .util import (
     timestep_embedding,
 )
 from .attention import SpatialTransformer3D
-from .adaptor import Resampler…
+from .adaptor import Resampler

 import kiui

@@ -266,15 +266,13 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
         num_heads_upsample=-1,
         use_scale_shift_norm=False,
         resblock_updown=False,
-        transformer_depth=1, …
-        context_dim=None, …
-        n_embed=None, …
-        disable_self_attentions=None,
+        transformer_depth=1,
+        context_dim=None,
+        n_embed=None,
         num_attention_blocks=None,
-        disable_middle_self_attn=False,
         adm_in_channels=None,
         camera_dim=None,
-        ip_dim=0,
+        ip_dim=0, # imagedream uses ip_dim > 0
         ip_weight=1.0,
         **kwargs,
     ):
@@ -604,7 +602,7 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):

         # imagedream variant
         if self.ip_dim > 0:
-            x[(num_frames - 1) :: num_frames, :, :, :] = ip_img
+            x[(num_frames - 1) :: num_frames, :, :, :] = ip_img # place at [4, 9]
             ip_emb = self.image_embed(ip)
             context = torch.cat((context, ip_emb), 1)
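
The added `# place at [4, 9]` comment describes the ImageDream batch layout: each prompt occupies `num_frames` consecutive slots and the conditioning image is written into the last slot of every group. Assuming `num_frames = 5` (four generated views plus the extra reference view the pipeline requests via `get_camera(..., extra_view=True)`) and two groups, e.g. the unconditional and conditional halves, the reference lands at flat indices 4 and 9. A toy check of that strided assignment under those assumptions:

```python
# Toy check of the strided assignment in MultiViewUNetModel.forward: the
# reference image goes into the last slot of every group of `num_frames` latents.
import torch

num_frames = 5                     # 4 views + 1 reference slot (ImageDream)
batch = 2 * num_frames             # e.g. unconditional + conditional halves
x = torch.zeros(batch, 4, 32, 32)
ip_img = torch.ones(2, 4, 32, 32)  # one reference latent per group

x[(num_frames - 1) :: num_frames, :, :, :] = ip_img
print(torch.nonzero(x.flatten(1).any(dim=1)).flatten())  # tensor([4, 9])
```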
    	
mvdream/pipeline_mvdream.py
CHANGED

@@ -405,29 +405,27 @@ class MVDreamPipeline(DiffusionPipeline):
     def encode_image(self, image, device, num_images_per_prompt):
         dtype = next(self.image_encoder.parameters()).dtype

-        …
+        if image.dtype == np.float32:
+            image = (image * 255).astype(np.uint8)
+
         image = self.feature_extractor(image, return_tensors="pt").pixel_values
-
         image = image.to(device=device, dtype=dtype)

-        …
-        uncond_image_enc_hidden_states = torch.zeros_like(image_enc_hidden_states)
-
-        return uncond_image_enc_hidden_states, image_enc_hidden_states
+        image_embeds = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+        return torch.zeros_like(image_embeds), image_embeds

     def encode_image_latents(self, image, device, num_images_per_prompt):

-        image = torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2) # [1, 3, H, W]
-        image = image.to(device=device)
-        image = F.interpolate(image, (256, 256), mode='bilinear', align_corners=False)
         dtype = next(self.image_encoder.parameters()).dtype
+
+        image = torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2).to(device=device) # [1, 3, H, W]
+        image = 2 * image - 1
+        image = F.interpolate(image, (256, 256), mode='bilinear', align_corners=False)
         image = image.to(dtype=dtype)

         posterior = self.vae.encode(image).latent_dist
-
         latents = posterior.sample() * self.vae.config.scaling_factor # [B, C, H, W]
         latents = latents.repeat_interleave(num_images_per_prompt, dim=0)

@@ -436,13 +434,13 @@ class MVDreamPipeline(DiffusionPipeline):
     @torch.no_grad()
     def __call__(
         self,
-        prompt: str = "…
+        prompt: str = "",
         image: Optional[np.ndarray] = None,
         height: int = 256,
         width: int = 256,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.0,
-        negative_prompt: str = "…
+        negative_prompt: str = "",
         num_images_per_prompt: int = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
@@ -454,7 +452,6 @@ class MVDreamPipeline(DiffusionPipeline):
     ):
         self.unet = self.unet.to(device=device)
         self.vae = self.vae.to(device=device)
-
         self.text_encoder = self.text_encoder.to(device=device)

         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
@@ -466,10 +463,9 @@ class MVDreamPipeline(DiffusionPipeline):
         self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps = self.scheduler.timesteps

-        # imagedream variant…
+        # imagedream variant
         if image is not None:
             assert isinstance(image, np.ndarray) and image.dtype == np.float32
-
             self.image_encoder = self.image_encoder.to(device=device)
             image_embeds_neg, image_embeds_pos = self.encode_image(image, device, num_images_per_prompt)
             image_latents_neg, image_latents_pos = self.encode_image_latents(image, device, num_images_per_prompt)
@@ -496,7 +492,11 @@ class MVDreamPipeline(DiffusionPipeline):
             None,
         )

-        …
+        if image is not None:
+            camera = get_camera(num_frames, elevation=5, extra_view=True).to(dtype=latents.dtype, device=device)
+        else:
+            camera = get_camera(num_frames, elevation=15, extra_view=False).to(dtype=latents.dtype, device=device)
+        camera = camera.repeat_interleave(num_images_per_prompt, dim=0)

         # Prepare extra step kwargs.
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -508,10 +508,7 @@ class MVDreamPipeline(DiffusionPipeline):
             # expand the latents if we are doing classifier free guidance
             multiplier = 2 if do_classifier_free_guidance else 1
             latent_model_input = torch.cat([latents] * multiplier)
-            latent_model_input = self.scheduler.scale_model_input(
-                latent_model_input, t
-            )
-
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

             unet_inputs = {
                 'x': latent_model_input,
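
The rewritten helpers make the two preprocessing paths explicit: `encode_image` converts the float32 `[0, 1]` input to uint8 before handing it to the CLIP feature extractor, while `encode_image_latents` rescales it to `[-1, 1]`, reshapes to NCHW, and resizes to 256x256 for the VAE. A standalone sketch of just those conversions (no models involved; the random array stands in for the image the run scripts load):

```python
# Sketch of the two preprocessing paths, starting from a float32 HxWx3 array
# in [0, 1] as produced by kiui.read_image(..., mode='float').
import numpy as np
import torch
import torch.nn.functional as F

image = np.random.rand(512, 512, 3).astype(np.float32)  # stand-in input image

# encode_image path: the feature extractor is fed uint8 pixels, so [0,1] -> [0,255]
clip_input = (image * 255).astype(np.uint8)

# encode_image_latents path: the VAE gets NCHW tensors in [-1, 1] at 256x256
vae_input = torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2)  # [1, 3, H, W]
vae_input = 2 * vae_input - 1
vae_input = F.interpolate(vae_input, (256, 256), mode="bilinear", align_corners=False)

print(clip_input.dtype, vae_input.shape, vae_input.min().item() >= -1.0)
```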
    	
requirements.lock.txt
ADDED

@@ -0,0 +1,6 @@
+omegaconf == 2.3.0
+diffusers == 0.23.1
+safetensors == 0.4.1
+huggingface_hub == 0.19.4
+transformers == 4.35.2
+accelerate == 0.25.0.dev0
    	
requirements.txt
ADDED

@@ -0,0 +1,6 @@
+omegaconf
+diffusers
+safetensors
+huggingface_hub
+transformers
+accelerate
    	
run_imagedream.py
CHANGED

@@ -17,9 +17,9 @@ parser.add_argument("image", type=str, default='data/anya_rgba.png')
 parser.add_argument("--prompt", type=str, default="")
 args = parser.parse_args()

-while True:
+for i in range(5):
     input_image = kiui.read_image(args.image, mode='float')
-    image = pipe(args.prompt, input_image)
+    image = pipe(args.prompt, input_image, guidance_scale=5)
     grid = np.concatenate(
         [
             np.concatenate([image[0], image[2]], axis=0),
@@ -28,5 +28,4 @@ while True:
         axis=1,
     )
     # kiui.vis.plot_image(grid)
-    kiui.write_image('…
-    break
+    kiui.write_image(f'test_imagedream_{i}.jpg', grid)
    	
run_mvdream.py
CHANGED

@@ -5,7 +5,7 @@ import argparse
 from mvdream.pipeline_mvdream import MVDreamPipeline

 pipe = MVDreamPipeline.from_pretrained(
-    "./…
+    "./weights_mvdream", # local weights
     # "ashawkey/mvdream-sd2.1-diffusers",
     torch_dtype=torch.float16
 )
@@ -16,7 +16,7 @@ parser = argparse.ArgumentParser(description="MVDream")
 parser.add_argument("prompt", type=str, default="a cute owl 3d model")
 args = parser.parse_args()

-while True:
+for i in range(5):
     image = pipe(args.prompt)
     grid = np.concatenate(
         [
@@ -26,5 +26,4 @@ while True:
         axis=1,
     )
     # kiui.vis.plot_image(grid)
-    kiui.write_image('…
-    break
+    kiui.write_image(f'test_mvdream_{i}.jpg', grid)

