# coding=utf-8
# Copyright 2024 VLV Team and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VLV model configuration"""

from typing import Any, Dict, Optional

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class VLV_Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`VLV_MODEL`]. It is used to instantiate a VLV
    model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        model_type (`str`, *optional*, defaults to `"VLV_decoder"`):
            The model type identifier.
        batch_size (`int`, *optional*, defaults to 1):
            The batch size for inference.
        deepspeed (`bool`, *optional*, defaults to `True`):
            Whether to use DeepSpeed.
        distributed (`bool`, *optional*, defaults to `True`):
            Whether to use distributed training.
        fp32 (`bool`, *optional*, defaults to `True`):
            Whether to use fp32 precision.
        guidance_scale (`float`, *optional*, defaults to 2.0):
            The guidance scale for generation.
        hidden_size (`int`, *optional*, defaults to 128):
            The hidden size of the model.
        image_size (`int`, *optional*, defaults to 768):
            The size of the input images.
        learnable_token_length (`int`, *optional*, defaults to 77):
            The length of the learnable tokens.
        local_rank (`int`, *optional*, defaults to 0):
            The local rank for distributed training.
        mixed_precision (`str`, *optional*, defaults to `"bf16"`):
            The mixed-precision mode.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of inference steps.
        torch_dtype (`str`, *optional*, defaults to `"bfloat16"`):
            The torch dtype.
        transformers_version (`str`, *optional*, defaults to `"4.51.1"`):
            The `transformers` version recorded in the configuration.
        use_text_encoder (`bool`, *optional*, defaults to `True`):
            Whether to use the text encoder.
        verbose (`bool`, *optional*, defaults to `True`):
            Whether to enable verbose mode.
        qwen_model (`str`, *optional*, defaults to `"Qwen/Qwen2.5-3B"`):
            The Qwen model to use.
        stable_diffusion_model_path (`str`, *optional*, defaults to `"stabilityai/stable-diffusion-2-1-base"`):
            The Stable Diffusion model to use.
        florence2_model_path (`str`, *optional*, defaults to `"microsoft/Florence-2-large"`):
            The Florence-2 model to use.
        qwen2_config (`dict`, *optional*):
            The Qwen2 configuration. If not provided, a default Qwen2 configuration is used.
        max_length (`int`, *optional*, defaults to 300):
            Maximum length for generation.
        num_beams (`int`, *optional*, defaults to 4):
            Number of beams for beam search.
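
    Example (a minimal usage sketch; the import path below is illustrative and depends on where this module lives in
    your package, so adjust it to your local file name):

    ```python
    >>> from configuration_vlv import VLV_Config

    >>> # Initialize a VLV configuration with the default values
    >>> configuration = VLV_Config()

    >>> # Override a few generation-related fields
    >>> configuration = VLV_Config(num_inference_steps=30, guidance_scale=3.0)
    >>> configuration.guidance_scale
    3.0
    ```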
""" model_type = "VLV_decoder" keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, model_type: str = "VLV_decoder", batch_size: int = 1, deepspeed: bool = True, distributed: bool = True, fp32: bool = True, guidance_scale: float = 2.0, hidden_size: int = 128, image_size: int = 768, learnable_token_length: int = 77, local_rank: int = 0, mixed_precision: str = "bf16", num_inference_steps: int = 50, torch_dtype: str = "bfloat16", transformers_version: str = "4.51.1", use_text_encoder: bool = True, verbose: bool = True, qwen_model: str = "Qwen/Qwen2.5-3B", stable_diffusion_model_path: str = "stabilityai/stable-diffusion-2-1-base", florence2_model_path: str = "microsoft/Florence-2-large", qwen2_config: Optional[Dict[str, Any]] = None, max_length: int = 300, num_beams: int = 4, **kwargs, ): self.model_type = model_type self.batch_size = batch_size self.deepspeed = deepspeed self.distributed = distributed self.fp32 = fp32 self.guidance_scale = guidance_scale self.hidden_size = hidden_size self.image_size = image_size self.learnable_token_length = learnable_token_length self.local_rank = local_rank self.mixed_precision = mixed_precision self.num_inference_steps = num_inference_steps self.torch_dtype = torch_dtype self.transformers_version = transformers_version self.use_text_encoder = use_text_encoder self.verbose = verbose self.qwen_model = qwen_model self.stable_diffusion_model_path = stable_diffusion_model_path self.florence2_model_path = florence2_model_path self.qwen2_config = qwen2_config or self._get_default_qwen2_config() self.max_length = max_length self.num_beams = num_beams super().__init__(**kwargs) def _get_default_qwen2_config(self): """Get default Qwen2 configuration.""" return { "architectures": ["Qwen2ForCausalLM"], "attention_dropout": 0.0, "bos_token_id": 151643, "eos_token_id": 151643, "hidden_act": "silu", "hidden_size": 2048, "initializer_range": 0.02, "intermediate_size": 11008, "max_position_embeddings": 32768, "max_window_layers": 36, "model_type": "qwen2", "num_attention_heads": 16, "num_hidden_layers": 36, "num_key_value_heads": 2, "rms_norm_eps": 1e-06, "rope_theta": 1000000.0, "sliding_window": 32768, "tie_word_embeddings": True, "torch_dtype": "bfloat16", "transformers_version": "4.40.1", "use_cache": True, "use_mrope": False, "use_sliding_window": False, "vocab_size": 151936 } class CLIPDecoderConfig(PretrainedConfig): r""" Configuration class for CLIPDecoder model (legacy support). """ model_type = "vlv_stage2" def __init__( self, input_dim: int = 1024, bf16: bool = False, **kwargs, ): self.input_dim = input_dim self.bf16 = bf16 super().__init__(**kwargs)