"""VLV model configuration"""

from typing import Optional, Dict, Any

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class VLV_Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`VLV_MODEL`]. It is used to instantiate a VLV
    model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        model_type (`str`, *optional*, defaults to `"VLV_decoder"`):
            The model type identifier.
        batch_size (`int`, *optional*, defaults to 1):
            The batch size used for inference.
        deepspeed (`bool`, *optional*, defaults to `True`):
            Whether to use DeepSpeed.
        distributed (`bool`, *optional*, defaults to `True`):
            Whether to use distributed training.
        fp32 (`bool`, *optional*, defaults to `True`):
            Whether to use fp32 precision.
        guidance_scale (`float`, *optional*, defaults to 2.0):
            The guidance scale used for generation.
        hidden_size (`int`, *optional*, defaults to 128):
            The hidden size of the model.
        image_size (`int`, *optional*, defaults to 768):
            The size of the input images.
        learnable_token_length (`int`, *optional*, defaults to 77):
            The number of learnable tokens.
        local_rank (`int`, *optional*, defaults to 0):
            The local rank for distributed training.
        mixed_precision (`str`, *optional*, defaults to `"bf16"`):
            The mixed-precision mode.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of inference steps.
        torch_dtype (`str`, *optional*, defaults to `"bfloat16"`):
            The torch dtype of the model weights.
        transformers_version (`str`, *optional*, defaults to `"4.51.1"`):
            The `transformers` version recorded in the configuration.
        use_text_encoder (`bool`, *optional*, defaults to `True`):
            Whether to use the text encoder.
        verbose (`bool`, *optional*, defaults to `True`):
            Whether to enable verbose output.
        qwen_model (`str`, *optional*, defaults to `"Qwen/Qwen2.5-3B"`):
            The Hub repository ID or local path of the Qwen model to use.
        stable_diffusion_model_path (`str`, *optional*, defaults to `"stabilityai/stable-diffusion-2-1-base"`):
            The Hub repository ID or local path of the Stable Diffusion model.
        florence2_model_path (`str`, *optional*, defaults to `"microsoft/Florence-2-large"`):
            The Hub repository ID or local path of the Florence-2 model.
        qwen2_config (`dict`, *optional*):
            The Qwen2 configuration. If not provided, the default returned by `_get_default_qwen2_config` is used.
        max_length (`int`, *optional*, defaults to 300):
            The maximum length for generation.
        num_beams (`int`, *optional*, defaults to 4):
            The number of beams for beam search.
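
    Example:

    ```python
    >>> # Minimal usage sketch. The import path below is an assumption; adjust it to
    >>> # wherever this configuration module lives in your project.
    >>> from configuration_vlv import VLV_Config

    >>> # Initializing a configuration with the default arguments
    >>> configuration = VLV_Config()

    >>> # Overriding a few generation-related settings
    >>> configuration = VLV_Config(guidance_scale=3.0, num_inference_steps=25)
    ```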
    """

    model_type = "VLV_decoder"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        model_type: str = "VLV_decoder",
        batch_size: int = 1,
        deepspeed: bool = True,
        distributed: bool = True,
        fp32: bool = True,
        guidance_scale: float = 2.0,
        hidden_size: int = 128,
        image_size: int = 768,
        learnable_token_length: int = 77,
        local_rank: int = 0,
        mixed_precision: str = "bf16",
        num_inference_steps: int = 50,
        torch_dtype: str = "bfloat16",
        transformers_version: str = "4.51.1",
        use_text_encoder: bool = True,
        verbose: bool = True,
        qwen_model: str = "Qwen/Qwen2.5-3B",
        stable_diffusion_model_path: str = "stabilityai/stable-diffusion-2-1-base",
        florence2_model_path: str = "microsoft/Florence-2-large",
        qwen2_config: Optional[Dict[str, Any]] = None,
        max_length: int = 300,
        num_beams: int = 4,
        **kwargs,
    ):
|
self.model_type = model_type |
|
self.batch_size = batch_size |
|
self.deepspeed = deepspeed |
|
self.distributed = distributed |
|
self.fp32 = fp32 |
|
self.guidance_scale = guidance_scale |
|
self.hidden_size = hidden_size |
|
self.image_size = image_size |
|
self.learnable_token_length = learnable_token_length |
|
self.local_rank = local_rank |
|
self.mixed_precision = mixed_precision |
|
self.num_inference_steps = num_inference_steps |
|
self.torch_dtype = torch_dtype |
|
self.transformers_version = transformers_version |
|
self.use_text_encoder = use_text_encoder |
|
self.verbose = verbose |
|
self.qwen_model = qwen_model |
|
self.stable_diffusion_model_path = stable_diffusion_model_path |
|
self.florence2_model_path = florence2_model_path |
|
self.qwen2_config = qwen2_config or self._get_default_qwen2_config() |
|
self.max_length = max_length |
|
self.num_beams = num_beams |
|
|
|
super().__init__(**kwargs) |

    def _get_default_qwen2_config(self):
        """Get default Qwen2 configuration."""
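        # The dictionary below appears to correspond to the default `qwen_model`
        # ("Qwen/Qwen2.5-3B"). A minimal sketch of materializing it into a
        # `transformers.Qwen2Config`, should downstream code need a config object
        # rather than a plain dict:
        #
        #     from transformers import Qwen2Config
        #     qwen2_config = Qwen2Config.from_dict(VLV_Config()._get_default_qwen2_config())
        #
        # This is illustrative only; how the surrounding model code consumes
        # `qwen2_config` is not defined in this file.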
        return {
            "architectures": ["Qwen2ForCausalLM"],
            "attention_dropout": 0.0,
            "bos_token_id": 151643,
            "eos_token_id": 151643,
            "hidden_act": "silu",
            "hidden_size": 2048,
            "initializer_range": 0.02,
            "intermediate_size": 11008,
            "max_position_embeddings": 32768,
            "max_window_layers": 36,
            "model_type": "qwen2",
            "num_attention_heads": 16,
            "num_hidden_layers": 36,
            "num_key_value_heads": 2,
            "rms_norm_eps": 1e-06,
            "rope_theta": 1000000.0,
            "sliding_window": 32768,
            "tie_word_embeddings": True,
            "torch_dtype": "bfloat16",
            "transformers_version": "4.40.1",
            "use_cache": True,
            "use_mrope": False,
            "use_sliding_window": False,
            "vocab_size": 151936,
        }


class CLIPDecoderConfig(PretrainedConfig):
    r"""
    Configuration class for the CLIPDecoder model (retained for legacy support).

    Args:
        input_dim (`int`, *optional*, defaults to 1024):
            The dimensionality of the decoder input features.
        bf16 (`bool`, *optional*, defaults to `False`):
            Whether to use bfloat16 precision.
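
    Example:

    ```python
    >>> # Minimal usage sketch; as above, the module path `configuration_vlv` is an assumption.
    >>> from configuration_vlv import CLIPDecoderConfig

    >>> configuration = CLIPDecoderConfig(input_dim=1024, bf16=True)
    ```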
    """

    model_type = "vlv_stage2"

    def __init__(
        self,
        input_dim: int = 1024,
        bf16: bool = False,
        **kwargs,
    ):
        self.input_dim = input_dim
        self.bf16 = bf16
        super().__init__(**kwargs)