# coding=utf-8
# Copyright 2024 VLV Team and the HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VLV model configuration"""
from typing import Optional, Dict, Any
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)


class VLV_Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`VLV_MODEL`]. It is used to instantiate a VLV model
according to the specified arguments, defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
model_type (`str`, *optional*, defaults to "VLV_decoder"):
The model type identifier.
batch_size (`int`, *optional*, defaults to 1):
The batch size for inference.
        deepspeed (`bool`, *optional*, defaults to True):
            Whether to use DeepSpeed.
distributed (`bool`, *optional*, defaults to True):
Whether to use distributed training.
fp32 (`bool`, *optional*, defaults to True):
Whether to use fp32 precision.
guidance_scale (`float`, *optional*, defaults to 2.0):
The guidance scale for generation.
hidden_size (`int`, *optional*, defaults to 128):
The hidden size of the model.
image_size (`int`, *optional*, defaults to 768):
The size of input images.
        learnable_token_length (`int`, *optional*, defaults to 77):
            The length of the learnable token sequence.
local_rank (`int`, *optional*, defaults to 0):
The local rank for distributed training.
mixed_precision (`str`, *optional*, defaults to "bf16"):
The mixed precision mode.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of inference steps.
        torch_dtype (`str`, *optional*, defaults to "bfloat16"):
            The torch dtype of the model weights.
        transformers_version (`str`, *optional*, defaults to "4.51.1"):
            The transformers version the configuration was created with.
        use_text_encoder (`bool`, *optional*, defaults to True):
            Whether to use the text encoder.
verbose (`bool`, *optional*, defaults to True):
Whether to enable verbose mode.
        qwen_model (`str`, *optional*, defaults to "Qwen/Qwen2.5-3B"):
            The name or path of the Qwen language model to use.
        stable_diffusion_model_path (`str`, *optional*, defaults to "stabilityai/stable-diffusion-2-1-base"):
            The name or path of the Stable Diffusion model to use.
        florence2_model_path (`str`, *optional*, defaults to "microsoft/Florence-2-large"):
            The name or path of the Florence-2 model to use.
        qwen2_config (`dict`, *optional*):
            The Qwen2 configuration dictionary. If not provided, a default configuration matching Qwen2.5-3B is used.
max_length (`int`, *optional*, defaults to 300):
Maximum length for generation.
num_beams (`int`, *optional*, defaults to 4):
Number of beams for beam search.
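
    Example (a minimal usage sketch; the module name `configuration_vlv` is an assumption based on common naming
    for custom configuration files and is not guaranteed by this repository):

    ```python
    >>> # NOTE: the import path below is assumed; adjust it to wherever this file is importable from.
    >>> from configuration_vlv import VLV_Config

    >>> # Initialize a configuration with the default settings (Qwen/Qwen2.5-3B backbone, 768px images).
    >>> configuration = VLV_Config()

    >>> # Override a few generation-related fields.
    >>> configuration = VLV_Config(guidance_scale=3.0, num_inference_steps=25, num_beams=1)
    ```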
"""
model_type = "VLV_decoder"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
model_type: str = "VLV_decoder",
batch_size: int = 1,
deepspeed: bool = True,
distributed: bool = True,
fp32: bool = True,
guidance_scale: float = 2.0,
hidden_size: int = 128,
image_size: int = 768,
learnable_token_length: int = 77,
local_rank: int = 0,
mixed_precision: str = "bf16",
num_inference_steps: int = 50,
torch_dtype: str = "bfloat16",
transformers_version: str = "4.51.1",
use_text_encoder: bool = True,
verbose: bool = True,
qwen_model: str = "Qwen/Qwen2.5-3B",
stable_diffusion_model_path: str = "stabilityai/stable-diffusion-2-1-base",
florence2_model_path: str = "microsoft/Florence-2-large",
qwen2_config: Optional[Dict[str, Any]] = None,
max_length: int = 300,
num_beams: int = 4,
**kwargs,
):
self.model_type = model_type
self.batch_size = batch_size
self.deepspeed = deepspeed
self.distributed = distributed
self.fp32 = fp32
self.guidance_scale = guidance_scale
self.hidden_size = hidden_size
self.image_size = image_size
self.learnable_token_length = learnable_token_length
self.local_rank = local_rank
self.mixed_precision = mixed_precision
self.num_inference_steps = num_inference_steps
self.torch_dtype = torch_dtype
self.transformers_version = transformers_version
self.use_text_encoder = use_text_encoder
self.verbose = verbose
self.qwen_model = qwen_model
self.stable_diffusion_model_path = stable_diffusion_model_path
self.florence2_model_path = florence2_model_path
self.qwen2_config = qwen2_config or self._get_default_qwen2_config()
self.max_length = max_length
self.num_beams = num_beams
super().__init__(**kwargs)
def _get_default_qwen2_config(self):
"""Get default Qwen2 configuration."""
return {
"architectures": ["Qwen2ForCausalLM"],
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151643,
"hidden_act": "silu",
"hidden_size": 2048,
"initializer_range": 0.02,
"intermediate_size": 11008,
"max_position_embeddings": 32768,
"max_window_layers": 36,
"model_type": "qwen2",
"num_attention_heads": 16,
"num_hidden_layers": 36,
"num_key_value_heads": 2,
"rms_norm_eps": 1e-06,
"rope_theta": 1000000.0,
"sliding_window": 32768,
"tie_word_embeddings": True,
"torch_dtype": "bfloat16",
"transformers_version": "4.40.1",
"use_cache": True,
"use_mrope": False,
"use_sliding_window": False,
"vocab_size": 151936
        }


class CLIPDecoderConfig(PretrainedConfig):
r"""
    Configuration class for the CLIPDecoder model (kept for legacy support).

    Args:
        input_dim (`int`, *optional*, defaults to 1024):
            The dimensionality of the decoder inputs.
        bf16 (`bool`, *optional*, defaults to False):
            Whether to run in bfloat16 precision.
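
    Example (a minimal sketch; the import path is an assumption, as above):

    ```python
    >>> from configuration_vlv import CLIPDecoderConfig

    >>> config = CLIPDecoderConfig(input_dim=1024, bf16=True)
    ```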
"""
model_type = "vlv_stage2"
def __init__(
self,
input_dim: int = 1024,
bf16: bool = False,
**kwargs,
):
self.input_dim = input_dim
self.bf16 = bf16
super().__init__(**kwargs)