"""VLV model configuration"""

from typing import Optional, Dict, Any

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class VLV_Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`VLV_MODEL`]. It is used to instantiate a VLV
    model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        model_type (`str`, *optional*, defaults to `"VLV_decoder"`):
            The model type identifier.
        batch_size (`int`, *optional*, defaults to 1):
            The batch size used for inference.
        deepspeed (`bool`, *optional*, defaults to `True`):
            Whether to use DeepSpeed.
        distributed (`bool`, *optional*, defaults to `True`):
            Whether to use distributed training.
        fp32 (`bool`, *optional*, defaults to `True`):
            Whether to use fp32 precision.
        guidance_scale (`float`, *optional*, defaults to 2.0):
            The guidance scale used for generation.
        hidden_size (`int`, *optional*, defaults to 128):
            The hidden size of the model.
        image_size (`int`, *optional*, defaults to 768):
            The size of the input images.
        learnable_token_length (`int`, *optional*, defaults to 77):
            The number of learnable tokens.
        local_rank (`int`, *optional*, defaults to 0):
            The local rank for distributed training.
        mixed_precision (`str`, *optional*, defaults to `"bf16"`):
            The mixed-precision mode.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of inference steps.
        torch_dtype (`str`, *optional*, defaults to `"bfloat16"`):
            The torch dtype of the model weights.
        transformers_version (`str`, *optional*, defaults to `"4.51.1"`):
            The `transformers` version recorded in the configuration.
        use_text_encoder (`bool`, *optional*, defaults to `True`):
            Whether to use the text encoder.
        verbose (`bool`, *optional*, defaults to `True`):
            Whether to enable verbose output.
        qwen_model (`str`, *optional*, defaults to `"Qwen/Qwen2.5-3B"`):
            The Hub repository ID or local path of the Qwen model to use.
        stable_diffusion_model_path (`str`, *optional*, defaults to `"stabilityai/stable-diffusion-2-1-base"`):
            The Hub repository ID or local path of the Stable Diffusion model.
        florence2_model_path (`str`, *optional*, defaults to `"microsoft/Florence-2-large"`):
            The Hub repository ID or local path of the Florence-2 model.
        qwen2_config (`dict`, *optional*):
            The Qwen2 configuration. If not provided, the default returned by `_get_default_qwen2_config` is used.
        max_length (`int`, *optional*, defaults to 300):
            The maximum length for generation.
        num_beams (`int`, *optional*, defaults to 4):
            The number of beams for beam search.
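
    Example:

    ```python
    >>> # Minimal usage sketch. The import path below is an assumption; adjust it to
    >>> # wherever this configuration module lives in your project.
    >>> from configuration_vlv import VLV_Config

    >>> # Initializing a configuration with the default arguments
    >>> configuration = VLV_Config()

    >>> # Overriding a few generation-related settings
    >>> configuration = VLV_Config(guidance_scale=3.0, num_inference_steps=25)
    ```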
    """

    model_type = "VLV_decoder"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        model_type: str = "VLV_decoder",
        batch_size: int = 1,
        deepspeed: bool = True,
        distributed: bool = True,
        fp32: bool = True,
        guidance_scale: float = 2.0,
        hidden_size: int = 128,
        image_size: int = 768,
        learnable_token_length: int = 77,
        local_rank: int = 0,
        mixed_precision: str = "bf16",
        num_inference_steps: int = 50,
        torch_dtype: str = "bfloat16",
        transformers_version: str = "4.51.1",
        use_text_encoder: bool = True,
        verbose: bool = True,
        qwen_model: str = "Qwen/Qwen2.5-3B",
        stable_diffusion_model_path: str = "stabilityai/stable-diffusion-2-1-base",
        florence2_model_path: str = "microsoft/Florence-2-large",
        qwen2_config: Optional[Dict[str, Any]] = None,
        max_length: int = 300,
        num_beams: int = 4,
        **kwargs,
    ):
|
self.model_type = model_type |
|
self.batch_size = batch_size |
|
self.deepspeed = deepspeed |
|
self.distributed = distributed |
|
self.fp32 = fp32 |
|
self.guidance_scale = guidance_scale |
|
self.hidden_size = hidden_size |
|
self.image_size = image_size |
|
self.learnable_token_length = learnable_token_length |
|
self.local_rank = local_rank |
|
self.mixed_precision = mixed_precision |
|
self.num_inference_steps = num_inference_steps |
|
self.torch_dtype = torch_dtype |
|
self.transformers_version = transformers_version |
|
self.use_text_encoder = use_text_encoder |
|
self.verbose = verbose |
|
self.qwen_model = qwen_model |
|
self.stable_diffusion_model_path = stable_diffusion_model_path |
|
self.florence2_model_path = florence2_model_path |
|
self.qwen2_config = qwen2_config or self._get_default_qwen2_config() |
|
self.max_length = max_length |
|
self.num_beams = num_beams |
|
|
|
super().__init__(**kwargs) |

    def _get_default_qwen2_config(self):
        """Get default Qwen2 configuration."""
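        # The dictionary below appears to correspond to the default `qwen_model`
        # ("Qwen/Qwen2.5-3B"). A minimal sketch of materializing it into a
        # `transformers.Qwen2Config`, should downstream code need a config object
        # rather than a plain dict:
        #
        #     from transformers import Qwen2Config
        #     qwen2_config = Qwen2Config.from_dict(VLV_Config()._get_default_qwen2_config())
        #
        # This is illustrative only; how the surrounding model code consumes
        # `qwen2_config` is not defined in this file.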
        return {
            "architectures": ["Qwen2ForCausalLM"],
            "attention_dropout": 0.0,
            "bos_token_id": 151643,
            "eos_token_id": 151643,
            "hidden_act": "silu",
            "hidden_size": 2048,
            "initializer_range": 0.02,
            "intermediate_size": 11008,
            "max_position_embeddings": 32768,
            "max_window_layers": 36,
            "model_type": "qwen2",
            "num_attention_heads": 16,
            "num_hidden_layers": 36,
            "num_key_value_heads": 2,
            "rms_norm_eps": 1e-06,
            "rope_theta": 1000000.0,
            "sliding_window": 32768,
            "tie_word_embeddings": True,
            "torch_dtype": "bfloat16",
            "transformers_version": "4.40.1",
            "use_cache": True,
            "use_mrope": False,
            "use_sliding_window": False,
            "vocab_size": 151936,
        }


class CLIPDecoderConfig(PretrainedConfig):
    r"""
    Configuration class for the CLIPDecoder model (retained for legacy support).

    Args:
        input_dim (`int`, *optional*, defaults to 1024):
            The dimensionality of the decoder input features.
        bf16 (`bool`, *optional*, defaults to `False`):
            Whether to use bfloat16 precision.
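
    Example:

    ```python
    >>> # Minimal usage sketch; as above, the module path `configuration_vlv` is an assumption.
    >>> from configuration_vlv import CLIPDecoderConfig

    >>> configuration = CLIPDecoderConfig(input_dim=1024, bf16=True)
    ```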
    """

    model_type = "vlv_stage2"

    def __init__(
        self,
        input_dim: int = 1024,
        bf16: bool = False,
        **kwargs,
    ):
        self.input_dim = input_dim
        self.bf16 = bf16
        super().__init__(**kwargs)