# coding=utf-8
# Copyright 2024 VLV Team and the HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""VLV model configuration"""

from typing import Optional, Dict, Any
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class VLV_Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`VLV_MODEL`]. It is used to instantiate a VLV model
    according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        model_type (`str`, *optional*, defaults to "VLV_decoder"):
            The model type identifier.
        batch_size (`int`, *optional*, defaults to 1):
            The batch size for inference.
        deepspeed (`bool`, *optional*, defaults to True):
            Whether to use DeepSpeed.
        distributed (`bool`, *optional*, defaults to True):
            Whether to use distributed training.
        fp32 (`bool`, *optional*, defaults to True):
            Whether to use fp32 precision.
        guidance_scale (`float`, *optional*, defaults to 2.0):
            The guidance scale for generation.
        hidden_size (`int`, *optional*, defaults to 128):
            The hidden size of the model.
        image_size (`int`, *optional*, defaults to 768):
            The size of input images.
        learnable_token_length (`int`, *optional*, defaults to 77):
            The length of learnable tokens.
        local_rank (`int`, *optional*, defaults to 0):
            The local rank for distributed training.
        mixed_precision (`str`, *optional*, defaults to "bf16"):
            The mixed precision mode.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of inference steps.
        torch_dtype (`str`, *optional*, defaults to "bfloat16"):
            The torch dtype to use.
        transformers_version (`str`, *optional*, defaults to "4.51.1"):
            The transformers version the configuration was created with.
        use_text_encoder (`bool`, *optional*, defaults to True):
            Whether to use the text encoder.
        verbose (`bool`, *optional*, defaults to True):
            Whether to enable verbose mode.
        qwen_model (`str`, *optional*, defaults to "Qwen/Qwen2.5-3B"):
            The Qwen model to use.
        stable_diffusion_model_path (`str`, *optional*, defaults to "stabilityai/stable-diffusion-2-1-base"):
            The Stable Diffusion checkpoint to use.
        florence2_model_path (`str`, *optional*, defaults to "microsoft/Florence-2-large"):
            The Florence-2 checkpoint to use.
        qwen2_config (`dict`, *optional*):
            The Qwen2 configuration dict. If not provided, a Qwen2.5-3B-style default configuration is used.
        max_length (`int`, *optional*, defaults to 300):
            Maximum length for generation.
        num_beams (`int`, *optional*, defaults to 4):
            Number of beams for beam search.
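
    Example (a minimal usage sketch; values shown below come from the defaults defined in this class):

    ```python
    >>> config = VLV_Config(num_inference_steps=30, guidance_scale=3.0)
    >>> config.model_type
    'VLV_decoder'
    >>> # When qwen2_config is not provided, a Qwen2.5-3B-style default is used.
    >>> config.qwen2_config["hidden_size"]
    2048
    ```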
    """

    model_type = "VLV_decoder"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        model_type: str = "VLV_decoder",
        batch_size: int = 1,
        deepspeed: bool = True,
        distributed: bool = True,
        fp32: bool = True,
        guidance_scale: float = 2.0,
        hidden_size: int = 128,
        image_size: int = 768,
        learnable_token_length: int = 77,
        local_rank: int = 0,
        mixed_precision: str = "bf16",
        num_inference_steps: int = 50,
        torch_dtype: str = "bfloat16",
        transformers_version: str = "4.51.1",
        use_text_encoder: bool = True,
        verbose: bool = True,
        qwen_model: str = "Qwen/Qwen2.5-3B",
        stable_diffusion_model_path: str = "stabilityai/stable-diffusion-2-1-base",
        florence2_model_path: str = "microsoft/Florence-2-large",
        qwen2_config: Optional[Dict[str, Any]] = None,
        max_length: int = 300,
        num_beams: int = 4,
        **kwargs,
    ):
        self.model_type = model_type
        self.batch_size = batch_size
        self.deepspeed = deepspeed
        self.distributed = distributed
        self.fp32 = fp32
        self.guidance_scale = guidance_scale
        self.hidden_size = hidden_size
        self.image_size = image_size
        self.learnable_token_length = learnable_token_length
        self.local_rank = local_rank
        self.mixed_precision = mixed_precision
        self.num_inference_steps = num_inference_steps
        self.torch_dtype = torch_dtype
        self.transformers_version = transformers_version
        self.use_text_encoder = use_text_encoder
        self.verbose = verbose
        self.qwen_model = qwen_model
        self.stable_diffusion_model_path = stable_diffusion_model_path
        self.florence2_model_path = florence2_model_path
        self.qwen2_config = qwen2_config or self._get_default_qwen2_config()
        self.max_length = max_length
        self.num_beams = num_beams

        super().__init__(**kwargs)

    def _get_default_qwen2_config(self):
        """Get default Qwen2 configuration."""
        return {
            "architectures": ["Qwen2ForCausalLM"],
            "attention_dropout": 0.0,
            "bos_token_id": 151643,
            "eos_token_id": 151643,
            "hidden_act": "silu",
            "hidden_size": 2048,
            "initializer_range": 0.02,
            "intermediate_size": 11008,
            "max_position_embeddings": 32768,
            "max_window_layers": 36,
            "model_type": "qwen2",
            "num_attention_heads": 16,
            "num_hidden_layers": 36,
            "num_key_value_heads": 2,
            "rms_norm_eps": 1e-06,
            "rope_theta": 1000000.0,
            "sliding_window": 32768,
            "tie_word_embeddings": True,
            "torch_dtype": "bfloat16",
            "transformers_version": "4.40.1",
            "use_cache": True,
            "use_mrope": False,
            "use_sliding_window": False,
            "vocab_size": 151936
        }


class CLIPDecoderConfig(PretrainedConfig):
    r"""
    Configuration class for CLIPDecoder model (legacy support).
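
    Example (a minimal usage sketch):

    ```python
    >>> config = CLIPDecoderConfig(input_dim=1024, bf16=True)
    >>> config.model_type
    'vlv_stage2'
    ```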
    """

    model_type = "vlv_stage2"

    def __init__(
        self,
        input_dim: int = 1024,
        bf16: bool = False,
        **kwargs,
    ):
        self.input_dim = input_dim
        self.bf16 = bf16
        super().__init__(**kwargs)
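

if __name__ == "__main__":
    # Minimal smoke test (a sketch for local sanity checking; not part of the model API).
    # It only exercises the configuration classes defined above and requires no model weights.
    vlv_config = VLV_Config(num_inference_steps=30, guidance_scale=3.0)
    print(vlv_config.model_type, vlv_config.num_inference_steps, vlv_config.guidance_scale)
    print("default qwen2 hidden_size:", vlv_config.qwen2_config["hidden_size"])

    clip_decoder_config = CLIPDecoderConfig(input_dim=1024, bf16=True)
    print(clip_decoder_config.model_type, clip_decoder_config.input_dim, clip_decoder_config.bf16)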