# coding=utf-8
# Copyright 2024 VLV Team and the HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""VLV model configuration"""

from typing import Optional, Dict, Any
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class VLV_Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`VLV_MODEL`]. It is used to instantiate a VLV model
    according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        model_type (`str`, *optional*, defaults to "VLV_decoder"):
            The model type identifier.
        batch_size (`int`, *optional*, defaults to 1):
            The batch size for inference.
        deepspeed (`bool`, *optional*, defaults to True):
            Whether to use DeepSpeed.
        distributed (`bool`, *optional*, defaults to True):
            Whether to use distributed training.
        fp32 (`bool`, *optional*, defaults to True):
            Whether to use fp32 precision.
        guidance_scale (`float`, *optional*, defaults to 2.0):
            The guidance scale for generation.
        hidden_size (`int`, *optional*, defaults to 128):
            The hidden size of the model.
        image_size (`int`, *optional*, defaults to 768):
            The size of input images.
        learnable_token_length (`int`, *optional*, defaults to 77):
            The length of learnable tokens.
        local_rank (`int`, *optional*, defaults to 0):
            The local rank for distributed training.
        mixed_precision (`str`, *optional*, defaults to "bf16"):
            The mixed precision mode.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of inference steps.
        torch_dtype (`str`, *optional*, defaults to "bfloat16"):
            The torch dtype to use.
        transformers_version (`str`, *optional*, defaults to "4.51.1"):
            The transformers version the configuration was created with.
        use_text_encoder (`bool`, *optional*, defaults to True):
            Whether to use the text encoder.
        verbose (`bool`, *optional*, defaults to True):
            Whether to enable verbose mode.
        qwen_model (`str`, *optional*, defaults to "Qwen/Qwen2.5-3B"):
            The Qwen model to use.
        stable_diffusion_model_path (`str`, *optional*, defaults to "stabilityai/stable-diffusion-2-1-base"):
            The Stable Diffusion checkpoint to use.
        florence2_model_path (`str`, *optional*, defaults to "microsoft/Florence-2-large"):
            The Florence-2 checkpoint to use.
        qwen2_config (`dict`, *optional*):
            The Qwen2 configuration dict. If not provided, a Qwen2.5-3B-style default configuration is used.
        max_length (`int`, *optional*, defaults to 300):
            Maximum length for generation.
        num_beams (`int`, *optional*, defaults to 4):
            Number of beams for beam search.
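
    Example (a minimal usage sketch; values shown below come from the defaults defined in this class):

    ```python
    >>> config = VLV_Config(num_inference_steps=30, guidance_scale=3.0)
    >>> config.model_type
    'VLV_decoder'
    >>> # When qwen2_config is not provided, a Qwen2.5-3B-style default is used.
    >>> config.qwen2_config["hidden_size"]
    2048
    ```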
    """

    model_type = "VLV_decoder"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        model_type: str = "VLV_decoder",
        batch_size: int = 1,
        deepspeed: bool = True,
        distributed: bool = True,
        fp32: bool = True,
        guidance_scale: float = 2.0,
        hidden_size: int = 128,
        image_size: int = 768,
        learnable_token_length: int = 77,
        local_rank: int = 0,
        mixed_precision: str = "bf16",
        num_inference_steps: int = 50,
        torch_dtype: str = "bfloat16",
        transformers_version: str = "4.51.1",
        use_text_encoder: bool = True,
        verbose: bool = True,
        qwen_model: str = "Qwen/Qwen2.5-3B",
        stable_diffusion_model_path: str = "stabilityai/stable-diffusion-2-1-base",
        florence2_model_path: str = "microsoft/Florence-2-large",
        qwen2_config: Optional[Dict[str, Any]] = None,
        max_length: int = 300,
        num_beams: int = 4,
        **kwargs,
    ):
        self.model_type = model_type
        self.batch_size = batch_size
        self.deepspeed = deepspeed
        self.distributed = distributed
        self.fp32 = fp32
        self.guidance_scale = guidance_scale
        self.hidden_size = hidden_size
        self.image_size = image_size
        self.learnable_token_length = learnable_token_length
        self.local_rank = local_rank
        self.mixed_precision = mixed_precision
        self.num_inference_steps = num_inference_steps
        self.torch_dtype = torch_dtype
        self.transformers_version = transformers_version
        self.use_text_encoder = use_text_encoder
        self.verbose = verbose
        self.qwen_model = qwen_model
        self.stable_diffusion_model_path = stable_diffusion_model_path
        self.florence2_model_path = florence2_model_path
        self.qwen2_config = qwen2_config or self._get_default_qwen2_config()
        self.max_length = max_length
        self.num_beams = num_beams

        super().__init__(**kwargs)

    def _get_default_qwen2_config(self):
        """Get default Qwen2 configuration."""
        return {
            "architectures": ["Qwen2ForCausalLM"],
            "attention_dropout": 0.0,
            "bos_token_id": 151643,
            "eos_token_id": 151643,
            "hidden_act": "silu",
            "hidden_size": 2048,
            "initializer_range": 0.02,
            "intermediate_size": 11008,
            "max_position_embeddings": 32768,
            "max_window_layers": 36,
            "model_type": "qwen2",
            "num_attention_heads": 16,
            "num_hidden_layers": 36,
            "num_key_value_heads": 2,
            "rms_norm_eps": 1e-06,
            "rope_theta": 1000000.0,
            "sliding_window": 32768,
            "tie_word_embeddings": True,
            "torch_dtype": "bfloat16",
            "transformers_version": "4.40.1",
            "use_cache": True,
            "use_mrope": False,
            "use_sliding_window": False,
            "vocab_size": 151936
        }


class CLIPDecoderConfig(PretrainedConfig):
    r"""
    Configuration class for CLIPDecoder model (legacy support).
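
    Example (a minimal usage sketch):

    ```python
    >>> config = CLIPDecoderConfig(input_dim=1024, bf16=True)
    >>> config.model_type
    'vlv_stage2'
    ```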
    """

    model_type = "vlv_stage2"

    def __init__(
        self,
        input_dim: int = 1024,
        bf16: bool = False,
        **kwargs,
    ):
        self.input_dim = input_dim
        self.bf16 = bf16
        super().__init__(**kwargs)
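

if __name__ == "__main__":
    # Minimal smoke test (a sketch for local sanity checking; not part of the model API).
    # It only exercises the configuration classes defined above and requires no model weights.
    vlv_config = VLV_Config(num_inference_steps=30, guidance_scale=3.0)
    print(vlv_config.model_type, vlv_config.num_inference_steps, vlv_config.guidance_scale)
    print("default qwen2 hidden_size:", vlv_config.qwen2_config["hidden_size"])

    clip_decoder_config = CLIPDecoderConfig(input_dim=1024, bf16=True)
    print(clip_decoder_config.model_type, clip_decoder_config.input_dim, clip_decoder_config.bf16)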