Tima5
/

TimaAI

Safetensors

Russian

timaai_v3

custom_code

fp8

Model card Files Files and versions Community

Tima5 committed on Mar 25

Commit

3b8bead

verified ·

1 Parent(s): 5accf0d

Update configuration_timaai.py

Browse files

Files changed (1) hide show

configuration_timaai.py +10 -10

configuration_timaai.py CHANGED Viewed

@@ -3,12 +3,12 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)
-DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
-class DeepseekV3Config(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the DeepSeek-V3.
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -17,7 +17,7 @@ class DeepseekV3Config(PretrainedConfig):
     Args:
         vocab_size (`int`, *optional*, defaults to 129280):
             Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`DeepseekV3Model`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 11008):
@@ -27,7 +27,7 @@ class DeepseekV3Config(PretrainedConfig):
         num_hidden_layers (`int`, *optional*, defaults to 32):
             Number of hidden layers in the Transformer decoder.
         num_nextn_predict_layers (`int`, *optional*, defaults to 1):
-            Number of nextn predict layers in the DeepSeekV3 Model.
         num_attention_heads (`int`, *optional*, defaults to 32):
             Number of attention heads for each attention layer in the Transformer decoder.
         n_shared_experts (`int`, *optional*, defaults to None):
@@ -102,16 +102,16 @@ class DeepseekV3Config(PretrainedConfig):
             The dropout ratio for the attention probabilities.
     ```python
-    >>> from transformers import DeepseekV3Model, DeepseekV3Config
-    >>> # Initializing a Deepseek-V3 style configuration
-    >>> configuration = DeepseekV3Config()
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
-    model_type = "deepseek_v3"
     keys_to_ignore_at_inference = ["past_key_values"]
     def __init__(

 logger = logging.get_logger(__name__)
+timaai_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+class timaaiV3Config(PretrainedConfig):
     r"""
+    This is the configuration class to store the configuration of a [`timaaiV3Model`]. It is used to instantiate a timaai
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the timaai-V3.
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
     Args:
         vocab_size (`int`, *optional*, defaults to 129280):
             Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`timaaiV3Model`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 11008):
         num_hidden_layers (`int`, *optional*, defaults to 32):
             Number of hidden layers in the Transformer decoder.
         num_nextn_predict_layers (`int`, *optional*, defaults to 1):
+            Number of nextn predict layers in the timaaiV3 Model.
         num_attention_heads (`int`, *optional*, defaults to 32):
             Number of attention heads for each attention layer in the Transformer decoder.
         n_shared_experts (`int`, *optional*, defaults to None):
             The dropout ratio for the attention probabilities.
     ```python
+    >>> from transformers import timaaiV3Model, timaaiV3Config
+    >>> # Initializing a timaai-V3 style configuration
+    >>> configuration = timaaiV3Config()
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+    model_type = "timaai_v3"
     keys_to_ignore_at_inference = ["past_key_values"]
     def __init__(