""" RWKV configuration"""

from transformers.configuration_utils import PretrainedConfig
# from transformers.utils import logging
# logger = logging.get_logger(__name__)

# Local dependency: used to derive a config from a model state dict
from .modeling_blocks_rwkv7 import RWKV7GooseConfigMap

class RWKV7Config(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`RWKV7Model`]. It is used to instantiate a RWKV7
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a configuration similar to that of the RWKV-7
    [RWKV/v7-Goose-1.6B-Pile-HF](https://huggingface.co/RWKV/v7-Goose-1.6B-Pile-HF) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 65536):
            Vocabulary size of the RWKV7 model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`RWKV7Model`].
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the model.
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        
        hidden_size_att (`int`, *optional*):
            Dimensionality of the attention hidden states. Will be computed from `hidden_size` if unset.
        hidden_size_ffn (`int`, *optional*):
            Dimensionality of the FFN hidden states. Will be computed from `hidden_size` if unset.
        head_size (`int`, *optional*, defaults to 64):
            Head size of the RWKV7 time-mix (self-attention) module.
        tmix_backend (`str`, *optional*, defaults to "auto"):
            Backend to use for the time-mix module. "auto" resolves to "pytorch" on CPU devices and to "cuda" otherwise.
            (Valid values: "auto", "pytorch", "cuda", "triton", "triton_bighead", "fla", "fla_fused", "pytorch_ref", "pytorch_ref_fp32")
        init_state_wkv (`bool`, *optional*, defaults to `False`):
            Whether to initialize the WKV state in the model. Used for WKV state tuning.
        dropout_rate (`float`, *optional*, defaults to 0.0):
            Dropout rate applied during training.

        device (`str`, *optional*):
            Device to place the model on. Accepts any value that `torch.device` accepts.
        dtype (`str`, *optional*):
            Data type of the model weights. Accepts any value that maps to a `torch.dtype`.

        bos_token_id (`int`, *optional*, defaults to 0):
            The id of the beginning-of-sentence token in the vocabulary.
        eos_token_id (`int`, *optional*, defaults to 0):
            The id of the end-of-sentence token in the vocabulary.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether or not to tie the word embeddings with the input token embeddings.
            (this value is currently ignored in our implementation)

    Example:

    ```python
    >>> # Import from this repo's files (module names assume the usual HF
    >>> # custom-code layout: configuration_rwkv7.py / modeling_rwkv7.py)
    >>> from configuration_rwkv7 import RWKV7Config
    >>> from modeling_rwkv7 import RWKV7Model

    >>> # Initializing a RWKV7 configuration
    >>> configuration = RWKV7Config()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = RWKV7Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
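
    >>> # A hedged sketch: overriding selected fields (values are illustrative,
    >>> # not recommended settings)
    >>> configuration = RWKV7Config(num_hidden_layers=12, hidden_size=512, tmix_backend="pytorch")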
    ```"""

    model_type = "rwkv7"

    def __init__(
        self,
        ########################################
        # Vocab, layer count, and hidden size
        vocab_size=65536,
        num_hidden_layers=24,
        hidden_size=768,
        # Optional hidden sizes
        hidden_size_att=None,
        hidden_size_ffn=None,
        # Headsize, timemix backend
        head_size=64,
        tmix_backend="auto",
        init_state_wkv=False,
        # Trainer model configs
        dropout_rate=0.0,
        # Torch device and dtype
        device=None,
        dtype=None,
        # Tokenizer related settings in HF configuration
        bos_token_id=0,
        eos_token_id=0,
        tie_word_embeddings=False,
        ########################################
        **kwargs,
    ):
        # Normalize dtype if torch_dtype is set within kwargs
        if dtype is None and "torch_dtype" in kwargs:
            dtype = kwargs["torch_dtype"]

        self.vocab_size = vocab_size
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.hidden_size_att = hidden_size_att
        self.hidden_size_ffn = hidden_size_ffn

        self.head_size = head_size
        self.tmix_backend = tmix_backend
        self.init_state_wkv = init_state_wkv

        self.device = device
        self.dtype = dtype

        self.dropout_rate = dropout_rate
        
        # Forward to the HF PretrainedConfig
        super().__init__(
            tie_word_embeddings=tie_word_embeddings, 
            bos_token_id=bos_token_id, 
            eos_token_id=eos_token_id, 
            **kwargs
        )

    @staticmethod
    def from_model_state_dict(state_dict: dict, **kwargs):
        """Build an RWKV7Config by inferring architecture settings from a model state dict."""
        goose_config = RWKV7GooseConfigMap.from_model_state_dict(state_dict)
        # Merge the inferred settings with any caller overrides (kwargs take precedence)
        return RWKV7Config(**{**goose_config.__dict__, **kwargs})
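
# Hedged usage sketch for `from_model_state_dict`: the checkpoint filename and
# the `torch.load` call below are illustrative assumptions; any state dict
# following the RWKV7 Goose weight layout expected by RWKV7GooseConfigMap
# should work. kwargs (here `tmix_backend`) override the inferred values.
#
#     import torch
#     state_dict = torch.load("rwkv7-goose.pth", map_location="cpu")
#     config = RWKV7Config.from_model_state_dict(state_dict, tmix_backend="pytorch")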