""" RWKV7 configuration"""

from transformers.configuration_utils import PretrainedConfig

from .modeling_blocks_rwkv7 import RWKV7GooseConfigMap


class RWKV7Config(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`RWKV7Model`]. It is used to instantiate an
    RWKV7 model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the RWKV-7
    [RWKV/v7-Goose-1.6B-Pile-HF](https://huggingface.co/RWKV/v7-Goose-1.6B-Pile-HF) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
|
    Args:
        vocab_size (`int`, *optional*, defaults to 65536):
            Vocabulary size of the RWKV7 model. Defines the number of different tokens that can be represented by
            the `input_ids` passed when calling [`RWKV7Model`].
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the model.
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        hidden_size_att (`int`, *optional*):
            Dimensionality of the attention hidden states. Computed from `hidden_size` if unset.
        hidden_size_ffn (`int`, *optional*):
            Dimensionality of the FFN hidden states. Computed from `hidden_size` if unset.
        head_size (`int`, *optional*, defaults to 64):
            Head size of the RWKV7 self-attention (time-mix) module.
        tmix_backend (`str`, *optional*, defaults to `"auto"`):
            Backend to use for the time-mix module; see the example below. `"auto"` resolves to `"pytorch"` on CPU
            and to `"cuda"` otherwise. Valid values: `"auto"`, `"pytorch"`, `"cuda"`, `"triton"`,
            `"triton_bighead"`, `"fla"`, `"fla_fused"`, `"pytorch_ref"`, `"pytorch_ref_fp32"`.
        init_state_wkv (`bool`, *optional*, defaults to `False`):
            Whether to initialize the WKV state in the model. Used for WKV state tuning.
        dropout_rate (`float`, *optional*, defaults to 0.0):
            Dropout rate applied within the model layers.
        device (`str`, *optional*):
            Device to place the model on. Accepts any value valid for `torch.device`.
        dtype (`str`, *optional*):
            Model weights data type. Accepts any value valid for `torch.dtype`.
        bos_token_id (`int`, *optional*, defaults to 0):
            The id of the beginning-of-sentence token in the vocabulary.
        eos_token_id (`int`, *optional*, defaults to 0):
            The id of the end-of-sentence token in the vocabulary.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether or not to tie the word embeddings with the input token embeddings.
            (Currently ignored by this implementation.)
|
    Example:

    ```python
    >>> from transformers import RWKV7Config, RWKV7Model

    >>> # Initializing an RWKV7 configuration
    >>> configuration = RWKV7Config()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = RWKV7Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
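
    >>> # Selecting a non-default time-mix backend and enabling WKV state tuning
    >>> # (illustrative values; see the `tmix_backend` docs above for the full list)
    >>> configuration = RWKV7Config(tmix_backend="triton", init_state_wkv=True)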
|
```""" |
|
|
|
model_type = "rwkv7" |
|
|
|

    def __init__(
        self,
        vocab_size=65536,
        num_hidden_layers=24,
        hidden_size=768,
        hidden_size_att=None,
        hidden_size_ffn=None,
        head_size=64,
        tmix_backend="auto",
        init_state_wkv=False,
        dropout_rate=0.0,
        device=None,
        dtype=None,
        bos_token_id=0,
        eos_token_id=0,
        tie_word_embeddings=False,
        **kwargs,
    ):
        # Accept the legacy `torch_dtype` kwarg as an alias for `dtype`.
        if dtype is None and "torch_dtype" in kwargs:
            dtype = kwargs["torch_dtype"]

        self.vocab_size = vocab_size
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.hidden_size_att = hidden_size_att
        self.hidden_size_ffn = hidden_size_ffn

        self.head_size = head_size
        self.tmix_backend = tmix_backend
        self.init_state_wkv = init_state_wkv

        self.device = device
        self.dtype = dtype

        self.dropout_rate = dropout_rate

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

    @staticmethod
    def from_model_state_dict(state_dict: dict, **kwargs):
        """Build an `RWKV7Config` from a model state dict, inferring the architecture settings."""
        goose_config = RWKV7GooseConfigMap.from_model_state_dict(state_dict)
        return RWKV7Config(**{**goose_config.__dict__, **kwargs})
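

# Hedged usage sketch: constructing a config directly from checkpoint weights, letting
# `RWKV7GooseConfigMap` infer the architecture hyperparameters while overriding options
# via kwargs. The checkpoint filename and the `safetensors` loader below are illustrative
# assumptions, not part of this module:
#
#     from safetensors.torch import load_file
#     state_dict = load_file("v7_goose.safetensors")  # hypothetical checkpoint file
#     config = RWKV7Config.from_model_state_dict(state_dict, tmix_backend="cuda")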