""" RWKV configuration"""
from transformers.configuration_utils import PretrainedConfig

# from transformers.utils import logging
# logger = logging.get_logger(__name__)

# Import the dependencies
from .modeling_blocks_rwkv7 import RWKV7GooseConfigMap


class RWKV7Config(PretrainedConfig):
"""
    This is the configuration class to store the configuration of a [`RWKV7Model`]. It is used to instantiate a RWKV7
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a configuration similar to that of the RWKV-7
    [RWKV/v7-Goose-1.6B-Pile-HF](https://huggingface.co/RWKV/v7-Goose-1.6B-Pile-HF) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 65536):
Vocabulary size of the RWKV7 model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`RWKV7Model`].
num_hidden_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the model.
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the embeddings and hidden states.
hidden_size_att (`int`, *optional*):
Dimensionality of the attention hidden states. Will be computed from `hidden_size` if unset.
hidden_size_ffn (`int`, *optional*):
Dimensionality of the FFN hidden states. Will be computed from `hidden_size` if unset.
head_size (`int`, *optional*, defaults to 64):
            Head size of the RWKV7 time-mix (self-attention) module.
        tmix_backend (`str`, *optional*, defaults to `"auto"`):
            Backend to use for the time-mix module. `"auto"` resolves to `"pytorch"` when the device is `"cpu"` and
            `"cuda"` otherwise. Valid values: `"auto"`, `"pytorch"`, `"cuda"`, `"triton"`, `"triton_bighead"`,
            `"fla"`, `"fla_fused"`, `"pytorch_ref"`, `"pytorch_ref_fp32"`.
        init_state_wkv (`bool`, *optional*, defaults to `False`):
            Whether to initialize the WKV state in the model. Used for WKV state tuning.
        dropout_rate (`float`, *optional*, defaults to 0.0):
            Dropout rate applied during training.
        device (`str`, *optional*):
            Device to place the model on. Accepts any valid `torch.device` string (e.g. `"cpu"`, `"cuda"`).
        dtype (`str`, *optional*):
            Data type of the model weights. Accepts any valid `torch.dtype` name (e.g. `"float32"`, `"bfloat16"`).
        bos_token_id (`int`, *optional*, defaults to 0):
            The id of the beginning-of-sentence token in the vocabulary.
        eos_token_id (`int`, *optional*, defaults to 0):
            The id of the end-of-sentence token in the vocabulary.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether or not to tie the output projection weights to the input token embeddings.
            (This value is currently ignored in our implementation.)
    Example:

    ```python
    >>> # Initializing a RWKV7 configuration
    >>> configuration = RWKV7Config()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = RWKV7Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
model_type = "rwkv7"
def __init__(
self,
########################################
# Vocab, layer count, and hidden size
vocab_size=65536,
num_hidden_layers=24,
hidden_size=768,
# Optional hidden sizes
hidden_size_att=None,
hidden_size_ffn=None,
# Headsize, timemix backend
head_size=64,
tmix_backend="auto",
init_state_wkv=False,
# Trainer model configs
dropout_rate=0.0,
# Torch device and dtype
device=None,
dtype=None,
# Tokenizer related settings in HF configuration
bos_token_id=0,
eos_token_id=0,
tie_word_embeddings=False,
########################################
**kwargs,
):
# Normalize dtype if torch_dtype is set within kwargs
if dtype is None and "torch_dtype" in kwargs:
dtype = kwargs["torch_dtype"]
self.vocab_size = vocab_size
self.num_hidden_layers = num_hidden_layers
self.hidden_size = hidden_size
self.hidden_size_att = hidden_size_att
self.hidden_size_ffn = hidden_size_ffn
self.head_size = head_size
self.tmix_backend = tmix_backend
self.init_state_wkv = init_state_wkv
self.device = device
self.dtype = dtype
self.dropout_rate = dropout_rate
# Forward to the HF PretrainedConfig
super().__init__(
tie_word_embeddings=tie_word_embeddings,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs
)

    @staticmethod
    def from_model_state_dict(state_dict: dict, **kwargs):
        """Builds a RWKV7Config by inferring the architecture settings from a model state dict."""
        goose_config = RWKV7GooseConfigMap.from_model_state_dict(state_dict)
        # Merge the inferred values with any explicit overrides (kwargs take precedence)
        return RWKV7Config(**{**goose_config.__dict__, **kwargs})
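
# Usage sketch for `from_model_state_dict` (the checkpoint filename below is a
# placeholder, not something shipped in this repo). Explicit kwargs such as
# `tmix_backend` override the values inferred from the state dict:
#
#     import torch
#     state_dict = torch.load("rwkv7-checkpoint.pth", map_location="cpu")
#     config = RWKV7Config.from_model_state_dict(state_dict, tmix_backend="pytorch")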