qingjun committed · Commit 1553534 · Parent(s): c0ea1e7
fix
Files changed:
- .gitattributes +0 -0
- LICENSE-CODE +0 -0
- LICENSE-MODEL +0 -0
- README.md +0 -0
- config.json +12 -8
- configuration_minimax_m1.py +14 -14
- main.py +11 -5
- merges.txt +0 -0
- model.safetensors.index.json +0 -0
- modeling_minimax_m1.py +57 -57
- tokenizer.json +0 -0
- tokenizer_config.json +1 -1
- vocab.json +0 -0
.gitattributes CHANGED
File without changes

LICENSE-CODE CHANGED
File without changes

LICENSE-MODEL CHANGED
File without changes

README.md CHANGED
File without changes
config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "architectures": [
-    "
+    "MiniMaxText01ForCausalLM"
   ],
   "attention_dropout": 0.0,
   "attn_type_list": [
@@ -86,21 +86,24 @@
     1
   ],
   "auto_map": {
-    "AutoConfig": "configuration_minimax_m1.
-    "AutoModelForCausalLM": "modeling_minimax_m1.
+    "AutoConfig": "configuration_minimax_m1.MiniMaxText01Config",
+    "AutoModelForCausalLM": "modeling_minimax_m1.MiniMaxText01ForCausalLM"
   },
-  "bos_token_id":
-  "eos_token_id":
+  "bos_token_id": null,
+  "eos_token_id": null,
   "head_dim": 128,
   "hidden_act": "silu",
   "hidden_size": 6144,
   "initializer_range": 0.02,
   "intermediate_size": 9216,
   "layernorm_full_attention_alpha": 3.5565588200778455,
+  "layernorm_full_attention_beta": 1.0,
   "layernorm_linear_attention_alpha": 3.5565588200778455,
+  "layernorm_linear_attention_beta": 1.0,
   "layernorm_mlp_alpha": 3.5565588200778455,
-  "
-  "
+  "layernorm_mlp_beta": 1.0,
+  "max_position_embeddings": 10240000,
+  "model_type": "minimax_text_01",
   "num_attention_heads": 64,
   "num_experts_per_tok": 2,
   "num_hidden_layers": 80,
@@ -117,7 +120,8 @@
   "shared_moe_mode": "sigmoid",
   "sliding_window": null,
   "tie_word_embeddings": false,
-  "transformers_version": "4.
+  "transformers_version": "4.45.2",
   "use_cache": true,
   "vocab_size": 200064
 }
+
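With the updated `auto_map`, `AutoConfig` and `AutoModelForCausalLM` resolve to the custom `MiniMaxText01*` classes shipped alongside this config, so the checkpoint is intended to be loaded with `trust_remote_code=True`. A minimal loading sketch; the repository id below is a placeholder, not part of this commit:

```python
# Hedged sketch: load the checkpoint through the auto_map entries above.
# "REPO_OR_LOCAL_PATH" is a placeholder for wherever this repository lives.
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

repo = "REPO_OR_LOCAL_PATH"  # placeholder, not taken from the diff

# trust_remote_code=True lets transformers import
# configuration_minimax_m1.MiniMaxText01Config and
# modeling_minimax_m1.MiniMaxText01ForCausalLM from the repository files.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)
```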
configuration_minimax_m1.py CHANGED
@@ -1,4 +1,4 @@
-"""
+""" MiniMaxText01 model configuration"""
 
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
@@ -7,11 +7,11 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)
 
 
-class
+class MiniMaxText01Config(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`
-
-    with the defaults will yield a similar configuration to that of the
+    This is the configuration class to store the configuration of a [`MiniMaxText01Model`]. It is used to instantiate an
+    MiniMaxText01 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the MiniMaxText01.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -19,8 +19,8 @@ class MiniMaxM1Config(PretrainedConfig):
 
     Args:
         vocab_size (`int`, *optional*, defaults to 32000):
-            Vocabulary size of the
-            `inputs_ids` passed when calling [`
+            Vocabulary size of the MiniMaxText01 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`MiniMaxText01Model`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 14336):
@@ -39,7 +39,7 @@ class MiniMaxM1Config(PretrainedConfig):
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
         max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
-            The maximum sequence length that this model might ever be used with.
+            The maximum sequence length that this model might ever be used with. MiniMaxText01's sliding window attention
             allows sequence of up to 4096*32 tokens.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
@@ -76,19 +76,19 @@ class MiniMaxM1Config(PretrainedConfig):
             Amount of noise to add to the router.
 
     ```python
-    >>> from transformers import
+    >>> from transformers import MiniMaxText01Model, MiniMaxText01Config
 
-    >>> # Initializing a
-    >>> configuration =
+    >>> # Initializing a MiniMaxText01 style configuration
+    >>> configuration = MiniMaxText01Config()
 
-    >>> # Initializing a model from the
-    >>> model =
+    >>> # Initializing a model from the MiniMaxText01 style configuration
+    >>> model = MiniMaxText01Model(configuration)
 
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
 
-    model_type = "
+    model_type = "MiniMaxText01"
     keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(
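The renamed `MiniMaxText01Config` can also be constructed directly, mirroring the docstring example above. A hedged sketch that plugs in a few of the values visible in config.json from this commit; everything else stays at the class defaults, and the local import path is an assumption:

```python
# Hedged sketch: assumes configuration_minimax_m1.py from this repo is importable.
from configuration_minimax_m1 import MiniMaxText01Config

config = MiniMaxText01Config(
    vocab_size=200064,       # value from config.json in this commit
    hidden_size=6144,        # value from config.json
    intermediate_size=9216,  # value from config.json
    num_hidden_layers=80,    # value from config.json
    num_attention_heads=64,  # value from config.json
)
print(config.model_type)  # "MiniMaxText01", as set on the class above
```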
main.py CHANGED
@@ -63,13 +63,18 @@ def main():
             device_map[f'model.layers.{i * layers_per_device + j}'] = f'cuda:{i}'
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-
-        {"role": "
-
+    message = [
+        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+        {"role": "user", "content": [{"type": "text", "text": "Hello, what is the weather today?"}]}
+    ]
+    tools = [
+        {"name": "get_location", "description": "Get the location of the user.", "parameters": {"type": "object", "properties": {}}},
+        {"name": "get_weather", "description": "Get the weather of a city.", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "The name of the city"}}}},
+        {"name": "get_news", "description": "Get the news.", "parameters": {"type": "object", "properties": {"domain": {"type": "string", "description": "The domain of the news"}}}}
     ]
     text = tokenizer.apply_chat_template(
-
+        message,
+        tools,
         tokenize=False,
         add_generation_prompt=True
     )
@@ -98,3 +103,4 @@ def main():
 if __name__ == "__main__":
     main()
 
+
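`apply_chat_template(message, tools, tokenize=False, add_generation_prompt=True)` returns the rendered prompt string; the rest of `main()` is not visible in this diff. A hedged sketch of a typical follow-up, assuming a standard `generate()` flow with `model` and `tokenizer` already defined in `main()` (not taken from the file):

```python
# Hedged continuation sketch; main.py's remaining lines are not shown in this diff.
import torch

model_inputs = tokenizer(text, return_tensors="pt").to(model.device)
with torch.no_grad():
    generated_ids = model.generate(**model_inputs, max_new_tokens=512)

# Decode only the newly generated tokens, dropping the prompt prefix.
new_tokens = generated_ids[:, model_inputs["input_ids"].shape[1]:]
print(tokenizer.decode(new_tokens[0], skip_special_tokens=True))
```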
merges.txt CHANGED
File without changes

model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff.
modeling_minimax_m1.py CHANGED
@@ -1,4 +1,4 @@
-""" PyTorch
+""" PyTorch MiniMaxText01 model."""
 import inspect
 import math
 import warnings
@@ -31,7 +31,7 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from transformers.utils.import_utils import is_torch_fx_available
-from .configuration_minimax_m1 import
+from .configuration_minimax_m1 import MiniMaxText01Config
 
 if is_flash_attn_2_available():
     from flash_attn import flash_attn_func, flash_attn_varlen_func
@@ -52,7 +52,7 @@ BLOCK = 256
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "
+_CONFIG_FOR_DOC = "MiniMaxText01Config"
 
 
 def get_activation_fn(activation):
@@ -207,8 +207,8 @@ class GLU(nn.Module):
         return output
 
 
-class
-    def __init__(self, config:
+class MiniMaxText01LightningAttention(nn.Module):
+    def __init__(self, config: MiniMaxText01Config, layer_idx: Optional[int] = None):
         super().__init__()
         bias = False
         self.hidden_size = config.hidden_size
@@ -217,7 +217,7 @@ class MiniMaxM1LightningAttention(nn.Module):
 
         self.out_proj = nn.Linear(self.head_dim * self.num_heads, self.hidden_size, bias=bias)
         self.act = get_activation_fn(config.hidden_act)
-        self.norm =
+        self.norm = MiniMaxText01RMSNorm(self.head_dim * self.num_heads)
 
         self.qkv_proj = nn.Linear(self.hidden_size, 3 * self.head_dim * self.num_heads, bias=bias)
         self.output_gate = nn.Linear(self.hidden_size, self.head_dim * self.num_heads, bias=bias)
@@ -338,11 +338,11 @@ class MiniMaxM1LightningAttention(nn.Module):
         return output, attn_weights, kv
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->
-class
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->MiniMaxText01
+class MiniMaxText01RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-
+        MiniMaxText01RMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -356,8 +356,8 @@ class MiniMaxM1RMSNorm(nn.Module):
         return self.weight * hidden_states.to(input_dtype)
 
 
-# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->
-class
+# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->MiniMaxText01
+class MiniMaxText01RotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
 
@@ -447,14 +447,14 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 
 
-# Copied from transformers.models.mistral.modeling_mistral.MistralAttention with Mistral->
-class
+# Copied from transformers.models.mistral.modeling_mistral.MistralAttention with Mistral->MiniMaxText01
+class MiniMaxText01Attention(nn.Module):
     """
     Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
     and "Generating Long Sequences with Sparse Transformers".
     """
 
-    def __init__(self, config:
+    def __init__(self, config: MiniMaxText01Config, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -481,7 +481,7 @@ class MiniMaxM1Attention(nn.Module):
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
         self.rotary_dim = getattr(config, 'rotary_dim', self.head_dim)
 
-        self.rotary_emb =
+        self.rotary_emb = MiniMaxText01RotaryEmbedding(
             self.rotary_dim,
             max_position_embeddings=self.max_position_embeddings,
             base=self.rope_theta,
@@ -572,10 +572,10 @@ class MiniMaxM1Attention(nn.Module):
         return attn_output, attn_weights, past_key_value
 
 
-# Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->
-class
+# Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->MiniMaxText01
+class MiniMaxText01FlashAttention2(MiniMaxText01Attention):
     """
-
+    MiniMaxText01 flash attention module. This module inherits from `MiniMaxText01Attention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
@@ -836,7 +836,7 @@ class MiniMaxM1FlashAttention2(MiniMaxM1Attention):
     )
 
 
-class
+class MiniMaxText01MLP(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -852,8 +852,8 @@ class MiniMaxM1MLP(nn.Module):
         return down_proj
 
 
-class
-    def __init__(self, config:
+class MiniMaxText01BlockSparseTop2MLP(nn.Module):
+    def __init__(self, config: MiniMaxText01Config):
         super().__init__()
         self.ffn_dim = config.intermediate_size
         self.hidden_dim = config.hidden_size
@@ -870,15 +870,15 @@ class MiniMaxM1BlockSparseTop2MLP(nn.Module):
         return current_hidden_states
 
 
-class
+class MiniMaxText01BLockSparseTop2MLP(MiniMaxText01BlockSparseTop2MLP):
     def __init__(self, *args, **kwargs):
         logger.warning_once(
-            "
+            "MiniMaxText01BLockSparseTop2MLP is deprecated by MiniMaxText01BlockSparseTop2MLP and will be removed in v4.40."
         )
         super().__init__(*args, **kwargs)
 
 
-class
+class MiniMaxText01SparseMoeBlock(nn.Module):
     """
     This implementation is
     strictly equivalent to standard MoE with full capacity (no
@@ -900,7 +900,7 @@ class MiniMaxM1SparseMoeBlock(nn.Module):
         # gating
         self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
 
-        self.experts = nn.ModuleList([
+        self.experts = nn.ModuleList([MiniMaxText01BlockSparseTop2MLP(config) for _ in range(self.num_experts)])
 
         # Jitter parameters
         self.jitter_noise = config.router_jitter_noise
@@ -946,8 +946,8 @@ class MiniMaxM1SparseMoeBlock(nn.Module):
         return final_hidden_states, router_logits
 
 
-class
-    def __init__(self, config:
+class MiniMaxText01DecoderLayer(nn.Module):
+    def __init__(self, config: MiniMaxText01Config, layer_idx: int):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -956,9 +956,9 @@ class MiniMaxM1DecoderLayer(nn.Module):
 
         self.layer_idx = layer_idx
 
-        self.block_sparse_moe =
-        self.input_layernorm =
-        self.post_attention_layernorm =
+        self.block_sparse_moe = MiniMaxText01SparseMoeBlock(config)
+        self.input_layernorm = MiniMaxText01RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = MiniMaxText01RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.postnorm = getattr(config, 'postnorm', False)
         self.layernorm_attention_alpha = getattr(config, 'layernorm_linear_attention_alpha', 1) \
@@ -972,14 +972,14 @@ class MiniMaxM1DecoderLayer(nn.Module):
         self.shared_moe = False
         if shared_intermediate > 0:
             self.shared_moe = True
-            self.shared_mlp =
+            self.shared_mlp = MiniMaxText01MLP(config)
             self.coefficient = torch.nn.Linear(self.hidden_size, 1, bias=False)
 
     def build_attn(self, config, layer_idx):
         if config.attention_type == 0:
-            Attention_module =
+            Attention_module = MiniMaxText01LightningAttention
         else:
-            Attention_module =
+            Attention_module = MiniMaxText01FlashAttention2
 
         return Attention_module(
             config,
@@ -1081,7 +1081,7 @@ MIXTRAL_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`
+        config ([`MiniMaxText01Config`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1089,15 +1089,15 @@ MIXTRAL_START_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare
+    "The bare MiniMaxText01 Model outputting raw hidden-states without any specific head on top.",
     MIXTRAL_START_DOCSTRING,
 )
-# Copied from transformers.models.mistral.modeling_mistral.MistralPreTrainedModel with Mistral->
-class
-    config_class =
+# Copied from transformers.models.mistral.modeling_mistral.MistralPreTrainedModel with Mistral->MiniMaxText01
+class MiniMaxText01PreTrainedModel(PreTrainedModel):
+    config_class = MiniMaxText01Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["
+    _no_split_modules = ["MiniMaxText01DecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
@@ -1182,19 +1182,19 @@ MIXTRAL_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare
+    "The bare MiniMaxText01 Model outputting raw hidden-states without any specific head on top.",
     MIXTRAL_START_DOCSTRING,
 )
-# Copied from transformers.models.mistral.modeling_mistral.MistralModel with MISTRAL->MIXTRAL,Mistral->
-class
+# Copied from transformers.models.mistral.modeling_mistral.MistralModel with MISTRAL->MIXTRAL,Mistral->MiniMaxText01
+class MiniMaxText01Model(MiniMaxText01PreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MiniMaxText01DecoderLayer`]
 
     Args:
-        config:
+        config: MiniMaxText01Config
     """
 
-    def __init__(self, config:
+    def __init__(self, config: MiniMaxText01Config):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
@@ -1212,10 +1212,10 @@ class MiniMaxM1Model(MiniMaxM1PreTrainedModel):
             else:
                 _config._attn_implementation = config_copy._attn_implementation
                 _config.attention_type = 1
-            self.layers.append(
+            self.layers.append(MiniMaxText01DecoderLayer(_config, i))
 
         self._attn_implementation = config_copy._attn_implementation
-        self.norm =
+        self.norm = MiniMaxText01RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False
         self.slopes = self._build_slope_tensor(config.num_attention_heads)
@@ -1327,7 +1327,7 @@ class MiniMaxM1Model(MiniMaxM1PreTrainedModel):
             if is_padding_right:
                 raise ValueError(
                     "You are attempting to perform batched generation with padding_side='right'"
-                    " this may lead to unexpected behaviour for Flash Attention version of
+                    " this may lead to unexpected behaviour for Flash Attention version of MiniMaxText01. Make sure to "
                     " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                 )
         slope_rates = [self.slopes.to(default_device) for _ in range(len(self.layers))]
@@ -1401,12 +1401,12 @@ class MiniMaxM1Model(MiniMaxM1PreTrainedModel):
         )
 
 
-class
+class MiniMaxText01ForCausalLM(MiniMaxText01PreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config):
         super().__init__(config)
-        self.model =
+        self.model = MiniMaxText01Model(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.router_aux_loss_coef = config.router_aux_loss_coef
@@ -1462,9 +1462,9 @@ class MiniMaxM1ForCausalLM(MiniMaxM1PreTrainedModel):
         Example:

         ```python
-        >>> from transformers import AutoTokenizer,
+        >>> from transformers import AutoTokenizer, MiniMaxText01ForCausalLM

-        >>> model =
+        >>> model = MiniMaxText01ForCausalLM.from_pretrained(PATH_TO_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_WEIGHTS)

         >>> prompt = "Hey, are you conscious? Can you talk to me?"
@@ -1579,9 +1579,9 @@ class MiniMaxM1ForCausalLM(MiniMaxM1PreTrainedModel):
 
 @add_start_docstrings(
     """
-    The
+    The MiniMaxText01 Model transformer with a sequence classification head on top (linear layer).
 
-    [`
+    [`MiniMaxText01ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
 
     Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -1592,12 +1592,12 @@ class MiniMaxM1ForCausalLM(MiniMaxM1PreTrainedModel):
     """,
     MIXTRAL_START_DOCSTRING,
 )
-# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->
-class
+# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->MiniMaxText01, LLAMA->MIXTRAL
+class MiniMaxText01ForSequenceClassification(MiniMaxText01PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model =
+        self.model = MiniMaxText01Model(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
 
         # Initialize weights and apply final processing
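The diff renames `LlamaRMSNorm`-derived code to `MiniMaxText01RMSNorm` ("equivalent to T5LayerNorm"). For readers without the Llama source at hand, here is a self-contained sketch of what that normalization computes, consistent with the fragments visible above (a `torch.ones(hidden_size)` weight and a cast back to the input dtype); it is illustrative, not copied from this file:

```python
import torch
from torch import nn


class RMSNormSketch(nn.Module):
    """Standard RMSNorm (LlamaRMSNorm/T5LayerNorm formulation): scale each hidden
    vector by the reciprocal of its root mean square, then apply a learned
    per-channel weight."""

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)
```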
tokenizer.json CHANGED
File without changes
tokenizer_config.json CHANGED
@@ -6,5 +6,5 @@
   "model_max_length": 40960000,
   "tokenizer_class": "GPT2Tokenizer",
   "unk_token": "<end_of_document>",
-  "chat_template": "{% for message in messages
+  "chat_template": "{{ '<begin_of_document>' -}}{% set ns = namespace(system_prompt='') -%}{% for message in messages -%}{% if message['role'] == 'system' -%}{% set ns.system_prompt = ns.system_prompt + message['content'][0]['text'] + '\n' -%}{% endif -%}{%- endfor -%}{% if ns.system_prompt != '' -%}{{ '<beginning_of_sentence>system ai_setting=assistant\n' + ns.system_prompt + '<end_of_sentence>\n' -}}{%- endif -%}{% if tools -%}{{ '<beginning_of_sentence>system tool_setting=tools\nYou are provided with these tools:\n<tools>\n' -}}{% for tool in tools -%}{{ tool | tojson ~ '\n' -}}{%- endfor -%}{{ '</tools>\n\nIf you need to call tools, please respond with <tool_calls></tool_calls> XML tags, and provide tool-name and json-object of arguments, following the format below:\n<tool_calls>\n{''name'': <tool-name-1>, ''arguments'': <args-json-object-1>}\n...\n</tool_calls><end_of_sentence>\n' -}}{%- endif -%}{% for message in messages -%}{% if message['role'] == 'user' -%}{{ '<beginning_of_sentence>user name=user\n' + message['content'][0]['text'] + '<end_of_sentence>\n' -}}{% elif message['role'] == 'assistant' -%}{{ '<beginning_of_sentence>ai name=assistant\n' -}}{% for content in message['content'] | selectattr('type', 'equalto', 'text') -%}{{ content['text'] -}}{%- endfor -%}{{ '<end_of_sentence>\n' -}}{% elif message['role'] == 'tool' -%}{{ '<beginning_of_sentence>tool name=tools\n' }} {%- for content in message['content'] -%}{{- 'tool name: ' + content['name'] + '\n' + 'tool result: ' + content['text'] + '\n\n' -}} {%- endfor -%}{{- '<end_of_sentence>\n' -}}{% endif -%}{%- endfor -%}{% if add_generation_prompt -%}{{ '<beginning_of_sentence>ai name=assistant\n' -}}{%- endif -%}"
 }
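The new chat template asks the model to wrap tool calls in `<tool_calls>...</tool_calls>` tags, one object per line with `name` and `arguments` fields. A hedged parsing sketch for such replies (not part of this repo); lines that are not valid JSON are kept raw, since the template's own example format uses single quotes:

```python
import json
import re


def extract_tool_calls(reply: str) -> list:
    """Pull the <tool_calls>...</tool_calls> block described by the chat template
    and parse each non-empty line, falling back to the raw text when a line is
    not valid JSON."""
    match = re.search(r"<tool_calls>(.*?)</tool_calls>", reply, re.DOTALL)
    if not match:
        return []
    calls = []
    for line in match.group(1).splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            calls.append(json.loads(line))
        except json.JSONDecodeError:
            calls.append({"raw": line})
    return calls


reply = '<tool_calls>\n{"name": "get_weather", "arguments": {"city": "Paris"}}\n</tool_calls>'
print(extract_tool_calls(reply))  # [{'name': 'get_weather', 'arguments': {'city': 'Paris'}}]
```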
vocab.json CHANGED
File without changes