Abhaykoul committed (verified) · Commit 2285286 · 1 Parent(s): a0b93ce

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,28 @@
+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}
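The map above assigns ids to the ChatML-style control tokens (<|im_start|>, <|im_end|>) and the reasoning/tool markers (<think>, <tool_call>, <tool_response>) used by the chat template below. A minimal sketch of checking that a loaded tokenizer resolves them to the same ids; the local path is a placeholder for wherever this upload is cloned, not a published repo id:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./HelpingAI-upload")  # placeholder path

# These ids should match added_tokens.json if the upload is self-consistent.
for t in ["<|im_start|>", "<|im_end|>", "<think>", "</think>", "<tool_call>"]:
    print(t, tok.convert_tokens_to_ids(t))
# Expected per the file above: 151644, 151645, 151667, 151668, 151657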
chat_template.jinja ADDED
@@ -0,0 +1,104 @@
+{%- set model_identity = "You are HelpingAI 3.1, the most emotionally intelligent and human-like AI model created by HelpingAI. Knowledge cutoff: 2024-01\nCurrent date: " + strftime_now("%Y-%m-%d") + "\n" %}
+{%- if tools %}
+{{- '<|im_start|>system\n' }}
+{{- model_identity }}
+{%- if messages[0].role == 'system' %}
+{{- messages[0].content + '\n\n' }}
+{%- endif %}
+{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+{%- for tool in tools %}
+{{- "\n" }}
+{{- tool | tojson }}
+{%- endfor %}
+{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+{{- '<|im_start|>system\n' + model_identity }}
+{%- if messages[0].role == 'system' %}
+{{- messages[0].content + '<|im_end|>\n' }}
+{%- else %}
+{{- '<|im_end|>\n' }}
+{%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- set last_tool_call = namespace(name=none) %}
+{%- set ns_assistant = namespace(open=false) %}
+{%- for forward_message in messages %}
+{%- set index = (messages|length - 1) - loop.index0 %}
+{%- set message = messages[index] %}
+{%- set current_content = message.content if message.content is not none else '' %}
+{%- set tool_start = '<tool_response>' %}
+{%- set tool_start_length = tool_start|length %}
+{%- set start_of_message = current_content[:tool_start_length] %}
+{%- set tool_end = '</tool_response>' %}
+{%- set tool_end_length = tool_end|length %}
+{%- set start_pos = (current_content|length) - tool_end_length %}
+{%- if start_pos < 0 %}
+{%- set start_pos = 0 %}
+{%- endif %}
+{%- set end_of_message = current_content[start_pos:] %}
+{%- if ns.multi_step_tool and message.role == "user" and not(start_of_message == tool_start and end_of_message == tool_end) %}
+{%- set ns.multi_step_tool = false %}
+{%- set ns.last_query_index = index %}
+{%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+{%- if ns_assistant.open %}
+{{- '<|im_end|>\n' }}
+{%- set ns_assistant.open = false %}
+{%- endif %}
+{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+{%- elif message.role == "assistant" %}
+{%- if not ns_assistant.open %}
+{{- '<|im_start|>assistant\n' }}
+{%- set ns_assistant.open = true %}
+{%- endif %}
+{%- if message.content %}
+{{- message.content }}
+{%- endif %}
+{%- if message.tool_calls %}
+{%- for tool_call in message.tool_calls %}
+{%- if (loop.first and content) or (not loop.first) %}
+{{- '\n' }}
+{%- endif %}
+{%- if tool_call.function %}
+{%- set tool_call = tool_call.function %}
+{%- endif %}
+{{- '<tool_call>\n{"name": "' }}
+{{- tool_call.name }}
+{{- '", "arguments": ' }}
+{%- if tool_call.arguments is string %}
+{{- tool_call.arguments }}
+{%- else %}
+{{- tool_call.arguments | tojson }}
+{%- endif %}
+{{- '}\n</tool_call>' }}
+{%- set last_tool_call.name = tool_call.name %}
+{%- endfor %}
+{%- else %}
+{%- set last_tool_call.name = none %}
+{%- endif %}
+{%- if loop.last or (messages[loop.index0 + 1].role not in ["assistant", "tool"]) %}
+{{- '<|im_end|>\n' }}
+{%- set ns_assistant.open = false %}
+{%- endif %}
+{%- elif message.role == "tool" %}
+{%- if last_tool_call.name is none %}
+{{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
+{%- endif %}
+{%- if not ns_assistant.open %}
+{{- '<|im_start|>assistant\n' }}
+{%- set ns_assistant.open = true %}
+{%- endif %}
+{{- '\n<tool_response>\n' }}
+{{- message.content }}
+{{- '\n</tool_response>' }}
+{%- if loop.last or (messages[loop.index0 + 1].role not in ["assistant", "tool"]) %}
+{{- '<|im_end|>\n' }}
+{%- set ns_assistant.open = false %}
+{%- endif %}
+{%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+{{- '<|im_start|>assistant\n' }}
+{%- endif %}
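The template renders ChatML turns, injects the HelpingAI 3.1 identity into the system turn, wraps tool definitions in a <tools> block, and serializes tool invocations as <tool_call> JSON. A rough sketch of exercising it through tokenizer.apply_chat_template; the path is a placeholder and the tool schema is an invented example:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./HelpingAI-upload")  # placeholder path

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # invented example tool
        "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
    },
}]
messages = [{"role": "user", "content": "What's the weather in Pune?"}]

# Renders the system turn (model identity + <tools> block), the user turn,
# and a trailing '<|im_start|>assistant\n' generation prompt.
prompt = tok.apply_chat_template(messages, tools=tools, tokenize=False, add_generation_prompt=True)
print(prompt)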
config.json ADDED
@@ -0,0 +1,94 @@
+{
+  "architectures": [
+    "HelpingAIForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_helpingai.HelpingAIConfig",
+    "AutoModelForCausalLM": "modeling_helpingai.HelpingAIForCausalLM"
+  },
+  "bos_token_id": 151643,
+  "emotion_hidden_size": 512,
+  "empathy_scaling_factor": 1.2,
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 17408,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 40,
+  "model_type": "helpingai",
+  "num_attention_heads": 40,
+  "num_emotion_heads": 4,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "num_thinking_stages": 3,
+  "perspective_threads": 4,
+  "reasoning_temperature": 0.8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "speech_head_hidden_dim": null,
+  "speech_loss_type": "l1",
+  "speech_num_mels": 80,
+  "speech_upsample_factor": 1,
+  "structured_head_activation": "gelu",
+  "structured_head_hidden_dim": 9578,
+  "structured_head_type": "mlp_v1",
+  "structured_output_vocab_size": 100,
+  "thinking_depth": 2,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.55.2",
+  "use_cache": true,
+  "use_emotional_reasoning": false,
+  "use_perspective_threading": true,
+  "use_sliding_window": false,
+  "use_speech_output": false,
+  "vocab_size": 151669
+}
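Because "architectures" points at HelpingAIForCausalLM and "auto_map" routes AutoConfig/AutoModelForCausalLM to the bundled configuration_helpingai.py and modeling_helpingai.py, loading this checkpoint goes through the custom-code path and therefore needs trust_remote_code=True. A minimal sketch (placeholder path):

from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("./HelpingAI-upload", trust_remote_code=True)  # placeholder path
print(cfg.model_type)            # "helpingai"
print(cfg.num_hidden_layers,     # 40
      cfg.hidden_size,           # 5120
      cfg.num_key_value_heads)   # 8 (grouped-query attention)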
configuration_helpingai.py ADDED
@@ -0,0 +1,366 @@
+from transformers.configuration_utils import PretrainedConfig
+
+
+class HelpingAIConfig(PretrainedConfig):
+    model_type = "helpingai"
+
+    def __init__(
+        self,
+        vocab_size=50257,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        max_position_embeddings=2048,
+        layer_norm_epsilon=1e-5,
+        hidden_act="gelu",
+        dropout=0.0,
+        attention_dropout=0.0,
+        tie_word_embeddings=True,
+        # Structured output head
+        use_structured_output=True,
+        structured_output_vocab_size=2,
+        # Speech head
+        use_speech_output=False,
+        speech_num_mels=80,
+        speech_head_hidden_dim=1024,
+        speech_upsample_factor=1,
+        speech_loss_type="l1",
+        # Misc
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.max_position_embeddings = max_position_embeddings
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.hidden_act = hidden_act
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
+
+        # Structured
+        self.use_structured_output = use_structured_output
+        self.structured_output_vocab_size = structured_output_vocab_size
+
+        # Speech
+        self.use_speech_output = use_speech_output
+        self.speech_num_mels = speech_num_mels
+        self.speech_head_hidden_dim = speech_head_hidden_dim
+        self.speech_upsample_factor = speech_upsample_factor
+        self.speech_loss_type = speech_loss_type
+
+"""HelpingAI model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class HelpingAIConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`HelpingAIModel`]. It is used to instantiate a
+    HelpingAI model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    HelpingAI-8B [HelpingAI/HelpingAI-8B](https://huggingface.co/HelpingAI/HelpingAI-8B).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the HelpingAI model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`HelpingAIModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 22016):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
+        head_dim (`int`, *optional*, defaults to 128):
+            The attention head dimension.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+            `rope_type` (`str`):
+                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                'llama3'], with 'default' being the original RoPE implementation.
+            `factor` (`float`, *optional*):
+                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                original maximum pre-trained length.
+            `original_max_position_embeddings` (`int`, *optional*):
+                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                pretraining.
+            `attention_factor` (`float`, *optional*):
+                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                computation. If unspecified, it defaults to value recommended by the implementation, using the
+                `factor` field to infer the suggested value.
+            `beta_fast` (`float`, *optional*):
+                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                ramp function. If unspecified, it defaults to 32.
+            `beta_slow` (`float`, *optional*):
+                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                ramp function. If unspecified, it defaults to 1.
+            `short_factor` (`list[float]`, *optional*):
+                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                size divided by the number of attention heads divided by 2
+            `long_factor` (`list[float]`, *optional*):
+                Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                size divided by the number of attention heads divided by 2
+            `low_freq_factor` (`float`, *optional*):
+                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+            `high_freq_factor` (`float`, *optional*):
+                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        use_sliding_window (`bool`, *optional*, defaults to `False`):
+            Whether to use sliding window attention.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
+        max_window_layers (`int`, *optional*, defaults to 28):
+            The number of layers using full attention. The first `max_window_layers` layers will use full attention, while any
+            additional layer afterwards will use SWA (Sliding Window Attention).
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        use_emotional_reasoning (`bool`, *optional*, defaults to `True`):
+            Whether to enable Semantic Emotion Reasoning (SER) capabilities for emotional understanding and processing.
+        use_perspective_threading (`bool`, *optional*, defaults to `True`):
+            Whether to enable Perspective Emotion Threading (PET) for multi-threaded emotional reasoning.
+        num_emotion_heads (`int`, *optional*, defaults to 4):
+            Number of specialized attention heads dedicated to emotional processing and reasoning.
+        num_thinking_stages (`int`, *optional*, defaults to 3):
+            Number of thinking stages for multi-stage reasoning and reflection processing.
+        emotion_hidden_size (`int`, *optional*, defaults to 512):
+            Hidden size for the emotional reasoning layers and SER processing modules.
+        perspective_threads (`int`, *optional*, defaults to 4):
+            Number of parallel perspective threads for PET processing (relatable, supportive, motivational, analytical).
+        thinking_depth (`int`, *optional*, defaults to 2):
+            Depth of thinking layers for internal reasoning and reflection processes.
+        structured_output_vocab_size (`int`, *optional*, defaults to 100):
+            Additional vocabulary size for structured output tokens like <think>, <ser>, <pet>, etc.
+        empathy_scaling_factor (`float`, *optional*, defaults to 1.2):
+            Scaling factor for empathy-related attention weights and emotional processing.
+        reasoning_temperature (`float`, *optional*, defaults to 0.8):
+            Temperature parameter for reasoning and thinking processes to balance creativity and coherence.
+        use_speech_output (`bool`, *optional*, defaults to `False`):
+            Whether to enable an additional text-to-speech head that predicts mel-spectrogram frames from hidden states.
+        speech_num_mels (`int`, *optional*, defaults to `80`):
+            Number of mel bins to predict for the speech head.
+        speech_upsample_factor (`int`, *optional*, defaults to `1`):
+            Temporal upsampling factor to expand token-level hidden states to frame-level resolution by simple repetition.
+        speech_loss_type (`str`, *optional*, defaults to `"l1"`):
+            Loss for speech supervision. One of {"l1", "mse"}.
+        speech_head_hidden_dim (`int`, *optional*, defaults to `None`):
+            Hidden dimension for the speech head MLP (hidden_size -> speech_head_hidden_dim -> num_mels).
+            If None, defaults to hidden_size // 2. Increase to scale speech head params (e.g., ~9.6k for ~50M).
+
+    ```python
+    >>> from transformers import HelpingAIModel, HelpingAIConfig
+
+    >>> # Initializing a HelpingAI style configuration with advanced reasoning
+    >>> configuration = HelpingAIConfig(
+    ...     use_emotional_reasoning=True,
+    ...     use_perspective_threading=True,
+    ...     num_emotion_heads=4,
+    ...     num_thinking_stages=3
+    ... )
+
+    >>> # Initializing a model from the HelpingAI-8B style configuration
+    >>> model = HelpingAIModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "helpingai"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    # Default tensor parallel plan for base model `HelpingAI`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=4096,
+        intermediate_size=22016,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,  # Match num_attention_heads for compatibility
+        head_dim=128,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=28,
+        layer_types=None,
+        attention_dropout=0.0,
+        # Advanced reasoning parameters
+        use_emotional_reasoning=False,  # Disable by default for now
+        use_perspective_threading=True,
+        num_emotion_heads=4,
+        num_thinking_stages=3,
+        emotion_hidden_size=512,
+        perspective_threads=4,
+        thinking_depth=2,
+        structured_output_vocab_size=100,
+        empathy_scaling_factor=1.2,
+        reasoning_temperature=0.8,
+        # Structured head architecture (new)
+        structured_head_type: str = "linear",  # one of: linear, mlp_v1
+        structured_head_hidden_dim: int | None = None,
+        structured_head_activation: str = "gelu",  # gelu or relu
+        # Speech output head options
+        use_speech_output=False,
+        speech_num_mels=80,
+        speech_upsample_factor=1,
+        speech_loss_type="l1",
+        speech_head_hidden_dim=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window if self.use_sliding_window else None
+        self.max_window_layers = max_window_layers
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        # Advanced reasoning capabilities
+        self.use_emotional_reasoning = use_emotional_reasoning
+        self.use_perspective_threading = use_perspective_threading
+        self.num_emotion_heads = num_emotion_heads
+        self.num_thinking_stages = num_thinking_stages
+        self.emotion_hidden_size = emotion_hidden_size
+        self.perspective_threads = perspective_threads
+        self.thinking_depth = thinking_depth
+        self.structured_output_vocab_size = structured_output_vocab_size
+        self.empathy_scaling_factor = empathy_scaling_factor
+        self.reasoning_temperature = reasoning_temperature
+        # Structured head architecture spec
+        self.structured_head_type = structured_head_type
+        self.structured_head_hidden_dim = structured_head_hidden_dim
+        self.structured_head_activation = structured_head_activation
+        # Speech head config
+        self.use_speech_output = use_speech_output
+        self.speech_num_mels = speech_num_mels
+        self.speech_upsample_factor = speech_upsample_factor
+        self.speech_loss_type = speech_loss_type
+        self.speech_head_hidden_dim = speech_head_hidden_dim
+
+        # Validate emotional reasoning parameters
+        if self.use_emotional_reasoning and self.num_emotion_heads > self.num_attention_heads:
+            raise ValueError(f"num_emotion_heads ({self.num_emotion_heads}) cannot exceed num_attention_heads ({self.num_attention_heads})")
+
+        if self.use_perspective_threading and self.perspective_threads < 2:
+            raise ValueError(f"perspective_threads ({self.perspective_threads}) must be at least 2 for meaningful threading")
+        if self.use_speech_output:
+            if not isinstance(self.speech_num_mels, int) or self.speech_num_mels <= 0:
+                raise ValueError("speech_num_mels must be a positive integer")
+            if not isinstance(self.speech_upsample_factor, int) or self.speech_upsample_factor <= 0:
+                raise ValueError("speech_upsample_factor must be a positive integer")
+            if self.speech_loss_type not in {"l1", "mse"}:
+                raise ValueError("speech_loss_type must be one of {'l1','mse'}")
+            if self.speech_head_hidden_dim is not None:
+                if not isinstance(self.speech_head_hidden_dim, int) or self.speech_head_hidden_dim <= 0:
+                    raise ValueError("speech_head_hidden_dim must be a positive integer when provided")
+
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention"
+                if self.sliding_window is not None and i >= self.max_window_layers
+                else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["HelpingAIConfig"]
+
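Note that the module defines HelpingAIConfig twice; the second, Qwen3-style definition shadows the first and is the one exported via __all__ and resolved through auto_map. A small sketch of instantiating it directly with the values shipped in config.json (parameter names taken from the code above; it assumes the file is importable from the repo directory):

from configuration_helpingai import HelpingAIConfig  # assumes cwd is the repo checkout

cfg = HelpingAIConfig(
    vocab_size=151669,
    hidden_size=5120,
    intermediate_size=17408,
    num_hidden_layers=40,
    num_attention_heads=40,
    num_key_value_heads=8,
    max_position_embeddings=40960,
    rope_theta=1000000,
    use_emotional_reasoning=False,
    structured_head_type="mlp_v1",
    structured_head_hidden_dim=9578,
)
# With use_sliding_window left False, every layer defaults to "full_attention".
print(cfg.layer_types[:2])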
generation_config.json ADDED
@@ -0,0 +1,13 @@
+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "4.55.2"
+}
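These defaults (sampling enabled, temperature 0.6, top_p 0.95, top_k 20, with the ChatML <|im_end|> id among the EOS ids) are picked up automatically by generate(). A rough end-to-end sketch, assuming the custom class supports the standard generate() API; the path is a placeholder:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "./HelpingAI-upload"  # placeholder path
tok = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(
    path, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
)

messages = [{"role": "user", "content": "Introduce yourself in one sentence."}]
inputs = tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)

# Sampling settings come from generation_config.json unless overridden here.
out = model.generate(inputs, max_new_tokens=64)
print(tok.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True))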
label_map.json ADDED
@@ -0,0 +1,12 @@
+{
+  "id2label": [
+    "HARMFUL_SEXUAL",
+    "HARMFUL_HATE",
+    "HARMFUL_VIOLENCE",
+    "HARMFUL_HARASSMENT",
+    "HARMFUL_LANGUAGE",
+    "HARMFUL_MISINFORMATION",
+    "SAFE"
+  ],
+  "pooling": "last"
+}
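The id2label list orders seven safety classes, and "pooling": "last" indicates the classification readout is taken at the final token position (presumably from the structured head configured in config.json). A small hedged sketch of turning a vector of class scores into a label with this file; how the scores are produced is model-specific and not shown here:

import json
import torch

with open("label_map.json") as f:
    label_map = json.load(f)
id2label = label_map["id2label"]       # 7 classes, index == class id

scores = torch.randn(len(id2label))    # stand-in for the head's logits at the last token
print(id2label[int(scores.argmax())])  # e.g. "SAFE"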
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd0e66019b5ea698d577625397f8bd4e0e90b054b2f0d84c3d9af018d5cd34e4
+size 4887893216
model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:932087a0b7283cc26967d9821b932a85a4f3b721af8d729b4c7c7c112411134d
+size 4991798206
model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d97f54b342f1bf9c4cd5c8facef8f6f7db5608404d92bf28f2ccc4cca24026d3
+size 4991798414
model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d0f940b8973cb5a2f9e6a3db0fde2d251245a252321d7f7f65e263ba8742822
+size 4991798414
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2bec76fc8084bf6d11e2d425bddfe6850076892b7cdb029ecc0b08a06428200c
+size 4991798414
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d8fd601d8bc84783620827080d0d2ca6f690032923a3415b13b88ee339a090e
+size 4991798414
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb029f8eed8b02d3b788694ed523c2131cb9fbce48bdc3f79e0e3426529797bf
+size 1911147342
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
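The seven shards above are stored as Git LFS pointers (the oid sha256 lines), roughly 31.8 GB in total, and model.safetensors.index.json maps every tensor name to the shard that holds it so that from_pretrained can resolve the sharded checkpoint transparently. A small sketch of inspecting the index directly, assuming the standard sharded-checkpoint layout:

import json

with open("model.safetensors.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])            # total bytes across the 7 shards
print(sorted(set(index["weight_map"].values())))  # the shard filenames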
 
modeling_helpingai.py ADDED
@@ -0,0 +1,1249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Optional, Tuple, List
3
+
4
+ import torch
5
+ from torch import nn
6
+ from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
7
+ from transformers.modeling_utils import PreTrainedModel
8
+ from .configuration_helpingai import HelpingAIConfig
9
+
10
+
11
+ class HelpingAIAttention(nn.Module):
12
+ def __init__(self, config: HelpingAIConfig):
13
+ super().__init__()
14
+ self.num_heads = config.num_attention_heads
15
+ self.head_dim = config.hidden_size // config.num_attention_heads
16
+ assert self.head_dim * self.num_heads == config.hidden_size
17
+ self.scale = self.head_dim ** -0.5
18
+ self.qkv = nn.Linear(config.hidden_size, 3 * config.hidden_size)
19
+ self.out = nn.Linear(config.hidden_size, config.hidden_size)
20
+ self.attn_dropout = nn.Dropout(config.attention_dropout)
21
+ self.resid_dropout = nn.Dropout(config.dropout)
22
+
23
+ def forward(self, x, attn_mask: Optional[torch.Tensor]=None):
24
+ B, T, C = x.shape
25
+ qkv = self.qkv(x).view(B, T, 3, self.num_heads, self.head_dim).permute(2,0,3,1,4)
26
+ q, k, v = qkv[0], qkv[1], qkv[2] # [B, H, T, D]
27
+ attn_scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale # [B,H,T,T]
28
+ causal = torch.ones(T, T, device=x.device, dtype=torch.bool).triu(1)
29
+ attn_scores = attn_scores.masked_fill(causal, float('-inf'))
30
+ if attn_mask is not None:
31
+ # attn_mask: [B,T]; convert to [B,1,1,T]
32
+ mask = (attn_mask == 0).unsqueeze(1).unsqueeze(2)
33
+ attn_scores = attn_scores.masked_fill(mask, float('-inf'))
34
+ attn = torch.softmax(attn_scores, dim=-1)
35
+ attn = self.attn_dropout(attn)
36
+ y = torch.matmul(attn, v) # [B,H,T,D]
37
+ y = y.transpose(1,2).contiguous().view(B, T, C)
38
+ y = self.resid_dropout(self.out(y))
39
+ return y
40
+
41
+
42
+ class HelpingAIMLP(nn.Module):
43
+ def __init__(self, config: HelpingAIConfig):
44
+ super().__init__()
45
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
46
+ self.act = nn.GELU() if config.hidden_act == 'gelu' else nn.ReLU()
47
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
48
+ self.dropout = nn.Dropout(config.dropout)
49
+
50
+ def forward(self, x):
51
+ return self.dropout(self.fc2(self.act(self.fc1(x))))
52
+
53
+
54
+ class HelpingAIBlock(nn.Module):
55
+ def __init__(self, config: HelpingAIConfig):
56
+ super().__init__()
57
+ self.ln1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
58
+ self.attn = HelpingAIAttention(config)
59
+ self.ln2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
60
+ self.mlp = HelpingAIMLP(config)
61
+
62
+ def forward(self, x, attn_mask=None):
63
+ x = x + self.attn(self.ln1(x), attn_mask)
64
+ x = x + self.mlp(self.ln2(x))
65
+ return x
66
+
67
+
68
+ class HelpingAIForCausalLM(PreTrainedModel):
69
+ config_class = HelpingAIConfig
70
+ supports_gradient_checkpointing = False
71
+
72
+ def __init__(self, config: HelpingAIConfig):
73
+ super().__init__(config)
74
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
75
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
76
+ self.drop = nn.Dropout(config.dropout)
77
+ self.blocks = nn.ModuleList([HelpingAIBlock(config) for _ in range(config.num_hidden_layers)])
78
+ self.ln_f = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
79
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
80
+
81
+ # Structured output head
82
+ if config.use_structured_output:
83
+ self.structured_lm_head = nn.Linear(config.hidden_size, config.structured_output_vocab_size)
84
+ else:
85
+ self.structured_lm_head = nn.Linear(config.hidden_size, 1)
86
+
87
+ # Speech projector (simple 2-layer MLP hidden->H->mels)
88
+ if config.use_speech_output:
89
+ H = config.speech_head_hidden_dim
90
+ self.speech_proj = nn.Sequential(
91
+ nn.Linear(config.hidden_size, H),
92
+ nn.GELU(),
93
+ nn.Linear(H, config.speech_num_mels),
94
+ )
95
+ else:
96
+ self.speech_proj = nn.Sequential(
97
+ nn.Linear(config.hidden_size, config.speech_head_hidden_dim),
98
+ nn.GELU(),
99
+ nn.Linear(config.speech_head_hidden_dim, config.speech_num_mels),
100
+ )
101
+
102
+ self._init_weights()
103
+
104
+ def _init_weights(self):
105
+ for n, p in self.named_parameters():
106
+ if p.dim() > 1:
107
+ nn.init.normal_(p, mean=0.0, std=self.config.initializer_range)
108
+ else:
109
+ nn.init.zeros_(p)
110
+ if hasattr(self.lm_head, 'weight') and hasattr(self.embed_tokens, 'weight') and self.config.tie_word_embeddings:
111
+ self.lm_head.weight = self.embed_tokens.weight
112
+
113
+ def forward(
114
+ self,
115
+ input_ids: torch.LongTensor,
116
+ attention_mask: Optional[torch.Tensor] = None,
117
+ labels: Optional[torch.LongTensor] = None,
118
+ use_cache: bool = False,
119
+ output_hidden_states: bool = False,
120
+ return_dict: bool = True,
121
+ **kwargs,
122
+ ) -> CausalLMOutputWithCrossAttentions:
123
+ B, T = input_ids.shape
124
+ device = input_ids.device
125
+ if attention_mask is None:
126
+ attention_mask = torch.ones_like(input_ids)
127
+ pos = torch.arange(0, T, device=device).unsqueeze(0)
128
+ x = self.embed_tokens(input_ids) + self.position_embeddings(pos)
129
+ x = self.drop(x)
130
+ hidden_states: List[torch.Tensor] = []
131
+ for block in self.blocks:
132
+ x = block(x, attention_mask)
133
+ if output_hidden_states:
134
+ hidden_states.append(x)
135
+ x = self.ln_f(x)
136
+ if output_hidden_states:
137
+ hidden_states.append(x)
138
+ logits = self.lm_head(x)
139
+ loss = None
140
+ if labels is not None:
141
+ shift_logits = logits[:, :-1].contiguous()
142
+ shift_labels = labels[:, 1:].contiguous()
143
+ loss_fct = nn.CrossEntropyLoss()
144
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
145
+ if not return_dict:
146
+ return (loss, logits, hidden_states)
147
+ return CausalLMOutputWithCrossAttentions(
148
+ loss=loss,
149
+ logits=logits,
150
+ hidden_states=tuple(hidden_states) if output_hidden_states else None,
151
+ past_key_values=None,
152
+ attentions=None,
153
+ cross_attentions=None,
154
+ )
155
+
156
+ # Convenience for generation API expectations
157
+ def prepare_inputs_for_generation(self, input_ids, **kwargs):
158
+ return {"input_ids": input_ids, **kwargs}
159
+
160
+ from typing import Callable, Optional, Union
161
+
162
+ import torch
163
+ from torch import nn
164
+
165
+ from transformers.activations import ACT2FN
166
+ from transformers.cache_utils import Cache, DynamicCache
167
+ from transformers.generation import GenerationMixin
168
+ from transformers.integrations import use_kernel_forward_from_hub
169
+ from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
170
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
171
+ from transformers.modeling_layers import (
172
+ GenericForQuestionAnswering,
173
+ GenericForSequenceClassification,
174
+ GenericForTokenClassification,
175
+ GradientCheckpointingLayer,
176
+ )
177
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
178
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
179
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
180
+ from transformers.processing_utils import Unpack
181
+ from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple
182
+ from transformers.utils.deprecation import deprecate_kwarg
183
+ from transformers.utils.generic import check_model_inputs
184
+ from .configuration_helpingai import HelpingAIConfig
185
+
186
+
187
+ @use_kernel_forward_from_hub("RMSNorm")
188
+ class HelpingAIRMSNorm(nn.Module):
189
+ def __init__(self, hidden_size, eps=1e-6):
190
+ """
191
+ HelpingAIRMSNorm is equivalent to T5LayerNorm
192
+ """
193
+ super().__init__()
194
+ self.weight = nn.Parameter(torch.ones(hidden_size))
195
+ self.variance_epsilon = eps
196
+
197
+ def forward(self, hidden_states):
198
+ input_dtype = hidden_states.dtype
199
+ hidden_states = hidden_states.to(torch.float32)
200
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
201
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
202
+ return self.weight * hidden_states.to(input_dtype)
203
+
204
+ def extra_repr(self):
205
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
206
+
207
+
208
+ class HelpingAISemanticEmotionReasoning(nn.Module):
209
+ """
210
+ Structured Emotional Reasoning (SER) layer for emotional understanding and processing.
211
+ Maps emotions to semantic representations and provides contextual emotion analysis.
212
+ """
213
+ def __init__(self, config: HelpingAIConfig):
214
+ super().__init__()
215
+ self.config = config
216
+ self.emotion_hidden_size = config.emotion_hidden_size
217
+ self.hidden_size = config.hidden_size
218
+
219
+ # Emotion detection and mapping
220
+ self.emotion_detector = nn.Linear(self.hidden_size, self.emotion_hidden_size)
221
+ self.emotion_mapper = nn.Linear(self.emotion_hidden_size, self.emotion_hidden_size)
222
+
223
+ # Contextual emotion analysis
224
+ self.emotion_context = nn.MultiheadAttention(
225
+ embed_dim=self.emotion_hidden_size,
226
+ num_heads=min(8, self.emotion_hidden_size // 64),
227
+ batch_first=True
228
+ )
229
+
230
+ # Emotion classification heads
231
+ self.primary_emotion = nn.Linear(self.emotion_hidden_size, 32) # Primary emotions
232
+ self.emotion_intensity = nn.Linear(self.emotion_hidden_size, 1) # Intensity score
233
+ self.emotion_valence = nn.Linear(self.emotion_hidden_size, 1) # Positive/negative
234
+
235
+ # Output projection
236
+ self.emotion_output = nn.Linear(self.emotion_hidden_size, self.hidden_size)
237
+ self.emotion_norm = HelpingAIRMSNorm(self.emotion_hidden_size, eps=config.rms_norm_eps)
238
+
239
+ # Activation
240
+ self.act_fn = ACT2FN[config.hidden_act]
241
+
242
+ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, dict]:
243
+ # Detect emotional content
244
+ emotion_features = self.act_fn(self.emotion_detector(hidden_states))
245
+ emotion_mapped = self.emotion_mapper(emotion_features)
246
+ emotion_mapped = self.emotion_norm(emotion_mapped)
247
+
248
+ # Contextual emotion analysis
249
+ emotion_context, attention_weights = self.emotion_context(
250
+ emotion_mapped, emotion_mapped, emotion_mapped
251
+ )
252
+
253
+ # Emotion analysis outputs
254
+ primary_emotions = self.primary_emotion(emotion_context)
255
+ emotion_intensity = torch.sigmoid(self.emotion_intensity(emotion_context))
256
+ emotion_valence = torch.tanh(self.emotion_valence(emotion_context))
257
+
258
+ # Project back to hidden size
259
+ emotion_output = self.emotion_output(emotion_context)
260
+
261
+ # Emotion metadata
262
+ emotion_metadata = {
263
+ "primary_emotions": primary_emotions,
264
+ "intensity": emotion_intensity,
265
+ "valence": emotion_valence,
266
+ "attention_weights": attention_weights
267
+ }
268
+
269
+ return emotion_output, emotion_metadata
270
+
271
+
272
+ class HelpingAIPerspectiveEmotionThreading(nn.Module):
273
+ """
274
+ Parallel Empathic Threads (PET) layer for multi-threaded emotional reasoning.
275
+ Processes multiple perspective threads: relatable, supportive, motivational, analytical.
276
+ """
277
+ def __init__(self, config: HelpingAIConfig):
278
+ super().__init__()
279
+ self.config = config
280
+ self.hidden_size = config.hidden_size
281
+ self.perspective_threads = config.perspective_threads
282
+ self.thread_hidden_size = config.emotion_hidden_size
283
+
284
+ # Thread-specific processors
285
+ self.thread_projections = nn.ModuleList([
286
+ nn.Linear(self.hidden_size, self.thread_hidden_size)
287
+ for _ in range(self.perspective_threads)
288
+ ])
289
+
290
+ # Thread names for interpretability
291
+ self.thread_names = ["relatable", "supportive", "motivational", "analytical"][:self.perspective_threads]
292
+
293
+ # Cross-thread attention for perspective integration
294
+ self.cross_thread_attention = nn.MultiheadAttention(
295
+ embed_dim=self.thread_hidden_size,
296
+ num_heads=min(4, self.thread_hidden_size // 64),
297
+ batch_first=True
298
+ )
299
+
300
+ # Thread-specific processing layers
301
+ self.thread_processors = nn.ModuleList([
302
+ nn.Sequential(
303
+ nn.Linear(self.thread_hidden_size, self.thread_hidden_size * 2),
304
+ nn.GELU(),
305
+ nn.Linear(self.thread_hidden_size * 2, self.thread_hidden_size),
306
+ HelpingAIRMSNorm(self.thread_hidden_size, eps=config.rms_norm_eps)
307
+ )
308
+ for _ in range(self.perspective_threads)
309
+ ])
310
+
311
+ # Output integration
312
+ self.thread_combiner = nn.Linear(
313
+ self.thread_hidden_size * self.perspective_threads,
314
+ self.hidden_size
315
+ )
316
+
317
+ # Thread importance weighting
318
+ self.thread_weights = nn.Parameter(torch.ones(self.perspective_threads))
319
+
320
+ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, dict]:
321
+ batch_size, seq_len, _ = hidden_states.shape
322
+
323
+ # Process each perspective thread
324
+ thread_outputs = []
325
+ thread_metadata = {}
326
+
327
+ for i, (projection, processor, thread_name) in enumerate(
328
+ zip(self.thread_projections, self.thread_processors, self.thread_names)
329
+ ):
330
+ # Project to thread space
331
+ thread_input = projection(hidden_states)
332
+
333
+ # Process thread-specific perspective
334
+ thread_output = processor(thread_input)
335
+ thread_outputs.append(thread_output)
336
+
337
+ # Store thread metadata
338
+ thread_metadata[f"{thread_name}_activation"] = torch.mean(torch.abs(thread_output))
339
+
340
+ # Stack threads for cross-thread attention
341
+ stacked_threads = torch.stack(thread_outputs, dim=2) # [batch, seq_len, num_threads, hidden]
342
+ stacked_threads = stacked_threads.reshape(batch_size * seq_len, self.perspective_threads, self.thread_hidden_size)
343
+
344
+ # Cross-thread attention for perspective integration
345
+ integrated_threads, cross_attention = self.cross_thread_attention(
346
+ stacked_threads, stacked_threads, stacked_threads
347
+ )
348
+
349
+ # Apply thread importance weighting
350
+ thread_weights_normalized = torch.softmax(self.thread_weights, dim=0)
351
+ weighted_threads = integrated_threads * thread_weights_normalized.unsqueeze(0).unsqueeze(-1)
352
+
353
+ # Combine threads - use reshape instead of view for memory layout compatibility
354
+ combined_threads = weighted_threads.reshape(batch_size, seq_len, -1)
355
+ final_output = self.thread_combiner(combined_threads)
356
+
357
+ # Thread metadata
358
+ thread_metadata.update({
359
+ "thread_weights": thread_weights_normalized,
360
+ "cross_attention": cross_attention,
361
+ "thread_activations": {
362
+ name: torch.mean(output) for name, output in zip(self.thread_names, thread_outputs)
363
+ }
364
+ })
365
+
366
+ return final_output, thread_metadata
367
+
368
+
369
+ class HelpingAIMultiStageThinking(nn.Module):
370
+ """
371
+ Multi-stage thinking module for internal reasoning and reflection processes.
372
+ Implements cascaded thinking stages with simplified feedback loops.
373
+ """
374
+ def __init__(self, config: HelpingAIConfig):
375
+ super().__init__()
376
+ self.config = config
377
+ self.hidden_size = config.hidden_size
378
+ self.thinking_stages = config.num_thinking_stages
379
+ self.thinking_depth = config.thinking_depth
380
+
381
+ # Thinking stage processors
382
+ self.thinking_layers = nn.ModuleList([
383
+ nn.Sequential(
384
+ nn.Linear(self.hidden_size, self.hidden_size),
385
+ nn.GELU(),
386
+ nn.Linear(self.hidden_size, self.hidden_size),
387
+ HelpingAIRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
388
+ )
389
+ for _ in range(self.thinking_stages)
390
+ ])
391
+
392
+ # Simple reflection mechanism without complex attention
393
+ self.reflection_layers = nn.ModuleList([
394
+ nn.Linear(self.hidden_size, self.hidden_size)
395
+ for _ in range(self.thinking_stages - 1)
396
+ ])
397
+
398
+ # Stage transition gates
399
+ self.stage_gates = nn.ModuleList([
400
+ nn.Linear(self.hidden_size, 1) for _ in range(self.thinking_stages - 1)
401
+ ])
402
+
403
+ # Thinking combination weights
404
+ self.stage_combiner = nn.Linear(self.thinking_stages * self.hidden_size, self.hidden_size)
405
+
406
+ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, dict]:
407
+ batch_size, seq_len, _ = hidden_states.shape
408
+ thinking_outputs = []
409
+ thinking_metadata = {}
410
+
411
+ current_thought = hidden_states
412
+
413
+ # Multi-stage thinking process
414
+ for stage_idx, stage_processor in enumerate(self.thinking_layers):
415
+ # Process current thinking stage
416
+ current_thought = stage_processor(current_thought)
417
+
418
+ # Store stage output
419
+ thinking_outputs.append(current_thought)
420
+ thinking_metadata[f"stage_{stage_idx}_activation"] = torch.mean(torch.abs(current_thought)).item()
421
+
422
+ # Apply reflection if not the last stage
423
+ if stage_idx < self.thinking_stages - 1:
424
+ # Simple reflection mechanism
425
+ reflection = self.reflection_layers[stage_idx](current_thought)
426
+ current_thought = current_thought + 0.1 * reflection # Small reflection influence
427
+
428
+ # Stage transition gating
429
+ gate_weight = torch.sigmoid(self.stage_gates[stage_idx](current_thought))
430
+ current_thought = gate_weight * current_thought + (1 - gate_weight) * hidden_states
431
+
432
+ # Combine all thinking stages
433
+ all_thoughts = torch.cat(thinking_outputs, dim=-1) # Concatenate along hidden dimension
434
+ final_thought = self.stage_combiner(all_thoughts)
435
+
436
+ thinking_metadata["stage_contributions"] = [
437
+ torch.mean(torch.abs(output)).item() for output in thinking_outputs
438
+ ]
439
+
440
+ return final_thought, thinking_metadata
441
+
442
+
443
+ class HelpingAIMLP(nn.Module):
444
+ def __init__(self, config):
445
+ super().__init__()
446
+ self.config = config
447
+ self.hidden_size = config.hidden_size
448
+ self.intermediate_size = config.intermediate_size
449
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
450
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
451
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
452
+ self.act_fn = ACT2FN[config.hidden_act]
453
+
454
+ # Enhanced MLP with thinking modules
455
+ if hasattr(config, 'use_emotional_reasoning') and config.use_emotional_reasoning:
456
+ self.thinking_module = HelpingAIMultiStageThinking(config)
457
+ self.use_thinking = True
458
+ else:
459
+ self.use_thinking = False
460
+
461
+ # Reasoning temperature for controlled generation
462
+ self.reasoning_temperature = getattr(config, 'reasoning_temperature', 1.0)
463
+
464
+ def forward(self, x):
465
+ # Standard MLP forward pass
466
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
467
+
468
+ # Apply multi-stage thinking if enabled
469
+ if self.use_thinking:
470
+ thinking_output, thinking_metadata = self.thinking_module(down_proj)
471
+ # Apply reasoning temperature
472
+ down_proj = down_proj + (thinking_output * self.reasoning_temperature)
473
+
474
+ return down_proj
475
+
476
+
477
+ def rotate_half(x):
478
+ """Rotates half the hidden dims of the input."""
479
+ x1 = x[..., : x.shape[-1] // 2]
480
+ x2 = x[..., x.shape[-1] // 2 :]
481
+ return torch.cat((-x2, x1), dim=-1)
482
+
483
+
484
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
485
+ """Applies Rotary Position Embedding to the query and key tensors.
486
+
487
+ Args:
488
+ q (`torch.Tensor`): The query tensor.
489
+ k (`torch.Tensor`): The key tensor.
490
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
491
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
492
+ position_ids (`torch.Tensor`, *optional*):
493
+ Deprecated and unused.
494
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
495
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
496
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
497
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
498
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
499
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
500
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
501
+ Returns:
502
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
503
+ """
504
+ cos = cos.unsqueeze(unsqueeze_dim)
505
+ sin = sin.unsqueeze(unsqueeze_dim)
506
+ q_embed = (q * cos) + (rotate_half(q) * sin)
507
+ k_embed = (k * cos) + (rotate_half(k) * sin)
508
+ return q_embed, k_embed
509
+
510
+
511
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
512
+ """
513
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
514
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
515
+ """
516
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
517
+ if n_rep == 1:
518
+ return hidden_states
519
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
520
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
521
+
522
+
523
+ def eager_attention_forward(
524
+ module: nn.Module,
525
+ query: torch.Tensor,
526
+ key: torch.Tensor,
527
+ value: torch.Tensor,
528
+ attention_mask: Optional[torch.Tensor],
529
+ scaling: float,
530
+ dropout: float = 0.0,
531
+ **kwargs: Unpack[TransformersKwargs],
532
+ ):
533
+ key_states = repeat_kv(key, module.num_key_value_groups)
534
+ value_states = repeat_kv(value, module.num_key_value_groups)
535
+
536
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
537
+ if attention_mask is not None:
538
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
539
+ attn_weights = attn_weights + causal_mask
540
+
541
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
542
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
543
+ attn_output = torch.matmul(attn_weights, value_states)
544
+ attn_output = attn_output.transpose(1, 2).contiguous()
545
+
546
+ return attn_output, attn_weights
547
+
548
+
549
+ class HelpingAIAttention(nn.Module):
550
+ """Multi-headed attention with specialized emotional and empathetic reasoning capabilities"""
551
+
552
+ def __init__(self, config: HelpingAIConfig, layer_idx: int):
553
+ super().__init__()
554
+ self.config = config
555
+ self.layer_idx = layer_idx
556
+ self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
557
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
558
+ self.scaling = self.head_dim**-0.5
559
+ self.attention_dropout = config.attention_dropout
560
+ self.is_causal = True
561
+
562
+ self.q_proj = nn.Linear(
563
+ config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
564
+ )
565
+ self.k_proj = nn.Linear(
566
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
567
+ )
568
+ self.v_proj = nn.Linear(
569
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
570
+ )
571
+ self.o_proj = nn.Linear(
572
+ config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
573
+ )
574
+ self.q_norm = HelpingAIRMSNorm(self.head_dim, eps=config.rms_norm_eps)
575
+ self.k_norm = HelpingAIRMSNorm(self.head_dim, eps=config.rms_norm_eps)
576
+ self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None
577
+
578
+ # Enhanced emotional and empathetic attention
579
+ if hasattr(config, 'use_emotional_reasoning') and config.use_emotional_reasoning:
580
+ self.num_emotion_heads = getattr(config, 'num_emotion_heads', 4)
581
+ self.empathy_scaling_factor = getattr(config, 'empathy_scaling_factor', 1.2)
582
+
583
+ # Specialized emotion attention projections
584
+ self.emotion_q_proj = nn.Linear(config.hidden_size, self.num_emotion_heads * self.head_dim, bias=False)
585
+ self.emotion_k_proj = nn.Linear(config.hidden_size, self.num_emotion_heads * self.head_dim, bias=False)
586
+ self.emotion_v_proj = nn.Linear(config.hidden_size, self.num_emotion_heads * self.head_dim, bias=False)
587
+
588
+ # Empathy enhancement layer
589
+ self.empathy_enhancer = nn.Sequential(
590
+ nn.Linear(config.hidden_size, config.hidden_size // 2),
591
+ nn.GELU(),
592
+ nn.Linear(config.hidden_size // 2, config.num_attention_heads),
593
+ nn.Softmax(dim=-1)
594
+ )
595
+
596
+ self.use_emotional_attention = True
597
+ else:
598
+ self.use_emotional_attention = False
599
+
600
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
601
+ def forward(
602
+ self,
603
+ hidden_states: torch.Tensor,
604
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
605
+ attention_mask: Optional[torch.Tensor],
606
+ past_key_values: Optional[Cache] = None,
607
+ cache_position: Optional[torch.LongTensor] = None,
608
+ **kwargs: Unpack[FlashAttentionKwargs],
609
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
610
+ input_shape = hidden_states.shape[:-1]
611
+ hidden_shape = (*input_shape, -1, self.head_dim)
612
+
613
+ # Standard attention processing
614
+ query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
615
+ key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
616
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
617
+
618
+ cos, sin = position_embeddings
619
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
620
+
621
+ if past_key_values is not None:
622
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
623
+ key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
624
+
625
+ # Enhanced emotional attention processing
626
+ if self.use_emotional_attention:
627
+ # Compute empathy weights
628
+ empathy_weights = self.empathy_enhancer(hidden_states.mean(dim=1)) # [batch, num_heads]
629
+
630
+ # Emotional query, key, value computation
631
+ emotion_query = self.emotion_q_proj(hidden_states).view(*input_shape, self.num_emotion_heads, self.head_dim).transpose(1, 2)
632
+ emotion_key = self.emotion_k_proj(hidden_states).view(*input_shape, self.num_emotion_heads, self.head_dim).transpose(1, 2)
633
+ emotion_value = self.emotion_v_proj(hidden_states).view(*input_shape, self.num_emotion_heads, self.head_dim).transpose(1, 2)
634
+
635
+ # Apply rotary embeddings to emotional attention
636
+ emotion_query, emotion_key = apply_rotary_pos_emb(emotion_query, emotion_key, cos, sin)
637
+
638
+ # Emotional attention computation
639
+ emotion_scaling = (self.head_dim ** -0.5) * self.empathy_scaling_factor
640
+ emotion_attn_weights = torch.matmul(emotion_query, emotion_key.transpose(2, 3)) * emotion_scaling
641
+
642
+ if attention_mask is not None:
643
+ emotion_causal_mask = attention_mask[:, :, :, :emotion_key.shape[-2]]
644
+ emotion_attn_weights = emotion_attn_weights + emotion_causal_mask
645
+
646
+ emotion_attn_weights = nn.functional.softmax(emotion_attn_weights, dim=-1, dtype=torch.float32).to(emotion_query.dtype)
647
+ emotion_output = torch.matmul(emotion_attn_weights, emotion_value)
648
+
649
+ # Integrate emotional attention with standard attention
650
+ # Pad or truncate emotional attention to match standard attention heads
651
+ if self.num_emotion_heads < self.config.num_attention_heads:
652
+ padding_heads = self.config.num_attention_heads - self.num_emotion_heads
653
+ emotion_padding = torch.zeros(
654
+ *emotion_output.shape[:-3], padding_heads, *emotion_output.shape[-2:],
655
+ device=emotion_output.device, dtype=emotion_output.dtype
656
+ )
657
+ emotion_output = torch.cat([emotion_output, emotion_padding], dim=1)
658
+
659
+ # Standard attention computation
660
+ attention_interface: Callable = eager_attention_forward
661
+ if self.config._attn_implementation != "eager":
662
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
663
+
664
+ attn_output, attn_weights = attention_interface(
665
+ self,
666
+ query_states,
667
+ key_states,
668
+ value_states,
669
+ attention_mask,
670
+ dropout=0.0 if not self.training else self.attention_dropout,
671
+ scaling=self.scaling,
672
+ sliding_window=self.sliding_window,
673
+ **kwargs,
674
+ )
675
+
676
+ # Blend standard and emotional attention if emotional reasoning is enabled
677
+ if self.use_emotional_attention:
678
+ # For now, use a simplified approach - just apply empathy scaling
679
+ # This avoids the complex tensor dimension matching issues
680
+ # Attention backends return [batch, seq_len, num_heads, head_dim]
+ batch_size, seq_len, num_heads, head_dim = attn_output.shape
681
+
682
+ # Get average empathy weight per batch
683
+ empathy_scale = torch.mean(empathy_weights, dim=1, keepdim=True) # [batch, 1]
684
+ empathy_scale = empathy_scale.view(batch_size, 1, 1, 1) # [batch, 1, 1, 1]
685
+ empathy_scale = empathy_scale.expand(batch_size, seq_len, num_heads, head_dim)
686
+
687
+ # Apply empathy scaling to attention output
688
+ attn_output = attn_output * (1.0 + empathy_scale * 0.1) # Small empathy influence
689
+
690
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
691
+ attn_output = self.o_proj(attn_output)
692
+ return attn_output, attn_weights
693
+
694
+
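In its current simplified form, the emotional blending at the end of `HelpingAIAttention.forward` reduces to a per-sample scalar nudging of the attention output. A toy sketch of just that arithmetic, with invented tensors standing in for the module's real state (broadcasting replaces the explicit `expand`, which gives the same result):

```python
import torch

batch, seq, heads, head_dim = 2, 4, 8, 16
attn_output = torch.randn(batch, seq, heads, head_dim)               # [B, T, H, D] from the attention backend
empathy_weights = torch.softmax(torch.randn(batch, heads), dim=-1)   # [B, H], as produced by the empathy head

empathy_scale = empathy_weights.mean(dim=1, keepdim=True)            # [B, 1]
empathy_scale = empathy_scale.view(batch, 1, 1, 1)                   # broadcastable over T, H, D
blended = attn_output * (1.0 + empathy_scale * 0.1)                  # small empathy influence
print(blended.shape)  # torch.Size([2, 4, 8, 16])
```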
695
+ class HelpingAIDecoderLayer(GradientCheckpointingLayer):
696
+ def __init__(self, config: HelpingAIConfig, layer_idx: int):
697
+ super().__init__()
698
+ self.hidden_size = config.hidden_size
699
+ self.layer_idx = layer_idx
700
+
701
+ self.self_attn = HelpingAIAttention(config=config, layer_idx=layer_idx)
702
+ self.mlp = HelpingAIMLP(config)
703
+ self.input_layernorm = HelpingAIRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
704
+ self.post_attention_layernorm = HelpingAIRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
705
+ self.attention_type = config.layer_types[layer_idx]
706
+
707
+ # Enhanced reasoning layers
708
+ if hasattr(config, 'use_emotional_reasoning') and config.use_emotional_reasoning:
709
+ self.ser_layer = HelpingAISemanticEmotionReasoning(config)
710
+ self.use_ser = True
711
+ else:
712
+ self.use_ser = False
713
+
714
+ if hasattr(config, 'use_perspective_threading') and config.use_perspective_threading:
715
+ self.pet_layer = HelpingAIPerspectiveEmotionThreading(config)
716
+ self.use_pet = True
717
+ else:
718
+ self.use_pet = False
719
+
720
+ # Reasoning integration layers
721
+ if self.use_ser or self.use_pet:
722
+ self.reasoning_norm = HelpingAIRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
723
+ self.reasoning_gate = nn.Linear(config.hidden_size, 1)
724
+
725
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
726
+ def forward(
727
+ self,
728
+ hidden_states: torch.Tensor,
729
+ attention_mask: Optional[torch.Tensor] = None,
730
+ position_ids: Optional[torch.LongTensor] = None,
731
+ past_key_values: Optional[Cache] = None,
732
+ use_cache: Optional[bool] = False,
733
+ cache_position: Optional[torch.LongTensor] = None,
734
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
735
+ **kwargs: Unpack[TransformersKwargs],
736
+ ) -> torch.Tensor:
737
+ residual = hidden_states
738
+ hidden_states = self.input_layernorm(hidden_states)
739
+
740
+ # Self Attention
741
+ hidden_states, attention_weights = self.self_attn(
742
+ hidden_states=hidden_states,
743
+ attention_mask=attention_mask,
744
+ position_ids=position_ids,
745
+ past_key_values=past_key_values,
746
+ use_cache=use_cache,
747
+ cache_position=cache_position,
748
+ position_embeddings=position_embeddings,
749
+ **kwargs,
750
+ )
751
+ hidden_states = residual + hidden_states
752
+
753
+ # Enhanced reasoning processing
754
+ reasoning_outputs = []
755
+ reasoning_metadata = {}
756
+
757
+ if self.use_ser:
758
+ # Semantic Emotion Reasoning
759
+ ser_output, ser_meta = self.ser_layer(hidden_states)
760
+ reasoning_outputs.append(ser_output)
761
+ reasoning_metadata['ser'] = ser_meta
762
+
763
+ if self.use_pet:
764
+ # Perspective Emotion Threading
765
+ pet_output, pet_meta = self.pet_layer(hidden_states)
766
+ reasoning_outputs.append(pet_output)
767
+ reasoning_metadata['pet'] = pet_meta
768
+
769
+ # Integrate reasoning outputs if any
770
+ if reasoning_outputs:
771
+ # Combine reasoning outputs
772
+ combined_reasoning = torch.stack(reasoning_outputs, dim=0).mean(dim=0)
773
+ combined_reasoning = self.reasoning_norm(combined_reasoning)
774
+
775
+ # Apply gating to control reasoning influence
776
+ reasoning_gate = torch.sigmoid(self.reasoning_gate(hidden_states))
777
+ hidden_states = hidden_states + (reasoning_gate * combined_reasoning)
778
+
779
+ # Fully Connected (MLP)
780
+ residual = hidden_states
781
+ hidden_states = self.post_attention_layernorm(hidden_states)
782
+ hidden_states = self.mlp(hidden_states)
783
+ hidden_states = residual + hidden_states
784
+
785
+ # Store reasoning metadata for analysis (optional)
786
+ if hasattr(hidden_states, '_reasoning_metadata'):
787
+ hidden_states._reasoning_metadata = reasoning_metadata
788
+
789
+ return hidden_states
790
+
791
+
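The SER/PET integration in the decoder layer above boils down to a sigmoid-gated residual: the averaged reasoning branch is normalized, gated per position, and added back to the hidden states. A minimal sketch of that pattern with plain `torch.nn` modules; `LayerNorm` stands in for `HelpingAIRMSNorm` here purely for brevity.

```python
import torch
import torch.nn as nn

hidden_size = 32
hidden_states = torch.randn(2, 5, hidden_size)        # [B, T, H] after self-attention + residual
reasoning_output = torch.randn(2, 5, hidden_size)     # e.g. the mean of the SER/PET branch outputs

reasoning_norm = nn.LayerNorm(hidden_size)            # stand-in for HelpingAIRMSNorm
reasoning_gate = nn.Linear(hidden_size, 1)

combined = reasoning_norm(reasoning_output)
gate = torch.sigmoid(reasoning_gate(hidden_states))   # [B, T, 1], one gate value per position
hidden_states = hidden_states + gate * combined       # gated residual injection
print(hidden_states.shape)  # torch.Size([2, 5, 32])
```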
792
+ @auto_docstring
793
+ class HelpingAIPreTrainedModel(PreTrainedModel):
794
+ config: HelpingAIConfig
795
+ base_model_prefix = "model"
796
+ supports_gradient_checkpointing = True
797
+ _no_split_modules = ["HelpingAIDecoderLayer"]
798
+ _skip_keys_device_placement = ["past_key_values"]
799
+ _supports_flash_attn = True
800
+ _supports_sdpa = True
801
+ _supports_flex_attn = True
802
+
803
+ _can_compile_fullgraph = True
804
+ _supports_attention_backend = True
805
+ _can_record_outputs = {
806
+ "hidden_states": HelpingAIDecoderLayer,
807
+ "attentions": HelpingAIAttention,
808
+ }
809
+
810
+
811
+ class HelpingAIRotaryEmbedding(nn.Module):
812
+ inv_freq: torch.Tensor # fix linting for `register_buffer`
813
+
814
+ def __init__(self, config: HelpingAIConfig, device=None):
815
+ super().__init__()
816
+ # BC: "rope_type" was originally "type"
817
+ if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
818
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
819
+ else:
820
+ self.rope_type = "default"
821
+ self.max_seq_len_cached = config.max_position_embeddings
822
+ self.original_max_seq_len = config.max_position_embeddings
823
+
824
+ self.config = config
825
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
826
+
827
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
828
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
829
+ self.original_inv_freq = self.inv_freq
830
+
831
+ @torch.no_grad()
832
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
833
+ def forward(self, x, position_ids):
834
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
835
+ position_ids_expanded = position_ids[:, None, :].float()
836
+
837
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
838
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
839
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
840
+ emb = torch.cat((freqs, freqs), dim=-1)
841
+ cos = emb.cos() * self.attention_scaling
842
+ sin = emb.sin() * self.attention_scaling
843
+
844
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
845
+
846
+
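For the "default" `rope_type`, the buffer produced by `ROPE_INIT_FUNCTIONS` is the usual inverse-frequency table, and the forward pass above is essentially an outer product of positions and frequencies. A self-contained sketch of the same math (theta and sizes are placeholders, not the model's configured values; `attention_scaling` is 1.0 for default RoPE):

```python
import torch

head_dim, rope_theta = 64, 10000.0
inv_freq = 1.0 / (rope_theta ** (torch.arange(0, head_dim, 2).float() / head_dim))  # [D/2]

position_ids = torch.arange(8)[None, :].float()               # [B=1, T=8]
freqs = position_ids[:, :, None] * inv_freq[None, None, :]    # [B, T, D/2], outer product
emb = torch.cat((freqs, freqs), dim=-1)                       # [B, T, D]
cos, sin = emb.cos(), emb.sin()
print(cos.shape, sin.shape)  # torch.Size([1, 8, 64]) twice
```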
847
+ @auto_docstring
848
+ class HelpingAIModel(HelpingAIPreTrainedModel):
849
+ def __init__(self, config: HelpingAIConfig):
850
+ super().__init__(config)
851
+ self.padding_idx = config.pad_token_id
852
+ self.vocab_size = config.vocab_size
853
+
854
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
855
+ self.layers = nn.ModuleList(
856
+ [HelpingAIDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
857
+ )
858
+ self.norm = HelpingAIRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
859
+ self.rotary_emb = HelpingAIRotaryEmbedding(config=config)
860
+ self.gradient_checkpointing = False
861
+ self.has_sliding_layers = "sliding_attention" in self.config.layer_types
862
+
863
+ # Initialize weights and apply final processing
864
+ self.post_init()
865
+
866
+ @check_model_inputs
867
+ @auto_docstring
868
+ def forward(
869
+ self,
870
+ input_ids: Optional[torch.LongTensor] = None,
871
+ attention_mask: Optional[torch.Tensor] = None,
872
+ position_ids: Optional[torch.LongTensor] = None,
873
+ past_key_values: Optional[Cache] = None,
874
+ inputs_embeds: Optional[torch.FloatTensor] = None,
875
+ use_cache: Optional[bool] = None,
876
+ cache_position: Optional[torch.LongTensor] = None,
877
+ **kwargs: Unpack[TransformersKwargs],
878
+ ) -> BaseModelOutputWithPast:
879
+ if (input_ids is None) ^ (inputs_embeds is not None):
880
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
881
+
882
+ if inputs_embeds is None:
883
+ inputs_embeds = self.embed_tokens(input_ids)
884
+
885
+ if use_cache and past_key_values is None:
886
+ past_key_values = DynamicCache()
887
+
888
+ if cache_position is None:
889
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
890
+ cache_position = torch.arange(
891
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
892
+ )
893
+
894
+ if position_ids is None:
895
+ position_ids = cache_position.unsqueeze(0)
896
+
897
+ # It may already have been prepared by e.g. `generate`
898
+ if not isinstance(causal_mask_mapping := attention_mask, dict):
899
+ # Prepare mask arguments
900
+ mask_kwargs = {
901
+ "config": self.config,
902
+ "input_embeds": inputs_embeds,
903
+ "attention_mask": attention_mask,
904
+ "cache_position": cache_position,
905
+ "past_key_values": past_key_values,
906
+ "position_ids": position_ids,
907
+ }
908
+ # Create the masks
909
+ causal_mask_mapping = {
910
+ "full_attention": create_causal_mask(**mask_kwargs),
911
+ }
912
+ # The sliding window alternating layers are not always activated depending on the config
913
+ if self.has_sliding_layers:
914
+ causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
915
+
916
+ hidden_states = inputs_embeds
917
+
918
+ # create position embeddings to be shared across the decoder layers
919
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
920
+
921
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
922
+ hidden_states = decoder_layer(
923
+ hidden_states,
924
+ attention_mask=causal_mask_mapping[decoder_layer.attention_type],
925
+ position_ids=position_ids,
926
+ past_key_values=past_key_values,
927
+ use_cache=use_cache,
928
+ cache_position=cache_position,
929
+ position_embeddings=position_embeddings,
930
+ **kwargs,
931
+ )
932
+
933
+ hidden_states = self.norm(hidden_states)
934
+ return BaseModelOutputWithPast(
935
+ last_hidden_state=hidden_states,
936
+ past_key_values=past_key_values if use_cache else None,
937
+ )
938
+
939
+
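The defaults in `HelpingAIModel.forward` (`DynamicCache`, `cache_position`, `position_ids`) follow the usual transformers decoding convention: the cache starts empty, and later calls continue counting from the number of tokens already cached. A small sketch of that bookkeeping, assuming a transformers release recent enough to expose `DynamicCache`:

```python
import torch
from transformers import DynamicCache

past_key_values = DynamicCache()
seq_len = 5

past_seen_tokens = past_key_values.get_seq_length()           # 0 on the first forward pass
cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_len)
position_ids = cache_position.unsqueeze(0)                    # [1, T]
print(cache_position.tolist(), position_ids.shape)            # [0, 1, 2, 3, 4] torch.Size([1, 5])
```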
940
+ @auto_docstring
941
+ class HelpingAIForCausalLM(HelpingAIPreTrainedModel, GenerationMixin):
942
+ _tied_weights_keys = ["lm_head.weight"]
943
+ _tp_plan = {"lm_head": "colwise_rep"}
944
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
945
+
946
+ def __init__(self, config):
947
+ super().__init__(config)
948
+ self.model = HelpingAIModel(config)
949
+ self.vocab_size = config.vocab_size
950
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
951
+
952
+ # Enhanced structured output support
953
+ if hasattr(config, 'structured_output_vocab_size') and config.structured_output_vocab_size > 0:
954
+ self.structured_vocab_size = config.structured_output_vocab_size
955
+ self.use_structured_output = True
956
+ # Build structured head depending on config.structured_head_type
957
+ head_type = getattr(config, 'structured_head_type', 'linear')
958
+ act_name = getattr(config, 'structured_head_activation', 'gelu')
959
+ act_layer = nn.GELU() if act_name == 'gelu' else nn.ReLU()
960
+ hidden_dim = getattr(config, 'structured_head_hidden_dim', None)
961
+ if head_type == 'mlp_v1':
962
+ if hidden_dim is None:
963
+ # Heuristic: choose hidden_dim so the head has roughly (hidden_size + vocab) * hidden_dim ~ 50M parameters by default
964
+ denom = config.hidden_size + self.structured_vocab_size
965
+ target = 50_000_000
966
+ hidden_dim = max(128, int(target / max(1, denom)))
967
+ self.structured_lm_head = nn.Sequential(
968
+ nn.Linear(config.hidden_size, hidden_dim, bias=True),
969
+ act_layer,
970
+ nn.Linear(hidden_dim, self.structured_vocab_size, bias=True),
971
+ )
972
+ else:
973
+ self.structured_lm_head = nn.Linear(config.hidden_size, self.structured_vocab_size, bias=False)
974
+
975
+ # Special token embeddings for structured reasoning
976
+ self.structured_token_embeddings = nn.Embedding(self.structured_vocab_size, config.hidden_size)
977
+
978
+ # Reasoning mode classifier
979
+ self.reasoning_mode_classifier = nn.Sequential(
980
+ nn.Linear(config.hidden_size, config.hidden_size // 2),
981
+ nn.GELU(),
982
+ nn.Linear(config.hidden_size // 2, 4), # think, ser, pet, normal
983
+ nn.Softmax(dim=-1)
984
+ )
985
+ else:
986
+ self.use_structured_output = False
987
+
988
+ # Optional speech output head (predict mel-spectrogram frames)
989
+ self.use_speech_output = getattr(config, "use_speech_output", False)
990
+ if self.use_speech_output:
991
+ self.speech_num_mels = getattr(config, "speech_num_mels", 80)
992
+ self.speech_upsample_factor = getattr(config, "speech_upsample_factor", 1)
993
+ hidden_dim = getattr(config, "speech_head_hidden_dim", None)
994
+ if hidden_dim is None:
995
+ hidden_dim = config.hidden_size // 2
996
+ # Projector from hidden_size -> hidden_dim -> mel bins
997
+ self.speech_proj = nn.Sequential(
998
+ nn.Linear(config.hidden_size, hidden_dim),
999
+ nn.GELU(),
1000
+ nn.Linear(hidden_dim, self.speech_num_mels),
1001
+ )
1002
+ self.speech_loss_type = getattr(config, "speech_loss_type", "l1")
1003
+
1004
+ # Initialize weights and apply final processing
1005
+ self.post_init()
1006
+ # Register a load-state pre-hook so older checkpoints with saved structured head metadata can be restored
1007
+ self._register_load_state_dict_pre_hook(self._structured_head_migration_hook, with_module=True)
1008
+
1009
+ # --- Structured head migration logic ---
1010
+ def _structured_head_migration_hook(self, module, state_dict, prefix, *args, **kwargs):
1011
+ """Detect mismatched structured head weights and rebuild head if necessary.
1012
+
1013
+ Supports migration from a legacy linear head to the MLP head (saved externally) when the config
1014
+ specifies `mlp_v1` but the checkpoint only contains linear weights, or when the state_dict
1015
+ contains sequential weights that do not match the current module shape.
1016
+ """
1017
+ if not getattr(self, 'use_structured_output', False):
1018
+ return
1019
+ cfg = self.config
1020
+ desired_type = getattr(cfg, 'structured_head_type', 'linear')
1021
+ if desired_type != 'mlp_v1':
1022
+ return
1023
+ # Current module may already be Sequential; if so, nothing to do
1024
+ if isinstance(self.structured_lm_head, nn.Sequential):
1025
+ return
1026
+ # Look for legacy linear weight key
1027
+ w_key = prefix + 'structured_lm_head.weight'
1028
+ b_key = prefix + 'structured_lm_head.bias'
1029
+ if w_key in state_dict and not any(k.startswith(prefix + 'structured_lm_head.0.') for k in state_dict.keys()):
1030
+ # Need to rebuild to MLP form
1031
+ hidden_dim = getattr(cfg, 'structured_head_hidden_dim', None)
1032
+ if hidden_dim is None:
1033
+ denom = cfg.hidden_size + cfg.structured_output_vocab_size
1034
+ target = 50_000_000
1035
+ hidden_dim = max(128, int(target / max(1, denom)))
1036
+ act_name = getattr(cfg, 'structured_head_activation', 'gelu')
1037
+ act_layer = nn.GELU() if act_name == 'gelu' else nn.ReLU()
1038
+ new_head = nn.Sequential(
1039
+ nn.Linear(cfg.hidden_size, hidden_dim, bias=True),
1040
+ act_layer,
1041
+ nn.Linear(hidden_dim, cfg.structured_output_vocab_size, bias=True),
1042
+ )
1043
+ self.structured_lm_head = new_head.to(next(self.parameters()).device)
1044
+ # Legacy linear weights cannot be mapped onto the MLP meaningfully; leave the new head randomly initialized.
1045
+ # Remove old unmatched keys so load_state_dict won't warn.
1046
+ state_dict.pop(w_key, None)
1047
+ state_dict.pop(b_key, None)
1048
+ # If partial sequential weights exist but shape mismatch, rely on normal strict=False upstream behavior
1049
+
1050
+ def set_decoder(self, decoder):
1051
+ self.model = decoder
1052
+
1053
+ def get_decoder(self):
1054
+ return self.model
1055
+
1056
+ def get_reasoning_mode_probabilities(self, hidden_states: torch.Tensor) -> torch.Tensor:
1057
+ """Get probabilities for different reasoning modes: think, ser, pet, normal"""
1058
+ if self.use_structured_output:
1059
+ # Use the last token's hidden state for mode classification
1060
+ last_hidden = hidden_states[:, -1, :] # [batch_size, hidden_size]
1061
+ mode_probs = self.reasoning_mode_classifier(last_hidden)
1062
+ return mode_probs
1063
+ return None
1064
+
1065
+ @can_return_tuple
1066
+ @auto_docstring
1067
+ def forward(
1068
+ self,
1069
+ input_ids: Optional[torch.LongTensor] = None,
1070
+ attention_mask: Optional[torch.Tensor] = None,
1071
+ position_ids: Optional[torch.LongTensor] = None,
1072
+ past_key_values: Optional[Cache] = None,
1073
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1074
+ labels: Optional[torch.LongTensor] = None,
1075
+ # Optional supervision for speech frames: float tensor [B, T_frames, n_mels]
1076
+ speech_targets: Optional[torch.FloatTensor] = None,
1077
+ use_cache: Optional[bool] = None,
1078
+ cache_position: Optional[torch.LongTensor] = None,
1079
+ logits_to_keep: Union[int, torch.Tensor] = 0,
1080
+ return_reasoning_metadata: Optional[bool] = False,
1081
+ **kwargs: Unpack[TransformersKwargs],
1082
+ ) -> CausalLMOutputWithPast:
1083
+ r"""
1084
+ Enhanced HelpingAI forward pass with structured reasoning and speech supervision support.
1085
+
1086
+ Args:
1087
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1088
+ Indices of input sequence tokens in the vocabulary.
1089
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1090
+ Mask to avoid performing attention on padding token indices.
1091
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1092
+ Indices of positions of each input sequence tokens in the position embeddings.
1093
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
1094
+ Pre-computed hidden-states that can be used to speed up autoregressive decoding.
1095
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1096
+ Embedded representation of the input tokens. Can be used instead of `input_ids`.
1097
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1098
+ Labels for computing the masked language modeling loss.
1099
+ speech_targets (`torch.FloatTensor` of shape `(batch_size, T_frames, n_mels)`, *optional*):
1100
+ Optional ground-truth mel-spectrogram frames for speech head supervision. Used only if `use_speech_output` is enabled.
1101
+ - `batch_size`: number of samples in the batch
1102
+ - `T_frames`: number of mel frames (may differ from token count)
1103
+ - `n_mels`: number of mel bins (should match config.speech_num_mels)
1104
+ use_cache (`bool`, *optional*):
1105
+ If set to `True`, past key values are returned and can be used to speed up decoding.
1106
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
1107
+ Indices depicting the position of the input tokens in the sequence.
1108
+ logits_to_keep (`Union[int, torch.Tensor]`, *optional*, defaults to 0):
1109
+ Number of logits to keep from the end of the sequence.
1110
+ return_reasoning_metadata (`bool`, *optional*, defaults to `False`):
1111
+ Whether to return reasoning metadata including SER and PET analysis for structured reasoning.
1112
+
1113
+ Returns:
1114
+ `CausalLMOutputWithPast`: Model output containing logits, past key values, and optional reasoning metadata.
1115
+
1116
+ Example:
1117
+
1118
+ ```python
1119
+ >>> from transformers import AutoTokenizer, HelpingAIForCausalLM
1120
+
1121
+ >>> model = HelpingAIForCausalLM.from_pretrained("HelpingAI/HelpingAI-8B")
1122
+ >>> tokenizer = AutoTokenizer.from_pretrained("HelpingAI/HelpingAI-8B")
1123
+
1124
+ >>> # Standard generation
1125
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1126
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1127
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1128
+ >>> response = tokenizer.batch_decode(generate_ids, skip_special_tokens=True)[0]
1129
+
1130
+ >>> # Structured reasoning generation (requires `structured_output_vocab_size > 0` in the config)
1131
+ >>> outputs = model(inputs.input_ids, return_reasoning_metadata=True, output_hidden_states=True)
1132
+ >>> reasoning_modes = model.get_reasoning_mode_probabilities(outputs.hidden_states[-1])
1133
+
1134
+ >>> # Speech head supervision
1135
+ >>> mel_targets = torch.randn(batch_size, T_frames, n_mels)
1136
+ >>> outputs = model(inputs.input_ids, speech_targets=mel_targets)
1137
+ ```
1138
+ """
1139
+ outputs: BaseModelOutputWithPast = self.model(
1140
+ input_ids=input_ids,
1141
+ attention_mask=attention_mask,
1142
+ position_ids=position_ids,
1143
+ past_key_values=past_key_values,
1144
+ inputs_embeds=inputs_embeds,
1145
+ use_cache=use_cache,
1146
+ cache_position=cache_position,
1147
+ **kwargs,
1148
+ )
1149
+
1150
+ hidden_states = outputs.last_hidden_state
1151
+
1152
+ # Standard language modeling head
1153
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
1154
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
1155
+
1156
+ # Enhanced structured output logits
1157
+ structured_logits = None
1158
+ reasoning_mode_probs = None
1159
+ if self.use_structured_output:
1160
+ structured_logits = self.structured_lm_head(hidden_states[:, slice_indices, :])
1161
+ reasoning_mode_probs = self.get_reasoning_mode_probabilities(hidden_states)
1162
+
1163
+ # Speech output prediction
1164
+ speech_mels = None
1165
+ if self.use_speech_output:
1166
+ token_level = hidden_states # [B, T_tok, H]
1167
+ # Simple temporal upsampling by repetition to approximate frame rate
1168
+ if getattr(self, "speech_upsample_factor", 1) > 1:
1169
+ token_level = token_level.repeat_interleave(self.speech_upsample_factor, dim=1)
1170
+ # Project to mel bins per (upsampled) time-step
1171
+ speech_mels = self.speech_proj(token_level) # [B, T_frames, n_mels]
1172
+
1173
+ loss = None
1174
+ if labels is not None:
1175
+ # Standard loss computation
1176
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
1177
+
1178
+ # Add structured output loss if applicable
1179
+ if self.use_structured_output and structured_logits is not None:
1180
+ # Additional loss term for structured reasoning (if labels include structured tokens)
1181
+ structured_loss_weight = 0.1 # Weight for structured output loss
1182
+ structured_loss = self.loss_function(
1183
+ logits=structured_logits,
1184
+ labels=labels,
1185
+ vocab_size=self.structured_vocab_size,
1186
+ **kwargs
1187
+ )
1188
+ loss = loss + (structured_loss_weight * structured_loss)
1189
+
1190
+ # Add speech supervision if provided
1191
+ if self.use_speech_output and speech_targets is not None:
1192
+ # Ensure time dimension alignment by trimming or padding speech_mels to targets
1193
+ B, T_pred, M = speech_mels.shape
1194
+ B2, T_tgt, M2 = speech_targets.shape
1195
+ if B != B2 or M != M2:
1196
+ raise ValueError("speech_targets shape mismatch. Expected [B, T, n_mels] with same B and n_mels as model output.")
1197
+ if T_pred > T_tgt:
1198
+ speech_mels_aligned = speech_mels[:, :T_tgt, :]
1199
+ elif T_pred < T_tgt:
1200
+ pad = torch.zeros(B, T_tgt - T_pred, M, device=speech_mels.device, dtype=speech_mels.dtype)
1201
+ speech_mels_aligned = torch.cat([speech_mels, pad], dim=1)
1202
+ else:
1203
+ speech_mels_aligned = speech_mels
1204
+
1205
+ if self.speech_loss_type == "mse":
1206
+ speech_loss = nn.functional.mse_loss(speech_mels_aligned, speech_targets)
1207
+ else:
1208
+ speech_loss = nn.functional.l1_loss(speech_mels_aligned, speech_targets)
1209
+ loss = speech_loss if loss is None else (loss + speech_loss)
1210
+
1211
+ # Prepare output with enhanced reasoning metadata
1212
+ output = CausalLMOutputWithPast(
1213
+ loss=loss,
1214
+ logits=logits,
1215
+ past_key_values=outputs.past_key_values,
1216
+ hidden_states=outputs.hidden_states,
1217
+ attentions=outputs.attentions,
1218
+ )
1219
+
1220
+ # Add custom attributes for reasoning
1221
+ if return_reasoning_metadata and self.use_structured_output:
1222
+ output.structured_logits = structured_logits
1223
+ output.reasoning_mode_probabilities = reasoning_mode_probs
1224
+ if self.use_speech_output:
1225
+ output.speech_mels = speech_mels
1226
+
1227
+ return output
1228
+
1229
+
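The speech supervision path in `HelpingAIForCausalLM.forward` trims or zero-pads the predicted mel frames to the target length before taking an L1 (or MSE) loss. A toy, self-contained version of just that alignment step, with invented shapes:

```python
import torch
import torch.nn.functional as F

B, T_pred, T_tgt, n_mels = 2, 12, 10, 80
speech_mels = torch.randn(B, T_pred, n_mels)      # model prediction after temporal upsampling
speech_targets = torch.randn(B, T_tgt, n_mels)    # ground-truth mel frames

if T_pred > T_tgt:                                # trim extra predicted frames
    aligned = speech_mels[:, :T_tgt, :]
elif T_pred < T_tgt:                              # zero-pad missing frames
    pad = torch.zeros(B, T_tgt - T_pred, n_mels, dtype=speech_mels.dtype)
    aligned = torch.cat([speech_mels, pad], dim=1)
else:
    aligned = speech_mels

loss = F.l1_loss(aligned, speech_targets)         # "l1" is the default speech_loss_type
print(aligned.shape, float(loss))
```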
1230
+ class HelpingAIForSequenceClassification(GenericForSequenceClassification, HelpingAIPreTrainedModel):
1231
+ pass
1232
+
1233
+
1234
+ class HelpingAIForTokenClassification(GenericForTokenClassification, HelpingAIPreTrainedModel):
1235
+ pass
1236
+
1237
+
1238
+ class HelpingAIForQuestionAnswering(GenericForQuestionAnswering, HelpingAIPreTrainedModel):
1239
+ base_model_prefix = "transformer" # For BC, where `transformer` was used instead of `model`
1240
+
1241
+
1242
+ __all__ = [
1243
+ "HelpingAIForCausalLM",
1244
+ "HelpingAIForQuestionAnswering",
1245
+ "HelpingAIPreTrainedModel",
1246
+ "HelpingAIModel",
1247
+ "HelpingAIForSequenceClassification",
1248
+ "HelpingAIForTokenClassification",
1249
+ ]
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|vision_pad|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 40960,
235
+ "pad_token": "<|vision_pad|>",
236
+ "padding_side": "right",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
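Putting the tokenizer files together: the config above selects `Qwen2Tokenizer`, uses `<|im_end|>` as the EOS token, and registers `<think>`/`</think>` markers, so a chat-formatted prompt can be built with the standard `apply_chat_template` call. A sketch, assuming the files are hosted under a repo id such as `HelpingAI/HelpingAI-8B` (the exact repository name is an assumption here, taken from the docstring example in the modeling code):

```python
from transformers import AutoTokenizer

# Repo id is illustrative; point this at the repository these files belong to.
tokenizer = AutoTokenizer.from_pretrained("HelpingAI/HelpingAI-8B")

messages = [{"role": "user", "content": "Hey, how are you feeling today?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)

print(tokenizer.eos_token)                          # "<|im_end|>", per tokenizer_config.json above
print(tokenizer.convert_tokens_to_ids("<think>"))   # 151667, per the added_tokens_decoder above
```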
vocab.json ADDED
The diff for this file is too large to render. See raw diff