Enable AutoProcessor and support latest transformers release

#3
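
This PR wires the custom processor into `AutoProcessor` (via the new `processor_config.json` and the `auto_map` entries below) and refreshes the configs for transformers 4.56.x. A minimal usage sketch, assuming a placeholder repo id and that `trust_remote_code=True` is acceptable, since `IsaacProcessor` and `IsaacForConditionalGeneration` are defined in `modular_isaac.py` inside the repo:

```python
from transformers import AutoModelForCausalLM, AutoProcessor

repo_id = "org/isaac-model"  # hypothetical repo id; substitute the actual repository

# trust_remote_code is required because the processor and model classes
# are resolved through the auto_map entries pointing at modular_isaac.py.
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

messages = [{"role": "user", "content": "Describe the image in one sentence."}]
prompt = processor.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
inputs = processor.tokenizer(prompt, return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=64)
print(processor.tokenizer.decode(output[0], skip_special_tokens=True))
```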
chat_template.jinja ADDED
@@ -0,0 +1,89 @@
+{%- if tools %}
+{{- '<|im_start|>system\n' }}
+{%- if messages[0].role == 'system' %}
+{{- messages[0].content + '\n\n' }}
+{%- endif %}
+{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+{%- for tool in tools %}
+{{- "\n" }}
+{{- tool | tojson }}
+{%- endfor %}
+{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+{%- if messages[0].role == 'system' %}
+{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+{%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+{%- set index = (messages|length - 1) - loop.index0 %}
+{%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+{%- set ns.multi_step_tool = false %}
+{%- set ns.last_query_index = index %}
+{%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+{%- if message.content is string %}
+{%- set content = message.content %}
+{%- else %}
+{%- set content = '' %}
+{%- endif %}
+{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+{%- elif message.role == "assistant" %}
+{%- set reasoning_content = '' %}
+{%- if message.reasoning_content is string %}
+{%- set reasoning_content = message.reasoning_content %}
+{%- else %}
+{%- if '</think>' in content %}
+{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+{%- set content = content.split('</think>')[-1].lstrip('\n') %}
+{%- endif %}
+{%- endif %}
+{%- if loop.index0 > ns.last_query_index %}
+{%- if loop.last or (not loop.last and reasoning_content) %}
+{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+{%- else %}
+{{- '<|im_start|>' + message.role + '\n' + content }}
+{%- endif %}
+{%- else %}
+{{- '<|im_start|>' + message.role + '\n' + content }}
+{%- endif %}
+{%- if message.tool_calls %}
+{%- for tool_call in message.tool_calls %}
+{%- if (loop.first and content) or (not loop.first) %}
+{{- '\n' }}
+{%- endif %}
+{%- if tool_call.function %}
+{%- set tool_call = tool_call.function %}
+{%- endif %}
+{{- '<tool_call>\n{"name": "' }}
+{{- tool_call.name }}
+{{- '", "arguments": ' }}
+{%- if tool_call.arguments is string %}
+{{- tool_call.arguments }}
+{%- else %}
+{{- tool_call.arguments | tojson }}
+{%- endif %}
+{{- '}\n</tool_call>' }}
+{%- endfor %}
+{%- endif %}
+{{- '<|im_end|>\n' }}
+{%- elif message.role == "tool" %}
+{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+{{- '<|im_start|>user' }}
+{%- endif %}
+{{- '\n<tool_response>\n' }}
+{{- content }}
+{{- '\n</tool_response>' }}
+{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+{{- '<|im_end|>\n' }}
+{%- endif %}
+{%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+{{- '<|im_start|>assistant\n' }}
+{%- if enable_thinking is defined and enable_thinking is false %}
+{{- '<think>\n\n</think>\n\n' }}
+{%- endif %}
+{%- endif %}
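
The template mirrors the Qwen3-style chat format, including the `<think>` handling and `<tool_call>` blocks. A hedged sketch of how it is exercised once the tokenizer is loaded (the repo id is a placeholder; `enable_thinking` is the optional flag checked at the end of the template and is forwarded to it as an extra keyword argument):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("org/isaac-model")  # hypothetical repo id

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 2 + 2?"},
]

# add_generation_prompt appends '<|im_start|>assistant\n'; enable_thinking=False
# additionally emits an empty '<think>\n\n</think>' block, as in the template above.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)
print(prompt)
```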
config.json CHANGED
@@ -2,20 +2,50 @@
   "architectures": [
     "IsaacForConditionalGeneration"
   ],
   "auto_map": {
-    "AutoProcessor": "modular_isaac.IsaacProcessor",
     "AutoConfig": "modular_isaac.IsaacConfig",
     "AutoModelForCausalLM": "modular_isaac.IsaacForConditionalGeneration"
   },
-  "attention_bias": false,
-  "attention_dropout": 0.0,
   "bos_token_id": 151643,
   "eos_token_id": 151645,
   "head_dim": 128,
   "hidden_act": "silu",
   "hidden_size": 2048,
   "initializer_range": 0.02,
   "intermediate_size": 6144,
   "max_position_embeddings": 40960,
   "max_sequence_length": 16384,
   "max_window_layers": 28,
@@ -33,8 +63,7 @@
   "rope_theta": 1000000.0,
   "sliding_window": null,
   "tie_word_embeddings": false,
-  "torch_dtype": "float32",
-  "transformers_version": "4.51.1",
   "use_cache": true,
   "use_sliding_window": false,
   "video_patch_size": 16,
@@ -57,4 +86,4 @@
   "vision_min_num_patches": 256,
   "vision_token": "<image>",
   "vocab_size": 151936
- }

   "architectures": [
     "IsaacForConditionalGeneration"
   ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
   "auto_map": {
     "AutoConfig": "modular_isaac.IsaacConfig",
     "AutoModelForCausalLM": "modular_isaac.IsaacForConditionalGeneration"
   },
   "bos_token_id": 151643,
+  "dtype": "float32",
   "eos_token_id": 151645,
   "head_dim": 128,
   "hidden_act": "silu",
   "hidden_size": 2048,
   "initializer_range": 0.02,
   "intermediate_size": 6144,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
   "max_position_embeddings": 40960,
   "max_sequence_length": 16384,
   "max_window_layers": 28,

   "rope_theta": 1000000.0,
   "sliding_window": null,
   "tie_word_embeddings": false,
+  "transformers_version": "4.56.1",
   "use_cache": true,
   "use_sliding_window": false,
   "video_patch_size": 16,

   "vision_min_num_patches": 256,
   "vision_token": "<image>",
   "vocab_size": 151936
+ }
generation_config.json CHANGED
@@ -2,5 +2,5 @@
   "_from_model_config": true,
   "bos_token_id": 151643,
   "eos_token_id": 151645,
-  "transformers_version": "4.51.1"
 }

   "_from_model_config": true,
   "bos_token_id": 151643,
   "eos_token_id": 151645,
+  "transformers_version": "4.56.1"
 }
model.safetensors.index.json CHANGED
@@ -1,5 +1,6 @@
 {
   "metadata": {
     "total_size": 10268292032
   },
   "weight_map": {

 {
   "metadata": {
+    "total_parameters": 2567073008,
     "total_size": 10268292032
   },
   "weight_map": {
modular_isaac.py CHANGED
@@ -14,15 +14,19 @@ import PIL.Image
 from transformers import (
     AutoTokenizer,
     BatchFeature,
     Qwen3Config,
     Qwen3ForCausalLM,
     Qwen3PreTrainedModel,
 )
 from transformers.generation.utils import GenerationMixin
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.models.qwen3.modeling_qwen3 import Qwen3DecoderLayer, Qwen3Model
 from transformers.processing_utils import ProcessorMixin
 from transformers.tokenization_utils import TensorType
 import re

 from transformers.models.siglip2.modeling_siglip2 import (
@@ -62,7 +66,6 @@ class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig):
         num_patches: int = 256,
         **kwargs,
     ):
-        # Call parent with all vision config parameters
         super().__init__(**kwargs)

         # Add our custom fields
@@ -874,16 +877,20 @@ def create_text_event(tokenizer: AutoTokenizer, text: str, time: float = 0.0) ->


 class IsaacProcessor(ProcessorMixin):
-    attributes = []
-    tokenizer_class = ("AutoTokenizer",)

     def __init__(
         self,
-        tokenizer: AutoTokenizer,
-        config: IsaacConfig,
     ):
-        super().__init__()
         self.tokenizer = tokenizer
         self.config = config

         # Use vision token from config
@@ -1121,8 +1128,9 @@ class IsaacRotaryEmbedding(nn.Module):
 class IsaacModel(Qwen3Model):
     def __init__(self, config: IsaacConfig):
         super().__init__(config)
         self.layers = torch.nn.ModuleList(
-            [Qwen3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self.rotary_emb = IsaacRotaryEmbedding(config, device=self.device)

@@ -1276,7 +1284,7 @@ class IsaacModel(Qwen3Model):
             **kwargs,
         )

-        hidden_states = layer_outputs[0]

         # Final layer norm
         hidden_states = self.norm(hidden_states)
@@ -1286,6 +1294,159 @@ class IsaacModel(Qwen3Model):
             past_key_values=past_key_values,
         )


 class IsaacForConditionalGeneration(Qwen3ForCausalLM, GenerationMixin):
     """Isaac multimodal model for conditional generation."""

 from transformers import (
     AutoTokenizer,
     BatchFeature,
+    Cache,
     Qwen3Config,
     Qwen3ForCausalLM,
     Qwen3PreTrainedModel,
 )
+from transformers.cache_utils import SlidingWindowCache, StaticCache
 from transformers.generation.utils import GenerationMixin
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.models.qwen3.modeling_qwen3 import Qwen3DecoderLayer, Qwen3Model
+from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
 from transformers.processing_utils import ProcessorMixin
 from transformers.tokenization_utils import TensorType
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 import re

 from transformers.models.siglip2.modeling_siglip2 import (

         num_patches: int = 256,
         **kwargs,
     ):
         super().__init__(**kwargs)

         # Add our custom fields


 class IsaacProcessor(ProcessorMixin):
+    attributes = ["tokenizer"]
+    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
+

     def __init__(
         self,
+        tokenizer: Qwen2Tokenizer,
+        config: IsaacConfig | dict,
     ):
+        super().__init__(tokenizer)
         self.tokenizer = tokenizer
+
+        if isinstance(config, dict):
+            config = IsaacConfig(**config)
         self.config = config

         # Use vision token from config

 class IsaacModel(Qwen3Model):
     def __init__(self, config: IsaacConfig):
         super().__init__(config)
+        text_cfg = getattr(config, "get_text_config", lambda: config)()
         self.layers = torch.nn.ModuleList(
+            [Qwen3DecoderLayer(text_cfg, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self.rotary_emb = IsaacRotaryEmbedding(config, device=self.device)

             **kwargs,
         )

+        hidden_states = layer_outputs[0] if isinstance(layer_outputs, tuple) else layer_outputs

         # Final layer norm
         hidden_states = self.norm(hidden_states)

             past_key_values=past_key_values,
         )

+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool = False,
+    ):
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and past_key_values is not None:
+                is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
+                if is_padding_right:
+                    raise ValueError(
+                        "You are attempting to perform batched generation with padding_side='right'"
+                        " this may lead to unexpected behaviour for Flash Attention version of Qwen3. Make sure to "
+                        " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
+                    )
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = isinstance(past_key_values, StaticCache)
+        using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if (
+            self.config._attn_implementation == "sdpa"
+            and not (using_static_cache or using_sliding_window_cache)
+            and not output_attentions
+        ):
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                sliding_window=self.config.sliding_window,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        # SlidingWindowCache or StaticCache
+        if using_sliding_window_cache or using_static_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        # DynamicCache or no cache
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            device=device,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+            config=self.config,
+            past_key_values=past_key_values,
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        config: Qwen3Config,
+        past_key_values: Cache,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            device (`torch.device`):
+                The device to place the 4D attention mask on.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`torch.Tensor`):
+                Batch size.
+            config (`Qwen3Config`):
+                The model's configuration class
+            past_key_values (`Cache`):
+                The cache class that is being used currently to generate
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+            )
+            diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            if config.sliding_window is not None:
+                # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
+                # the check is needed to verify is current checkpoint was trained with sliding window or not
+                if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
+                    sliding_attend_mask = torch.arange(target_length, device=device) <= (
+                        cache_position.reshape(-1, 1) - config.sliding_window
+                    )
+                    diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
+            causal_mask *= diagonal_attend_mask
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                if attention_mask.shape[-1] > target_length:
+                    attention_mask = attention_mask[:, :target_length]
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+        return causal_mask
+
+

 class IsaacForConditionalGeneration(Qwen3ForCausalLM, GenerationMixin):
     """Isaac multimodal model for conditional generation."""
processor_config.json ADDED
@@ -0,0 +1,209 @@
+{
+  "auto_map": {
+    "AutoProcessor": "modular_isaac.IsaacProcessor"
+  },
+  "config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": [
+      "IsaacForConditionalGeneration"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "auto_map": {
+      "AutoModelForCausalLM": "modular_isaac.IsaacForConditionalGeneration"
+    },
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 151643,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dtype": "float32",
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 151645,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2048,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 6144,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 40960,
+    "max_sequence_length": 16384,
+    "max_window_layers": 28,
+    "min_length": 0,
+    "model_type": "isaac",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 8,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "pixel_shuffle_scale": 2,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": {
+      "mrope_interleaved": true,
+      "mrope_section": null,
+      "rope_type": "default"
+    },
+    "rope_theta": 1000000.0,
+    "sep_token_id": null,
+    "sliding_window": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": false,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torchscript": false,
+    "transformers_version": "4.56.1",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "use_sliding_window": false,
+    "video_patch_size": 16,
+    "vision_config": {
+      "_name_or_path": "",
+      "add_cross_attention": false,
+      "architectures": null,
+      "attention_dropout": 0.0,
+      "bad_words_ids": null,
+      "begin_suppress_tokens": null,
+      "bos_token_id": null,
+      "chunk_size_feed_forward": 0,
+      "cross_attention_hidden_size": null,
+      "decoder_start_token_id": null,
+      "diversity_penalty": 0.0,
+      "do_sample": false,
+      "dtype": null,
+      "early_stopping": false,
+      "encoder_no_repeat_ngram_size": 0,
+      "eos_token_id": null,
+      "exponential_decay_length_penalty": null,
+      "finetuning_task": null,
+      "forced_bos_token_id": null,
+      "forced_eos_token_id": null,
+      "hidden_act": "gelu_pytorch_tanh",
+      "hidden_size": 1152,
+      "id2label": {
+        "0": "LABEL_0",
+        "1": "LABEL_1"
+      },
+      "image_size": 256,
+      "intermediate_size": 4304,
+      "is_decoder": false,
+      "is_encoder_decoder": false,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1
+      },
+      "layer_norm_eps": 1e-06,
+      "length_penalty": 1.0,
+      "max_length": 20,
+      "min_length": 0,
+      "model_type": "pixel_shuffle_siglip2",
+      "no_repeat_ngram_size": 0,
+      "num_attention_heads": 16,
+      "num_beam_groups": 1,
+      "num_beams": 1,
+      "num_channels": 3,
+      "num_hidden_layers": 27,
+      "num_patches": 256,
+      "num_return_sequences": 1,
+      "output_attentions": false,
+      "output_hidden_states": false,
+      "output_scores": false,
+      "pad_token_id": null,
+      "patch_size": 16,
+      "pixel_shuffle_scale_factor": 2,
+      "prefix": null,
+      "problem_type": null,
+      "pruned_heads": {},
+      "remove_invalid_values": false,
+      "repetition_penalty": 1.0,
+      "return_dict": true,
+      "return_dict_in_generate": false,
+      "sep_token_id": null,
+      "suppress_tokens": null,
+      "task_specific_params": null,
+      "temperature": 1.0,
+      "tf_legacy_loss": false,
+      "tie_encoder_decoder": false,
+      "tie_word_embeddings": true,
+      "tokenizer_class": null,
+      "top_k": 50,
+      "top_p": 1.0,
+      "torchscript": false,
+      "typical_p": 1.0,
+      "use_bfloat16": false
+    },
+    "vision_max_num_patches": 6144,
+    "vision_min_num_patches": 256,
+    "vision_token": "<image>",
+    "vocab_size": 151936
+  },
+  "processor_class": "IsaacProcessor"
+}
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
-size 11422654

 version https://git-lfs.github.com/spec/v1
+oid sha256:7ceaf87113caa06d8b2e2f6966ab11d12ac590cb887b64c591cae70ea89245f4
+size 11422655
tokenizer_config.json CHANGED
@@ -226,15 +226,17 @@
     "<|image_pad|>",
     "<|video_pad|>"
   ],
   "bos_token": null,
- "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if 
enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
   "errors": "replace",
   "extra_special_tokens": {},
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
-  "processor_class": "Qwen2_5_VLProcessor",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null

     "<|image_pad|>",
     "<|video_pad|>"
   ],
+  "auto_map": {
+    "AutoProcessor": "modular_isaac.IsaacProcessor"
+  },
   "bos_token": null,
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
   "errors": "replace",
   "extra_special_tokens": {},
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
+  "processor_class": "IsaacProcessor",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null