Upload folder using huggingface_hub
- model.safetensors +2 -2
- processing_maira2.py +15 -20
- tokenizer.model +2 -2
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:d3bfb1d6f0ec0f0949cd84df187a2bfb571242c4ca9bdd519c4af512716ae23a
+size 4240896
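model.safetensors is tracked with Git LFS, so the diff above touches only the three-line pointer file: oid is the SHA-256 digest of the real blob and size its byte count (the old oid/size values are elided in this view). A minimal sketch of checking a downloaded file against its pointer; verify_lfs_pointer is a hypothetical helper, not part of this repo:

import hashlib

def verify_lfs_pointer(path: str, expected_oid: str, expected_size: int) -> bool:
    """Check a local file against a Git LFS pointer's oid/size fields (hypothetical helper)."""
    digest = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        # Stream in 1 MiB chunks so multi-GB weight files need not fit in memory.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
            size += len(chunk)
    return digest.hexdigest() == expected_oid and size == expected_size

# For the updated pointer above:
# verify_lfs_pointer("model.safetensors",
#     "d3bfb1d6f0ec0f0949cd84df187a2bfb571242c4ca9bdd519c4af512716ae23a", 4240896)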
processing_maira2.py CHANGED
@@ -55,9 +55,9 @@ class Maira2Processor(LlavaProcessor):
         self,
         image_processor: BaseImageProcessor = None,
         tokenizer: PreTrainedTokenizer = None,
-        patch_size=None,
-        vision_feature_select_strategy=None,
-        chat_template=None,
+        patch_size = None,
+        vision_feature_select_strategy = None,
+        chat_template = None,
         image_token: str = "<image>",
         phrase_start_token: str = "<obj>",
         phrase_end_token: str = "</obj>",
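The three reworked keyword arguments (patch_size, vision_feature_select_strategy, chat_template) line up with the parameters recent transformers releases pass to LlavaProcessor subclasses. Since Maira2Processor lives in repo code rather than in the transformers package, loading it needs trust_remote_code; a minimal usage sketch, assuming the microsoft/maira-2 repo id:

from transformers import AutoProcessor

# Maira2Processor is defined in processing_maira2.py inside the model repo,
# so trust_remote_code=True is required for AutoProcessor to import it.
processor = AutoProcessor.from_pretrained("microsoft/maira-2", trust_remote_code=True)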
@@ -301,12 +301,12 @@ class Maira2Processor(LlavaProcessor):
         )
         messages = [{"content": prompt, "role": "user"}]
         if assistant_text is not None:
-            messages.append(
-                {"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"}
-            )
+            messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
         return messages
 
-    def _construct_chat_messages_phrase_grounding(self, phrase: str, assistant_text: str = None):
+    def _construct_chat_messages_phrase_grounding(
+        self, phrase: str, assistant_text: str = None
+    ):
         """
         This function constructs the chat messages for phrase grounding used in the phrase grounding task.
 
@@ -331,9 +331,7 @@ class Maira2Processor(LlavaProcessor):
         ]
         messages = [{"content": prompt, "role": "user"}]
         if assistant_text is not None:
-            messages.append(
-                {"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"}
-            )
+            messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
         return messages
 
     def format_reporting_input(
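The two hunks above collapse the multi-line messages.append(...) into a single statement; the appended assistant turn is unchanged. A sketch of the list either _construct_chat_messages_* helper returns when assistant_text is given (prompt content abbreviated, sample text is a placeholder):

messages = [
    {"content": [...], "role": "user"},  # prompt pieces built earlier in the method
    {
        "content": [{"index": None, "text": "No acute findings.", "type": "text"}],
        "role": "assistant",
    },
]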
@@ -390,9 +388,7 @@ class Maira2Processor(LlavaProcessor):
             assistant_text=assistant_text,
         )
         add_generation_prompt = assistant_text is None
-        text = self.tokenizer.apply_chat_template(
-            messages, add_generation_prompt=add_generation_prompt, tokenize=False
-        )
+        text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
         return text, images
 
     def format_phrase_grounding_input(
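Here (and again in format_phrase_grounding_input in the next hunk) the apply_chat_template call is joined onto one line; behavior is unchanged. add_generation_prompt is True only when no assistant_text is supplied, so the rendered string ends with an open assistant turn for the model to complete. A minimal sketch, assuming a tokenizer with a chat template configured:

# tokenize=False returns the rendered prompt string instead of token ids;
# add_generation_prompt=True appends the assistant header so the model
# continues the conversation rather than re-reading a finished turn.
text = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Provide a report for this chest X-ray."}],
    add_generation_prompt=True,
    tokenize=False,
)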
@@ -423,9 +419,7 @@ class Maira2Processor(LlavaProcessor):
         )
         messages = self._construct_chat_messages_phrase_grounding(phrase)
         add_generation_prompt = assistant_text is None
-        text = self.tokenizer.apply_chat_template(
-            messages, add_generation_prompt=add_generation_prompt, tokenize=False
-        )
+        text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
         return text, images
 
     def format_and_preprocess_reporting_input(
@@ -548,7 +542,9 @@ class Maira2Processor(LlavaProcessor):
         assert len(text) == 0
         return split_text
 
-    def convert_output_to_plaintext_or_grounded_sequence(self, text: str):
+    def convert_output_to_plaintext_or_grounded_sequence(
+        self, text: str
+    ):
         """
         This function converts the input text to a grounded sequence by extracting the grounded phrases and bounding
         boxes from the text. If the text is plaintext without any grounded phrases, it returns the text as is.
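Per its docstring, convert_output_to_plaintext_or_grounded_sequence post-processes decoded model output: grounded phrases come back paired with their bounding boxes, while plain text passes through untouched. A hedged usage sketch (variable names are illustrative; decode is forwarded to the tokenizer, as in LlavaProcessor):

decoded = processor.decode(output_ids[0], skip_special_tokens=True)
prediction = processor.convert_output_to_plaintext_or_grounded_sequence(decoded)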
@@ -703,7 +699,6 @@ class Maira2Processor(LlavaProcessor):
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
             **kwargs,
         )
-
         if images is not None:
             image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
         else:
@@ -730,6 +725,6 @@ class Maira2Processor(LlavaProcessor):
             sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
             prompt_strings.append(sample)
 
-        output_kwargs.pop("return_mm_token_type_ids")
+        output_kwargs.pop("return_mm_token_type_ids", None)
         text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
-        return BatchFeature(data={**text_inputs, **image_inputs})
+        return BatchFeature(data={**text_inputs, **image_inputs})
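The pop default is the one behavioral fix in this file: with a single argument, dict.pop raises KeyError when the key is absent, which can happen on transformers versions that never populate return_mm_token_type_ids in the merged kwargs. With a default, the call degrades to a no-op:

kwargs = {"padding": True}

try:
    kwargs.pop("return_mm_token_type_ids")  # raises: key not present
except KeyError:
    pass

# With a default, the same call is safe and returns None:
assert kwargs.pop("return_mm_token_type_ids", None) is None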
tokenizer.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
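As with model.safetensors, only the LFS pointer changes here, and the same hypothetical check from the first sketch applies:

# verify_lfs_pointer("tokenizer.model",
#     "9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347", 499723)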