IlyasMoutawwakil HF Staff committed on
Commit
1dd02e1
·
verified ·
1 Parent(s): 9e1c96e

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. model.safetensors +2 -2
  2. processing_maira2.py +15 -20
  3. tokenizer.model +2 -2
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a655577b6a338c3cef1d9cd004220624b5d2ec80f3756b00059e5d1b32e8355
3
- size 132
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3bfb1d6f0ec0f0949cd84df187a2bfb571242c4ca9bdd519c4af512716ae23a
3
+ size 4240896
processing_maira2.py CHANGED
@@ -55,9 +55,9 @@ class Maira2Processor(LlavaProcessor):
55
  self,
56
  image_processor: BaseImageProcessor = None,
57
  tokenizer: PreTrainedTokenizer = None,
58
- patch_size=None,
59
- vision_feature_select_strategy=None,
60
- chat_template=None,
61
  image_token: str = "<image>",
62
  phrase_start_token: str = "<obj>",
63
  phrase_end_token: str = "</obj>",
@@ -301,12 +301,12 @@ class Maira2Processor(LlavaProcessor):
301
  )
302
  messages = [{"content": prompt, "role": "user"}]
303
  if assistant_text is not None:
304
- messages.append(
305
- {"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"}
306
- )
307
  return messages
308
 
309
- def _construct_chat_messages_phrase_grounding(self, phrase: str, assistant_text: str = None):
 
 
310
  """
311
  This function constructs the chat messages for phrase grounding used in the phrase grounding task.
312
 
@@ -331,9 +331,7 @@ class Maira2Processor(LlavaProcessor):
331
  ]
332
  messages = [{"content": prompt, "role": "user"}]
333
  if assistant_text is not None:
334
- messages.append(
335
- {"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"}
336
- )
337
  return messages
338
 
339
  def format_reporting_input(
@@ -390,9 +388,7 @@ class Maira2Processor(LlavaProcessor):
390
  assistant_text=assistant_text,
391
  )
392
  add_generation_prompt = assistant_text is None
393
- text = self.tokenizer.apply_chat_template(
394
- messages, add_generation_prompt=add_generation_prompt, tokenize=False
395
- )
396
  return text, images
397
 
398
  def format_phrase_grounding_input(
@@ -423,9 +419,7 @@ class Maira2Processor(LlavaProcessor):
423
  )
424
  messages = self._construct_chat_messages_phrase_grounding(phrase)
425
  add_generation_prompt = assistant_text is None
426
- text = self.tokenizer.apply_chat_template(
427
- messages, add_generation_prompt=add_generation_prompt, tokenize=False
428
- )
429
  return text, images
430
 
431
  def format_and_preprocess_reporting_input(
@@ -548,7 +542,9 @@ class Maira2Processor(LlavaProcessor):
548
  assert len(text) == 0
549
  return split_text
550
 
551
- def convert_output_to_plaintext_or_grounded_sequence(self, text: str):
 
 
552
  """
553
  This function converts the input text to a grounded sequence by extracting the grounded phrases and bounding
554
  boxes from the text. If the text is plaintext without any grounded phrases, it returns the text as is.
@@ -703,7 +699,6 @@ class Maira2Processor(LlavaProcessor):
703
  tokenizer_init_kwargs=self.tokenizer.init_kwargs,
704
  **kwargs,
705
  )
706
-
707
  if images is not None:
708
  image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
709
  else:
@@ -730,6 +725,6 @@ class Maira2Processor(LlavaProcessor):
730
  sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
731
  prompt_strings.append(sample)
732
 
733
- output_kwargs.pop("return_mm_token_type_ids")
734
  text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
735
- return BatchFeature(data={**text_inputs, **image_inputs})
 
55
  self,
56
  image_processor: BaseImageProcessor = None,
57
  tokenizer: PreTrainedTokenizer = None,
58
+ patch_size = None,
59
+ vision_feature_select_strategy = None,
60
+ chat_template = None,
61
  image_token: str = "<image>",
62
  phrase_start_token: str = "<obj>",
63
  phrase_end_token: str = "</obj>",
 
301
  )
302
  messages = [{"content": prompt, "role": "user"}]
303
  if assistant_text is not None:
304
+ messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
 
 
305
  return messages
306
 
307
+ def _construct_chat_messages_phrase_grounding(
308
+ self, phrase: str, assistant_text: str = None
309
+ ):
310
  """
311
  This function constructs the chat messages for phrase grounding used in the phrase grounding task.
312
 
 
331
  ]
332
  messages = [{"content": prompt, "role": "user"}]
333
  if assistant_text is not None:
334
+ messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
 
 
335
  return messages
336
 
337
  def format_reporting_input(
 
388
  assistant_text=assistant_text,
389
  )
390
  add_generation_prompt = assistant_text is None
391
+ text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
 
 
392
  return text, images
393
 
394
  def format_phrase_grounding_input(
 
419
  )
420
  messages = self._construct_chat_messages_phrase_grounding(phrase)
421
  add_generation_prompt = assistant_text is None
422
+ text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
 
 
423
  return text, images
424
 
425
  def format_and_preprocess_reporting_input(
 
542
  assert len(text) == 0
543
  return split_text
544
 
545
+ def convert_output_to_plaintext_or_grounded_sequence(
546
+ self, text: str
547
+ ):
548
  """
549
  This function converts the input text to a grounded sequence by extracting the grounded phrases and bounding
550
  boxes from the text. If the text is plaintext without any grounded phrases, it returns the text as is.
 
699
  tokenizer_init_kwargs=self.tokenizer.init_kwargs,
700
  **kwargs,
701
  )
 
702
  if images is not None:
703
  image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
704
  else:
 
725
  sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
726
  prompt_strings.append(sample)
727
 
728
+ output_kwargs.pop("return_mm_token_type_ids", None)
729
  text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
730
+ return BatchFeature(data={**text_inputs, **image_inputs})
tokenizer.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a8f238a200be6c23fbba0f9a999ab4fe3c09ca303b29805e68cf6659bfb7d89
3
- size 131
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723