Fix a few bugs

1. Fix a bug when specifying image size
2. Fix a bug where the generated ratio index can be out of range

Files changed (2) hide show

hunyuan.py CHANGED Viewed

@@ -2344,6 +2344,7 @@ class HunyuanImage3ForCausalMM(HunyuanImage3PreTrainedModel, GenerationMixin):
             extra_auto_stops = [tkw.boi_token_id]
         stop_token_id = dict(
             auto=[tkw.eos_token_id] + extra_auto_stops,
             recaption=[tkw.end_recaption_token_id, tkw.end_answer_token_id, tkw.eos_token_id],
             think=[tkw.end_recaption_token_id, tkw.end_answer_token_id, tkw.eos_token_id],
             img_ratio=extra_auto_stops,
@@ -2642,6 +2643,10 @@ class HunyuanImage3ForCausalMM(HunyuanImage3PreTrainedModel, GenerationMixin):
                 prompt=prompt, cot_text=cot_text, bot_task="img_ratio", system_prompt=system_prompt, seed=seed)
             outputs = self._generate(**model_inputs, **kwargs, verbose=verbose)
             ratio_index = outputs[0, -1].item() - self._tkwrapper.ratio_token_offset
             reso = self.image_processor.reso_group[ratio_index]
             image_size = reso.height, reso.width

             extra_auto_stops = [tkw.boi_token_id]
         stop_token_id = dict(
             auto=[tkw.eos_token_id] + extra_auto_stops,
+            image=[tkw.eos_token_id],
             recaption=[tkw.end_recaption_token_id, tkw.end_answer_token_id, tkw.eos_token_id],
             think=[tkw.end_recaption_token_id, tkw.end_answer_token_id, tkw.eos_token_id],
             img_ratio=extra_auto_stops,
                 prompt=prompt, cot_text=cot_text, bot_task="img_ratio", system_prompt=system_prompt, seed=seed)
             outputs = self._generate(**model_inputs, **kwargs, verbose=verbose)
             ratio_index = outputs[0, -1].item() - self._tkwrapper.ratio_token_offset
+            # In some cases, the generated ratio_index is out of range. A valid ratio_index should be in [0, 32].
+            # If ratio_index is out of range, we set it to 16 (i.e., 1:1).
+            if ratio_index < 0 or ratio_index >= len(self.image_processor.reso_group):
+                ratio_index = 16
             reso = self.image_processor.reso_group[ratio_index]
             image_size = reso.height, reso.width

tokenizer_wrapper.py CHANGED Viewed

@@ -1313,6 +1313,7 @@ class TokenizerWrapper(object):
             # We can add special tokens for the bot lastest message according to different tasks
             bot_response_prefix = dict(
                 auto=_bot_prefix,
                 think=f"{_bot_prefix}<think>",
                 recaption=f"{_bot_prefix}<recaption>",
                 img_ratio=f"{_bot_prefix}{answer_prefix}<boi><img_size_{image_base_size}>",
@@ -1345,15 +1346,15 @@ class TokenizerWrapper(object):
             batch_system_prompt: Optional[List[str]] = None,
             batch_cot_text: Optional[List[str]] = None,
             max_length: Optional[int] = None,
-            bot_task: str = "auto",    # auto/think/recaption/img_ratio
             image_base_size: int = 1024,
             sequence_template: str = "pretrain",
             cfg_factor: int = 1,
             add_assistant_prefix: Optional[bool] = None,
             drop_think: bool = False,
     ) -> Dict[str, Any]:
-        assert bot_task in ["auto", "think", "recaption", "img_ratio"], \
-            f"bot_task should be one of ['auto', 'think', 'recaption', 'img_ratio'], but got {bot_task}."
         if batch_message_list is None:
             # Simple text-to-image or text-cot-to-image task

             # We can add special tokens for the bot lastest message according to different tasks
             bot_response_prefix = dict(
                 auto=_bot_prefix,
+                image="",
                 think=f"{_bot_prefix}<think>",
                 recaption=f"{_bot_prefix}<recaption>",
                 img_ratio=f"{_bot_prefix}{answer_prefix}<boi><img_size_{image_base_size}>",
             batch_system_prompt: Optional[List[str]] = None,
             batch_cot_text: Optional[List[str]] = None,
             max_length: Optional[int] = None,
+            bot_task: str = "auto",    # auto/image/think/recaption/img_ratio
             image_base_size: int = 1024,
             sequence_template: str = "pretrain",
             cfg_factor: int = 1,
             add_assistant_prefix: Optional[bool] = None,
             drop_think: bool = False,
     ) -> Dict[str, Any]:
+        assert bot_task in ["image", "auto", "think", "recaption", "img_ratio"], \
+            f"bot_task should be one of ['image', 'auto', 'think', 'recaption', 'img_ratio'], but got {bot_task}."
         if batch_message_list is None:
             # Simple text-to-image or text-cot-to-image task