---
license: apache-2.0
---

Model Usage:

~~~python
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Placeholder: replace with the local directory or Hub repo id of this checkpoint.
model_path = "path/to/model"

# Prompting mode read by get_label(): "think" appends `reason_prompt` to the
# question; any other value sends the question unchanged.
mode = "think"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_path, max_pixels=262144)

# NOTE(review): the prompt says "enclosed within tags" without naming the tags;
# the tag markup may have been stripped when this card was rendered. Confirm the
# exact wording against the prompt used for training before relying on it.
reason_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within tags. During this reasoning process, prioritize analyzing the local regions of the image by leveraging the bounding box coordinates in the format [x_min, y_min, x_max, y_max]. The final answer MUST BE put in \boxed{}. An example is like: reasoning process 1 with [x_min1, y_min1, x_max1, y_max1]; reasoning process 2 with [x_min2, y_min2, x_max2, y_max2] . The answer is: \boxed{answer}."


def get_label(images, content1):
    """Ask the model a question about one or more images and return its reply.

    Args:
        images: Iterable of image references; each one is passed through as the
            "image" field of a chat-message entry and resolved by
            `process_vision_info`.
        content1: The question / instruction text.

    Returns:
        The decoded text of the single generated sequence, with the prompt
        tokens and special tokens removed. Generation samples
        (`do_sample=True`, `temperature=0.6`), so repeated calls may differ.
    """
    # All images come first, followed by one text entry holding the question.
    content_list = [{"type": "image", "image": image_url} for image_url in images]
    if mode == 'think':
        content_list.append({"type": "text", "text": content1 + '\n' + reason_prompt + '\n'})
    else:
        content_list.append({"type": "text", "text": content1})

    messages = [
        {
            "role": "user",
            "content": content_list,
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the device the model was actually placed on (device_map="auto")
    # instead of hard-coding "cuda".
    inputs = inputs.to(model.device)

    # Inference: generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=4096, do_sample=True, temperature=0.6)
    # generate() returns prompt + completion; keep only the newly generated tokens.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return output_text[0]
~~~