# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""

import os
import random
from contextlib import contextmanager
from dataclasses import asdict
from typing import NamedTuple, Optional

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer

from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import convert_image_mode
from vllm.utils import FlexibleArgumentParser


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompts: list[str]
    stop_token_ids: Optional[list[int]] = None
    lora_requests: Optional[list[LoRARequest]] = None


# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
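

# Each `run_<model>` helper below only describes a request: it bundles the
# `EngineArgs` plus matching prompt strings (and optional stop tokens / LoRA
# requests) into a `ModelRequestData`. The sketch below shows how such a bundle
# could be consumed for a single image prompt. It is illustrative only and is
# never called: the function name, question text, sampling values, and the
# "cherry_blossom" asset are arbitrary choices, not requirements of the models.
def _example_usage_sketch() -> None:
    req_data = run_fuyu(["What is the content of this image?"], modality="image")
    llm = LLM(**asdict(req_data.engine_args))
    sampling_params = SamplingParams(
        temperature=0.2,
        max_tokens=64,
        stop_token_ids=req_data.stop_token_ids,
    )
    # Load one of vLLM's bundled demo images and force RGB mode.
    image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
    outputs = llm.generate(
        {
            "prompt": req_data.prompts[0],
            "multi_modal_data": {"image": image},
        },
        sampling_params=sampling_params,
    )
    print(outputs[0].outputs[0].text)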


# Aria
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "rhymes-ai/Aria"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        (
            f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
            "<|im_end|>\n<|im_start|>assistant\n"
        )
        for question in questions
    ]

    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# Aya Vision
def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "CohereForAI/aya-vision-8b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"crop_to_patches": True},
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 # noqa
    prompts = [f"Question: {question} Answer:" for question in questions]
    engine_args = EngineArgs(
        model="Salesforce/blip2-opt-2.7b",
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Chameleon
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"{question}<image>" for question in questions]
    engine_args = EngineArgs(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Command A Vision
def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "CohereLabs/command-a-vision-07-2025"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        tensor_parallel_size=4,
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><|IMG_PATCH|>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Deepseek-VL2
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "deepseek-ai/deepseek-vl2-tiny"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        f"<|User|>: <image>\n{question}\n\n<|Assistant|>:" for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Florence2
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="Isotr0py/Florence-2-tokenizer",
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Fuyu
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"{question}\n" for question in questions]
    engine_args = EngineArgs(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Gemma 3
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        (
            "<bos><start_of_turn>user\n"
            f"<start_of_image>{question}<end_of_turn>\n"
            "<start_of_turn>model\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Gemma3N
def run_gemma3n(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "google/gemma-3n-E2B-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
    )

    prompts = [
        (
            "<start_of_turn>user\n"
            f"<image_soft_token>{question}<end_of_turn>\n"
            "<start_of_turn>model\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
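

# The GLM-4.xV helpers below accept either "image" or "video" as the modality.
# A hedged sketch of how the matching `multi_modal_data` payload could be built
# from vLLM's bundled demo assets; the helper name, asset names, and frame
# count are illustrative choices, not requirements of the models.
def _example_mm_data_sketch(modality: str) -> dict:
    if modality == "image":
        # Single RGB PIL image keyed under "image".
        image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
        return {"image": image}
    # Sampled video frames (numpy arrays) keyed under "video".
    video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
    return {"video": video}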


# GLM-4v
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "zai-org/glm-4v-9b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        (
            "<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
            f"{question}<|assistant|>"
        )
        for question in questions
    ]

    stop_token_ids = [151329, 151336, 151338]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# GLM-4.1V
def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "zai-org/GLM-4.1V-9B-Thinking"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
    )

    if modality == "image":
        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
    elif modality == "video":
        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"

    prompts = [
        (
            "[gMASK]<|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# GLM-4.5V
def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "zai-org/GLM-4.5V"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    if modality == "image":
        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
    elif modality == "video":
        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"

    prompts = [
        (
            "[gMASK]<|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# GLM-4.5V-FP8
def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "zai-org/GLM-4.5V-FP8"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    if modality == "image":
        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
    elif modality == "video":
        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"

    prompts = [
        (
            "[gMASK]<|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# H2OVL-Mississippi
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    messages = [
        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
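

# Unlike the earlier helpers, run_h2ovl above (and run_hyperclovax_seed_vision
# below) let the tokenizer's chat template produce the prompt strings instead of
# hard-coding a per-model format. A minimal sketch of that pattern as a generic
# helper; the function name is hypothetical and not used by this example, and
# any HF repo with a chat template would work the same way.
def _chat_template_prompts_sketch(model_name: str, questions: list[str]) -> list[str]:
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    conversations = [
        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
    ]
    # With tokenize=False this returns plain strings, ready to be used as the
    # `prompts` field of a ModelRequestData.
    return tokenizer.apply_chat_template(
        conversations, tokenize=False, add_generation_prompt=True
    )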


# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
def run_hyperclovax_seed_vision(
    questions: list[str], modality: str
) -> ModelRequestData:
    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192 if modality == "image" else 16384,
        limit_mm_per_prompt={modality: 1},
    )

    messages = list()
    for question in questions:
        if modality == "image":
            """
            ocr: List the words in the image in raster order. Even if the word
                order feels unnatural for reading, the model will handle it as
                long as it follows raster order.
                e.g. "Naver, CLOVA, bigshane"
            lens_keywords: List the entity names in the image.
                e.g. "iPhone"
            lens_local_keywords: List the entity names with quads in the image.
                e.g. "[0.07, 0.21, 0.92, 0.90] iPhone"
            """
            messages.append(
                [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image",
                                "ocr": "",
                                "lens_keywords": "",
                                "lens_local_keywords": "",
                            },
                            {
                                "type": "text",
                                "text": question,
                            },
                        ],
                    }
                ]
            )
        elif modality == "video":
            messages.append(
                [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "video",
                            },
                            {
                                "type": "text",
                                "text": question,
                            },
                        ],
                    }
                ]
            )
        else:
            raise ValueError(f"Unsupported modality: {modality}")

    prompts = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=None,
    )


# Idefics3-8B-Llama3
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        # if you are running out of memory, you can reduce the "longest_edge".
        # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
        mm_processor_kwargs={
            "size": {"longest_edge": 3 * 364},
        },
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        (f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:")
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Intern-S1
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "internlm/Intern-S1"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
    )

    if modality == "image":
        placeholder = ""
    elif modality == "video":
        placeholder = "