"""FastAPI service that runs Qwen2.5-VL (AWQ) inference on uploaded videos."""

import argparse
import logging
import os
import shutil
import time
import uuid

import cv2
import psutil
import torch
import uvicorn
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

from video_processor import extract_frames, FrameSamplingMethod
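# NOTE: video_processor is a local module not shown here. Based on how it is used below,
# extract_frames(path, method, rate) is assumed to return a list of frames as numpy
# arrays (writable with cv2.imwrite), and FrameSamplingMethod is assumed to be an Enum
# of sampling strategies (e.g. CONTENT_AWARE).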

# Command-line options; --model_path selects the checkpoint this process serves.
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", type=str, default="Qwen/Qwen2.5-VL-3B-Instruct-AWQ")
args = parser.parse_args()
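
# Optional GPU telemetry: pynvml (typically provided by the nvidia-ml-py package) exposes
# NVML counters. If the import or NVML initialization fails, the service still runs and
# GPU metrics are simply reported as unavailable.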
try:
    import pynvml

    pynvml.nvmlInit()
    GPU_METRICS_AVAILABLE = True
except Exception:
    # Catching Exception (rather than (ImportError, pynvml.NVMLError)) avoids a NameError:
    # if the import itself fails, the name pynvml does not exist when the except clause is
    # evaluated.
    GPU_METRICS_AVAILABLE = False

# Per-model log directory and a scratch directory for uploads and extracted frames.
LOG_DIR = f"logs/{args.model_path.split('/')[-1]}"
TEMP_VIDEO_DIR = "temp_videos"
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(TEMP_VIDEO_DIR, exist_ok=True)

# One log file per service start, named with the startup timestamp.
log_filename = f"{LOG_DIR}/{time.strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    filename=log_filename,
    filemode="a",
)

app = FastAPI(title="Qwen2.5-VL-AWQ Video Inference Service")

# Load the model and processor once at startup. Loading an AWQ-quantized checkpoint
# through transformers typically requires the autoawq package; float16 matches how AWQ
# weights are normally run.
logging.info(f"Loading model: {args.model_path}")
model_load_start = time.time()

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    args.model_path,
    torch_dtype=torch.float16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(args.model_path)

logging.info(f"Model loaded in {time.time() - model_load_start:.2f} seconds")


@app.post("/video-inference/")
async def video_inference(
    prompt: str = Form(...),
    video_file: UploadFile = File(...),
    sampling_method: FrameSamplingMethod = Form(FrameSamplingMethod.CONTENT_AWARE),
    sampling_rate: int = Form(5),
):
    """Accept a video and a text prompt, run inference, and return the result."""
    request_start_time = time.time()
    request_id = str(uuid.uuid4())
    logging.info(f"[{request_id}] Received new video inference request. Prompt: '{prompt}', Video: '{video_file.filename}'")

    # Reject uploads whose Content-Type does not look like a video.
    if not video_file.content_type.startswith("video/"):
        logging.error(f"[{request_id}] Uploaded file '{video_file.filename}' is not a video. Content-Type: {video_file.content_type}")
        raise HTTPException(status_code=400, detail="Uploaded file is not a video.")

    # Per-request temporary paths: one file for the raw upload, one directory for frames.
    file_extension = os.path.splitext(video_file.filename)[1]
    temp_video_path = os.path.join(TEMP_VIDEO_DIR, f"{request_id}{file_extension}")
    temp_frame_dir = os.path.join(TEMP_VIDEO_DIR, request_id)
    os.makedirs(temp_frame_dir, exist_ok=True)

    try:
        # Persist the upload to disk so the frame extractor can read it by path.
        with open(temp_video_path, "wb") as buffer:
            content = await video_file.read()
            buffer.write(content)

        logging.info(f"[{request_id}] Video saved to temporary file: {temp_video_path}")
        logging.info(f"[{request_id}] Extracting frames using method: {sampling_method.value}, rate/threshold: {sampling_rate}")

        frames = extract_frames(temp_video_path, sampling_method, sampling_rate)
        if not frames:
            logging.error(f"[{request_id}] Could not extract any frames from the video: {temp_video_path}")
            raise HTTPException(status_code=400, detail="Could not extract any frames from the video.")

        logging.info(f"[{request_id}] Extracted {len(frames)} frames successfully. Saving to temporary files...")

        # Write each frame to disk and reference it as a file:// URI; qwen_vl_utils accepts
        # a list of such frame paths as a single "video" input.
        frame_paths = []
        for i, frame in enumerate(frames):
            frame_path = os.path.join(temp_frame_dir, f"frame_{i:04d}.jpg")
            cv2.imwrite(frame_path, frame)
            abs_frame_path = os.path.abspath(frame_path)
            frame_paths.append(f"file://{abs_frame_path}")

        logging.info(f"[{request_id}] {len(frame_paths)} frames saved to {temp_frame_dir}")

        # Build a single-turn chat message: the frame list is the "video" entry, followed
        # by the user's text prompt.
        message_content = [
            {
                "type": "video",
                "video": frame_paths,
                "resized_height": 280,
                "resized_width": 420,
            },
            {"type": "text", "text": prompt},
        ]
        messages = [{"role": "user", "content": message_content}]

        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        logging.info(f"[{request_id}] Video processing finished.")
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(model.device)
        logging.info(f"[{request_id}] Input tokens: {len(inputs.input_ids[0])}")

        logging.info(f"[{request_id}] Sending request to model '{args.model_path}'...")
        # Prime psutil's CPU counters so the post-inference readings cover the inference interval.
        psutil.cpu_percent(interval=None)
        psutil.cpu_percent(interval=None, percpu=True)
        inference_start_time = time.time()

        generated_ids = model.generate(**inputs, max_new_tokens=512)

        inference_end_time = time.time()
        cpu_usage = psutil.cpu_percent(interval=None)
        cpu_core_utilization = psutil.cpu_percent(interval=None, percpu=True)

        # Strip the prompt tokens from each sequence so only newly generated tokens are decoded.
        generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
        output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

        logging.info(f"[{request_id}] Received response from model successfully.")

        total_request_processing_time = time.time() - request_start_time
        model_inference_latency = inference_end_time - inference_start_time
        num_generated_tokens = len(generated_ids_trimmed[0])
        tokens_per_second = num_generated_tokens / model_inference_latency if model_inference_latency > 0 else 0

        cpu_freq_info = psutil.cpu_freq()
        cpu_freq = cpu_freq_info.current if cpu_freq_info else "N/A"

        gpu_metrics_log = "Not available (pynvml not installed or NVIDIA driver issue)"
        if GPU_METRICS_AVAILABLE:
            try:
                # Report utilization and memory for GPU 0 only.
                handle = pynvml.nvmlDeviceGetHandleByIndex(0)
                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                gpu_metrics_log = f"GPU Utilization: {utilization.gpu}%, Memory Used: {memory_info.used / (1024**2):.2f}/{memory_info.total / (1024**2):.2f} MB"
            except pynvml.NVMLError as e:
                gpu_metrics_log = f"Could not retrieve GPU metrics: {e}"

        log_message = f"""
[{request_id}] --- Performance & System Metrics ---
[Request Info]
- Prompt: "{prompt}"
- Model: {args.model_path}
- Sampling Method: {sampling_method.value}, Rate: {sampling_rate}, Frames: {len(frames)}
[Latency & Throughput]
- Tokens/Second: {tokens_per_second:.2f}
- Latency (Model Inference): {model_inference_latency:.4f} s
- Latency (Total Request): {total_request_processing_time:.4f} s
- Throughput (for this request): {1/total_request_processing_time if total_request_processing_time > 0 else float('inf'):.2f} req/s
[Token Usage]
- Prompt Tokens: {len(inputs.input_ids[0])}
- Response Tokens: {num_generated_tokens}
[System Usage at Completion]
- CPU Usage: {cpu_usage}%
- CPU Core Utilization: {cpu_core_utilization}%
- CPU Frequency: {cpu_freq} MHz
- GPU: {gpu_metrics_log}
[Response]
- {output_text}
----------------------------------------------------"""
        logging.info(log_message)

        return JSONResponse(content={"response": output_text})
    except HTTPException:
        # Propagate intentional HTTP errors (such as the 400s above) without rewrapping them as 500s.
        raise
    except Exception as e:
        logging.error(f"[{request_id}] An error occurred during processing: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"An error occurred during processing: {str(e)}")
    finally:
        # Always clean up the temporary upload and the extracted frames.
        if os.path.exists(temp_video_path):
            os.remove(temp_video_path)
            logging.info(f"[{request_id}] Cleaned up temporary file: {temp_video_path}")
        if os.path.exists(temp_frame_dir):
            shutil.rmtree(temp_frame_dir)
            logging.info(f"[{request_id}] Cleaned up temporary frame directory: {temp_frame_dir}")
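
# Note: argparse reads sys.argv at import time, so this module is meant to be started
# directly (python <this file> --model_path ...); launching it through the uvicorn CLI
# would hand uvicorn's own arguments to argparse.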
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8010)
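
# Example client call (a sketch; assumes the service is running locally and that
# sample.mp4 exists). Passing type=video/mp4 ensures the upload satisfies the endpoint's
# Content-Type check. The endpoint responds with JSON of the form
# {"response": "<model output>"}; sampling_method and sampling_rate are optional form
# fields (defaults: content-aware sampling, rate/threshold 5).
#
#   curl -X POST http://localhost:8010/video-inference/ \
#        -F "prompt=Describe what happens in this video." \
#        -F "video_file=@sample.mp4;type=video/mp4"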