File size: 8,220 Bytes
f8ba0eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import os
import uuid
import base64
from typing import List
import time
import psutil
import ollama
import uvicorn
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse

# GPU metrics are optional: they need both the pynvml package and a working
# NVIDIA driver. The import and the NVML initialisation are guarded
# separately because `pynvml.NVMLError` cannot be referenced in an `except`
# clause when the import itself failed (the name would be unbound and the
# handler would raise NameError instead of falling back).
try:
    import pynvml
except ImportError:
    pynvml = None
    GPU_METRICS_AVAILABLE = False
else:
    try:
        pynvml.nvmlInit()
    except pynvml.NVMLError:
        # Package present but NVML could not be initialised.
        GPU_METRICS_AVAILABLE = False
    else:
        GPU_METRICS_AVAILABLE = True

from video_processor import extract_frames, FrameSamplingMethod, encode_frames_to_base64

import logging
import argparse

# Command-line configuration. `parse_known_args` is used instead of
# `parse_args` so that importing this module from another launcher (for
# example `uvicorn module:app ...` or a test runner) does not abort the
# process when sys.argv carries arguments that belong to that launcher.
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, default="qwen2.5vl-int4:latest")
args, _unknown_args = parser.parse_known_args()

# Per-model log directory, created up front so the log file can be opened.
os.makedirs(f'logs/{args.model_name}', exist_ok=True)

# Create the FastAPI application.
app = FastAPI(
    title="Qwen2.5-VL Video Inference Service",
)

# Scratch directory that holds uploaded videos while they are processed.
TEMP_VIDEO_DIR = "temp_videos"
os.makedirs(TEMP_VIDEO_DIR, exist_ok=True)

# Each process start writes to its own log file, named after the current
# timestamp and placed under the per-model log directory.
log_filename = f"logs/{args.model_name}/{time.strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
    filename=log_filename,
    filemode='a',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

def _collect_gpu_metrics() -> str:
    """Return a one-line summary of GPU 0 utilisation and memory for logging.

    Falls back to an explanatory message when GPU metrics were flagged as
    unavailable at import time or when the NVML query itself fails.
    """
    if not GPU_METRICS_AVAILABLE:
        return "Not available (pynvml not installed or NVIDIA driver issue)"
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    except pynvml.NVMLError as e:
        return f"Could not retrieve GPU metrics: {e}"
    return (
        f"GPU Utilization: {utilization.gpu}%, "
        f"Memory Used: {memory_info.used / (1024**2):.2f}/{memory_info.total / (1024**2):.2f} MB"
    )


@app.post("/video-inference/")
async def video_inference(
    prompt: str = Form(...),
    video_file: UploadFile = File(...),
    sampling_method: FrameSamplingMethod = Form(FrameSamplingMethod.CONTENT_AWARE),
    sampling_rate: int = Form(5),
):
    """
    Receive a video and a text prompt, run inference, and return the result.

    - prompt: the user's question.
    - video_file: the uploaded video file.
    - sampling_method: frame sampling method ('uniform' or 'content_aware').
    - sampling_rate: sampling rate or threshold.

    Returns a JSON object ``{"response": <model answer>}``.

    Raises:
        HTTPException 400: the upload is not a video, or no frames could be
            extracted from it.
        HTTPException 503: the Ollama call failed.
        HTTPException 500: any other processing error.
    """
    request_start_time = time.time()
    request_id = str(uuid.uuid4())
    logging.info(f"[{request_id}] Received new video inference request. Prompt: '{prompt}', Video: '{video_file.filename}'")

    # Validate the uploaded file type. The content type may be missing when
    # the client does not send one, so treat None as "not a video".
    content_type = video_file.content_type or ""
    if not content_type.startswith("video/"):
        logging.error(f"[{request_id}] Uploaded file '{video_file.filename}' is not a video. Content-Type: {video_file.content_type}")
        raise HTTPException(status_code=400, detail="Uploaded file is not a video.")

    # Save the upload under a request-unique name; only the extension of the
    # client-supplied filename is reused (the filename may be absent).
    file_extension = os.path.splitext(video_file.filename or "")[1]
    temp_video_path = os.path.join(TEMP_VIDEO_DIR, f"{request_id}{file_extension}")

    try:
        # 1. Save the video and extract frames.
        with open(temp_video_path, "wb") as buffer:
            content = await video_file.read()
            buffer.write(content)
        logging.info(f"[{request_id}] Video saved to temporary file: {temp_video_path}")

        logging.info(f"[{request_id}] Extracting frames using method: {sampling_method.value}, rate/threshold: {sampling_rate}")
        frames = extract_frames(temp_video_path, sampling_method, sampling_rate)
        if not frames:
            logging.error(f"[{request_id}] Could not extract any frames from the video: {temp_video_path}")
            raise HTTPException(status_code=400, detail="Could not extract any frames from the video.")

        logging.info(f"[{request_id}] Extracted {len(frames)} frames successfully.")

        # 2. Encode the frames as Base64.
        base64_frames = encode_frames_to_base64(frames)
        logging.info(f"[{request_id}] Encoded {len(base64_frames)} frames to Base64.")

        # 3. Build the video-oriented prompt.
        final_prompt = (
            f"请分析以下从视频中按时间顺序提取的图像帧序列。"
            f"根据这些帧回答用户的问题。\n\n"
            f"用户问题: \"{prompt}\""
        )

        # 4. Call the Ollama API.
        logging.info(f"[{request_id}] Sending request to Ollama model '{args.model_name}'...")

        # Prime the CPU counters so the readings taken after the call cover
        # the period of the Ollama call.
        psutil.cpu_percent(interval=None)
        psutil.cpu_percent(interval=None, percpu=True)

        # NOTE(review): ollama.chat is called synchronously inside this async
        # handler, so the event loop is blocked for the duration of the call.
        ollama_start_time = time.time()
        try:
            response = ollama.chat(
                model=args.model_name,  # model selected via --model_name
                messages=[
                    {
                        'role': 'user',
                        'content': final_prompt,
                        'images': base64_frames,
                    }
                ]
            )
        except Exception as ollama_error:
            # Report Ollama failures separately from other processing errors.
            logging.error(f"[{request_id}] Ollama inference failed: {str(ollama_error)}", exc_info=True)
            raise HTTPException(status_code=503, detail=f"Ollama inference failed: {str(ollama_error)}") from ollama_error
        ollama_end_time = time.time()

        # Read CPU usage immediately after the Ollama call.
        cpu_usage = psutil.cpu_percent(interval=None)
        cpu_core_utilization = psutil.cpu_percent(interval=None, percpu=True)

        logging.info(f"[{request_id}] Received response from Ollama successfully.")

        # --- Metric computation ---
        total_request_processing_time = time.time() - request_start_time
        ollama_total_latency = ollama_end_time - ollama_start_time
        request_throughput = (
            1 / total_request_processing_time
            if total_request_processing_time > 0
            else float('inf')
        )

        eval_count = response.get('eval_count', 0)
        eval_duration_ns = response.get('eval_duration', 1)
        tokens_per_second = eval_count / (eval_duration_ns / 1e9) if eval_duration_ns > 0 else 0

        # The prompt evaluation duration (nanoseconds) is reported as the
        # first-token latency.
        prompt_eval_duration_ns = response.get('prompt_eval_duration', 0)
        first_token_latency = prompt_eval_duration_ns / 1e9

        cpu_freq_info = psutil.cpu_freq()
        cpu_freq = cpu_freq_info.current if cpu_freq_info else 'N/A'

        gpu_metrics_log = _collect_gpu_metrics()

        # --- Log formatting ---
        log_message = f"""
[{request_id}] --- Performance & System Metrics ---
  [Request Info]
    - Prompt: "{prompt}"
    - Model: {response.get('model')}
  [Latency & Throughput]
    - Tokens/Second: {tokens_per_second:.2f}
    - Latency (First Token): {first_token_latency:.4f} s
    - Latency (Ollama Total): {ollama_total_latency:.4f} s
    - Batch Processing Latency (Total Request Time): {total_request_processing_time:.4f} s
    - Throughput (for this request): {request_throughput:.2f} req/s
  [Token Usage]
    - Prompt Tokens: {response.get('prompt_eval_count', 'N/A')}
    - Response Tokens: {eval_count}
  [System Usage at Completion]
    - CPU Usage: {cpu_usage}%
    - CPU Core Utilization: {cpu_core_utilization}%
    - CPU Frequency: {cpu_freq} MHz
    - GPU: {gpu_metrics_log}
  [Response]
    - {response['message']['content']}
----------------------------------------------------"""
        logging.info(log_message)

        # Return the model's response content.
        return JSONResponse(content={"response": response['message']['content']})

    except HTTPException:
        # Already carries the intended status code (400 / 503) and was logged
        # where it was raised; do not rewrite it as a generic 500.
        raise
    except Exception as e:
        logging.error(f"[{request_id}] An error occurred during processing: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"An error occurred during processing: {str(e)}") from e
    finally:
        # Best-effort cleanup of the temporary file: a failure here must not
        # replace the response (or the original error) of the request.
        try:
            os.remove(temp_video_path)
        except FileNotFoundError:
            pass
        except OSError as cleanup_error:
            logging.warning(f"[{request_id}] Could not remove temporary file {temp_video_path}: {cleanup_error}")
        else:
            logging.info(f"[{request_id}] Cleaned up temporary file: {temp_video_path}")

if __name__ == "__main__":
    # Direct execution: serve the API on all interfaces, port 8008.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8008,
    )