---
license: apache-2.0
base_model:
- Qwen/Qwen2.5-7B-Instruct
language:
- en
- zh
datasets:
- HuggingFaceFV/finevideo
---

# Ola-7B

## Model Summary

The Ola-7B model is developed by researchers from Tencent, Tsinghua University, and Nanyang Technological University.
Built on the Qwen2.5 language model, it is trained on text, image, video, and audio data with a context window of 32K tokens. It accepts images, videos, text, and audio as input and produces text as output.

Ola offers an on-demand solution to seamlessly and efficiently process visual inputs with arbitrary spatial sizes and temporal lengths.

- **Repository:** https://github.com/Ola-Omni/Ola
- **Languages:** English, Chinese
- **Paper:** https://huggingface.co/papers/2502.04328

## Use

1. Download the speech encoders from https://huggingface.co/THUdyh/Ola_speech_encoders.
2. Replace the speech-encoder paths in `config.json` with the local paths of the downloaded checkpoints (see the sketch below).
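
Below is a minimal, untested sketch of steps 1–2. It assumes the standard `huggingface_hub` download API; the config key and checkpoint filename (`speech_encoder`, `large-v3.pt`) are illustrative placeholders, so check them against the actual `config.json` and the downloaded files.

```python
# Hypothetical helper for steps 1-2: download the speech encoders and
# point the Ola-7B config.json at the local copies.
import json
from huggingface_hub import snapshot_download

# Step 1: fetch the speech encoder checkpoints.
encoder_dir = snapshot_download("THUdyh/Ola_speech_encoders")

# Step 2: rewrite the encoder path(s) in config.json.
config_path = "/path/to/Ola-7B/config.json"  # placeholder path
with open(config_path) as f:
    config = json.load(f)

config["speech_encoder"] = f"{encoder_dir}/large-v3.pt"  # illustrative key and filename
with open(config_path, "w") as f:
    json.dump(config, f, indent=2)
```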

We provide a simple generation example below. For more details, please refer to our [GitHub repo](https://github.com/Ola-Omni/Ola).

```python
# Resolution and loading flags read by the Ola preprocessing code.
import os
os.environ['LOWRES_RESIZE'] = '384x32'
os.environ['HIGHRES_BASE'] = '0x32'
os.environ['VIDEO_RESIZE'] = "0x64"
os.environ['VIDEO_MAXRES'] = "480"
os.environ['VIDEO_MINRES'] = "288"
os.environ['MAXRES'] = '1536'
os.environ['MINRES'] = '0'
os.environ['REGIONAL_POOL'] = '2x'
os.environ['FORCE_NO_DOWNSAMPLE'] = '1'
os.environ['LOAD_VISION_EARLY'] = '1'
os.environ['SKIP_LOAD_VIT'] = '1'

import gradio as gr
import torch
import re
from decord import VideoReader, cpu
from PIL import Image
import numpy as np
import transformers
import moviepy.editor as mp
from typing import Dict, Optional, Sequence, List
import librosa
import whisper
from ola.conversation import conv_templates, SeparatorStyle
from ola.model.builder import load_pretrained_model
from ola.utils import disable_torch_init
from ola.datasets.preprocess import tokenizer_image_token, tokenizer_speech_image_token, tokenizer_speech_question_image_token
from ola.mm_utils import get_model_name_from_path, KeywordsStoppingCriteria, process_anyres_video, process_anyres_highres_image_genli
from ola.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN

# Load the Ola-7B checkpoint; set model_path to the local path of this model.
model_path = ""
tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None)
model = model.to('cuda').eval()
model = model.bfloat16()

USE_SPEECH = False
cur_dir = os.path.dirname(os.path.abspath(__file__))


def load_audio(audio_file_name):
    # Read an audio file at 16 kHz, split it into 30-second (480000-sample) chunks,
    # and return 128-bin log-mel spectrograms plus the raw waveforms.
    speech_wav, samplerate = librosa.load(audio_file_name, sr=16000)
    if len(speech_wav.shape) > 1:
        speech_wav = speech_wav[:, 0]
    speech_wav = speech_wav.astype(np.float32)
    CHUNK_LIM = 480000
    SAMPLE_RATE = 16000
    speechs = []
    speech_wavs = []

    if len(speech_wav) <= CHUNK_LIM:
        speech = whisper.pad_or_trim(speech_wav)
        speech_wav = whisper.pad_or_trim(speech_wav)
        speechs.append(speech)
        speech_wavs.append(torch.from_numpy(speech_wav).unsqueeze(0))
    else:
        for i in range(0, len(speech_wav), CHUNK_LIM):
            chunk = speech_wav[i : i + CHUNK_LIM]
            if len(chunk) < CHUNK_LIM:
                chunk = whisper.pad_or_trim(chunk)
            speechs.append(chunk)
            speech_wavs.append(torch.from_numpy(chunk).unsqueeze(0))
    mels = []
    for chunk in speechs:
        chunk = whisper.log_mel_spectrogram(chunk, n_mels=128).permute(1, 0).unsqueeze(0)
        mels.append(chunk)

    mels = torch.cat(mels, dim=0)
    speech_wavs = torch.cat(speech_wavs, dim=0)
    if mels.shape[0] > 25:
        mels = mels[:25]
        speech_wavs = speech_wavs[:25]

    speech_length = torch.LongTensor([mels.shape[1]] * mels.shape[0])
    speech_chunks = torch.LongTensor([mels.shape[0]])
    return mels, speech_length, speech_chunks, speech_wavs


def extract_audio(videos_file_path):
    # Extract the audio track from a video file.
    my_clip = mp.VideoFileClip(videos_file_path)
    return my_clip.audio


def ola_inference(multimodal, audio_path):
    # Run one round of inference on an image or video (plus optional audio) and return the generated text.
    visual, text = multimodal["files"][0], multimodal["text"]
    if visual.endswith("image2.png"):
        # Demo shortcut: map the placeholder image2.png to a bundled example video.
        modality = "video"
        visual = f"{cur_dir}/case/case1.mp4"
    if visual.endswith(".mp4"):
        modality = "video"
    else:
        modality = "image"

    # If a separate audio file is provided, use it as the speech input;
    # otherwise, for video inputs, parse the audio track of the video itself.
    if audio_path:
        USE_SPEECH = True
    elif modality == "video":
        USE_SPEECH = True
    else:
        USE_SPEECH = False

    speechs = []
    speech_lengths = []
    speech_wavs = []
    speech_chunks = []
    if modality == "video":
        # Uniformly sample 64 frames from the video.
        vr = VideoReader(visual, ctx=cpu(0))
        total_frame_num = len(vr)
        fps = round(vr.get_avg_fps())
        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, 64, dtype=int)
        frame_idx = uniform_sampled_frames.tolist()
        spare_frames = vr.get_batch(frame_idx).asnumpy()
        video = [Image.fromarray(frame) for frame in spare_frames]
    else:
        image = [Image.open(visual)]
        image_sizes = [image[0].size]

    if USE_SPEECH and audio_path:
        speech, speech_length, speech_chunk, speech_wav = load_audio(audio_path)
        speechs.append(speech.bfloat16().to('cuda'))
        speech_lengths.append(speech_length.to('cuda'))
        speech_chunks.append(speech_chunk.to('cuda'))
        speech_wavs.append(speech_wav.to('cuda'))
        print('load audio')
    elif USE_SPEECH and not audio_path:
        # Parse the audio track of the video.
        audio = extract_audio(visual)
        audio.write_audiofile("./video_audio.wav")
        video_audio_path = './video_audio.wav'
        speech, speech_length, speech_chunk, speech_wav = load_audio(video_audio_path)
        speechs.append(speech.bfloat16().to('cuda'))
        speech_lengths.append(speech_length.to('cuda'))
        speech_chunks.append(speech_chunk.to('cuda'))
        speech_wavs.append(speech_wav.to('cuda'))
    else:
        # No speech input: feed zero-filled placeholders.
        speechs = [torch.zeros(1, 3000, 128).bfloat16().to('cuda')]
        speech_lengths = [torch.LongTensor([3000]).to('cuda')]
        speech_wavs = [torch.zeros([1, 480000]).to('cuda')]
        speech_chunks = [torch.LongTensor([1]).to('cuda')]

    # Build the conversation prompt with the appropriate image/speech placeholder tokens.
    conv_mode = "qwen_1_5"
    if text:
        qs = text
    else:
        qs = ''
    if USE_SPEECH and audio_path:
        qs = DEFAULT_IMAGE_TOKEN + "\n" + "User's question in speech: " + DEFAULT_SPEECH_TOKEN + '\n'
    elif USE_SPEECH:
        qs = DEFAULT_SPEECH_TOKEN + DEFAULT_IMAGE_TOKEN + "\n" + qs
    else:
        qs = DEFAULT_IMAGE_TOKEN + "\n" + qs

    conv = conv_templates[conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
    if USE_SPEECH and audio_path:
        input_ids = tokenizer_speech_question_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
    elif USE_SPEECH:
        input_ids = tokenizer_speech_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
    else:
        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')

    if modality == "video":
        video_processed = []
        for idx, frame in enumerate(video):
            image_processor.do_resize = False
            image_processor.do_center_crop = False
            frame = process_anyres_video(frame, image_processor)

            if frame_idx is not None and idx in frame_idx:
                video_processed.append(frame.unsqueeze(0))
            elif frame_idx is None:
                video_processed.append(frame.unsqueeze(0))

        if frame_idx is None:
            frame_idx = np.arange(0, len(video_processed), dtype=int).tolist()

        video_processed = torch.cat(video_processed, dim=0).bfloat16().to("cuda")
        video_processed = (video_processed, video_processed)

        video_data = (video_processed, (384, 384), "video")
    else:
        image_processor.do_resize = False
        image_processor.do_center_crop = False
        image_tensor, image_highres_tensor = [], []
        for visual in image:
            image_tensor_, image_highres_tensor_ = process_anyres_highres_image_genli(visual, image_processor)
            image_tensor.append(image_tensor_)
            image_highres_tensor.append(image_highres_tensor_)
        if all(x.shape == image_tensor[0].shape for x in image_tensor):
            image_tensor = torch.stack(image_tensor, dim=0)
        if all(x.shape == image_highres_tensor[0].shape for x in image_highres_tensor):
            image_highres_tensor = torch.stack(image_highres_tensor, dim=0)
        if type(image_tensor) is list:
            image_tensor = [_image.bfloat16().to("cuda") for _image in image_tensor]
        else:
            image_tensor = image_tensor.bfloat16().to("cuda")
        if type(image_highres_tensor) is list:
            image_highres_tensor = [_image.bfloat16().to("cuda") for _image in image_highres_tensor]
        else:
            image_highres_tensor = image_highres_tensor.bfloat16().to("cuda")

    pad_token_ids = 151643

    attention_masks = input_ids.ne(pad_token_ids).long().to('cuda')
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    gen_kwargs = {}

    if "max_new_tokens" not in gen_kwargs:
        gen_kwargs["max_new_tokens"] = 1024
    if "temperature" not in gen_kwargs:
        gen_kwargs["temperature"] = 0.2
    if "top_p" not in gen_kwargs:
        gen_kwargs["top_p"] = None
    if "num_beams" not in gen_kwargs:
        gen_kwargs["num_beams"] = 1

    with torch.inference_mode():
        if modality == "video":
            output_ids = model.generate(
                inputs=input_ids,
                images=video_data[0][0],
                images_highres=video_data[0][1],
                modalities=video_data[2],
                speech=speechs,
                speech_lengths=speech_lengths,
                speech_chunks=speech_chunks,
                speech_wav=speech_wavs,
                attention_mask=attention_masks,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
                do_sample=True if gen_kwargs["temperature"] > 0 else False,
                temperature=gen_kwargs["temperature"],
                top_p=gen_kwargs["top_p"],
                num_beams=gen_kwargs["num_beams"],
                max_new_tokens=gen_kwargs["max_new_tokens"],
            )
        else:
            output_ids = model.generate(
                inputs=input_ids,
                images=image_tensor,
                images_highres=image_highres_tensor,
                image_sizes=image_sizes,
                modalities=['image'],
                speech=speechs,
                speech_lengths=speech_lengths,
                speech_chunks=speech_chunks,
                speech_wav=speech_wavs,
                attention_mask=attention_masks,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
                do_sample=True if gen_kwargs["temperature"] > 0 else False,
                temperature=gen_kwargs["temperature"],
                top_p=gen_kwargs["top_p"],
                num_beams=gen_kwargs["num_beams"],
                max_new_tokens=gen_kwargs["max_new_tokens"],
            )

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
    outputs = outputs.strip()
    if outputs.endswith(stop_str):
        outputs = outputs[:-len(stop_str)]
    outputs = outputs.strip()
    return outputs, None
```
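
For reference, a hypothetical call to the `ola_inference` helper defined above might look like the following; the file paths are placeholders, and the `multimodal` dict simply mirrors the `{"files": [...], "text": ...}` format the function expects.

```python
# Hypothetical usage of ola_inference; replace the paths with real local files.
if __name__ == "__main__":
    # Image + text question (no separate audio file).
    answer, _ = ola_inference(
        {"files": ["./examples/cat.png"], "text": "Describe this image."},
        audio_path=None,
    )
    print(answer)

    # Video question: the audio track is extracted from the video automatically.
    answer, _ = ola_inference(
        {"files": ["./examples/demo.mp4"], "text": "What happens in this video?"},
        audio_path=None,
    )
    print(answer)
```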

### Model Architecture

- **Architecture:** Pre-trained [Oryx-ViT](https://huggingface.co/THUdyh/Oryx-ViT) + Qwen2.5-7B

- **Data:** a mixture of more than 5M image, video, and audio samples, trained over 3 stages

- **Precision:** BFloat16

#### Hardware & Software

- **Hardware:** 64 × NVIDIA A100 GPUs

- **Orchestration:** Hugging Face Trainer

- **Code:** PyTorch

## Citation

```bibtex
@article{liu2025ola,
  title={Ola: Pushing the Frontiers of Omni-Modal Language Model with Progressive Modality Alignment},
  author={Liu, Zuyan and Dong, Yuhao and Wang, Jiahui and Liu, Ziwei and Hu, Winston and Lu, Jiwen and Rao, Yongming},
  journal={arXiv preprint arXiv:2502.04328},
  year={2025}
}
```