Update README.md

The LLaVAction-0.5B model is trained on EPIC-KITCHENS-100-MQA and is based on the Qwen2 language model.

- **Point of Contact**: [Mackenzie Mathis](https://people.epfl.ch/mackenzie.mathis)
- **Languages**: English

## Usage

### Intended use
The model was trained on EPIC-KITCHENS-100-MQA. It is intended for use on videos similar to EPIC-KITCHENS-100.

**Feel free to share your generations in the Community tab!**

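Since the model targets EPIC-KITCHENS-style egocentric clips, here is a purely illustrative sanity check (not part of the original card; the file name is a placeholder) that inspects a clip's duration and frame rate with decord, the same reader used in the generation example below:

```python
from decord import VideoReader, cpu

vr = VideoReader("your_clip.mp4", ctx=cpu(0))  # hypothetical path
duration_s = len(vr) / vr.get_avg_fps()
print(f"{len(vr)} frames at {vr.get_avg_fps():.1f} fps -> {duration_s:.1f} s")
```
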
### Generation
We provide a simple generation example below. For more details, please refer to our GitHub repository.

```python
!pip install llavaction

from llavaction.model.builder import load_pretrained_model
from llavaction.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llavaction.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
import copy
import torch
import warnings
# ... (the remaining imports, including the conversation templates, are omitted in this excerpt)
from decord import VideoReader, cpu
import numpy as np
warnings.filterwarnings("ignore")

# Your video (an egocentric viewpoint is assumed)
video_path = "XXXX"

# These are the prompts we trained with, but you can test others:
perspective_prompt = "You are seeing this video from egocentric view and you are the person. Your hands are sometimes interacting with objects. What action are you doing?"
task_prompt = "Describe in details what you see from the video frames."

def load_video(video_path, max_frames_num, fps=1, force_sample=False):
    if max_frames_num == 0:
        return np.zeros((1, 336, 336, 3))
    # ... (frame-index selection omitted in this excerpt; see the full function on GitHub)
    spare_frames = vr.get_batch(frame_idx).asnumpy()
    return spare_frames, frame_time, video_time

pretrained = "MLAdaptiveIntelligence/LLaVAction-0.5B"
model_name = "llava_qwen"
device = "cuda"
device_map = "auto"
tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map)  # Add anything else you want to pass in llava_model_args
model.eval()

max_frames_num = 64
video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().to(torch.bfloat16)
video = [video]
conv_template = "qwen_1_5"  # Make sure you use the correct chat template for different models
time_instruction = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. "
question = DEFAULT_IMAGE_TOKEN + f"\n{time_instruction}\n{perspective_prompt} {task_prompt}"
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], question)
# ... (prompt tokenization, model.generate, and decoding are omitted in this excerpt; see GitHub)
print(text_outputs)
```
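The middle of `load_video` is omitted in the excerpt above. As a rough sketch of what uniform frame sampling with decord typically looks like (the helper name `sample_frames_uniformly` and the exact return format are our assumptions, not the repository's code):

```python
import numpy as np
from decord import VideoReader, cpu

def sample_frames_uniformly(video_path, max_frames_num=64):
    """Illustrative stand-in for load_video: uniformly pick max_frames_num frames."""
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    total_frames = len(vr)
    avg_fps = vr.get_avg_fps()
    video_time = total_frames / avg_fps  # clip length in seconds
    frame_idx = np.linspace(0, total_frames - 1, max_frames_num, dtype=int)
    frame_time = ",".join(f"{i / avg_fps:.2f}s" for i in frame_idx)
    frames = vr.get_batch(frame_idx).asnumpy()  # (N, H, W, 3) uint8 array
    return frames, frame_time, video_time
```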
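The steps between `conv.append_message` and `print(text_outputs)` are also omitted in the excerpt. A minimal sketch of how the example likely continues, assuming the LLaVA-NeXT-style API that `llavaction` builds on (treat the `model.generate` arguments as assumptions and check the GitHub repository for the authoritative version):

```python
# Sketch only: reuses conv, tokenizer, model, video, and device from the excerpt above.
conv.append_message(conv.roles[1], None)  # leave the assistant turn empty
prompt_question = conv.get_prompt()

input_ids = tokenizer_image_token(
    prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
).unsqueeze(0).to(device)

output_ids = model.generate(
    input_ids,
    images=video,          # list holding one (frames, C, H, W) bfloat16 tensor
    modalities=["video"],  # assumption: video-modality flag as in LLaVA-NeXT
    do_sample=False,
    max_new_tokens=512,
)
text_outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(text_outputs)
```
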
## Training

See Ye et al. (2025) for training details.

### Model
- **Architecture**: SO400M + Qwen2