mwmathis committed (verified)
Commit 6985a4f · Parent(s): f7e786c

Update README.md

Files changed (1):
  1. README.md +14 -7
README.md CHANGED
@@ -26,20 +26,18 @@ The LLaVAction-0.5B model is trained on EPIC-KITCHENS-100-MQA, based on Qwen2 la
 - **Point of Contact**: [Mackenzie Mathis](https://people.epfl.ch/mackenzie.mathis)
 - **Languages**: English
 -
-## Use
+## Usage
 
 ### Intended use
 The model was trained on EPIC-KITCHENS-100-MQA. It is intended for use on videos similar to EPIC-KITCHENS-100.
 
 
-**Feel free to share your generations in the Community tab!**
-
-
 ### Generation
 We provide a simple generation example for using our model. For more details, please refer to our GitHub repository.
 
 ```python
 !pip install llavaction
+
 from llavaction.model.builder import load_pretrained_model
 from llavaction.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
 from llavaction.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
@@ -53,6 +51,15 @@ import warnings
 from decord import VideoReader, cpu
 import numpy as np
 warnings.filterwarnings("ignore")
+
+# Your video (the model assumes an egocentric viewpoint)
+video_path = "XXXX"
+
+# These are the prompts we trained with, but you can test others:
+perspective_prompt = "You are seeing this video from egocentric view and you are the person. Your hands are sometimes interacting with objects. What action are you doing?"
+task_prompt = "Describe in details what you see from the video frames."
+
+
 def load_video(video_path, max_frames_num,fps=1,force_sample=False):
     if max_frames_num == 0:
         return np.zeros((1, 336, 336, 3))
@@ -69,21 +76,19 @@ def load_video(video_path, max_frames_num,fps=1,force_sample=False):
     spare_frames = vr.get_batch(frame_idx).asnumpy()
     # import pdb;pdb.set_trace()
     return spare_frames,frame_time,video_time
+
 pretrained = "MLAdaptiveIntelligence/LLaVAction-0.5B"
 model_name = "llava_qwen"
 device = "cuda"
 device_map = "auto"
 tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map)  # Add any other arguments you want to pass via llava_model_args
 model.eval()
-video_path = "XXXX"
 max_frames_num = 64
 video,frame_time,video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
 video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().to(torch.bfloat16)
 video = [video]
 conv_template = "qwen_1_5"  # Make sure you use the correct chat template for different models
 time_instruction = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. "
-perspective_prompt = "You are seeing this video from egocentric view and you are the person. Your hands are sometimes interacting with objects. What action are you doing?"
-task_prompt = "Describe in details what you see from the video frames."
 question = DEFAULT_IMAGE_TOKEN + f"\n{time_instruction}\n{perspective_prompt} {task_prompt}"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], question)
@@ -105,6 +110,8 @@ print(text_outputs)
 
 ## Training
 
+See Ye et al., 2025.
+
 
 ### Model
 - **Architecture**: SO400M + Qwen2
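
The diff elides the middle of `load_video`, so the sampling logic is not visible in the hunks. The sketch below is only an illustration of how a decord-based uniform frame sampler of this shape typically works, assuming LLaVA-Video-style loading; the helper name `sample_frames_uniformly` and the timestamp formatting are hypothetical and not the repository's actual code.

```python
# Illustrative only: a uniform frame sampler in the spirit of load_video above.
# The real implementation lives in the llavaction GitHub repository.
import numpy as np
from decord import VideoReader, cpu

def sample_frames_uniformly(video_path, max_frames_num=64, fps=1, force_sample=False):
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    total_frames = len(vr)
    video_time = total_frames / vr.get_avg_fps()          # clip length in seconds
    step = max(1, round(vr.get_avg_fps() / fps))          # roughly `fps` sampled frames per second
    frame_idx = list(range(0, total_frames, step))
    if len(frame_idx) > max_frames_num or force_sample:   # cap at / force exactly max_frames_num frames
        frame_idx = np.linspace(0, total_frames - 1, max_frames_num, dtype=int).tolist()
    frame_time = ",".join(f"{i / vr.get_avg_fps():.2f}s" for i in frame_idx)
    frames = vr.get_batch(frame_idx).asnumpy()            # (num_frames, H, W, 3) uint8 array
    return frames, frame_time, video_time
```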
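
The hunks above stop right after the user turn is appended to the conversation, and the final hunk only shows `print(text_outputs)`. For readers who want to run the example end to end, here is a minimal, hedged sketch of the missing tokenize/generate/decode step; it follows the usual LLaVA-NeXT-style inference pattern, and the exact `generate` keyword arguments (e.g. `modalities=["video"]`, `max_new_tokens`) are assumptions, so treat the GitHub repository as authoritative.

```python
# Hedged sketch of the generation step elided by the diff above.
# Assumes a LLaVA-NeXT-style API; exact arguments may differ in llavaction.
import torch

conv.append_message(conv.roles[1], None)   # leave the assistant turn empty
prompt = conv.get_prompt()

# Tokenize, replacing the image placeholder with IMAGE_TOKEN_INDEX
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX,
                                  return_tensors="pt").unsqueeze(0).to(device)

with torch.no_grad():
    output_ids = model.generate(
        input_ids,
        images=video,            # the list holding the preprocessed frame tensor
        modalities=["video"],    # assumed flag for video inputs
        do_sample=False,
        max_new_tokens=512,
    )

text_outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(text_outputs)
```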