Update README.md

The LLaVAction-0.5B model is trained on EPIC-KITCHENS-100-MQA and is based on the Qwen2 language model.

- **Point of Contact**: [Mackenzie Mathis](https://people.epfl.ch/mackenzie.mathis)
- **Languages**: English

## Usage

### Intended use
The model was trained on EPIC-KITCHENS-100-MQA. It is intended for use on videos similar to EPIC-KITCHENS-100.

**Feel free to share your generations in the Community tab!**

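Since the model targets EPIC-KITCHENS-style egocentric clips, here is a purely illustrative sanity check (not part of the original card; the file name is a placeholder) that inspects a clip's duration and frame rate with decord, the same reader used in the generation example below:

```python
from decord import VideoReader, cpu

vr = VideoReader("your_clip.mp4", ctx=cpu(0))  # hypothetical path
duration_s = len(vr) / vr.get_avg_fps()
print(f"{len(vr)} frames at {vr.get_avg_fps():.1f} fps -> {duration_s:.1f} s")
```
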
### Generation
We provide a simple generation example below. For more details, please refer to our GitHub repository.

```python
!pip install llavaction

from llavaction.model.builder import load_pretrained_model
from llavaction.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llavaction.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
import copy
import torch
import warnings
# ... (the remaining imports, including the conversation templates, are omitted in this excerpt)
from decord import VideoReader, cpu
import numpy as np
warnings.filterwarnings("ignore")

# Your video (an egocentric viewpoint is assumed)
video_path = "XXXX"

# These are the prompts we trained with, but you can test others:
perspective_prompt = "You are seeing this video from egocentric view and you are the person. Your hands are sometimes interacting with objects. What action are you doing?"
task_prompt = "Describe in details what you see from the video frames."

def load_video(video_path, max_frames_num, fps=1, force_sample=False):
    if max_frames_num == 0:
        return np.zeros((1, 336, 336, 3))
    # ... (frame-index selection omitted in this excerpt; see the full function on GitHub)
    spare_frames = vr.get_batch(frame_idx).asnumpy()
    return spare_frames, frame_time, video_time

pretrained = "MLAdaptiveIntelligence/LLaVAction-0.5B"
model_name = "llava_qwen"
device = "cuda"
device_map = "auto"
tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map)  # Add anything else you want to pass in llava_model_args
model.eval()

max_frames_num = 64
video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().to(torch.bfloat16)
video = [video]
conv_template = "qwen_1_5"  # Make sure you use the correct chat template for different models
time_instruction = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. "
question = DEFAULT_IMAGE_TOKEN + f"\n{time_instruction}\n{perspective_prompt} {task_prompt}"
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], question)
# ... (prompt tokenization, model.generate, and decoding are omitted in this excerpt; see GitHub)
print(text_outputs)
```
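The middle of `load_video` is omitted in the excerpt above. As a rough sketch of what uniform frame sampling with decord typically looks like (the helper name `sample_frames_uniformly` and the exact return format are our assumptions, not the repository's code):

```python
import numpy as np
from decord import VideoReader, cpu

def sample_frames_uniformly(video_path, max_frames_num=64):
    """Illustrative stand-in for load_video: uniformly pick max_frames_num frames."""
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    total_frames = len(vr)
    avg_fps = vr.get_avg_fps()
    video_time = total_frames / avg_fps  # clip length in seconds
    frame_idx = np.linspace(0, total_frames - 1, max_frames_num, dtype=int)
    frame_time = ",".join(f"{i / avg_fps:.2f}s" for i in frame_idx)
    frames = vr.get_batch(frame_idx).asnumpy()  # (N, H, W, 3) uint8 array
    return frames, frame_time, video_time
```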
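The steps between `conv.append_message` and `print(text_outputs)` are also omitted in the excerpt. A minimal sketch of how the example likely continues, assuming the LLaVA-NeXT-style API that `llavaction` builds on (treat the `model.generate` arguments as assumptions and check the GitHub repository for the authoritative version):

```python
# Sketch only: reuses conv, tokenizer, model, video, and device from the excerpt above.
conv.append_message(conv.roles[1], None)  # leave the assistant turn empty
prompt_question = conv.get_prompt()

input_ids = tokenizer_image_token(
    prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
).unsqueeze(0).to(device)

output_ids = model.generate(
    input_ids,
    images=video,          # list holding one (frames, C, H, W) bfloat16 tensor
    modalities=["video"],  # assumption: video-modality flag as in LLaVA-NeXT
    do_sample=False,
    max_new_tokens=512,
)
text_outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(text_outputs)
```
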
## Training

See Ye et al. (2025) for training details.

### Model
- **Architecture**: SO400M + Qwen2