diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..52373fe24473b1aa44333d318f578ae6bf04b49b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md index 1f48c32e9df46945f0ac7142bfda001a1beb4747..b669260acff5b42bccc03a6c58a4d5c36101c6a1 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,74 @@ ---- -license: cc-by-nc-4.0 ---- +--- +language: en +license: cc-by-nc-4.0 +tags: +- soccer +- video-qa +- question-answering +- vision-language +- multimodal +- sports-analysis +library_name: transformers +pipeline_tag: video-text-to-text +--- + +# Soccer QA 4B - Soccer Video Question Answering Model + +**⚠️ RESEARCH USE ONLY - NON-COMMERCIAL LICENSE** + +Soccer QA 4B is a unified video question-answering model specifically designed for soccer video understanding. + +## Model Description + +This model can answer questions about soccer videos by analyzing visual content and generating natural language responses. + +**Example:** +- **Input**: Video + "What unfolded during the game in the video?" +- **Output**: "During the game, there was a foul committed by a player from the yellow-jerseyed team, leading to a yellow card being issued..." 
+ +## Architecture +- **Vision Encoder**: DWT-VJEPA2-based video encoder (vit_giant, 1408 dim) +- **Text Model**: LLaMA 3.2-3B with LoRA fine-tuning +- **Vision-Text Bridge**: Learned projection layer (1408 → 2048 → 3072) +- **Specialization**: Fine-tuned on soccer video QA data + +## Usage (Helper functions are in repo) + +```python +from soccer_qa_inference import SoccerQA + +model = SoccerQA("/path/to/model") +answer = model.ask("video.mp4", "Was this a Foul?", max_tokens=45) +print(answer) +``` + +## Model Details +- **Parameters**: ~4B total +- **Input**: Video files (16 frames, 256x256) + text questions +- **Output**: Natural language answers +- **Domain**: Soccer/football video analysis +- **Context**: Handles complex game situations, player actions, fouls, etc. + +## Training Data +- Soccer video clips with question-answer pairs +- Covers various game situations: fouls, shots, saves, player actions +- Annotated with detailed descriptions of game events + +## Limitations +- Research use only, no commercial applications +- Optimized specifically for soccer content +- May not generalize well to other sports or video domains +- Requires high-quality video input for best results + +## License +CC-BY-NC-4.0 - Research use only, no commercial applications permitted. 
+ +## Citation +```bibtex +@misc{soccer-qa-4b-2025, + title={Soccer QA 4B: Video Question Answering for Soccer Analysis}, + author={Varun Kodathala, Sports Vision}, + year={2025}, + note={Research model for soccer video understanding} +} +``` \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a51866840beedee6d50940d53a2af7d07d4b4b8f --- /dev/null +++ b/config.json @@ -0,0 +1,26 @@ +{ + "model_type": "soccer_qa_4b", + "architectures": [ + "SoccerQA4BModel" + ], + "vision_dim": 1408, + "projection_dim": 2048, + "text_dim": 3072, + "img_size": 256, + "num_frames": 16, + "max_length": 256, + "temperature": 0.7, + "imagenet_mean": [ + 0.485, + 0.456, + 0.406 + ], + "imagenet_std": [ + 0.229, + 0.224, + 0.225 + ], + "hidden_size": 3072, + "vocab_size": 128257, + "model_description": "Soccer video question answering model" +} \ No newline at end of file diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..32e06cad1fb762d4ed84cd6d5fa44c1b2a9d48a0 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f268918412af5ab623937ca776bf5c91eb26f04c3ff7e4cc257598aeda61b7cc +size 18512562808 diff --git a/soccer_qa_inference.py b/soccer_qa_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..afb5c2ffbf1997c255a95f40ae61f1395f41274c --- /dev/null +++ b/soccer_qa_inference.py @@ -0,0 +1,304 @@ +#!/usr/bin/env python3 +""" +Soccer QA Inference - Single Class, Clean API + +Usage in Colab: + from soccer_qa_inference import SoccerQA + model = SoccerQA("soccer-qa-3b-unified") + answer = model.ask("video.mp4", "What happened?", max_tokens=128) +""" + +import os +import json +import torch +import torch.nn as nn +import numpy as np +from safetensors.torch import load_file +from transformers import AutoTokenizer, AutoModelForCausalLM +from decord import 
VideoReader
+
+# Project-local modules: these are imported from the repository's own `src`
+# package, so the script must be run from a location where `src` is importable.
+import src.datasets.utils.video.transforms as video_transforms
+import src.datasets.utils.video.volume_transforms as volume_transforms
+from src.models.vision_transformer import vit_giant_rope
+
+# Per-channel mean / std handed to video_transforms.Normalize in
+# build_video_transform() below.
+IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
+
+def get_video(fname, num_frames=16):
+    """Load a video and sample `num_frames` frames evenly across its length.
+
+    The file is opened with decord's ``VideoReader``. Frame indices are
+    ``np.linspace(0, len(vr) - 1, num=num_frames)`` cast to ``int64``, so the
+    first and last frame are always included and fractional positions are
+    truncated.
+
+    Args:
+        fname: Path (or other source accepted by ``VideoReader``) of the video.
+        num_frames: Number of frames to sample.
+
+    Returns:
+        The sampled frames as returned by ``get_batch(...).asnumpy()``.
+        NOTE(review): presumably a (num_frames, H, W, C) array -- confirm
+        against the decord documentation.
+    """
+    vr = VideoReader(fname)
+    # When the clip has fewer than `num_frames` frames, truncating the evenly
+    # spaced positions yields repeated indices, i.e. duplicated frames.
+    frame_idx = np.linspace(0, len(vr) - 1, num=num_frames).astype(np.int64)
+    video = vr.get_batch(frame_idx).asnumpy()
+    return video
+
+def build_video_transform(img_size):
+    """Build the evaluation-time preprocessing pipeline for a clip.
+
+    The pipeline is, in order: ``Resize`` to ``short_side_size`` with bilinear
+    interpolation, ``CenterCrop`` to ``img_size`` x ``img_size``,
+    ``ClipToTensor``, then ``Normalize`` with IMAGENET_DEFAULT_MEAN / _STD.
+
+    Args:
+        img_size: Side length of the square crop fed to the model.
+
+    Returns:
+        The composed ``video_transforms.Compose`` object.
+    """
+    # Resize target is img_size scaled by 256/224 and truncated by int(),
+    # e.g. img_size=224 -> 256 and img_size=256 -> 292.
+    short_side_size = int(256.0 / 224 * img_size)
+    eval_transform = video_transforms.Compose([
+        video_transforms.Resize(short_side_size, interpolation="bilinear"),
+        video_transforms.CenterCrop(size=(img_size, img_size)),
+        volume_transforms.ClipToTensor(),
+        video_transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+    ])
+    return eval_transform
+
+class SoccerQA:
+    """Single class for Soccer QA inference - Clean Colab API"""
+
+    # NOTE(review): the default `model_dir` is an absolute path on a developer
+    # machine; callers elsewhere should pass `model_dir` explicitly.
+    def __init__(self, model_dir="/home/varunkodathala/jepa_llm/soccer_pretrain/soccer-qa-3b-unified"):
+        """Initialize the Soccer QA model.
+
+        Runs the whole setup sequence: read config, load tokenizer, build the
+        vision / text / projection modules, load weights, and build the video
+        transform. Progress is reported with print().
+
+        Args:
+            model_dir: Path to merged model directory
+        """
+        self.model_dir = model_dir
+        # Use the GPU when torch reports one, otherwise fall back to CPU.
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        print(f"🚀 Loading Soccer QA from {model_dir}...")
+
+        # Load config and tokenizer
+        # (_load_config sets self.img_size, which is read further down)
+        self._load_config()
+        self._load_tokenizer()
+
+        # Build models
+        self._build_vision_model()
+        self._build_text_model()
+        self._build_projection()
+
+        # Load all weights
+        self._load_weights()
+
+        # Build video transforms
+        self.video_transform = build_video_transform(self.img_size)
+
+        print("✅ Soccer QA ready!")
+
+    def _load_config(self):
+        """Load model configuration"""
+        config_path = os.path.join(self.model_dir, "config.json")
+        with open(config_path, 'r') as f:
+            self.config =
json.load(f) + + self.vision_dim = self.config["vision_dim"] # 1408 + self.projection_dim = self.config["projection_dim"] # 2048 + self.text_dim = self.config["text_dim"] # 3072 + self.img_size = self.config["img_size"] # 256 + self.num_frames = self.config["num_frames"] # 16 + + def _load_tokenizer(self): + """Load tokenizer with