#!/usr/bin/env python3
"""
Soccer QA Inference - Single Class, Clean API

Usage in Colab:
    from soccer_qa_inference import SoccerQA

    model = SoccerQA("soccer-qa-3b-unified")
    answer = model.ask("video.mp4", "What happened?", max_tokens=128)
"""

import os
import json

import torch
import torch.nn as nn
import numpy as np
from safetensors.torch import load_file
from transformers import AutoTokenizer, AutoModelForCausalLM
from decord import VideoReader

# Project-local modules (video preprocessing and the vision backbone).
import src.datasets.utils.video.transforms as video_transforms
import src.datasets.utils.video.volume_transforms as volume_transforms
from src.models.vision_transformer import vit_giant_rope

# Per-channel statistics passed to the Normalize transform.
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)


def get_video(fname, num_frames: int = 16) -> np.ndarray:
    """Load a video and sample ``num_frames`` frames uniformly.

    Frame indices are evenly spaced between the first and the last frame
    (both inclusive) and truncated to integers, so a video with fewer than
    ``num_frames`` frames yields repeated indices.

    Args:
        fname: Video location handed to ``decord.VideoReader``.
        num_frames: Number of frames to sample; must be at least 1.

    Returns:
        The sampled frames as returned by ``get_batch(...).asnumpy()``.
        NOTE(review): presumably shaped (num_frames, H, W, C) - confirm
        against the decord documentation.

    Raises:
        ValueError: If ``num_frames`` is smaller than 1 or the video
            contains no frames.
    """
    if num_frames < 1:
        raise ValueError(f"num_frames must be >= 1, got {num_frames}")

    vr = VideoReader(fname)
    total_frames = len(vr)
    if total_frames == 0:
        # Without this guard linspace(0, -1) would silently produce the
        # indices 0 ... -1 and the failure would surface inside the decoder.
        raise ValueError(f"Video contains no frames: {fname}")

    frame_idx = np.linspace(0, total_frames - 1, num=num_frames).astype(np.int64)
    return vr.get_batch(frame_idx).asnumpy()


def build_video_transform(img_size):
    """Build the evaluation-time video preprocessing transform.

    The pipeline is Resize -> CenterCrop -> ClipToTensor -> Normalize, using
    the project-local ``video_transforms`` / ``volume_transforms`` modules.

    Args:
        img_size: Side length of the square centre crop.

    Returns:
        The composed transform.
    """
    # Resize target keeps the 256:224 resize-to-crop ratio for any img_size.
    short_side_size = int(256.0 / 224 * img_size)
    return video_transforms.Compose([
        video_transforms.Resize(short_side_size, interpolation="bilinear"),
        video_transforms.CenterCrop(size=(img_size, img_size)),
        volume_transforms.ClipToTensor(),
        video_transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
    ])


class SoccerQA:
    """Single class for Soccer QA inference - Clean Colab API"""

    # NOTE(review): the default ``model_dir`` is a machine-specific absolute
    # path; callers on any other machine have to pass ``model_dir`` explicitly.
    def __init__(
        self,
        model_dir="/home/varunkodathala/jepa_llm/soccer_pretrain/soccer-qa-3b-unified",
    ):
        """Initialize Soccer QA model.

        Args:
            model_dir: Path to merged model directory.
        """
        self.model_dir = model_dir
        # Use the GPU when one is available, otherwise fall back to CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        print(f"🚀 Loading Soccer QA from {model_dir}...")

        # Load config and tokenizer
        self._load_config()
        self._load_tokenizer()

        # Build models
        self._build_vision_model()
        self._build_text_model()
self._build_projection() # Load all weights self._load_weights() # Build video transforms self.video_transform = build_video_transform(self.img_size) print("✅ Soccer QA ready!") def _load_config(self): """Load model configuration""" config_path = os.path.join(self.model_dir, "config.json") with open(config_path, 'r') as f: self.config = json.load(f) self.vision_dim = self.config["vision_dim"] # 1408 self.projection_dim = self.config["projection_dim"] # 2048 self.text_dim = self.config["text_dim"] # 3072 self.img_size = self.config["img_size"] # 256 self.num_frames = self.config["num_frames"] # 16 def _load_tokenizer(self): """Load tokenizer with