import cv2
import torch
from PIL import Image
import numpy as np
import os
import shutil
import gradio as gr
import mediapipe as mp
from transformers import (
    LlavaNextVideoProcessor,
    LlavaNextVideoForConditionalGeneration,
    BitsAndBytesConfig,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"

# 4-bit NF4 quantization so the 7B model fits on a single consumer GPU.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    device_map="auto",
)
processor = LlavaNextVideoProcessor.from_pretrained(model_id)

# MediaPipe hand detector; static_image_mode=True treats every frame independently.
mpHands = mp.solutions.hands
hands = mpHands.Hands(static_image_mode=True, max_num_hands=2)
mpDraw = mp.solutions.drawing_utils


def track_hand_position(frame):
    """Detect hands in a BGR frame and report which half (Region A or B) each one is in."""
    height, width = frame.shape[:2]
    mid_width = width // 2
    imgRGB = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # MediaPipe expects RGB input
    results = hands.process(imgRGB)
    hand_positions = []
    if results.multi_hand_landmarks:
        for handLms in results.multi_hand_landmarks:
            # Average the x coordinates of all 21 landmarks to locate the hand's center.
            cx_values = [int(lm.x * width) for lm in handLms.landmark]
            avg_cx = sum(cx_values) / len(cx_values)
            if avg_cx < mid_width:
                hand_positions.append("Region A")
            else:
                hand_positions.append("Region B")
            mpDraw.draw_landmarks(frame, handLms, mpHands.HAND_CONNECTIONS)
    return frame, hand_positions


def add_regions_to_frame(frame, frame_idx, output_dir):
    """Tint the two halves of the frame, label them, run hand tracking, and save the result."""
    height, width = frame.shape[:2]
    mid_width = width // 2
    overlay = frame.copy()
    cv2.rectangle(overlay, (0, 0), (mid_width, height), (255, 0, 0), -1)      # left half: blue (BGR)
    cv2.rectangle(overlay, (mid_width, 0), (width, height), (0, 255, 0), -1)  # right half: green
    frame = cv2.addWeighted(frame, 0.7, overlay, 0.3, 0)
    cv2.line(frame, (mid_width, 0), (mid_width, height), (255, 255, 255), 3)
    cv2.putText(frame, "Region A", (mid_width // 4, height // 2),
                cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 3)
    cv2.putText(frame, "Region B", (mid_width + mid_width // 4, height // 2),
                cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 3)
    # Hand detection runs on the annotated copy, so landmarks are drawn over the region overlay.
    tracked_frame, hand_pos = track_hand_position(frame.copy())
    cv2.imwrite(f"{output_dir}/frame_{frame_idx:03d}.jpg", tracked_frame)
    return tracked_frame, hand_pos


def sample_frames(video_path, num_frames):
    """Uniformly sample up to num_frames frames, annotate each, and save them to disk."""
    output_dir = "/tmp/processed_frames"
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    video = cv2.VideoCapture(video_path)
    if not video.isOpened():
        raise ValueError(f"Could not open video file: {video_path}")

    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = max(1, total_frames // num_frames)
    frames = []
    frame_count = 0
    hand_tracking_log = []

    for i in range(total_frames):
        ret, frame = video.read()
        if not ret:
            break  # end of stream; subsequent reads would also fail
        if i % interval == 0 and len(frames) < num_frames:
            processed_frame, hand_positions = add_regions_to_frame(frame, frame_count, output_dir)
            pil_img = Image.fromarray(cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB))
            frames.append(pil_img)
            hand_tracking_log.append(f"Frame {frame_count}: {hand_positions}")
            frame_count += 1

    video.release()
    frame_paths = [f"{output_dir}/frame_{i:03d}.jpg" for i in range(frame_count)]
    return frames, frame_paths, hand_tracking_log


def analyze_video(video_path):
    """Classify a QC video as PASSED, FAILED, or CHEATING using LLaVA-NeXT-Video."""
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": """Analyze this gas pipe quality control video and classify it into one category:
1) PASSED - pipe taken from Region A, dipped in water, no bubbles, moved to Region B.
Example: Person picks a pipe from the left side, tests it in water, no bubbles are seen, places it on the right side.
2) FAILED - pipe tested in water, bubbles visible.
Example: Person dips a pipe in water, bubbles appear indicating a leak, pipe is rejected.
3) CHEATING - pipe moved from A to B without testing.
Example: Person takes a pipe from the left and places it directly on the right without a water test.
Give the classification and a brief reason."""},
                {"type": "video"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    video_frames, frame_paths, hand_log = sample_frames(video_path, 8)

    # return_tensors="pt" is required so the inputs are tensors that can be moved to the GPU.
    inputs = processor(text=prompt, videos=video_frames, padding=True, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    output = model.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.1,
        pad_token_id=processor.tokenizer.eos_token_id,
    )
    result = processor.decode(output[0][2:], skip_special_tokens=True)

    hand_tracking_summary = "\n".join(hand_log)
    return frame_paths, result, hand_tracking_summary


examples = [
    ["07.mp4"],
    ["07_part1.mp4"],
    ["07_part2.mp4"],
]

iface = gr.Interface(
    fn=analyze_video,
    inputs=gr.Video(),
    outputs=[
        gr.Gallery(label="Processed Frames"),
        gr.Textbox(label="LLM Analysis", lines=10),
        gr.Textbox(label="Hand Tracking Log", lines=15),
    ],
    title="Gas Pipe Quality Control Analyzer",
    examples=examples,
    cache_examples=False,
)

iface.launch(share=True)
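
# A minimal headless sanity check (a sketch, not part of the app): it assumes one
# of the example clips listed above sits next to this script. Since launch()
# blocks the main thread, run this instead of the Gradio UI by commenting out
# iface.launch(...) above and uncommenting the lines below:
#
#   frame_paths, analysis, hand_log = analyze_video("07.mp4")
#   print(analysis)
#   print(hand_log)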