"""Video analysis utilities: download, transcribe, summarize, and index per user.

Importing this module loads the Faster-Whisper model and the Hugging Face
summarization pipeline, so the import itself can be slow and can raise.
"""

import logging
import os
import tempfile

import requests
import torch
from faster_whisper import WhisperModel
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from transformers import pipeline

from app.db import SessionLocal  # Assuming SQLAlchemy session
from app.models import User  # Assuming SQLAlchemy User model

# Setup logger
logger = logging.getLogger("app.utils.whisper_llm")
logger.setLevel(logging.INFO)
if not logger.handlers:
    # Guarded so that re-importing the module does not stack duplicate handlers.
    handler = logging.StreamHandler()
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)


# Whisper Model Initialization
def get_whisper_model():
    """Load the "base" Faster-Whisper model on CUDA if available, else on CPU.

    Uses ``float32`` compute on CUDA and ``int8`` compute on CPU.

    Returns:
        The loaded ``WhisperModel`` instance.

    Raises:
        Exception: Whatever ``WhisperModel`` raises while loading; the error is
            logged and re-raised unchanged.
    """
    if torch.cuda.is_available():
        device = "cuda"
        compute_type = "float32"
        logger.info("✅ GPU detected: Using CUDA with float32 compute")
    else:
        device = "cpu"
        compute_type = "int8"
        logger.warning("⚠️ GPU not available: Falling back to CPU with int8 compute")

    try:
        model = WhisperModel("base", device=device, compute_type=compute_type)
        logger.info(f"📦 Loaded Faster-Whisper model on {device} with compute_type={compute_type}")
        return model
    except Exception as e:
        logger.error(f"❌ Failed to load Whisper model: {e}")
        raise


whisper_model = get_whisper_model()

# Summarizer
try:
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    logger.info("📦 Hugging Face summarizer pipeline loaded successfully.")
except Exception as e:
    logger.error(f"❌ Failed to load summarization pipeline: {e}")
    raise


# Chunked summarization
def summarize_in_chunks(text, chunk_size=800, overlap=100):
    """Summarize ``text`` window by window and join the partial summaries.

    The text is split on whitespace into words. Consecutive windows of
    ``chunk_size`` words start ``chunk_size - overlap`` words apart, so
    neighbouring windows share ``overlap`` words.

    Args:
        text: The text to summarize.
        chunk_size: Number of words per window.
        overlap: Number of words shared between consecutive windows; must be
            smaller than ``chunk_size``.

    Returns:
        The per-window summaries joined with single spaces. Windows whose
        summarization fails are logged and skipped, so the result may be
        partial, or empty if ``text`` has no words.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``.
    """
    step = chunk_size - overlap
    if step <= 0:
        # A zero step makes range() fail with an unrelated message and a
        # negative one silently yields no windows at all; reject both up front.
        raise ValueError(
            f"chunk_size ({chunk_size}) must be greater than overlap ({overlap})"
        )

    words = text.split()
    summaries = []
    for start in range(0, len(words), step):
        chunk = " ".join(words[start:start + chunk_size])
        try:
            result = summarizer(chunk, max_length=256, min_length=64, do_sample=False)
            summaries.append(result[0]['summary_text'])
        except Exception as e:
            # Best effort: one failing window must not discard the others.
            logger.error(f"❌ Chunk summarization failed: {e}")
        if start + chunk_size >= len(words):
            # This window already reached the last word. Any further window
            # would contain only overlap words that were just summarized.
            break
    return " ".join(summaries)


# 🧠 Get user from Neon DB
def get_user(user_id: int):
    """Return the ``User`` row whose id is ``user_id``, or ``None`` if absent.

    Opens a session from ``SessionLocal`` and always closes it before
    returning.
    """
    db = SessionLocal()
    try:
        return db.query(User).filter(User.id == user_id).first()
    finally:
        db.close()


def _remove_temp_file(path):
    """Delete ``path``, logging (not raising) if the removal fails."""
    try:
        os.remove(path)
    except OSError as e:
        logger.warning(f"⚠️ Could not remove temp file {path}: {e}")


def _download_video(video_url):
    """Stream ``video_url`` into a temporary ``.mp4`` file and return its path.

    On success the caller owns the file and must delete it. On failure the
    partially written file is removed before the error is re-raised.
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
            tmp_path = tmp.name
            with requests.get(video_url, stream=True, timeout=60) as response:
                response.raise_for_status()
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        tmp.write(chunk)
        logger.info(f"🎞️ Video saved to temp file: {tmp_path}")
        return tmp_path
    except Exception as e:
        logger.error(f"❌ Failed to download video: {e}")
        if tmp_path is not None:
            _remove_temp_file(tmp_path)
        raise


def _transcribe(media_path):
    """Transcribe ``media_path`` with the module-level Whisper model."""
    try:
        logger.info("🧠 Transcribing audio with Faster-Whisper...")
        segments, _ = whisper_model.transcribe(media_path)
        # Join inside the try so that errors raised while consuming `segments`
        # are logged here, and so the text is complete before the caller
        # deletes the media file.
        text = " ".join(segment.text for segment in segments)
        logger.info(f"✅ Transcription completed. Length: {len(text)} characters.")
        return text
    except Exception as e:
        logger.error(f"❌ Transcription failed: {e}")
        raise


def _update_user_vector_store(user_id, summary):
    """Add ``summary`` to the user's on-disk FAISS index, creating it if needed."""
    try:
        logger.info("📊 Creating/updating FAISS vector store for user...")
        documents = [Document(page_content=summary)]
        embeddings = OpenAIEmbeddings()

        user_vector_path = f"vector_store/user_{user_id}"
        os.makedirs(user_vector_path, exist_ok=True)

        if os.path.exists(os.path.join(user_vector_path, "index.faiss")):
            # NOTE(review): some langchain_community releases appear to require
            # allow_dangerous_deserialization=True here -- confirm against the
            # pinned version before changing this call.
            vector_store = FAISS.load_local(user_vector_path, embeddings)
            vector_store.add_documents(documents)
        else:
            vector_store = FAISS.from_documents(documents, embeddings)

        vector_store.save_local(user_vector_path)
        logger.info(f"✅ Vector store saved at: {user_vector_path}")
    except Exception as e:
        logger.error(f"❌ Failed to create vector store: {e}")
        raise


# ⚡ Core Analyzer Function with per-user FAISS ingestion
def analyze(video_url: str, user_id: int):
    """Download a video, transcribe and summarize it, and index the summary.

    Args:
        video_url: URL the video is downloaded from.
        user_id: Id of the user whose FAISS index receives the summary.

    Returns:
        A ``(text, summary)`` tuple: the full transcript and its summary.

    Raises:
        ValueError: If no user with ``user_id`` exists.
        Exception: Any error from the download, transcription, summarization,
            or vector-store step; each is logged and re-raised unchanged.
    """
    # Verify user exists
    user = get_user(user_id)
    if not user:
        raise ValueError(f"❌ User with ID {user_id} not found in Neon DB")

    logger.info(f"📥 Starting video analysis for user: {user.email} (ID: {user.id})")

    tmp_path = _download_video(video_url)
    try:
        text = _transcribe(tmp_path)
    finally:
        # The file was created with delete=False, so it must be removed
        # explicitly or every call leaves a video behind on disk.
        _remove_temp_file(tmp_path)

    try:
        logger.info("📝 Summarizing transcript with Hugging Face model...")
        summary = summarize_in_chunks(text)
        logger.info("✅ Summarization completed.")
    except Exception as e:
        logger.error(f"❌ Summarization failed: {e}")
        raise

    _update_user_vector_store(user_id, summary)

    return text, summary