Spaces:

peace2024
/

dubswayAgenticV2

Building

App Files Files Community

peace2024 committed on Jun 23

Commit

e27e999

1 Parent(s): d7a468c

saving changes

Browse files

Files changed (11) hide show

.gitignore +49 -0
app/agent/custom_chatbot.py +77 -0
app/auth.py +1 -1
app/main.py +4 -2
app/pdf_ingestion.py +47 -0
app/utils/pdf.py +21 -4
app/utils/whisper_llm.py +134 -14
requirements.txt +30 -0
worker/__init__.py +0 -0
worker/daemon.py +70 -0
worker/gpu_test.py +24 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,49 @@

+# Python cache
+__pycache__/
+*.py[cod]
+*.so
+*.pyd
+# Virtual environments
+env/
+venv/
+myenv/
+.myenv/
+*.env
+# Jupyter/IPython
+.ipynb_checkpoints/
+# VSCode & PyCharm
+.vscode/
+.idea/
+# Logs and temp
+*.log
+*.tmp
+*.temp
+*.bak
+*.swp
+*.DS_Store
+# Model caches
+*.pt
+*.ckpt
+*.onnx
+*.pb
+*.tflite
+*.pkl
+# Hugging Face transformers cache
+~/.cache/
+huggingface/
+.cache/
+# Output PDFs, reports
+*.pdf
+*.out
+# Data files
+*.csv
+*.tsv
+*.js*

app/agent/custom_chatbot.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import os
+import logging
+from fastapi import APIRouter, HTTPException
+from pydantic.v1 import BaseModel, EmailStr
+from dotenv import load_dotenv
+from langchain_groq import ChatGroq
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_core.prompts import ChatPromptTemplate
+from langchain.chains import create_retrieval_chain
+# Load environment variables
+load_dotenv()
+router = APIRouter()
+logger = logging.getLogger("custom_chatbot")
+# LangChain LLM setup
+groq_api_key = os.getenv("GROQ_API_KEY")
+llm = ChatGroq(groq_api_key=groq_api_key, model_name="Llama3-8b-8192")
+# Prompt template
+prompt_template = ChatPromptTemplate.from_template("""
+Answer the question based only on the provided context.
+<context>
+{context}
+</context>
+Question: {input}
+""")
+# Input schema with user_id
+class ChatRequest(BaseModel):
+    query: str
+    user_id: int
+# Load vector store for a specific user
def load_user_vector_store(user_id: int):
    """Load the persisted FAISS index that belongs to one user.

    Args:
        user_id: Numeric user ID; the index is read from
            ``vector_store/user_<user_id>``.

    Returns:
        The FAISS vector store loaded from the user's directory.

    Raises:
        FileNotFoundError: If ``index.faiss`` does not exist for this user.
    """
    user_path = f"vector_store/user_{user_id}"
    index_file = os.path.join(user_path, "index.faiss")
    if not os.path.exists(index_file):
        raise FileNotFoundError(f"No vector store found for user {user_id}")
    embeddings = OpenAIEmbeddings()
    # Current langchain_community releases refuse to unpickle the docstore
    # unless the caller opts in explicitly; without the flag load_local raises.
    # SECURITY: this unpickles ``index.pkl``. That is only acceptable while the
    # files under vector_store/ are written exclusively by this application
    # (via ``save_local``) and never come from an upload or another host.
    return FAISS.load_local(
        user_path, embeddings, allow_dangerous_deserialization=True
    )
+# Endpoint
@router.post("/custom-chatbot")
async def custom_chatbot(request: ChatRequest):
    """Answer a question from the requesting user's own vector store (RAG).

    Builds a retrieval chain over the user's FAISS index and returns the
    model's answer together with the text of the retrieved chunks.

    Args:
        request: Body carrying ``query`` and ``user_id``.

    Returns:
        A dict with ``answer`` (str) and ``sources`` (list of chunk texts).

    Raises:
        HTTPException: 404 when the user has no vector store yet, 500 for any
            other failure.
    """
    query = request.query
    user_id = request.user_id
    # NOTE(review): user_id is taken straight from the request body, so any
    # caller can query any user's store. Presumably it should come from the
    # authenticated session instead -- confirm against app.auth.
    try:
        vector_store = load_user_vector_store(user_id)
        retriever = vector_store.as_retriever()
        doc_chain = create_stuff_documents_chain(llm, prompt_template)
        rag_chain = create_retrieval_chain(retriever, doc_chain)
        # Use the async entry point: a blocking invoke() inside an async
        # endpoint would stall the event loop for the whole LLM round-trip.
        response = await rag_chain.ainvoke({"input": query})
        return {
            "answer": response["answer"],
            "sources": [doc.page_content for doc in response["context"]],
        }
    except FileNotFoundError as e:
        logger.warning(f"📭 Vector store missing for user {user_id}")
        raise HTTPException(status_code=404, detail=str(e)) from e
    except Exception as e:
        # logger.exception keeps the traceback; the client only sees a
        # generic message so internals are not leaked.
        logger.exception(f"❌ Error in custom chatbot: {e}")
        raise HTTPException(status_code=500, detail="Internal server error") from e

app/auth.py CHANGED Viewed

@@ -3,7 +3,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 from passlib.context import CryptContext
 from jose import jwt
-from pydantic import BaseModel, EmailStr
 from app.database import get_db  # Updated: use the correct async session dependency
 from app.models import User
 import os

 from sqlalchemy.future import select
 from passlib.context import CryptContext
 from jose import jwt
+from pydantic.v1 import BaseModel, EmailStr
 from app.database import get_db  # Updated: use the correct async session dependency
 from app.models import User
 import os

app/main.py CHANGED Viewed

@@ -5,7 +5,8 @@ import logging
 from app.auth import router as auth_router
 from app.upload import router as upload_router
 from app.dashboard import router as dashboard_router
 # Initialize logger
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -31,7 +32,8 @@ app.add_middleware(
 app.include_router(auth_router, prefix="/api", tags=["Auth"])
 app.include_router(upload_router, prefix="/api", tags=["Upload"])
 app.include_router(dashboard_router, prefix="/api", tags=["Dashboard"])
 @app.on_event("startup")
 async def startup_event():

 from app.auth import router as auth_router
 from app.upload import router as upload_router
 from app.dashboard import router as dashboard_router
+from app.agent.custom_chatbot import router as custom_chatbot_router
+# from app.routes import pdf_ingestion
 # Initialize logger
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 app.include_router(auth_router, prefix="/api", tags=["Auth"])
 app.include_router(upload_router, prefix="/api", tags=["Upload"])
 app.include_router(dashboard_router, prefix="/api", tags=["Dashboard"])
+app.include_router(custom_chatbot_router, prefix="/api", tags=["Custom Chatbot"])
+# app.include_router(pdf_ingestion.router, prefix="/api", tags=["PDF Ingestion"])
 @app.on_event("startup")
 async def startup_event():

app/pdf_ingestion.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import os
+import logging
+from fastapi import APIRouter, HTTPException
+from pydantic.v1 import BaseModel, EmailStr
+from langchain_community.document_loaders import PyPDFDirectoryLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores import FAISS
+from dotenv import load_dotenv
+load_dotenv()
+router = APIRouter()
+logger = logging.getLogger("pdf_ingestion")
+class IngestRequest(BaseModel):
+    user_id: int
@router.post("/ingest-pdfs")
async def ingest_pdfs(request: IngestRequest):
    """Rebuild a user's FAISS vector store from the PDFs in their directory.

    Loads every PDF under ``./pdfs/user_<id>``, splits the pages into
    overlapping chunks, embeds them and overwrites the index stored under
    ``./vector_store/user_<id>``.

    Args:
        request: Body carrying the ``user_id`` whose PDFs should be ingested.

    Returns:
        A dict with a status ``message`` and ``documents_ingested`` (number of
        chunks written to the index).

    Raises:
        HTTPException: 404 when the PDF directory is missing or yields no
            text chunks, 500 when loading, embedding or saving fails.
    """
    user_id = request.user_id
    user_pdf_path = f"./pdfs/user_{user_id}"
    user_vector_path = f"./vector_store/user_{user_id}"
    if not os.path.exists(user_pdf_path):
        raise HTTPException(status_code=404, detail=f"No PDF directory found for user {user_id}")
    try:
        logger.info(f"📥 Loading PDFs for user {user_id} from {user_pdf_path}")
        loader = PyPDFDirectoryLoader(user_pdf_path)
        documents = loader.load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        split_docs = splitter.split_documents(documents)
        if not split_docs:
            # An empty directory (or PDFs without extractable text) cannot be
            # indexed; report that explicitly instead of letting the index
            # build fail with an opaque 500.
            raise HTTPException(
                status_code=404,
                detail=f"No PDF content found for user {user_id}",
            )
        embeddings = OpenAIEmbeddings()
        vector_store = FAISS.from_documents(split_docs, embeddings)
        os.makedirs(user_vector_path, exist_ok=True)
        vector_store.save_local(user_vector_path)
        logger.info(f"✅ Re-ingested and saved vector store for user {user_id}")
        return {"message": f"Vector store updated for user {user_id}", "documents_ingested": len(split_docs)}
    except HTTPException:
        # Deliberate HTTP errors raised above must not be rewritten as 500s.
        raise
    except Exception as e:
        logger.exception(f"❌ PDF ingestion failed: {e}")
        raise HTTPException(status_code=500, detail="Failed to ingest PDFs and build vector store.") from e

app/utils/pdf.py CHANGED Viewed

@@ -5,11 +5,28 @@ from io import BytesIO
 def generate(transcription: str, summary: str):
     buffer = BytesIO()
     c = canvas.Canvas(buffer)
     c.drawString(100, 800, "📄 Video Summary Report")
-    c.drawString(100, 770, "Transcription:")
-    c.drawString(100, 750, transcription[:1000])
-    c.drawString(100, 700, "Summary:")
-    c.drawString(100, 680, summary[:1000])
     c.save()
     buffer.seek(0)
     return buffer.read()

 def generate(transcription: str, summary: str):
     buffer = BytesIO()
     c = canvas.Canvas(buffer)
+    # Title
+    c.setFont("Helvetica-Bold", 14)
     c.drawString(100, 800, "📄 Video Summary Report")
+    c.setFont("Helvetica", 12)
+    # Transcription section
+    c.drawString(100, 770, "Transcription (first 1000 characters):")
+    for i, line in enumerate(split_lines(transcription[:1000], 90)):
+        c.drawString(100, 750 - i * 15, line)
+    # Summary section
+    offset = 750 - (len(transcription[:1000]) // 90 + 1) * 15 - 30
+    c.drawString(100, offset, "Summary (first 1000 characters):")
+    for i, line in enumerate(split_lines(summary[:1000], 90)):
+        c.drawString(100, offset - 20 - i * 15, line)
     c.save()
     buffer.seek(0)
     return buffer.read()
def split_lines(text: str, width: int) -> list:
    """Cut ``text`` into consecutive pieces of at most ``width`` characters.

    The split is purely positional (it does not look for word boundaries), so
    every piece except possibly the last is exactly ``width`` characters long.

    Args:
        text: The string to split; an empty string yields an empty list.
        width: Maximum characters per piece; must be at least 1.

    Returns:
        The pieces in order; joining them reproduces ``text``.

    Raises:
        ValueError: If ``width`` is less than 1. A negative width used to
            return an empty list and silently drop the whole text.
    """
    if width < 1:
        raise ValueError(f"width must be >= 1, got {width}")
    return [text[start:start + width] for start in range(0, len(text), width)]

app/utils/whisper_llm.py CHANGED Viewed

@@ -1,22 +1,142 @@
-import whisper
-from transformers import pipeline
 import requests
 import tempfile
-def analyze(video_url: str):
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-    with requests.get(video_url, stream=True) as r:
-        for chunk in r.iter_content(8192):
-            tmp.write(chunk)
-    tmp.close()
-    model = whisper.load_model("base")
-    result = model.transcribe(tmp.name)
-    text = result["text"]
     summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-    summary = summarizer(text, max_length=512, min_length=128, do_sample=False)[0][
-        "summary_text"
-    ]
     return text, summary

+import os
+import logging
 import requests
 import tempfile
+import torch
+from transformers import pipeline
+from faster_whisper import WhisperModel
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings
+from langchain_core.documents import Document
+from langchain_community.vectorstores import FAISS
+from app.db import SessionLocal  # Assuming SQLAlchemy session
+from app.models import User  # Assuming SQLAlchemy User model
+# Setup logger
+logger = logging.getLogger("app.utils.whisper_llm")
+logger.setLevel(logging.INFO)
+if not logger.handlers:
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+# Whisper Model Initialization
def get_whisper_model():
    """Create the Faster-Whisper "base" model on the best available device.

    Uses CUDA with ``float32`` compute when ``torch`` reports a GPU, otherwise
    the CPU with ``int8`` compute.

    Returns:
        The loaded ``WhisperModel`` instance.

    Raises:
        Exception: Whatever ``WhisperModel`` raises while loading; the error
            is logged and re-raised unchanged.
    """
    gpu_available = torch.cuda.is_available()
    device, compute_type = ("cuda", "float32") if gpu_available else ("cpu", "int8")
    if gpu_available:
        logger.info("✅ GPU detected: Using CUDA with float32 compute")
    else:
        logger.warning("⚠️ GPU not available: Falling back to CPU with int8 compute")

    try:
        loaded_model = WhisperModel("base", device=device, compute_type=compute_type)
        logger.info(f"📦 Loaded Faster-Whisper model on {device} with compute_type={compute_type}")
        return loaded_model
    except Exception as load_error:
        logger.error(f"❌ Failed to load Whisper model: {load_error}")
        raise
+whisper_model = get_whisper_model()
+# Summarizer
+try:
     summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+    logger.info("📦 Hugging Face summarizer pipeline loaded successfully.")
+except Exception as e:
+    logger.error(f"❌ Failed to load summarization pipeline: {e}")
+    raise
+# Chunked summarization
def summarize_in_chunks(text, chunk_size=800, overlap=100, summarize_fn=None):
    """Summarize long text by summarizing overlapping word windows.

    The text is split on whitespace into windows of ``chunk_size`` words that
    advance by ``chunk_size - overlap`` words. Each window is summarized on
    its own and the partial summaries are joined with single spaces.

    Args:
        text: The text to summarize. Empty or whitespace-only text yields "".
        chunk_size: Number of words per window; must be positive.
        overlap: Number of words shared by consecutive windows; must be
            smaller than ``chunk_size``.
        summarize_fn: Optional callable with the same call convention as the
            module-level ``summarizer`` pipeline. Defaults to that pipeline.

    Returns:
        The concatenated partial summaries. Windows whose summarization fails
        are logged and skipped.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window could never advance).
    """
    step = chunk_size - overlap
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            f"chunk_size must be positive and greater than overlap "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )
    run_summary = summarizer if summarize_fn is None else summarize_fn

    words = text.split()
    summaries = []
    for start in range(0, len(words), step):
        chunk = " ".join(words[start:start + chunk_size])
        try:
            # truncation=True: an 800-word window can exceed the model's input
            # limit; truncate instead of failing and losing the whole window.
            result = run_summary(
                chunk,
                max_length=256,
                min_length=64,
                do_sample=False,
                truncation=True,
            )
            summaries.append(result[0]["summary_text"])
        except Exception as e:
            logger.error(f"❌ Chunk summarization failed: {e}")
        # This window already reached the end of the text. Any further window
        # would contain only overlap words that were just summarized.
        if start + chunk_size >= len(words):
            break
    return " ".join(summaries)
+# 🧠 Get user from Neon DB
def get_user(user_id: int):
    """Fetch the ``User`` row with the given primary key, or ``None``.

    Opens a fresh ``SessionLocal`` session for the lookup and always closes
    it, whether or not the query succeeds.

    Args:
        user_id: Value matched against ``User.id``.

    Returns:
        The first matching ``User`` instance, or ``None`` when no row matches.
    """
    session = SessionLocal()
    try:
        matching_users = session.query(User).filter(User.id == user_id)
        return matching_users.first()
    finally:
        session.close()
+# ⚡ Core Analyzer Function with per-user FAISS ingestion
def analyze(video_url: str, user_id: int):
    """Download a video, transcribe it, summarize it and index the summary.

    Steps: verify the user exists, stream the video to a temporary file,
    transcribe it with the module-level Faster-Whisper model, summarize the
    transcript in chunks, then add the summary to the user's FAISS index
    under ``vector_store/user_<user_id>``.

    Args:
        video_url: HTTP(S) URL the video is downloaded from.
        user_id: ID of the user who owns the video and the vector store.

    Returns:
        A ``(text, summary)`` tuple with the full transcript and its summary.

    Raises:
        ValueError: If no user with ``user_id`` exists.
        Exception: Any download, transcription, summarization or indexing
            error is logged and re-raised unchanged.
    """
    # Verify user exists
    user = get_user(user_id)
    if not user:
        raise ValueError(f"❌ User with ID {user_id} not found in Neon DB")
    logger.info(f"📥 Starting video analysis for user: {user.email} (ID: {user.id})")

    tmp_path = None
    try:
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
                # Record the path immediately so a failed download is still
                # cleaned up in the outer ``finally``.
                tmp_path = tmp.name
                with requests.get(video_url, stream=True, timeout=60) as response:
                    response.raise_for_status()
                    for chunk in response.iter_content(chunk_size=8192):
                        tmp.write(chunk)
            logger.info(f"🎞️ Video saved to temp file: {tmp_path}")
        except Exception as e:
            logger.error(f"❌ Failed to download video: {e}")
            raise

        try:
            logger.info("🧠 Transcribing audio with Faster-Whisper...")
            segments, _ = whisper_model.transcribe(tmp_path)
            # The segments are consumed here, while the temp file still
            # exists, in case transcribe() yields them lazily.
            text = " ".join(segment.text for segment in segments)
            logger.info(f"✅ Transcription completed. Length: {len(text)} characters.")
        except Exception as e:
            logger.error(f"❌ Transcription failed: {e}")
            raise
    finally:
        # The file was created with delete=False, so it must be removed by
        # hand; otherwise every processed video leaks a full copy on disk.
        if tmp_path is not None and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError as e:
                logger.warning(f"⚠️ Could not remove temp file {tmp_path}: {e}")

    try:
        logger.info("📝 Summarizing transcript with Hugging Face model...")
        summary = summarize_in_chunks(text)
        logger.info("✅ Summarization completed.")
    except Exception as e:
        logger.error(f"❌ Summarization failed: {e}")
        raise

    if not summary.strip():
        # An empty summary (e.g. a video without speech) has nothing to
        # retrieve later, so skip embedding and leave the index untouched.
        logger.warning("⚠️ Empty summary: skipping vector store update.")
        return text, summary

    try:
        logger.info("📊 Creating/updating FAISS vector store for user...")
        documents = [Document(page_content=summary)]
        embeddings = OpenAIEmbeddings()
        user_vector_path = f"vector_store/user_{user_id}"
        os.makedirs(user_vector_path, exist_ok=True)
        if os.path.exists(os.path.join(user_vector_path, "index.faiss")):
            # Current langchain_community requires an explicit opt-in to
            # unpickle the docstore. SECURITY: acceptable only while these
            # files are written exclusively by this application.
            vector_store = FAISS.load_local(
                user_vector_path, embeddings, allow_dangerous_deserialization=True
            )
            vector_store.add_documents(documents)
        else:
            vector_store = FAISS.from_documents(documents, embeddings)
        vector_store.save_local(user_vector_path)
        logger.info(f"✅ Vector store saved at: {user_vector_path}")
    except Exception as e:
        logger.error(f"❌ Failed to create vector store: {e}")
        raise
    return text, summary

requirements.txt CHANGED Viewed

@@ -15,3 +15,33 @@ databases
 psycopg2-binary
 passlib[bcrypt]
 python-jose[cryptography]

 psycopg2-binary
 passlib[bcrypt]
 python-jose[cryptography]
+faster-whisper
+torch==2.2.2+cu121
+torchvision==0.17.2+cu121
+torchaudio==2.2.2+cu121
+--extra-index-url https://download.pytorch.org/whl/cu121
+# CTranslate2 GPU build for CUDA 12
+langchain_openai
+langchain_core
+python-dotenv
+streamlit
+langchain_community
+langserve
+sse_starlette
+bs4
+pypdf
+chromadb
+faiss-cpu
+groq
+cassio
+beautifulsoup4
+langchain-groq
+wikipedia
+arxiv
+langchainhub
+sentence_transformers
+PyPDF2
+langchain-objectbox
+pypdf
+langchain_groq
+langchain

worker/__init__.py ADDED Viewed

File without changes

worker/daemon.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import asyncio
+import os
+import time
+from datetime import datetime
+from sqlalchemy.future import select
+from sqlalchemy.ext.asyncio import AsyncSession
+from app.database import AsyncSessionLocal
+from app.models import VideoUpload
+from app.utils import whisper_llm, pdf, s3
+POLL_INTERVAL = 200  # seconds
async def process_pending_videos():
    """Process every video upload whose status is ``"pending"``.

    For each pending row: transcribe and summarize the video, render the PDF
    report, upload it to S3, then mark the row ``"completed"`` with the PDF
    URL and commit. A failure in any step is printed and that video is
    skipped; the remaining videos are still attempted.

    NOTE(review): a skipped video keeps status "pending", so it is retried on
    every poll. If a terminal "failed" status exists in the schema it should
    probably be set here -- confirm against app.models.VideoUpload.
    """
    async with AsyncSessionLocal() as session:
        try:
            result = await session.execute(
                select(VideoUpload).where(VideoUpload.status == "pending")
            )
            pending_videos = result.scalars().all()
            for video in pending_videos:
                # Read the id once, before any commit: if the session expires
                # attributes on commit, touching video.id afterwards would
                # trigger a lazy load that an async session cannot perform.
                video_id = video.id
                print(f"🎬 Processing video ID {video_id} for user {video.user_id}")
                try:
                    # analyze() requires the owning user's id; calling it
                    # with only the URL raised TypeError for every video.
                    transcription, summary = whisper_llm.analyze(
                        video.video_url, video.user_id
                    )
                except Exception as e:
                    print(f"❌ Whisper failed: {e}")
                    continue
                try:
                    pdf_bytes = pdf.generate(transcription, summary)
                except Exception as e:
                    print(f"❌ PDF generation failed: {e}")
                    continue
                try:
                    pdf_key = f"pdfs/{video_id}.pdf"
                    pdf_url = s3.upload_pdf_bytes(pdf_bytes, pdf_key)
                except Exception as e:
                    print(f"❌ Upload to S3 failed: {e}")
                    continue
                video.status = "completed"
                video.pdf_url = pdf_url
                video.updated_at = datetime.utcnow()
                await session.commit()
                print(f"✅ Completed video {video_id}")
        except Exception as e:
            print(f"❌ DB error: {e}")
async def run_worker():
    """Poll for pending videos forever, sleeping ``POLL_INTERVAL`` seconds
    between rounds.

    An exception escaping a round is printed and the loop carries on, so a
    single bad round never stops the worker.
    """
    print("🚀 Async worker started (Neon)...")
    while True:
        print("🔁 Checking for pending videos...")
        try:
            await process_pending_videos()
        except Exception as round_error:
            print(f"❌ Worker loop crashed: {round_error}")
        # Wait before the next polling round.
        await asyncio.sleep(POLL_INTERVAL)
+if __name__ == "__main__":
+    asyncio.run(run_worker())

worker/gpu_test.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import torch
def check_gpu():
    """Print CUDA availability and run a small matrix multiply as a smoke test.

    With CUDA available, prints the name and total memory of device 0 and
    multiplies a random 1000x1000 matrix on the GPU. Otherwise runs the same
    multiplication on the CPU.
    """
    print("🔍 Checking CUDA and GPU details...\n")

    if not torch.cuda.is_available():
        print("❌ CUDA is NOT available. Using CPU fallback.")
        cpu_matrix = torch.rand(1000, 1000)
        cpu_product = torch.mm(cpu_matrix, cpu_matrix)
        print(f"✅ CPU computation done. Tensor shape: {cpu_product.shape}")
        return

    device = torch.device("cuda")
    print("✅ CUDA is available.")
    print(f"🖥️  GPU Name: {torch.cuda.get_device_name(0)}")
    total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"📊 GPU Memory: {round(total_gib, 2)} GB")

    # Build the matrix on the host, move it to the GPU, multiply it there.
    gpu_matrix = torch.rand(1000, 1000).to(device)
    gpu_product = torch.mm(gpu_matrix, gpu_matrix)
    print(f"🚀 Tensor computation successful on GPU! Tensor shape: {gpu_product.shape}")
+if __name__ == "__main__":
+    check_gpu()