peace2024 commited on
Commit
e27e999
Β·
1 Parent(s): d7a468c

saving changes

Browse files
.gitignore ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python cache
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.so
5
+ *.pyd
6
+
7
+ # Virtual environments
8
+ env/
9
+ venv/
10
+ myenv/
11
+ .myenv/
12
+ *.env
13
+
14
+ # Jupyter/IPython
15
+ .ipynb_checkpoints/
16
+
17
+ # VSCode & PyCharm
18
+ .vscode/
19
+ .idea/
20
+
21
+ # Logs and temp
22
+ *.log
23
+ *.tmp
24
+ *.temp
25
+ *.bak
26
+ *.swp
27
+ *.DS_Store
28
+
29
+ # Model caches
30
+ *.pt
31
+ *.ckpt
32
+ *.onnx
33
+ *.pb
34
+ *.tflite
35
+ *.pkl
36
+
37
+ # Hugging Face transformers cache
38
+ ~/.cache/
39
+ huggingface/
40
+ .cache/
41
+
42
+ # Output PDFs, reports
43
+ *.pdf
44
+ *.out
45
+
46
+ # Data files
47
+ *.csv
48
+ *.tsv
49
+ *.js*
app/agent/custom_chatbot.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from fastapi import APIRouter, HTTPException
4
+ from pydantic.v1 import BaseModel, EmailStr
5
+ from dotenv import load_dotenv
6
+
7
+ from langchain_groq import ChatGroq
8
+ from langchain_openai import OpenAIEmbeddings
9
+ from langchain_community.vectorstores import FAISS
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from langchain.chains.combine_documents import create_stuff_documents_chain
12
+ from langchain_core.prompts import ChatPromptTemplate
13
+ from langchain.chains import create_retrieval_chain
14
+
15
+ # Load environment variables
16
+ load_dotenv()
17
+
18
+ router = APIRouter()
19
+
20
+ logger = logging.getLogger("custom_chatbot")
21
+
22
+ # LangChain LLM setup
23
+ groq_api_key = os.getenv("GROQ_API_KEY")
24
+ llm = ChatGroq(groq_api_key=groq_api_key, model_name="Llama3-8b-8192")
25
+
26
+ # Prompt template
27
+ prompt_template = ChatPromptTemplate.from_template("""
28
+ Answer the question based only on the provided context.
29
+ <context>
30
+ {context}
31
+ </context>
32
+
33
+ Question: {input}
34
+ """)
35
+
36
+ # Input schema with user_id
37
+ class ChatRequest(BaseModel):
38
+ query: str
39
+ user_id: int
40
+
41
+ # Load vector store for a specific user
42
+ def load_user_vector_store(user_id: int):
43
+ user_path = f"vector_store/user_{user_id}"
44
+ index_file = os.path.join(user_path, "index.faiss")
45
+
46
+ if not os.path.exists(index_file):
47
+ raise FileNotFoundError(f"No vector store found for user {user_id}")
48
+
49
+ embeddings = OpenAIEmbeddings()
50
+ return FAISS.load_local(user_path, embeddings)
51
+
52
+ # Endpoint
53
+ @router.post("/custom-chatbot")
54
+ async def custom_chatbot(request: ChatRequest):
55
+ query = request.query
56
+ user_id = request.user_id
57
+
58
+ try:
59
+ vector_store = load_user_vector_store(user_id)
60
+ retriever = vector_store.as_retriever()
61
+
62
+ doc_chain = create_stuff_documents_chain(llm, prompt_template)
63
+ rag_chain = create_retrieval_chain(retriever, doc_chain)
64
+
65
+ response = rag_chain.invoke({"input": query})
66
+ return {
67
+ "answer": response["answer"],
68
+ "sources": [doc.page_content for doc in response["context"]],
69
+ }
70
+
71
+ except FileNotFoundError as e:
72
+ logger.warning(f"πŸ“­ Vector store missing for user {user_id}")
73
+ raise HTTPException(status_code=404, detail=str(e))
74
+
75
+ except Exception as e:
76
+ logger.error(f"❌ Error in custom chatbot: {e}")
77
+ raise HTTPException(status_code=500, detail="Internal server error")
app/auth.py CHANGED
@@ -3,7 +3,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
3
  from sqlalchemy.future import select
4
  from passlib.context import CryptContext
5
  from jose import jwt
6
- from pydantic import BaseModel, EmailStr
7
  from app.database import get_db # Updated: use the correct async session dependency
8
  from app.models import User
9
  import os
 
3
  from sqlalchemy.future import select
4
  from passlib.context import CryptContext
5
  from jose import jwt
6
+ from pydantic.v1 import BaseModel, EmailStr
7
  from app.database import get_db # Updated: use the correct async session dependency
8
  from app.models import User
9
  import os
app/main.py CHANGED
@@ -5,7 +5,8 @@ import logging
5
  from app.auth import router as auth_router
6
  from app.upload import router as upload_router
7
  from app.dashboard import router as dashboard_router
8
-
 
9
  # Initialize logger
10
  logging.basicConfig(level=logging.INFO)
11
  logger = logging.getLogger(__name__)
@@ -31,7 +32,8 @@ app.add_middleware(
31
  app.include_router(auth_router, prefix="/api", tags=["Auth"])
32
  app.include_router(upload_router, prefix="/api", tags=["Upload"])
33
  app.include_router(dashboard_router, prefix="/api", tags=["Dashboard"])
34
-
 
35
 
36
  @app.on_event("startup")
37
  async def startup_event():
 
5
  from app.auth import router as auth_router
6
  from app.upload import router as upload_router
7
  from app.dashboard import router as dashboard_router
8
+ from app.agent.custom_chatbot import router as custom_chatbot_router
9
+ # from app.routes import pdf_ingestion
10
  # Initialize logger
11
  logging.basicConfig(level=logging.INFO)
12
  logger = logging.getLogger(__name__)
 
32
  app.include_router(auth_router, prefix="/api", tags=["Auth"])
33
  app.include_router(upload_router, prefix="/api", tags=["Upload"])
34
  app.include_router(dashboard_router, prefix="/api", tags=["Dashboard"])
35
+ app.include_router(custom_chatbot_router, prefix="/api", tags=["Custom Chatbot"])
36
+ # app.include_router(pdf_ingestion.router, prefix="/api", tags=["PDF Ingestion"])
37
 
38
  @app.on_event("startup")
39
  async def startup_event():
app/pdf_ingestion.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from fastapi import APIRouter, HTTPException
4
+ from pydantic.v1 import BaseModel, EmailStr
5
+ from langchain_community.document_loaders import PyPDFDirectoryLoader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain_openai import OpenAIEmbeddings
8
+ from langchain_community.vectorstores import FAISS
9
+ from dotenv import load_dotenv
10
+
11
+ load_dotenv()
12
+
13
+ router = APIRouter()
14
+ logger = logging.getLogger("pdf_ingestion")
15
+
16
+ class IngestRequest(BaseModel):
17
+ user_id: int
18
+
19
+ @router.post("/ingest-pdfs")
20
+ async def ingest_pdfs(request: IngestRequest):
21
+ user_id = request.user_id
22
+ user_pdf_path = f"./pdfs/user_{user_id}"
23
+ user_vector_path = f"./vector_store/user_{user_id}"
24
+
25
+ if not os.path.exists(user_pdf_path):
26
+ raise HTTPException(status_code=404, detail=f"No PDF directory found for user {user_id}")
27
+
28
+ try:
29
+ logger.info(f"πŸ“₯ Loading PDFs for user {user_id} from {user_pdf_path}")
30
+ loader = PyPDFDirectoryLoader(user_pdf_path)
31
+ documents = loader.load()
32
+
33
+ splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
34
+ split_docs = splitter.split_documents(documents)
35
+
36
+ embeddings = OpenAIEmbeddings()
37
+ vector_store = FAISS.from_documents(split_docs, embeddings)
38
+
39
+ os.makedirs(user_vector_path, exist_ok=True)
40
+ vector_store.save_local(user_vector_path)
41
+
42
+ logger.info(f"βœ… Re-ingested and saved vector store for user {user_id}")
43
+ return {"message": f"Vector store updated for user {user_id}", "documents_ingested": len(split_docs)}
44
+
45
+ except Exception as e:
46
+ logger.error(f"❌ PDF ingestion failed: {e}")
47
+ raise HTTPException(status_code=500, detail="Failed to ingest PDFs and build vector store.")
app/utils/pdf.py CHANGED
@@ -5,11 +5,28 @@ from io import BytesIO
5
  def generate(transcription: str, summary: str):
6
  buffer = BytesIO()
7
  c = canvas.Canvas(buffer)
 
 
 
8
  c.drawString(100, 800, "πŸ“„ Video Summary Report")
9
- c.drawString(100, 770, "Transcription:")
10
- c.drawString(100, 750, transcription[:1000])
11
- c.drawString(100, 700, "Summary:")
12
- c.drawString(100, 680, summary[:1000])
 
 
 
 
 
 
 
 
 
 
13
  c.save()
14
  buffer.seek(0)
15
  return buffer.read()
 
 
 
 
 
5
  def generate(transcription: str, summary: str):
6
  buffer = BytesIO()
7
  c = canvas.Canvas(buffer)
8
+
9
+ # Title
10
+ c.setFont("Helvetica-Bold", 14)
11
  c.drawString(100, 800, "πŸ“„ Video Summary Report")
12
+
13
+ c.setFont("Helvetica", 12)
14
+
15
+ # Transcription section
16
+ c.drawString(100, 770, "Transcription (first 1000 characters):")
17
+ for i, line in enumerate(split_lines(transcription[:1000], 90)):
18
+ c.drawString(100, 750 - i * 15, line)
19
+
20
+ # Summary section
21
+ offset = 750 - (len(transcription[:1000]) // 90 + 1) * 15 - 30
22
+ c.drawString(100, offset, "Summary (first 1000 characters):")
23
+ for i, line in enumerate(split_lines(summary[:1000], 90)):
24
+ c.drawString(100, offset - 20 - i * 15, line)
25
+
26
  c.save()
27
  buffer.seek(0)
28
  return buffer.read()
29
+
30
+
31
+ def split_lines(text, width):
32
+ return [text[i:i + width] for i in range(0, len(text), width)]
app/utils/whisper_llm.py CHANGED
@@ -1,22 +1,142 @@
1
- import whisper
2
- from transformers import pipeline
3
  import requests
4
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
 
7
- def analyze(video_url: str):
8
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
9
- with requests.get(video_url, stream=True) as r:
10
- for chunk in r.iter_content(8192):
11
- tmp.write(chunk)
12
- tmp.close()
 
 
 
 
 
 
 
 
 
 
 
13
 
14
- model = whisper.load_model("base")
15
- result = model.transcribe(tmp.name)
16
- text = result["text"]
17
 
 
 
18
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
19
- summary = summarizer(text, max_length=512, min_length=128, do_sample=False)[0][
20
- "summary_text"
21
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  return text, summary
 
1
+ import os
2
+ import logging
3
  import requests
4
  import tempfile
5
+ import torch
6
+ from transformers import pipeline
7
+ from faster_whisper import WhisperModel
8
+
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain_openai import OpenAIEmbeddings
11
+ from langchain_core.documents import Document
12
+ from langchain_community.vectorstores import FAISS
13
+
14
+ from app.db import SessionLocal # Assuming SQLAlchemy session
15
+ from app.models import User # Assuming SQLAlchemy User model
16
+
17
+ # Setup logger
18
+ logger = logging.getLogger("app.utils.whisper_llm")
19
+ logger.setLevel(logging.INFO)
20
+
21
+ if not logger.handlers:
22
+ handler = logging.StreamHandler()
23
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
24
+ handler.setFormatter(formatter)
25
+ logger.addHandler(handler)
26
 
27
 
28
+ # Whisper Model Initialization
29
+ def get_whisper_model():
30
+ if torch.cuda.is_available():
31
+ device = "cuda"
32
+ compute_type = "float32"
33
+ logger.info("βœ… GPU detected: Using CUDA with float32 compute")
34
+ else:
35
+ device = "cpu"
36
+ compute_type = "int8"
37
+ logger.warning("⚠️ GPU not available: Falling back to CPU with int8 compute")
38
+ try:
39
+ model = WhisperModel("base", device=device, compute_type=compute_type)
40
+ logger.info(f"πŸ“¦ Loaded Faster-Whisper model on {device} with compute_type={compute_type}")
41
+ return model
42
+ except Exception as e:
43
+ logger.error(f"❌ Failed to load Whisper model: {e}")
44
+ raise
45
 
46
+ whisper_model = get_whisper_model()
 
 
47
 
48
+ # Summarizer
49
+ try:
50
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
51
+ logger.info("πŸ“¦ Hugging Face summarizer pipeline loaded successfully.")
52
+ except Exception as e:
53
+ logger.error(f"❌ Failed to load summarization pipeline: {e}")
54
+ raise
55
+
56
+
57
+ # Chunked summarization
58
+ def summarize_in_chunks(text, chunk_size=800, overlap=100):
59
+ summaries = []
60
+ words = text.split()
61
+ step = chunk_size - overlap
62
+
63
+ for i in range(0, len(words), step):
64
+ chunk = " ".join(words[i:i + chunk_size])
65
+ if len(chunk.strip()) == 0:
66
+ continue
67
+ try:
68
+ result = summarizer(chunk, max_length=256, min_length=64, do_sample=False)
69
+ summaries.append(result[0]['summary_text'])
70
+ except Exception as e:
71
+ logger.error(f"❌ Chunk summarization failed: {e}")
72
+ return " ".join(summaries)
73
+
74
+
75
+ # 🧠 Get user from Neon DB
76
+ def get_user(user_id: int):
77
+ db = SessionLocal()
78
+ try:
79
+ return db.query(User).filter(User.id == user_id).first()
80
+ finally:
81
+ db.close()
82
+
83
+
84
+ # ⚑ Core Analyzer Function with per-user FAISS ingestion
85
+ def analyze(video_url: str, user_id: int):
86
+ # Verify user exists
87
+ user = get_user(user_id)
88
+ if not user:
89
+ raise ValueError(f"❌ User with ID {user_id} not found in Neon DB")
90
+
91
+ logger.info(f"πŸ“₯ Starting video analysis for user: {user.email} (ID: {user.id})")
92
+
93
+ try:
94
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
95
+ with requests.get(video_url, stream=True, timeout=60) as response:
96
+ response.raise_for_status()
97
+ for chunk in response.iter_content(chunk_size=8192):
98
+ tmp.write(chunk)
99
+ tmp_path = tmp.name
100
+ logger.info(f"🎞️ Video saved to temp file: {tmp_path}")
101
+ except Exception as e:
102
+ logger.error(f"❌ Failed to download video: {e}")
103
+ raise
104
+
105
+ try:
106
+ logger.info("🧠 Transcribing audio with Faster-Whisper...")
107
+ segments, _ = whisper_model.transcribe(tmp_path)
108
+ text = " ".join(segment.text for segment in segments)
109
+ logger.info(f"βœ… Transcription completed. Length: {len(text)} characters.")
110
+ except Exception as e:
111
+ logger.error(f"❌ Transcription failed: {e}")
112
+ raise
113
+
114
+ try:
115
+ logger.info("πŸ“ Summarizing transcript with Hugging Face model...")
116
+ summary = summarize_in_chunks(text)
117
+ logger.info("βœ… Summarization completed.")
118
+ except Exception as e:
119
+ logger.error(f"❌ Summarization failed: {e}")
120
+ raise
121
+
122
+ try:
123
+ logger.info("πŸ“Š Creating/updating FAISS vector store for user...")
124
+ documents = [Document(page_content=summary)]
125
+ embeddings = OpenAIEmbeddings()
126
+
127
+ user_vector_path = f"vector_store/user_{user_id}"
128
+ os.makedirs(user_vector_path, exist_ok=True)
129
+
130
+ if os.path.exists(os.path.join(user_vector_path, "index.faiss")):
131
+ vector_store = FAISS.load_local(user_vector_path, embeddings)
132
+ vector_store.add_documents(documents)
133
+ else:
134
+ vector_store = FAISS.from_documents(documents, embeddings)
135
+
136
+ vector_store.save_local(user_vector_path)
137
+ logger.info(f"βœ… Vector store saved at: {user_vector_path}")
138
+ except Exception as e:
139
+ logger.error(f"❌ Failed to create vector store: {e}")
140
+ raise
141
+
142
  return text, summary
requirements.txt CHANGED
@@ -15,3 +15,33 @@ databases
15
  psycopg2-binary
16
  passlib[bcrypt]
17
  python-jose[cryptography]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  psycopg2-binary
16
  passlib[bcrypt]
17
  python-jose[cryptography]
18
+ faster-whisper
19
+ torch==2.2.2+cu121
20
+ torchvision==0.17.2+cu121
21
+ torchaudio==2.2.2+cu121
22
+ --extra-index-url https://download.pytorch.org/whl/cu121
23
+ # CTranslate2 GPU build for CUDA 12
24
+ langchain_openai
25
+ langchain_core
26
+ python-dotenv
27
+ streamlit
28
+ langchain_community
29
+ langserve
30
+ sse_starlette
31
+ bs4
32
+ pypdf
33
+ chromadb
34
+ faiss-cpu
35
+ groq
36
+ cassio
37
+ beautifulsoup4
38
+ langchain-groq
39
+ wikipedia
40
+ arxiv
41
+ langchainhub
42
+ sentence_transformers
43
+ PyPDF2
44
+ langchain-objectbox
45
+ pypdf
46
+ langchain_groq
47
+ langchain
worker/__init__.py ADDED
File without changes
worker/daemon.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ import time
4
+ from datetime import datetime
5
+
6
+ from sqlalchemy.future import select
7
+ from sqlalchemy.ext.asyncio import AsyncSession
8
+
9
+ from app.database import AsyncSessionLocal
10
+ from app.models import VideoUpload
11
+ from app.utils import whisper_llm, pdf, s3
12
+
13
+
14
+ POLL_INTERVAL = 200 # seconds
15
+
16
+
17
+ async def process_pending_videos():
18
+ async with AsyncSessionLocal() as session:
19
+ try:
20
+ result = await session.execute(
21
+ select(VideoUpload).where(VideoUpload.status == "pending")
22
+ )
23
+ pending_videos = result.scalars().all()
24
+
25
+ for video in pending_videos:
26
+ print(f"🎬 Processing video ID {video.id} for user {video.user_id}")
27
+
28
+ try:
29
+ transcription, summary = whisper_llm.analyze(video.video_url)
30
+ except Exception as e:
31
+ print(f"❌ Whisper failed: {e}")
32
+ continue
33
+
34
+ try:
35
+ pdf_bytes = pdf.generate(transcription, summary)
36
+ except Exception as e:
37
+ print(f"❌ PDF generation failed: {e}")
38
+ continue
39
+
40
+ try:
41
+ pdf_key = f"pdfs/{video.id}.pdf"
42
+ pdf_url = s3.upload_pdf_bytes(pdf_bytes, pdf_key)
43
+ except Exception as e:
44
+ print(f"❌ Upload to S3 failed: {e}")
45
+ continue
46
+
47
+ video.status = "completed"
48
+ video.pdf_url = pdf_url
49
+ video.updated_at = datetime.utcnow()
50
+
51
+ await session.commit()
52
+ print(f"βœ… Completed video {video.id}")
53
+
54
+ except Exception as e:
55
+ print(f"❌ DB error: {e}")
56
+
57
+
58
+ async def run_worker():
59
+ print("πŸš€ Async worker started (Neon)...")
60
+ while True:
61
+ print("πŸ” Checking for pending videos...")
62
+ try:
63
+ await process_pending_videos()
64
+ except Exception as e:
65
+ print(f"❌ Worker loop crashed: {e}")
66
+ await asyncio.sleep(POLL_INTERVAL)
67
+
68
+
69
+ if __name__ == "__main__":
70
+ asyncio.run(run_worker())
worker/gpu_test.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ def check_gpu():
4
+ print("πŸ” Checking CUDA and GPU details...\n")
5
+
6
+ # Check if CUDA is available
7
+ if torch.cuda.is_available():
8
+ device = torch.device("cuda")
9
+ print("βœ… CUDA is available.")
10
+ print(f"πŸ–₯️ GPU Name: {torch.cuda.get_device_name(0)}")
11
+ print(f"πŸ“Š GPU Memory: {round(torch.cuda.get_device_properties(0).total_memory / 1024**3, 2)} GB")
12
+
13
+ # Create a tensor on GPU
14
+ x = torch.rand(1000, 1000).to(device)
15
+ y = torch.mm(x, x)
16
+ print(f"πŸš€ Tensor computation successful on GPU! Tensor shape: {y.shape}")
17
+ else:
18
+ print("❌ CUDA is NOT available. Using CPU fallback.")
19
+ x = torch.rand(1000, 1000)
20
+ y = torch.mm(x, x)
21
+ print(f"βœ… CPU computation done. Tensor shape: {y.shape}")
22
+
23
+ if __name__ == "__main__":
24
+ check_gpu()