# bangla-pdf-assistant / app_without_surya.py
import streamlit as st
import easyocr
from pdf2image import convert_from_bytes
from transformers import pipeline, VitsModel, AutoTokenizer, AutoModelForQuestionAnswering
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
import scipy.io.wavfile as wavfile
import io
import textwrap
import time
import json
from datetime import datetime
from collections import defaultdict
import nltk
from rank_bm25 import BM25Okapi
import re
# Download required NLTK data
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt', quiet=True)
# ==================== SESSION STATE INITIALIZATION ====================
if 'analytics' not in st.session_state:
st.session_state.analytics = {
'total_queries': 0,
'successful_queries': 0,
'failed_queries': 0,
'avg_confidence': [],
'processing_times': [],
'query_history': []
}
# ==================== OCR MODULE ====================
@st.cache_resource
def load_ocr_reader():
"""Load EasyOCR reader (cached to avoid reloading)"""
return easyocr.Reader(['bn', 'en'], gpu=False)
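# Note: the ['bn', 'en'] language pair lets EasyOCR handle documents that mix Bengali and
# English; gpu=False keeps inference on the CPU, which is slower but works on hosts without CUDA.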
@st.cache_data
def extract_text_with_easyocr(pdf_file_contents):
"""Uses EasyOCR for free Bengali text extraction with progress tracking"""
reader = load_ocr_reader()
images = convert_from_bytes(pdf_file_contents)
full_text = ""
progress_bar = st.progress(0)
status_text = st.empty()
for i, img in enumerate(images):
img_array = np.array(img)
results = reader.readtext(img_array, detail=0, paragraph=True)
page_text = " ".join(results)
full_text += page_text + "\n"
progress = (i + 1) / len(images)
progress_bar.progress(progress)
status_text.text(f"Processed page {i+1}/{len(images)}")
progress_bar.empty()
status_text.empty()
return full_text
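# pdf2image's convert_from_bytes relies on the Poppler utilities (e.g. the poppler-utils
# package on Debian/Ubuntu), so Poppler must be installed on the host. Each rasterized page is
# passed to EasyOCR as a NumPy array; detail=0 with paragraph=True returns plain paragraph
# strings rather than bounding boxes.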
# ==================== TTS MODULE ====================
@st.cache_resource
def load_tts_model():
"""Load Meta's MMS-TTS model for Bengali"""
model = VitsModel.from_pretrained("facebook/mms-tts-ben")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-ben")
return model, tokenizer
def generate_audio_mms(text, max_length=1000):
"""Generate audio using Meta MMS-TTS with length limiting"""
try:
model, tokenizer = load_tts_model()
# Limit text length to prevent memory issues
if len(text) > max_length:
text = text[:max_length] + "..."
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
output = model(**inputs).waveform
waveform = output.squeeze().cpu().numpy()
audio_buffer = io.BytesIO()
wavfile.write(audio_buffer, rate=16000, data=(waveform * 32767).astype(np.int16))
audio_buffer.seek(0)
return audio_buffer
except Exception as e:
st.error(f"TTS Error: {e}")
return None
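# The hard-coded 16 kHz sample rate matches facebook/mms-tts-ben's output. If a different VITS
# checkpoint is swapped in, read the rate from model.config.sampling_rate instead. The model
# emits a float waveform in [-1, 1], which is scaled to int16 so scipy can write standard PCM WAV.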
# ==================== ADVANCED CHUNKING MODULE ====================
def semantic_chunk_text(text, max_chunk_size=1000, overlap=100):
"""
Advanced semantic chunking that respects sentence boundaries.
Uses nltk for sentence tokenization.
"""
# Handle Bengali sentence endings
sentences = re.split(r'[ΰ₯€.!?]\s+', text)
sentences = [s.strip() + 'ΰ₯€' if not s.endswith(('ΰ₯€', '.', '!', '?')) else s.strip()
for s in sentences if s.strip()]
chunks = []
current_chunk = ""
for sentence in sentences:
if len(current_chunk) + len(sentence) <= max_chunk_size:
current_chunk += " " + sentence
else:
if current_chunk:
chunks.append(current_chunk.strip())
current_chunk = sentence
if current_chunk:
chunks.append(current_chunk.strip())
# Add overlap between chunks for better context
overlapped_chunks = []
for i, chunk in enumerate(chunks):
if i > 0 and overlap > 0:
# Add last few sentences from previous chunk
            prev_sentences = chunks[i-1].split('।')[-2:]
            overlap_text = '।'.join(prev_sentences)
overlapped_chunks.append(overlap_text + " " + chunk)
else:
overlapped_chunks.append(chunk)
return overlapped_chunks
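# Overlap behaviour: when overlap > 0, chunk i is prefixed with the last two sentences
# (split on ।) of chunk i-1, so an answer that straddles a chunk boundary still appears whole
# in at least one retrieved chunk. The numeric value of `overlap` only acts as an on/off switch
# here; the overlap size itself is fixed at two sentences.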
@st.cache_data
def chunk_text_for_reader(text, max_chars=4000):
"""Simple chunking for text-to-speech"""
return textwrap.wrap(text, max_chars, break_long_words=False, replace_whitespace=False)
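# textwrap.wrap splits on whitespace: break_long_words=False keeps unbroken runs intact (they may
# exceed max_chars), and replace_whitespace=False preserves the newlines inserted between pages.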
# ==================== HYBRID SEARCH RAG MODULE ====================
@st.cache_resource
def setup_hybrid_rag_pipeline(chunks):
"""
Setup hybrid RAG with both dense (FAISS) and sparse (BM25) retrieval.
This significantly improves retrieval accuracy.
"""
# Dense retrieval with multilingual embeddings
embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
embeddings = embedder.encode(chunks, show_progress_bar=False)
dense_index = faiss.IndexFlatL2(embeddings.shape[1])
dense_index.add(np.array(embeddings).astype('float32'))
# Sparse retrieval with BM25
tokenized_chunks = [chunk.split() for chunk in chunks]
sparse_index = BM25Okapi(tokenized_chunks)
return dense_index, sparse_index, embedder
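# IndexFlatL2 does exact, brute-force L2 search, which is plenty fast for the few hundred chunks
# a single PDF yields. BM25Okapi is built over whitespace-tokenized chunks, so the sparse side
# needs no Bengali-specific tokenizer.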
def hybrid_search(dense_index, sparse_index, embedder, question, chunks, k=3, alpha=0.5):
"""
Hybrid search combining dense and sparse retrieval.
alpha: weight for dense retrieval (1-alpha for sparse)
"""
# Dense retrieval
question_embedding = embedder.encode([question])
dense_distances, dense_indices = dense_index.search(
np.array(question_embedding).astype('float32'), k*2
)
# Sparse retrieval
tokenized_question = question.split()
sparse_scores = sparse_index.get_scores(tokenized_question)
sparse_indices = np.argsort(sparse_scores)[-k*2:][::-1]
# Normalize scores
dense_scores = 1 / (1 + dense_distances[0]) # Convert distance to similarity
dense_scores = dense_scores / np.sum(dense_scores)
sparse_scores_norm = sparse_scores[sparse_indices]
if np.sum(sparse_scores_norm) > 0:
sparse_scores_norm = sparse_scores_norm / np.sum(sparse_scores_norm)
# Combine scores
combined_scores = {}
for idx, score in zip(dense_indices[0], dense_scores):
combined_scores[idx] = combined_scores.get(idx, 0) + alpha * score
for idx, score in zip(sparse_indices, sparse_scores_norm):
combined_scores[idx] = combined_scores.get(idx, 0) + (1 - alpha) * score
# Get top k results
top_indices = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:k]
return [chunks[idx] for idx, _ in top_indices]
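# Score fusion used above, per candidate chunk idx:
#   dense_sim(idx)  = 1 / (1 + L2_distance(idx)), normalized over the dense candidates
#   sparse_sim(idx) = BM25(idx),                  normalized over the sparse candidates
#   combined(idx)   = alpha * dense_sim(idx) + (1 - alpha) * sparse_sim(idx)
# Each retriever proposes k*2 candidates, and the top k combined scores are returned.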
# ==================== QA MODULE ====================
@st.cache_resource
def load_qa_model():
"""Load BanglaBERT model for Bengali question answering"""
model_name = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer)
return qa_pipeline
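# Note: csebuetnlp/banglabert is a pretrained encoder; if the checkpoint does not include a
# fine-tuned QA head, transformers will initialize one randomly and log a warning, so the
# reported confidence scores are best treated as relative rather than calibrated values.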
# ==================== SUMMARIZATION MODULE ====================
@st.cache_resource
def load_summarization_model():
"""Load summarization model for Bengali"""
try:
# Use mT5 for multilingual summarization
summarizer = pipeline(
"summarization",
model="csebuetnlp/mT5_multilingual_XLSum",
tokenizer="csebuetnlp/mT5_multilingual_XLSum"
)
return summarizer
    except Exception:
return None
def generate_summary(text, max_length=200, min_length=50):
"""Generate document summary"""
summarizer = load_summarization_model()
if summarizer is None:
return "Summarization model not available."
try:
# Limit input length to avoid memory issues
max_input = 1024
if len(text) > max_input:
text = text[:max_input]
summary = summarizer(
text,
max_length=max_length,
min_length=min_length,
do_sample=False
)
return summary[0]['summary_text']
except Exception as e:
return f"Summarization error: {str(e)}"
# ==================== ANALYTICS MODULE ====================
def log_query(question, answer, confidence, processing_time, success=True):
"""Log query analytics for performance tracking"""
st.session_state.analytics['total_queries'] += 1
if success:
st.session_state.analytics['successful_queries'] += 1
st.session_state.analytics['avg_confidence'].append(confidence)
else:
st.session_state.analytics['failed_queries'] += 1
st.session_state.analytics['processing_times'].append(processing_time)
st.session_state.analytics['query_history'].append({
'timestamp': datetime.now().isoformat(),
'question': question,
'answer': answer[:100] + "..." if len(answer) > 100 else answer,
'confidence': confidence,
'processing_time': processing_time,
'success': success
})
def display_analytics():
"""Display analytics dashboard"""
analytics = st.session_state.analytics
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Total Queries", analytics['total_queries'])
st.metric("Success Rate",
f"{(analytics['successful_queries'] / max(analytics['total_queries'], 1) * 100):.1f}%")
with col2:
avg_conf = np.mean(analytics['avg_confidence']) if analytics['avg_confidence'] else 0
st.metric("Avg Confidence", f"{avg_conf:.2%}")
avg_time = np.mean(analytics['processing_times']) if analytics['processing_times'] else 0
st.metric("Avg Processing Time", f"{avg_time:.2f}s")
with col3:
st.metric("Successful Queries", analytics['successful_queries'])
st.metric("Failed Queries", analytics['failed_queries'])
# Query history
if analytics['query_history']:
st.subheader("Recent Query History")
for query in analytics['query_history'][-5:]:
with st.expander(f"Q: {query['question'][:50]}... ({query['timestamp']})"):
st.write(f"**Answer:** {query['answer']}")
st.write(f"**Confidence:** {query['confidence']:.2%}")
st.write(f"**Time:** {query['processing_time']:.2f}s")
st.write(f"**Status:** {'βœ… Success' if query['success'] else '❌ Failed'}")
# ==================== EXPORT MODULE ====================
def export_results(text, summaries, qa_pairs):
"""Export results as JSON"""
export_data = {
'export_date': datetime.now().isoformat(),
'full_text': text,
'summaries': summaries,
'qa_pairs': qa_pairs,
'analytics': st.session_state.analytics
}
return json.dumps(export_data, indent=2, ensure_ascii=False)
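# ensure_ascii=False keeps Bengali text readable in the exported JSON instead of escaping every
# character as \uXXXX sequences.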
# ==================== MAIN STREAMLIT APP ====================
st.set_page_config(
page_title="Bengali PDF Assistant - Research Edition",
    page_icon="🎓",
layout="wide",
initial_sidebar_state="expanded"
)
# Sidebar configuration
with st.sidebar:
    st.header("⚙️ Configuration")
st.subheader("Model Settings")
rag_k = st.slider("Number of context chunks", 1, 5, 3)
rag_alpha = st.slider("Dense/Sparse balance", 0.0, 1.0, 0.5, 0.1,
help="0 = Only sparse (BM25), 1 = Only dense (embeddings)")
st.subheader("Audio Settings")
    audio_speed = st.select_slider("Max characters per audio segment", options=[2000, 3000, 4000, 5000], value=4000)
st.subheader("About")
    st.info("""
    **Research-Grade Features:**
    - 🔬 Hybrid RAG (Dense + Sparse)
    - 📊 Performance Analytics
    - 🎯 Semantic Chunking
    - 📝 Document Summarization
    - 💾 Export Capabilities
    - 📈 Confidence Scoring
    """)
# Main title
st.title("🎓 Bengali PDF Assistant - Research Edition")
st.markdown("""
**Advanced NLP Pipeline for Bengali Document Analysis**
*Features: Hybrid RAG • Meta MMS-TTS • BanglaBERT QA • Document Summarization • Analytics*
""")
uploaded_file = st.file_uploader("📄 Upload Bengali PDF Document", type="pdf")
if uploaded_file:
start_time = time.time()
    with st.spinner("🔬 Analyzing document with advanced NLP pipeline..."):
file_contents = uploaded_file.getvalue()
try:
# Extract text
full_text = extract_text_with_easyocr(file_contents)
# Semantic chunking for better context
reader_chunks = chunk_text_for_reader(full_text, max_chars=audio_speed)
rag_chunks = semantic_chunk_text(full_text, max_chunk_size=1000, overlap=100)
# Setup hybrid RAG pipeline
dense_idx, sparse_idx, embedder = setup_hybrid_rag_pipeline(rag_chunks)
processing_time = time.time() - start_time
            st.success(f"✅ Document processed in {processing_time:.2f}s!")
            st.info(f"📊 Extracted {len(full_text)} characters • {len(rag_chunks)} semantic chunks • {len(reader_chunks)} audio segments")
except Exception as e:
st.error(f"❌ Processing Error: {e}")
st.stop()
# Document preview
    with st.expander("📄 Document Preview & Metadata"):
col1, col2 = st.columns(2)
with col1:
st.text_area("Text Preview (first 500 chars)", full_text[:500], height=150)
with col2:
st.json({
"total_characters": len(full_text),
"total_words": len(full_text.split()),
"semantic_chunks": len(rag_chunks),
"audio_segments": len(reader_chunks),
"processing_time": f"{processing_time:.2f}s"
})
# Main tabs
    tab1, tab2, tab3, tab4 = st.tabs([
        "📖 Read Aloud",
        "💬 Q&A System",
        "📝 Summarization",
        "📊 Analytics & Export"
    ])
# ==================== TAB 1: READ ALOUD ====================
with tab1:
        st.header("🎙️ Text-to-Speech")
st.caption("Using Meta MMS-TTS for natural Bengali speech synthesis")
col1, col2 = st.columns([3, 1])
with col1:
selected_mode = st.radio(
"Select mode:",
["Generate All Audio", "Generate Specific Segment"],
horizontal=True
)
if selected_mode == "Generate All Audio":
if st.button("🎡 Generate Complete Audio", type="primary"):
if not full_text.strip():
st.warning("⚠️ No text found in document.")
else:
progress_bar = st.progress(0)
for i, chunk in enumerate(reader_chunks):
try:
audio_buffer = generate_audio_mms(chunk)
if audio_buffer:
st.write(f"**Segment {i + 1} / {len(reader_chunks)}**")
st.audio(audio_buffer, format="audio/wav")
progress_bar.progress((i + 1) / len(reader_chunks))
except Exception as e:
st.error(f"❌ Error in segment {i+1}: {e}")
progress_bar.empty()
                    st.success("✅ All audio segments generated!")
else:
segment_num = st.number_input(
"Select segment number:",
min_value=1,
max_value=len(reader_chunks),
value=1
)
if st.button("🎡 Generate Selected Segment"):
try:
chunk = reader_chunks[segment_num - 1]
st.text_area("Segment text:", chunk, height=100)
audio_buffer = generate_audio_mms(chunk)
if audio_buffer:
st.audio(audio_buffer, format="audio/wav")
except Exception as e:
st.error(f"❌ Error: {e}")
# ==================== TAB 2: Q&A SYSTEM ====================
with tab2:
        st.header("💡 Question Answering System")
        st.caption("Hybrid RAG with BanglaBERT • Combining dense embeddings + sparse retrieval")
        question = st.text_input("🔍 Ask a question about your document:", key="qa_input")
if question:
query_start_time = time.time()
            with st.spinner("🔬 Processing query with hybrid search..."):
try:
# Hybrid search
relevant_chunks = hybrid_search(
dense_idx, sparse_idx, embedder, question, rag_chunks,
k=rag_k, alpha=rag_alpha
)
context = "\n---\n".join(relevant_chunks)
# Get answer using BanglaBERT
qa_model = load_qa_model()
result = qa_model(question=question, context=context)
answer_text = result['answer']
confidence = result['score']
query_time = time.time() - query_start_time
# Display results
                    st.subheader("💡 Answer")
st.markdown(f"> {answer_text}")
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Confidence", f"{confidence:.2%}")
with col2:
st.metric("Processing Time", f"{query_time:.2f}s")
with col3:
st.metric("Context Chunks", len(relevant_chunks))
# Audio for answer
                    st.write("🔊 **Listen to Answer:**")
audio_buffer = generate_audio_mms(answer_text)
if audio_buffer:
st.audio(audio_buffer, format="audio/wav")
# Show retrieved context
                    with st.expander("📚 Retrieved Context"):
for i, chunk in enumerate(relevant_chunks):
st.markdown(f"**Chunk {i+1}:**")
st.text_area(f"Context {i+1}", chunk, height=100, key=f"context_{i}")
# Log analytics
log_query(question, answer_text, confidence, query_time, success=True)
except Exception as e:
query_time = time.time() - query_start_time
st.error(f"❌ Error: {e}")
st.write("**Retrieved Context (fallback):**")
st.text_area("Context", context[:500] if 'context' in locals() else "No context retrieved", height=150)
log_query(question, str(e), 0.0, query_time, success=False)
# ==================== TAB 3: SUMMARIZATION ====================
with tab3:
        st.header("📝 Document Summarization")
st.caption("Using mT5 for Bengali text summarization")
col1, col2 = st.columns(2)
with col1:
summary_length = st.select_slider(
"Summary length:",
options=["Short", "Medium", "Long"],
value="Medium"
)
length_map = {"Short": (30, 100), "Medium": (50, 200), "Long": (100, 300)}
min_len, max_len = length_map[summary_length]
        if st.button("📄 Generate Summary", type="primary"):
            with st.spinner("🔬 Generating summary..."):
summary_start = time.time()
# Generate summary for full text or first portion
text_to_summarize = full_text[:2000] if len(full_text) > 2000 else full_text
summary = generate_summary(text_to_summarize, max_length=max_len, min_length=min_len)
summary_time = time.time() - summary_start
                st.subheader("📋 Summary")
st.markdown(f"> {summary}")
st.metric("Generation Time", f"{summary_time:.2f}s")
# Audio for summary
                st.write("🔊 **Listen to Summary:**")
audio_buffer = generate_audio_mms(summary)
if audio_buffer:
st.audio(audio_buffer, format="audio/wav")
# Summary statistics
                with st.expander("📊 Summary Statistics"):
st.json({
"original_length": len(full_text),
"summary_length": len(summary),
"compression_ratio": f"{(len(summary) / len(full_text) * 100):.1f}%",
"generation_time": f"{summary_time:.2f}s"
})
# ==================== TAB 4: ANALYTICS & EXPORT ====================
with tab4:
        st.header("📊 Performance Analytics")
display_analytics()
st.divider()
        st.header("💾 Export Results")
col1, col2 = st.columns(2)
with col1:
            if st.button("📥 Export Session Data (JSON)"):
export_data = export_results(
full_text[:1000] + "..." if len(full_text) > 1000 else full_text,
[],
[]
)
st.download_button(
label="Download JSON",
data=export_data,
file_name=f"bengali_pdf_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
mime="application/json"
)
with col2:
            if st.button("📄 Export Full Text"):
st.download_button(
label="Download Text",
data=full_text,
file_name=f"extracted_text_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
mime="text/plain"
)
st.divider()
        st.header("🔬 Technical Details")
with st.expander("Pipeline Configuration"):
st.code(f"""
OCR: EasyOCR (Bengali + English)
TTS: facebook/mms-tts-ben (Meta MMS-TTS)
Embeddings: paraphrase-multilingual-MiniLM-L12-v2
QA Model: csebuetnlp/banglabert
Summarization: csebuetnlp/mT5_multilingual_XLSum
Dense Index: FAISS (L2)
Sparse Index: BM25Okapi
Chunking: Semantic (sentence-aware)
Hybrid Search Alpha: {rag_alpha}
Context Chunks (k): {rag_k}
""", language="yaml")
else:
# Landing page when no file uploaded
    st.info("👆 Upload a Bengali PDF document to get started")
col1, col2, col3 = st.columns(3)
with col1:
st.subheader("🎯 Features")
st.markdown("""
- Advanced OCR for Bengali
- Natural TTS synthesis
- Hybrid RAG search
- Question answering
- Document summarization
""")
with col2:
        st.subheader("🔬 Research Tools")
st.markdown("""
- Performance analytics
- Confidence scoring
- Processing metrics
- Query history
- Export capabilities
""")
with col3:
        st.subheader("🚀 Technologies")
st.markdown("""
- EasyOCR
- Meta MMS-TTS
- BanglaBERT
- FAISS + BM25
- mT5 Summarization
""")
# Footer
st.divider()
st.caption("🎓 Bengali PDF Assistant - Research Edition | Built for academic research & accessibility")