import streamlit as st import easyocr from pdf2image import convert_from_bytes from transformers import pipeline, VitsModel, AutoTokenizer, AutoModelForQuestionAnswering import faiss import numpy as np from sentence_transformers import SentenceTransformer import torch import scipy.io.wavfile as wavfile import io import textwrap import time import json from datetime import datetime from collections import defaultdict import nltk from rank_bm25 import BM25Okapi import re # Download required NLTK data try: nltk.data.find('tokenizers/punkt') except LookupError: nltk.download('punkt', quiet=True) # ==================== SESSION STATE INITIALIZATION ==================== if 'analytics' not in st.session_state: st.session_state.analytics = { 'total_queries': 0, 'successful_queries': 0, 'failed_queries': 0, 'avg_confidence': [], 'processing_times': [], 'query_history': [] } # ==================== OCR MODULE ==================== @st.cache_resource def load_ocr_reader(): """Load EasyOCR reader (cached to avoid reloading)""" return easyocr.Reader(['bn', 'en'], gpu=False) @st.cache_data def extract_text_with_easyocr(pdf_file_contents): """Uses EasyOCR for free Bengali text extraction with progress tracking""" reader = load_ocr_reader() images = convert_from_bytes(pdf_file_contents) full_text = "" progress_bar = st.progress(0) status_text = st.empty() for i, img in enumerate(images): img_array = np.array(img) results = reader.readtext(img_array, detail=0, paragraph=True) page_text = " ".join(results) full_text += page_text + "\n" progress = (i + 1) / len(images) progress_bar.progress(progress) status_text.text(f"Processed page {i+1}/{len(images)}") progress_bar.empty() status_text.empty() return full_text # ==================== TTS MODULE ==================== @st.cache_resource def load_tts_model(): """Load Meta's MMS-TTS model for Bengali""" model = VitsModel.from_pretrained("facebook/mms-tts-ben") tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-ben") return model, tokenizer def generate_audio_mms(text, max_length=1000): """Generate audio using Meta MMS-TTS with length limiting""" try: model, tokenizer = load_tts_model() # Limit text length to prevent memory issues if len(text) > max_length: text = text[:max_length] + "..." inputs = tokenizer(text, return_tensors="pt") with torch.no_grad(): output = model(**inputs).waveform waveform = output.squeeze().cpu().numpy() audio_buffer = io.BytesIO() wavfile.write(audio_buffer, rate=16000, data=(waveform * 32767).astype(np.int16)) audio_buffer.seek(0) return audio_buffer except Exception as e: st.error(f"TTS Error: {e}") return None # ==================== ADVANCED CHUNKING MODULE ==================== def semantic_chunk_text(text, max_chunk_size=1000, overlap=100): """ Advanced semantic chunking that respects sentence boundaries. Uses nltk for sentence tokenization. """ # Handle Bengali sentence endings sentences = re.split(r'[।.!?]\s+', text) sentences = [s.strip() + '।' if not s.endswith(('।', '.', '!', '?')) else s.strip() for s in sentences if s.strip()] chunks = [] current_chunk = "" for sentence in sentences: if len(current_chunk) + len(sentence) <= max_chunk_size: current_chunk += " " + sentence else: if current_chunk: chunks.append(current_chunk.strip()) current_chunk = sentence if current_chunk: chunks.append(current_chunk.strip()) # Add overlap between chunks for better context overlapped_chunks = [] for i, chunk in enumerate(chunks): if i > 0 and overlap > 0: # Add last few sentences from previous chunk prev_sentences = chunks[i-1].split('।')[-2:] overlap_text = '।'.join(prev_sentences) overlapped_chunks.append(overlap_text + " " + chunk) else: overlapped_chunks.append(chunk) return overlapped_chunks @st.cache_data def chunk_text_for_reader(text, max_chars=4000): """Simple chunking for text-to-speech""" return textwrap.wrap(text, max_chars, break_long_words=False, replace_whitespace=False) # ==================== HYBRID SEARCH RAG MODULE ==================== @st.cache_resource def setup_hybrid_rag_pipeline(chunks): """ Setup hybrid RAG with both dense (FAISS) and sparse (BM25) retrieval. This significantly improves retrieval accuracy. """ # Dense retrieval with multilingual embeddings embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2') embeddings = embedder.encode(chunks, show_progress_bar=False) dense_index = faiss.IndexFlatL2(embeddings.shape[1]) dense_index.add(np.array(embeddings).astype('float32')) # Sparse retrieval with BM25 tokenized_chunks = [chunk.split() for chunk in chunks] sparse_index = BM25Okapi(tokenized_chunks) return dense_index, sparse_index, embedder def hybrid_search(dense_index, sparse_index, embedder, question, chunks, k=3, alpha=0.5): """ Hybrid search combining dense and sparse retrieval. alpha: weight for dense retrieval (1-alpha for sparse) """ # Dense retrieval question_embedding = embedder.encode([question]) dense_distances, dense_indices = dense_index.search( np.array(question_embedding).astype('float32'), k*2 ) # Sparse retrieval tokenized_question = question.split() sparse_scores = sparse_index.get_scores(tokenized_question) sparse_indices = np.argsort(sparse_scores)[-k*2:][::-1] # Normalize scores dense_scores = 1 / (1 + dense_distances[0]) # Convert distance to similarity dense_scores = dense_scores / np.sum(dense_scores) sparse_scores_norm = sparse_scores[sparse_indices] if np.sum(sparse_scores_norm) > 0: sparse_scores_norm = sparse_scores_norm / np.sum(sparse_scores_norm) # Combine scores combined_scores = {} for idx, score in zip(dense_indices[0], dense_scores): combined_scores[idx] = combined_scores.get(idx, 0) + alpha * score for idx, score in zip(sparse_indices, sparse_scores_norm): combined_scores[idx] = combined_scores.get(idx, 0) + (1 - alpha) * score # Get top k results top_indices = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:k] return [chunks[idx] for idx, _ in top_indices] # ==================== QA MODULE ==================== @st.cache_resource def load_qa_model(): """Load BanglaBERT model for Bengali question answering""" model_name = "csebuetnlp/banglabert" tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForQuestionAnswering.from_pretrained(model_name) qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer) return qa_pipeline # ==================== SUMMARIZATION MODULE ==================== @st.cache_resource def load_summarization_model(): """Load summarization model for Bengali""" try: # Use mT5 for multilingual summarization summarizer = pipeline( "summarization", model="csebuetnlp/mT5_multilingual_XLSum", tokenizer="csebuetnlp/mT5_multilingual_XLSum" ) return summarizer except: return None def generate_summary(text, max_length=200, min_length=50): """Generate document summary""" summarizer = load_summarization_model() if summarizer is None: return "Summarization model not available." try: # Limit input length to avoid memory issues max_input = 1024 if len(text) > max_input: text = text[:max_input] summary = summarizer( text, max_length=max_length, min_length=min_length, do_sample=False ) return summary[0]['summary_text'] except Exception as e: return f"Summarization error: {str(e)}" # ==================== ANALYTICS MODULE ==================== def log_query(question, answer, confidence, processing_time, success=True): """Log query analytics for performance tracking""" st.session_state.analytics['total_queries'] += 1 if success: st.session_state.analytics['successful_queries'] += 1 st.session_state.analytics['avg_confidence'].append(confidence) else: st.session_state.analytics['failed_queries'] += 1 st.session_state.analytics['processing_times'].append(processing_time) st.session_state.analytics['query_history'].append({ 'timestamp': datetime.now().isoformat(), 'question': question, 'answer': answer[:100] + "..." if len(answer) > 100 else answer, 'confidence': confidence, 'processing_time': processing_time, 'success': success }) def display_analytics(): """Display analytics dashboard""" analytics = st.session_state.analytics col1, col2, col3 = st.columns(3) with col1: st.metric("Total Queries", analytics['total_queries']) st.metric("Success Rate", f"{(analytics['successful_queries'] / max(analytics['total_queries'], 1) * 100):.1f}%") with col2: avg_conf = np.mean(analytics['avg_confidence']) if analytics['avg_confidence'] else 0 st.metric("Avg Confidence", f"{avg_conf:.2%}") avg_time = np.mean(analytics['processing_times']) if analytics['processing_times'] else 0 st.metric("Avg Processing Time", f"{avg_time:.2f}s") with col3: st.metric("Successful Queries", analytics['successful_queries']) st.metric("Failed Queries", analytics['failed_queries']) # Query history if analytics['query_history']: st.subheader("Recent Query History") for query in analytics['query_history'][-5:]: with st.expander(f"Q: {query['question'][:50]}... ({query['timestamp']})"): st.write(f"**Answer:** {query['answer']}") st.write(f"**Confidence:** {query['confidence']:.2%}") st.write(f"**Time:** {query['processing_time']:.2f}s") st.write(f"**Status:** {'✅ Success' if query['success'] else '❌ Failed'}") # ==================== EXPORT MODULE ==================== def export_results(text, summaries, qa_pairs): """Export results as JSON""" export_data = { 'export_date': datetime.now().isoformat(), 'full_text': text, 'summaries': summaries, 'qa_pairs': qa_pairs, 'analytics': st.session_state.analytics } return json.dumps(export_data, indent=2, ensure_ascii=False) # ==================== MAIN STREAMLIT APP ==================== st.set_page_config( page_title="Bengali PDF Assistant - Research Edition", page_icon="🎓", layout="wide", initial_sidebar_state="expanded" ) # Sidebar configuration with st.sidebar: st.header("⚙️ Configuration") st.subheader("Model Settings") rag_k = st.slider("Number of context chunks", 1, 5, 3) rag_alpha = st.slider("Dense/Sparse balance", 0.0, 1.0, 0.5, 0.1, help="0 = Only sparse (BM25), 1 = Only dense (embeddings)") st.subheader("Audio Settings") audio_speed = st.select_slider("Audio chunks per page", options=[2000, 3000, 4000, 5000], value=4000) st.subheader("About") st.info(""" **Research-Grade Features:** - 🔬 Hybrid RAG (Dense + Sparse) - 📊 Performance Analytics - 🎯 Semantic Chunking - 📝 Document Summarization - 💾 Export Capabilities - 📈 Confidence Scoring """) # Main title st.title("🎓 Bengali PDF Assistant - Research Edition") st.markdown(""" **Advanced NLP Pipeline for Bengali Document Analysis** *Features: Hybrid RAG • Meta MMS-TTS • BanglaBERT QA • Document Summarization • Analytics* """) uploaded_file = st.file_uploader("📄 Upload Bengali PDF Document", type="pdf") if uploaded_file: start_time = time.time() with st.spinner("🔬 Analyzing document with advanced NLP pipeline..."): file_contents = uploaded_file.getvalue() try: # Extract text full_text = extract_text_with_easyocr(file_contents) # Semantic chunking for better context reader_chunks = chunk_text_for_reader(full_text, max_chars=audio_speed) rag_chunks = semantic_chunk_text(full_text, max_chunk_size=1000, overlap=100) # Setup hybrid RAG pipeline dense_idx, sparse_idx, embedder = setup_hybrid_rag_pipeline(rag_chunks) processing_time = time.time() - start_time st.success(f"✅ Document processed in {processing_time:.2f}s!") st.info(f"📊 Extracted {len(full_text)} characters • {len(rag_chunks)} semantic chunks • {len(reader_chunks)} audio segments") except Exception as e: st.error(f"❌ Processing Error: {e}") st.stop() # Document preview with st.expander("📄 Document Preview & Metadata"): col1, col2 = st.columns(2) with col1: st.text_area("Text Preview (first 500 chars)", full_text[:500], height=150) with col2: st.json({ "total_characters": len(full_text), "total_words": len(full_text.split()), "semantic_chunks": len(rag_chunks), "audio_segments": len(reader_chunks), "processing_time": f"{processing_time:.2f}s" }) # Main tabs tab1, tab2, tab3, tab4 = st.tabs([ "📖 Read Aloud", "💬 Q&A System", "📝 Summarization", "📊 Analytics & Export" ]) # ==================== TAB 1: READ ALOUD ==================== with tab1: st.header("🎙️ Text-to-Speech") st.caption("Using Meta MMS-TTS for natural Bengali speech synthesis") col1, col2 = st.columns([3, 1]) with col1: selected_mode = st.radio( "Select mode:", ["Generate All Audio", "Generate Specific Segment"], horizontal=True ) if selected_mode == "Generate All Audio": if st.button("🎵 Generate Complete Audio", type="primary"): if not full_text.strip(): st.warning("⚠️ No text found in document.") else: progress_bar = st.progress(0) for i, chunk in enumerate(reader_chunks): try: audio_buffer = generate_audio_mms(chunk) if audio_buffer: st.write(f"**Segment {i + 1} / {len(reader_chunks)}**") st.audio(audio_buffer, format="audio/wav") progress_bar.progress((i + 1) / len(reader_chunks)) except Exception as e: st.error(f"❌ Error in segment {i+1}: {e}") progress_bar.empty() st.success("✅ All audio segments generated!") else: segment_num = st.number_input( "Select segment number:", min_value=1, max_value=len(reader_chunks), value=1 ) if st.button("🎵 Generate Selected Segment"): try: chunk = reader_chunks[segment_num - 1] st.text_area("Segment text:", chunk, height=100) audio_buffer = generate_audio_mms(chunk) if audio_buffer: st.audio(audio_buffer, format="audio/wav") except Exception as e: st.error(f"❌ Error: {e}") # ==================== TAB 2: Q&A SYSTEM ==================== with tab2: st.header("💡 Question Answering System") st.caption("Hybrid RAG with BanglaBERT • Combining dense embeddings + sparse retrieval") question = st.text_input("🔍 Ask a question about your document:", key="qa_input") if question: query_start_time = time.time() with st.spinner("🔬 Processing query with hybrid search..."): try: # Hybrid search relevant_chunks = hybrid_search( dense_idx, sparse_idx, embedder, question, rag_chunks, k=rag_k, alpha=rag_alpha ) context = "\n---\n".join(relevant_chunks) # Get answer using BanglaBERT qa_model = load_qa_model() result = qa_model(question=question, context=context) answer_text = result['answer'] confidence = result['score'] query_time = time.time() - query_start_time # Display results st.subheader("💡 Answer") st.markdown(f"> {answer_text}") col1, col2, col3 = st.columns(3) with col1: st.metric("Confidence", f"{confidence:.2%}") with col2: st.metric("Processing Time", f"{query_time:.2f}s") with col3: st.metric("Context Chunks", len(relevant_chunks)) # Audio for answer st.write("🔊 **Listen to Answer:**") audio_buffer = generate_audio_mms(answer_text) if audio_buffer: st.audio(audio_buffer, format="audio/wav") # Show retrieved context with st.expander("📚 Retrieved Context"): for i, chunk in enumerate(relevant_chunks): st.markdown(f"**Chunk {i+1}:**") st.text_area(f"Context {i+1}", chunk, height=100, key=f"context_{i}") # Log analytics log_query(question, answer_text, confidence, query_time, success=True) except Exception as e: query_time = time.time() - query_start_time st.error(f"❌ Error: {e}") st.write("**Retrieved Context (fallback):**") st.text_area("Context", context[:500] if 'context' in locals() else "No context retrieved", height=150) log_query(question, str(e), 0.0, query_time, success=False) # ==================== TAB 3: SUMMARIZATION ==================== with tab3: st.header("📝 Document Summarization") st.caption("Using mT5 for Bengali text summarization") col1, col2 = st.columns(2) with col1: summary_length = st.select_slider( "Summary length:", options=["Short", "Medium", "Long"], value="Medium" ) length_map = {"Short": (30, 100), "Medium": (50, 200), "Long": (100, 300)} min_len, max_len = length_map[summary_length] if st.button("📄 Generate Summary", type="primary"): with st.spinner("🔬 Generating summary..."): summary_start = time.time() # Generate summary for full text or first portion text_to_summarize = full_text[:2000] if len(full_text) > 2000 else full_text summary = generate_summary(text_to_summarize, max_length=max_len, min_length=min_len) summary_time = time.time() - summary_start st.subheader("📋 Summary") st.markdown(f"> {summary}") st.metric("Generation Time", f"{summary_time:.2f}s") # Audio for summary st.write("🔊 **Listen to Summary:**") audio_buffer = generate_audio_mms(summary) if audio_buffer: st.audio(audio_buffer, format="audio/wav") # Summary statistics with st.expander("📊 Summary Statistics"): st.json({ "original_length": len(full_text), "summary_length": len(summary), "compression_ratio": f"{(len(summary) / len(full_text) * 100):.1f}%", "generation_time": f"{summary_time:.2f}s" }) # ==================== TAB 4: ANALYTICS & EXPORT ==================== with tab4: st.header("📊 Performance Analytics") display_analytics() st.divider() st.header("💾 Export Results") col1, col2 = st.columns(2) with col1: if st.button("📥 Export Session Data (JSON)"): export_data = export_results( full_text[:1000] + "..." if len(full_text) > 1000 else full_text, [], [] ) st.download_button( label="Download JSON", data=export_data, file_name=f"bengali_pdf_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json", mime="application/json" ) with col2: if st.button("📄 Export Full Text"): st.download_button( label="Download Text", data=full_text, file_name=f"extracted_text_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt", mime="text/plain" ) st.divider() st.header("🔬 Technical Details") with st.expander("Pipeline Configuration"): st.code(f""" OCR: EasyOCR (Bengali + English) TTS: facebook/mms-tts-ben (Meta MMS-TTS) Embeddings: paraphrase-multilingual-MiniLM-L12-v2 QA Model: csebuetnlp/banglabert Summarization: csebuetnlp/mT5_multilingual_XLSum Dense Index: FAISS (L2) Sparse Index: BM25Okapi Chunking: Semantic (sentence-aware) Hybrid Search Alpha: {rag_alpha} Context Chunks (k): {rag_k} """, language="yaml") else: # Landing page when no file uploaded st.info("👆 Upload a Bengali PDF document to get started") col1, col2, col3 = st.columns(3) with col1: st.subheader("🎯 Features") st.markdown(""" - Advanced OCR for Bengali - Natural TTS synthesis - Hybrid RAG search - Question answering - Document summarization """) with col2: st.subheader("🔬 Research Tools") st.markdown(""" - Performance analytics - Confidence scoring - Processing metrics - Query history - Export capabilities """) with col3: st.subheader("🚀 Technologies") st.markdown(""" - EasyOCR - Meta MMS-TTS - BanglaBERT - FAISS + BM25 - mT5 Summarization """) # Footer st.divider() st.caption("🎓 Bengali PDF Assistant - Research Edition | Built for academic research & accessibility")