Spaces:
Sleeping
Sleeping
# --- Standard library ---
import io
import json
import re
import textwrap
import time
from collections import defaultdict
from datetime import datetime

# --- Third-party ---
import easyocr
import faiss
import nltk
import numpy as np
import scipy.io.wavfile as wavfile
import streamlit as st
import torch
from pdf2image import convert_from_bytes
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from transformers import pipeline, VitsModel, AutoTokenizer, AutoModelForQuestionAnswering

# Download required NLTK data (sentence tokenizer) once, quietly.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)
# ==================== SESSION STATE INITIALIZATION ====================
# Streamlit reruns this script on every interaction; setdefault seeds the
# analytics store exactly once per browser session.
st.session_state.setdefault('analytics', {
    'total_queries': 0,
    'successful_queries': 0,
    'failed_queries': 0,
    'avg_confidence': [],
    'processing_times': [],
    'query_history': [],
})
# ==================== OCR MODULE ====================
@st.cache_resource
def load_ocr_reader():
    """Load and cache the EasyOCR reader for Bengali + English (CPU).

    The docstring previously claimed the reader was "cached to avoid
    reloading", but no caching was applied; without @st.cache_resource
    Streamlit rebuilds the heavy OCR model on every script rerun.
    """
    return easyocr.Reader(['bn', 'en'], gpu=False)
def extract_text_with_easyocr(pdf_file_contents):
    """Uses EasyOCR for free Bengali text extraction with progress tracking"""
    ocr = load_ocr_reader()
    pages = convert_from_bytes(pdf_file_contents)
    n_pages = len(pages)
    progress_bar = st.progress(0)
    status_text = st.empty()
    page_texts = []
    for page_no, page_img in enumerate(pages, start=1):
        # detail=0 returns plain strings; paragraph=True merges nearby lines.
        lines = ocr.readtext(np.array(page_img), detail=0, paragraph=True)
        page_texts.append(" ".join(lines))
        progress_bar.progress(page_no / n_pages)
        status_text.text(f"Processed page {page_no}/{n_pages}")
    progress_bar.empty()
    status_text.empty()
    # One trailing newline per page, matching the original accumulation.
    return "\n".join(page_texts) + "\n" if page_texts else ""
# ==================== TTS MODULE ====================
@st.cache_resource
def load_tts_model():
    """Load and cache Meta's MMS-TTS model + tokenizer for Bengali.

    Returns:
        (VitsModel, tokenizer) tuple. Cached with @st.cache_resource so the
        multi-hundred-MB checkpoint is not re-downloaded/re-built on every
        Streamlit rerun (this function is called per audio segment).
    """
    model = VitsModel.from_pretrained("facebook/mms-tts-ben")
    tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-ben")
    return model, tokenizer
def generate_audio_mms(text, max_length=1000):
    """Generate audio using Meta MMS-TTS with length limiting"""
    try:
        tts_model, tts_tokenizer = load_tts_model()
        # Truncate overly long inputs to keep memory bounded.
        speak_text = text if len(text) <= max_length else text[:max_length] + "..."
        inputs = tts_tokenizer(speak_text, return_tensors="pt")
        with torch.no_grad():
            waveform = tts_model(**inputs).waveform
        samples = waveform.squeeze().cpu().numpy()
        # MMS-TTS emits 16 kHz float audio; scale to 16-bit PCM for WAV.
        buf = io.BytesIO()
        wavfile.write(buf, rate=16000, data=(samples * 32767).astype(np.int16))
        buf.seek(0)
        return buf
    except Exception as e:
        st.error(f"TTS Error: {e}")
        return None
# ==================== ADVANCED CHUNKING MODULE ====================
def semantic_chunk_text(text, max_chunk_size=1000, overlap=100):
    """
    Split text into sentence-aligned chunks of roughly max_chunk_size chars.

    Sentences are detected with a regex over Bengali danda and ASCII
    terminators (the old docstring claimed nltk, which was never used here).
    When overlap > 0, each chunk after the first is prefixed with the last
    couple of danda-separated sentences of the previous chunk for context.

    Fixes vs. the previous version:
    - Split AFTER a terminator (lookbehind) instead of ON it, so the
      terminator stays attached; previously every sentence had its real
      ending ('.', '!', '?') stripped and a danda blindly appended.
    - Use the real danda character U+0964 instead of its mojibake form.
    """
    # Split on whitespace that follows a sentence terminator.
    raw_sentences = re.split(r'(?<=[।.!?])\s+', text)
    sentences = []
    for s in raw_sentences:
        s = s.strip()
        if not s:
            continue
        # Add a danda only when the fragment carries no terminator at all.
        if not s.endswith(('।', '.', '!', '?')):
            s += '।'
        sentences.append(s)
    # Greedy packing: add sentences while the chunk stays within budget.
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_chunk_size:
            current_chunk += " " + sentence
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk.strip())
    # Add overlap between chunks for better retrieval context.
    overlapped_chunks = []
    for i, chunk in enumerate(chunks):
        if i > 0 and overlap > 0:
            # Carry over the last (up to) two sentences of the previous chunk.
            prev_sentences = chunks[i - 1].split('।')[-2:]
            overlap_text = '।'.join(prev_sentences)
            overlapped_chunks.append(overlap_text + " " + chunk)
        else:
            overlapped_chunks.append(chunk)
    return overlapped_chunks
def chunk_text_for_reader(text, max_chars=4000):
    """Split text into fixed-width segments for text-to-speech playback."""
    # Keep whole words together and preserve original whitespace characters.
    wrapper = textwrap.TextWrapper(width=max_chars,
                                   break_long_words=False,
                                   replace_whitespace=False)
    return wrapper.wrap(text)
# ==================== HYBRID SEARCH RAG MODULE ====================
def setup_hybrid_rag_pipeline(chunks):
    """
    Build both retrieval indexes over the given text chunks:
    a dense FAISS L2 index of multilingual sentence embeddings and a
    sparse BM25 index over whitespace tokens. Returns
    (dense_index, sparse_index, embedder).
    """
    # Dense side: multilingual MiniLM embeddings in a flat L2 index.
    embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    vectors = np.array(embedder.encode(chunks, show_progress_bar=False)).astype('float32')
    dense_index = faiss.IndexFlatL2(vectors.shape[1])
    dense_index.add(vectors)
    # Sparse side: BM25 over naive whitespace tokenisation.
    sparse_index = BM25Okapi([c.split() for c in chunks])
    return dense_index, sparse_index, embedder
def hybrid_search(dense_index, sparse_index, embedder, question, chunks, k=3, alpha=0.5):
    """
    Hybrid retrieval blending dense (embedding) and sparse (BM25) scores.

    alpha weights the dense side, (1 - alpha) the sparse side.
    Each retriever proposes 2*k candidates; their normalised scores are
    summed per chunk index and the top-k chunks are returned.
    """
    n_candidates = k * 2
    # --- Dense candidates ---
    q_vec = np.array(embedder.encode([question])).astype('float32')
    distances, d_idx = dense_index.search(q_vec, n_candidates)
    # Convert L2 distances to similarities, normalised to sum to 1.
    d_sim = 1 / (1 + distances[0])
    d_sim = d_sim / np.sum(d_sim)
    # --- Sparse candidates ---
    bm25_scores = sparse_index.get_scores(question.split())
    s_idx = np.argsort(bm25_scores)[-n_candidates:][::-1]
    s_sim = bm25_scores[s_idx]
    if np.sum(s_sim) > 0:
        s_sim = s_sim / np.sum(s_sim)
    # --- Blend (dense inserted first so score ties keep dense ordering) ---
    blended = defaultdict(float)
    for idx, score in zip(d_idx[0], d_sim):
        blended[idx] += alpha * score
    for idx, score in zip(s_idx, s_sim):
        blended[idx] += (1 - alpha) * score
    ranked = sorted(blended.items(), key=lambda item: item[1], reverse=True)[:k]
    return [chunks[idx] for idx, _ in ranked]
# ==================== QA MODULE ====================
@st.cache_resource
def load_qa_model():
    """Load and cache the BanglaBERT extractive question-answering pipeline.

    Cached with @st.cache_resource: this is called on every question in the
    Q&A tab and previously rebuilt the model from scratch each time.

    NOTE(review): csebuetnlp/banglabert is a base encoder checkpoint; unless
    a QA-fine-tuned variant is intended, the QA head starts untrained —
    confirm the model choice.
    """
    model_name = "csebuetnlp/banglabert"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer)
    return qa_pipeline
# ==================== SUMMARIZATION MODULE ====================
@st.cache_resource
def load_summarization_model():
    """Load and cache the mT5 XLSum summarization pipeline.

    Returns:
        A transformers summarization pipeline, or None if loading fails
        (callers check for None and degrade gracefully).
    """
    try:
        # Use mT5 for multilingual summarization.
        summarizer = pipeline(
            "summarization",
            model="csebuetnlp/mT5_multilingual_XLSum",
            tokenizer="csebuetnlp/mT5_multilingual_XLSum",
        )
        return summarizer
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        return None
def generate_summary(text, max_length=200, min_length=50):
    """Generate a document summary; returns an error string on failure."""
    summarizer = load_summarization_model()
    if summarizer is None:
        return "Summarization model not available."
    try:
        # Cap the input length (characters) to avoid memory issues.
        truncated = text[:1024]
        result = summarizer(
            truncated,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
        )
        return result[0]['summary_text']
    except Exception as e:
        return f"Summarization error: {str(e)}"
# ==================== ANALYTICS MODULE ====================
def log_query(question, answer, confidence, processing_time, success=True):
    """Record one Q&A interaction in the session analytics store."""
    stats = st.session_state.analytics
    stats['total_queries'] += 1
    if success:
        stats['successful_queries'] += 1
        stats['avg_confidence'].append(confidence)
    else:
        stats['failed_queries'] += 1
    stats['processing_times'].append(processing_time)
    # Keep only a 100-char preview of the answer in the history log.
    preview = answer if len(answer) <= 100 else answer[:100] + "..."
    stats['query_history'].append({
        'timestamp': datetime.now().isoformat(),
        'question': question,
        'answer': preview,
        'confidence': confidence,
        'processing_time': processing_time,
        'success': success,
    })
def display_analytics():
    """Render the analytics dashboard from session state."""
    stats = st.session_state.analytics
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Queries", stats['total_queries'])
        success_pct = stats['successful_queries'] / max(stats['total_queries'], 1) * 100
        st.metric("Success Rate", f"{success_pct:.1f}%")
    with col2:
        avg_conf = np.mean(stats['avg_confidence']) if stats['avg_confidence'] else 0
        st.metric("Avg Confidence", f"{avg_conf:.2%}")
        avg_time = np.mean(stats['processing_times']) if stats['processing_times'] else 0
        st.metric("Avg Processing Time", f"{avg_time:.2f}s")
    with col3:
        st.metric("Successful Queries", stats['successful_queries'])
        st.metric("Failed Queries", stats['failed_queries'])
    # Most recent five queries, newest last.
    if stats['query_history']:
        st.subheader("Recent Query History")
        for entry in stats['query_history'][-5:]:
            with st.expander(f"Q: {entry['question'][:50]}... ({entry['timestamp']})"):
                st.write(f"**Answer:** {entry['answer']}")
                st.write(f"**Confidence:** {entry['confidence']:.2%}")
                st.write(f"**Time:** {entry['processing_time']:.2f}s")
                st.write(f"**Status:** {'β Success' if entry['success'] else 'β Failed'}")
# ==================== EXPORT MODULE ====================
def export_results(text, summaries, qa_pairs):
    """Serialise the session results (text, summaries, Q&A, analytics) to JSON."""
    payload = {
        'export_date': datetime.now().isoformat(),
        'full_text': text,
        'summaries': summaries,
        'qa_pairs': qa_pairs,
        'analytics': st.session_state.analytics,
    }
    # ensure_ascii=False keeps Bengali characters readable in the export.
    return json.dumps(payload, indent=2, ensure_ascii=False)
# ==================== MAIN STREAMLIT APP ====================
# Page-level configuration; must be the first Streamlit call of the script.
st.set_page_config(
    page_title="Bengali PDF Assistant - Research Edition",
    page_icon="π",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Sidebar configuration: retrieval/audio knobs read by the tabs further down.
with st.sidebar:
    st.header("βοΈ Configuration")
    st.subheader("Model Settings")
    # Number of retrieved chunks fed to the QA model as context.
    rag_k = st.slider("Number of context chunks", 1, 5, 3)
    # Weight of dense retrieval in hybrid_search (1-alpha goes to BM25).
    rag_alpha = st.slider("Dense/Sparse balance", 0.0, 1.0, 0.5, 0.1,
                          help="0 = Only sparse (BM25), 1 = Only dense (embeddings)")
    st.subheader("Audio Settings")
    # NOTE(review): this value is used as max characters per TTS segment,
    # not "chunks per page" — confirm the label.
    audio_speed = st.select_slider("Audio chunks per page", options=[2000, 3000, 4000, 5000], value=4000)
    st.subheader("About")
    st.info("""
**Research-Grade Features:**
- π¬ Hybrid RAG (Dense + Sparse)
- π Performance Analytics
- π― Semantic Chunking
- π Document Summarization
- πΎ Export Capabilities
- π Confidence Scoring
""")
# Main title
st.title("π Bengali PDF Assistant - Research Edition")
st.markdown("""
**Advanced NLP Pipeline for Bengali Document Analysis**
*Features: Hybrid RAG β’ Meta MMS-TTS β’ BanglaBERT QA β’ Document Summarization β’ Analytics*
""")
uploaded_file = st.file_uploader("π Upload Bengali PDF Document", type="pdf")
# Main workflow: everything below runs only after a PDF has been uploaded.
if uploaded_file:
    start_time = time.time()
    with st.spinner("π¬ Analyzing document with advanced NLP pipeline..."):
        file_contents = uploaded_file.getvalue()
        try:
            # Extract text (OCR over every rendered PDF page)
            full_text = extract_text_with_easyocr(file_contents)
            # Two chunkings: coarse character segments for TTS, semantic chunks for RAG
            reader_chunks = chunk_text_for_reader(full_text, max_chars=audio_speed)
            rag_chunks = semantic_chunk_text(full_text, max_chunk_size=1000, overlap=100)
            # Setup hybrid RAG pipeline (dense FAISS + sparse BM25)
            dense_idx, sparse_idx, embedder = setup_hybrid_rag_pipeline(rag_chunks)
            processing_time = time.time() - start_time
            st.success(f"β Document processed in {processing_time:.2f}s!")
            st.info(f"π Extracted {len(full_text)} characters β’ {len(rag_chunks)} semantic chunks β’ {len(reader_chunks)} audio segments")
        except Exception as e:
            # Abort the rest of the script if OCR/indexing failed.
            st.error(f"β Processing Error: {e}")
            st.stop()

    # Document preview
    with st.expander("π Document Preview & Metadata"):
        col1, col2 = st.columns(2)
        with col1:
            st.text_area("Text Preview (first 500 chars)", full_text[:500], height=150)
        with col2:
            st.json({
                "total_characters": len(full_text),
                "total_words": len(full_text.split()),
                "semantic_chunks": len(rag_chunks),
                "audio_segments": len(reader_chunks),
                "processing_time": f"{processing_time:.2f}s"
            })

    # Main tabs
    tab1, tab2, tab3, tab4 = st.tabs([
        "π Read Aloud",
        "π¬ Q&A System",
        "π Summarization",
        "π Analytics & Export"
    ])

    # ==================== TAB 1: READ ALOUD ====================
    with tab1:
        st.header("ποΈ Text-to-Speech")
        st.caption("Using Meta MMS-TTS for natural Bengali speech synthesis")
        col1, col2 = st.columns([3, 1])  # NOTE(review): col2 is never used
        with col1:
            selected_mode = st.radio(
                "Select mode:",
                ["Generate All Audio", "Generate Specific Segment"],
                horizontal=True
            )
        if selected_mode == "Generate All Audio":
            if st.button("π΅ Generate Complete Audio", type="primary"):
                if not full_text.strip():
                    st.warning("β οΈ No text found in document.")
                else:
                    progress_bar = st.progress(0)
                    for i, chunk in enumerate(reader_chunks):
                        try:
                            audio_buffer = generate_audio_mms(chunk)
                            # generate_audio_mms returns None on TTS failure
                            if audio_buffer:
                                st.write(f"**Segment {i + 1} / {len(reader_chunks)}**")
                                st.audio(audio_buffer, format="audio/wav")
                            progress_bar.progress((i + 1) / len(reader_chunks))
                        except Exception as e:
                            st.error(f"β Error in segment {i+1}: {e}")
                    progress_bar.empty()
                    st.success("β All audio segments generated!")
        else:
            # Manual selection of one TTS segment (1-based for the UI).
            segment_num = st.number_input(
                "Select segment number:",
                min_value=1,
                max_value=len(reader_chunks),
                value=1
            )
            if st.button("π΅ Generate Selected Segment"):
                try:
                    chunk = reader_chunks[segment_num - 1]
                    st.text_area("Segment text:", chunk, height=100)
                    audio_buffer = generate_audio_mms(chunk)
                    if audio_buffer:
                        st.audio(audio_buffer, format="audio/wav")
                except Exception as e:
                    st.error(f"β Error: {e}")

    # ==================== TAB 2: Q&A SYSTEM ====================
    with tab2:
        st.header("π‘ Question Answering System")
        st.caption("Hybrid RAG with BanglaBERT β’ Combining dense embeddings + sparse retrieval")
        question = st.text_input("π Ask a question about your document:", key="qa_input")
        if question:
            query_start_time = time.time()
            with st.spinner("π¬ Processing query with hybrid search..."):
                try:
                    # Hybrid search over the semantic chunks (sidebar-tuned k/alpha)
                    relevant_chunks = hybrid_search(
                        dense_idx, sparse_idx, embedder, question, rag_chunks,
                        k=rag_k, alpha=rag_alpha
                    )
                    context = "\n---\n".join(relevant_chunks)
                    # Get answer using BanglaBERT
                    qa_model = load_qa_model()
                    result = qa_model(question=question, context=context)
                    answer_text = result['answer']
                    confidence = result['score']
                    query_time = time.time() - query_start_time
                    # Display results
                    st.subheader("π‘ Answer")
                    st.markdown(f"> {answer_text}")
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Confidence", f"{confidence:.2%}")
                    with col2:
                        st.metric("Processing Time", f"{query_time:.2f}s")
                    with col3:
                        st.metric("Context Chunks", len(relevant_chunks))
                    # Audio for answer
                    st.write("π **Listen to Answer:**")
                    audio_buffer = generate_audio_mms(answer_text)
                    if audio_buffer:
                        st.audio(audio_buffer, format="audio/wav")
                    # Show retrieved context
                    with st.expander("π Retrieved Context"):
                        for i, chunk in enumerate(relevant_chunks):
                            st.markdown(f"**Chunk {i+1}:**")
                            st.text_area(f"Context {i+1}", chunk, height=100, key=f"context_{i}")
                    # Log analytics
                    log_query(question, answer_text, confidence, query_time, success=True)
                except Exception as e:
                    query_time = time.time() - query_start_time
                    st.error(f"β Error: {e}")
                    st.write("**Retrieved Context (fallback):**")
                    # context only exists if retrieval succeeded before the failure
                    st.text_area("Context", context[:500] if 'context' in locals() else "No context retrieved", height=150)
                    log_query(question, str(e), 0.0, query_time, success=False)

    # ==================== TAB 3: SUMMARIZATION ====================
    with tab3:
        st.header("π Document Summarization")
        st.caption("Using mT5 for Bengali text summarization")
        col1, col2 = st.columns(2)  # NOTE(review): col2 is never used
        with col1:
            summary_length = st.select_slider(
                "Summary length:",
                options=["Short", "Medium", "Long"],
                value="Medium"
            )
        # Map the label to (min_length, max_length) token bounds for mT5.
        length_map = {"Short": (30, 100), "Medium": (50, 200), "Long": (100, 300)}
        min_len, max_len = length_map[summary_length]
        if st.button("π Generate Summary", type="primary"):
            with st.spinner("π¬ Generating summary..."):
                summary_start = time.time()
                # Generate summary for full text or first portion
                text_to_summarize = full_text[:2000] if len(full_text) > 2000 else full_text
                summary = generate_summary(text_to_summarize, max_length=max_len, min_length=min_len)
                summary_time = time.time() - summary_start
            st.subheader("π Summary")
            st.markdown(f"> {summary}")
            st.metric("Generation Time", f"{summary_time:.2f}s")
            # Audio for summary
            st.write("π **Listen to Summary:**")
            audio_buffer = generate_audio_mms(summary)
            if audio_buffer:
                st.audio(audio_buffer, format="audio/wav")
            # Summary statistics
            with st.expander("π Summary Statistics"):
                st.json({
                    "original_length": len(full_text),
                    "summary_length": len(summary),
                    "compression_ratio": f"{(len(summary) / len(full_text) * 100):.1f}%",
                    "generation_time": f"{summary_time:.2f}s"
                })

    # ==================== TAB 4: ANALYTICS & EXPORT ====================
    with tab4:
        st.header("π Performance Analytics")
        display_analytics()
        st.divider()
        st.header("πΎ Export Results")
        col1, col2 = st.columns(2)
        with col1:
            if st.button("π₯ Export Session Data (JSON)"):
                # Export a truncated text preview plus session analytics.
                export_data = export_results(
                    full_text[:1000] + "..." if len(full_text) > 1000 else full_text,
                    [],
                    []
                )
                # NOTE(review): a download_button nested in a button's branch
                # vanishes on the rerun its own click triggers — confirm UX.
                st.download_button(
                    label="Download JSON",
                    data=export_data,
                    file_name=f"bengali_pdf_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                    mime="application/json"
                )
        with col2:
            if st.button("π Export Full Text"):
                st.download_button(
                    label="Download Text",
                    data=full_text,
                    file_name=f"extracted_text_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
                    mime="text/plain"
                )
        st.divider()
        st.header("π¬ Technical Details")
        with st.expander("Pipeline Configuration"):
            st.code(f"""
OCR: EasyOCR (Bengali + English)
TTS: facebook/mms-tts-ben (Meta MMS-TTS)
Embeddings: paraphrase-multilingual-MiniLM-L12-v2
QA Model: csebuetnlp/banglabert
Summarization: csebuetnlp/mT5_multilingual_XLSum
Dense Index: FAISS (L2)
Sparse Index: BM25Okapi
Chunking: Semantic (sentence-aware)
Hybrid Search Alpha: {rag_alpha}
Context Chunks (k): {rag_k}
""", language="yaml")
else:
    # Landing page when no file uploaded
    st.info("π Upload a Bengali PDF document to get started")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.subheader("π― Features")
        st.markdown("""
- Advanced OCR for Bengali
- Natural TTS synthesis
- Hybrid RAG search
- Question answering
- Document summarization
""")
    with col2:
        st.subheader("π¬ Research Tools")
        st.markdown("""
- Performance analytics
- Confidence scoring
- Processing metrics
- Query history
- Export capabilities
""")
    with col3:
        st.subheader("π Technologies")
        st.markdown("""
- EasyOCR
- Meta MMS-TTS
- BanglaBERT
- FAISS + BM25
- mT5 Summarization
""")

# Footer
st.divider()
st.caption("π Bengali PDF Assistant - Research Edition | Built for academic research & accessibility")