import streamlit as st
import re
import time
from typing import List, Dict
import pandas as pd
import plotly.express as px
import numpy as np
import networkx as nx
from lime.lime_text import LimeTextExplainer
import shap

# ----------------- Streamlit Page Config -----------------
st.set_page_config(
    page_title="🧠 اردو متن خلاصہ ساز",
    page_icon="🧠",
    layout="wide",
    initial_sidebar_state="expanded"
)

# ----------------- Custom CSS -----------------
# (The CSS rules are omitted in the source; the empty block is kept as a placeholder.)
st.markdown("""
""", unsafe_allow_html=True)

# ----------------- Urdu Text Summarizer -----------------
class UrduTextSummarizer:
    def __init__(self):
        # Common Urdu stop words excluded from tokenization and keyword counts
        self.urdu_stop_words = {
            'اور', 'کا', 'کی', 'کے', 'میں', 'سے', 'کو', 'نے', 'ہے', 'ہیں', 'تھا', 'تھی', 'تھے',
            'گا', 'گی', 'گے', 'کہ', 'جو', 'یہ', 'وہ', 'اس', 'ان', 'پر', 'کر', 'کرنا', 'کیا',
            'ہو', 'ہوا', 'ہوئی', 'ہوئے', 'بھی', 'تو', 'ہی', 'لیے', 'ساتھ', 'بعد', 'پہلے'
        }

    def tokenize(self, sentence: str) -> List[str]:
        if isinstance(sentence, bytes):
            sentence = sentence.decode('utf-8', errors='ignore')
        elif not isinstance(sentence, str):
            sentence = str(sentence)
        # Strip Urdu punctuation, then drop stop words and very short tokens
        words = re.sub(r'[۔،؟!؛:]', '', sentence).split()
        return [w for w in words if w not in self.urdu_stop_words and len(w) > 2]

    def extract_keywords(self, text: str) -> Dict:
        words = re.sub(r'[۔،؟!؛:]', '', text).split()
        word_freq = {}
        for w in words:
            if w not in self.urdu_stop_words and len(w) > 2:
                word_freq[w] = word_freq.get(w, 0) + 1
        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:10]
        return {'keywords': [{'word': w, 'frequency': f, 'importance': min(1.0, f / 5)}
                             for w, f in sorted_words]}

    def get_sentence_score(self, sentences: List[str], tfidf: np.ndarray,
                           sim_matrix: np.ndarray, i: int) -> float:
        # Helper: average of a sentence's TF-IDF magnitude and its mean similarity
        # to all other sentences
        vec = tfidf[i]
        norm = np.linalg.norm(vec)
        avg_sim = np.mean(sim_matrix[i])
        return (norm + avg_sim) / 2 if norm > 0 else 0.0

    def summarize_text(self, text: str) -> Dict:
        start_time = time.time()
        # Split on sentence-final punctuation, including the Urdu question mark ؟
        sentences = [s.strip() for s in re.split(r'[۔!؟?]', text) if s.strip()]
        if len(sentences) == 0:
            return {'summary': '', 'sentences': [], 'keywords': {'keywords': []},
                    'duration': 0.0, 'explanations': {}}
        if len(sentences) < 2:
            return {'summary': sentences[0],
                    'sentences': [{'sentence': sentences[0], 'score': 1.0,
                                   'included': True, 'position': 1}],
                    'keywords': self.extract_keywords(text),
                    'duration': time.time() - start_time, 'explanations': {}}

        # Tokenize sentences
        sent_words = [self.tokenize(s) for s in sentences]
        all_words = list(set(w for sw in sent_words for w in sw))
        if not all_words:
            # No content words survive filtering: fall back to the first three sentences
            summary = '۔ '.join(sentences[:3])
            sentence_scores = [{'sentence': s, 'score': 0.0, 'included': i < 3, 'position': i + 1}
                               for i, s in enumerate(sentences)]
            return {'summary': summary, 'sentences': sentence_scores,
                    'keywords': {'keywords': []},
                    'duration': time.time() - start_time, 'explanations': {}}

        word_to_idx = {w: i for i, w in enumerate(all_words)}
        num_words = len(all_words)
        num_sents = len(sentences)

        # TF matrix: per-sentence term frequencies
        tf = np.zeros((num_sents, num_words))
        for i, words in enumerate(sent_words):
            word_count = {}
            for w in words:
                word_count[w] = word_count.get(w, 0) + 1
            for w, count in word_count.items():
                tf[i, word_to_idx[w]] = count / len(words) if len(words) > 0 else 0

        # IDF with +1 smoothing in the denominator
        df = np.sum(tf > 0, axis=0)
        idf = np.log(num_sents / (1 + df))

        # TF-IDF
        tfidf = tf * idf

        # Pairwise cosine similarity matrix between sentence vectors
        sim_matrix = np.zeros((num_sents, num_sents))
        for i in range(num_sents):
            for j in range(num_sents):
                if i != j:
                    vec_i = tfidf[i]
                    vec_j = tfidf[j]
                    norm_i = np.linalg.norm(vec_i)
                    norm_j = np.linalg.norm(vec_j)
                    if norm_i > 0 and norm_j > 0:
                        sim_matrix[i, j] = np.dot(vec_i, vec_j) / (norm_i * norm_j)
        # Graph and PageRank (TextRank-style sentence ranking)
        graph = nx.from_numpy_array(sim_matrix)
        try:
            scores = nx.pagerank(graph, max_iter=200, tol=1e-6)
        except nx.PowerIterationFailedConvergence:
            scores = {i: 1.0 / num_sents for i in range(num_sents)}

        # Sentence scores with position bonus
        sentence_scores = []
        max_score = max(scores.values()) if scores else 1.0
        for i in range(num_sents):
            base_score = scores.get(i, 0.0) / max_score if max_score > 0 else 0.0
            # First and last sentences get a small positional boost
            position_bonus = 0.1 if i == 0 or i == num_sents - 1 else 0
            final_score = base_score + position_bonus
            sentence_scores.append({'sentence': sentences[i], 'score': final_score,
                                    'included': False, 'position': i + 1})

        # Select top N sentences (roughly 30% of the text, clamped to 3..6)
        N = max(3, min(6, int(len(sentences) * 0.3)))
        sorted_scores = sorted(sentence_scores, key=lambda x: x['score'], reverse=True)[:N]

        # Redundancy check: skip candidates too similar to an already selected sentence
        selected = []
        for cand in sorted_scores:
            add = True
            cand_vec = tfidf[cand['position'] - 1]
            for sel in selected:
                sel_vec = tfidf[sel['position'] - 1]
                sim = (np.dot(cand_vec, sel_vec) / (np.linalg.norm(cand_vec) * np.linalg.norm(sel_vec))
                       if np.linalg.norm(cand_vec) > 0 and np.linalg.norm(sel_vec) > 0 else 0)
                if sim > 0.8:
                    add = False
                    break
            if add:
                selected.append(cand)

        # Sort selected sentences back into original position
        summary_sents = sorted(selected, key=lambda x: x['position'])
        summary = '۔ '.join([s['sentence'] for s in summary_sents]) + '۔' if summary_sents else ''

        # Update included flags
        selected_sentences = {s['sentence'] for s in selected}
        for ss in sentence_scores:
            ss['included'] = ss['sentence'] in selected_sentences

        # Explainability: LIME and SHAP
        explanations = {'lime': [], 'shap': []}

        # Predictor function for LIME/SHAP: maps each text to pseudo-probabilities
        # derived from the TF-IDF norm of that text treated as a one-document corpus
        def predictor(texts):
            scores = []
            for t in texts:
                try:
                    words = self.tokenize(t)
                    if not words:
                        scores.append([0.0, 1.0])
                        continue
                    temp_all_words = list(set(words))
                    temp_word_to_idx = {w: idx for idx, w in enumerate(temp_all_words)}
                    temp_tf = np.zeros((1, len(temp_all_words)))
                    word_count = {w: words.count(w) for w in words}
                    for w, count in word_count.items():
                        temp_tf[0, temp_word_to_idx[w]] = count / len(words)
                    temp_df = np.sum(temp_tf > 0, axis=0)
                    # Single-document IDF heuristic (always negative; only the norm is used)
                    temp_idf = np.log(1 / (1 + temp_df))
                    temp_tfidf = temp_tf * temp_idf
                    norm = np.linalg.norm(temp_tfidf[0])
                    score = min(max(norm, 0.0), 1.0)
                    scores.append([score, 1.0 - score])
                except Exception:
                    scores.append([0.0, 1.0])
            return np.array(scores)

        # LIME explanations for the two highest-ranked sentences
        lime_explainer = LimeTextExplainer(class_names=["Score", "Not Score"], bow=False)
        top_indices = [s['position'] - 1 for s in sorted_scores[:2]]
        for idx in top_indices:
            try:
                # Request label 0 explicitly so as_list(label=0) below is valid
                exp = lime_explainer.explain_instance(sentences[idx], predictor,
                                                      labels=(0,), num_features=10,
                                                      num_samples=100)
                explanations['lime'].append({'sentence': sentences[idx],
                                             'exp': exp.as_list(label=0)})
            except Exception as e:
                explanations['lime'].append({'sentence': sentences[idx],
                                             'exp': [('Error', f'LIME failed: {str(e)}')]})

        # SHAP explanations. Note: KernelExplainer expects numeric feature matrices,
        # so constructing it over raw sentence strings may fail; failures are
        # recorded per sentence rather than crashing the app.
        background_texts = sentences[:min(10, len(sentences))]
        try:
            shap_explainer = shap.KernelExplainer(predictor, background_texts)
            for idx in top_indices:
                try:
                    shap_values = shap_explainer.shap_values(sentences[idx], nsamples=100)[0]
                    explanations['shap'].append({'sentence': sentences[idx],
                                                 'shap_values': shap_values})
                except Exception as e:
                    explanations['shap'].append({'sentence': sentences[idx],
                                                 'shap_values': [0.0], 'error': str(e)})
        except Exception as e:
            for idx in top_indices:
                explanations['shap'].append({'sentence': sentences[idx],
                                             'shap_values': [0.0], 'error': str(e)})

        keywords = self.extract_keywords(text)
        duration = time.time() - start_time
        return {'summary': summary, 'sentences': sentence_scores, 'keywords': keywords,
                'duration': duration, 'explanations': explanations}
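
# Illustrative usage sketch (not part of the app): a quick way to exercise
# UrduTextSummarizer outside Streamlit, e.g. from a REPL. The sample text and
# variable names below are hypothetical; uncomment to run. Note that
# summarize_text() also runs the LIME/SHAP step, so both packages must import.
#
# _summarizer = UrduTextSummarizer()
# _result = _summarizer.summarize_text("پہلا جملہ یہاں ہے۔ دوسرا جملہ یہاں ہے۔ تیسرا جملہ یہاں ہے۔")
# print(_result['summary'])                          # selected sentences, joined with '۔'
# print([s['score'] for s in _result['sentences']])  # normalized TextRank scores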
st.title("🧠 اردو متن خلاصہ ساز") st.sidebar.header("📑 متن یا مثال منتخب کریں") summarizer = UrduTextSummarizer() sample_text = "انڈیا کے زیر انتظام کشمیر میں واقع سلال ڈیم اُس وقت زیرِ بحث آیا ..." selected_sample = st.sidebar.selectbox("مثال منتخب کریں:", ["اپنا متن درج کریں", "نمونہ متن"]) user_input = st.text_area("اپنا اردو متن یہاں درج کریں:", height=200) if selected_sample=="اپنا متن درج کریں" else sample_text if st.button("خلاصہ تیار کریں") and user_input.strip(): result = summarizer.summarize_text(user_input) # ----------------- Multi Tabs ----------------- tabs = st.tabs(["📄 خلاصہ", "🧩 جملے", "🔑 اہم الفاظ", "📊 گراف", "🛠 Explainability"]) # --- Summary Tab --- with tabs[0]: st.markdown(f"