"""Core helpers for the Bangla punctuation-restoration Gradio demo.

NOTE(review): this file reached review with all newlines destroyed; this block
re-formats the visible logic.  Behaviour is preserved except where a fix is
explicitly called out below.
"""

import os
import re
import time
from typing import Dict, List, Optional, Tuple

# NOTE(review): `import gradio as gr` sat in the original import block but is
# only used by the UI (create_interface); it is imported there so these pure
# helpers stay importable without gradio installed.

# Configuration -------------------------------------------------------------

# Endpoint of the punctuation-restoration service.
API_URL = "http://localhost:5685/punctuate"

# Punctuation mark -> canonical label used in the evaluation tables.
punc_dict = {
    '!': 'EXCLAMATION',
    '?': 'QUESTION',
    ',': 'COMMA',
    ';': 'SEMICOLON',
    ':': 'COLON',
    '-': 'HYPHEN',
    '।': 'DARI',
}

allowed_punctuations = set(punc_dict.keys())

# Precompiled patterns.  Fix: the originals rebuilt these character classes on
# every call; order inside a character class is irrelevant, so building from
# the set is safe.
_PUNC_CLASS = f"[{re.escape(''.join(allowed_punctuations))}]"
_PUNC_SPLIT_RE = re.compile(f"({_PUNC_CLASS})")  # split, keeping the marks
_PUNC_STRIP_RE = re.compile(_PUNC_CLASS)         # delete the marks
_WS_RE = re.compile(r"\s+")
# Keep only Bangla letters/digits and whitespace.  (The original also listed
# \u09E6-\u09EF, but that digit range is already inside \u0980-\u09FF.)
_NON_BANGLA_RE = re.compile(r"[^\u0980-\u09FF\s]")


def clean_and_normalize_text(text: str, remove_punctuations: bool = False) -> str:
    """Clean and normalize Bangla text with correct spacing.

    Args:
        text: raw input text.
        remove_punctuations: when True, strip every mark listed in
            ``allowed_punctuations``; when False, keep those marks but drop
            every non-Bangla character.

    Returns:
        Normalized text with single spaces between words.
    """
    if remove_punctuations:
        cleaned = _PUNC_STRIP_RE.sub("", text)
        return _WS_RE.sub(" ", cleaned).strip()

    # Keep only allowed punctuation and Bangla letters/digits.  Splitting on
    # the punctuation class (with a capture group) keeps each mark as its own
    # chunk so it can be re-attached without a preceding space.
    pieces: List[str] = []
    for chunk in _PUNC_SPLIT_RE.split(text):
        if chunk in allowed_punctuations:
            pieces.append(chunk)
            continue
        cleaned = _NON_BANGLA_RE.sub("", chunk)
        cleaned = _WS_RE.sub(" ", cleaned).strip()
        if cleaned:
            pieces.append(" " + cleaned)  # space before every word chunk
    return _WS_RE.sub(" ", "".join(pieces)).strip()


def restore_punctuation(text: str) -> Tuple[str, float]:
    """Call the punctuation-restoration API.

    Returns:
        ``(restored_text_or_error_message, elapsed_seconds)``.  Errors are
        reported in-band as strings, matching the original behaviour the UI
        relies on.
    """
    # Imported here so the module stays importable without `requests`
    # (e.g. when only the offline comparison helpers are needed).
    import requests

    try:
        start = time.time()
        # Fix: the original call had no timeout and could hang forever.
        response = requests.post(API_URL, json={"text": text}, timeout=30)
        elapsed = time.time() - start
        if response.status_code == 200:
            return response.json().get("restored_text"), elapsed
        return f"API Error: {response.status_code} - {response.text}", elapsed
    except Exception as e:  # keep in-band error reporting for the UI
        return f"Connection Error: {str(e)}", 0.0


def dummy_restore_punctuation(text: str) -> Tuple[str, float]:
    """Stand-in for the API so the UI can be demoed without the service.

    Adds a comma and a question mark for longer inputs, or a trailing
    exclamation for short ones, and reports a fixed 0.5 s "latency".
    """
    time.sleep(0.5)  # simulate API delay
    words = text.split()
    if len(words) > 5:
        words[2] += ','
        words[-1] += '?'
    elif len(words) > 2:
        words[-1] += '!'
    return ' '.join(words), 0.5


def tokenize_with_punctuation(text: str) -> List[str]:
    """Tokenize ``text``, emitting each punctuation mark as its own token."""
    tokens: List[str] = []
    for chunk in _PUNC_SPLIT_RE.split(text):
        if not chunk.strip():
            continue
        if chunk in allowed_punctuations:
            tokens.append(chunk)
        else:
            # str.split() with no argument already discards all whitespace,
            # so no per-word strip is needed.
            tokens.extend(chunk.split())
    return tokens


def _punct_map(tokens: List[str]) -> Dict[int, List[str]]:
    """Map word-index -> list of punctuation marks that follow that word.

    Marks appearing before the first word are collected under index -1.
    """
    mapping: Dict[int, List[str]] = {}
    word_idx = -1
    for token in tokens:
        if token in allowed_punctuations:
            mapping.setdefault(word_idx, []).append(token)
        else:
            word_idx += 1
    return mapping


def compare_texts(ground_truth, predicted):
    """Compare ground-truth and predicted text token by token, aligned by
    word position.

    Returns:
        ``(comparison_result, correct_puncs, wrong_puncs, gt_punc_counts)``
        where ``comparison_result`` is a list of ``(display, tag, color)``
        tuples and the three dicts count punctuation by label (``punc_dict``).
    """
    gt_tokens = tokenize_with_punctuation(ground_truth)
    pred_tokens = tokenize_with_punctuation(predicted)

    comparison_result: List[Tuple[str, str, str]] = []
    correct_puncs: Dict[str, int] = {}
    wrong_puncs: Dict[str, int] = {}

    # Punctuation frequencies in the ground truth.
    gt_punc_counts: Dict[str, int] = {}
    for token in gt_tokens:
        if token in allowed_punctuations:
            name = punc_dict[token]
            gt_punc_counts[name] = gt_punc_counts.get(name, 0) + 1

    gt_words = [t for t in gt_tokens if t not in allowed_punctuations]
    pred_words = [t for t in pred_tokens if t not in allowed_punctuations]

    gt_punct_map = _punct_map(gt_tokens)
    pred_punct_map = _punct_map(pred_tokens)

    # NOTE(review): marks before the first word land at index -1 and are never
    # visited by the loop below — behaviour preserved from the original.
    for i in range(max(len(gt_words), len(pred_words))):
        # Word at position i.
        if i < len(gt_words) and i < len(pred_words):
            if gt_words[i] == pred_words[i]:
                comparison_result.append((gt_words[i], "correct", "black"))
            else:
                comparison_result.append(
                    (f"{gt_words[i]}→{pred_words[i]}", "word_diff", "orange"))
        elif i < len(gt_words):
            comparison_result.append((f"{gt_words[i]}→''", "missing_word", "red"))
        elif i < len(pred_words):
            comparison_result.append((f"''→{pred_words[i]}", "extra_word", "red"))

        # Punctuation following word i.
        gt_puncs = gt_punct_map.get(i, [])
        pred_puncs = pred_punct_map.get(i, [])
        for j in range(max(len(gt_puncs), len(pred_puncs))):
            if j < len(gt_puncs) and j < len(pred_puncs):
                gt_punc, pred_punc = gt_puncs[j], pred_puncs[j]
                name = punc_dict[gt_punc]
                if gt_punc == pred_punc:
                    correct_puncs[name] = correct_puncs.get(name, 0) + 1
                    comparison_result.append((gt_punc, "correct", "green"))
                else:
                    # Wrong mark: charged against the ground-truth label.
                    wrong_puncs[name] = wrong_puncs.get(name, 0) + 1
                    comparison_result.append(
                        (f"{gt_punc}→{pred_punc}", "wrong_punct", "red"))
            elif j < len(gt_puncs):
                # Missing mark.
                gt_punc = gt_puncs[j]
                name = punc_dict[gt_punc]
                wrong_puncs[name] = wrong_puncs.get(name, 0) + 1
                comparison_result.append((f"{gt_punc}→''", "missing_punct", "red"))
            else:
                # Extra mark not in GT: displayed, but deliberately not
                # counted in wrong_puncs (it has no ground-truth label).
                comparison_result.append(
                    (f"''→{pred_puncs[j]}", "extra_punct", "red"))

    return comparison_result, correct_puncs, wrong_puncs, gt_punc_counts
def create_evaluation_table(correct_puncs, wrong_puncs, gt_punc_counts):
    """Build the per-punctuation evaluation table.

    Args:
        correct_puncs: label -> correctly classified count.
        wrong_puncs: label -> wrongly classified count.
        gt_punc_counts: label -> occurrences in the ground truth; this dict
            decides which rows appear and in what order.

    Returns:
        pandas.DataFrame with one row per punctuation label.
    """
    rows = [
        [name,
         correct_puncs.get(name, 0),
         wrong_puncs.get(name, 0),
         total]
        for name, total in gt_punc_counts.items()
    ]
    return pd.DataFrame(rows, columns=[
        "Punctuation Name",
        "Correctly Classified",
        "Wrongly Classified",
        "Count in Ground Truth",
    ])


def format_comparison_html(comparison_result):
    """Render compare_texts() output as colour-coded HTML.

    NOTE(review): the body of this function was destroyed when the file lost
    its markup (it broke off at ``html = "``).  This is a reconstruction from
    the signature, the docstring and the colour legend in the UI — confirm
    against the original before relying on exact markup.

    Args:
        comparison_result: list of ``(display_text, tag, color)`` tuples.

    Returns:
        HTML string; a placeholder message when the list is empty.
    """
    if not comparison_result:
        return ("<div>Comparison will appear here after processing "
                "with ground truth.</div>")
    spans = []
    for text, tag, color in comparison_result:
        # Highlight anything that is not a plain correct word.
        weight = "normal" if color == "black" else "bold"
        spans.append(
            f'<span style="color:{color};font-weight:{weight};" '
            f'title="{tag}">{text}</span>'
        )
    return ('<div style="line-height:2;font-size:18px;">'
            + " ".join(spans)
            + "</div>")
def process_punctuation_restoration(input_text, ground_truth):
    """Drive one round trip for the UI: restore punctuation, then evaluate.

    NOTE(review): the original definition of this function was destroyed when
    the file lost its markup.  This reconstruction is inferred from the
    surviving ``submit_btn.click`` wiring, which requires the outputs, in
    order: predicted text, comparison HTML, API-time label, evaluation table,
    normalized ground truth.  Confirm exact formatting against the original.
    """
    cleaned_input = clean_and_normalize_text(input_text, remove_punctuations=True)
    predicted, api_seconds = restore_punctuation(cleaned_input)
    time_label = f"{api_seconds:.3f} s"

    if not ground_truth.strip():
        # No ground truth: nothing to evaluate.
        placeholder = "Comparison will appear here after processing with ground truth."
        return predicted, placeholder, time_label, None, ""

    normalized_gt = clean_and_normalize_text(ground_truth)
    comparison, correct_puncs, wrong_puncs, gt_counts = compare_texts(
        normalized_gt, predicted)
    html = format_comparison_html(comparison)
    table = create_evaluation_table(correct_puncs, wrong_puncs, gt_counts)
    return predicted, html, time_label, table, normalized_gt


def create_interface():
    """Build and return the Gradio demo app.

    NOTE(review): the opening of this function (title markdown and the
    input/output component definitions) was destroyed when the file lost its
    markup.  The components below are reconstructed from the surviving
    click-wiring and examples; the surviving sections (evaluation table,
    legend, wiring, examples) are kept as found.
    """
    import gradio as gr  # imported here: only the UI needs gradio

    with gr.Blocks(title="Bangla Punctuation Restoration") as app:
        gr.Markdown("## Bangla Punctuation Restoration Demo")

        # Reconstructed inputs — names are fixed by the click-wiring below.
        with gr.Row():
            input_text = gr.Textbox(label="Input Text", lines=4)
            ground_truth = gr.Textbox(label="Ground Truth (optional)", lines=4)

        submit_btn = gr.Button("Restore Punctuation", variant="primary")

        # Reconstructed outputs — names fixed by the click-wiring below.
        predicted_output = gr.Textbox(label="Predicted Output", lines=4)
        normalized_gt = gr.Textbox(label="Normalized Ground Truth", lines=2)
        api_time = gr.Textbox(label="API Time")
        comparison_output = gr.HTML(
            value="Comparison will appear here after processing with ground truth."
        )
        evaluation_table = gr.Dataframe(
            label="📊 Punctuation Evaluation Metrics",
            headers=["Punctuation Name", "Correctly Classified",
                     "Wrongly Classified", "Count in Ground Truth"],
            interactive=False,
        )

        # Legend (kept from the original source).
        gr.Markdown("""
        ### 🎨 Color Legend:
        - 🟢 **Green**: Correctly predicted punctuation
        - 🔴 **Red**: Incorrectly predicted, missing, or extra punctuation/word
        - 🟡 **Orange**: Word-level differences
        - ⚫ **Black**: Correct words/tokens
        - **∅**: Empty/Missing (instead of showing word→word or punct→word)
        """)

        submit_btn.click(
            fn=process_punctuation_restoration,
            inputs=[input_text, ground_truth],
            outputs=[predicted_output, comparison_output, api_time,
                     evaluation_table, normalized_gt],
        )

        # Example section (kept from the original source).
        gr.Markdown("### 📚 Example")
        gr.Examples(
            examples=[
                [
                    "পুরুষের সংখ্যা মোট জনসংখ্যার ৫২ এবং নারীর সংখ্যা ৪৮ শহরের সাক্ষরতার হার কত",
                    "পুরুষের সংখ্যা মোট জনসংখ্যার ৫২, এবং নারীর সংখ্যা ৪৮। শহরের সাক্ষরতার হার কত?"
                ],
                [
                    "ক্রিকেট বিশ্বের কাছে নিজের আগামীবার তা ভালোভাবেই পৌঁছে দিলেন পাকিস্তানের পেসার আমের জামান",
                    ""
                ],
            ],
            inputs=[input_text, ground_truth],
        )

    return app


if __name__ == "__main__":
    app = create_interface()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True,
    )