import gradio as gr
import sys
import os
import torch

sys.path.append(".")


def setup_cpu_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    torch.set_num_threads(4)
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    os.environ['TRANSFORMERS_CACHE'] = './cache'


setup_cpu_environment()

from RadEval import RadEval, compare_systems


def run_radeval_simple(ref_text, hyp_text, selected_metrics):
    """
    Run RadEval with selected metrics on a pair of reference and hypothesis texts
    """
    try:
        refs = [ref_text.strip()]
        hyps = [hyp_text.strip()]

        # Configure RadEval based on selected metrics
        config = {
            'do_radgraph': 'RadGraph F1' in selected_metrics,
            'do_bleu': 'BLEU' in selected_metrics,
            'do_rouge': 'ROUGE' in selected_metrics,
            'do_bertscore': 'BERTScore' in selected_metrics,
            'do_chexbert': 'CheXbert F1' in selected_metrics,
            'do_ratescore': 'RaTEScore' in selected_metrics,
            'do_radcliq': 'RadCliQ' in selected_metrics,
            'do_temporal': 'Temporal F1' in selected_metrics,
            'do_radeval_bertsore': 'RadEval BERTScore' in selected_metrics,
            'do_green': 'GREEN' in selected_metrics,
            'do_srr_bert': 'SRR-BERT' in selected_metrics
        }

        # Initialize RadEval with selected metrics
        evaluator = RadEval(**config)

        # Run evaluation
        results = evaluator(refs=refs, hyps=hyps)

        # Prepare results for display
        table_data = []
        analysis_text = "## 🚀 RadEval Results\n\n"
        analysis_text += f"**Reference:** {ref_text[:100]}{'...' if len(ref_text) > 100 else ''}\n\n"
        analysis_text += f"**Hypothesis:** {hyp_text[:100]}{'...' if len(hyp_text) > 100 else ''}\n\n"
        analysis_text += "### Evaluation Scores:\n\n"

        for metric, score in results.items():
            if isinstance(score, (int, float)):
                formatted_score = f"{score:.4f}" if isinstance(score, float) else str(score)
                table_data.append([metric, formatted_score])
                analysis_text += f"- **{metric}**: {formatted_score}\n"
            elif isinstance(score, dict):
                # Handle nested metrics
                for sub_metric, sub_score in score.items():
                    if isinstance(sub_score, (int, float)):
                        formatted_score = f"{sub_score:.4f}" if isinstance(sub_score, float) else str(sub_score)
                        metric_name = f"{metric}_{sub_metric}"
                        table_data.append([metric_name, formatted_score])
                        analysis_text += f"- **{metric_name}**: {formatted_score}\n"

        if not table_data:
            return "No metrics were computed. Please select at least one metric.", [["No results", ""]]

        return analysis_text, table_data

    except ImportError as e:
        error_msg = f"Import Error: {str(e)}. Please ensure RadEval dependencies are installed."
        return error_msg, [["Error", error_msg]]
    except Exception as e:
        error_msg = f"Evaluation Error: {str(e)}"
        return error_msg, [["Error", error_msg]]
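# Quick sanity check for the function above (illustrative only; the inputs are made up
# and assume the fast metrics are installed):
#
#   analysis_md, score_rows = run_radeval_simple(
#       "No acute cardiopulmonary process.",
#       "No acute findings.",
#       ["BLEU", "ROUGE"],
#   )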
# Example pairs for radiology reports
examples = {
    "Normal vs Normal": {
        "ref": "Heart size is normal. Lungs are clear. No pleural effusion or pneumothorax.",
        "hyp": "Cardiac silhouette is within normal limits. Lungs are clear bilaterally. No effusion or pneumothorax identified.",
    },
    "Pneumonia Case": {
        "ref": "Moderate cardiomegaly. Bilateral lower lobe consolidations consistent with pneumonia.",
        "hyp": "Enlarged heart. Worsening bilateral infiltrates in the lower lobes suggestive of pneumonia.",
    },
    "Temporal Comparison": {
        "ref": "Compared to prior study, the pleural effusion has increased in size. New bilateral infiltrates are present.",
        "hyp": "The pleural effusion is larger than on the previous examination. There are new bilateral pulmonary infiltrates.",
    },
    "Discordant Reports": {
        "ref": "No acute cardiopulmonary process. Normal heart size and lung fields.",
        "hyp": "Mild cardiomegaly with bilateral lower lobe atelectasis. Small pleural effusion on the right.",
    },
    "Ambiguous Language": {
        "ref": "There is a small left-sided pleural effusion with adjacent atelectasis.",
        "hyp": "Possible small effusion on the left. Atelectasis cannot be excluded.",
    },
    "Surgical Follow-up": {
        "ref": "Status post coronary artery bypass grafting. No evidence of acute complication.",
        "hyp": "Post-operative changes from CABG are present. No signs of surgical complication.",
    },
    "False Positive": {
        "ref": "No focal consolidation, pleural effusion, or pneumothorax identified.",
        "hyp": "Right lower lobe consolidation concerning for pneumonia.",
    },
    "Textual Hallucination": {
        "ref": "Heart and mediastinum are normal. Lungs are clear.",
        "hyp": "Large left pleural effusion with mediastinal shift to the right.",
    },
    "Negation Challenge": {
        "ref": "No evidence of pneumothorax or pleural effusion.",
        "hyp": "Evidence of small pneumothorax on the right.",
    },
    "Fine-grained Difference": {
        "ref": "Mild interstitial markings at the lung bases, likely chronic.",
        "hyp": "Subtle increased interstitial opacities at both lung bases, likely chronic in nature.",
    }
}


def update_fields(choice):
    """Update text fields based on example selection"""
    if choice == "Custom":
        return gr.update(value="", interactive=True), gr.update(value="", interactive=True)
    else:
        return (
            gr.update(value=examples[choice]["ref"], interactive=False),
            gr.update(value=examples[choice]["hyp"], interactive=False)
        )


# Available metrics (ordered by computational complexity)
available_metrics = [
    "BLEU",
    "ROUGE",
    "BERTScore",
    "Temporal F1",
    "RadEval BERTScore",
    "RaTEScore",
    "RadCliQ",
    "SRR-BERT",
    "CheXbert F1",
    "RadGraph F1",
    "GREEN"
]

# Fast metrics for default selection
default_metrics = ["BLEU", "ROUGE", "BERTScore"]
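# Note: every label in `available_metrics` must have a matching `do_*` flag in the
# config dict built inside run_radeval_simple(); a label without a corresponding
# flag would be silently ignored.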
with gr.Blocks(title="RadEval Evaluation", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🏎️ RadEval Evaluation

        **RadEval** is a lightweight, extensible framework for **evaluating radiology reports**
        using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific
        measures** (e.g. RadGraph, CheXbert, GREEN). Whether you're benchmarking generation
        systems or validating clinical correctness, RadEval offers **comprehensive and
        interpretable** metrics out of the box.

        **⚠️ Performance Warning ⚠️**
        This demo currently runs on **CPU**, so slower metrics (such as RadGraph, CheXbert,
        and GREEN) may take a while to finish. Please be patient.
        """
    )

    with gr.Row():
        choice = gr.Radio(
            label="📋 Choose Example or Custom Input",
            choices=["Custom"] + list(examples.keys()),
            value="Custom",
            interactive=True
        )

    with gr.Row():
        with gr.Column(scale=1):
            ref_input = gr.Textbox(
                label="📄 Reference Report (Ground Truth)",
                lines=5,
                placeholder="Enter the reference radiology report here...",
                info="The ground truth or expert-written report"
            )
        with gr.Column(scale=1):
            hyp_input = gr.Textbox(
                label="🤖 Hypothesis Report (Generated)",
                lines=5,
                placeholder="Enter the generated/predicted radiology report here...",
                info="The AI-generated or system-produced report"
            )

    choice.change(
        update_fields,
        inputs=choice,
        outputs=[ref_input, hyp_input],
    )

    with gr.Row():
        metrics_selection = gr.CheckboxGroup(
            label="🎯 Select Evaluation Metrics",
            choices=available_metrics,
            value=default_metrics,
            interactive=True,
            info="Select metrics to compute. Some metrics may take longer (RadGraph, CheXbert, GREEN)."
        )

    with gr.Row():
        run_button = gr.Button("🚀 Run RadEval", variant="primary", size="lg")

    with gr.Row():
        with gr.Column(scale=2):
            analysis_output = gr.Markdown(
                value="📊 **Results will appear here after evaluation...**\n\nSelect your texts and metrics, then click 'Run RadEval'."
            )
        with gr.Column(scale=1):
            table_output = gr.DataFrame(
                label="📈 Detailed Scores",
                headers=["Metric", "Score"],
                wrap=True
            )

    # Information section
    with gr.Accordion("💡 Metric Information", open=False):
        gr.Markdown(
            """
            ### 📊 Available Metrics:

            **Traditional NLG Metrics:**
            - **BLEU**: N-gram overlap between reference and hypothesis
            - **ROUGE**: Recall-oriented overlap (ROUGE-1, ROUGE-2, ROUGE-L)
            - **BERTScore**: Semantic similarity using BERT embeddings

            **Radiology-Specific Metrics:**
            - **RadGraph F1**: Entity and relation extraction for radiology
            - **CheXbert F1**: Chest X-ray finding classification performance
            - **RaTEScore**: Radiology-aware text evaluation score
            - **RadCliQ**: Composite metric for radiology reports
            - **Temporal F1**: Temporal entity and relationship evaluation
            - **RadEval BERTScore**: Specialized BERT for radiology text
            - **GREEN**: Generative evaluation with natural language explanations
            - **SRR-BERT**: Structured radiology reasoning evaluation

            ### ⚡ Performance Notes:
            - **Fast**: BLEU, ROUGE, BERTScore, Temporal F1
            - **Medium**: RadEval BERTScore, RaTEScore, RadCliQ, SRR-BERT
            - **Slow**: CheXbert F1, RadGraph F1, GREEN (requires model downloads)
            """
        )

    run_button.click(
        run_radeval_simple,
        inputs=[ref_input, hyp_input, metrics_selection],
        outputs=[analysis_output, table_output]
    )
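# `demo` is not launched on its own; it becomes the first tab of `combined_demo`
# below via gr.TabbedInterface.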
# =============================================================================
# 🧪 Hypothesis Testing Section
# =============================================================================

def run_hypothesis_testing(systems_data, selected_test_metrics, n_samples, significance_level):
    """
    Run statistical significance testing between multiple systems.
    Returns a Markdown-formatted results string for the single output panel.
    """
    try:
        # Parse systems data (expecting JSON format)
        import json
        systems_dict = json.loads(systems_data)

        # Extract references and systems
        if 'references' not in systems_dict or 'systems' not in systems_dict:
            return "Error: Please provide both 'references' and 'systems' in the JSON data."

        references = systems_dict['references']
        systems = systems_dict['systems']

        # Validate data integrity
        if not references or not systems:
            return "Error: References and systems cannot be empty."

        if not isinstance(references, list) or not isinstance(systems, dict):
            return "Error: References must be a list and systems must be a dictionary."

        # Check that all systems have the same number of outputs as references
        ref_count = len(references)
        for system_name, system_outputs in systems.items():
            if not isinstance(system_outputs, list):
                return f"Error: System '{system_name}' outputs must be a list."
            if len(system_outputs) != ref_count:
                return f"Error: System '{system_name}' has {len(system_outputs)} outputs but {ref_count} references provided."

        # Validate that all texts are non-empty strings
        for i, ref in enumerate(references):
            if not isinstance(ref, str) or not ref.strip():
                return f"Error: Reference {i+1} is empty or not a string."

        for system_name, system_outputs in systems.items():
            for i, output in enumerate(system_outputs):
                if not isinstance(output, str) or not output.strip():
                    return f"Error: System '{system_name}' output {i+1} is empty or not a string."

        # Initialize evaluators based on selected metrics (fast metrics only)
        evaluators = {}
        if 'BLEU' in selected_test_metrics:
            evaluators['bleu'] = RadEval(do_bleu=True)
        if 'ROUGE' in selected_test_metrics:
            evaluators['rouge'] = RadEval(do_rouge=True)
        if 'BERTScore' in selected_test_metrics:
            evaluators['bertscore'] = RadEval(do_bertscore=True)

        # Custom metric: average word count
        def word_count_metric(hyps, refs):
            return sum(len(report.split()) for report in hyps) / len(hyps)

        # Build metrics dictionary (following the example structure)
        metrics = {}

        if 'BLEU' in selected_test_metrics:
            # Test the evaluator first
            try:
                test_result = evaluators['bleu'](references[:1], [systems[list(systems.keys())[0]][0]])
                if 'bleu' not in test_result:
                    return "Error: BLEU evaluator doesn't return 'bleu' key. Available keys: " + str(list(test_result.keys()))
                metrics['bleu'] = lambda hyps, refs: evaluators['bleu'](refs, hyps)['bleu']
            except Exception as bleu_error:
                return f"Error testing BLEU evaluator: {str(bleu_error)}"

        if 'ROUGE' in selected_test_metrics:
            try:
                test_result = evaluators['rouge'](references[:1], [systems[list(systems.keys())[0]][0]])
                for rouge_key in ['rouge1', 'rouge2', 'rougeL']:
                    if rouge_key not in test_result:
                        return f"Error: ROUGE evaluator doesn't return '{rouge_key}' key. Available keys: " + str(list(test_result.keys()))
                metrics['rouge1'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rouge1']
                metrics['rouge2'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rouge2']
                metrics['rougeL'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rougeL']
            except Exception as rouge_error:
                return f"Error testing ROUGE evaluator: {str(rouge_error)}"

        if 'BERTScore' in selected_test_metrics:
            try:
                test_result = evaluators['bertscore'](references[:1], [systems[list(systems.keys())[0]][0]])
                if 'bertscore' not in test_result:
                    return "Error: BERTScore evaluator doesn't return 'bertscore' key. Available keys: " + str(list(test_result.keys()))
                metrics['bertscore'] = lambda hyps, refs: evaluators['bertscore'](refs, hyps)['bertscore']
            except Exception as bert_error:
                return f"Error testing BERTScore evaluator: {str(bert_error)}"

        if 'custom: Word Count' in selected_test_metrics:
            metrics['word_count'] = word_count_metric  # ← example of a simple custom-defined metric

        if not metrics:
            return "Error: Please select at least one metric for testing."

        # Run significance tests
        try:
            signatures, scores = compare_systems(
                systems=systems,
                metrics=metrics,
                references=references,
                n_samples=int(n_samples),
                significance_level=float(significance_level),
                print_results=False  # No printed output needed for the online demo
            )
        except Exception as compare_error:
            return f"Error during significance testing: {str(compare_error)}\n\nThis might be due to:\n1. Empty or invalid text content\n2. Incompatible metric configurations\n3. RadEval library issues"
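        # `scores` (as used below) maps each system name to its mean metric values,
        # plus `<metric>_pvalue` entries for every non-baseline system.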
        # Format results
        results_text = "## 🧪 Hypothesis Testing Results\n\n"
        results_text += "**Parameters:**\n"
        results_text += f"- Randomization samples: {n_samples}\n"
        results_text += f"- Significance level: {significance_level}\n"
        results_text += f"- Number of systems: {len(systems)}\n"
        results_text += f"- Number of references: {len(references)}\n\n"

        # Significant differences summary
        results_text += "### 📊 Significant Differences Summary\n\n"

        baseline_name = list(systems.keys())[0]  # Assume the first system is the baseline
        results_text += f"**Baseline system:** {baseline_name}\n\n"

        has_significant_differences = False
        for system_name in systems.keys():
            if system_name == baseline_name:
                continue

            significant_metrics = []
            for metric_name in metrics.keys():
                pvalue_key = f"{metric_name}_pvalue"
                if pvalue_key in scores[system_name]:
                    p_val = scores[system_name][pvalue_key]
                    if p_val < float(significance_level):
                        significant_metrics.append(metric_name)

            if significant_metrics:
                results_text += f"**{system_name} vs {baseline_name}:** {', '.join(significant_metrics)} (p < {significance_level})\n\n"
                has_significant_differences = True
            else:
                results_text += f"**{system_name} vs {baseline_name}:** No significant differences\n\n"

        if not has_significant_differences:
            results_text += "*No statistically significant differences found between systems.*\n\n"

        # Add mean scores in table format
        results_text += "### 📈 Mean Scores by System\n\n"

        try:
            baseline_name = list(systems.keys())[0]

            # Display each system's results in a clean format
            for system_name in systems.keys():
                results_text += f"**{system_name.upper()}:**\n\n"

                # Create table header
                results_text += "| Metric | Score | P-value |\n"
                results_text += "|--------|-------|----------|\n"

                # Get system data from scores
                system_scores = scores.get(system_name, {})

                # Add rows for each metric
                for metric_name in metrics.keys():
                    if metric_name in system_scores:
                        score = system_scores[metric_name]
                        pvalue_key = f"{metric_name}_pvalue"

                        # Format score
                        score_str = f"{score:.4f}" if isinstance(score, (int, float)) else str(score)

                        # Format p-value (only for non-baseline systems)
                        if system_name != baseline_name and pvalue_key in system_scores:
                            pvalue = system_scores[pvalue_key]
                            pvalue_str = f"{pvalue:.4f}" if isinstance(pvalue, (int, float)) else str(pvalue)
                            # Mark significant p-values
                            if isinstance(pvalue, (int, float)) and pvalue < float(significance_level):
                                pvalue_str += " *"
                        else:
                            pvalue_str = "-" if system_name == baseline_name else "N/A"

                        results_text += f"| {metric_name} | {score_str} | {pvalue_str} |\n"

                results_text += "\n"

            results_text += "*Note: The baseline system shows scores only. Other systems show scores and p-values compared to the baseline.*\n"
            results_text += f"*P-values marked with * are significant (p < {significance_level}).*\n\n"

        except Exception as score_error:
            results_text += f"Error formatting scores: {str(score_error)}\n\n"

        return results_text

    except ImportError as e:
        return f"Import Error: {str(e)}. Please ensure RadEval with compare_systems is installed."
    except json.JSONDecodeError:
        return "Error: Invalid JSON format in systems data."
    except Exception as e:
        return f"Testing Error: {str(e)}"
# Create Hypothesis Testing UI
with gr.Blocks(title="Null Hypothesis Testing", theme=gr.themes.Soft()) as hypothesis_demo:
    gr.Markdown(
        """
        # 🖥️ Null Hypothesis Testing

        **Statistical significance testing** for comparing multiple radiology report generation systems.
        This tool uses **randomization-based significance testing** to determine whether differences
        between systems are statistically meaningful.

        **⚠️ Performance Warning ⚠️**
        Hypothesis testing with multiple metrics may take some time, especially with larger sample sizes.
        Please be patient during computation.
        """
    )

    with gr.Row():
        with gr.Column(scale=1.5):
            systems_input = gr.Textbox(
                label="📊 Systems Data (JSON Format)",
                lines=18,
                placeholder="""Enter systems data in JSON format, e.g.:
{
  "references": [
    "No acute cardiopulmonary process.",
    "Mild cardiomegaly with clear lung fields."
  ],
  "systems": {
    "baseline": [
      "No acute findings.",
      "Mild cardiomegaly, clear lungs."
    ],
    "improved": [
      "No acute cardiopulmonary process.",
      "Mild cardiomegaly with clear lung fields bilaterally."
    ]
  }
}""",
                info="Provide reference reports and multiple systems to compare"
            )
        with gr.Column(scale=1):
            test_metrics_selection = gr.CheckboxGroup(
                label="🎯 Select Metrics for Testing",
                choices=["BLEU", "ROUGE", "BERTScore", "custom: Word Count"],
                value=["BLEU", "ROUGE", "BERTScore"],
                interactive=True,
                info="Only fast metrics are shown to ensure quick evaluation (slow ones are excluded)"
            )
            n_samples_input = gr.Number(
                label="🔄 Randomization Samples",
                value=50,
                minimum=10,
                maximum=1000,
                step=10,
                info="Number of randomization samples (higher = more confidence, but slower)"
            )
            significance_level_input = gr.Number(
                label="📈 Significance Level (α)",
                value=0.05,
                minimum=0.01,
                maximum=0.10,
                step=0.01,
                info="Alpha level for significance testing"
            )
            example_button = gr.Button("📝 Load Example Data", variant="secondary")
            clear_button = gr.Button("🗑️ Clear Data", variant="secondary")

    with gr.Row():
        test_button = gr.Button("🧪 Run Hypothesis Testing", variant="primary", size="lg")

    with gr.Row():
        test_results = gr.Markdown(
            value="📊 **Test results will appear here...**\n\nClick 'Load Example Data' to see sample input, then click 'Run Hypothesis Testing' to see results."
        )

    # Example data button
    def load_example_data():
        example_data = {
            "references": [
                "No acute cardiopulmonary process.",
                "No radiographic findings to suggest pneumonia.",
                "Mild cardiomegaly with clear lung fields.",
                "Small pleural effusion on the right side.",
                "Status post cardiac surgery with stable appearance."
            ],
            "systems": {
                "baseline": [
                    "No acute findings.",
                    "No pneumonia.",
                    "Mild cardiomegaly, clear lungs.",
                    "Small right pleural effusion.",
                    "Post-cardiac surgery, stable."
                ],
                "improved": [
                    "No acute cardiopulmonary process.",
                    "No radiographic findings suggesting pneumonia.",
                    "Mild cardiomegaly with clear lung fields bilaterally.",
                    "Small pleural effusion present on the right side.",
                    "Status post cardiac surgery with stable appearance."
                ],
                "poor": [
                    "Normal.",
                    "OK.",
                    "Heart big.",
                    "Some fluid.",
                    "Surgery done."
                ]
            }
        }
        import json
        return json.dumps(example_data, indent=2)

    example_button.click(
        load_example_data,
        outputs=systems_input
    )

    clear_button.click(
        lambda: "",
        outputs=systems_input
    )

    test_button.click(
        run_hypothesis_testing,
        inputs=[systems_input, test_metrics_selection, n_samples_input, significance_level_input],
        outputs=[test_results]
    )

    with gr.Accordion("💡 Hypothesis Testing Information", open=False):
        gr.Markdown(
            """
            ### 🔬 How it Works:

            This tool performs **randomization-based significance testing** to compare multiple systems:

            1. **Null Hypothesis**: No difference between systems
            2. **Randomization**: Randomly permute system outputs multiple times
            3. **P-value Calculation**: Proportion of permutations where the random difference ≥ the observed difference
            4. **Significance**: If p-value < α, reject the null hypothesis (the systems are significantly different)

            ### 📊 Input Format:
            - **References**: Ground truth reports
            - **Systems**: Multiple systems to compare (each with the same number of outputs as references)
            - **Metrics**: Evaluation metrics to use for comparison

            ### 📈 Output:
            - **Significance Summary**: P-values comparing each system against the baseline (the first system listed)
            - **Mean Scores**: Average performance of each system on each metric
            - **P-values marked with `*`**: Indicate statistically significant differences

            ### ⚡ Performance:
            - **Fast Metrics Only**: This tool only includes BLEU, ROUGE, BERTScore, and Word Count for optimal performance
            - **Excluded Slow Metrics**: RadGraph F1 and CheXbert F1 are excluded to keep computation time reasonable
            - More randomization samples = more accurate p-values, but slower computation
            - Recommended: 50-100 samples for quick testing, 1000+ for publication
            """
        )
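# Illustrative sketch of the paired approximate randomization test described in the
# accordion above (documentation only; it is not called by this app and is not
# RadEval's internal implementation). Outputs of two systems are randomly swapped
# per example, and the p-value is the fraction of resamples whose absolute score
# difference is at least the observed one.
def _paired_randomization_sketch(metric, sys_a_outputs, sys_b_outputs, references,
                                 n_samples=1000, seed=0):
    import random
    rng = random.Random(seed)
    # Observed absolute difference on the real system outputs
    observed = abs(metric(sys_a_outputs, references) - metric(sys_b_outputs, references))
    extreme = 0
    for _ in range(n_samples):
        perm_a, perm_b = [], []
        for a, b in zip(sys_a_outputs, sys_b_outputs):
            if rng.random() < 0.5:  # swap this pair's outputs with probability 0.5
                a, b = b, a
            perm_a.append(a)
            perm_b.append(b)
        if abs(metric(perm_a, references) - metric(perm_b, references)) >= observed:
            extreme += 1
    return extreme / n_samples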
# Combine both demos using gr.Blocks to add a header
with gr.Blocks(
    title="RadEval: A framework for radiology text evaluation",
    theme=gr.themes.Soft(),
    css="""
    .tab-nav button {
        font-weight: bold !important;
        border: 2px solid #e0e7ff !important;
        border-radius: 10px !important;
        margin: 0 5px !important;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
        color: white !important;
        box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2) !important;
        transition: all 0.3s ease !important;
    }
    .tab-nav button:hover {
        transform: translateY(-2px) !important;
        box-shadow: 0 6px 20px rgba(0, 0, 0, 0.3) !important;
        background: linear-gradient(135deg, #764ba2 0%, #667eea 100%) !important;
    }
    .tab-nav button.selected {
        background: linear-gradient(135deg, #ff6b6b 0%, #ee5a24 100%) !important;
        border-color: #ff6b6b !important;
        transform: translateY(-1px) !important;
        box-shadow: 0 8px 25px rgba(255, 107, 107, 0.4) !important;
    }
    """
) as combined_demo:
    gr.Markdown(
        """
        # 🩺 RadEval: A framework for radiology text evaluation
        ### [Github](https://github.com/jbdel/RadEval) | [PyPI](https://pypi.org/project/RadEval) | [Video](https://justin13601.github.io/files/radeval.mp4) | [arXiv]() | [RadEval_ModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()
        """
    )

    tabs = gr.TabbedInterface(
        [demo, hypothesis_demo],
        ["🏎️ RadEval Evaluation", "🖥️ Null Hypothesis Testing"]
    )

if __name__ == "__main__":
    combined_demo.launch()
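# To expose the app publicly or pin a port, launch() accepts the standard Gradio
# options, e.g. combined_demo.launch(share=True) or combined_demo.launch(server_port=7860).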