X-iZhang committed on
Commit bad8293 · verified · 1 Parent(s): a39b152
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ RadEval_banner.png filter=lfs diff=lfs merge=lfs -text
RadEval.py ADDED
@@ -0,0 +1,424 @@
1
+ from collections import defaultdict
2
+ import stanza
3
+ import warnings
4
+ import logging
5
+ import os
6
+ import re
7
+ from nlg.rouge.rouge import Rouge
8
+ from nlg.bleu.bleu import Bleu
9
+ from nlg.bertscore.bertscore import BertScore
10
+ from radgraph import F1RadGraph
11
+ from factual.green_score import GREEN
12
+ from factual.RaTEScore import RaTEScore
13
+ from factual.f1temporal import F1Temporal
14
+ from torch import nn
15
+ import pandas as pd
16
+ import numpy as np
17
+ from sklearn.metrics import classification_report
18
+ from sklearn.exceptions import UndefinedMetricWarning
19
+ import json
20
+ from factual.f1chexbert import F1CheXbert
21
+ import nltk
22
+ from utils import clean_numbered_list
23
+ from factual.RadCliQv1.radcliq import CompositeMetric
24
+ from factual.SRRBert.srr_bert import SRRBert, srr_bert_parse_sentences
25
+ from nlg.radevalbertscore import RadEvalBERTScorer
26
+ # Suppress warnings
27
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
28
+ warnings.filterwarnings('ignore')
29
+ logging.basicConfig(level=logging.ERROR)
30
+
31
+
32
+
33
+
34
+ class RadEval():
35
+ def __init__(self,
36
+ do_radgraph=False,
37
+ do_green=False,
38
+ do_bleu=False,
39
+ do_rouge=False,
40
+ do_bertscore=False,
41
+ do_srr_bert=False,
42
+ do_chexbert=False,
43
+ do_ratescore=False,
44
+ do_radcliq=False,
45
+ do_radeval_bertsore=False,
46
+ do_temporal=False,
47
+ do_details=False,
48
+ ):
49
+ super(RadEval, self).__init__()
50
+
51
+ self.do_radgraph = do_radgraph
52
+ self.do_green = do_green
53
+ self.do_bleu = do_bleu
54
+ self.do_rouge = do_rouge
55
+ self.do_bertscore = do_bertscore
56
+ self.do_srr_bert = do_srr_bert
57
+ self.do_chexbert = do_chexbert
58
+ self.do_ratescore = do_ratescore
59
+ self.do_radcliq = do_radcliq
60
+ self.do_temporal = do_temporal
61
+ self.do_radeval_bertsore = do_radeval_bertsore
62
+ self.do_details = do_details
63
+
64
+ # Initialize scorers only once
65
+ if self.do_radgraph:
66
+ self.radgraph_scorer = F1RadGraph(reward_level="all", model_type="radgraph-xl")
67
+ if self.do_bleu:
68
+ self.bleu_scorer = Bleu()
69
+ self.bleu_scorer_1 = Bleu(n=1)
70
+ self.bleu_scorer_2 = Bleu(n=2)
71
+ self.bleu_scorer_3 = Bleu(n=3)
72
+ if self.do_bertscore:
73
+ self.bertscore_scorer = BertScore(model_type='distilbert-base-uncased',
74
+ num_layers=5)
75
+ if self.do_green:
76
+ # Initialize green scorer here if needed
77
+ self.green_scorer = GREEN("StanfordAIMI/GREEN-radllama2-7b",
78
+ output_dir=".")
79
+
80
+ if self.do_rouge:
81
+ self.rouge_scorers = {
82
+ "rouge1": Rouge(rouges=["rouge1"]),
83
+ "rouge2": Rouge(rouges=["rouge2"]),
84
+ "rougeL": Rouge(rouges=["rougeL"])
85
+ }
86
+
87
+ if self.do_srr_bert:
88
+ nltk.download('punkt_tab', quiet=True)
89
+ self.srr_bert_scorer = SRRBert(model_type="leaves_with_statuses")
90
+
91
+
92
+ if self.do_chexbert:
93
+ self.chexbert_scorer = F1CheXbert()
94
+
95
+ if self.do_ratescore:
96
+ self.ratescore_scorer = RaTEScore()
97
+
98
+ if self.do_radcliq:
99
+ self.radcliq_scorer = CompositeMetric()
100
+
101
+ if self.do_temporal:
102
+ stanza.download('en', package='radiology', processors={'ner': 'radiology'})
103
+ self.F1Temporal = F1Temporal
104
+
105
+ if self.do_radeval_bertsore:
106
+ self.radeval_bertsore = RadEvalBERTScorer(
107
+ model_type="IAMJB/RadEvalModernBERT",
108
+ num_layers=22,
109
+ use_fast_tokenizer=True,
110
+ rescale_with_baseline=False)
111
+ # Store the metric keys
112
+ self.metric_keys = []
113
+ if self.do_radgraph:
114
+ self.metric_keys.extend(["radgraph_simple", "radgraph_partial", "radgraph_complete"])
115
+ if self.do_bleu:
116
+ self.metric_keys.append("bleu")
117
+ if self.do_green:
118
+ self.metric_keys.append("green")
119
+ if self.do_bertscore:
120
+ self.metric_keys.append("bertscore")
121
+ if self.do_rouge:
122
+ self.metric_keys.extend(self.rouge_scorers.keys())
123
+ if self.do_srr_bert:
124
+ self.metric_keys.extend(["samples_avg_precision", "samples_avg_recall", "samples_avg_f1-score"])
125
+
126
+ if self.do_chexbert:
127
+ self.metric_keys.extend([
128
+ "chexbert-5_micro avg_f1-score",
129
+ "chexbert-all_micro avg_f1-score",
130
+ "chexbert-5_macro avg_f1-score",
131
+ "chexbert-all_macro avg_f1-score"
132
+ ])
133
+
134
+ if self.do_ratescore:
135
+ self.metric_keys.append("ratescore")
136
+ if self.do_radcliq:
137
+ self.metric_keys.append("radcliqv1")
138
+ if self.do_temporal:
139
+ self.metric_keys.append("temporal_f1")
140
+ if self.do_radeval_bertsore:
141
+ self.metric_keys.append("radeval_bertsore")
142
+
143
+ def __call__(self, refs, hyps):
144
+ if not (isinstance(hyps, list) and isinstance(refs, list)):
145
+ raise TypeError("hyps and refs must be of type list")
146
+ if len(hyps) != len(refs):
147
+ raise ValueError("hyps and refs lists don't have the same size")
148
+ if len(refs) == 0:
149
+ return {}
150
+
151
+ scores = self.compute_scores(refs=refs, hyps=hyps)
152
+ return scores
153
+
154
+ def compute_scores(self, refs, hyps):
155
+ if not (isinstance(hyps, list) and isinstance(refs, list)):
156
+ raise TypeError("hyps and refs must be of type list")
157
+ if len(hyps) != len(refs):
158
+ raise ValueError("hyps and refs lists don't have the same size")
159
+
160
+ scores = {}
161
+ if self.do_radgraph:
162
+ radgraph_scores = self.radgraph_scorer(refs=refs, hyps=hyps)
163
+
164
+ if self.do_details:
165
+ f1_scores = radgraph_scores[0]
166
+ individual_scores = radgraph_scores[1]
167
+ hyps_entities = radgraph_scores[2]
168
+ refs_entities = radgraph_scores[3]
169
+
170
+ scores["radgraph"] = {
171
+ "radgraph_simple": f1_scores[0],
172
+ "radgraph_partial": f1_scores[1],
173
+ "radgraph_complete": f1_scores[2],
174
+ "reward_list": individual_scores,
175
+ "hypothesis_annotation_lists": hyps_entities,
176
+ "reference_annotation_lists": refs_entities
177
+ }
178
+
179
+ else:
180
+ radgraph_scores = radgraph_scores[0]
181
+ scores["radgraph_simple"] = radgraph_scores[0]
182
+ scores["radgraph_partial"] = radgraph_scores[1]
183
+ scores["radgraph_complete"] = radgraph_scores[2]
184
+
185
+ if self.do_bleu:
186
+ if self.do_details:
187
+ bleu_1_score = self.bleu_scorer_1(refs, hyps)[0]
188
+ bleu_2_score = self.bleu_scorer_2(refs, hyps)[0]
189
+ bleu_3_score = self.bleu_scorer_3(refs, hyps)[0]
190
+ bleu_4_score = self.bleu_scorer(refs, hyps)[0]
191
+
192
+ scores["bleu"] = {
193
+ "bleu_1": bleu_1_score,
194
+ "bleu_2": bleu_2_score,
195
+ "bleu_3": bleu_3_score,
196
+ "bleu_4": bleu_4_score
197
+ }
198
+ else:
199
+ scores["bleu"] = self.bleu_scorer(refs, hyps)[0]
200
+
201
+ if self.do_bertscore:
202
+ if self.do_details:
203
+ bertscore_scores, sample_scores = self.bertscore_scorer(refs, hyps)
204
+ scores["bertscore"] = {
205
+ "mean_score": bertscore_scores,
206
+ "sample_scores": sample_scores
207
+ }
208
+ else:
209
+ scores["bertscore"] = self.bertscore_scorer(refs, hyps)[0]
210
+
211
+ if self.do_green:
212
+ # Use the initialized green scorer
213
+ mean, std, sample_scores, summary, _ = self.green_scorer(refs, hyps)
214
+ if self.do_details:
215
+ scores["green"] = {
216
+ "mean": mean,
217
+ "std": std,
218
+ "sample_scores": sample_scores,
219
+ "summary": summary
220
+ }
221
+ else:
222
+ scores["green"] = mean
223
+
224
+ if self.do_rouge:
225
+ if self.do_details:
226
+ rouge_scores = {}
227
+ for key, scorer in self.rouge_scorers.items():
228
+ mean, sample_scores = scorer(refs, hyps)
229
+ rouge_scores[key] = {
230
+ "mean_score": mean,
231
+ "sample_scores": sample_scores
232
+ }
233
+
234
+ scores["rouge"] = rouge_scores
235
+ else:
236
+ for key, scorer in self.rouge_scorers.items():
237
+ scores[key] = scorer(refs, hyps)[0]
238
+
239
+ if self.do_srr_bert:
240
+ # Clean reports before tokenization
241
+ parsed_refs = [srr_bert_parse_sentences(ref) for ref in refs]
242
+ parsed_hyps = [srr_bert_parse_sentences(hyp) for hyp in hyps]
243
+
244
+
245
+ section_level_hyps_pred = []
246
+ section_level_refs_pred = []
247
+ for parsed_hyp, parsed_ref in zip(parsed_hyps, parsed_refs):
248
+ outputs, _ = self.srr_bert_scorer(sentences=parsed_ref + parsed_hyp)
249
+
250
+ refs_preds = outputs[:len(parsed_ref)]
251
+ hyps_preds = outputs[len(parsed_ref):]
252
+
253
+ merged_refs_preds = np.any(refs_preds, axis=0).astype(int)
254
+ merged_hyps_preds = np.any(hyps_preds, axis=0).astype(int)
255
+
256
+ section_level_hyps_pred.append(merged_hyps_preds)
257
+ section_level_refs_pred.append(merged_refs_preds)
258
+
259
+ label_names = [label for label, idx in sorted(self.srr_bert_scorer.mapping.items(), key=lambda x: x[1])]
260
+ classification_dict = classification_report(section_level_refs_pred,
261
+ section_level_hyps_pred,
262
+ target_names=label_names,
263
+ output_dict=True,
264
+ zero_division=0)
265
+
266
+ if self.do_details:
267
+ label_scores = {}
268
+ for label in label_names:
269
+ if label in classification_dict:
270
+ f1 = classification_dict[label]["f1-score"]
271
+ support = classification_dict[label]["support"]
272
+ if f1 > 0 or support > 0:
273
+ label_scores[label] = {
274
+ "f1-score": f1,
275
+ "precision": classification_dict[label]["precision"],
276
+ "recall": classification_dict[label]["recall"],
277
+ "support": support
278
+ }
279
+
280
+ scores["srr_bert"] = {
281
+ "srr_bert_weighted_f1": classification_dict["weighted avg"]["f1-score"],
282
+ "srr_bert_weighted_precision": classification_dict["weighted avg"]["precision"],
283
+ "srr_bert_weighted_recall": classification_dict["weighted avg"]["recall"],
284
+ "label_scores": label_scores
285
+ }
286
+ else:
287
+ scores["srr_bert_weighted_f1"] = classification_dict["weighted avg"]["f1-score"]
288
+ scores["srr_bert_weighted_precision"] = classification_dict["weighted avg"]["precision"]
289
+ scores["srr_bert_weighted_recall"] = classification_dict["weighted avg"]["recall"]
290
+
291
+
292
+
293
+ if self.do_chexbert:
294
+ accuracy, accuracy_per_sample, chexbert_all, chexbert_5 = self.chexbert_scorer(hyps, refs)
295
+ if self.do_details:
296
+ chexbert_5_labels = {
297
+ k: v["f1-score"]
298
+ for k, v in list(chexbert_5.items())[:-4]
299
+ }
300
+
301
+ chexbert_all_labels = {
302
+ k: v["f1-score"]
303
+ for k, v in list(chexbert_all.items())[:-4]
304
+ }
305
+
306
+ scores["chexbert"] = {
307
+ "chexbert-5_micro avg_f1-score": chexbert_5["micro avg"]["f1-score"],
308
+ "chexbert-all_micro avg_f1-score": chexbert_all["micro avg"]["f1-score"],
309
+ "chexbert-5_macro avg_f1-score": chexbert_5["macro avg"]["f1-score"],
310
+ "chexbert-all_macro avg_f1-score": chexbert_all["macro avg"]["f1-score"],
311
+ "chexbert-5_weighted_f1": chexbert_5["weighted avg"]["f1-score"],
312
+ "chexbert-all_weighted_f1": chexbert_all["weighted avg"]["f1-score"],
313
+ "label_scores_f1-score": {
314
+ "chexbert-5": chexbert_5_labels,
315
+ "chexbert_all": chexbert_all_labels
316
+ }
317
+ }
318
+ else:
319
+ scores["chexbert-5_micro avg_f1-score"] = chexbert_5["micro avg"]["f1-score"]
320
+ scores["chexbert-all_micro avg_f1-score"] = chexbert_all["micro avg"]["f1-score"]
321
+ scores["chexbert-5_macro avg_f1-score"] = chexbert_5["macro avg"]["f1-score"]
322
+ scores["chexbert-all_macro avg_f1-score"] = chexbert_all["macro avg"]["f1-score"]
323
+ scores["chexbert-5_weighted_f1"] = chexbert_5["weighted avg"]["f1-score"]
324
+ scores["chexbert-all_weighted_f1"] = chexbert_all["weighted avg"]["f1-score"]
325
+
326
+ if self.do_ratescore:
327
+ rate_score, pred_pairs_raw ,gt_pairs_raw = self.ratescore_scorer.compute_score(candidate_list=hyps, reference_list=refs)
328
+ f1_ratescore = float(np.mean(rate_score))
329
+ if self.do_details:
330
+ pred_pairs = [
331
+ {ent: label for ent, label in sample}
332
+ for sample in pred_pairs_raw
333
+ ]
334
+ gt_pairs = [
335
+ {ent: label for ent, label in sample}
336
+ for sample in gt_pairs_raw
337
+ ]
338
+ scores["ratescore"] = {
339
+ "f1-score": f1_ratescore,
340
+ "hyps_pairs": pred_pairs,
341
+ "refs_pairs": gt_pairs
342
+ }
343
+ else:
344
+ scores["ratescore"] = f1_ratescore
345
+
346
+ if self.do_radcliq:
347
+ mean_scores, detail_scores = self.radcliq_scorer.predict(refs, hyps)
348
+ if self.do_details:
349
+ scores["radcliq-v1"] = {
350
+ "mean_score": mean_scores,
351
+ "sample_scores": detail_scores.tolist()
352
+ }
353
+ else:
354
+ scores["radcliq-v1"] = mean_scores
355
+
356
+ if self.do_temporal:
357
+ temporal_scores = self.F1Temporal(predictions=hyps, references=refs)
358
+ if self.do_details:
359
+ hyp_entities = [
360
+ sorted(list(group)) if group else []
361
+ for group in temporal_scores.get("prediction_entities", [])
362
+ ]
363
+ ref_entities = [
364
+ sorted(list(group)) if group else []
365
+ for group in temporal_scores.get("reference_entities", [])
366
+ ]
367
+ scores["temporal_f1"] = {
368
+ "f1-score": temporal_scores["f1"],
369
+ "hyps_entities": hyp_entities,
370
+ "refs_entities": ref_entities
371
+ }
372
+ else:
373
+ scores["temporal_f1"] = temporal_scores["f1"]
374
+
375
+ if self.do_radeval_bertsore:
376
+ radeval_bertsores = self.radeval_bertsore.score(refs=refs, hyps=hyps)
377
+ if self.do_details:
378
+ scores["radeval_bertsore"] = {
379
+ "f1-score": radeval_bertsores[0],
380
+ "sample_scores": radeval_bertsores[1].tolist()
381
+ }
382
+ else:
383
+ scores["radeval_bertsore"] = radeval_bertsores[0]
384
+
385
+ return scores
386
+
387
+
388
+ def main():
389
+ refs = [
390
+ "No acute cardiopulmonary process.",
391
+ "No radiographic findings to suggest pneumonia.",
392
+ "1.Status post median sternotomy for CABG with stable cardiac enlargement and calcification of the aorta consistent with atherosclerosis.Relatively lower lung volumes with no focal airspace consolidation appreciated.Crowding of the pulmonary vasculature with possible minimal perihilar edema, but no overt pulmonary edema.No pleural effusions or pneumothoraces.",
393
+ "1. Left PICC tip appears to terminate in the distal left brachiocephalic vein.2. Mild pulmonary vascular congestion.3. Interval improvement in aeration of the lung bases with residual streaky opacity likely reflective of atelectasis.Interval resolution of the left pleural effusion.",
394
+ "No definite acute cardiopulmonary process.Enlarged cardiac silhouette could be accentuated by patient's positioning.",
395
+ "Increased mild pulmonary edema and left basal atelectasis.",
396
+ ]
397
+
398
+ hyps = [
399
+ "No acute cardiopulmonary process.",
400
+ "No radiographic findings to suggest pneumonia.",
401
+ "Status post median sternotomy for CABG with stable cardiac enlargement and calcification of the aorta consistent with atherosclerosis.",
402
+ "Relatively lower lung volumes with no focal airspace consolidation appreciated.",
403
+ "Crowding of the pulmonary vasculature with possible minimal perihilar edema, but no overt pulmonary edema.",
404
+ "No pleural effusions or pneumothoraces.",
405
+ ]
406
+
407
+ evaluator = RadEval(do_radgraph=True,
408
+ do_green=False,
409
+ do_bleu=True,
410
+ do_rouge=True,
411
+ do_bertscore=True,
412
+ do_srr_bert=True,
413
+ do_chexbert=True,
414
+ do_temporal=True,
415
+ do_ratescore=True,
416
+ do_radcliq=True,
417
+ do_radeval_bertsore=True)
418
+
419
+ results = evaluator(refs=refs, hyps=hyps)
420
+ print(json.dumps(results, indent=4))
421
+
422
+
423
+ if __name__ == '__main__':
424
+ main()
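A minimal usage sketch for the evaluator above (the import path is hypothetical and depends on how the package is installed; the flags and output keys are taken from compute_scores):

from RadEval import RadEval

# Enable only the n-gram metrics; do_details=True switches each enabled metric
# from a single float to a nested dictionary of sub-scores.
evaluator = RadEval(do_bleu=True, do_rouge=True, do_details=True)
results = evaluator(refs=["No pleural effusion."], hyps=["No effusion is seen."])
# results["bleu"]  -> {"bleu_1": ..., "bleu_2": ..., "bleu_3": ..., "bleu_4": ...}
# results["rouge"] -> {"rouge1": {"mean_score": ..., "sample_scores": [...]}, ...}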
RadEval_banner.png ADDED

Git LFS Details

  • SHA256: 2ddd4245bbadd24dae93ac925134bd2aea5548b94b8b55c8c8a21c8fd0338709
  • Pointer size: 132 Bytes
  • Size of remote file: 1.18 MB
__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .RadEval import RadEval
2
+ from .utils import compare_systems
factual/RaTEScore/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .score import *
2
+ from .scorer import RaTEScore
factual/RaTEScore/score.py ADDED
@@ -0,0 +1,83 @@
1
+ import torch
2
+ import medspacy
3
+ nlp = medspacy.load(medspacy_enable=["medspacy_pyrush", "medspacy_context"])
4
+
5
+ from .utils import sentence_split, post_process
6
+
7
+ def run_ner(texts, idx2label, tokenizer, model, device, batch_size):
8
+
9
+ clean_text_list, is_start_list = sentence_split(texts)
10
+
11
+ predicted_labels = []
12
+
13
+ for i in range(0, len(clean_text_list), batch_size):
14
+ batch_text = clean_text_list[i:i+batch_size]
15
+
16
+ inputs = tokenizer(batch_text,
17
+ max_length=512,
18
+ padding=True,
19
+ truncation=True,
20
+ return_tensors="pt").to(device)
21
+
22
+ with torch.no_grad():
23
+ outputs = model(**inputs)
24
+
25
+ predicted_labels.extend(torch.argmax(outputs.logits, dim=2).tolist())
26
+
27
+ inputs = tokenizer(clean_text_list,
28
+ max_length=512,
29
+ padding=True,
30
+ truncation=True,
31
+ return_tensors="pt")
32
+
33
+ save_pairs = []
34
+
35
+ pad_token_id = tokenizer.pad_token_id
36
+
37
+ for i, is_start in enumerate(is_start_list):
38
+
39
+ predicted_entities = [idx2label[label] for label in predicted_labels[i]]
40
+
41
+ non_pad_mask = inputs["input_ids"][i] != pad_token_id
42
+ non_pad_length = non_pad_mask.sum().item()
43
+ non_pad_input_ids = inputs["input_ids"][i][:non_pad_length]
44
+
45
+ tokenized_text = tokenizer.convert_ids_to_tokens(non_pad_input_ids)
46
+
47
+ if is_start:
48
+ save_pair = post_process(tokenized_text, predicted_entities, tokenizer)
49
+ else:
50
+ save_pair = post_process(tokenized_text, predicted_entities, tokenizer)
51
+ save_pairs[-1].extend(save_pair)
52
+ continue
53
+
54
+ save_pairs.append(save_pair)
55
+
56
+ return save_pairs
57
+
58
+
59
+ def process_embedding(pair, eval_tokenizer, eval_model, device):
60
+ entities = [pair[0] for pair in pair]
61
+ types = [pair[1] for pair in pair]
62
+
63
+ if len(entities) == 0:
64
+ embeds_word = torch.tensor([])
65
+ else:
66
+ embeds_word = torch.tensor([]).to(device)
67
+
68
+ with torch.no_grad():
69
+ # tokenize the queries
70
+ encoded = eval_tokenizer(
71
+ entities,
72
+ truncation=True,
73
+ padding=True,
74
+ return_tensors='pt',
75
+ max_length=30,
76
+ ).to(device)
77
+
78
+ # encode the queries (use the [CLS] last hidden states as the representations)
79
+ embeds_word = torch.cat((embeds_word.to('cpu'),
80
+ eval_model(**encoded).last_hidden_state[:, 0, :].to('cpu')), dim=0)
81
+
82
+ return embeds_word, types
83
+
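For orientation, a sketch of the shape run_ner returns (the entity string and type label below are illustrative; the tokenizer/model wiring lives in scorer.py):

# run_ner(["Left pleural effusion."], idx2label, tokenizer, model, device, batch_size=1)
#   -> [[("left pleural effusion", "ABNORMALITY"), ...]]   # one list of (entity, type) pairs per report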
factual/RaTEScore/scorer.py ADDED
@@ -0,0 +1,146 @@
1
+ import torch
2
+ import json
3
+ import numpy as np
4
+ from transformers import AutoConfig, AutoModel, AutoTokenizer, AutoModelForTokenClassification
5
+ import pandas as pd
6
+ import os
7
+
8
+ from .score import run_ner, process_embedding
9
+ from .utils import compute
10
+
11
+
12
+ DEFAULT_MATRIX_LONG = {"abnormality_abnormality": 0.4276119164393705, "abnormality_anatomy": 0.6240929990607657, "abnormality_disease": 0.0034478181112993847, "abnormality_non-abnormality": 0.5431049700217344, "abnormality_non-disease": 0.27005425386213877, "anatomy_abnormality": 0.7487824274337533, "anatomy_anatomy": 0.2856134859160784, "anatomy_disease": 0.4592143222158069, "anatomy_non-abnormality": 0.02097055139911715, "anatomy_non-disease": 0.00013736314126696204, "disease_abnormality": 0.8396510075734789, "disease_anatomy": 0.9950209388542061, "disease_disease": 0.8460555030578727, "disease_non-abnormality": 0.9820689020512646, "disease_non-disease": 0.3789136708096537, "non-abnormality_abnormality": 0.16546764653692908, "non-abnormality_anatomy": 0.018670610691852826, "non-abnormality_disease": 0.719397354576018, "non-abnormality_non-abnormality": 0.0009357166071730684, "non-abnormality_non-disease": 0.0927333564267591, "non-disease_abnormality": 0.7759420231214385, "non-disease_anatomy": 0.1839139293714062, "non-disease_disease": 0.10073046076318157, "non-disease_non-abnormality": 0.03860183811876373, "non-disease_non-disease": 0.34065681486566446, "neg_weight":0.8716553966489615}
13
+ DEFAULT_MATRIX_SHORT = {"abnormality_abnormality": 0.4070293318365468, "abnormality_anatomy": 0.6952639610605605, "abnormality_disease": 0.28342529466226446, "abnormality_non-abnormality": 0.9479148658006686, "abnormality_non-disease": 0.23875064111146294, "anatomy_abnormality": 0.5829759950441763, "anatomy_anatomy": 0.7709590751917746, "anatomy_disease": 0.0006059634829551632, "anatomy_non-abnormality": 0.794672584951181, "anatomy_non-disease": 0.27982942400798977, "disease_abnormality": 0.8840397619834857, "disease_anatomy": 0.9637659445696822, "disease_disease": 0.19018958438059513, "disease_non-abnormality": 0.6962283914800402, "disease_non-disease": 0.943727057946997, "non-abnormality_abnormality": 0.1712744286898638, "non-abnormality_anatomy": 0.4485149671497294, "non-abnormality_disease": 0.00045065329822896076, "non-abnormality_non-abnormality": 0.0007887930317199857, "non-abnormality_non-disease": 0.8555432840895761, "non-disease_abnormality": 0.9555801066212176, "non-disease_anatomy": 0.13122106162635216, "non-disease_disease": 0.6072996585919443, "non-disease_non-abnormality": 0.05650711141169969, "non-disease_non-disease": 0.3214769399791204, "neg_weight":0.3611577852354489}
14
+
15
+
16
+ class RaTEScore:
17
+ def __init__(self,
18
+ bert_model="Angelakeke/RaTE-NER-Deberta",
19
+ eval_model='FremyCompany/BioLORD-2023-C',
20
+ batch_size=1,
21
+ use_gpu=True,
22
+ visualization_path=None,
23
+ affinity_matrix="long",
24
+ ):
25
+ """ RaTEScore is a novel, entity-aware metric to assess the quality of medical reports generated by AI models.
26
+ It emphasizes crucial medical entities such as diagnostic outcomes and anatomical details, and is robust
27
+ against complex medical synonyms and sensitive to negation expressions. The evaluations demonstrate that
28
+ RaTEScore aligns more closely with human preference than existing metrics.
29
+
30
+ Args:
31
+ bert_model (str, optional): Medical entity recognition module. Defaults to "Angelakeke/RaTE-NER-Deberta".
32
+ eval_model (str, optional): Synonym disambiguation encoding module. Defaults to 'FremyCompany/BioLORD-2023-C'.
33
+ batch_size (int, optional): Batch size to choose. Defaults to 1.
34
+ use_gpu (bool, optional): Whether to run on GPU. Defaults to True.
35
+ visualization_path (str, optional): Path for saving the visualized entity pairs as a JSON file. Defaults to None.
36
+ affinity_matrix (str, optional): Pre-searched type weight matrix; can be adjusted to reflect human rating bias.
37
+ Defaults to 'long'.
38
+
39
+ """
40
+
41
+ # if use_gpu
42
+ if use_gpu:
43
+ self.device = torch.device('cuda')
44
+ else:
45
+ self.device = torch.device('cpu')
46
+
47
+ # load the Medical entity recognition module
48
+ self.tokenizer = AutoTokenizer.from_pretrained(bert_model)
49
+ self.model = AutoModelForTokenClassification.from_pretrained(bert_model).eval().to(self.device)
50
+
51
+ # load the Synonym disambiguation module
52
+ self.eval_tokenizer = AutoTokenizer.from_pretrained(eval_model)
53
+ self.eval_model = AutoModel.from_pretrained(eval_model).eval().to(self.device)
54
+
55
+ # load the weight matrix
56
+ if isinstance(affinity_matrix, str):
57
+ # Choose the appropriate matrix based on the argument
58
+ if affinity_matrix.lower() == "long":
59
+ self.matrix_path = DEFAULT_MATRIX_LONG
60
+ elif affinity_matrix.lower() == "short":
61
+ self.matrix_path = DEFAULT_MATRIX_SHORT
62
+ else:
63
+ # Assume it's a file path
64
+ try:
65
+ with open(affinity_matrix, 'r') as f:
66
+ self.matrix_path = json.load(f)
67
+ except Exception as e:
68
+ raise ValueError(f"Failed to load affinity matrix from {affinity_matrix}: {e}")
69
+ else:
70
+ raise ValueError("affinity_matrix must be a string")
71
+
72
+ self.affinity_matrix = {(k.split('_')[0].upper(), k.split('_')[1].upper()):v for k,v in self.matrix_path.items()}
73
+
74
+ # load the label file
75
+ self.config = AutoConfig.from_pretrained(bert_model)
76
+ self.label2idx = self.config.label2id
77
+ self.idx2label = self.config.id2label
78
+
79
+ # save the input
80
+ self.batch_size = batch_size
81
+
82
+ if visualization_path:
83
+ self.visualization_path = visualization_path
84
+ if not os.path.exists(os.path.dirname(visualization_path)):
85
+ os.makedirs(os.path.dirname(visualization_path))
86
+ else:
87
+ self.visualization_path = None
88
+
89
+
90
+ def compute_score(self, candidate_list, reference_list):
91
+ '''Compute the RaTEScore for the candidate and reference reports.
92
+
93
+ Args:
94
+ candidate_list (list): list of candidate reports
95
+ reference_list (list): list of reference reports
96
+ '''
97
+
98
+ # check if candidate and reference are list
99
+ if not isinstance(candidate_list, list):
100
+ raise ValueError("candidate must be a list")
101
+ if not isinstance(reference_list, list):
102
+ raise ValueError("reference must be a list")
103
+
104
+ assert len(candidate_list) == len(reference_list), "candidate and reference must have the same length"
105
+
106
+ # check if candidate and reference are list of strings
107
+ if not all(isinstance(x, str) for x in candidate_list):
108
+ raise ValueError("candidate must be a list of strings")
109
+
110
+ gt_pairs = run_ner(reference_list, self.idx2label, self.tokenizer, self.model, self.device, self.batch_size)
111
+ pred_pairs = run_ner(candidate_list, self.idx2label, self.tokenizer, self.model, self.device, self.batch_size)
112
+
113
+ rate_score = []
114
+
115
+ for gt_pair, pred_pair in zip(gt_pairs, pred_pairs):
116
+
117
+ # process the embedding for gt
118
+ gt_embeds_word, gt_types = process_embedding(gt_pair, self.eval_tokenizer, self.eval_model, self.device)
119
+
120
+ # process the embedding for pred
121
+ pred_embeds_word, pred_types = process_embedding(pred_pair, self.eval_tokenizer, self.eval_model, self.device)
122
+
123
+ # compute the score, if the length of gt or pred is 0, the score is 0.5
124
+ if len(gt_embeds_word) == 0 or len(pred_embeds_word) == 0:
125
+ rate_score.append(0.5)
126
+ continue
127
+
128
+ precision_score = compute(gt_embeds_word, pred_embeds_word, gt_types, pred_types, self.affinity_matrix)
129
+ recall_score = compute(pred_embeds_word, gt_embeds_word, pred_types, gt_types, self.affinity_matrix)
130
+
131
+ if precision_score + recall_score == 0:
132
+ rate_score.append(0)
133
+ else:
134
+ rate_score.append(2*precision_score*recall_score/(precision_score+recall_score))
135
+
136
+ if self.visualization_path:
137
+ save_file = pd.DataFrame({
138
+ 'candidate': candidate_list,
139
+ 'reference': reference_list,
140
+ 'candidate_entities': pred_pairs,
141
+ 'reference_entities': gt_pairs,
142
+ 'rate_score': rate_score
143
+ })
144
+ save_file.to_json(os.path.join(self.visualization_path, 'rate_score.json'), lines=True, orient='records')
145
+
146
+ return rate_score, pred_pairs ,gt_pairs
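A minimal usage sketch for the scorer above (module path as laid out in this commit; the report strings are illustrative):

from factual.RaTEScore import RaTEScore

scorer = RaTEScore(batch_size=4, use_gpu=False)
scores, pred_pairs, gt_pairs = scorer.compute_score(
    candidate_list=["No pneumothorax detected."],
    reference_list=["No evidence of pneumothorax following chest tube removal."],
)
print(scores)  # one RaTEScore value per report pair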
factual/RaTEScore/utils.py ADDED
@@ -0,0 +1,143 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import medspacy
4
+ nlp = medspacy.load(medspacy_enable=["medspacy_pyrush", "medspacy_context"])
5
+
6
+ def sentence_split(text_list):
7
+ """
8
+ split sentences by medspacy
9
+ """
10
+ clean_text_list = []
11
+ is_start_list = []
12
+
13
+ for text in text_list:
14
+
15
+ doc = nlp(text)
16
+
17
+ is_start = 1
18
+
19
+ for sent in doc.sents:
20
+ sent = str(sent).strip()
21
+ # # check if the sentence has no words
22
+ if len(sent.split()) == 0:
23
+ continue
24
+ if len(sent) < 3:
25
+ continue
26
+ is_start_list.append(is_start)
27
+ clean_text_list.append(sent)
28
+ is_start = 0
29
+
30
+ return clean_text_list, is_start_list
31
+
32
+ def post_process(tokenized_text, predicted_entities, tokenizer):
33
+ entity_spans = []
34
+ start = end = None
35
+ entity_type = None
36
+
37
+ for i, (token, label) in enumerate(zip(tokenized_text, predicted_entities[:len(tokenized_text)])):
38
+ if token in ["[CLS]", "[SEP]"]:
39
+ continue
40
+ if label != "O" and i < len(predicted_entities) - 1:
41
+ if label.startswith("B-") and predicted_entities[i+1].startswith("I-"):
42
+ start = i
43
+ entity_type = label[2:]
44
+ elif label.startswith("B-") and predicted_entities[i+1].startswith("B-"):
45
+ start = i
46
+ end = i
47
+ entity_spans.append((start, end, label[2:]))
48
+ start = i
49
+ entity_type = label[2:]
50
+ elif label.startswith("B-") and predicted_entities[i+1].startswith("O"):
51
+ start = i
52
+ end = i
53
+ entity_spans.append((start, end, label[2:]))
54
+ start = end = None
55
+ entity_type = None
56
+ elif label.startswith("I-") and predicted_entities[i+1].startswith("B-"):
57
+ end = i
58
+ if start is not None:
59
+ entity_spans.append((start, end, entity_type))
60
+ start = i
61
+ entity_type = label[2:]
62
+ elif label.startswith("I-") and predicted_entities[i+1].startswith("O"):
63
+ end = i
64
+ if start is not None:
65
+ entity_spans.append((start, end, entity_type))
66
+ start = end = None
67
+ entity_type = None
68
+
69
+ # Handle the last entity if a span is still open at the end
70
+ if start is not None and end is None:
71
+ end = len(tokenized_text) - 2
72
+ entity_spans.append((start, end, entity_type))
73
+
74
+ # Collect the extracted (entity, type) pairs
75
+ save_pair = []
76
+ for start, end, entity_type in entity_spans:
77
+ entity_str = tokenizer.convert_tokens_to_string(tokenized_text[start:end+1])
78
+ # print(f"Entity: {entity_str}, Type: {entity_type}")
79
+ save_pair.append((entity_str, entity_type))
80
+
81
+ return save_pair
82
+
83
+
84
+ def topk_similarity(embeddings1, embeddings2, k=1):
85
+ """
86
+ Compute the top-k similarity between two sets of embeddings using PyTorch.
87
+ """
88
+
89
+ ### Normalize the embeddings to use cosine similarity
90
+ embeddings1 = F.normalize(embeddings1, p=2, dim=1)
91
+ embeddings2 = F.normalize(embeddings2, p=2, dim=1)
92
+
93
+ topk_values = []
94
+ topk_indices = []
95
+
96
+ ### Iterate over each embedding in the first set
97
+ for emb1 in embeddings1:
98
+
99
+ ### Calculate cosine similarity between this embedding and all embeddings in the second set
100
+ similarities = torch.matmul(embeddings2, emb1)
101
+
102
+ ### Find the top-k highest similarity values
103
+ values, indices = torch.topk(similarities, k, largest=True)
104
+
105
+ topk_values.append(values[0])
106
+ topk_indices.append(indices[0])
107
+
108
+ return topk_indices, topk_values
109
+
110
+ def compute(gt_embeds_word, pred_embeds_word, gt_types, pred_types, weight_matrix):
111
+ neg_class = [('NON-DISEASE', 'DISEASE'),
112
+ ('NON-ABNORMALITY', 'ABNORMALITY'),
113
+ ('DISEASE', 'NON-DISEASE'),
114
+ ('ABNORMALITY', 'NON-ABNORMALITY'),
115
+ ('NON-DISEASE', 'ABNORMALITY'),
116
+ ('NON-ABNORMALITY', 'DISEASE'),
117
+ ('DISEASE', 'NON-ABNORMALITY'),
118
+ ('ABNORMALITY', 'NON-DISEASE'),]
119
+ neg_weight = weight_matrix[("NEG", "WEIGHT")]
120
+ topk_indices, topk_values = topk_similarity(gt_embeds_word, pred_embeds_word, k=1)
121
+
122
+
123
+ for i in range(len(topk_indices)):
124
+ topk_indices[i] = topk_indices[i].cpu().numpy().tolist()
125
+ topk_values[i] = topk_values[i].cpu().numpy().tolist()
126
+
127
+ # map the indices to type
128
+ topk_map = [pred_types[i] for i in topk_indices]
129
+
130
+ weight_score = [weight_matrix[(gt_type, pred_type)] for gt_type, pred_type in zip(gt_types, topk_map)]
131
+ type_score = [neg_weight if (gt_type, pred_type) in neg_class else 1 for gt_type, pred_type in zip(gt_types, topk_map)]
132
+
133
+ weighted_avg_score = 0
134
+ weighted_sum = 0
135
+ for score, weight, type in zip(topk_values, weight_score, type_score):
136
+ weighted_avg_score += score*weight*type
137
+ weighted_sum += weight
138
+ if weighted_sum != 0:
139
+ RaTE = weighted_avg_score/weighted_sum
140
+ else:
141
+ RaTE = 0
142
+
143
+ return RaTE
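A toy illustration of the top-1 cosine matching used by compute (the vectors are made up; topk_similarity is defined above):

import torch
from factual.RaTEScore.utils import topk_similarity

a = torch.tensor([[1.0, 0.0], [0.0, 1.0]])
b = torch.tensor([[0.8, 0.6], [0.0, 1.0]])
indices, values = topk_similarity(a, b, k=1)
# indices -> nearest row of b for each row of a; values -> its cosine similarity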
factual/RadCliQv1/radcliq.py ADDED
@@ -0,0 +1,213 @@
1
+ import numpy as np
2
+ import torch
3
+ from nlg.bertscore.bertscore import BertScore
4
+ from radgraph import RadGraph
5
+ from factual.f1chexbert import F1CheXbert
6
+ from sklearn.preprocessing import StandardScaler
7
+ from nlg.bleu.bleu import Bleu
8
+
9
+
10
+ def radcliq_bertscore(refs, hyps, model_type='distilroberta-base'):
11
+ """
12
+ Computes BERTScore for each pair of reference and hypothesis.
13
+
14
+ Returns:
15
+ np.ndarray of shape (N,) with the BERTScore F1 values per pair.
16
+ """
17
+ # https://github.com/rajpurkarlab/CXR-Report-Metric/blob/9c9ecad39be6cb2be8e75be1d1c50ef8888a3e40/CXRMetric/run_eval.py#L103
18
+ scorer = BertScore(
19
+ model_type=model_type,
20
+ rescale_with_baseline=True,
21
+ idf=False,
22
+ num_layers=None
23
+ )
24
+ _, scores = scorer(refs, hyps)
25
+ # scores is a list of torch.Tensor, convert to numpy
26
+ return np.array([float(s) for s in scores])
27
+
28
+
29
+ def compute_f1(test_set, retrieved_set):
30
+ """Helper to compute F1 between two sets of items."""
31
+ tp = len(test_set & retrieved_set)
32
+ fp = len(retrieved_set) - tp
33
+ fn = len(test_set) - tp
34
+ precision = tp / (tp + fp) if (tp + fp) else 0.0
35
+ recall = tp / (tp + fn) if (tp + fn) else 0.0
36
+ return 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
37
+
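# Worked example (hypothetical sets): test={"a","b","c"}, retrieved={"b","c","d"}
#   tp=2, fp=1, fn=1 -> precision=2/3, recall=2/3 -> F1=2/3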
38
+
39
+ def extract_entities(output):
40
+ """Extracts set of (tokens, label) tuples from RadGraph output."""
41
+ return {(tuple(ent["tokens"]), ent["label"]) for ent in output.get("entities", {}).values()}
42
+
43
+
44
+ def extract_relations(output):
45
+ """Extracts set of (src, tgt, relation) tuples from RadGraph output."""
46
+ rels = set()
47
+ entities = output.get("entities", {})
48
+ for ent in entities.values():
49
+ src = (tuple(ent["tokens"]), ent["label"])
50
+ for rel_type, tgt_idx in ent.get("relations", []):
51
+ tgt_ent = entities.get(tgt_idx)
52
+ if tgt_ent:
53
+ tgt = (tuple(tgt_ent["tokens"]), tgt_ent["label"])
54
+ rels.add((src, tgt, rel_type))
55
+ return rels
56
+
57
+
58
+ def radcliq_radgraph_scores(refs, hyps, model_name='radgraph'):
59
+ """
60
+ Computes entity and relation F1 via RadGraph for each report pair and returns their average.
61
+
62
+ Returns:
63
+ np.ndarray of shape (N,) with (entity_f1 + relation_f1)/2 per pair.
64
+ """
65
+ rad = RadGraph(model_type=model_name)
66
+ gt_outputs = rad(refs)
67
+ pred_outputs = rad(hyps)
68
+ scores = []
69
+ for i in range(len(refs)):
70
+ gt_out = gt_outputs.get(str(i), {})
71
+ pred_out = pred_outputs.get(str(i), {})
72
+
73
+ ents_gt = extract_entities(gt_out)
74
+ ents_pred = extract_entities(pred_out)
75
+ rels_gt = extract_relations(gt_out)
76
+ rels_pred = extract_relations(pred_out)
77
+
78
+ ent_f1 = compute_f1(ents_gt, ents_pred)
79
+ rel_f1 = compute_f1(rels_gt, rels_pred)
80
+ scores.append((ent_f1 + rel_f1) / 2)
81
+ return np.array(scores)
82
+
83
+
84
+ def semantic_embedding_scores(refs, hyps, device='cpu'):
85
+ """
86
+ Computes per-pair cosine similarity between embeddings from CheXbert labeler.
87
+
88
+ Returns:
89
+ np.ndarray of shape (N,) with cosine similarities per pair.
90
+ """
91
+ if len(refs) != len(hyps):
92
+ raise ValueError(f"refs ({len(refs)}) and hyps ({len(hyps)}) must be same length")
93
+ labeler = F1CheXbert(device=device)
94
+ gt_embs = np.vstack(labeler.get_embeddings(refs))
95
+ pred_embs = np.vstack(labeler.get_embeddings(hyps))
96
+ # https://github.com/rajpurkarlab/CXR-Report-Metric/blob/9c9ecad39be6cb2be8e75be1d1c50ef8888a3e40/CXRMetric/run_eval.py#L126
97
+ dot = np.einsum("nd,nd->n", gt_embs, pred_embs)
98
+ norms = np.linalg.norm(gt_embs, axis=1) * np.linalg.norm(pred_embs, axis=1)
99
+ with np.errstate(divide='ignore', invalid='ignore'):
100
+ sims = np.where(norms > 0, dot / norms, 0.0)
101
+ return sims
102
+
103
+
104
+ def radcliq_scores(refs, hyps,
105
+ bert_model='distilroberta-base',
106
+ radgraph_model='radgraph'):
107
+ """
108
+ Computes BERTScore, RadGraph score, semantic embedding similarity, and BLEU for each ref-hyp pair.
109
+
110
+ Args:
111
+ refs: List of reference report strings.
112
+ hyps: List of hypothesis report strings.
113
+ (the device for the embedding model is selected automatically: 'cuda' if available, else 'cpu')
114
+ bert_model: HuggingFace model name for BERTScore.
115
+ radgraph_model: Model name for RadGraph inference.
116
+
117
+ Returns:
118
+ Dict with keys 'bertscore', 'radgraph', 'semb_score', 'bleu_score', each mapping to a numpy array of shape (N,).
119
+ """
120
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
121
+ # BERTScore
122
+ bert_scores = radcliq_bertscore(refs, hyps, model_type=bert_model)
123
+ # RadGraph
124
+ rad_scores = radcliq_radgraph_scores(refs, hyps, model_name=radgraph_model)
125
+ # Semantic embeddings
126
+ sem_scores = semantic_embedding_scores(refs, hyps, device=device)
127
+
128
+ # BLEU
129
+ bleu_scorer = Bleu()
130
+ bleu_scores = bleu_scorer(refs, hyps)[1]
131
+
132
+ return {
133
+ 'bertscore': bert_scores,
134
+ 'radgraph': rad_scores,
135
+ 'semb_score': sem_scores,
136
+ 'bleu_score': bleu_scores
137
+ }
138
+
139
+
140
+
141
+ class CompositeMetric:
142
+ def __init__(self):
143
+ scaler = StandardScaler(with_mean=True, with_std=True)
144
+ # learnt parameters, inferred from
145
+ # https://github.com/rajpurkarlab/CXR-Report-Metric/blob/main/CXRMetric/run_eval.py#L219
146
+ scaler.mean_ = np.array([0.53792312, 0.61757256, 0.76479421, 0.44738335])
147
+ scaler.scale_ = np.array([0.30282584, 0.22430938, 0.25394391, 0.29892717])
148
+ scaler.var_ = np.array([0.09170349, 0.05031470, 0.06448751, 0.08935745])
149
+ scaler.n_samples_seen_ = 160 # integer
150
+ scaler.n_features_in_ = 4 # integer
151
+
152
+ self.scaler = scaler
153
+ self.coefs = np.array([
154
+ -3.77083683e-01, # radgraph weight
155
+ -3.70300100e-01, # bertscore weight
156
+ -2.52616218e-01, # s-emb weight
157
+ 4.31504841e-12, # bleu weight
158
+ 2.46655256e-10 # intercept / bias
159
+ ])
160
+ self.cols = ["radgraph", "bertscore", "semb_score", "bleu_score"]
161
+
162
+ def predict(self, X):
163
+ Xn = self.scaler.transform(X)
164
+ Xn = np.hstack([Xn, np.ones((Xn.shape[0], 1))])
165
+ return Xn @ self.coefs
166
+
167
+ def _build_matrix(self, metrics: dict[str, np.ndarray]) -> np.ndarray:
168
+ """Stack features in the canonical column order."""
169
+ return np.column_stack([metrics[c] for c in self.cols])
170
+
171
+ def predict(self, refs, hyps) -> np.ndarray:
172
+ """
173
+ Args
174
+ ----
175
+ metrics : dict returned by `radcliq_scores`
176
+
177
+ Returns
178
+ -------
179
+ np.ndarray of shape (N,) – RadCliQ-v1 score for each ref/hyp pair.
180
+ """
181
+ metrics = radcliq_scores(refs, hyps)
182
+
183
+ X = self._build_matrix(metrics)
184
+
185
+ Xn = self.scaler.transform(X)
186
+
187
+ # Append bias term
188
+ Xn = np.hstack([Xn, np.ones((Xn.shape[0], 1))])
189
+ scores = Xn @ self.coefs
190
+
191
+ return 1/scores.mean(), scores
192
+
193
+ if __name__ == "__main__":
194
+ refs = [
195
+ "No evidence of pneumothorax following chest tube removal.",
196
+ "There is a left pleural effusion.",
197
+ "There is a left pleural effusion."
198
+ ]
199
+ hyps = [
200
+ "No pneumothorax detected.",
201
+ "Left pleural effusion is present.",
202
+ "No pneumothorax detected.",
203
+ ]
204
+
205
+ # The four underlying metrics are computed inside CompositeMetric.predict
206
+
207
+ # Get the RadCliQ-v1 composite
208
+ radcliq = CompositeMetric()
209
+ mean_scores, detail_scores = radcliq.predict(refs, hyps)
210
+ for i, s in enumerate(detail_scores, 1):
211
+ print(f"Pair {i}: RadCliQ-v1 = {s:.4f}")
212
+
213
+ print(f"RadCliQ-v1 score: {mean_scores:.4f}")
factual/RadCliQv1/radcliq_bertscore.py ADDED
@@ -0,0 +1,10 @@
1
+ from nlg.bertscore.bertscore import BertScore
2
+
3
+ def radcliq_bertscore(refs, hyps):
4
+ bertscore_scorer = BertScore(model_type='distilroberta-base',
5
+ rescale_with_baseline=True,
6
+ idf=False,
7
+ num_layers=None)
8
+ print(bertscore_scorer)
9
+ avg, scores = bertscore_scorer(refs, hyps)
10
+ return scores
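A one-line usage sketch for the helper above (the report strings are illustrative):

scores = radcliq_bertscore(refs=["No evidence of pneumothorax."], hyps=["No pneumothorax detected."])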
factual/RadCliQv1/radcliq_radgraph.py ADDED
@@ -0,0 +1,80 @@
1
+ import json
2
+ from radgraph import RadGraph
3
+
4
+
5
+ def compute_f1(test, retrieved):
6
+ """Computes F1 between test/retrieved report's entities or relations."""
7
+ tp = len(test & retrieved)
8
+ fp = len(retrieved) - tp
9
+ fn = len(test) - tp
10
+ precision = tp / (tp + fp) if (tp + fp) else 0
11
+ recall = tp / (tp + fn) if (tp + fn) else 0
12
+ return 2 * precision * recall / (precision + recall) if (precision + recall) else 0
13
+
14
+
15
+ def extract_entities(output):
16
+ """Extracts set of (tokens, label) from a RadGraph output dict."""
17
+ return {(tuple(ent["tokens"]), ent["label"]) for ent in output.get("entities", {}).values()}
18
+
19
+
20
+ def extract_relations(output):
21
+ """Extracts set of (src, tgt, relation) from a RadGraph output dict."""
22
+ rels = set()
23
+ entities = output.get("entities", {})
24
+ for ent in entities.values():
25
+ src = (tuple(ent["tokens"]), ent["label"])
26
+ for rel_type, tgt_idx in ent.get("relations", []):
27
+ tgt_ent = entities.get(tgt_idx)
28
+ if tgt_ent:
29
+ tgt = (tuple(tgt_ent["tokens"]), tgt_ent["label"])
30
+ rels.add((src, tgt, rel_type))
31
+ return rels
32
+
33
+
34
+ def compute_radgraph_scores(refs, hyps, model_name='radgraph'):
35
+ """
36
+ Computes combined RadGraph F1 scores for each pair of reference and hypothesis reports.
37
+ Returns:
38
+ List of floats: (entity_f1 + relation_f1)/2 per report.
39
+ """
40
+ # Initialize RadGraph model
41
+ rad = RadGraph(model_type=model_name)
42
+
43
+ # Perform inference
44
+ gt_outputs = rad(refs)
45
+ pred_outputs = rad(hyps)
46
+
47
+ scores = []
48
+ for i in range(len(gt_outputs)):
49
+ gt_out = gt_outputs[str(i)]
50
+ pred_out = pred_outputs[str(i)]
51
+
52
+ gt_ents = extract_entities(gt_out)
53
+ pred_ents = extract_entities(pred_out)
54
+ gt_rels = extract_relations(gt_out)
55
+ pred_rels = extract_relations(pred_out)
56
+
57
+ ent_f1 = compute_f1(gt_ents, pred_ents)
58
+ rel_f1 = compute_f1(gt_rels, pred_rels)
59
+ scores.append((ent_f1 + rel_f1) / 2)
60
+
61
+ return scores
62
+
63
+
64
+ if __name__ == '__main__':
65
+ # Example usage
66
+ refs = [
67
+ "No evidence of pneumothorax following chest tube removal.",
68
+ "There is a left pleural effusion."
69
+ ]
70
+ hyps = [
71
+ "No pneumothorax detected.",
72
+ "Left pleural effusion is present."
73
+ ]
74
+
75
+ combined_scores = compute_radgraph_scores(refs, hyps)
76
+ print(combined_scores) # e.g., [1.0, 1.0]
77
+ from radgraph import F1RadGraph
78
+ f1_radgraph = F1RadGraph(model_type="radgraph", reward_level="simple")
79
+ f1_scores = f1_radgraph(refs, hyps,)
80
+ print(f1_scores)
factual/RadCliQv1/semb_score.py ADDED
@@ -0,0 +1,74 @@
1
+ from typing import Sequence, Union
2
+
3
+ import numpy as np
4
+ import torch
5
+
6
+ from factual.f1chexbert import F1CheXbert
7
+
8
+
9
+ def semantic_embedding_scores(
10
+ refs: Sequence[str],
11
+ hyps: Sequence[str],
12
+ *,
13
+ device: Union[str, torch.device] = "cpu",
14
+ ) -> np.ndarray:
15
+ """Return per‑pair cosine similarities between `refs` and `hyps`.
16
+
17
+ All heavy math is vectorised; no Python loops.
18
+
19
+ Args:
20
+ refs: Iterable of ground‑truth report strings.
21
+ hyps: Iterable of predicted report strings (must match `refs` length).
22
+ device: Computation device (e.g. "cpu", "cuda", "cuda:0").
23
+
24
+ Returns
25
+ -------
26
+ np.ndarray
27
+ Shape ``(N,)`` – cosine similarity for each pair, where
28
+ ``N == len(refs) == len(hyps)``.
29
+
30
+ Raises
31
+ ------
32
+ ValueError
33
+ If `refs` and `hyps` are of different lengths.
34
+ """
35
+
36
+ if len(refs) != len(hyps):
37
+ raise ValueError(f"refs ({len(refs)}) and hyps ({len(hyps)}) differ in length")
38
+
39
+ labeler = F1CheXbert(device=device)
40
+
41
+ # Stack embeddings into (N, dim) matrices
42
+ gt_embeds = np.vstack(labeler.get_embeddings(refs)) # (N, dim)
43
+ pred_embeds = np.vstack(labeler.get_embeddings(hyps)) # (N, dim)
44
+
45
+ # Cosine similarity – fully vectorised
46
+ dot = np.einsum("nd,nd->n", gt_embeds, pred_embeds)
47
+ norms = np.linalg.norm(gt_embeds, axis=1) * np.linalg.norm(pred_embeds, axis=1)
48
+ with np.errstate(divide="ignore", invalid="ignore"):
49
+ sims = np.where(norms > 0, dot / norms, 0.0)
50
+
51
+ return sims
52
+
53
+
54
+ def mean_semantic_score(scores: np.ndarray) -> float:
55
+ """Convenience helper: mean of an array of scores."""
56
+ return float(scores.mean())
57
+
58
+
59
+ if __name__ == "__main__":
60
+ _refs = [
61
+ "No evidence of pneumothorax following chest tube removal.",
62
+ "There is a left pleural effusion.",
63
+ "No evidence of pneumothorax following chest tube removal.",
64
+
65
+ ]
66
+ _hyps = [
67
+ "No pneumothorax detected.",
68
+ "Left pleural effusion is present.",
69
+ "Left pleural effusion is present.",
70
+ ]
71
+
72
+ _scores = semantic_embedding_scores(_refs, _hyps, device="cpu")
73
+ print("Per‑pair cosine:", _scores)
74
+ print("Mean:", mean_semantic_score(_scores))
factual/SRRBert/leaves_mapping.json ADDED
@@ -0,0 +1,58 @@
1
+ {
2
+ "No Finding": 0,
3
+ "Lung Lesion": 1,
4
+ "Edema": 2,
5
+ "Pneumonia": 3,
6
+ "Atelectasis": 4,
7
+ "Aspiration": 5,
8
+ "Lung collapse": 6,
9
+ "Perihilar airspace opacity": 7,
10
+ "Air space opacity\u2013multifocal": 8,
11
+ "Mass/Solitary lung mass": 9,
12
+ "Nodule/Solitary lung nodule": 10,
13
+ "Cavitating mass with content": 11,
14
+ "Cavitating masses": 12,
15
+ "Emphysema": 13,
16
+ "Fibrosis": 14,
17
+ "Pulmonary congestion": 15,
18
+ "Hilar lymphadenopathy": 16,
19
+ "Bronchiectasis": 17,
20
+ "Simple pneumothorax": 18,
21
+ "Loculated pneumothorax": 19,
22
+ "Tension pneumothorax": 20,
23
+ "Simple pleural effusion": 21,
24
+ "Loculated pleural effusion": 22,
25
+ "Pleural scarring": 23,
26
+ "Hydropneumothorax": 24,
27
+ "Pleural Other": 25,
28
+ "Cardiomegaly": 26,
29
+ "Pericardial effusion": 27,
30
+ "Inferior mediastinal mass": 28,
31
+ "Superior mediastinal mass": 29,
32
+ "Tortuous Aorta": 30,
33
+ "Calcification of the Aorta": 31,
34
+ "Enlarged pulmonary artery": 32,
35
+ "Hernia": 33,
36
+ "Pneumomediastinum": 34,
37
+ "Tracheal deviation": 35,
38
+ "Acute humerus fracture": 36,
39
+ "Acute rib fracture": 37,
40
+ "Acute clavicle fracture": 38,
41
+ "Acute scapula fracture": 39,
42
+ "Compression fracture": 40,
43
+ "Shoulder dislocation": 41,
44
+ "Subcutaneous Emphysema": 42,
45
+ "Suboptimal central line": 43,
46
+ "Suboptimal endotracheal tube": 44,
47
+ "Suboptimal nasogastric tube": 45,
48
+ "Suboptimal pulmonary arterial catheter": 46,
49
+ "Pleural tube": 47,
50
+ "PICC line": 48,
51
+ "Port catheter": 49,
52
+ "Pacemaker": 50,
53
+ "Implantable defibrillator": 51,
54
+ "LVAD": 52,
55
+ "Intraaortic balloon pump": 53,
56
+ "Pneumoperitoneum": 54
57
+ }
58
+
factual/SRRBert/leaves_with_statuses_mapping.json ADDED
@@ -0,0 +1,165 @@
1
+ {
2
+ "Lung Lesion (Present)": 0,
3
+ "Edema (Present)": 1,
4
+ "Pneumonia (Present)": 2,
5
+ "Atelectasis (Present)": 3,
6
+ "Aspiration (Present)": 4,
7
+ "Lung collapse (Present)": 5,
8
+ "Perihilar airspace opacity (Present)": 6,
9
+ "Air space opacity\u2013multifocal (Present)": 7,
10
+ "Mass/Solitary lung mass (Present)": 8,
11
+ "Nodule/Solitary lung nodule (Present)": 9,
12
+ "Cavitating mass with content (Present)": 10,
13
+ "Cavitating masses (Present)": 11,
14
+ "Emphysema (Present)": 12,
15
+ "Fibrosis (Present)": 13,
16
+ "Pulmonary congestion (Present)": 14,
17
+ "Hilar lymphadenopathy (Present)": 15,
18
+ "Bronchiectasis (Present)": 16,
19
+ "Simple pneumothorax (Present)": 17,
20
+ "Loculated pneumothorax (Present)": 18,
21
+ "Tension pneumothorax (Present)": 19,
22
+ "Simple pleural effusion (Present)": 20,
23
+ "Loculated pleural effusion (Present)": 21,
24
+ "Pleural scarring (Present)": 22,
25
+ "Hydropneumothorax (Present)": 23,
26
+ "Pleural Other (Present)": 24,
27
+ "Cardiomegaly (Present)": 25,
28
+ "Pericardial effusion (Present)": 26,
29
+ "Inferior mediastinal mass (Present)": 27,
30
+ "Superior mediastinal mass (Present)": 28,
31
+ "Tortuous Aorta (Present)": 29,
32
+ "Calcification of the Aorta (Present)": 30,
33
+ "Enlarged pulmonary artery (Present)": 31,
34
+ "Hernia (Present)": 32,
35
+ "Pneumomediastinum (Present)": 33,
36
+ "Tracheal deviation (Present)": 34,
37
+ "Acute humerus fracture (Present)": 35,
38
+ "Acute rib fracture (Present)": 36,
39
+ "Acute clavicle fracture (Present)": 37,
40
+ "Acute scapula fracture (Present)": 38,
41
+ "Compression fracture (Present)": 39,
42
+ "Shoulder dislocation (Present)": 40,
43
+ "Subcutaneous Emphysema (Present)": 41,
44
+ "Suboptimal central line (Present)": 42,
45
+ "Suboptimal endotracheal tube (Present)": 43,
46
+ "Suboptimal nasogastric tube (Present)": 44,
47
+ "Suboptimal pulmonary arterial catheter (Present)": 45,
48
+ "Pleural tube (Present)": 46,
49
+ "PICC line (Present)": 47,
50
+ "Port catheter (Present)": 48,
51
+ "Pacemaker (Present)": 49,
52
+ "Implantable defibrillator (Present)": 50,
53
+ "LVAD (Present)": 51,
54
+ "Intraaortic balloon pump (Present)": 52,
55
+ "Pneumoperitoneum (Present)": 53,
56
+ "Lung Lesion (Uncertain)": 54,
57
+ "Edema (Uncertain)": 55,
58
+ "Pneumonia (Uncertain)": 56,
59
+ "Atelectasis (Uncertain)": 57,
60
+ "Aspiration (Uncertain)": 58,
61
+ "Lung collapse (Uncertain)": 59,
62
+ "Perihilar airspace opacity (Uncertain)": 60,
63
+ "Air space opacity\u2013multifocal (Uncertain)": 61,
64
+ "Mass/Solitary lung mass (Uncertain)": 62,
65
+ "Nodule/Solitary lung nodule (Uncertain)": 63,
66
+ "Cavitating mass with content (Uncertain)": 64,
67
+ "Cavitating masses (Uncertain)": 65,
68
+ "Emphysema (Uncertain)": 66,
69
+ "Fibrosis (Uncertain)": 67,
70
+ "Pulmonary congestion (Uncertain)": 68,
71
+ "Hilar lymphadenopathy (Uncertain)": 69,
72
+ "Bronchiectasis (Uncertain)": 70,
73
+ "Simple pneumothorax (Uncertain)": 71,
74
+ "Loculated pneumothorax (Uncertain)": 72,
75
+ "Tension pneumothorax (Uncertain)": 73,
76
+ "Simple pleural effusion (Uncertain)": 74,
77
+ "Loculated pleural effusion (Uncertain)": 75,
78
+ "Pleural scarring (Uncertain)": 76,
79
+ "Hydropneumothorax (Uncertain)": 77,
80
+ "Pleural Other (Uncertain)": 78,
81
+ "Cardiomegaly (Uncertain)": 79,
82
+ "Pericardial effusion (Uncertain)": 80,
83
+ "Inferior mediastinal mass (Uncertain)": 81,
84
+ "Superior mediastinal mass (Uncertain)": 82,
85
+ "Tortuous Aorta (Uncertain)": 83,
86
+ "Calcification of the Aorta (Uncertain)": 84,
87
+ "Enlarged pulmonary artery (Uncertain)": 85,
88
+ "Hernia (Uncertain)": 86,
89
+ "Pneumomediastinum (Uncertain)": 87,
90
+ "Tracheal deviation (Uncertain)": 88,
91
+ "Acute humerus fracture (Uncertain)": 89,
92
+ "Acute rib fracture (Uncertain)": 90,
93
+ "Acute clavicle fracture (Uncertain)": 91,
94
+ "Acute scapula fracture (Uncertain)": 92,
95
+ "Compression fracture (Uncertain)": 93,
96
+ "Shoulder dislocation (Uncertain)": 94,
97
+ "Subcutaneous Emphysema (Uncertain)": 95,
98
+ "Suboptimal central line (Uncertain)": 96,
99
+ "Suboptimal endotracheal tube (Uncertain)": 97,
100
+ "Suboptimal nasogastric tube (Uncertain)": 98,
101
+ "Suboptimal pulmonary arterial catheter (Uncertain)": 99,
102
+ "Pleural tube (Uncertain)": 100,
103
+ "PICC line (Uncertain)": 101,
104
+ "Port catheter (Uncertain)": 102,
105
+ "Pacemaker (Uncertain)": 103,
106
+ "Implantable defibrillator (Uncertain)": 104,
107
+ "LVAD (Uncertain)": 105,
108
+ "Intraaortic balloon pump (Uncertain)": 106,
109
+ "Pneumoperitoneum (Uncertain)": 107,
110
+ "Lung Lesion (Absent)": 108,
111
+ "Edema (Absent)": 109,
112
+ "Pneumonia (Absent)": 110,
113
+ "Atelectasis (Absent)": 111,
114
+ "Aspiration (Absent)": 112,
115
+ "Lung collapse (Absent)": 113,
116
+ "Perihilar airspace opacity (Absent)": 114,
117
+ "Air space opacity\u2013multifocal (Absent)": 115,
118
+ "Mass/Solitary lung mass (Absent)": 116,
119
+ "Nodule/Solitary lung nodule (Absent)": 117,
120
+ "Cavitating mass with content (Absent)": 118,
121
+ "Cavitating masses (Absent)": 119,
122
+ "Emphysema (Absent)": 120,
123
+ "Fibrosis (Absent)": 121,
124
+ "Pulmonary congestion (Absent)": 122,
125
+ "Hilar lymphadenopathy (Absent)": 123,
126
+ "Bronchiectasis (Absent)": 124,
127
+ "Simple pneumothorax (Absent)": 125,
128
+ "Loculated pneumothorax (Absent)": 126,
129
+ "Tension pneumothorax (Absent)": 127,
130
+ "Simple pleural effusion (Absent)": 128,
131
+ "Loculated pleural effusion (Absent)": 129,
132
+ "Pleural scarring (Absent)": 130,
133
+ "Hydropneumothorax (Absent)": 131,
134
+ "Pleural Other (Absent)": 132,
135
+ "Cardiomegaly (Absent)": 133,
136
+ "Pericardial effusion (Absent)": 134,
137
+ "Inferior mediastinal mass (Absent)": 135,
138
+ "Superior mediastinal mass (Absent)": 136,
139
+ "Tortuous Aorta (Absent)": 137,
140
+ "Calcification of the Aorta (Absent)": 138,
141
+ "Enlarged pulmonary artery (Absent)": 139,
142
+ "Hernia (Absent)": 140,
143
+ "Pneumomediastinum (Absent)": 141,
144
+ "Tracheal deviation (Absent)": 142,
145
+ "Acute humerus fracture (Absent)": 143,
146
+ "Acute rib fracture (Absent)": 144,
147
+ "Acute clavicle fracture (Absent)": 145,
148
+ "Acute scapula fracture (Absent)": 146,
149
+ "Compression fracture (Absent)": 147,
150
+ "Shoulder dislocation (Absent)": 148,
151
+ "Subcutaneous Emphysema (Absent)": 149,
152
+ "Suboptimal central line (Absent)": 150,
153
+ "Suboptimal endotracheal tube (Absent)": 151,
154
+ "Suboptimal nasogastric tube (Absent)": 152,
155
+ "Suboptimal pulmonary arterial catheter (Absent)": 153,
156
+ "Pleural tube (Absent)": 154,
157
+ "PICC line (Absent)": 155,
158
+ "Port catheter (Absent)": 156,
159
+ "Pacemaker (Absent)": 157,
160
+ "Implantable defibrillator (Absent)": 158,
161
+ "LVAD (Absent)": 159,
162
+ "Intraaortic balloon pump (Absent)": 160,
163
+ "Pneumoperitoneum (Absent)": 161,
164
+ "No Finding": 162
165
+ }
factual/SRRBert/srr_bert.py ADDED
@@ -0,0 +1,160 @@
1
+ import os
2
+ import json
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ from transformers import BertForSequenceClassification, BertTokenizer
7
+ from tqdm import tqdm
8
+ import re
9
+ import nltk
10
+
11
+
12
+ def srr_bert_parse_sentences(text):
13
+ # Handle numbers followed by a dot, not followed by a digit (to avoid decimals like 3.5)
14
+
15
+ # Case 1: Number at beginning of text
16
+ text = re.sub(r'^\s*\d+\.(?!\d)\s*', '', text)
17
+
18
+ # Case 2: Number after a period, like "word.2."
19
+ text = re.sub(r'(\w)\.(\d+)\.(?!\d)\s*', r'\1. ', text)
20
+
21
+ # Case 3: Number attached to a word, like "word2."
22
+ text = re.sub(r'(\w)(\d+)\.(?!\d)\s*', r'\1. ', text)
23
+
24
+ # Case 4: Number after space following a word, like "word 2."
25
+ text = re.sub(r'(\w)\s+\d+\.(?!\d)\s*', r'\1. ', text)
26
+
27
+ # Case 5: Standalone number in the middle, like ". 2. word"
28
+ text = re.sub(r'([.!?])\s*\d+\.(?!\d)\s*', r'\1 ', text)
29
+
30
+ # Add space after periods followed immediately by uppercase letter (new sentence without space)
31
+ text = re.sub(r'\.([A-Z])', r'. \1', text)
32
+
33
+ # Make sure the text ends with a period
34
+ if not text.strip().endswith(('.', '!', '?')):
35
+ text = text.strip() + '.'
36
+
37
+ # Tokenize into sentences
38
+ sentences = nltk.sent_tokenize(text)
39
+
40
+ return sentences
41
+
42
+
43
+ class SRRBert(nn.Module):
44
+ # Supported model types and their configs
45
+ MODEL_CONFIGS = {
46
+ "leaves": {
47
+ "model_path": "StanfordAIMI/SRR-BERT-Leaves",
48
+ "mapping_file": "leaves_mapping.json"
49
+ },
50
+ "upper": {
51
+ "model_path": "StanfordAIMI/SRR-BERT-Upper",
52
+ "mapping_file": "upper_mapping.json"
53
+ },
54
+ "leaves_with_statuses": {
55
+ "model_path": "StanfordAIMI/SRR-BERT-Leaves-with-Statuses",
56
+ "mapping_file": "leaves_with_statuses_mapping.json"
57
+ },
58
+ "upper_with_statuses": {
59
+ "model_path": "StanfordAIMI/SRRG-BERT-Upper-with-Statuses",
60
+ "mapping_file": "upper_with_statuses_mapping.json"
61
+ },
62
+ }
63
+
64
+ def __init__(
65
+ self,
66
+ model_type: str = "leaves",
67
+ batch_size: int = 4,
68
+ tqdm_enable: bool = False
69
+ ):
70
+ super().__init__()
71
+ if model_type not in self.MODEL_CONFIGS:
72
+ raise ValueError(
73
+ f"model_type must be one of {list(self.MODEL_CONFIGS.keys())}"
74
+ )
75
+ config = self.MODEL_CONFIGS[model_type]
76
+
77
+ # Load mapping
78
+ mapping_path = os.path.join(
79
+ os.path.dirname(__file__),
80
+ config["mapping_file"]
81
+ )
82
+ with open(mapping_path, 'r') as f:
83
+ self.mapping = json.load(f)
84
+
85
+ # Device setup
86
+ self.device = torch.device(
87
+ 'cuda' if torch.cuda.is_available() else 'cpu'
88
+ )
89
+
90
+ # Load model
91
+ self.model = BertForSequenceClassification.from_pretrained(
92
+ config["model_path"],
93
+ num_labels=len(self.mapping)
94
+ )
95
+ self.model.to(self.device)
96
+ self.model.eval()
97
+
98
+ # Tokenizer
99
+ self.tokenizer = BertTokenizer.from_pretrained(
100
+ "microsoft/BiomedVLP-CXR-BERT-general"
101
+ )
102
+
103
+ # Settings
104
+ self.batch_size = batch_size
105
+ self.tqdm_enable = tqdm_enable
106
+
107
+ def map_predictions_to_labels(self, outputs):
108
+ inverted_mapping = {v: k for k, v in self.mapping.items()}
109
+ all_labels = []
110
+ for output in outputs:
111
+ labels = [inverted_mapping[i] for i, flag in enumerate(output) if flag == 1]
112
+ all_labels.append(labels)
113
+ return all_labels
114
+
115
+ def forward(self, sentences):
116
+ # Batch sentences
117
+ batches = [
118
+ sentences[i:i + self.batch_size]
119
+ for i in range(0, len(sentences), self.batch_size)
120
+ ]
121
+ outputs = []
122
+ with torch.no_grad():
123
+ for batch in tqdm(
124
+ batches, desc="Predicting", disable=not self.tqdm_enable
125
+ ):
126
+ inputs = self.tokenizer.batch_encode_plus(
127
+ batch,
128
+ add_special_tokens=True,
129
+ max_length=512,
130
+ padding="max_length",
131
+ truncation=True,
132
+ return_attention_mask=True,
133
+ return_tensors="pt",
134
+ )
135
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
136
+ logits = self.model(**inputs).logits
137
+ preds = (torch.sigmoid(logits) > 0.5).cpu().numpy().astype(int)
138
+ outputs.append(preds)
139
+
140
+ outputs = np.concatenate(outputs, axis=0)
141
+ return outputs, self.map_predictions_to_labels(outputs)
142
+
143
+
144
+ if __name__ == "__main__":
145
+ example_sentences = [
146
+ "Layering pleural effusions",
147
+ "Moderate pulmonary edema.",
148
+ "Chronic fracture and dislocation involving the left humeral surgical neck and glenoid.",
149
+ "Stable cardiomegaly.",
150
+ ]
151
+
152
+ # Initialize model (choose one of: leaves, upper, leaves_with_statuses, upper_with_statuses)
153
+ model = SRRBert(
154
+ model_type="leaves",
155
+ batch_size=4,
156
+ tqdm_enable=True
157
+ )
158
+ outputs, labels = model(example_sentences)
159
+ print("Raw outputs:", outputs)
160
+ print("Predicted labels:", labels)
factual/SRRBert/upper_mapping.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "Pleural Effusion": 0,
3
+ "Upper abdominal finding": 1,
4
+ "Widened cardiac silhouette": 2,
5
+ "Lung Finding": 3,
6
+ "No Finding": 4,
7
+ "Widened aortic contour": 5,
8
+ "Pleural Thickening": 6,
9
+ "Vascular finding": 7,
10
+ "Consolidation": 8,
11
+ "Pneumothorax": 9,
12
+ "Subdiaphragmatic gas": 10,
13
+ "Masslike opacity": 11,
14
+ "Chest wall finding": 12,
15
+ "Focal air space opacity": 13,
16
+ "Segmental collapse": 14,
17
+ "Fracture": 15,
18
+ "Mediastinal mass": 16,
19
+ "Solitary masslike opacity": 17,
20
+ "Support Devices": 18,
21
+ "Mediastinal finding": 19,
22
+ "Pleural finding": 20,
23
+ "Air space opacity": 21,
24
+ "Diffuse air space opacity": 22,
25
+ "Multiple masslike opacities": 23,
26
+ "Musculoskeletal finding": 24
27
+ }
28
+
factual/SRRBert/upper_with_statuses_mapping.json ADDED
@@ -0,0 +1,76 @@
1
+ {
2
+ "Pleural Effusion (Present)": 0,
3
+ "Upper abdominal finding (Present)": 1,
4
+ "Widened cardiac silhouette (Present)": 2,
5
+ "Lung Finding (Present)": 3,
6
+ "Widened aortic contour (Present)": 4,
7
+ "Pleural Thickening (Present)": 5,
8
+ "Vascular finding (Present)": 6,
9
+ "Consolidation (Present)": 7,
10
+ "Pneumothorax (Present)": 8,
11
+ "Subdiaphragmatic gas (Present)": 9,
12
+ "Masslike opacity (Present)": 10,
13
+ "Chest wall finding (Present)": 11,
14
+ "Focal air space opacity (Present)": 12,
15
+ "Segmental collapse (Present)": 13,
16
+ "Fracture (Present)": 14,
17
+ "Mediastinal mass (Present)": 15,
18
+ "Solitary masslike opacity (Present)": 16,
19
+ "Support Devices (Present)": 17,
20
+ "Mediastinal finding (Present)": 18,
21
+ "Pleural finding (Present)": 19,
22
+ "Air space opacity (Present)": 20,
23
+ "Diffuse air space opacity (Present)": 21,
24
+ "Multiple masslike opacities (Present)": 22,
25
+ "Musculoskeletal finding (Present)": 23,
26
+ "Pleural Effusion (Uncertain)": 24,
27
+ "Upper abdominal finding (Uncertain)": 25,
28
+ "Widened cardiac silhouette (Uncertain)": 26,
29
+ "Lung Finding (Uncertain)": 27,
30
+ "Widened aortic contour (Uncertain)": 28,
31
+ "Pleural Thickening (Uncertain)": 29,
32
+ "Vascular finding (Uncertain)": 30,
33
+ "Consolidation (Uncertain)": 31,
34
+ "Pneumothorax (Uncertain)": 32,
35
+ "Subdiaphragmatic gas (Uncertain)": 33,
36
+ "Masslike opacity (Uncertain)": 34,
37
+ "Chest wall finding (Uncertain)": 35,
38
+ "Focal air space opacity (Uncertain)": 36,
39
+ "Segmental collapse (Uncertain)": 37,
40
+ "Fracture (Uncertain)": 38,
41
+ "Mediastinal mass (Uncertain)": 39,
42
+ "Solitary masslike opacity (Uncertain)": 40,
43
+ "Support Devices (Uncertain)": 41,
44
+ "Mediastinal finding (Uncertain)": 42,
45
+ "Pleural finding (Uncertain)": 43,
46
+ "Air space opacity (Uncertain)": 44,
47
+ "Diffuse air space opacity (Uncertain)": 45,
48
+ "Multiple masslike opacities (Uncertain)": 46,
49
+ "Musculoskeletal finding (Uncertain)": 47,
50
+ "Pleural Effusion (Absent)": 48,
51
+ "Upper abdominal finding (Absent)": 49,
52
+ "Widened cardiac silhouette (Absent)": 50,
53
+ "Lung Finding (Absent)": 51,
54
+ "Widened aortic contour (Absent)": 52,
55
+ "Pleural Thickening (Absent)": 53,
56
+ "Vascular finding (Absent)": 54,
57
+ "Consolidation (Absent)": 55,
58
+ "Pneumothorax (Absent)": 56,
59
+ "Subdiaphragmatic gas (Absent)": 57,
60
+ "Masslike opacity (Absent)": 58,
61
+ "Chest wall finding (Absent)": 59,
62
+ "Focal air space opacity (Absent)": 60,
63
+ "Segmental collapse (Absent)": 61,
64
+ "Fracture (Absent)": 62,
65
+ "Mediastinal mass (Absent)": 63,
66
+ "Solitary masslike opacity (Absent)": 64,
67
+ "Support Devices (Absent)": 65,
68
+ "Mediastinal finding (Absent)": 66,
69
+ "Pleural finding (Absent)": 67,
70
+ "Air space opacity (Absent)": 68,
71
+ "Diffuse air space opacity (Absent)": 69,
72
+ "Multiple masslike opacities (Absent)": 70,
73
+ "Musculoskeletal finding (Absent)": 71,
74
+ "No Finding": 72
75
+ }
76
+
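As a quick cross-check of the mapping above: the 24 upper-level findings from upper_mapping.json (its 25 entries minus "No Finding") are each expanded into Present/Uncertain/Absent variants, giving 24 x 3 = 72 labels at indices 0-71, and the status-free "No Finding" entry brings the total to 73 labels at indices 0-72, which is the output size the corresponding upper_with_statuses model head is loaded with.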
factual/__init__.py ADDED
File without changes
factual/f1chexbert.py ADDED
@@ -0,0 +1,254 @@
1
+ #!/usr/bin/env python
2
+ """CheXbert evaluation utilities – **device‑safe end‑to‑end**
3
+
4
+ This is a drop‑in replacement for your previous `f1chexbert.py` **and** for the helper
5
+ `SemanticEmbeddingScorer`. All tensors – model weights *and* inputs – are created on
6
+ exactly the same device so the ``Expected all tensors to be on the same device``
7
+ run‑time error disappears. The public API stays identical, so the rest of your
8
+ pipeline does not need to change.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import os
14
+ import warnings
15
+ import logging
16
+ from typing import List, Sequence, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import numpy as np
21
+ from transformers import (
22
+ AutoConfig,
23
+ BertModel,
24
+ BertTokenizer,
25
+ )
26
+ from sklearn.metrics import (
27
+ accuracy_score,
28
+ classification_report,
29
+ )
30
+ from sklearn.metrics._classification import _check_targets
31
+ from sklearn.utils.sparsefuncs import count_nonzero
32
+ from huggingface_hub import hf_hub_download
33
+ from appdirs import user_cache_dir
34
+
35
+ # -----------------------------------------------------------------------------
36
+ # GLOBALS & UTILITIES
37
+ # -----------------------------------------------------------------------------
38
+
39
+ CACHE_DIR = user_cache_dir("chexbert")
40
+ warnings.filterwarnings("ignore")
41
+ logging.getLogger("urllib3").setLevel(logging.ERROR)
42
+
43
+ # Helper ----------------------------------------------------------------------
44
+
45
+ def _generate_attention_masks(batch_ids: torch.LongTensor) -> torch.FloatTensor:
46
+ """Create a padding mask: 1 for real tokens, 0 for pads."""
47
+ # batch_ids shape: (B, L)
48
+ lengths = (batch_ids != 0).sum(dim=1) # (B,)
49
+ max_len = batch_ids.size(1)
50
+ idxs = torch.arange(max_len, device=batch_ids.device).unsqueeze(0) # (1, L)
51
+ return (idxs < lengths.unsqueeze(1)).float() # (B, L)
52
+
53
+ # -----------------------------------------------------------------------------
54
+ # MODEL COMPONENTS
55
+ # -----------------------------------------------------------------------------
56
+
57
+ class BertLabeler(nn.Module):
58
+ """BERT backbone + 14 small classification heads (CheXbert)."""
59
+
60
+ def __init__(self, *, device: Union[str, torch.device]):
61
+ super().__init__()
62
+
63
+ if isinstance(device, str):
64
+ self.device = torch.device(device)
65
+ else:
66
+ self.device = device
67
+
68
+ # 1) Backbone on *CPU* first – we'll move to correct device after weights load
69
+ config = AutoConfig.from_pretrained("bert-base-uncased")
70
+ self.bert = BertModel(config)
71
+
72
+ hidden = self.bert.config.hidden_size
73
+ # 13 heads with 4‑way logits, + 1 head with 2‑way logits
74
+ self.linear_heads = nn.ModuleList([nn.Linear(hidden, 4) for _ in range(13)])
75
+ self.linear_heads.append(nn.Linear(hidden, 2))
76
+
77
+ self.dropout = nn.Dropout(0.1)
78
+
79
+ # 2) Load checkpoint weights directly onto CPU first -------------------
80
+ ckpt_path = hf_hub_download(
81
+ repo_id="StanfordAIMI/RRG_scorers",
82
+ filename="chexbert.pth",
83
+ cache_dir=CACHE_DIR,
84
+ )
85
+ state = torch.load(ckpt_path, map_location="cpu")["model_state_dict"]
86
+ state = {k.replace("module.", ""): v for k, v in state.items()}
87
+ self.load_state_dict(state, strict=True)
88
+
89
+ # 3) NOW move the entire module (recursively) to `self.device` ----------
90
+ self.to(self.device)
91
+
92
+ # freeze ---------------------------------------------------------------
93
+ for p in self.parameters():
94
+ p.requires_grad = False
95
+
96
+ # ---------------------------------------------------------------------
97
+ # forward helpers
98
+ # ---------------------------------------------------------------------
99
+
100
+ @torch.no_grad()
101
+ def cls_logits(self, input_ids: torch.LongTensor) -> List[torch.Tensor]:
102
+ """Returns a list of logits for each head (no softmax)."""
103
+ attn = _generate_attention_masks(input_ids)
104
+ outputs = self.bert(input_ids=input_ids, attention_mask=attn)
105
+ cls_repr = self.dropout(outputs.last_hidden_state[:, 0])
106
+ return [head(cls_repr) for head in self.linear_heads]
107
+
108
+ @torch.no_grad()
109
+ def cls_embeddings(self, input_ids: torch.LongTensor) -> torch.Tensor:
110
+ """Returns pooled [CLS] representations (B, hidden_size)."""
111
+ attn = _generate_attention_masks(input_ids)
112
+ outputs = self.bert(input_ids=input_ids, attention_mask=attn)
113
+ return outputs.last_hidden_state[:, 0] # (B, hidden)
114
+
115
+ # -----------------------------------------------------------------------------
116
+ # F1‑CheXbert evaluator
117
+ # -----------------------------------------------------------------------------
118
+
119
+ class F1CheXbert(nn.Module):
120
+ """Generate CheXbert labels + handy evaluation utilities."""
121
+
122
+ CONDITION_NAMES = [
123
+ "Enlarged Cardiomediastinum",
124
+ "Cardiomegaly",
125
+ "Lung Opacity",
126
+ "Lung Lesion",
127
+ "Edema",
128
+ "Consolidation",
129
+ "Pneumonia",
130
+ "Atelectasis",
131
+ "Pneumothorax",
132
+ "Pleural Effusion",
133
+ "Pleural Other",
134
+ "Fracture",
135
+ "Support Devices",
136
+ ]
137
+ NO_FINDING = "No Finding"
138
+ TARGET_NAMES = CONDITION_NAMES + [NO_FINDING]
139
+
140
+ TOP5 = [
141
+ "Cardiomegaly",
142
+ "Edema",
143
+ "Consolidation",
144
+ "Atelectasis",
145
+ "Pleural Effusion",
146
+ ]
147
+
148
+ def __init__(
149
+ self,
150
+ *,
151
+ refs_filename: str | None = None,
152
+ hyps_filename: str | None = None,
153
+ device: Union[str, torch.device] = "cpu",
154
+ ):
155
+ super().__init__()
156
+
157
+ # Resolve device -------------------------------------------------------
158
+ if isinstance(device, str):
159
+ self.device = torch.device(device)
160
+ else:
161
+ self.device = device
162
+
163
+ self.refs_filename = refs_filename
164
+ self.hyps_filename = hyps_filename
165
+
166
+ # HuggingFace tokenizer (always CPU, we just move tensors later) -------
167
+ self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
168
+
169
+ # backbone + heads ------------------------------------------------------
170
+ self.model = BertLabeler(device=self.device).eval()
171
+
172
+ # indices for the TOP‑5 label subset -----------------------------------
173
+ self.top5_idx = [self.TARGET_NAMES.index(n) for n in self.TOP5]
174
+
175
+ # ---------------------------------------------------------------------
176
+ # Public helpers
177
+ # ---------------------------------------------------------------------
178
+
179
+ @torch.no_grad()
180
+ def get_embeddings(self, reports: Sequence[str]) -> List[np.ndarray]:
181
+ """Return list[np.ndarray] of pooled [CLS] vectors for each report."""
182
+ # Tokenise *as a batch* for efficiency
183
+ encoding = self.tokenizer(
184
+ reports,
185
+ padding=True,
186
+ truncation=True,
187
+ max_length=512,
188
+ return_tensors="pt",
189
+ )
190
+ input_ids = encoding.input_ids.to(self.device)
191
+ # (B, hidden)
192
+ cls = self.model.cls_embeddings(input_ids)
193
+ return [v.cpu().numpy() for v in cls]
194
+
195
+ @torch.no_grad()
196
+ def get_label(self, report: str, mode: str = "rrg") -> List[int]:
197
+ """Return 14‑dim binary vector for the given report."""
198
+ input_ids = self.tokenizer(report, truncation=True, max_length=512, return_tensors="pt").input_ids.to(self.device)
199
+ preds = [head.argmax(dim=1).item() for head in self.model.cls_logits(input_ids)]
200
+
201
+ binary = []
202
+ if mode == "rrg":
203
+ for c in preds:
204
+ binary.append(1 if c in {1, 3} else 0)
205
+ elif mode == "classification":
206
+ for c in preds:
207
+ if c == 1:
208
+ binary.append(1)
209
+ elif c == 2:
210
+ binary.append(0)
211
+ elif c == 3:
212
+ binary.append(-1)
213
+ else:
214
+ binary.append(0)
215
+ else:
216
+ raise ValueError(f"Unknown mode: {mode}")
217
+ return binary
218
+
219
+ # ---------------------------------------------------------------------
220
+ # Full evaluator – unchanged logic but simplified I/O
221
+ # ---------------------------------------------------------------------
222
+
223
+ def forward(self, hyps: List[str], refs: List[str]):
224
+ """Return (accuracy, per‑example‑accuracy, full classification reports)."""
225
+ # Reference labels -----------------------------------------------------
226
+ if self.refs_filename and os.path.exists(self.refs_filename):
227
+ with open(self.refs_filename) as f:
228
+ refs_chexbert = [eval(line) for line in f]
229
+ else:
230
+ refs_chexbert = [self.get_label(r) for r in refs]
231
+ if self.refs_filename:
232
+ with open(self.refs_filename, "w") as f:
233
+ f.write("\n".join(map(str, refs_chexbert)))
234
+
235
+ # Hypothesis labels ----------------------------------------------------
236
+ hyps_chexbert = [self.get_label(h) for h in hyps]
237
+ if self.hyps_filename:
238
+ with open(self.hyps_filename, "w") as f:
239
+ f.write("\n".join(map(str, hyps_chexbert)))
240
+
241
+ # TOP‑5 subset arrays --------------------------------------------------
242
+ refs5 = [np.array(r)[self.top5_idx] for r in refs_chexbert]
243
+ hyps5 = [np.array(h)[self.top5_idx] for h in hyps_chexbert]
244
+
245
+ # overall accuracy -----------------------------------------------------
246
+ accuracy = accuracy_score(refs5, hyps5)
247
+ _, y_true, y_pred = _check_targets(refs5, hyps5)
248
+ pe_accuracy = (count_nonzero(y_true - y_pred, axis=1) == 0).astype(float)
249
+
250
+ # full classification reports -----------------------------------------
251
+ cr = classification_report(refs_chexbert, hyps_chexbert, target_names=self.TARGET_NAMES, output_dict=True)
252
+ cr5 = classification_report(refs5, hyps5, target_names=self.TOP5, output_dict=True)
253
+
254
+ return accuracy, pe_accuracy, cr, cr5
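A minimal end-to-end sketch of the evaluator above (the CheXbert checkpoint is fetched from the StanfordAIMI/RRG_scorers hub repo on first use; the report strings and the printed dictionary keys are illustrative):

    from factual.f1chexbert import F1CheXbert

    scorer = F1CheXbert(device="cpu")
    accuracy, per_example_acc, full_report, top5_report = scorer(
        hyps=["Mild cardiomegaly with a small left pleural effusion."],
        refs=["Cardiomegaly and a small left-sided pleural effusion."],
    )
    # full_report / top5_report are sklearn classification_report dicts
    print(accuracy, top5_report["micro avg"]["f1-score"])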
factual/f1temporal.py ADDED
@@ -0,0 +1,167 @@
1
+ # Temporal Entity F1
2
+ # Adapted from https://github.com/X-iZhang/Libra/blob/main/libra/eval/temporal_f1.py
3
+
4
+ import re
5
+ import stanza
6
+ import argparse
7
+ from typing import List, Union
8
+
9
+
10
+
11
+ # Initialize the pipeline with the radiology NER model explicitly specified
12
+ nlp = stanza.Pipeline(
13
+ lang='en',
14
+ package='radiology',
15
+ processors={'tokenize': 'default', 'ner': 'radiology'},
16
+ logging_level='ERROR', # Only output warnings or more severe messages
17
+ verbose=False # Suppress additional information during pipeline initialization
18
+ )
19
+
20
+ # Keywords used for radiology-related entity extraction
21
+ # Reference: Learning to Exploit Temporal Structure for Biomedical Vision-Language Processing (CVPR2023)
22
+ # https://arxiv.org/pdf/2301.04558
23
+
24
+ KEYWORDS = {
25
+ "bigger", "change", "cleared", "constant", "decrease", "decreased", "decreasing", "elevated", "elevation",
26
+ "enlarged", "enlargement", "enlarging", "expanded", "greater", "growing", "improved", "improvement",
27
+ "improving", "increase", "increased", "increasing", "larger", "new", "persistence", "persistent",
28
+ "persisting", "progression", "progressive", "reduced", "removal", "resolution", "resolved", "resolving",
29
+ "smaller", "stability", "stable", "stably", "unchanged", "unfolded", "worse", "worsen", "worsened",
30
+ "worsening", "unaltered"
31
+ }
32
+
33
+ def clean_text(text: str) -> str:
34
+ """
35
+ Clean the input text by removing special characters and redundant spaces or newlines.
36
+
37
+ Args:
38
+ text (str): Input text.
39
+
40
+ Returns:
41
+ str: Cleaned text.
42
+ """
43
+ # Remove special characters and redundant newlines
44
+ text = re.sub(r'\n+', ' ', text) # Replace multiple newlines with a single space
45
+ text = re.sub(r'[_-]+', ' ', text) # Replace underscores and dashes with spaces
46
+ text = re.sub(r'\(___, __, __\)', '', text) # Remove irrelevant underscore patterns
47
+ text = re.sub(r'---, ---, ---', '', text) # Remove dashed patterns
48
+ text = re.sub(r'\(__, __, ___\)', '', text) # Remove similar underscore patterns
49
+ text = re.sub(r'[_-]+', ' ', text) # Replace underscores and dashes again (if any remain)
50
+ text = re.sub(r'[^\w\s.,:;()-]', '', text) # Remove non-alphanumeric characters except common punctuation
51
+
52
+ # Remove extra spaces
53
+ text = re.sub(r'\s{2,}', ' ', text).strip()
54
+ return text
55
+
56
+ def extract_entities(text: str, keywords: set) -> set:
57
+ """
58
+ Extract entities from the given text based on Stanza NER and provided keywords.
59
+
60
+ Args:
61
+ text (str): Input text.
62
+ keywords (set): Set of keywords to extract entities.
63
+
64
+ Returns:
65
+ set: Set of matched entities found in the text.
66
+ """
67
+ # Use Stanza NER to extract entities tagged as "OBSERVATION" or "OBSERVATION_MODIFIER"
68
+ doc = nlp(text)
69
+ stanza_entities = {ent.text.lower() for ent in doc.entities if ent.type in {"OBSERVATION", "OBSERVATION_MODIFIER"}}
70
+
71
+ # Filter Stanza entities to include only those present in keywords
72
+ matched_stanza_entities = {entity for entity in stanza_entities if entity in keywords}
73
+
74
+ # Clean the text before extracting entities
75
+ text = clean_text(text)
76
+
77
+ # Create a regex pattern that matches any of the keywords as whole words
78
+ pattern = r'\b(' + '|'.join(re.escape(word) for word in keywords) + r')\b'
79
+
80
+ # Find all matches using regex
81
+ keyword_matches = {match.group().lower() for match in re.finditer(pattern, text.lower())}
82
+
83
+ # Combine Stanza entities and regex matches
84
+ return matched_stanza_entities | keyword_matches
85
+
86
+ def calculate_tem_score(prediction_text: str, reference_text: Union[str, List[str]], epsilon: float = 1e-10) -> dict:
87
+ """
88
+ Calculate the Temporal Entity Matching (TEM) score (similar to F1-score).
89
+
90
+ Args:
91
+ reference_text (Union[str, List[str]]): Reference text or a list of reference texts.
92
+ prediction_text (str): Prediction text.
93
+ epsilon (float): Small value to avoid division by zero.
94
+
95
+ Returns:
96
+ dict: TEM score under "f1", plus the extracted prediction and reference entity sets.
97
+ """
98
+ if isinstance(reference_text, list):
99
+ reference_entities = set()
100
+ for ref in reference_text:
101
+ reference_entities.update(extract_entities(ref, KEYWORDS))
102
+ else:
103
+ reference_entities = extract_entities(reference_text, KEYWORDS)
104
+
105
+ prediction_entities = extract_entities(prediction_text, KEYWORDS)
106
+
107
+ if len(reference_entities) == 0:
108
+ if len(prediction_entities) == 0:
109
+ return {
110
+ "f1": 1.0,
111
+ "prediction_entities": prediction_entities,
112
+ "reference_entities": reference_entities
113
+ } # Perfect match when both are empty
114
+ else:
115
+ return {
116
+ "f1": epsilon,
117
+ "prediction_entities": prediction_entities,
118
+ "reference_entities": reference_entities
119
+ } # Minimal score when reference is empty but prediction is not
120
+
121
+ # Calculate intersection of entities
122
+ true_positives = len(prediction_entities & reference_entities)
123
+
124
+ # Calculate precision and recall with epsilon to avoid division by zero
125
+ precision = (true_positives + epsilon) / (len(prediction_entities) + epsilon)
126
+ recall = (true_positives + epsilon) / (len(reference_entities) + epsilon)
127
+
128
+ # Calculate TEM score (F1 score)
129
+ tem_score = (2 * precision * recall) / (precision + recall + epsilon)
130
+
131
+ return {
132
+ "f1": tem_score,
133
+ "prediction_entities": prediction_entities,
134
+ "reference_entities": reference_entities
135
+ }
136
+
137
+ def F1Temporal(predictions: List[str], references: List[Union[str, List[str]]], epsilon: float = 1e-10) -> dict:
138
+ """
139
+ Calculate the average TEM score over a list of reference and prediction texts.
140
+
141
+ Args:
142
+ references (List[Union[str, List[str]]]): List of reference texts or lists of reference texts.
143
+ predictions (List[str]): List of prediction texts.
144
+ epsilon (float): Small value to avoid division by zero.
145
+
146
+ Returns:
147
+ dict: Average TEM score under "f1", plus per-example prediction and reference entity sets.
148
+ """
149
+ assert len(references) == len(predictions), "Reference and prediction lists must have the same length."
150
+
151
+ tem_scores = []
152
+ prediction_entities = []
153
+ reference_entities = []
154
+
155
+ for pred, ref in zip(predictions, references):
156
+ result = calculate_tem_score(pred, ref, epsilon)
157
+ tem_scores.append(result["f1"])
158
+ prediction_entities.append(result["prediction_entities"])
159
+ reference_entities.append(result["reference_entities"])
160
+
161
+ average_f1 = sum(tem_scores) / len(tem_scores)
162
+
163
+ return {
164
+ "f1": average_f1,
165
+ "prediction_entities": prediction_entities,
166
+ "reference_entities": reference_entities
167
+ }
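A short sketch of the temporal-entity metric above (stanza downloads its radiology NER package on first use; the example strings are illustrative):

    from factual.f1temporal import F1Temporal

    out = F1Temporal(
        predictions=["The right pleural effusion has increased in size."],
        references=["Increased right pleural effusion compared to the prior study."],
    )
    # both sides contain the temporal keyword "increased", so the score is close to 1
    print(round(out["f1"], 3), out["prediction_entities"], out["reference_entities"])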
factual/green_score/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .green import GREEN
factual/green_score/green.py ADDED
@@ -0,0 +1,465 @@
1
+ import re
2
+ import torch
3
+ import torch.distributed as dist
4
+ import pandas as pd
5
+ from datasets import Dataset
6
+ from datasets.distributed import split_dataset_by_node
7
+ import os
8
+ from tqdm import tqdm
9
+ import numpy as np
10
+ import time
11
+ import sys
12
+ import warnings
13
+ import torch.nn as nn
14
+ from transformers import AutoModelForCausalLM, AutoTokenizer
15
+ from transformers.utils import logging
16
+
17
+ # Import necessary functions (ensure these are available in your environment)
18
+ from factual.green_score.utils import (
19
+ gather_processes,
20
+ make_prompt,
21
+ clean_responses,
22
+ compute_largest_cluster,
23
+ flatten_values_lists_of_list_dicts_to_dict,
24
+ )
25
+
26
+ # Set the logging level for the transformers library to ERROR to suppress benign warnings
27
+ logging.get_logger("transformers").setLevel(logging.ERROR)
28
+
29
+ def get_rank():
30
+ if not dist.is_initialized():
31
+ return 0
32
+ return dist.get_rank()
33
+
34
+
35
+ def is_main_process():
36
+ return get_rank() == 0
37
+
38
+
39
+ def tqdm_on_main(*args, **kwargs):
40
+ if is_main_process():
41
+ print("==== Beginning Inference ====")
42
+ return tqdm(*args, **kwargs)
43
+ else:
44
+ return kwargs.get("iterable", None)
45
+
46
+
47
+ class GREEN:
48
+ def __init__(self, model_name, output_dir=".", cpu=False):
49
+ super().__init__()
50
+ warnings.filterwarnings(
51
+ "ignore", message="A decoder-only architecture is being used*"
52
+ )
53
+ from sklearn.exceptions import ConvergenceWarning
54
+
55
+ warnings.filterwarnings(
56
+ "ignore",
57
+ category=ConvergenceWarning,
58
+ message="Number of distinct clusters.*",
59
+ )
60
+ warnings.filterwarnings(
61
+ "ignore",
62
+ category=FutureWarning,
63
+ module="transformers.tokenization_utils_base",
64
+ )
65
+ self.cpu = cpu
66
+ self.model_name = model_name.split("/")[-1]
67
+ self.output_dir = output_dir
68
+ self.batch_size = 4
69
+ self.max_length = 2048
70
+ self.categories = [
71
+ "Clinically Significant Errors",
72
+ "Clinically Insignificant Errors",
73
+ "Matched Findings",
74
+ ]
75
+ self.sub_categories = [
76
+ "(a) False report of a finding in the candidate",
77
+ "(b) Missing a finding present in the reference",
78
+ "(c) Misidentification of a finding's anatomic location/position",
79
+ "(d) Misassessment of the severity of a finding",
80
+ "(e) Mentioning a comparison that isn't in the reference",
81
+ "(f) Omitting a comparison detailing a change from a prior study",
82
+ ]
83
+ self.prompts = None
84
+ self.completions = None
85
+ self.green_scores = None
86
+ self.error_counts = None
87
+
88
+ if torch.cuda.is_available() and torch.cuda.device_count() > 1 and not self.cpu:
89
+ if not dist.is_initialized():
90
+ dist.init_process_group(
91
+ backend="nccl",
92
+ )
93
+ torch.cuda.set_device(dist.get_rank())
94
+ if dist.get_rank() == 0:
95
+ print(
96
+ "Distributed training with", torch.cuda.device_count(), "GPUs"
97
+ )
98
+
99
+ self.model = AutoModelForCausalLM.from_pretrained(
100
+ model_name,
101
+ trust_remote_code=False if "Phi" in model_name else True,
102
+ device_map=(
103
+ {"": "cuda:{}".format(torch.cuda.current_device())}
104
+ if not self.cpu
105
+ else {"": "cpu"}
106
+ ),
107
+ torch_dtype=torch.float16,
108
+ )
109
+ self.model.eval()
110
+
111
+ self.tokenizer = AutoTokenizer.from_pretrained(
112
+ model_name,
113
+ add_eos_token=True,
114
+ use_fast=True,
115
+ trust_remote_code=True,
116
+ padding_side="left",
117
+ )
118
+
119
+ # Set up chat template for chat-style prompts
120
+ chat_template = (
121
+ "{% for message in messages %}\n"
122
+ "{% if message['from'] == 'human' %}\n"
123
+ "{{ '<|user|>\n' + message['value'] + eos_token }}\n"
124
+ "{% elif message['from'] == 'system' %}\n"
125
+ "{{ '<|system|>\n' + message['value'] + eos_token }}\n"
126
+ "{% elif message['from'] == 'gpt' %}\n"
127
+ "{{ '<|assistant|>\n' + message['value'] + eos_token }}\n"
128
+ "{% endif %}\n"
129
+ "{% if loop.last and add_generation_prompt %}\n"
130
+ "{{ '<|assistant|>' }}\n"
131
+ "{% endif %}\n"
132
+ "{% endfor %}"
133
+ )
134
+
135
+ self.tokenizer.chat_template = chat_template
136
+ self.tokenizer.pad_token = self.tokenizer.eos_token
137
+ self.tokenizer.clean_up_tokenization_spaces = True
138
+ self.tokenizer.padding_side = "left"
139
+
140
+ def __call__(self, refs, hyps):
141
+ print("Processing data...making prompts")
142
+ dataset = Dataset.from_dict({"reference": refs, "prediction": hyps})
143
+
144
+ dataset = self.process_data(dataset)
145
+ print("Done.")
146
+
147
+ self.dataset = dataset
148
+
149
+ t = time.time()
150
+
151
+ mean, std, green_scores, summary, results_df = self.infer()
152
+
153
+ t = time.time() - t
154
+ print("Seconds per example: ", t / len(refs))
155
+
156
+ if not is_main_process():
157
+ print(f"Rank {dist.get_rank()} exiting.")
158
+ dist.destroy_process_group()
159
+ sys.exit()
160
+
161
+ return mean, std, green_scores, summary, results_df
162
+
163
+ def process_data(self, dataset):
164
+ def prompting(examples):
165
+ return {
166
+ "prompt": [
167
+ make_prompt(r, p)
168
+ for r, p in zip(examples["reference"], examples["prediction"])
169
+ ]
170
+ }
171
+
172
+ dataset = dataset.map(prompting, batched=True)
173
+ return dataset
174
+
175
+ @torch.inference_mode()
176
+ def infer(self):
177
+ if torch.cuda.is_available() and torch.cuda.device_count() > 1 and not self.cpu:
178
+ dataset_dist = split_dataset_by_node(
179
+ self.dataset,
180
+ rank=get_rank(),
181
+ world_size=int(os.environ["WORLD_SIZE"]),
182
+ )
183
+ print("Distributed dataset created on rank: ", int(os.environ["RANK"]))
184
+ else:
185
+ dataset_dist = self.dataset
186
+
187
+ local_completions = []
188
+ local_references = []
189
+
190
+ for batch in tqdm_on_main(
191
+ iterable=dataset_dist.iter(batch_size=self.batch_size),
192
+ total=len(dataset_dist) // self.batch_size,
193
+ ):
194
+ local_references.extend(batch["prompt"])
195
+ local_completions.extend(self.get_response(batch))
196
+
197
+ if torch.cuda.is_available() and torch.cuda.device_count() > 1 and not self.cpu:
198
+ self.completions, self.prompts = gather_processes(
199
+ local_completions, local_references
200
+ )
201
+ else:
202
+ self.completions = local_completions
203
+ self.prompts = local_references
204
+
205
+ if is_main_process():
206
+ print("==== End Inference ====")
207
+
208
+ if len(self.completions) != len(self.prompts):
209
+ print("Length of prompts and completions are not equal!")
210
+
211
+ return self.process_results()
212
+
213
+ def tokenize_batch_as_chat(self, batch):
214
+ local_rank = int(os.environ.get("LOCAL_RANK", 0)) if not self.cpu else "cpu"
215
+ batch = [
216
+ self.tokenizer.apply_chat_template(
217
+ i, tokenize=False, add_generation_prompt=True
218
+ )
219
+ for i in batch
220
+ ]
221
+
222
+ batch = self.tokenizer.batch_encode_plus(
223
+ batch,
224
+ return_tensors="pt",
225
+ padding=True,
226
+ truncation=True,
227
+ max_length=self.max_length,
228
+ ).to(local_rank)
229
+
230
+ return batch
231
+
232
+ def get_response(self, batch):
233
+ assert "prompt" in batch.keys(), "prompt is not in batch keys"
234
+
235
+ batch = [
236
+ [{"from": "human", "value": prompt}, {"from": "gpt", "value": ""}]
237
+ for prompt in batch["prompt"]
238
+ ]
239
+
240
+ batch = self.tokenize_batch_as_chat(batch)
241
+
242
+ outputs = self.model.generate(
243
+ input_ids=batch["input_ids"],
244
+ attention_mask=batch["attention_mask"],
245
+ eos_token_id=self.tokenizer.eos_token_id,
246
+ pad_token_id=self.tokenizer.pad_token_id,
247
+ max_length=2048,
248
+ do_sample=False,
249
+ temperature=None,
250
+ top_p=None,
251
+ )
252
+
253
+ responses = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
254
+
255
+ response_list = []
256
+ if isinstance(responses, list):
257
+ for response in responses:
258
+ response = clean_responses(response)
259
+ response_list.append(response)
260
+ else:
261
+ responses = clean_responses(responses)
262
+ response_list.append(responses)
263
+
264
+ return response_list
265
+
266
+ def process_results(self):
267
+ self.green_scores = [
268
+ self.compute_green(response) for response in self.completions
269
+ ]
270
+ self.error_counts = pd.DataFrame(
271
+ [self.compute_error_count(response) for response in self.completions],
272
+ columns=self.sub_categories + ["Matched Findings"],
273
+ )
274
+
275
+ results_df = pd.DataFrame(
276
+ {
277
+ "reference": self.dataset["reference"],
278
+ "predictions": self.dataset["prediction"],
279
+ "green_analysis": self.completions,
280
+ "green_score": self.green_scores,
281
+ **self.error_counts,
282
+ }
283
+ )
284
+
285
+ mean, std, summary = self.compute_summary()
286
+
287
+ return mean, std, self.green_scores, summary, results_df
288
+
289
+ def compute_error_count(self, response):
290
+ _, sig_errors = self.parse_error_counts(response, self.categories[0])
291
+ matched_findings, _ = self.parse_error_counts(response, self.categories[2])
292
+ return sig_errors + [matched_findings]
293
+
294
+ def compute_green(self, response):
295
+ sig_present, sig_errors = self.parse_error_counts(response, self.categories[0])
296
+ matched_findings, _ = self.parse_error_counts(response, self.categories[2])
297
+
298
+ if matched_findings == 0:
299
+ return 0
300
+
301
+ if sig_present is None or matched_findings is None:
302
+ return None
303
+
304
+ return matched_findings / (matched_findings + sum(sig_errors))
305
+
306
+ def parse_error_counts(self, text, category, for_reward=False):
307
+ if category not in self.categories:
308
+ raise ValueError(
309
+ f"Category {category} is not a valid category. Please choose from {self.categories}."
310
+ )
311
+
312
+ pattern = rf"\[{category}\]:\s*(.*?)(?:\n\s*\n|\Z)"
313
+ category_text = re.search(pattern, text, re.DOTALL)
314
+
315
+ sum_counts = 0
316
+ sub_counts = [0 for i in range(6)]
317
+
318
+ if not category_text:
319
+ if for_reward:
320
+ return None, None
321
+ return sum_counts, sub_counts
322
+ if category_text.group(1).startswith("No"):
323
+ return sum_counts, sub_counts
324
+
325
+ if category == "Matched Findings":
326
+ counts = re.findall(r"^\b\d+\b(?=\.)", category_text.group(1))
327
+ if len(counts) > 0:
328
+ sum_counts = int(counts[0])
329
+ return sum_counts, sub_counts
330
+ else:
331
+ sub_categories = [s.split(" ", 1)[0] + " " for s in self.sub_categories]
332
+ matches = sorted(re.findall(r"\([a-f]\) .*", category_text.group(1)))
333
+
334
+ if len(matches) == 0:
335
+ matches = sorted(re.findall(r"\([1-6]\) .*", category_text.group(1)))
336
+ sub_categories = [
337
+ f"({i})" + " " for i in range(1, len(self.sub_categories) + 1)
338
+ ]
339
+
340
+ for position, sub_category in enumerate(sub_categories):
341
+ for match in range(len(matches)):
342
+ if matches[match].startswith(sub_category):
343
+ count = re.findall(r"(?<=: )\b\d+\b(?=\.)", matches[match])
344
+ if len(count) > 0:
345
+ sub_counts[position] = int(count[0])
346
+ return sum(sub_counts), sub_counts
347
+
348
+ def parse_error_sentences(self, response, category):
349
+ if category not in self.categories:
350
+ raise ValueError(
351
+ f"Category {category} is not a valid category. Please choose from {self.categories}."
352
+ )
353
+ pattern = rf"\[{category}\]:\s*(.*?)(?:\n\s*\n|\Z)"
354
+ category_text = re.search(pattern, response, re.DOTALL)
355
+ sub_category_dict_sentences = {}
356
+ for sub_category in self.sub_categories:
357
+ sub_category_dict_sentences[sub_category] = []
358
+
359
+ if not category_text:
360
+ return sub_category_dict_sentences
361
+ if category_text.group(1).startswith("No"):
362
+ return sub_category_dict_sentences
363
+
364
+ if category == "Matched Findings":
365
+ return (
366
+ category_text.group(1).rsplit(":", 1)[-1].rsplit(".", 1)[-1].split(";")
367
+ )
368
+
369
+ matches = sorted(re.findall(r"\([a-f]\) .*", category_text.group(1)))
370
+
371
+ if len(matches) == 0:
372
+ matches = sorted(re.findall(r"\([1-6]\) .*", category_text.group(1)))
373
+ self.sub_categories = [
374
+ f"({i})" + " " for i in range(1, len(self.sub_categories) + 1)
375
+ ]
376
+
377
+ for position, sub_category in enumerate(self.sub_categories):
378
+ for match in range(len(matches)):
379
+ if matches[match].startswith(sub_category):
380
+ sentences_list = (
381
+ matches[match].rsplit(":", 1)[-1].split(".", 1)[-1].split(";")
382
+ )
383
+ sub_category_dict_sentences[self.sub_categories[position]] = (
384
+ sentences_list
385
+ )
386
+
387
+ return sub_category_dict_sentences
388
+
389
+ def compute_sentences(self, response):
390
+ return self.parse_error_sentences(response, self.categories[0])
391
+
392
+ def get_representative_sentences(self, responses):
393
+ list_sentences = []
394
+ for i in responses:
395
+ sentences = self.compute_sentences(i)
396
+ list_sentences.append(sentences)
397
+
398
+ dict_sentences = flatten_values_lists_of_list_dicts_to_dict(list_sentences)
399
+
400
+ result_sentences_dict = {}
401
+
402
+ for i in self.sub_categories:
403
+ sentences = dict_sentences[i]
404
+ sentences = [i for i in sentences if i.strip() != ""]
405
+ _, sentences_of_largest_cluster = compute_largest_cluster(sentences)
406
+ result_sentences_dict[i] = sentences_of_largest_cluster
407
+
408
+ return result_sentences_dict
409
+
410
+ def compute_accuracy(self, responses):
411
+ counts = []
412
+ for response in responses:
413
+ _, sig_errors = self.parse_error_counts(response, self.categories[0])
414
+ counts.append(sig_errors)
415
+
416
+ counts = np.array(counts)
417
+
418
+ dict_acc = {}
419
+ for i in range(len(self.sub_categories)):
420
+ error_counts = counts[:, i]
421
+ accuracy = np.mean(error_counts == 0)
422
+ dict_acc[self.sub_categories[i]] = accuracy
423
+
424
+ return dict_acc
425
+
426
+ def compute_summary(self):
427
+ print("Computing summary ...")
428
+ representative_sentences = self.get_representative_sentences(self.completions)
429
+ accuracies = self.compute_accuracy(self.completions)
430
+ mean = np.mean(self.green_scores)
431
+ std = np.std(self.green_scores)
432
+
433
+ summary = f"\n-------------{self.model_name}----------------\n [Summary]: Green average {mean} and standard deviation {std} \n [Clinically Significant Errors Analyses]: <accuracy>. <representative error>\n\n"
434
+ for idx, sub_category in enumerate(self.sub_categories):
435
+ accuracy = accuracies[sub_category]
436
+ sentences = representative_sentences[sub_category]
437
+ summary += f"{sub_category}: {accuracy}. \n {sentences} \n\n"
438
+ summary += "----------------------------------\n"
439
+
440
+ return mean, std, summary
441
+
442
+
443
+ if __name__ == "__main__":
444
+ refs = [
445
+ "Interstitial opacities without changes.",
446
+ "Interval development of segmental heterogeneous airspace opacities throughout the lungs . No significant pneumothorax or pleural effusion . Bilateral calcified pleural plaques are scattered throughout the lungs . The heart is not significantly enlarged .",
447
+ "Lung volumes are low, causing bronchovascular crowding. The cardiomediastinal silhouette is unremarkable. No focal consolidation, pleural effusion, or pneumothorax detected. Within the limitations of chest radiography, osseous structures are unremarkable.",
448
+ ]
449
+ hyps = [
450
+ "Interstitial opacities at bases without changes.",
451
+ "Interval development of segmental heterogeneous airspace opacities throughout the lungs . No significant pneumothorax or pleural effusion . Bilateral calcified pleural plaques are scattered throughout the lungs . The heart is not significantly enlarged .",
452
+ "Endotracheal and nasogastric tubes have been removed. Changes of median sternotomy, with continued leftward displacement of the fourth inferiomost sternal wire. There is continued moderate-to-severe enlargement of the cardiac silhouette. Pulmonary aeration is slightly improved, with residual left lower lobe atelectasis. Stable central venous congestion and interstitial pulmonary edema. Small bilateral pleural effusions are unchanged.",
453
+ ]
454
+
455
+ model_name = "StanfordAIMI/GREEN-radllama2-7b"
456
+
457
+ green_scorer = GREEN(model_name, output_dir=".")
458
+ mean, std, green_score_list, summary, result_df = green_scorer(refs, hyps)
459
+ print(green_score_list)
460
+ print(summary)
461
+ # for index, row in result_df.iterrows():
462
+ # print(f"Row {index}:\n")
463
+ # for col_name in result_df.columns:
464
+ # print(f"{col_name}: {row[col_name]}\n")
465
+ # print('-' * 80)
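For reference, compute_green above reduces each parsed completion to GREEN = matched_findings / (matched_findings + clinically_significant_errors): a completion with 3 matched findings and 1 significant error scores 3 / (3 + 1) = 0.75, while any completion with zero matched findings scores 0. compute_summary then reports the mean and standard deviation of these per-report scores.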
factual/green_score/utils.py ADDED
@@ -0,0 +1,200 @@
1
+ import torch.distributed as dist
2
+ import os
3
+ import sys
4
+
5
+ from sklearn.metrics import silhouette_score
6
+ from sklearn.cluster import KMeans
7
+ from sklearn import preprocessing
8
+ from sentence_transformers import SentenceTransformer
9
+ from scipy.spatial import distance
10
+ import numpy as np
11
+
12
+ # A dictionary to store rewards for pairs of reference and hypothesis reports
13
+
14
+
15
+ def compute_largest_cluster(sentences):
16
+ """
17
+ Computes the largest cluster of sentences using K-means clustering, finds the sentences within the largest cluster, and orders them by their distance to the cluster center.
18
+
19
+ Args:
20
+ sentences (list): List of sentences to be clustered.
21
+
22
+ Returns:
23
+ tuple: A tuple containing:
24
+ - embeddings (ndarray): Normalized embeddings of the input sentences.
25
+ - sentences_of_largest_cluster (str): the single sentence closest to the center
26
+ of the largest cluster.
27
+ """
28
+ if len(sentences) == 0:
29
+ return None, None
30
+ embeddings, kmeans = compute_kmeans(sentences)
31
+ cluster_sizes = np.bincount(kmeans.labels_)
32
+ largest_cluster_idx = np.argmax(cluster_sizes)
33
+ cluster_member_ids = np.where(kmeans.labels_ == largest_cluster_idx)[0]
34
+ sentences_of_largest_cluster = [sentences[i] for i in cluster_member_ids]
35
+
36
+ largest_cluster_mean = kmeans.cluster_centers_[largest_cluster_idx]
37
+ embeddings_of_largest_cluster = [embeddings[i] for i in cluster_member_ids]
38
+ distances = distance.cdist(
39
+ embeddings_of_largest_cluster, [largest_cluster_mean], "cosine"
40
+ ).flatten()
41
+ closest_point_indices = np.argsort(distances)[0]
42
+
43
+ sentences_of_largest_cluster = sentences_of_largest_cluster[closest_point_indices]
44
+
45
+ return embeddings, sentences_of_largest_cluster
46
+
47
+
48
+ def compute_kmeans(sentences):
49
+ """
50
+ Computes K-means clustering for a list of sentences by generating their embeddings, normalizing the embeddings, and determining the optimal number of clusters using binary search.
51
+
52
+ Args:
53
+ sentences (list): List of sentences to be clustered.
54
+
55
+ Returns:
56
+ tuple: A tuple containing:
57
+ - embeddings (ndarray): Normalized embeddings of the input sentences.
58
+ - kmeans (KMeans): The KMeans object with the optimal number of clusters determined.
59
+ """
60
+ # sentence embeddings
61
+ model = SentenceTransformer("sentence-transformers/paraphrase-mpnet-base-v2")
62
+ embeddings = model.encode(sentences)
63
+ # normalize the embeddings for equivalent computation of the cosine distance
64
+ embeddings = preprocessing.normalize(embeddings)
65
+ # compute the number of clusters with binary search
66
+ kmeans = binary_search_optimal_kmeans(embeddings, min_k=0, max_k=len(sentences))
67
+ return embeddings, kmeans
68
+
69
+
70
+ def binary_search_optimal_kmeans(data, min_k, max_k):
71
+ """
72
+ Finds the optimal k for KMeans clustering using binary search on the silhouette score.
73
+
74
+ Args:
75
+ data (list): cluster data.
76
+ min_k: minimum k for binary search
77
+ max_k: maximum k for binary search
78
+
79
+ Returns:
80
+ KMeans: the fitted KMeans model with the best silhouette score (a single-cluster fit when the data is too small).
81
+ """
82
+ best_k = min_k
83
+ best_score = -1
84
+ best_kmeans = KMeans(n_clusters=1, random_state=42).fit(
85
+ data
86
+ ) # start with 1 cluster for len(data) < 2
87
+
88
+ while min_k <= max_k:
89
+ mid_k = (min_k + max_k) // 2
90
+ if mid_k < 2:
91
+ break
92
+
93
+ kmeans = KMeans(n_clusters=mid_k, random_state=42).fit(data)
94
+ labels = kmeans.labels_
95
+ score = silhouette_score(data, labels)
96
+
97
+ if score > best_score:
98
+ best_score = score
99
+ best_k = mid_k
100
+ best_kmeans = kmeans # Update the best KMeans model
101
+ min_k = mid_k + 1
102
+ else:
103
+ max_k = mid_k - 1
104
+
105
+ return best_kmeans
106
+
107
+
108
+ def flatten_values_lists_of_list_dicts_to_dict(item):
109
+ """
110
+ Flattens a list of dictionaries containing lists of values into a single dictionary.
111
+
112
+ Args:
113
+ item (list): List of dictionaries, where each dictionary's values are lists. If any element of the list is itself a list, the function will consider only the first dictionary in that sublist.
114
+
115
+ Returns:
116
+ dict: A dictionary where each key corresponds to the keys in the input dictionaries, and each value is a flattened list of all values associated with that key across all input dictionaries.
117
+ """
118
+
119
+ result = {}
120
+ for i in item:
121
+ if isinstance(i, list):
122
+ i = i[0]
123
+ for key, lists in i.items():
124
+ if key not in result:
125
+ result[key] = []
126
+ result[key].extend(lists)
127
+
128
+ return result
129
+
130
+
131
+ def gather_processes(local_candidates, local_references=None):
132
+ world_size = int(os.environ.get("WORLD_SIZE", "1"))
133
+ local_rank = int(os.environ.get("RANK", "0"))
134
+ global_candidates_list = None
135
+ global_references_list = None
136
+
137
+ if local_rank == 0:
138
+ # Initialize the gather list only on the root process
139
+ global_candidates_list = [None for _ in range(world_size)]
140
+ global_references_list = [None for _ in range(world_size)]
141
+ try:
142
+ dist.gather_object(local_candidates, global_candidates_list, dst=0)
143
+
144
+ if not local_references is None:
145
+ dist.gather_object(local_references, global_references_list, dst=0)
146
+
147
+ except Exception as e:
148
+ print(f"Error during result gathering: {e}")
149
+
150
+ if local_rank != 0:
151
+ # Exit the process
152
+ # print(f"Rank {dist.get_rank()} exiting.")
153
+ dist.destroy_process_group() # Clean up the distributed processing group
154
+ sys.exit() # Exit the process
155
+
156
+ # Flatten the gathered list
157
+ candidates_list = []
158
+ for i in global_candidates_list:
159
+ candidates_list.extend(i)
160
+
161
+ if not global_references_list[0] is None:
162
+ references_list = []
163
+ for i in global_references_list:
164
+ references_list.extend(i)
165
+ print(f"References list: {len(references_list)}")
166
+ return candidates_list, references_list
167
+
168
+ return candidates_list
169
+
170
+
171
+ def clean_responses(response):
172
+ if "[Explanation]:" in response:
173
+ if "<|assistant|>" in response:
174
+ response = response.split("<|assistant|>")[-1]
175
+ if (
176
+ "[Explanation]:\n <Explanation>\n" in response or "[Explanation]:\n<Explanation>" in response
177
+ ):
178
+ response = response.split("[Explanation]:")[1]
179
+ else:
180
+ response = response.split("[Explanation]:")[-1]
181
+ if "<|assistant|>" in response:
182
+ response = response.split("<|assistant|>")[-1]
183
+ return response.replace("</s>", "").replace("<unk>", "")
184
+
185
+
186
+ def make_prompt(text1, text2, max_len=300):
187
+ """
188
+ Creates a prompt for evaluating the accuracy of a candidate radiology report in comparison to a reference radiology report.
189
+
190
+ Args:
191
+ text1 (str): Reference radiology report.
192
+ text2 (str): Candidate radiology report.
193
+
194
+ Returns:
195
+ str: Formatted prompt string.
196
+ """
197
+ text1 = " ".join(text1.split()[:max_len])
198
+ text2 = " ".join(text2.split()[:max_len])
199
+ prompt = f"Objective: Evaluate the accuracy of a candidate radiology report in comparison to a reference radiology report composed by expert radiologists.\n\n Process Overview: You will be presented with:\n\n 1. The criteria for making a judgment.\n 2. The reference radiology report.\n 3. The candidate radiology report.\n 4. The desired format for your assessment.\n\n 1. Criteria for Judgment:\n\n For each candidate report, determine:\n\n The count of clinically significant errors.\n The count of clinically insignificant errors.\n\n Errors can fall into one of these categories:\n\n a) False report of a finding in the candidate.\n b) Missing a finding present in the reference.\n c) Misidentification of a finding's anatomic location/position.\n d) Misassessment of the severity of a finding.\n e) Mentioning a comparison that isn't in the reference.\n f) Omitting a comparison detailing a change from a prior study.\n Note: Concentrate on the clinical findings rather than the report's writing style. Evaluate only the findings that appear in both reports.\n\n 2. Reference Report:\n {text1}\n\n 3. Candidate Report:\n {text2}\n\n 4. Reporting Your Assessment:\n\n Follow this specific format for your output, even if no errors are found:\n ```\n [Explanation]:\n <Explanation>\n\n [Clinically Significant Errors]:\n (a) <Error Type>: <The number of errors>. <Error 1>; <Error 2>; ...; <Error n>\n ....\n (f) <Error Type>: <The number of errors>. <Error 1>; <Error 2>; ...; <Error n>\n\n [Clinically Insignificant Errors]:\n (a) <Error Type>: <The number of errors>. <Error 1>; <Error 2>; ...; <Error n>\n ....\n (f) <Error Type>: <The number of errors>. <Error 1>; <Error 2>; ...; <Error n>\n\n [Matched Findings]:\n <The number of matched findings>. <Finding 1>; <Finding 2>; ...; <Finding n>\n ```\n"
200
+ return prompt
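A small sketch of the response-cleaning helper above, using an illustrative raw completion in the GREEN chat format:

    from factual.green_score.utils import clean_responses

    raw = "<|user|>...<|assistant|>[Explanation]:\n<Explanation>\nThe candidate omits the left effusion.</s>"
    print(clean_responses(raw))
    # keeps only the analysis text after "[Explanation]:" and strips the "</s>" marker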
nlg/__init__.py ADDED
File without changes
nlg/bertscore/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __author__ = 'tylin'
nlg/bertscore/bertscore.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from bert_score import BERTScorer
4
+
5
+
6
+ class BertScore(nn.Module):
7
+ def __init__(self,
8
+ model_type='distilbert-base-uncased',
9
+ num_layers=5,
10
+ rescale_with_baseline=True,
11
+ idf=False,
12
+ ):
13
+ super(BertScore, self).__init__()
14
+ with torch.no_grad():
15
+ self.bert_scorer = BERTScorer(model_type=model_type,
16
+ num_layers=num_layers,
17
+ batch_size=64,
18
+ nthreads=4,
19
+ all_layers=False,
20
+ idf=idf,
21
+ device=None,
22
+ lang='en',
23
+ rescale_with_baseline=rescale_with_baseline,
24
+ baseline_path=None)
25
+
26
+ def forward(self, refs, hyps):
27
+ p, r, f = self.bert_scorer.score(
28
+ cands=hyps,
29
+ refs=refs,
30
+ verbose=False,
31
+ batch_size=64,
32
+ )
33
+ return torch.mean(f).item(), f.tolist()
34
+
35
+
36
+ if __name__ == '__main__':
37
+ x, y = (BertScore()(
38
+ hyps=[
39
+ "nothing to do lol",
40
+ "nothing to do x",
41
+ 'there are moderate bilateral pleural effusions with overlying atelectasis, underlying consolidation not excluded. mild prominence of the interstitial markings suggests mild pulmonary edema. the cardiac silhouette is mildly enlarged. the mediastinal contours are unremarkable. there is no evidence of pneumothorax.'
42
+ ],
43
+ refs=[
44
+ 'heart size is moderately enlarged. the mediastinal and hilar contours are unchanged. there is no pulmonary edema. small left pleural effusion is present. patchy opacities in the lung bases likely reflect atelectasis. no pneumothorax is seen. there are no acute osseous abnormalities.',
45
+ 'heart size is mildly enlarged. the mediastinal and hilar contours are normal. there is mild pulmonary edema. moderate bilateral pleural effusions are present, left greater than right. bibasilar airspace opacities likely reflect atelectasis. no pneumothorax is seen. there are no acute osseous abnormalities.',
46
+ 'heart size is mildly enlarged. the mediastinal and hilar contours are normal. there is mild pulmonary edema. moderate bilateral pleural effusions are present, left greater than right. bibasilar airspace opacities likely reflect atelectasis. no pneumothorax is seen. there are no acute osseous abnormalities.'
47
+ ])
48
+ )
49
+ print(x)
50
+ print(y)
nlg/bleu/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __author__ = 'tylin'
nlg/bleu/bleu.py ADDED
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env python
2
+ #
3
+ # File Name : bleu.py
4
+ #
5
+ # Description : Wrapper for BLEU scorer.
6
+ #
7
+ # Creation Date : 06-01-2015
8
+ # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
9
+ # Authors : Hao Fang <[email protected]> and Tsung-Yi Lin <[email protected]>
10
+
11
+ import torch.nn as nn
12
+ from .bleu_scorer import BleuScorer
13
+
14
+
15
+ class Bleu(nn.Module):
16
+ def __init__(self, n=4, **kwargs):
17
+ # by default, compute BLEU score up to n = 4
18
+ super().__init__()
19
+ self._n = n
20
+
21
+ def forward(self, gts, res):
22
+ return self.compute_score(gts, res)
23
+
24
+ def compute_score(self, gts, res):
25
+ res = {i: [v] for i, v in enumerate(res)}
26
+ gts = {i: [v] for i, v in enumerate(gts)}
27
+ bleu_scorer = BleuScorer(n=self._n)
28
+
29
+ for id in sorted(gts.keys()):
30
+ hypo = res[id]
31
+ ref = gts[id]
32
+
33
+ # Sanity check.
34
+ assert (type(hypo) is list)
35
+ assert (len(hypo) == 1)
36
+ assert (type(ref) is list)
37
+ assert (len(ref) >= 1)
38
+
39
+ bleu_scorer += (hypo[0], ref)
40
+
41
+ # score, scores = bleu_scorer.compute_score(option='shortest')
42
+ score, scores = bleu_scorer.compute_score(option='closest', verbose=0)
43
+ # score, scores = bleu_scorer.compute_score(option='average', verbose=1)
44
+
45
+ # return (bleu, bleu_info)
46
+ return score[self._n-1], scores[self._n-1]
47
+
48
+ def method(self):
49
+ return "Bleu"
nlg/bleu/bleu_scorer.py ADDED
@@ -0,0 +1,268 @@
+ #!/usr/bin/env python
+
+ # bleu_scorer.py
+ # David Chiang <[email protected]>
+
+ # Copyright (c) 2004-2006 University of Maryland. All rights
+ # reserved. Do not redistribute without permission from the
+ # author. Not for commercial use.
+
+ # Modified by:
+ # Hao Fang <[email protected]>
+ # Tsung-Yi Lin <[email protected]>
+
+ '''Provides:
+ cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
+ cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
+ '''
+
+ import copy
+ import sys, math, re
+ from collections import defaultdict
+
+ import six
+ from six.moves import xrange as range
+
+
+ def precook(s, n=4, out=False):
+     """Takes a string as input and returns an object that can be given to
+     either cook_refs or cook_test. This is optional: cook_refs and cook_test
+     can take string arguments as well."""
+     words = s.split()
+     counts = defaultdict(int)
+     for k in range(1,n+1):
+         for i in range(len(words)-k+1):
+             ngram = tuple(words[i:i+k])
+             counts[ngram] += 1
+     return (len(words), counts)
+
+ def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
+     '''Takes a list of reference sentences for a single segment
+     and returns an object that encapsulates everything that BLEU
+     needs to know about them.'''
+
+     reflen = []
+     maxcounts = {}
+     for ref in refs:
+         rl, counts = precook(ref, n)
+         reflen.append(rl)
+         for (ngram,count) in six.iteritems(counts):
+             maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
+
+     # Calculate effective reference sentence length.
+     if eff == "shortest":
+         reflen = min(reflen)
+     elif eff == "average":
+         reflen = float(sum(reflen))/len(reflen)
+
+     ## lhuang: N.B.: leave reflen computation to the very end!!
+
+     ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design)
+
+     return (reflen, maxcounts)
+
+ def cook_test(test, reflen_refmaxcounts, eff=None, n=4):
+     '''Takes a test sentence and returns an object that
+     encapsulates everything that BLEU needs to know about it.'''
+
+     reflen, refmaxcounts = reflen_refmaxcounts
+     testlen, counts = precook(test, n, True)
+
+     result = {}
+
+     # Calculate effective reference sentence length.
+
+     if eff == "closest":
+         result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1]
+     else: ## i.e., "average" or "shortest" or None
+         result["reflen"] = reflen
+
+     result["testlen"] = testlen
+
+     result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)]
+
+     result['correct'] = [0]*n
+     for (ngram, count) in six.iteritems(counts):
+         result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)
+
+     return result
+
+ class BleuScorer(object):
+     """Bleu scorer.
+     """
+
+     __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen"
+     # special_reflen is used in oracle (proportional effective ref len for a node).
+
+     def copy(self):
+         ''' copy the refs.'''
+         new = BleuScorer(n=self.n)
+         new.ctest = copy.copy(self.ctest)
+         new.crefs = copy.copy(self.crefs)
+         new._score = None
+         return new
+
+     def __init__(self, test=None, refs=None, n=4, special_reflen=None):
+         ''' singular instance '''
+
+         self.n = n
+         self.crefs = []
+         self.ctest = []
+         self.cook_append(test, refs)
+         self.special_reflen = special_reflen
+
+     def cook_append(self, test, refs):
+         '''called by constructor and __iadd__ to avoid creating new instances.'''
+
+         if refs is not None:
+             self.crefs.append(cook_refs(refs))
+             if test is not None:
+                 cooked_test = cook_test(test, self.crefs[-1])
+                 self.ctest.append(cooked_test) ## N.B.: -1
+             else:
+                 self.ctest.append(None) # lens of crefs and ctest have to match
+
+         self._score = None ## need to recompute
+
+     def ratio(self, option=None):
+         self.compute_score(option=option)
+         return self._ratio
+
+     def score_ratio(self, option=None):
+         '''return (bleu, len_ratio) pair'''
+         return (self.fscore(option=option), self.ratio(option=option))
+
+     def score_ratio_str(self, option=None):
+         return "%.4f (%.2f)" % self.score_ratio(option)
+
+     def reflen(self, option=None):
+         self.compute_score(option=option)
+         return self._reflen
+
+     def testlen(self, option=None):
+         self.compute_score(option=option)
+         return self._testlen
+
+     def retest(self, new_test):
+         if type(new_test) is str:
+             new_test = [new_test]
+         assert len(new_test) == len(self.crefs), new_test
+         self.ctest = []
+         for t, rs in zip(new_test, self.crefs):
+             self.ctest.append(cook_test(t, rs))
+         self._score = None
+
+         return self
+
+     def rescore(self, new_test):
+         ''' replace test(s) with new test(s), and returns the new score.'''
+
+         return self.retest(new_test).compute_score()
+
+     def size(self):
+         assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
+         return len(self.crefs)
+
+     def __iadd__(self, other):
+         '''add an instance (e.g., from another sentence).'''
+
+         if type(other) is tuple:
+             ## avoid creating new BleuScorer instances
+             self.cook_append(other[0], other[1])
+         else:
+             assert self.compatible(other), "incompatible BLEUs."
+             self.ctest.extend(other.ctest)
+             self.crefs.extend(other.crefs)
+             self._score = None ## need to recompute
+
+         return self
+
+     def compatible(self, other):
+         return isinstance(other, BleuScorer) and self.n == other.n
+
+     def single_reflen(self, option="average"):
+         return self._single_reflen(self.crefs[0][0], option)
+
+     def _single_reflen(self, reflens, option=None, testlen=None):
+
+         if option == "shortest":
+             reflen = min(reflens)
+         elif option == "average":
+             reflen = float(sum(reflens))/len(reflens)
+         elif option == "closest":
+             reflen = min((abs(l-testlen), l) for l in reflens)[1]
+         else:
+             assert False, "unsupported reflen option %s" % option
+
+         return reflen
+
+     def recompute_score(self, option=None, verbose=0):
+         self._score = None
+         return self.compute_score(option, verbose)
+
+     def compute_score(self, option=None, verbose=0):
+         n = self.n
+         small = 1e-9
+         tiny = 1e-15 ## so that if guess is 0 still return 0
+         bleu_list = [[] for _ in range(n)]
+
+         if self._score is not None:
+             return self._score
+
+         if option is None:
+             option = "average" if len(self.crefs) == 1 else "closest"
+
+         self._testlen = 0
+         self._reflen = 0
+         totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}
+
+         # for each sentence
+         for comps in self.ctest:
+             testlen = comps['testlen']
+             self._testlen += testlen
+
+             if self.special_reflen is None: ## need computation
+                 reflen = self._single_reflen(comps['reflen'], option, testlen)
+             else:
+                 reflen = self.special_reflen
+
+             self._reflen += reflen
+
+             for key in ['guess','correct']:
+                 for k in range(n):
+                     totalcomps[key][k] += comps[key][k]
+
+             # append per image bleu score
+             bleu = 1.
+             for k in range(n):
+                 bleu *= (float(comps['correct'][k]) + tiny) \
+                         /(float(comps['guess'][k]) + small)
+                 bleu_list[k].append(bleu ** (1./(k+1)))
+             ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
+             if ratio < 1:
+                 for k in range(n):
+                     bleu_list[k][-1] *= math.exp(1 - 1/ratio)
+
+             if verbose > 1:
+                 print(comps, reflen)
+
+         totalcomps['reflen'] = self._reflen
+         totalcomps['testlen'] = self._testlen
+
+         bleus = []
+         bleu = 1.
+         for k in range(n):
+             bleu *= float(totalcomps['correct'][k] + tiny) \
+                     / (totalcomps['guess'][k] + small)
+             bleus.append(bleu ** (1./(k+1)))
+         ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
+         if ratio < 1:
+             for k in range(n):
+                 bleus[k] *= math.exp(1 - 1/ratio)
+
+         if verbose > 0:
+             print(totalcomps)
+             print("ratio:", ratio)
+
+         self._score = bleus
+         return self._score, bleu_list
nlg/radevalbertscore.py ADDED
@@ -0,0 +1,53 @@
+ import torch
+ from bert_score import score
+
+ def _get_default_device():
+     return torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ class RadEvalBERTScorer:
+     """
+     Wrapper around bert_score for radiology reports using a custom BERT model.
+     """
+     def __init__(self,
+                  model_type: str = "IAMJB/RadEvalModernBERT",
+                  num_layers: int = None,
+                  use_fast_tokenizer: bool = True,
+                  rescale_with_baseline: bool = False,
+                  device: torch.device = None):
+         self.model_type = model_type
+         self.num_layers = num_layers
+         self.use_fast_tokenizer = use_fast_tokenizer
+         self.rescale_with_baseline = rescale_with_baseline
+         self.device = device or _get_default_device()
+
+     def score(self, refs: list[str], hyps: list[str]) -> tuple:
+         """
+         Compute BERTScore F1 between reference and hypothesis texts.
+
+         Args:
+             refs: list of reference sentences.
+             hyps: list of hypothesis sentences (predictions).
+
+         Returns:
+             Tuple of (mean F1 score as a float, per-pair F1 tensor).
+         """
+         # bert_score expects cands (hypotheses) first, then refs
+         P, R, F1 = score(
+             cands=hyps,
+             refs=refs,
+             model_type=self.model_type,
+             num_layers=self.num_layers,
+             use_fast_tokenizer=self.use_fast_tokenizer,
+             rescale_with_baseline=self.rescale_with_baseline,
+             device=self.device
+         )
+         # Return the mean F1 over all pairs
+         return F1.mean().item(), F1
+
+ if __name__ == "__main__":
+     # Example usage
+     refs = ["Chronic mild to moderate cardiomegaly and pulmonary venous hypertension."]
+     hyps = ["Mild left basal atelectasis; no pneumonia."]
+     scorer = RadEvalBERTScorer(num_layers=23)
+     f1_mean, f1_scores = scorer.score(refs, hyps)
+     print(f"Mean F1 score: {f1_mean:.4f}")
nlg/rouge/rouge.py ADDED
@@ -0,0 +1,37 @@
+ import torch.nn as nn
+ from rouge_score import rouge_scorer
+ from six.moves import zip_longest
+ import numpy as np
+
+
+ class Rouge(nn.Module):
+     def __init__(self, rouges, **kwargs):
+         super().__init__()
+         rouges = [r.replace('rougel', 'rougeL') for r in rouges]
+         self.scorer = rouge_scorer.RougeScorer(rouges, use_stemmer=True)
+         self.rouges = rouges
+
+     def forward(self, refs, hyps):
+         scores = []
+         for target_rec, prediction_rec in zip_longest(refs, hyps):
+             if target_rec is None or prediction_rec is None:
+                 raise ValueError("Must have equal number of lines across target and "
+                                  "prediction.")
+             scores.append(self.scorer.score(target_rec, prediction_rec))
+         f1_rouge = [s[self.rouges[0]].fmeasure for s in scores]
+         return np.mean(f1_rouge), f1_rouge
+
+
+ class Rouge1(Rouge):
+     def __init__(self, **kwargs):
+         super(Rouge1, self).__init__(rouges=['rouge1'])
+
+
+ class Rouge2(Rouge):
+     def __init__(self, **kwargs):
+         super(Rouge2, self).__init__(rouges=['rouge2'])
+
+
+ class RougeL(Rouge):
+     def __init__(self, **kwargs):
+         super(RougeL, self).__init__(rouges=['rougeL'])
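As with the BLEU wrapper, a short illustrative call (each subclass scores one ROUGE variant and returns the mean F1 together with the per-pair list):

```python
# Illustrative usage of the ROUGE wrappers above; the reports are made-up examples.
from nlg.rouge.rouge import Rouge1, Rouge2, RougeL

refs = ["no acute cardiopulmonary process.", "small left pleural effusion."]
hyps = ["no acute cardiopulmonary abnormality.", "small left pleural effusion."]

for scorer in (Rouge1(), Rouge2(), RougeL()):
    mean_f1, per_pair = scorer(refs, hyps)
    print(type(scorer).__name__, round(mean_f1, 4), [round(s, 4) for s in per_pair])
```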
utils.py ADDED
@@ -0,0 +1,341 @@
+ # ---------------------------------------------------------------
+ # This file includes code adapted from:
+ # https://github.com/jbdel/RadEval/blob/null-hypothesis/utils.py
+ # Original author: Justin Xu
+ # ---------------------------------------------------------------
+ import re
+ import nltk
+ import random
+ from typing import List, Dict, Tuple, Callable, Optional
+ from collections import defaultdict
+
+ nltk.download("punkt_tab", quiet=True)
+
+ def clean_numbered_list(text):
+     """
+     Clean a report if it's a numbered list by:
+     1. Adding proper spacing between numbered items
+     2. Removing the numbered list markers
+     3. Adding spaces after periods between sentences
+     """
+     # First, separate numbered items that are stuck together without spaces
+     # Example: "text1.2. text2" -> "text1. 2. text2"
+     text = re.sub(r'\.(\d+\.)', r'. \1', text)
+
+     # Handle patterns where there's no period between numbered entries
+     # Example: "1. item1 2. item2" -> "1. item1. 2. item2"
+     text = re.sub(r'(\d+\.\s*[^.]+?)\s+(?=\d+\.)', r'\1. ', text)
+
+     # Then remove the numbered list markers
+     # But avoid removing decimal numbers in measurements like "3.5 cm"
+     text = re.sub(r'(?<!\d)\d+\.\s*', '', text)
+
+     # Add spaces after periods between sentences if missing
+     # Example: "sentence1.sentence2" -> "sentence1. sentence2"
+     # But don't split decimal numbers like "3.5 cm"
+     text = re.sub(r'\.([A-Za-z])', r'. \1', text)
+     return nltk.sent_tokenize(text)
+
+ class PairedTest:
+     """
+     Paired significance testing for comparing radiology report generation systems.
+
+     Supports paired approximate randomization (AR).
+     """
+
+     def __init__(self,
+                  systems: Dict[str, List[str]],
+                  metrics: Dict[str, Callable],
+                  references: Optional[List[str]],
+                  n_samples: int = 10000,
+                  n_jobs: int = 1,
+                  seed: int = 12345):
+         """
+         Args:
+             systems: Dictionary mapping system names to their generated reports
+             metrics: Dictionary mapping metric names to metric functions
+             references: List of reference reports
+             n_samples: Number of resampling trials (default: 10000)
+             n_jobs: Number of parallel jobs (default: 1)
+             seed: Random seed for reproducibility
+         """
+         self.systems = systems
+         self.metrics = metrics
+         self.references = references
+         self.n_samples = n_samples
+         self.n_jobs = n_jobs
+         self.seed = seed
+
+         random.seed(seed)
+
+         if not systems:
+             raise ValueError("At least one system is required")
+
+         system_lengths = [len(outputs) for outputs in systems.values()]
+         if len(set(system_lengths)) > 1:
+             raise ValueError("All systems must have the same number of outputs")
+
+         if references and len(references) != system_lengths[0]:
+             raise ValueError("References must have same length as system outputs")
+
+         self.n_instances = system_lengths[0]
+
+     def __call__(self) -> Tuple[Dict[str, str], Dict[str, Dict[str, float]]]:
+         """
+         Run the paired significance test.
+
+         Returns:
+             Tuple of (signatures, scores) where:
+             - signatures: Dict mapping metric names to signature strings
+             - scores: Dict mapping system names to metric scores and p-values
+         """
+         # Calculate baseline scores for all systems and metrics
+         baseline_scores = self._calculate_baseline_scores()
+
+         # Get baseline system (first system)
+         baseline_name = list(self.systems.keys())[0]
+
+         scores = {}
+         signatures = {}
+
+         # Calculate scores and p-values for each system
+         for system_name in self.systems.keys():
+             scores[system_name] = {}
+
+             for metric_name in self.metrics.keys():
+                 score = baseline_scores[system_name][metric_name]
+                 scores[system_name][metric_name] = score
+
+                 if system_name != baseline_name:
+                     p_value = self._calculate_p_value(
+                         baseline_name, system_name, metric_name, baseline_scores
+                     )
+                     scores[system_name][f'{metric_name}_pvalue'] = p_value
+
+         for metric_name in self.metrics.keys():
+             signatures[metric_name] = f"{metric_name}|{'ar'}:{self.n_samples}|seed:{self.seed}"
+
+         return signatures, scores
+
+     def _calculate_baseline_scores(self) -> Dict[str, Dict[str, float]]:
+         """Calculate baseline scores for all systems and metrics."""
+         scores = defaultdict(dict)
+
+         for system_name, outputs in self.systems.items():
+             for metric_name, metric_func in self.metrics.items():
+                 if self.references:
+                     score = metric_func(outputs, self.references)
+                 else:
+                     score = metric_func(outputs)
+
+                 if isinstance(score, dict):
+                     if 'score' in score:
+                         scores[system_name][metric_name] = score['score']
+                     else:
+                         scores[system_name][metric_name] = list(score.values())[0]
+                 elif isinstance(score, (tuple, list)):
+                     scores[system_name][metric_name] = score[0]
+                 else:
+                     scores[system_name][metric_name] = score
+
+         return scores
+
+     def _calculate_p_value(self,
+                            baseline_name: str,
+                            system_name: str,
+                            metric_name: str,
+                            baseline_scores: Dict[str, Dict[str, float]]) -> float:
+         """Calculate p-value using AR test"""
+
+         baseline_outputs = self.systems[baseline_name]
+         system_outputs = self.systems[system_name]
+         metric_func = self.metrics[metric_name]
+
+         baseline_score = baseline_scores[baseline_name][metric_name]
+         system_score = baseline_scores[system_name][metric_name]
+         original_delta = abs(system_score - baseline_score)
+
+         return self._approximate_randomization_test(
+             baseline_outputs, system_outputs, metric_func, original_delta
+         )
+
+     def _approximate_randomization_test(self,
+                                         baseline_outputs: List[str],
+                                         system_outputs: List[str],
+                                         metric_func: Callable,
+                                         original_delta: float) -> float:
+         """
+         Perform AR test.
+
+         For each trial, randomly swap outputs between systems and calculate
+         the score difference. P-value is the proportion of trials where
+         the randomized delta >= original delta.
+         """
+         count_greater = 0
+
+         for _ in range(self.n_samples):
+             randomized_baseline = []
+             randomized_system = []
+
+             for i in range(self.n_instances):
+                 if random.random() < 0.5:
+                     # Don't swap
+                     randomized_baseline.append(baseline_outputs[i])
+                     randomized_system.append(system_outputs[i])
+                 else:
+                     # Swap
+                     randomized_baseline.append(system_outputs[i])
+                     randomized_system.append(baseline_outputs[i])
+
+             if self.references:
+                 rand_baseline_score = metric_func(randomized_baseline, self.references)
+                 rand_system_score = metric_func(randomized_system, self.references)
+             else:
+                 rand_baseline_score = metric_func(randomized_baseline)
+                 rand_system_score = metric_func(randomized_system)
+
+             if isinstance(rand_baseline_score, dict):
+                 rand_baseline_score = rand_baseline_score.get('score', list(rand_baseline_score.values())[0])
+             elif isinstance(rand_baseline_score, (tuple, list)):
+                 rand_baseline_score = rand_baseline_score[0]
+
+             if isinstance(rand_system_score, dict):
+                 rand_system_score = rand_system_score.get('score', list(rand_system_score.values())[0])
+             elif isinstance(rand_system_score, (tuple, list)):
+                 rand_system_score = rand_system_score[0]
+
+             rand_delta = abs(rand_system_score - rand_baseline_score)
+
+             if rand_delta >= original_delta:
+                 count_greater += 1
+
+         return count_greater / self.n_samples
+
+
+ def print_significance_results(scores: Dict[str, Dict[str, float]],
+                                signatures: Dict[str, str],
+                                baseline_name: str,
+                                significance_level: float = 0.05):
+     """
+     Args:
+         scores: Dictionary of system scores and p-values
+         signatures: Dictionary of metric signatures
+         baseline_name: Name of the baseline system
+         significance_level: Significance threshold (default: 0.05)
+     """
+     assert baseline_name in scores, f"Baseline system '{baseline_name}' not found in scores."
+
+     metric_names = [name for name in signatures.keys()]
+     system_names = list(scores.keys())
+
+     print("=" * 80)
+     print("PAIRED SIGNIFICANCE TEST RESULTS")
+     print("=" * 80)
+
+     header = f"{'System':<40}"
+     for metric in metric_names:
+         header += f"{metric:>15}"
+     print(header)
+     print("-" * len(header))
+
+     baseline_row = f"Baseline: {baseline_name:<32}"
+     for metric in metric_names:
+         score = scores[baseline_name][metric]
+         baseline_row += f"{score:>12.4f} "
+     print(baseline_row)
+     print("-" * len(header))
+
+     for system_name in system_names:
+         if system_name == baseline_name:
+             continue
+
+         system_row = f"{system_name:<40}"
+         for metric in metric_names:
+             score = scores[system_name].get(metric, 0.0)
+             if isinstance(score, float):
+                 system_row += f"{score:>12.4f} "
+             else:
+                 system_row += f"{str(score):>12} "
+         print(system_row)
+
+         # P-value row
+         pvalue_row = " " * 40
+         for metric in metric_names:
+             pvalue_key = f"{metric}_pvalue"
+             if pvalue_key in scores[system_name]:
+                 p_val = scores[system_name][pvalue_key]
+                 significance_marker = "*" if p_val < significance_level else ""
+                 pvalue_row += f"(p={p_val:.4f}){significance_marker:<2}".rjust(15)
+             else:
+                 pvalue_row += " " * 15
+         print(pvalue_row)
+         print("-" * len(header))
+
+     # Footer
+     print(f"- Significance level: {significance_level}")
+     print("- '*' indicates significant difference (p < significance level)")
+     print("- Null hypothesis: systems are essentially the same")
+     print("- Significant results suggest systems are meaningfully different\n")
+
+     print("METRIC SIGNATURES:")
+     for metric, signature in signatures.items():
+         print(f"- {metric}: {signature}")
+
+
+ def compare_systems(systems: Dict[str, List[str]],
+                     metrics: Dict[str, Callable],
+                     references: Optional[List[str]] = None,
+                     n_samples: int = 10000,
+                     significance_level: float = 0.05,
+                     seed: int = 12345,
+                     print_results: bool = True) -> Tuple[Dict[str, str], Dict[str, Dict[str, float]]]:
+     """
+     Args:
+         systems: Dictionary mapping system names to their generated reports
+         metrics: Dictionary mapping metric names to metric functions
+         references: Optional list of reference reports
+         n_samples: Number of resampling trials
+         significance_level: Significance threshold for printing results
+         seed: Random seed for reproducibility
+         print_results: Whether to print formatted results
+
+     Returns:
+         Tuple of (signatures, scores)
+
+     Example:
+         ```python
+         systems = {
+             'baseline_model': baseline_reports,
+             'new_model': new_model_reports,
+             'other_model': other_model_reports
+         }
+
+         metrics = {
+             'bleu': lambda hyp, ref: bleu_score(hyp, ref),
+             'rouge': lambda hyp, ref: rouge_score(hyp, ref),
+             'bertscore': lambda hyp, ref: bert_score(hyp, ref),
+             'custom_metric': lambda hyp, ref: custom_metric(hyp, ref)
+         }
+
+         signatures, scores = compare_systems(
+             systems, metrics, references,
+             n_samples=10000
+         )
+         ```
+     """
+
+     paired_test = PairedTest(
+         systems=systems,
+         metrics=metrics,
+         references=references,
+         n_samples=n_samples,
+         seed=seed
+     )
+
+     signatures, scores = paired_test()
+
+     if print_results:
+         baseline_name = list(systems.keys())[0]
+         print_significance_results(scores, signatures, baseline_name, significance_level)
+
+     return signatures, scores
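A self-contained sketch of how `compare_systems` might be driven end to end, using a toy token-overlap metric so the example runs without any model downloads (the metric and the report strings are illustrative, not part of RadEval):

```python
# Illustrative end-to-end use of compare_systems with a toy corpus-level metric.
from utils import compare_systems

def token_overlap(hyps, refs):
    # Fraction of reference tokens recovered by each hypothesis, averaged over reports.
    scores = []
    for hyp, ref in zip(hyps, refs):
        hyp_tokens, ref_tokens = set(hyp.split()), set(ref.split())
        scores.append(len(hyp_tokens & ref_tokens) / max(len(ref_tokens), 1))
    return sum(scores) / len(scores)

references = ["no acute cardiopulmonary process .",
              "small left pleural effusion ."]
systems = {
    "baseline_model": ["no acute process .", "no pleural effusion ."],
    "new_model": ["no acute cardiopulmonary process .", "small left pleural effusion ."],
}
metrics = {"token_overlap": token_overlap}

signatures, scores = compare_systems(systems, metrics, references,
                                     n_samples=100, print_results=True)
```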