"""Compute CER and WER between reference and hypothesis JSONL transcripts."""

import argparse
import json
import string
from pathlib import Path

from jiwer import cer, wer


def normalize_text(text: str) -> str:
    """
    Lowercase and remove punctuation from a string.

    Only the ASCII characters in ``string.punctuation`` are removed;
    whitespace and every other character are left untouched.

    Args:
        text (str): Input string

    Returns:
        str: Normalized string
    """
    # str.translate with a deletion table strips all punctuation in one pass.
    return text.lower().translate(str.maketrans("", "", string.punctuation))


def load_jsonl_dict(path) -> dict:
    """
    Load a JSONL transcript file into a ``{file name: transcript}`` mapping.

    Each non-blank line must be a JSON object with a ``"file"`` and a
    ``"transcript"`` key. Entries are keyed by the final path component of
    ``"file"``, so two entries sharing a base name collapse into one and the
    later line wins.

    Args:
        path: Path of the JSONL file to read (UTF-8).

    Returns:
        dict: Mapping from file base name to transcript.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``"file"`` or ``"transcript"`` key.
    """
    transcripts = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. extra trailing newlines) are not records;
            # feeding them to json.loads would raise.
            if not line.strip():
                continue
            data = json.loads(line)
            transcripts[Path(data["file"]).name] = data["transcript"]
    return transcripts


def main(args):
    """
    Print CER and WER for the files present in both input JSONL files.

    Args:
        args: Parsed command-line arguments providing ``reference`` and
            ``hypothesis``, the paths of the two JSONL files.
    """
    ref_dict = load_jsonl_dict(args.reference)
    hyp_dict = load_jsonl_dict(args.hypothesis)

    # Sort once so references and hypotheses are paired in the same order.
    common_files = sorted(ref_dict.keys() & hyp_dict.keys())
    if not common_files:
        print("No common files between reference and hypothesis.")
        return

    refs = [normalize_text(ref_dict[name]) for name in common_files]
    hyps = [normalize_text(hyp_dict[name]) for name in common_files]

    cer_score = cer(refs, hyps)
    wer_score = wer(refs, hyps)

    print(f"CER: {cer_score:.3%}")
    print(f"WER: {wer_score:.3%}")
    print(f"Evaluated on {len(common_files)} files.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--reference", type=str, required=True, help="Path to reference JSONL"
    )
    parser.add_argument(
        "--hypothesis", type=str, required=True, help="Path to hypothesis JSONL"
    )
    args = parser.parse_args()
    main(args)