Spaces:
Running
on
Zero
Running
on
Zero
| import argparse | |
| import json | |
| import string | |
| from pathlib import Path | |
| from jiwer import cer, wer | |
def normalize_text(text: str) -> str:
    """
    Lowercase a string, remove ASCII punctuation and collapse whitespace.

    Args:
        text (str): Input string
    Returns:
        str: Normalized string; words are separated by single spaces and
            there is no leading or trailing whitespace
    """
    # Lowercase
    text = text.lower()
    # Remove punctuation (only the ASCII characters in string.punctuation)
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Deleting a free-standing mark such as " - " leaves a run of spaces
    # behind, and tabs/newlines survive the steps above. Re-join on single
    # spaces so leftover gaps cannot count as differences between transcripts.
    return " ".join(text.split())
def load_jsonl_dict(path) -> dict:
    """
    Load a JSONL transcript file into a dictionary keyed by file base name.

    Every non-blank line must be a JSON object with a "file" and a
    "transcript" entry. Only the final path component of "file" is used as
    the key, so entries that share a base name overwrite each other (the
    last one wins).

    Args:
        path: Path to the JSONL file
    Returns:
        dict: Mapping from file base name to transcript
    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON
        KeyError: If a record lacks the "file" or "transcript" key
    """
    transcripts = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # are not records; json.loads would reject them.
            if not line.strip():
                continue
            data = json.loads(line)
            transcripts[Path(data["file"]).name] = data["transcript"]
    return transcripts
def main(args):
    """
    Score hypothesis transcripts against reference transcripts.

    Only files that appear in both JSONL inputs are compared. The character
    error rate, word error rate and number of evaluated files are printed.

    Args:
        args: Parsed arguments providing `reference` and `hypothesis` paths
    """
    references = load_jsonl_dict(args.reference)
    hypotheses = load_jsonl_dict(args.hypothesis)

    shared = references.keys() & hypotheses.keys()
    if len(shared) == 0:
        print("No common files between reference and hypothesis.")
        return

    # Walk the shared names once, in sorted order, so that each reference
    # stays aligned with its hypothesis.
    ref_texts = []
    hyp_texts = []
    for name in sorted(shared):
        ref_texts.append(normalize_text(references[name]))
        hyp_texts.append(normalize_text(hypotheses[name]))

    # Compute both scores before printing anything.
    cer_score = cer(ref_texts, hyp_texts)
    wer_score = wer(ref_texts, hyp_texts)

    print(f"CER: {cer_score:.3%}")
    print(f"WER: {wer_score:.3%}")
    print(f"Evaluated on {len(shared)} files.")
if __name__ == "__main__":
    # Command-line entry point: both options are mandatory string paths.
    cli = argparse.ArgumentParser()
    for flag, help_text in (
        ("--reference", "Path to reference JSONL"),
        ("--hypothesis", "Path to hypothesis JSONL"),
    ):
        cli.add_argument(flag, type=str, required=True, help=help_text)
    args = cli.parse_args()
    main(args)