pardi-speech/codec/scripts/compute_wer_from_refs.py
Mehdi Lakbar
Initial demo of Lina-speech (pardi-speech)
56cfa73
import argparse
import json
import string
from pathlib import Path
from jiwer import cer, wer
def normalize_text(text: str) -> str:
    """
    Return ``text`` in lowercase with ASCII punctuation stripped.

    Only the characters listed in ``string.punctuation`` are removed;
    whitespace and any non-ASCII symbols are left exactly as they are.

    Args:
        text (str): Input string

    Returns:
        str: Normalized string
    """
    # Mapping a character to None in a translation table deletes it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return text.lower().translate(drop_punctuation)
def load_jsonl_dict(path) -> dict:
    """
    Load a JSONL transcript file into a ``{file name: transcript}`` dict.

    Each non-blank line must be a JSON object with a ``"file"`` and a
    ``"transcript"`` entry. Only the final path component of ``"file"`` is
    used as the key, so directory prefixes are ignored. If the same file
    name occurs more than once, the last occurrence wins.

    Args:
        path: Path to the JSONL file (read as UTF-8).

    Returns:
        dict: Mapping from file name to its transcript.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``"file"`` or ``"transcript"`` key.
    """
    transcripts = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (typically a trailing empty line) are not valid
            # JSON and would make json.loads raise; skip them instead.
            if not line.strip():
                continue
            data = json.loads(line)
            transcripts[Path(data["file"]).name] = data["transcript"]
    return transcripts
def main(args):
    """
    Print corpus-level CER and WER for the files present in both inputs.

    Reads ``args.reference`` and ``args.hypothesis`` with
    ``load_jsonl_dict``, keeps only the file names found in both, normalizes
    each transcript with ``normalize_text`` and passes the paired lists to
    jiwer's ``cer`` and ``wer``. If no file name is shared, a message is
    printed and nothing is scored.

    Args:
        args: Object with ``reference`` and ``hypothesis`` attributes, each
            a path to a JSONL file.
    """
    references = load_jsonl_dict(args.reference)
    hypotheses = load_jsonl_dict(args.hypothesis)

    shared_names = references.keys() & hypotheses.keys()
    if len(shared_names) == 0:
        print("No common files between reference and hypothesis.")
        return

    # One sorted pass keeps reference/hypothesis pairs aligned by position.
    ordered_names = sorted(shared_names)
    refs = list(map(normalize_text, (references[name] for name in ordered_names)))
    hyps = list(map(normalize_text, (hypotheses[name] for name in ordered_names)))

    cer_score = cer(refs, hyps)
    wer_score = wer(refs, hyps)

    print(f"CER: {cer_score:.3%}")
    print(f"WER: {wer_score:.3%}")
    print(f"Evaluated on {len(shared_names)} files.")
if __name__ == "__main__":
    # Command-line entry point: both options are required string paths.
    cli = argparse.ArgumentParser()
    for flag, description in (
        ("--reference", "Path to reference JSONL"),
        ("--hypothesis", "Path to hypothesis JSONL"),
    ):
        cli.add_argument(flag, type=str, required=True, help=description)
    main(cli.parse_args())