ner-spacy-indonesian / spacy_model_analyzer.py

mac

Add interactive model analyzer script

b6b76b9 about 1 month ago

4.42 kB

	import streamlit as st
	import spacy
	from spacy.training import Example
	from spacy.scorer import Scorer
	import re
	import pandas as pd

	# Load the NER model
	@st.cache_resource
	def load_model(path):
	    """Load and return the spaCy pipeline stored at ``path``.

	    The function is wrapped in ``st.cache_resource``; presumably this keeps
	    the loaded pipeline alive across Streamlit reruns so it is not reloaded
	    from disk each time (confirm against the Streamlit version in use).

	    Args:
	        path: Value passed straight to ``spacy.load``.

	    Returns:
	        Whatever ``spacy.load(path)`` returns.
	    """
	    pipeline = spacy.load(path)
	    return pipeline

	def parse_training_data(file_path):
	    """Parse an inline-tagged text file into lowercased NER examples.

	    Each non-blank line may contain entities written as
	    ``<LABEL>entity text</LABEL>``, where ``LABEL`` consists of the letters
	    ``A-Z``. The tags are removed, the remaining text is lowercased, and the
	    character span of every entity inside that lowercased text is recorded.

	    Args:
	        file_path: Path of a UTF-8 encoded text file with one example per line.

	    Returns:
	        A list of dicts, one per line that yields non-empty text, of the form
	        ``{"text": <lowercased text>, "ents": [(start, end, label), ...]}``.
	        ``ents`` is de-duplicated and sorted.

	    Raises:
	        OSError: If the file cannot be opened.
	        UnicodeDecodeError: If the file is not valid UTF-8.
	    """
	    tag_pattern = re.compile(r'<([A-Z]+)>(.*?)</\1>')
	    examples = []

	    with open(file_path, 'r', encoding='utf-8') as f:
	        # Stream the file line by line instead of loading it all at once.
	        for raw_line in f:
	            line = raw_line.strip()
	            if not line:
	                continue

	            pieces = []      # lowercased fragments of the untagged text
	            text_len = 0     # length of "".join(pieces) so far
	            ents = set()
	            last_end = 0

	            for match in tag_pattern.finditer(line):
	                # Lowercase every fragment BEFORE measuring it. Lowercasing
	                # can change a string's length (e.g. U+0130 becomes two code
	                # points), so offsets measured on the original-case text
	                # would not line up with the lowercased text we return.
	                before = line[last_end:match.start()].lower()
	                entity_text = match.group(2).lower()

	                start_char = text_len + len(before)
	                end_char = start_char + len(entity_text)
	                ents.add((start_char, end_char, match.group(1)))

	                pieces.append(before)
	                pieces.append(entity_text)
	                text_len = end_char
	                last_end = match.end()

	            # Text that follows the last tag (or the whole line if untagged).
	            pieces.append(line[last_end:].lower())
	            text = "".join(pieces)

	            if text:
	                examples.append({"text": text, "ents": sorted(ents)})

	    return examples

	def evaluate_model(nlp, examples):
	    """Score ``nlp`` against gold-annotated examples.

	    For every example a gold ``Example`` is built from the tokenized text and
	    the gold entity offsets, the pipeline is run on the predicted side, and
	    the prediction is paired with the gold reference. Examples that raise
	    ``ValueError`` are reported with ``st.warning`` and skipped. A summary of
	    evaluated and skipped counts is shown with ``st.info``.

	    Args:
	        nlp: Pipeline object providing ``make_doc`` and being callable on a doc.
	        examples: Iterable of dicts with ``"text"`` and ``"ents"`` keys.

	    Returns:
	        The result of ``Scorer.score`` over the usable examples, or ``{}``
	        when no example could be used.
	    """
	    scorer = Scorer()
	    example_list = []
	    skipped_examples = 0

	    for ex in examples:
	        try:
	            gold = Example.from_dict(
	                nlp.make_doc(ex["text"]),
	                {"entities": ex["ents"]},
	            )
	            prediction = nlp(gold.predicted)
	            paired = Example(prediction, gold.reference)
	        except ValueError as e:
	            # Presumably raised by spaCy when offsets cannot be aligned.
	            st.warning(f"Skipping an example due to alignment issues: {ex['text'][:50]}... Error: {e}")
	            skipped_examples += 1
	        else:
	            example_list.append(paired)

	    st.info(f"Total examples evaluated: {len(example_list)}. Skipped: {skipped_examples}.")

	    if example_list:
	        return scorer.score(example_list)
	    return {}

	# Load the pipeline from the current directory.
	nlp = load_model(".")

	# ---- Page title and model summary ----
	st.title("Indonesian NER SpaCy Model Analyzer")

	st.header("Model Information")
	st.write(f"Language: {nlp.lang}")
	st.write(f"Pipeline: {', '.join(nlp.pipe_names)}")
	st.write(f"Labels: {', '.join(nlp.get_pipe('ner').labels)}")

	# ---- Evaluation against the tagged data file ----
	# NOTE(review): this runs at module level, so it is presumably recomputed on
	# every Streamlit rerun — confirm whether caching is wanted here.
	st.header("Model Evaluation")
	evaluation_data = parse_training_data('../data_training.txt')
	scores = evaluate_model(nlp, evaluation_data)

	if not scores:
	    st.write("Could not calculate scores. Please check the training data format.")
	else:
	    # Aggregate precision / recall / F1 over all entity types.
	    st.subheader("Overall Scores")
	    overall_scores = {
	        "Precision": scores.get("ents_p", 0),
	        "Recall": scores.get("ents_r", 0),
	        "F1-score": scores.get("ents_f", 0),
	    }
	    st.table(pd.DataFrame([overall_scores]))

	    # One row per entity label.
	    st.subheader("Scores per Entity")
	    per_entity_scores = [
	        {
	            "Entity": label,
	            "Precision": metrics.get("p", 0),
	            "Recall": metrics.get("r", 0),
	            "F1-score": metrics.get("f", 0),
	        }
	        for label, metrics in scores.get("ents_per_type", {}).items()
	    ]

	    if not per_entity_scores:
	        st.write("No per-entity scores available.")
	    else:
	        # Sort by label first so the running number is stable between runs.
	        df_scores = pd.DataFrame(per_entity_scores).sort_values(by="Entity", ascending=True)
	        df_scores.insert(0, '#', range(1, len(df_scores) + 1))
	        st.table(df_scores)


	# ---- Free-text analysis ----
	st.header("Analyze Text")
	text_input = st.text_area("Enter text to analyze:", "Presiden Joko Widodo mengunjungi Jakarta hari ini.")

	if text_input:
	    # The input is lowercased before it is passed to the pipeline.
	    doc = nlp(text_input.lower())

	    st.header("NER Visualization")
	    html = spacy.displacy.render(doc, style="ent", jupyter=False)
	    st.html(html)

	    st.header("Named Entities")
	    ents = [(ent.text, ent.label_) for ent in doc.ents]
	    if not ents:
	        st.write("No entities found.")
	    else:
	        st.table(ents)