|
import streamlit as st |
|
import spacy |
|
from spacy.training import Example |
|
from spacy.scorer import Scorer |
|
import re |
|
import pandas as pd |
|
|
|
|
|
@st.cache_resource
def load_model(path):
    """Load a spaCy pipeline from *path*, cached across Streamlit reruns."""
    pipeline = spacy.load(path)
    return pipeline
|
|
|
def parse_training_data(file_path):
    """Parse inline-tagged NER training data into spaCy-style examples.

    Each non-empty line of *file_path* may contain annotations of the form
    ``<LABEL>entity text</LABEL>`` (label restricted to uppercase ASCII
    letters). Tags are stripped and converted to character-offset spans.

    Args:
        file_path: Path to a UTF-8 text file, one annotated sentence per line.

    Returns:
        list[dict]: One dict per non-empty line with keys:
            - "text": the lowercased, tag-free sentence.
            - "ents": sorted, de-duplicated list of (start, end, label)
              tuples whose offsets refer to the tag-free text.

    Raises:
        OSError: If the file cannot be opened.
    """
    examples = []
    # <LABEL>...</LABEL>; the backreference \1 forces the closing tag to
    # carry the same label, and the non-greedy body keeps adjacent tags on
    # one line from being merged into a single match.
    tag_pattern = re.compile(r'<([A-Z]+)>(.*?)</\1>')

    # Stream line by line instead of materializing the file with
    # readlines() -- identical output, lower peak memory on large files.
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            text = ""
            ents = []
            last_end = 0

            for match in tag_pattern.finditer(line):
                # Copy the untagged text preceding this entity.
                text += line[last_end:match.start()]

                entity_label = match.group(1)
                entity_text = match.group(2)

                # Offsets are relative to the rebuilt (tag-free) text.
                start_char = len(text)
                text += entity_text
                end_char = len(text)

                ents.append((start_char, end_char, entity_label))
                last_end = match.end()

            # Trailing text after the last tag (the whole line if untagged).
            text += line[last_end:]

            if text:
                # Drop duplicate spans; sorting orders them by start offset.
                unique_ents = sorted(set(ents))
                # NOTE: offsets were computed on the original-case text;
                # str.lower() preserves length for the expected input, so
                # the spans stay valid after lowercasing.
                examples.append({"text": text.lower(), "ents": unique_ents})

    return examples
|
|
|
def evaluate_model(nlp, examples):
    """Score *nlp* against gold annotations, reporting progress via Streamlit.

    Args:
        nlp: A loaded spaCy pipeline.
        examples: Dicts with "text" and "ents" keys, as produced by
            parse_training_data().

    Returns:
        dict: spaCy Scorer output, or an empty dict when no example could
        be aligned.
    """
    aligned = []
    skipped = 0

    for ex in examples:
        try:
            # Build the gold reference on a fresh tokenization of the text;
            # Example.from_dict aligns the character spans to tokens.
            gold = Example.from_dict(
                nlp.make_doc(ex["text"]), {"entities": ex["ents"]}
            )
            # Run the full pipeline for predictions, then pair the predicted
            # doc with the gold reference for scoring.
            predicted = nlp(gold.predicted)
            aligned.append(Example(predicted, gold.reference))
        except ValueError as e:
            # Spans that don't align to token boundaries raise ValueError;
            # report and skip rather than aborting the whole evaluation.
            st.warning(f"Skipping an example due to alignment issues: {ex['text'][:50]}... Error: {e}")
            skipped += 1
            continue

    st.info(f"Total examples evaluated: {len(aligned)}. Skipped: {skipped}.")
    if not aligned:
        return {}
    return Scorer().score(aligned)
|
|
|
# --- App setup -------------------------------------------------------------
# Load the trained spaCy pipeline from the current working directory.
# NOTE(review): assumes the app is launched from the model's directory --
# confirm against the deployment layout.
nlp = load_model(".")


st.title("Indonesian NER SpaCy Model Analyzer")

# Show basic pipeline metadata so the user can confirm the right model loaded.
st.header("Model Information")
st.write(f"**Language:** {nlp.lang}")
st.write(f"**Pipeline:** {', '.join(nlp.pipe_names)}")
st.write(f"**Labels:** {', '.join(nlp.get_pipe('ner').labels)}")

# Evaluate the model against the annotated data file.
# NOTE(review): the path is relative to the process CWD and unguarded -- a
# missing file raises and stops the app; confirm this is intended.
st.header("Model Evaluation")
evaluation_data = parse_training_data('../data_training.txt')
scores = evaluate_model(nlp, evaluation_data)
|
|
|
# Render the evaluation results, or an explanatory message when scoring failed.
if not scores:
    st.write("Could not calculate scores. Please check the training data format.")
else:
    # Aggregate precision/recall/F1 over all entity types.
    st.subheader("Overall Scores")
    summary_row = {
        "Precision": scores.get("ents_p", 0),
        "Recall": scores.get("ents_r", 0),
        "F1-score": scores.get("ents_f", 0),
    }
    st.table(pd.DataFrame([summary_row]))

    # Per-label breakdown, sorted alphabetically with a 1-based row index.
    st.subheader("Scores per Entity")
    breakdown = [
        {
            "Entity": entity_label,
            "Precision": entity_metrics.get("p", 0),
            "Recall": entity_metrics.get("r", 0),
            "F1-score": entity_metrics.get("f", 0),
        }
        for entity_label, entity_metrics in scores.get("ents_per_type", {}).items()
    ]

    if breakdown:
        entity_table = pd.DataFrame(breakdown).sort_values(by="Entity", ascending=True)
        entity_table.insert(0, '#', range(1, 1 + len(entity_table)))
        st.table(entity_table)
    else:
        st.write("No per-entity scores available.")
|
|
|
|
|
# Interactive playground: run the pipeline on arbitrary user-supplied text.
st.header("Analyze Text")
user_text = st.text_area("Enter text to analyze:", "Presiden Joko Widodo mengunjungi Jakarta hari ini.")

if user_text:
    # Lowercase to match the lowercasing applied in parse_training_data().
    analyzed = nlp(user_text.lower())

    # Inline entity highlighting rendered by displaCy.
    st.header("NER Visualization")
    rendered = spacy.displacy.render(analyzed, style="ent", jupyter=False)
    st.html(rendered)

    # Plain (text, label) listing of the detected entities.
    st.header("Named Entities")
    found = [(span.text, span.label_) for span in analyzed.ents]
    if not found:
        st.write("No entities found.")
    else:
        st.table(found)