mac committed on
Commit
b6b76b9
·
1 Parent(s): 47301d1

Add interactive model analyzer script

Browse files

- Include spacy_model_analyzer.py for model evaluation and testing
- Add installation instructions for streamlit and pandas dependencies
- Enable interactive analysis with real-time entity recognition
- Provide detailed metrics calculation and visualization tools

Author: Asep Muhamad <[email protected]>

Files changed (2) hide show
  1. README.md +4 -0
  2. spacy_model_analyzer.py +145 -0
README.md CHANGED
@@ -140,6 +140,10 @@ The model was evaluated on 2,987 examples from the training data with the follow
140
  You can reproduce these metrics using the included analyzer script:
141
 
142
  ```bash
 
 
 
 
143
  streamlit run spacy_model_analyzer.py
144
  ```
145
 
 
140
  You can reproduce these metrics using the included analyzer script:
141
 
142
  ```bash
143
+ # Install required dependencies
144
+ pip install streamlit pandas
145
+
146
+ # Run the analyzer
147
  streamlit run spacy_model_analyzer.py
148
  ```
149
 
spacy_model_analyzer.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import spacy
3
+ from spacy.training import Example
4
+ from spacy.scorer import Scorer
5
+ import re
6
+ import pandas as pd
7
+
8
+ # Load the NER model
9
@st.cache_resource
def load_model(path):
    """Load the spaCy pipeline stored at ``path``.

    The function is wrapped in ``st.cache_resource`` so that Streamlit can
    reuse the loaded pipeline instead of calling ``spacy.load`` again.

    Args:
        path: Location passed straight through to ``spacy.load``.

    Returns:
        The object returned by ``spacy.load(path)``.
    """
    pipeline = spacy.load(path)
    return pipeline
12
+
13
def parse_training_data(file_path):
    """Parse inline-tagged training text into lowercased NER examples.

    Each non-blank line of the file may contain entities written as
    ``<LABEL>entity text</LABEL>`` (``LABEL`` being one or more uppercase
    ASCII letters).  The tags are stripped, the remaining text is
    lowercased, and the character span of every entity inside that
    lowercased text is recorded.

    Args:
        file_path: Path of a UTF-8 encoded text file, one example per line.

    Returns:
        A list of dicts ``{"text": str, "ents": [(start, end, label), ...]}``
        where ``start``/``end`` index into ``text`` and ``ents`` is sorted
        and free of duplicates.  Blank lines and lines whose text is empty
        after tag removal are omitted.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    tag_pattern = re.compile(r'<([A-Z]+)>(.*?)</\1>')
    examples = []

    # Stream the file line by line instead of loading every line up front.
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            pieces = []   # untagged fragments, in order, original case
            ents = []
            cursor = 0    # length of the *lowercased* text emitted so far
            last_end = 0  # position in ``line`` just past the previous tag

            for match in tag_pattern.finditer(line):
                before = line[last_end:match.start()]
                entity_text = match.group(2)
                entity_label = match.group(1)

                # Offsets must refer to the lowercased text that is stored
                # below.  ``str.lower()`` can change a string's length
                # (e.g. U+0130 becomes two code points), so measure the
                # lowercased fragments rather than the originals.
                start_char = cursor + len(before.lower())
                end_char = start_char + len(entity_text.lower())
                ents.append((start_char, end_char, entity_label))

                pieces.append(before)
                pieces.append(entity_text)
                cursor = end_char
                last_end = match.end()

            # Whatever follows the last tag (or the whole line if untagged).
            pieces.append(line[last_end:])

            # Lowercase the joined text in one go so the stored text is
            # exactly what lowercasing the full tag-free line produces.
            text = "".join(pieces).lower()
            if text:
                examples.append({"text": text, "ents": sorted(set(ents))})

    return examples
54
+
55
def evaluate_model(nlp, examples):
    """Score the pipeline's predictions against gold entity annotations.

    For every record a gold ``Example`` is built from the tokenized text and
    its entity offsets, the pipeline is run on the predicted side, and the
    prediction is paired with the gold reference.  Records that raise
    ``ValueError`` while being prepared are reported with ``st.warning`` and
    left out.  A summary of evaluated/skipped counts is shown via
    ``st.info``.

    Args:
        nlp: Loaded spaCy pipeline used for tokenizing and predicting.
        examples: Iterable of dicts with a ``"text"`` string and an
            ``"ents"`` list of ``(start, end, label)`` tuples.

    Returns:
        The result of ``Scorer.score`` over all usable examples, or an empty
        dict when no example could be prepared.
    """
    scorer = Scorer()
    example_list = []
    skipped_examples = 0

    for ex in examples:
        try:
            gold = Example.from_dict(
                nlp.make_doc(ex["text"]),
                {"entities": ex["ents"]},
            )
            # Run the full pipeline on the predicted doc, then pair the
            # prediction with the gold reference for scoring.
            prediction = nlp(gold.predicted)
            example_list.append(Example(prediction, gold.reference))
        except ValueError as e:
            # This will catch alignment errors
            st.warning(f"Skipping an example due to alignment issues: {ex['text'][:50]}... Error: {e}")
            skipped_examples += 1

    st.info(f"Total examples evaluated: {len(example_list)}. Skipped: {skipped_examples}.")

    if example_list:
        return scorer.score(example_list)
    return {}
82
+
83
# The pipeline is loaded from the current working directory.
nlp = load_model(".")

# --- Page title and model summary -------------------------------------------
st.title("Indonesian NER SpaCy Model Analyzer")

st.header("Model Information")
st.write(f"**Language:** {nlp.lang}")
st.write(f"**Pipeline:** {', '.join(nlp.pipe_names)}")
st.write(f"**Labels:** {', '.join(nlp.get_pipe('ner').labels)}")

# --- Evaluation against the tagged training file ----------------------------
st.header("Model Evaluation")
evaluation_data = parse_training_data('../data_training.txt')
scores = evaluate_model(nlp, evaluation_data)

if not scores:
    st.write("Could not calculate scores. Please check the training data format.")
else:
    # Overall precision / recall / F1 as a single-row table.
    st.subheader("Overall Scores")
    overall_row = {
        "Precision": scores.get("ents_p", 0),
        "Recall": scores.get("ents_r", 0),
        "F1-score": scores.get("ents_f", 0),
    }
    st.table(pd.DataFrame([overall_row]))

    # One row per entity label.
    st.subheader("Scores per Entity")
    entity_rows = [
        {
            "Entity": label,
            "Precision": metrics.get("p", 0),
            "Recall": metrics.get("r", 0),
            "F1-score": metrics.get("f", 0),
        }
        for label, metrics in scores.get("ents_per_type", {}).items()
    ]

    if not entity_rows:
        st.write("No per-entity scores available.")
    else:
        # Sort by label first so the running number in '#' is stable.
        entity_table = pd.DataFrame(entity_rows).sort_values(by="Entity", ascending=True)
        entity_table.insert(0, '#', range(1, len(entity_table) + 1))
        st.table(entity_table)


# --- Interactive analysis of user-supplied text -----------------------------
st.header("Analyze Text")
text_input = st.text_area("Enter text to analyze:", "Presiden Joko Widodo mengunjungi Jakarta hari ini.")

if text_input:
    # Input is lowercased before prediction, matching the lowercased
    # evaluation text produced by parse_training_data.
    doc = nlp(text_input.lower())

    st.header("NER Visualization")
    rendered = spacy.displacy.render(doc, style="ent", jupyter=False)
    st.html(rendered)

    st.header("Named Entities")
    found_entities = [(ent.text, ent.label_) for ent in doc.ents]
    if not found_entities:
        st.write("No entities found.")
    else:
        st.table(found_entities)