import re

import pandas as pd
import spacy
import streamlit as st
from spacy.scorer import Scorer
from spacy.training import Example


# Load the trained NER pipeline once and cache it across Streamlit reruns.
@st.cache_resource
def load_model(path):
    return spacy.load(path)


def parse_training_data(file_path):
    """Parse inline-tagged lines into texts plus character-offset entities."""
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    examples = []
    # Inline annotations of the form <LABEL>entity text</LABEL>; labels are
    # assumed to be uppercase ASCII letters with a matching closing tag.
    tag_pattern = re.compile(r'<([A-Z]+)>(.*?)</\1>')

    for line in lines:
        line = line.strip()
        if not line:
            continue

        text = ""
        ents = []
        last_end = 0

        for match in tag_pattern.finditer(line):
            # Add the plain text before the current tag
            text += line[last_end:match.start()]

            # The entity text and its label
            entity_text = match.group(2)
            entity_label = match.group(1)

            start_char = len(text)
            text += entity_text
            end_char = len(text)

            ents.append((start_char, end_char, entity_label))
            last_end = match.end()

        # Add any remaining text after the last tag
        text += line[last_end:]

        if text:
            # Remove duplicates and sort by start offset
            unique_ents = sorted(set(ents))
            examples.append({"text": text.lower(), "ents": unique_ents})

    return examples


def evaluate_model(nlp, examples):
    """Score the pipeline against gold entities given as character offsets."""
    scorer = Scorer()
    example_list = []
    skipped_examples = 0

    for ex in examples:
        try:
            # Run the pipeline on the raw text and pair the prediction with the
            # gold annotations; Example.from_dict aligns the character offsets
            # to the predicted Doc's tokenization.
            pred_doc = nlp(ex["text"])
            example = Example.from_dict(pred_doc, {"entities": ex["ents"]})
            example_list.append(example)
        except ValueError as e:
            # Raised for annotations that cannot be reconciled with the
            # tokenization, e.g. overlapping or conflicting spans.
            st.warning(
                f"Skipping an example due to alignment issues: "
                f"{ex['text'][:50]}... Error: {e}"
            )
            skipped_examples += 1
            continue

    st.info(f"Total examples evaluated: {len(example_list)}. Skipped: {skipped_examples}.")

    if not example_list:
        return {}

    return scorer.score(example_list)


nlp = load_model(".")

# Streamlit app
st.title("Indonesian NER spaCy Model Analyzer")

st.header("Model Information")
st.write(f"**Language:** {nlp.lang}")
st.write(f"**Pipeline:** {', '.join(nlp.pipe_names)}")
st.write(f"**Labels:** {', '.join(nlp.get_pipe('ner').labels)}")

st.header("Model Evaluation")
# Note: the evaluation re-uses the training file, so the scores measure how
# well the model fits its own training data, not how well it generalizes.
evaluation_data = parse_training_data('../data_training.txt')
scores = evaluate_model(nlp, evaluation_data)

if scores:
    # Overall scores
    st.subheader("Overall Scores")
    overall_scores = {
        "Precision": scores.get("ents_p", 0),
        "Recall": scores.get("ents_r", 0),
        "F1-score": scores.get("ents_f", 0),
    }
    st.table(pd.DataFrame([overall_scores]))

    # Scores per entity type
    st.subheader("Scores per Entity")
    per_entity_scores = []
    for label, metrics in scores.get("ents_per_type", {}).items():
        per_entity_scores.append({
            "Entity": label,
            "Precision": metrics.get("p", 0),
            "Recall": metrics.get("r", 0),
            "F1-score": metrics.get("f", 0),
        })
    if per_entity_scores:
        df_scores = pd.DataFrame(per_entity_scores)
        # Sort alphabetically for a consistent order before numbering
        df_scores = df_scores.sort_values(by="Entity", ascending=True)
        # Add a running ID column
        df_scores.insert(0, '#', range(1, 1 + len(df_scores)))
        st.table(df_scores)
    else:
        st.write("No per-entity scores available.")
else:
    st.write("Could not calculate scores. Please check the training data format.")

st.header("Analyze Text")
text_input = st.text_area("Enter text to analyze:", "Presiden Joko Widodo mengunjungi Jakarta hari ini.")

if text_input:
    # Lowercase the input to match the preprocessing applied to the training data.
    doc = nlp(text_input.lower())

    st.header("NER Visualization")
    html = spacy.displacy.render(doc, style="ent", jupyter=False)
    # st.html needs a recent Streamlit release; on older versions,
    # st.markdown(html, unsafe_allow_html=True) is the usual fallback.
    st.html(html)

    st.header("Named Entities")
    ents = [(ent.text, ent.label_) for ent in doc.ents]
    if ents:
        st.table(pd.DataFrame(ents, columns=["Text", "Label"]))
    else:
        st.write("No entities found.")
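
# ---------------------------------------------------------------------------
# Data format and usage sketch (assumptions, not confirmed by the pipeline
# package): parse_training_data() above expects one sentence per line in
# ../data_training.txt, annotated with inline tags and matching closing tags,
# for example:
#
#   Presiden <PER>Joko Widodo</PER> mengunjungi <LOC>Jakarta</LOC> hari ini.
#
# which it converts into the lowercased text plus character offsets:
#
#   {"text": "presiden joko widodo mengunjungi jakarta hari ini.",
#    "ents": [(9, 20, "PER"), (33, 40, "LOC")]}
#
# The label names (PER, LOC, ...) are illustrative only; the real label set is
# whatever nlp.get_pipe("ner").labels reports in the app header. To launch the
# app from the directory containing the trained pipeline:
#
#   streamlit run <this_file>.py
# ---------------------------------------------------------------------------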