import gradio as gr
from tokenizer import sanskrit_token_preprocessor, bpeAlgo, create_vocab, save_paired_tokens_vocab, save_vocab, encode, decode, load_paired_tokens_vocab, load_vocab

# Notes:
# - Created a `TokenizerModel` class to maintain state between calls
# - Added a web interface with separate tabs for training and tokenization
#   (the Train tab is currently commented out; the Tokenize tab loads a
#   pre-trained vocabulary from data/ on first use)
# - Included example texts and documentation
# - Made the interface more user-friendly with clear labels and instructions
# - Added JSON output for better visualization of results
#
# The app provides two main functionalities:
# 1. Training the tokenizer on custom text
# 2. Tokenizing new text using the trained model

class TokenizerModel:
    def __init__(self, vocab_size=5000):
        self.vocab_size = vocab_size
        self.paired_tokens_vocab = None
        self.vocab = None

    def train(self, text):
        # Preprocess: split into tokens, then convert each token to its raw UTF-8 byte sequence
        tokens = sanskrit_token_preprocessor(text)
        tokens = [list(tok.encode('utf-8')) for tok in tokens]
        orig_tokens_len = len([tok for toks in tokens for tok in toks])

        # Train BPE: learn (vocab_size - 256) merges on top of the 256 base byte tokens
        try:
            self.paired_tokens_vocab, encoded_tokens = bpeAlgo(tokens, self.vocab_size - 256, 256)
            self.vocab = create_vocab(self.paired_tokens_vocab)
        except ValueError as e:
            # Surface the training error in the UI
            return f"Error during training: {str(e)}"

        # save_paired_tokens_vocab("data/paired_tokens.bpe", self.paired_tokens_vocab)
        # save_vocab("data/vocab.bpe", self.vocab)

        # Calculate compression ratio (guard against empty input)
        compression_ratio = orig_tokens_len / len(encoded_tokens) if encoded_tokens else 0.0
        return f"Training completed! Vocabulary size: {len(self.vocab)} and Compression Ratio: {compression_ratio:.2f}"

    def tokenize(self, text):
        if self.paired_tokens_vocab is None:
            # try:
            #     with open("data/paired_tokens.bpe", "r") as f:
            #         self.paired_tokens_vocab = eval(f.read())
            #     with open("data/vocab.bpe", "r") as f:
            #         self.vocab = eval(f.read())
            # except FileNotFoundError as fe:
            #     return {"Error": f"Please train the tokenizer first or ensure vocabulary files exist! {fe}"}
            # except SyntaxError as se:
            #     return {"Error": f"Please train the tokenizer first or ensure vocabulary files exist! {se}"}
            # except Exception as e:
            #     return {"Error": str(e)}
            try:
                # Load the pre-trained merge table and vocabulary from disk
                self.paired_tokens_vocab = load_paired_tokens_vocab("data/paired_tokens.bpe")
                self.vocab = load_vocab("data/vocab.bpe")
            except Exception as e:
                print(e)
                return {"Error": str(e)}
            # return {"Error": "Please train it first to create vocab. Training samples can be found in the data dir"}

        encoded = encode(text, self.paired_tokens_vocab)
        decoded = decode(encoded, self.vocab)

        return {
            "Encoded tokens": encoded,
            "Decoded text": decoded,
            "Matches original": decoded == text
        }

# Create global tokenizer instance
tokenizer = TokenizerModel()

def train_tokenizer(text):
    return tokenizer.train(text)

def process_text(text):
    return tokenizer.tokenize(text)

# Create the Gradio interface
with gr.Blocks(title="Sanskrit BPE Tokenizer") as demo:
    gr.Markdown("# Sanskrit BPE Tokenizer")
    gr.Markdown("This tokenizer implements Byte-Pair Encoding (BPE) for Sanskrit text.")

    # with gr.Tab("Train"):
    #     train_input = gr.Textbox(
    #         label="Training Text",
    #         placeholder="Enter Sanskrit text for training...",
    #         lines=5
    #     )
    #     train_button = gr.Button("Train Tokenizer")
    #     train_output = gr.Textbox(label="Training Result")
    #     train_button.click(
    #         train_tokenizer,
    #         inputs=train_input,
    #         outputs=train_output
    #     )

    with gr.Tab("Tokenize"):
        text_input = gr.Textbox(
            label="Input Text",
            placeholder="Enter Sanskrit text to tokenize...",
            lines=3
        )
        tokenize_button = gr.Button("Tokenize")
        result_output = gr.JSON(label="Results")
        tokenize_button.click(
            process_text,
            inputs=text_input,
            outputs=result_output
        )

    gr.Markdown(
        """
### Example texts:
```
चीराण्यपास्याज्जनकस्य कन्या नेयं प्रतिज्ञा मम दत्तपूर्वा।
यथासुखं गच्छतु राजपुत्री वनं समग्रा सह सर्वरत्नैः॥
```
"""
    )

if __name__ == "__main__":
    demo.launch()
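
# --- Usage sketch (illustrative only, kept commented out like the other
# disabled code above) ---
# A minimal example of driving the tokenizer directly from Python instead of
# the Gradio UI, assuming the pre-trained vocab files exist under data/ as
# loaded in TokenizerModel.tokenize:
#
#   model = TokenizerModel()
#   result = model.tokenize("चीराण्यपास्याज्जनकस्य कन्या नेयं प्रतिज्ञा मम दत्तपूर्वा।")
#   print(result["Encoded tokens"])
#   print(result["Matches original"])  # True if encode/decode round-trips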