Commit 43a6ebc
Parent(s): 2cd2c96
Removed the training option from the HuggingFace Space; the app now reads the pre-trained vocab from disk instead.
Files changed:
- app.py (+21 -15)
- data/paired_tokens.bpe (+1 -2)
- data/vocab.bpe (+0 -0)
- playground.ipynb (+0 -0)
- tests/test_tokenizer.py (+1 -1)
- tokenizer.py (+18 -0)
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from tokenizer import sanskrit_token_preprocessor, bpeAlgo, create_vocab, save_paired_tokens_vocab, save_vocab, encode, decode
+from tokenizer import sanskrit_token_preprocessor, bpeAlgo, create_vocab, save_paired_tokens_vocab, save_vocab, encode, decode, load_paired_tokens_vocab, load_vocab
 
 # Created a `TokenizerModel` class to maintain state between calls
 # - Added a web interface with separate tabs for training and tokenization
@@ -52,7 +52,13 @@ class TokenizerModel:
         # return {"Error":f"Please train the tokenizer first or ensure vocabulary files exist! {se}"}
         # except Exception as e:
         # return {"Error": str(e)}
-
+        try:
+            self.paired_tokens_vocab = load_paired_tokens_vocab("data/paired_tokens.bpe")
+            self.vocab = load_vocab("data/vocab.bpe")
+        except Exception as e:
+            print(e)
+            return {"Error": str(e)}
+        # return {"Error": "Please train it first to create vocab. Training samples can be found in the data dir"}
 
         encoded = encode(text, self.paired_tokens_vocab)
         decoded = decode(encoded, self.vocab)
@@ -77,20 +83,20 @@ with gr.Blocks(title="Sanskrit BPE Tokenizer") as demo:
     gr.Markdown("# Sanskrit BPE Tokenizer")
     gr.Markdown("This tokenizer implements Byte-Pair Encoding (BPE) for Sanskrit text.")
 
-    with gr.Tab("Train"):
-        train_input = gr.Textbox(
-            label="Training Text",
-            placeholder="Enter Sanskrit text for training...",
-            lines=5
-        )
-        train_button = gr.Button("Train Tokenizer")
-        train_output = gr.Textbox(label="Training Result")
+    # with gr.Tab("Train"):
+    #     train_input = gr.Textbox(
+    #         label="Training Text",
+    #         placeholder="Enter Sanskrit text for training...",
+    #         lines=5
+    #     )
+    #     train_button = gr.Button("Train Tokenizer")
+    #     train_output = gr.Textbox(label="Training Result")
 
-        train_button.click(
-            train_tokenizer,
-            inputs=train_input,
-            outputs=train_output
-        )
+    #     train_button.click(
+    #         train_tokenizer,
+    #         inputs=train_input,
+    #         outputs=train_output
+    #     )
 
     with gr.Tab("Tokenize"):
         text_input = gr.Textbox(
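Outside the Gradio callback, the new code path boils down to: load the two vocab files once, then pass them to `encode`/`decode`. A minimal sketch of that path, assuming the repo layout from this commit; the Sanskrit sample string is only illustrative:

```python
# Mirrors the try/except block added above, minus the Gradio plumbing.
from tokenizer import load_paired_tokens_vocab, load_vocab, encode, decode

paired_tokens_vocab = load_paired_tokens_vocab("data/paired_tokens.bpe")
vocab = load_vocab("data/vocab.bpe")

text = "संस्कृतम्"                       # illustrative input
token_ids = encode(text, paired_tokens_vocab)
print(token_ids)                         # BPE token ids
print(decode(token_ids, vocab))          # should round-trip back to `text`
```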
data/paired_tokens.bpe CHANGED
@@ -1,4 +1,3 @@
-Token version 1
 (224, 164):256
 (224, 165):257
 (257, 141):258
@@ -4742,4 +4741,4 @@ Token version 1
 (384, 442):4996
 (558, 4159):4997
 (1766, 360):4998
-(973, 334):4999
+(973, 334):4999
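Each line of this file records one BPE merge, `(left_id, right_id):new_id`, with merged ids starting at 256 because ids 0-255 are reserved for raw bytes. The `Token version 1` header is gone, so every line now parses uniformly (the final identical `-`/`+` pair typically just reflects a trailing-newline change). Note that the very first merges pair byte 224 (0xE0) with 164/165 (0xA4/0xA5): exactly the two-byte UTF-8 prefixes shared by all Devanagari codepoints, so these pairs dominate Sanskrit byte streams. A quick sanity check, standard library only:

```python
# Devanagari (U+0900-U+097F) encodes in UTF-8 as E0 A4 xx / E0 A5 xx,
# which is why (224, 164) and (224, 165) are the first merges learned.
first_char = "स".encode("utf-8")          # b'\xe0\xa4\xb8'
assert first_char[:2] == bytes([224, 164])
print([hex(b) for b in first_char])       # ['0xe0', '0xa4', '0xb8']
```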
data/vocab.bpe CHANGED
The diff for this file is too large to render. See raw diff.

playground.ipynb CHANGED
The diff for this file is too large to render. See raw diff.
tests/test_tokenizer.py CHANGED
@@ -1,5 +1,5 @@
 import pytest
-from
+from tokenizer import sanskrit_token_preprocessor, bpeAlgo, create_vocab, encode, decode
 
 @pytest.fixture
 def sample_text():
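This fixes a previously broken import line (a bare `from` with nothing after it). With the loaders added below in tokenizer.py, a round-trip test against the shipped vocab files also becomes possible. A hypothetical sketch, not part of this commit, using only signatures visible in the diff (`encode(text, paired_tokens_vocab)` and `decode(ids, vocab)`):

```python
# Hypothetical addition: round-trip the pre-trained vocab shipped in data/.
# Assumes decode(encode(x)) is lossless for valid Sanskrit input.
from tokenizer import load_paired_tokens_vocab, load_vocab, encode, decode

def test_roundtrip_pretrained(sample_text):
    paired_tokens_vocab = load_paired_tokens_vocab("data/paired_tokens.bpe")
    vocab = load_vocab("data/vocab.bpe")
    assert decode(encode(sample_text, paired_tokens_vocab), vocab) == sample_text
```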
tokenizer.py CHANGED
@@ -102,6 +102,24 @@ def save_vocab(filepath, vocab):
         f.write('Token version 1\n')
         for k,v in vocab.items():
             f.write(f"{k}:{v}\n")
+
+def load_paired_tokens_vocab(filepath):
+    paired_tokens = {}
+    with open(filepath, 'r') as f:
+        for line in f:
+            [k, v] = line.split(":")
+            paired_tokens[k] = v
+    return paired_tokens
+
+def load_vocab(filepath):
+    vocab = {}
+    with open(filepath, 'r') as f:
+        for line in f:
+            [k, v] = line.split("<::>")
+            vocab[int(k)] = eval(v)
+    return vocab
+
+
 
 
 
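Two caveats are worth flagging in these new loaders as committed: `load_paired_tokens_vocab` leaves keys as the literal strings `"(224, 164)"` and values as strings with a trailing newline (whether that matches what `encode` expects depends on its implementation, which this diff does not show), and `load_vocab` calls `eval` on file contents, which would execute arbitrary Python if the vocab file were tampered with. There is also an apparent format mismatch: `save_vocab` still writes `key:value` lines under a `Token version 1` header while `load_vocab` splits on `<::>`, presumably because the shipped `data/vocab.bpe` (whose diff is too large to render) already uses the `<::>` format. A stricter sketch, assuming tuple-of-int merge keys and literal-encoded vocab values:

```python
import ast

# Illustrative hardened loaders, not the committed versions.
def load_paired_tokens_vocab_strict(filepath):
    paired_tokens = {}
    with open(filepath, 'r') as f:
        for line in f:
            # rsplit keeps the "(a, b)" key intact even if a ':' ever appears in it
            k, v = line.rsplit(":", 1)
            paired_tokens[ast.literal_eval(k)] = int(v)   # e.g. (224, 164) -> 256
    return paired_tokens

def load_vocab_strict(filepath):
    vocab = {}
    with open(filepath, 'r') as f:
        for line in f:
            k, v = line.split("<::>", 1)
            # literal_eval parses the stored literal but never executes code
            vocab[int(k)] = ast.literal_eval(v)
    return vocab
```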