gitesh-grover committed on
Commit
43a6ebc
·
1 Parent(s): 2cd2c96

Removed the training option from the HuggingFace app; now reading the pre-trained vocab instead

Browse files
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from tokenizer import sanskrit_token_preprocessor, bpeAlgo, create_vocab, save_paired_tokens_vocab, save_vocab, encode, decode
3
 
4
  # Created a `TokenizerModel` class to maintain state between calls
5
  # - Added a web interface with separate tabs for training and tokenization
@@ -52,7 +52,13 @@ class TokenizerModel:
52
  # return {"Error":f"Please train the tokenizer first or ensure vocabulary files exist! {se}"}
53
  # except Exception as e:
54
  # return {"Error": str(e)}
55
- return {"Error": "Please train it first to create vocab. Training smaples can be found in the data dir"}
 
 
 
 
 
 
56
 
57
  encoded = encode(text, self.paired_tokens_vocab)
58
  decoded = decode(encoded, self.vocab)
@@ -77,20 +83,20 @@ with gr.Blocks(title="Sanskrit BPE Tokenizer") as demo:
77
  gr.Markdown("# Sanskrit BPE Tokenizer")
78
  gr.Markdown("This tokenizer implements Byte-Pair Encoding (BPE) for Sanskrit text.")
79
 
80
- with gr.Tab("Train"):
81
- train_input = gr.Textbox(
82
- label="Training Text",
83
- placeholder="Enter Sanskrit text for training...",
84
- lines=5
85
- )
86
- train_button = gr.Button("Train Tokenizer")
87
- train_output = gr.Textbox(label="Training Result")
88
 
89
- train_button.click(
90
- train_tokenizer,
91
- inputs=train_input,
92
- outputs=train_output
93
- )
94
 
95
  with gr.Tab("Tokenize"):
96
  text_input = gr.Textbox(
 
1
  import gradio as gr
2
+ from tokenizer import sanskrit_token_preprocessor, bpeAlgo, create_vocab, save_paired_tokens_vocab, save_vocab, encode, decode, load_paired_tokens_vocab, load_vocab
3
 
4
  # Created a `TokenizerModel` class to maintain state between calls
5
  # - Added a web interface with separate tabs for training and tokenization
 
52
  # return {"Error":f"Please train the tokenizer first or ensure vocabulary files exist! {se}"}
53
  # except Exception as e:
54
  # return {"Error": str(e)}
55
+ try:
56
+ self.paired_tokens_vocab = load_paired_tokens_vocab("data/paired_tokens.bpe")
57
+ self.vocab = load_vocab("data/vocab.bpe")
58
+ except Exception as e:
59
+ print(e)
60
+ return {"Error": str(e)}
61
+ # return {"Error": "Please train it first to create vocab. Training smaples can be found in the data dir"}
62
 
63
  encoded = encode(text, self.paired_tokens_vocab)
64
  decoded = decode(encoded, self.vocab)
 
83
  gr.Markdown("# Sanskrit BPE Tokenizer")
84
  gr.Markdown("This tokenizer implements Byte-Pair Encoding (BPE) for Sanskrit text.")
85
 
86
+ # with gr.Tab("Train"):
87
+ # train_input = gr.Textbox(
88
+ # label="Training Text",
89
+ # placeholder="Enter Sanskrit text for training...",
90
+ # lines=5
91
+ # )
92
+ # train_button = gr.Button("Train Tokenizer")
93
+ # train_output = gr.Textbox(label="Training Result")
94
 
95
+ # train_button.click(
96
+ # train_tokenizer,
97
+ # inputs=train_input,
98
+ # outputs=train_output
99
+ # )
100
 
101
  with gr.Tab("Tokenize"):
102
  text_input = gr.Textbox(
data/paired_tokens.bpe CHANGED
@@ -1,4 +1,3 @@
1
- Token version 1
2
  (224, 164):256
3
  (224, 165):257
4
  (257, 141):258
@@ -4742,4 +4741,4 @@ Token version 1
4742
  (384, 442):4996
4743
  (558, 4159):4997
4744
  (1766, 360):4998
4745
- (973, 334):4999
 
 
1
  (224, 164):256
2
  (224, 165):257
3
  (257, 141):258
 
4741
  (384, 442):4996
4742
  (558, 4159):4997
4743
  (1766, 360):4998
4744
+ (973, 334):4999
data/vocab.bpe CHANGED
The diff for this file is too large to render. See raw diff
 
playground.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
tests/test_tokenizer.py CHANGED
@@ -1,5 +1,5 @@
1
  import pytest
2
- from src.tokenizer import sanskrit_token_preprocessor, bpeAlgo, create_vocab, encode, decode
3
 
4
  @pytest.fixture
5
  def sample_text():
 
1
  import pytest
2
+ from tokenizer import sanskrit_token_preprocessor, bpeAlgo, create_vocab, encode, decode
3
 
4
  @pytest.fixture
5
  def sample_text():
tokenizer.py CHANGED
@@ -102,6 +102,24 @@ def save_vocab(filepath, vocab):
102
  f.write('Token version 1\n')
103
  for k,v in vocab.items():
104
  f.write(f"{k}:{v}\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
 
107
 
 
102
  f.write('Token version 1\n')
103
  for k,v in vocab.items():
104
  f.write(f"{k}:{v}\n")
105
+
106
def load_paired_tokens_vocab(filepath):
    """Load the BPE merge table from *filepath*.

    Each data line looks like ``(224, 164):256`` — a byte-pair tuple,
    a colon, and the merged token id (see data/paired_tokens.bpe).
    The previous implementation kept both sides as raw strings (the
    value even retained its trailing newline, e.g. ``"256\\n"``), so
    the loaded dict never round-tripped with the tuple/int pairs the
    saver writes. Keys are now parsed back into tuples of ints and
    values into ints.

    Args:
        filepath: path to the paired-tokens vocab file.

    Returns:
        dict mapping ``(int, int)`` byte-pair tuples to merged token ids.
    """
    import ast  # local import: the file's top-level import block is not in view

    paired_tokens = {}
    with open(filepath, 'r') as f:
        for line in f:
            line = line.strip()
            # Tolerate blank lines and any header line (e.g. "Token
            # version 1") that carries no key:value separator.
            if not line or ':' not in line:
                continue
            # rsplit from the right: the tuple key in the data file
            # never contains ':', but this direction is safe regardless.
            k, v = line.rsplit(':', 1)
            paired_tokens[ast.literal_eval(k)] = int(v)
    return paired_tokens
113
+
114
def load_vocab(filepath):
    """Load the ``token id -> token value`` vocab from *filepath*.

    Each data line is expected to look like ``256<::>(224, 164)``:
    an int id, the ``<::>`` separator, then a Python literal for the
    token value. Two fixes over the original:

    * Lines without the ``<::>`` separator (e.g. a ``Token version 1``
      header, or blank lines) are skipped instead of raising a
      ``ValueError`` on tuple unpacking.
    * The value is parsed with ``ast.literal_eval`` instead of
      ``eval``; the result is identical for well-formed files, but a
      tampered file can no longer execute arbitrary code.

    Args:
        filepath: path to the vocab file.

    Returns:
        dict mapping int token ids to their literal token values.
    """
    import ast  # local import: the file's top-level import block is not in view

    vocab = {}
    with open(filepath, 'r') as f:
        for line in f:
            line = line.strip()
            if '<::>' not in line:
                continue  # header / blank line — nothing to parse
            k, v = line.split('<::>', 1)
            vocab[int(k)] = ast.literal_eval(v)
    return vocab
121
+
122
+
123
 
124
 
125