peeyushsinghal committed
Commit 10f8413 · verified · 1 Parent(s): 0a0cd27

included hindi tokenizer files

Files changed (4):
  1. app.py +197 -0
  2. config_app.yml +4 -0
  3. requirements.txt +86 -0
  4. tokenizer.py +212 -0
app.py ADDED
@@ -0,0 +1,197 @@
+ import gradio as gr
+ from tokenizer import Tokenizer, load_config
+ import json
+ import html
+
+ # Load the tokenizer
+ config = load_config("config_app.yml")
+ tokenizer = Tokenizer.load(config["tokenizer_file_path"])
+ tokenizer.config = config
+
+ def highlight_tokens(text: str, encoded_tokens: list) -> str:
+     """
+     Create HTML with highlighted tokens in the text.
+
+     Args:
+         text (str): The original input text to be tokenized.
+         encoded_tokens (list): A list of encoded token IDs.
+
+     Returns:
+         str: HTML string with highlighted tokens and tooltips showing token IDs.
+     """
+     decoded_tokens = []
+     current_pos = 0
+     html_text = ""
+
+     # Decode each token and create spans with different colors
+     for i, token in enumerate(encoded_tokens):
+         token_bytes = tokenizer.decode([token])
+         decoded_tokens.append(token_bytes)
+
+         # Find the token in the original text
+         token_pos = text.find(token_bytes, current_pos)
+         if token_pos != -1:
+             # Add any skipped text
+             if token_pos > current_pos:
+                 html_text += html.escape(text[current_pos:token_pos])
+
+             # Add the highlighted token with improved tooltip
+             color = f"hsl({(i * 60) % 360}, 80%, 85%)"
+             html_text += f'''
+                 <span
+                     style="background-color: {color};
+                            border-radius: 3px;
+                            padding: 0 3px;
+                            margin: 0 1px;
+                            position: relative;
+                            cursor: help;"
+                     onmouseover="this.querySelector('.tooltip').style.display='block'"
+                     onmouseout="this.querySelector('.tooltip').style.display='none'">
+                     {html.escape(token_bytes)}
+                     <span class="tooltip"
+                           style="display: none;
+                                  position: absolute;
+                                  bottom: 100%;
+                                  left: 50%;
+                                  transform: translateX(-50%);
+                                  background-color: #333;
+                                  color: white;
+                                  padding: 4px 8px;
+                                  border-radius: 4px;
+                                  font-size: 12px;
+                                  white-space: nowrap;
+                                  z-index: 1000;">
+                         Token ID: {token}
+                     </span>
+                 </span>'''
+             current_pos = token_pos + len(token_bytes)
+
+     # Add any remaining text
+     if current_pos < len(text):
+         html_text += html.escape(text[current_pos:])
+
+     return html_text
+
+ def process_text(text: str) -> tuple:
+     """
+     Process input text through the tokenizer and return results.
+
+     Args:
+         text (str): The input text to be processed.
+
+     Returns:
+         tuple: A tuple containing:
+             - HTML string of highlighted tokens.
+             - HTML string of token statistics.
+             - String of token IDs.
+     """
+     try:
+         # Encode the text
+         encoded = tokenizer.encode(text)
+
+         # Decode back to text
+         decoded = tokenizer.decode(encoded)
+
+         # Create token visualization
+         highlighted_text = highlight_tokens(text, encoded)
+
+         # Token statistics
+         stats = {
+             "Total Tokens": len(encoded),
+             "Unique Tokens": len(set(encoded)),
+             "Characters": len(text),
+             "Bytes": len(text.encode('utf-8')),
+             "Compression Ratio": f"{len(text.encode('utf-8')) / (len(encoded) * 4):.2f}x"
+         }
+
+         # Format statistics
+         stats_html = "<div style='margin-top: 20px;'>"
+         for key, value in stats.items():
+             stats_html += f"<div style='margin: 5px 0;'><b>{key}:</b> {value}</div>"
+         stats_html += "</div>"
+
+         return (
+             gr.HTML(highlighted_text),
+             gr.HTML(stats_html),
+             f"Token IDs: {encoded}"
+         )
+     except Exception as e:
+         return (
+             gr.HTML(f"<span style='color: red'>Error: {str(e)}</span>"),
+             "",
+             ""
+         )
+
+ # Define example inputs
+ examples = [
+     ["यहां वर्तमान में 20 हजार पुस्तकें थी जो अभी रैन बसेरा परिसर के कक्ष में रखी हुई है।"],
+     ["भारत एक विशाल देश है।"],
+     ["मैं हिंदी में बात कर रहा हूं।"],
+     ["नमस्ते, आप कैसे हैं?"],
+     ["दिल्ली भारत की राजधानी है।"]
+ ]
+
+ # Custom CSS
+ custom_css = """
+ .container {
+     max-width: 800px;
+     margin: auto;
+     padding: 20px;
+ }
+ .token-viz {
+     font-family: monospace;
+     line-height: 1.6;
+     padding: 15px;
+     border: 1px solid #ddd;
+     border-radius: 5px;
+     background: white;
+     margin: 10px 0;
+     position: relative;
+ }
+ .stats {
+     background: #f7f7f7;
+     padding: 15px;
+     border-radius: 5px;
+     margin: 10px 0;
+ }
+ .token-ids {
+     font-family: monospace;
+     padding: 15px;
+     background: #f0f0f0;
+     border-radius: 5px;
+     overflow-wrap: break-word;
+ }
+ .tooltip {
+     pointer-events: none;
+     box-shadow: 0 2px 4px rgba(0,0,0,0.2);
+ }
+ """
+
+ # Create the Gradio interface
+ iface = gr.Interface(
+     fn=process_text,
+     inputs=[
+         gr.Textbox(
+             label="Input Text",
+             placeholder="Enter Hindi text here...",
+             lines=3
+         )
+     ],
+     outputs=[
+         gr.HTML(label="Tokenized Text", elem_classes="token-viz"),
+         gr.HTML(label="Statistics", elem_classes="stats"),
+         gr.Textbox(label="Token IDs", elem_classes="token-ids")
+     ],
+     title="Hindi BPE Tokenizer Visualization",
+     description="""
+     This demo shows how the Hindi BPE tokenizer processes text. Each token is highlighted with a different color.
+     Hover over the highlighted tokens to see their token IDs.
+     """,
+     examples=examples,
+     theme=gr.themes.Soft(),
+     css=custom_css,
+     allow_flagging="never"
+ )
+
+ if __name__ == "__main__":
+     iface.launch(share=True)
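
A minimal way to exercise app.py outside the browser UI, assuming config_app.yml and model/hi_tokenizer_regex.json are present in the working directory; the sample text is taken from the examples list above and the printed IDs are illustrative only:

    # Hypothetical smoke test: importing app loads the tokenizer and builds the
    # Gradio interface, but does not launch the server.
    from app import process_text, tokenizer

    text = "नमस्ते, आप कैसे हैं?"
    token_viz, stats_html, ids = process_text(text)          # (gr.HTML, gr.HTML, str)
    print(ids)                                               # "Token IDs: [...]"
    print(tokenizer.decode(tokenizer.encode(text)) == text)  # round-trip check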
config_app.yml ADDED
@@ -0,0 +1,4 @@
+
+ regex_string: r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{N}+| ?(?:[\u0904-\u0939\u093d-\u093d\u0950-\u0950\u0958-\u0961\u0970-\u097f\ua8f2-\ua8fe\U00011b00-\U00011b09\u1cd3-\u1cd3\u1ce9-\u1cec\u1cee-\u1cf3\u1cf5-\u1cf6\u1cfa-\u1cfa][\u0900-\u0903\u093a-\u093c\u093e-\u094f\u0951-\u0957\u0962-\u0963\ua8e0-\ua8f1\ua8ff-\ua8ff\u1cd0-\u1cd2\u1cd4-\u1ce8\u1ced-\u1ced\u1cf4-\u1cf4\u1cf7-\u1cf9]*)+| ?\p{L}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+
+ tokenizer_file_path: "model/hi_tokenizer_regex.json"
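
For reference, app.py and tokenizer.load_config read this file with yaml.safe_load, so both keys arrive as plain strings; YAML does not interpret Python's r"""...""" syntax, so the leading r""" and trailing """ are stored as part of regex_string. A minimal sketch of inspecting the parsed config (assuming the file is in the working directory):

    # Sketch only: show what load_config returns for config_app.yml.
    from tokenizer import load_config

    cfg = load_config("config_app.yml")
    print(cfg["tokenizer_file_path"])   # model/hi_tokenizer_regex.json
    print(cfg["regex_string"][:20])     # note: the r""" prefix is part of the stored value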
requirements.txt ADDED
@@ -0,0 +1,86 @@
+ aiofiles==23.2.1
+ annotated-types==0.7.0
+ anyio==4.8.0
+ appnope==0.1.4
+ asttokens==3.0.0
+ certifi==2024.12.14
+ charset-normalizer==3.4.1
+ click==8.1.8
+ comm==0.2.2
+ contourpy==1.3.0
+ cycler==0.12.1
+ debugpy==1.8.11
+ decorator==5.1.1
+ exceptiongroup==1.2.2
+ executing==2.1.0
+ fastapi==0.115.6
+ ffmpy==0.5.0
+ filelock==3.16.1
+ fonttools==4.55.3
+ fsspec==2024.12.0
+ gradio==4.44.1
+ gradio_client==1.3.0
+ h11==0.14.0
+ httpcore==1.0.7
+ httpx==0.28.1
+ huggingface-hub==0.27.1
+ idna==3.10
+ importlib_metadata==8.5.0
+ importlib_resources==6.5.2
+ ipykernel==6.29.5
+ ipython==8.18.1
+ jedi==0.19.2
+ Jinja2==3.1.5
+ jupyter_client==8.6.3
+ jupyter_core==5.7.2
+ kiwisolver==1.4.7
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.4
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ nest-asyncio==1.6.0
+ numpy==2.0.2
+ orjson==3.10.14
+ packaging==24.2
+ pandas==2.2.3
+ parso==0.8.4
+ pexpect==4.9.0
+ pillow==10.4.0
+ platformdirs==4.3.6
+ prompt_toolkit==3.0.48
+ psutil==6.1.1
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pydantic==2.10.5
+ pydantic_core==2.27.2
+ pydub==0.25.1
+ Pygments==2.19.1
+ pyparsing==3.2.1
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.20
+ pytz==2024.2
+ PyYAML==6.0.2
+ pyzmq==26.2.0
+ regex==2024.11.6
+ requests==2.32.3
+ rich==13.9.4
+ ruff==0.9.0
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ six==1.17.0
+ sniffio==1.3.1
+ stack-data==0.6.3
+ starlette==0.41.3
+ tomlkit==0.12.0
+ tornado==6.4.2
+ tqdm==4.67.1
+ traitlets==5.14.3
+ typer==0.15.1
+ typing_extensions==4.12.2
+ tzdata==2024.2
+ urllib3==2.3.0
+ uvicorn==0.34.0
+ wcwidth==0.2.13
+ websockets==12.0
+ zipp==3.21.0
tokenizer.py ADDED
@@ -0,0 +1,212 @@
+ import yaml
+ import regex as re
+ from tqdm import tqdm
+ import gc
+ import json
+
+
+ def load_config(config_file_path: str = "config.yml"):
+     with open(config_file_path, "r") as f:
+         config = yaml.safe_load(f)
+     return config
+
+ def get_input_text(config: dict) -> str:
+     with open(config["input_file_info"]["file_path"], 'r', encoding='utf-8') as _f:
+         hi_text = [line.strip() for line in _f.readlines()]
+
+     hi_text_abridged = hi_text[:int(config["input_file_info"]["input_file_limit"])]
+     hi_text_abridged = '\n'.join(hi_text_abridged)
+
+     if config["input_file_info"]["print_text"]:
+         print(" Sample text: ", hi_text_abridged[:10])
+
+     return hi_text_abridged
+
+ def get_stats(ids, counts=None):
+     counts = {} if counts is None else counts
+     for pair in zip(ids, ids[1:]):
+         counts[pair] = counts.get(pair, 0) + 1
+     return counts
+
+ def merge(ids, pair, idx):
+     newids = []
+     i = 0
+     while i < len(ids):
+         if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
+             newids.append(idx)
+             i += 2
+         else:
+             newids.append(ids[i])
+             i += 1
+     return newids
+
+ def stoi(text: str, config: dict) -> list:
+     # tokenize the text
+     if config["regex_string"] and len(config["regex_string"]) > 0:
+         print("Using regex string: ", config["regex_string"])
+         tokens = re.findall(config["regex_string"], text)
+         # Convert tokens to bytes and then to integers
+         return [b for token in tokens for b in token.encode('utf-8')]
+     else:
+         print("Using default tokenizer")
+         # Instead of splitting, we'll preserve spaces by encoding them directly
+         return [b for ch in text for b in ch.encode('utf-8')]
+
+
+ def encode(text, merges, config: dict):
+     """
+     Encode text into tokens using the learned merges
+     """
+     ids = stoi(text, config)
+
+     sorted_merges = sorted(merges.items(), key=lambda x: x[1])
+     for (p1, p2), idx in sorted_merges:
+         ids = merge(ids, (p1, p2), idx)
+
+     return ids
+
+ def decode(ids, merges, config: dict):
+     """
+     Decode tokens back to text using the learned merges
+     """
+     # Create reverse mapping from token to pair
+     reverse_merges = {idx: pair for pair, idx in merges.items()}
+
+     # Expand all tokens recursively
+     def expand_token(token):
+         if token < 256:  # Base case: token is a byte
+             return bytes([token])
+
+         # Recursive case: expand the token into its constituent pair
+         pair = reverse_merges[token]
+         return expand_token(pair[0]) + expand_token(pair[1])
+
+     # Expand all tokens and concatenate
+     bytes_list = [expand_token(id) for id in ids]
+     bytes_data = b''.join(bytes_list)
+
+     # Convert bytes back to text
+     try:
+         return bytes_data.decode('utf-8')
+     except UnicodeDecodeError:
+         return "[DECODE_ERROR]"
+
+ class Tokenizer:
+     def __init__(self, merges=None, config: dict = None):
+         self.merges = merges or {}
+         self.config = config
+
+     def save(self, file_path):
+         # Convert tuple keys to strings for JSON serialization
+         serializable_merges = {f"{k[0]},{k[1]}": v for k, v in self.merges.items()}
+         with open(file_path, 'w', encoding='utf-8') as f:
+             json.dump(serializable_merges, f)
+
+     @classmethod
+     def load(cls, file_path):
+         with open(file_path, 'r', encoding='utf-8') as f:
+             serialized_merges = json.load(f)
+         # Convert string keys back to tuples
+         merges = {tuple(map(int, k.split(','))): v
+                   for k, v in serialized_merges.items()}
+
+         return cls(merges)
+
+     def encode(self, text):
+         return encode(text, self.merges, self.config)
+
+     def decode(self, ids):
+         return decode(ids, self.merges, self.config)
+
+ def train_tokenizer(config: dict) -> dict:
+     # get input text
+     hi_text = get_input_text(config)
+
+     # convert string to tokens
+     tokens = stoi(hi_text, config)
+     initial_len = len(tokens)
+     print("Tokens length (initial): ", initial_len, " tokens unique: ", len(set(tokens)))
+     print("Example tokens: ", ord('क'), chr(2325), ord("।"), chr(2404))
+
+     print("Training tokenizer....")
+     num_merges = config["vocab_size"] - 256
+     original_token = tokens
+
+     merges = {}
+     pbar = tqdm(range(num_merges), desc="Training tokenizer")
+     output_file = config["output_file_info"]["file_path"]
+
+     for i in pbar:
+         # Get statistics of the tokens
+         stats = get_stats(tokens)
+         # Get the most frequent pair
+         pair = max(stats, key=stats.get)
+         # Get the index of the new token
+         idx = 256 + i
+
+         # Merge the pair
+         tokens = merge(tokens, pair, idx)
+         merges[pair] = idx
+
+         # Show progress
+         if (i + 1) % 100 == 0:
+             current_ratio = initial_len / len(tokens)
+             pbar.write(f"Iteration {i+1}: compression ratio: {current_ratio:.2f}X")
+
+         # Garbage collection periodically
+         if (i + 1) % 1000 == 0:
+             gc.collect()
+
+         # Save intermediate merges
+         if (i + 1) % 1000 == 0:
+             temp_tokenizer = Tokenizer(merges)
+             temp_tokenizer.save(f"{output_file}.checkpoint")
+
+     print("Training tokenizer completed")
+     final_tokenizer = Tokenizer(merges)
+     final_tokenizer.save(f"{output_file}")
+
+     print("\n=== Final Statistics ===")
+     print(f"Vocabulary size: {config['vocab_size']}")
+     print(f"Initial tokens: {initial_len:,}")
+     print(f"Final tokens: {len(tokens):,}")
+     print(f"Initial bytes: {initial_len * 4:,}")
+     print(f"Final bytes: {len(tokens) * 4:,}")
+     print(f"Token compression ratio: {initial_len / len(tokens):.2f}X")
+     print(f"Byte compression ratio: {(initial_len * 4) / (len(tokens) * 4):.2f}X")
+     print(f"Saved tokenizer to: {output_file}")
+
+     return merges
+
+ def load_tokenizer(config: dict) -> Tokenizer:
+     "load the tokenizer from the json file"
+     with open(config["output_file_info"]["file_path"], 'r', encoding='utf-8') as f:
+         serialized_merges = json.load(f)
+
+     merges = {tuple(map(int, k.split(','))): v
+               for k, v in serialized_merges.items()}
+
+     return Tokenizer(merges, config)
+
+ if __name__ == "__main__":
+
+     # TRAIN TOKENIZER
+     config = load_config()
+     merges = train_tokenizer(config)
+     print("Merges: ", merges)
+
+     # USE TOKENIZER
+     # tokenizer = load_tokenizer(config)
+     # test_text = config["test_text"]
+
+     # print("Test text: ", test_text)
+     # print("Encoded text: ", tokenizer.encode(test_text))
+     # decoded = tokenizer.decode(tokenizer.encode(test_text))
+     # print("Decoded text: ", decoded)
+
+     # print(f"Successful roundtrip: {test_text == decoded}")
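
To make the merge mechanics in tokenizer.py concrete, here is a toy walk-through of get_stats/merge plus an encode/decode round trip with a hand-built merge table; the byte values and the merge (104, 105) -> 256 are illustrative and unrelated to the trained hi_tokenizer_regex.json:

    # Toy example of the BPE primitives defined above.
    from tokenizer import get_stats, merge, Tokenizer

    ids = [104, 105, 104, 105, 104]       # raw UTF-8 byte stream for "hihih"
    stats = get_stats(ids)                # {(104, 105): 2, (105, 104): 2}
    pair = max(stats, key=stats.get)      # most frequent pair -> (104, 105)
    print(merge(ids, pair, 256))          # [256, 256, 104]

    # Round trip through the Tokenizer class; an empty regex_string selects the
    # byte-level fallback path in stoi().
    tok = Tokenizer(merges={(104, 105): 256}, config={"regex_string": ""})
    enc = tok.encode("hihih")             # [256, 256, 104]
    print(enc, tok.decode(enc))           # [256, 256, 104] hihih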