amirhofo committed
Commit 6534adc · verified · 1 Parent(s): fab16ec

Upload files

Files changed (9)
  1. LICENSE +21 -0
  2. Persian_BPE_Tokenizer_30K.json +0 -0
  3. README.md +43 -3
  4. main.py +43 -0
  5. packages.py +7 -0
  6. preprocess.py +66 -0
  7. requirements.txt +5 -0
  8. tokenizer_training.py +24 -0
  9. utils.py +59 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Amir Hossein Fouladi
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
Persian_BPE_Tokenizer_30K.json ADDED
The diff for this file is too large to render. See raw diff
 
README.md CHANGED
@@ -1,3 +1,43 @@
- ---
- license: mit
- ---
+ # Persian BPE Tokenizer (30K)
+
+ A Byte-Pair Encoding (BPE) tokenizer with a 30,000-token vocabulary for Persian NLP tasks, trained on roughly 2M Persian texts averaging 10,000 characters each.
+
+ ## Usage
+
+ ### Encoding
+ ```python
+ from tokenizers import Tokenizer
+
+ tokenizer = Tokenizer.from_file("Persian_BPE_Tokenizer_30K.json")
+ encoded_text = tokenizer.encode("این یک متن آزمایشی است.")
+ print("Tokens:", encoded_text.tokens)
+ print("IDs:", encoded_text.ids)
+ ```
+
+ ### Decoding
+ ```python
+ decoded_text = tokenizer.decode_batch([[token_id] for token_id in encoded_text.ids])
+ print("Decoded:", decoded_text)
+ ```
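+
+ Decoding the full ID sequence with a single `tokenizer.decode(...)` call generally returns the tokens run together, because the Whitespace pre-tokenizer discards the original spacing. `main.py` in this repository therefore decodes token by token and rejoins with spaces; a minimal sketch of that approach, reusing `encoded_text` from above:
+ ```python
+ # Approximate the original text by joining per-token decodings with spaces.
+ decoded_tokens = tokenizer.decode_batch([[token_id] for token_id in encoded_text.ids])
+ print("Decoded:", " ".join(decoded_tokens))
+ ```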
+
+ ## Training Data
+ This tokenizer was trained on the following datasets:
+ - Wikipedia (20231101.fa): https://huggingface.co/datasets/wikimedia/wikipedia
+ - Persian Blog: https://huggingface.co/datasets/RohanAiLab/persian_blog
+ - HomoRich: https://huggingface.co/datasets/MahtaFetrat/HomoRich-G2P-Persian
+
+ ## License
+ Code and tokenizer: MIT License
+
+ ## Evaluation Metrics
+ - UNK Rate: 0.0% (on 100,000 samples)
+ - Compression Ratio: 4.56 characters per token (on 100,000 samples)
+
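+ These figures correspond to `evaluation_fn` in `utils.py` (called from `main.py` on 100,000 samples). A minimal, self-contained sketch of the same computation, assuming the `merged_dataset.csv` produced by `preprocess.py` is available:
+ ```python
+ import pandas as pd
+ from tokenizers import Tokenizer
+
+ tokenizer = Tokenizer.from_file("Persian_BPE_Tokenizer_30K.json")
+ texts = pd.read_csv("merged_dataset.csv")["text"].dropna().sample(n=100_000).tolist()
+
+ # Encode each text once, then derive both metrics from the token lists.
+ encodings = [tokenizer.encode(t).tokens for t in texts]
+ total_tokens = sum(len(tokens) for tokens in encodings)
+ unk_rate = 100 * sum(tokens.count("[UNK]") for tokens in encodings) / total_tokens
+ compression_ratio = sum(len(t) for t in texts) / total_tokens  # characters per token
+ print(f"UNK rate: {unk_rate:.2f}%")
+ print(f"Compression ratio: {compression_ratio:.2f}")
+ ```
+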
+ ## Requirements
+ - **For using the tokenizer**:
+   - Python >= 3.9
+   - tokenizers
+ - **For training the tokenizer**:
+   - pandas
+   - datasets
+   - requests
+   - hazm
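+
+ For a typical pip setup, `pip install tokenizers` is enough for inference, while training additionally needs the packages pinned in `requirements.txt` (`pip install -r requirements.txt`).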
main.py ADDED
@@ -0,0 +1,43 @@
+ from packages import *
+ from preprocess import *
+ from tokenizer_training import *
+
+ '''
+ Datasets:
+ https://huggingface.co/datasets/wikimedia/wikipedia
+ https://huggingface.co/datasets/RohanAiLab/persian_blog
+ https://huggingface.co/datasets/MahtaFetrat/HomoRich-G2P-Persian
+ '''
+
+ os.system('cls' if os.name == 'nt' else 'clear')
+
+ if os.path.exists("merged_dataset.csv"):
+     dataset= pd.read_csv("merged_dataset.csv")
+ else: dataset= preprocess_pipeline_fn()
+
+ if os.path.exists("Persian_BPE_Tokenizer_30K.json"):
+     tokenizer= Tokenizer.from_file("Persian_BPE_Tokenizer_30K.json")
+ else: tokenizer= tokenizer_training_fn(dataset)
+
+
+ # test
+ test_texts= ["این یک متن آزمایشی برای بررسی عملکرد توکنایزر است.",
+              "دیروز به کتابخانه رفتم و کتابی درباره تاریخ ایران باستان خواندم.",
+              "سلام! چطور می‌توانم به سرعت زبان فارسی را یاد بگیرم؟",
+              "هوای تهران امروز خیلی گرم و آفتابی است، ولی شب خنک می‌شود."]
+
+ for i, text in enumerate(test_texts):
+     encoded_text= tokenizer.encode(text)
+     print(10 * "--", f" test {i+1}", 10 * "--")
+     print("text: ", text)
+     print("tokens: ", encoded_text.tokens)
+     print("ids: ", encoded_text.ids)
+     # print("decoded: ", tokenizer.decode(encoded_text.ids))
+     tokens = tokenizer.decode_batch([[id] for id in encoded_text.ids])
+     print("decoded:", ' '.join(tokens))
+
+ # evaluation criteria
+ print(70*"-"), print("evaluation"), print(70*"-")
+ unk_rate, compression_ratio= evaluation_fn(tokenizer, dataset, 100_000)
+ print(f"unk rate: {unk_rate}%")
+ print(f"compression ratio: {compression_ratio}")
packages.py ADDED
@@ -0,0 +1,7 @@
+ import os, re
+ from zipfile import ZipFile
+
+ import requests
+ import pandas as pd
+ from datasets import load_dataset  # used by preprocess.py
+ from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors, decoders
+ from hazm import Normalizer
preprocess.py ADDED
@@ -0,0 +1,66 @@
+ from packages import *
+ from utils import *
+
+ def preprocess_fn(dataset: pd.DataFrame, normalization= False, show_sample= False) -> pd.DataFrame:
+     # Remove duplicate texts
+     dataset= dataset.drop_duplicates(subset='text')
+     dataset= dataset.copy()
+     # Remove URLs
+     dataset['text']= dataset['text'].apply(lambda x: re.sub(r'http\S+|www\S+', '', x) if isinstance(x, str) else x)
+     # Remove HTML tags
+     dataset['text']= dataset['text'].apply(lambda x: re.sub(r'<[^>]+>', '', x) if isinstance(x, str) else x)
+     # Remove emojis and characters outside the Arabic/Persian block (keep whitespace and basic punctuation)
+     dataset['text']= dataset['text'].apply(lambda x: re.sub(r'[^\u0600-\u06FF\s.,!?]', '', x) if isinstance(x, str) else x)
+     # Remove English text
+     dataset['text']= dataset['text'].apply(lambda x: re.sub(r'[a-zA-Z]+', '', x) if isinstance(x, str) else x)
+     # Remove empty or invalid texts
+     dataset= dataset[dataset['text'].notna() & (dataset['text'].str.strip() != '')]
+     # Collapse extra whitespace
+     dataset['text']= dataset['text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip() if isinstance(x, str) else x)
+     # Normalization (hazm)
+     if normalization:
+         normalizer= Normalizer(persian_numbers= True)
+         dataset['text']= dataset['text'].apply(lambda x: normalizer.normalize(x) if isinstance(x, str) else x)
+     if show_sample:
+         print(dataset['text'].sample(n= 3).to_list())
+     return dataset
+
+
+ def uniform_length(dataset: pd.DataFrame, target_length= 1000) -> pd.DataFrame:
+     # Concatenate all texts, then re-split into ~target_length-character chunks on whitespace boundaries.
+     full_text= ' '.join(dataset['text'].dropna().astype(str))
+     chunks, start= [], 0
+     while start < len(full_text):
+         end= min(start + target_length, len(full_text))
+         while end < len(full_text) and full_text[end] not in [' ', '\n']:
+             end += 1
+         chunks.append(full_text[start: end].strip())
+         start= end + 1
+     return pd.DataFrame(chunks, columns= ['text'])
+
+
+ def preprocess_pipeline_fn(eda= False, save_dataset= True, shuffle= False):
+     ## load datasets
+     wiki_dataset= load_dataset("wikimedia/wikipedia", "20231101.fa")
+     wiki_dataset= pd.DataFrame(wiki_dataset['train']['text'], columns= ['text'])
+     download_blog_dataset()  # download and extract blogs.zip if not already present
+     blog_dataset= pd.read_csv("./blogs/blogs.csv")
+     homorich_dataset= load_dataset("MahtaFetrat/HomoRich-G2P-Persian", verification_mode= "no_checks")
+     homorich_dataset= pd.DataFrame(homorich_dataset["train"]["Grapheme"], columns= ['text'])
+     print(70*"-"), print("downloading & loading datasets is done."), print(70*"-")
+
+     ## preprocess
+     dataset_names= ["Wikipedia", "Persian Blog", "HomoRich"]
+     datasets= [wiki_dataset, blog_dataset, homorich_dataset]
+     del wiki_dataset, blog_dataset, homorich_dataset
+     for i in range(len(datasets)):
+         datasets[i]= uniform_length(preprocess_fn(datasets[i]))
+         if eda:
+             eda_dataset(datasets[i], dataset_names[i])
+             show_short_samples(datasets[i], dataset_names[i])
+
+     ## merge datasets
+     merged_dataset= pd.concat([datasets[0], datasets[1], datasets[2]], ignore_index= True)
+     del datasets
+     if shuffle: merged_dataset= merged_dataset.sample(frac= 1).reset_index(drop= True)
+     print(70*"-"), print("preprocessing is done."), print(70*"-")
+     if save_dataset: merged_dataset.to_csv('merged_dataset.csv', index= False, encoding= 'utf-8-sig')
+
+     return merged_dataset
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ tokenizers==0.21.1
+ datasets==2.21.0
+ hazm==0.10.0
+ pandas==2.0.3
+ requests==2.32.3
tokenizer_training.py ADDED
@@ -0,0 +1,24 @@
+ from packages import *
+
+ def tokenizer_training_fn(dataset: pd.DataFrame, vocab_size= 30_000, min_frequency= 5, pre_tokenizer= True) -> Tokenizer:
+     unk, eos= "[UNK]", "<|endoftext|>"
+     tokenizer= Tokenizer(models.BPE(unk_token= unk))
+     if pre_tokenizer: tokenizer.pre_tokenizer= pre_tokenizers.Whitespace()
+     trainer= trainers.BpeTrainer(vocab_size= vocab_size, min_frequency= min_frequency,
+                                  special_tokens= [unk, eos])
+
+     tokenizer.train_from_iterator(dataset["text"], trainer)
+     print(10 * "--", " vocab size ", 10 * "--")
+     print(tokenizer.get_vocab_size())
+
+     tokenizer.post_processor= processors.TemplateProcessing(
+         single= f"{eos} $A {eos}",
+         special_tokens= [(eos, tokenizer.token_to_id(eos))]
+     )
+     tokenizer.decoder= decoders.BPEDecoder()
+
+     tokenizer.save(f"Persian_BPE_Tokenizer_{vocab_size//1000}K.json")
+     print(70*"-"), print("tokenizer training is complete and saved."), print(70*"-")
+
+     return tokenizer
utils.py ADDED
@@ -0,0 +1,59 @@
+ from packages import *
+
+ def download_blog_dataset():
+     url= "https://huggingface.co/datasets/RohanAiLab/persian_blog/resolve/main/blogs.zip"
+     output_path= "blogs.zip"
+     extract_folder= "blogs"
+
+     if not os.path.exists(output_path):
+         print("Downloading blogs.zip...")
+         r= requests.get(url, stream= True)
+         with open(output_path, "wb") as f:
+             for chunk in r.iter_content(chunk_size=8192):
+                 if chunk:
+                     f.write(chunk)
+         print("Download completed.")
+
+     if not os.path.exists(extract_folder):
+         print("Extracting blogs.zip...")
+         with ZipFile(output_path, 'r') as zip_ref:
+             zip_ref.extractall(extract_folder)
+         print("Extraction completed.")
+
+
+ def eda_dataset(df, name):
+     text_column= "text"
+     print(f"--- EDA for {name} ---")
+     print(f"Number of samples: {len(df)}")
+     print(f"Number of empty samples: {df[text_column].isna().sum() + (df[text_column].str.strip() == '').sum()}")
+     print(f"Mean text length: {df[text_column].str.len().mean():.2f}")
+     print(f"Max text length: {df[text_column].str.len().max()}")
+     print(f"Min text length: {df[text_column].str.len().min()}")
+     # print(f"Number of unique words: {len(set(' '.join(df[text_column].dropna().astype(str)).split()))}")
+     # print(f"Sample text: {df[text_column].iloc[0][:100] if isinstance(df[text_column].iloc[0], str) else 'invalid'}")
+     print("\n")
+
+
+ def show_short_samples(df, dataset_name, max_length= 1000, num_samples= 5):
+     print(f"--- Samples shorter than {max_length} characters in {dataset_name} ---")
+     short_samples = df[df['text'].str.len() < max_length]
+     print(f"Number of samples shorter than {max_length} characters: {len(short_samples)}")
+     if len(short_samples) > 0:
+         for i, text in enumerate(short_samples['text'].head(num_samples)):
+             print(f"Sample {i+1}: {text[:100]} (length: {len(text)} characters)")
+     else:
+         print(f"No samples shorter than {max_length} characters were found.")
+     print("\n")
+
+
+ def evaluation_fn(tokenizer: Tokenizer, dataset: pd.DataFrame, samples_number= 10000):
+     texts= dataset['text'].sample(n= samples_number).to_list()
+     # Encode each sampled text once and reuse the token lists for both metrics.
+     encodings= [tokenizer.encode(t).tokens for t in texts]
+     total_tokens= sum(len(tokens) for tokens in encodings)
+     unk_count= sum(tokens.count("[UNK]") for tokens in encodings)
+     unk_rate= (unk_count / total_tokens) * 100
+
+     # Compression ratio: average number of characters represented by one token.
+     compression_ratio= sum(len(t) for t in texts) / total_tokens
+
+     return unk_rate, compression_ratio