Upload files
- LICENSE +21 -0
- Persian_BPE_Tokenizer_30K.json +0 -0
- README.md +43 -3
- main.py +43 -0
- packages.py +7 -0
- preprocess.py +66 -0
- requirements.txt +5 -0
- tokenizer_training.py +24 -0
- utils.py +59 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Amir Hossein Fouladi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Persian_BPE_Tokenizer_30K.json
ADDED
The diff for this file is too large to render.
README.md
CHANGED
@@ -1,3 +1,43 @@
# Persian BPE Tokenizer (30K)

A Byte-Pair Encoding (BPE) tokenizer for Persian NLP tasks with a 30,000-token vocabulary, trained on ~2M Persian texts averaging 10,000 characters each.

## Usage

### Encoding
```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("Persian_BPE_Tokenizer_30K.json")
encoded_text = tokenizer.encode("این یک متن آزمایشی است.")
print("Tokens:", encoded_text.tokens)
print("IDs:", encoded_text.ids)
```

### Decoding
```python
decoded_text = tokenizer.decode_batch([[token_id] for token_id in encoded_text.ids])
print("Decoded:", decoded_text)
```
`decode_batch` decodes each id separately, so `decoded_text` is a list of token strings rather than a single joined sentence.

## Training Data
This tokenizer was trained on the following datasets:
- Wikipedia (20231101.fa): https://huggingface.co/datasets/wikimedia/wikipedia
- Persian Blog: https://huggingface.co/datasets/RohanAiLab/persian_blog
- HomoRich: https://huggingface.co/datasets/MahtaFetrat/HomoRich-G2P-Persian

## License
Code and tokenizer: MIT License

## Evaluation Metrics
- UNK rate: 0.0% (measured on 100,000 samples)
- Compression ratio: 4.56 characters per token (measured on 100,000 samples)

## Requirements
- **For using the tokenizer**:
  - Python >= 3.9
  - tokenizers
- **For training the tokenizer**:
  - pandas
  - datasets
  - requests
  - hazm
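The evaluation numbers above come from the `evaluation_fn` helper added in `utils.py` further down in this commit. A minimal standalone sketch of the same measurement, assuming the tokenizer JSON is in the working directory and `texts` is a list of Persian strings (the repository samples 100,000 rows of its merged corpus; the two strings below are just placeholders):

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("Persian_BPE_Tokenizer_30K.json")
# Placeholder sample; the repository evaluates on 100,000 rows of the merged corpus.
texts = ["این یک متن آزمایشی است.", "هوای تهران امروز خیلی گرم و آفتابی است."]

token_lists = [tokenizer.encode(t).tokens for t in texts]
total_tokens = sum(len(tokens) for tokens in token_lists)

# UNK rate: share of [UNK] tokens among all produced tokens, in percent.
unk_rate = 100 * sum(tokens.count("[UNK]") for tokens in token_lists) / total_tokens
# Compression ratio: input characters per produced token.
compression_ratio = sum(len(t) for t in texts) / total_tokens

print(f"unk rate: {unk_rate}%, compression ratio: {compression_ratio:.2f}")
```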
main.py
ADDED
@@ -0,0 +1,43 @@
from packages import *
from preprocess import *
from tokenizer_training import *

'''
Datasets:
https://huggingface.co/datasets/wikimedia/wikipedia
https://huggingface.co/datasets/RohanAiLab/persian_blog
https://huggingface.co/datasets/MahtaFetrat/HomoRich-G2P-Persian
'''

# Clear the terminal before printing the results.
os.system('cls' if os.name == 'nt' else 'clear')

# Reuse the merged corpus if it already exists, otherwise build it from the raw datasets.
if os.path.exists("merged_dataset.csv"):
    dataset = pd.read_csv("merged_dataset.csv")
else:
    dataset = preprocess_pipeline_fn()

# Reuse the trained tokenizer if it already exists, otherwise train a new one.
if os.path.exists("Persian_BPE_Tokenizer_30K.json"):
    tokenizer = Tokenizer.from_file("Persian_BPE_Tokenizer_30K.json")
else:
    tokenizer = tokenizer_training_fn(dataset)


# Test the tokenizer on a few Persian sentences.
test_texts = ["این یک متن آزمایشی برای بررسی عملکرد توکنایزر است.",
              "دیروز به کتابخانه رفتم و کتابی درباره تاریخ ایران باستان خواندم.",
              "سلام! چطور میتوانم به سرعت زبان فارسی را یاد بگیرم؟",
              "هوای تهران امروز خیلی گرم و آفتابی است، ولی شب خنک میشود."]

for i, text in enumerate(test_texts):
    encoded_text = tokenizer.encode(text)
    print(10 * "--", f" test {i+1}", 10 * "--")
    print("text: ", text)
    print("tokens: ", encoded_text.tokens)
    print("ids: ", encoded_text.ids)
    # print("decoded: ", tokenizer.decode(encoded_text.ids))
    # Decode each id separately and re-join the pieces with spaces.
    tokens = tokenizer.decode_batch([[token_id] for token_id in encoded_text.ids])
    print("decoded:", ' '.join(tokens))

# Evaluation criteria: UNK rate and compression ratio on 100,000 sampled texts.
print(70 * "-"), print("evaluation"), print(70 * "-")
unk_rate, compression_ratio = evaluation_fn(tokenizer, dataset, 100_000)
print(f"unk rate: {unk_rate}%")
print(f"compression ratio: {compression_ratio}")
packages.py
ADDED
@@ -0,0 +1,7 @@
import os, re
from zipfile import ZipFile

import requests
import pandas as pd
from datasets import load_dataset  # used by preprocess_pipeline_fn in preprocess.py
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors, decoders
from hazm import Normalizer
preprocess.py
ADDED
@@ -0,0 +1,66 @@
from packages import *
from utils import *

def preprocess_fn(dataset: pd.DataFrame, normalization= False, show_sample= False) -> pd.DataFrame:
    # Remove duplicate texts
    dataset = dataset.drop_duplicates(subset='text')
    dataset = dataset.copy()
    # Remove URLs
    dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'http\S+|www\S+', '', x) if isinstance(x, str) else x)
    # Remove HTML tags
    dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'<[^>]+>', '', x) if isinstance(x, str) else x)
    # Remove emojis and anything outside the Arabic/Persian Unicode block (keeps whitespace and basic punctuation)
    dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'[^\u0600-\u06FF\s.,!?]', '', x) if isinstance(x, str) else x)
    # Remove any remaining Latin letters
    dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'[a-zA-Z]+', '', x) if isinstance(x, str) else x)
    # Remove empty or invalid texts
    dataset = dataset[dataset['text'].notna() & (dataset['text'].str.strip() != '')]
    # Collapse extra whitespace
    dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip() if isinstance(x, str) else x)
    # Optional normalization with hazm
    if normalization:
        dataset['text'] = dataset['text'].apply(lambda x: Normalizer(persian_numbers=True).normalize(x) if isinstance(x, str) else x)
    if show_sample:
        print(dataset['text'].sample(n=3).to_list())
    return dataset


def uniform_length(dataset: pd.DataFrame, target_length= 1000) -> pd.DataFrame:
    # Concatenate all texts and re-split them into ~target_length-character chunks,
    # extending each chunk to the next whitespace so words are not cut in half.
    full_text = ' '.join(dataset['text'].dropna().astype(str))
    chunks, start = [], 0
    while start < len(full_text):
        end = min(start + target_length, len(full_text))
        while end < len(full_text) and full_text[end] not in [' ', '\n']:
            end += 1
        chunks.append(full_text[start:end].strip())
        start = end + 1
    return pd.DataFrame(chunks, columns=['text'])


def preprocess_pipeline_fn(eda= False, save_dataset= True, shuffle= False):
    ## load datasets
    wiki_dataset = load_dataset("wikimedia/wikipedia", "20231101.fa")
    wiki_dataset = pd.DataFrame(wiki_dataset['train']['text'], columns=['text'])
    download_blog_dataset()  # fetch and extract blogs.zip if it is not already present
    blog_dataset = pd.read_csv("./blogs/blogs.csv")
    homorich_dataset = load_dataset("MahtaFetrat/HomoRich-G2P-Persian", verification_mode="no_checks")
    homorich_dataset = pd.DataFrame(homorich_dataset["train"]["Grapheme"], columns=['text'])
    print(70 * "-"), print("downloading & loading datasets is done."), print(70 * "-")

    ## preprocess
    dataset_names = ["Wikipedia", "Persian Blog", "HomoRich"]
    datasets = [wiki_dataset, blog_dataset, homorich_dataset]
    del wiki_dataset, blog_dataset, homorich_dataset
    for i in range(len(datasets)):
        datasets[i] = uniform_length(preprocess_fn(datasets[i]))
        if eda:
            eda_dataset(datasets[i], dataset_names[i])
            show_short_samples(datasets[i], dataset_names[i])

    ## merge datasets
    merged_dataset = pd.concat([datasets[0], datasets[1], datasets[2]], ignore_index=True)
    del datasets
    if shuffle:
        merged_dataset = merged_dataset.sample(frac=1).reset_index(drop=True)
    print(70 * "-"), print("preprocessing is done."), print(70 * "-")
    if save_dataset:
        merged_dataset.to_csv('merged_dataset.csv', index=False, encoding='utf-8-sig')

    return merged_dataset
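`uniform_length` concatenates all texts and re-splits them into chunks of roughly `target_length` characters, extending each chunk to the next space or newline so that words are never cut in half. A toy check of that behavior, assuming the repository's dependencies are installed so `preprocess.py` imports cleanly (the two sample sentences below are made up for illustration):

```python
import pandas as pd
from preprocess import uniform_length

# Hypothetical toy frame: two short Persian texts that get concatenated and re-chunked.
toy = pd.DataFrame({"text": ["کتاب خوب است و خواندن آن لذت دارد.", "هوا امروز آفتابی است."]})
chunks = uniform_length(toy, target_length=20)

# Chunks run to about 20+ characters (the last one may be shorter)
# and always end on a word boundary.
for chunk in chunks["text"]:
    print(len(chunk), chunk)
```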
requirements.txt
ADDED
@@ -0,0 +1,5 @@
tokenizers==0.21.1
datasets==2.21.0
hazm==0.10.0
pandas==2.0.3
requests==2.32.3
tokenizer_training.py
ADDED
@@ -0,0 +1,24 @@
from packages import *

def tokenizer_training_fn(dataset: pd.DataFrame, vocab_size= 30_000, min_frequency= 5, pre_tokenizer= True) -> Tokenizer:
    unk, eos = "[UNK]", "<|endoftext|>"
    tokenizer = Tokenizer(models.BPE(unk_token=unk))
    # Split on whitespace and punctuation before applying BPE merges.
    if pre_tokenizer:
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(vocab_size=vocab_size, min_frequency=min_frequency,
                                  special_tokens=[unk, eos])

    tokenizer.train_from_iterator(dataset["text"], trainer)
    print(10 * "--", " vocab size ", 10 * "--")
    print(tokenizer.get_vocab_size())

    # Wrap every encoded sequence with the end-of-text token on both sides.
    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{eos} $A {eos}",
        special_tokens=[(eos, tokenizer.token_to_id(eos))]
    )
    tokenizer.decoder = decoders.BPEDecoder()

    tokenizer.save(f"Persian_BPE_Tokenizer_{vocab_size//1000}K.json")
    print(70 * "-"), print("tokenizer training is complete and saved."), print(70 * "-")

    return tokenizer
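The TemplateProcessing post-processor above means that, once the tokenizer is reloaded from the saved JSON, every encoded sequence is framed by the <|endoftext|> token. A quick check, assuming the trained Persian_BPE_Tokenizer_30K.json file is present in the working directory:

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("Persian_BPE_Tokenizer_30K.json")
encoded = tokenizer.encode("این یک متن آزمایشی است.")

# The single-sequence template "<|endoftext|> $A <|endoftext|>" places the
# end-of-text token at both ends of the token list.
print(encoded.tokens[0], encoded.tokens[-1])  # <|endoftext|> <|endoftext|>

# Special tokens can be dropped again when decoding.
print(tokenizer.decode(encoded.ids, skip_special_tokens=True))
```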
utils.py
ADDED
@@ -0,0 +1,59 @@
from packages import *

def download_blog_dataset():
    url = "https://huggingface.co/datasets/RohanAiLab/persian_blog/resolve/main/blogs.zip"
    output_path = "blogs.zip"
    extract_folder = "blogs"

    if not os.path.exists(output_path):
        print("Downloading blogs.zip...")
        r = requests.get(url, stream=True)
        with open(output_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        print("Download completed.")

    if not os.path.exists(extract_folder):
        print("Extracting blogs.zip...")
        with ZipFile(output_path, 'r') as zip_ref:
            zip_ref.extractall(extract_folder)
        print("Extraction completed.")


def eda_dataset(df, name):
    text_column = "text"
    print(f"--- EDA for {name} ---")
    print(f"Number of samples: {len(df)}")
    print(f"Number of empty samples: {df[text_column].isna().sum() + (df[text_column].str.strip() == '').sum()}")
    print(f"Mean text length: {df[text_column].str.len().mean():.2f}")
    print(f"Max text length: {df[text_column].str.len().max()}")
    print(f"Min text length: {df[text_column].str.len().min()}")
    # print(f"Number of unique words: {len(set(' '.join(df[text_column].dropna().astype(str)).split()))}")
    # print(f"Sample text: {df[text_column].iloc[0][:100] if isinstance(df[text_column].iloc[0], str) else 'invalid'}")
    print("\n")


def show_short_samples(df, dataset_name, max_length= 1000, num_samples= 5):
    print(f"--- Samples shorter than {max_length} characters in {dataset_name} ---")
    short_samples = df[df['text'].str.len() < max_length]
    print(f"Number of samples shorter than {max_length} characters: {len(short_samples)}")
    if len(short_samples) > 0:
        for i, text in enumerate(short_samples['text'].head(num_samples)):
            print(f"Sample {i+1}: {text[:100]} (length: {len(text)} characters)")
    else:
        print(f"No samples shorter than {max_length} characters were found.")
    print("\n")


def evaluation_fn(tokenizer: Tokenizer, dataset: pd.DataFrame, samples_number= 10000):
    texts = dataset['text'].sample(n=samples_number).to_list()
    # Encode each sampled text once and reuse the token lists for both metrics.
    token_lists = [tokenizer.encode(t).tokens for t in texts]

    unk_count = sum(tokens.count("[UNK]") for tokens in token_lists)
    total_tokens = sum(len(tokens) for tokens in token_lists)
    unk_rate = (unk_count / total_tokens) * 100

    # Compression ratio: characters per token over the sampled texts.
    char_counts = [len(t) for t in texts]
    compression_ratio = sum(char_counts) / total_tokens

    return unk_rate, compression_ratio