amirhofo committed
Commit 6534adc · verified · 1 Parent(s): fab16ec

Upload files

Files changed (9)
  1. LICENSE +21 -0
  2. Persian_BPE_Tokenizer_30K.json +0 -0
  3. README.md +43 -3
  4. main.py +43 -0
  5. packages.py +7 -0
  6. preprocess.py +66 -0
  7. requirements.txt +5 -0
  8. tokenizer_training.py +24 -0
  9. utils.py +59 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Amir Hossein Fouladi
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
Persian_BPE_Tokenizer_30K.json ADDED
The diff for this file is too large to render. See raw diff
 
README.md CHANGED
@@ -1,3 +1,43 @@
- ---
- license: mit
- ---
+ # Persian BPE Tokenizer (30K)
+
+ A Byte-Pair Encoding (BPE) tokenizer with a 30,000-token vocabulary for Persian NLP tasks, trained on roughly 2M Persian texts averaging 10,000 characters each.
+
+ ## Usage
+
+ ### Encoding
+ ```python
+ from tokenizers import Tokenizer
+
+ tokenizer = Tokenizer.from_file("Persian_BPE_Tokenizer_30K.json")
+ encoded_text = tokenizer.encode("این یک متن آزمایشی است.")
+ print("Tokens:", encoded_text.tokens)
+ print("IDs:", encoded_text.ids)
+ ```
+
+ ### Decoding
+ ```python
+ decoded_text = tokenizer.decode_batch([[token_id] for token_id in encoded_text.ids])
+ print("Decoded:", decoded_text)
+ ```
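+
+ Decoding the full ID sequence with a single `tokenizer.decode(...)` call generally returns the tokens run together, because the Whitespace pre-tokenizer discards the original spacing. `main.py` in this repository therefore decodes token by token and rejoins with spaces; a minimal sketch of that approach, reusing `encoded_text` from above:
+ ```python
+ # Approximate the original text by joining per-token decodings with spaces.
+ decoded_tokens = tokenizer.decode_batch([[token_id] for token_id in encoded_text.ids])
+ print("Decoded:", " ".join(decoded_tokens))
+ ```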
+
+ ## Training Data
+ This tokenizer was trained on the following datasets:
+ - Wikipedia (20231101.fa): https://huggingface.co/datasets/wikimedia/wikipedia
+ - Persian Blog: https://huggingface.co/datasets/RohanAiLab/persian_blog
+ - HomoRich: https://huggingface.co/datasets/MahtaFetrat/HomoRich-G2P-Persian
+
+ ## License
+ Code and tokenizer: MIT License
+
+ ## Evaluation Metrics
+ - UNK Rate: 0.0% (on 100,000 samples)
+ - Compression Ratio: 4.56 characters per token (on 100,000 samples)
+
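+ These figures correspond to `evaluation_fn` in `utils.py` (called from `main.py` on 100,000 samples). A minimal, self-contained sketch of the same computation, assuming the `merged_dataset.csv` produced by `preprocess.py` is available:
+ ```python
+ import pandas as pd
+ from tokenizers import Tokenizer
+
+ tokenizer = Tokenizer.from_file("Persian_BPE_Tokenizer_30K.json")
+ texts = pd.read_csv("merged_dataset.csv")["text"].dropna().sample(n=100_000).tolist()
+
+ # Encode each text once, then derive both metrics from the token lists.
+ encodings = [tokenizer.encode(t).tokens for t in texts]
+ total_tokens = sum(len(tokens) for tokens in encodings)
+ unk_rate = 100 * sum(tokens.count("[UNK]") for tokens in encodings) / total_tokens
+ compression_ratio = sum(len(t) for t in texts) / total_tokens  # characters per token
+ print(f"UNK rate: {unk_rate:.2f}%")
+ print(f"Compression ratio: {compression_ratio:.2f}")
+ ```
+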
+ ## Requirements
+ - **For using the tokenizer**:
+   - Python >= 3.9
+   - tokenizers
+ - **For training the tokenizer**:
+   - pandas
+   - datasets
+   - requests
+   - hazm
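+
+ For a typical pip setup, `pip install tokenizers` is enough for inference, while training additionally needs the packages pinned in `requirements.txt` (`pip install -r requirements.txt`).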
main.py ADDED
@@ -0,0 +1,43 @@
+ from packages import *
+ from preprocess import *
+ from tokenizer_training import *
+
+ '''
+ Datasets:
+ https://huggingface.co/datasets/wikimedia/wikipedia
+ https://huggingface.co/datasets/RohanAiLab/persian_blog
+ https://huggingface.co/datasets/MahtaFetrat/HomoRich-G2P-Persian
+ '''
+
+ os.system('cls' if os.name == 'nt' else 'clear')
+
+ if os.path.exists("merged_dataset.csv"):
+     dataset= pd.read_csv("merged_dataset.csv")
+ else: dataset= preprocess_pipeline_fn()
+
+ if os.path.exists("Persian_BPE_Tokenizer_30K.json"):
+     tokenizer= Tokenizer.from_file("Persian_BPE_Tokenizer_30K.json")
+ else: tokenizer= tokenizer_training_fn(dataset)
+
+
+ # test
+ test_texts= ["این یک متن آزمایشی برای بررسی عملکرد توکنایزر است.",
+              "دیروز به کتابخانه رفتم و کتابی درباره تاریخ ایران باستان خواندم.",
+              "سلام! چطور می‌توانم به سرعت زبان فارسی را یاد بگیرم؟",
+              "هوای تهران امروز خیلی گرم و آفتابی است، ولی شب خنک می‌شود."]
+
+ for i, text in enumerate(test_texts):
+     encoded_text= tokenizer.encode(text)
+     print(10 * "--", f" test {i+1}", 10 * "--")
+     print("text: ", text)
+     print("tokens: ", encoded_text.tokens)
+     print("ids: ", encoded_text.ids)
+     # print("decoded: ", tokenizer.decode(encoded_text.ids))
+     tokens = tokenizer.decode_batch([[id] for id in encoded_text.ids])
+     print("decoded:", ' '.join(tokens))
+
+ # evaluation criteria
+ print(70*"-"), print("evaluation"), print(70*"-")
+ unk_rate, compression_ratio= evaluation_fn(tokenizer, dataset, 100_000)
+ print(f"unk rate: {unk_rate}%")
+ print(f"compression ratio: {compression_ratio}")
packages.py ADDED
@@ -0,0 +1,7 @@
+ import os, re
+ from zipfile import ZipFile
+
+ import requests
+ import pandas as pd
+ from datasets import load_dataset  # used by preprocess.py
+ from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors, decoders
+ from hazm import Normalizer
preprocess.py ADDED
@@ -0,0 +1,66 @@
+ from packages import *
+ from utils import *
+
+ def preprocess_fn(dataset: pd.DataFrame, normalization= False, show_sample= False) -> pd.DataFrame:
+     # Remove duplicate texts
+     dataset= dataset.drop_duplicates(subset='text')
+     dataset= dataset.copy()
+     # Remove URLs
+     dataset['text']= dataset['text'].apply(lambda x: re.sub(r'http\S+|www\S+', '', x) if isinstance(x, str) else x)
+     # Remove HTML tags
+     dataset['text']= dataset['text'].apply(lambda x: re.sub(r'<[^>]+>', '', x) if isinstance(x, str) else x)
+     # Remove emojis and characters outside the Arabic/Persian block (keep whitespace and basic punctuation)
+     dataset['text']= dataset['text'].apply(lambda x: re.sub(r'[^\u0600-\u06FF\s.,!?]', '', x) if isinstance(x, str) else x)
+     # Remove English text
+     dataset['text']= dataset['text'].apply(lambda x: re.sub(r'[a-zA-Z]+', '', x) if isinstance(x, str) else x)
+     # Remove empty or invalid texts
+     dataset= dataset[dataset['text'].notna() & (dataset['text'].str.strip() != '')]
+     # Collapse extra whitespace
+     dataset['text']= dataset['text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip() if isinstance(x, str) else x)
+     # Normalization (hazm)
+     if normalization:
+         normalizer= Normalizer(persian_numbers= True)
+         dataset['text']= dataset['text'].apply(lambda x: normalizer.normalize(x) if isinstance(x, str) else x)
+     if show_sample:
+         print(dataset['text'].sample(n= 3).to_list())
+     return dataset
+
+
+ def uniform_length(dataset: pd.DataFrame, target_length= 1000) -> pd.DataFrame:
+     # Concatenate all texts, then re-split into ~target_length-character chunks on whitespace boundaries.
+     full_text= ' '.join(dataset['text'].dropna().astype(str))
+     chunks, start= [], 0
+     while start < len(full_text):
+         end= min(start + target_length, len(full_text))
+         while end < len(full_text) and full_text[end] not in [' ', '\n']:
+             end += 1
+         chunks.append(full_text[start: end].strip())
+         start= end + 1
+     return pd.DataFrame(chunks, columns= ['text'])
+
+
+ def preprocess_pipeline_fn(eda= False, save_dataset= True, shuffle= False):
+     ## load datasets
+     wiki_dataset= load_dataset("wikimedia/wikipedia", "20231101.fa")
+     wiki_dataset= pd.DataFrame(wiki_dataset['train']['text'], columns= ['text'])
+     download_blog_dataset()  # download and extract blogs.zip if not already present
+     blog_dataset= pd.read_csv("./blogs/blogs.csv")
+     homorich_dataset= load_dataset("MahtaFetrat/HomoRich-G2P-Persian", verification_mode= "no_checks")
+     homorich_dataset= pd.DataFrame(homorich_dataset["train"]["Grapheme"], columns= ['text'])
+     print(70*"-"), print("downloading & loading datasets is done."), print(70*"-")
+
+     ## preprocess
+     dataset_names= ["Wikipedia", "Persian Blog", "HomoRich"]
+     datasets= [wiki_dataset, blog_dataset, homorich_dataset]
+     del wiki_dataset, blog_dataset, homorich_dataset
+     for i in range(len(datasets)):
+         datasets[i]= uniform_length(preprocess_fn(datasets[i]))
+         if eda:
+             eda_dataset(datasets[i], dataset_names[i])
+             show_short_samples(datasets[i], dataset_names[i])
+
+     ## merge datasets
+     merged_dataset= pd.concat([datasets[0], datasets[1], datasets[2]], ignore_index= True)
+     del datasets
+     if shuffle: merged_dataset= merged_dataset.sample(frac= 1).reset_index(drop= True)
+     print(70*"-"), print("preprocessing is done."), print(70*"-")
+     if save_dataset: merged_dataset.to_csv('merged_dataset.csv', index= False, encoding= 'utf-8-sig')
+
+     return merged_dataset
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ tokenizers==0.21.1
+ datasets==2.21.0
+ hazm==0.10.0
+ pandas==2.0.3
+ requests==2.32.3
tokenizer_training.py ADDED
@@ -0,0 +1,24 @@
+ from packages import *
+
+ def tokenizer_training_fn(dataset: pd.DataFrame, vocab_size= 30_000, min_frequency= 5, pre_tokenizer= True) -> Tokenizer:
+     unk, eos= "[UNK]", "<|endoftext|>"
+     tokenizer= Tokenizer(models.BPE(unk_token= unk))
+     if pre_tokenizer: tokenizer.pre_tokenizer= pre_tokenizers.Whitespace()
+     trainer= trainers.BpeTrainer(vocab_size= vocab_size, min_frequency= min_frequency,
+                                  special_tokens= [unk, eos])
+
+     tokenizer.train_from_iterator(dataset["text"], trainer)
+     print(10 * "--", " vocab size ", 10 * "--")
+     print(tokenizer.get_vocab_size())
+
+     tokenizer.post_processor= processors.TemplateProcessing(
+         single= f"{eos} $A {eos}",
+         special_tokens= [(eos, tokenizer.token_to_id(eos))]
+     )
+     tokenizer.decoder= decoders.BPEDecoder()
+
+     tokenizer.save(f"Persian_BPE_Tokenizer_{vocab_size//1000}K.json")
+     print(70*"-"), print("tokenizer training is complete and saved."), print(70*"-")
+
+     return tokenizer
utils.py ADDED
@@ -0,0 +1,59 @@
+ from packages import *
+
+ def download_blog_dataset():
+     url= "https://huggingface.co/datasets/RohanAiLab/persian_blog/resolve/main/blogs.zip"
+     output_path= "blogs.zip"
+     extract_folder= "blogs"
+
+     if not os.path.exists(output_path):
+         print("Downloading blogs.zip...")
+         r= requests.get(url, stream= True)
+         with open(output_path, "wb") as f:
+             for chunk in r.iter_content(chunk_size=8192):
+                 if chunk:
+                     f.write(chunk)
+         print("Download completed.")
+
+     if not os.path.exists(extract_folder):
+         print("Extracting blogs.zip...")
+         with ZipFile(output_path, 'r') as zip_ref:
+             zip_ref.extractall(extract_folder)
+         print("Extraction completed.")
+
+
+ def eda_dataset(df, name):
+     text_column= "text"
+     print(f"--- EDA for {name} ---")
+     print(f"Number of samples: {len(df)}")
+     print(f"Number of empty samples: {df[text_column].isna().sum() + (df[text_column].str.strip() == '').sum()}")
+     print(f"Mean text length: {df[text_column].str.len().mean():.2f}")
+     print(f"Max text length: {df[text_column].str.len().max()}")
+     print(f"Min text length: {df[text_column].str.len().min()}")
+     # print(f"Number of unique words: {len(set(' '.join(df[text_column].dropna().astype(str)).split()))}")
+     # print(f"Sample text: {df[text_column].iloc[0][:100] if isinstance(df[text_column].iloc[0], str) else 'invalid'}")
+     print("\n")
+
+
+ def show_short_samples(df, dataset_name, max_length= 1000, num_samples= 5):
+     print(f"--- Samples shorter than {max_length} characters in {dataset_name} ---")
+     short_samples = df[df['text'].str.len() < max_length]
+     print(f"Number of samples shorter than {max_length} characters: {len(short_samples)}")
+     if len(short_samples) > 0:
+         for i, text in enumerate(short_samples['text'].head(num_samples)):
+             print(f"Sample {i+1}: {text[:100]} (length: {len(text)} characters)")
+     else:
+         print(f"No samples shorter than {max_length} characters were found.")
+     print("\n")
+
+
+ def evaluation_fn(tokenizer: Tokenizer, dataset: pd.DataFrame, samples_number= 10000):
+     texts= dataset['text'].sample(n= samples_number).to_list()
+     # Encode each sampled text once and reuse the token lists for both metrics.
+     encodings= [tokenizer.encode(t).tokens for t in texts]
+     total_tokens= sum(len(tokens) for tokens in encodings)
+     unk_count= sum(tokens.count("[UNK]") for tokens in encodings)
+     unk_rate= (unk_count / total_tokens) * 100
+
+     # Compression ratio: average number of characters represented by one token.
+     compression_ratio= sum(len(t) for t in texts) / total_tokens
+
+     return unk_rate, compression_ratio