import pandas as pd
from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers


def tokenizer_training_fn(
    dataset: pd.DataFrame,
    vocab_size: int = 30_000,
    min_frequency: int = 5,
    pre_tokenizer: bool = True,
) -> Tokenizer:
    unk, eos = "[UNK]", "<|endoftext|>"

    # Byte-Pair Encoding model with an explicit unknown token.
    tokenizer = Tokenizer(models.BPE(unk_token=unk))
    if pre_tokenizer:
        # Split raw text on whitespace and punctuation before learning BPE merges.
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=[unk, eos],
    )
    tokenizer.train_from_iterator(dataset["text"], trainer)

    print(10 * "--", " vocab size ", 10 * "--")
    print(tokenizer.get_vocab_size())

    # Wrap each encoded sequence in end-of-text tokens; undo BPE merges on decode.
    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{eos} $A {eos}",
        special_tokens=[(eos, tokenizer.token_to_id(eos))],
    )
    tokenizer.decoder = decoders.BPEDecoder()

    tokenizer.save(f"Persian_BPE_Tokenizer_{vocab_size // 1000}K.json")
    print(70 * "-")
    print("Tokenizer training is complete and saved.")
    print(70 * "-")
    return tokenizer
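
# A minimal usage sketch, assuming a toy DataFrame with a "text" column; the
# two sample sentences below are hypothetical, not from the original dataset.
if __name__ == "__main__":
    sample = pd.DataFrame({"text": ["سلام دنیا", "این یک آزمایش است"]})  # hypothetical data
    tok = tokenizer_training_fn(sample, vocab_size=1_000, min_frequency=1)
    enc = tok.encode("سلام دنیا")
    print(enc.tokens)           # wrapped as <|endoftext|> ... <|endoftext|> by the post-processor
    print(tok.decode(enc.ids))  # special tokens are skipped on decode by default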