import sentencepiece as spm
from transformers import T5Tokenizer

# Corpus files: DNA, English, and protein text, one sentence per line
corpus_files = ["dna_4g.txt", "eng_4g.txt", "protein_4g.txt"]

# Count the total number of lines (sentences) across the corpus files
total_sentences = 0
for file in corpus_files:
    with open(file, 'r', encoding='utf-8') as f:
        total_sentences += sum(1 for _ in f)
print(f"Total sentences: {total_sentences}")

# Use roughly one third of all sentences (sampled after shuffling) for training
sampled_sentences = total_sentences // 3

spm.SentencePieceTrainer.train(
    input=corpus_files,
    model_prefix="spm_gene_eng",
    model_type="unigram",
    vocab_size=90000,
    # Special-token IDs follow the T5 convention: pad=0, eos=1, unk=2, no bos
    pad_id=0,
    bos_id=-1,
    eos_id=1,
    unk_id=2,
    # Register the 100 T5 sentinel tokens (<extra_id_99> ... <extra_id_0>) as single pieces
    user_defined_symbols=",".join([f"<extra_id_{i}>" for i in range(99, -1, -1)]),
    # Train on a shuffled subsample of the corpus
    input_sentence_size=sampled_sentences,
    shuffle_input_sentence=True,
    character_coverage=1.0,
    train_extremely_large_corpus=True,
    num_threads=64,
)
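
# Optional sanity check (not part of the original script): load the raw
# SentencePiece model produced above and inspect its piece count and the
# first few IDs, which should hold <pad>, </s>, <unk> followed by the
# sentinel tokens.
sp = spm.SentencePieceProcessor(model_file="spm_gene_eng.model")
print(f"SentencePiece pieces: {sp.get_piece_size()}")
print([sp.id_to_piece(i) for i in range(5)])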

# Wrap the trained SentencePiece model in a Hugging Face T5 tokenizer
trained_tokenizer = T5Tokenizer("spm_gene_eng.model")
print(f"Vocabulary size: {trained_tokenizer.vocab_size}")

# Save the tokenizer in Hugging Face format
trained_tokenizer.save_pretrained("trained_t5_gene_eng_tokenizer")
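
# Not in the original script: the saved directory can be reloaded later with
# from_pretrained, which is how downstream training code would pick it up.
reloaded_tokenizer = T5Tokenizer.from_pretrained("trained_t5_gene_eng_tokenizer")
print(f"Reloaded vocabulary size: {reloaded_tokenizer.vocab_size}")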

# Quick check: tokenize a DNA sequence with the new tokenizer
text = "TGGATAACATACGGTATAAGGTTTTGATCACTATAGTTTTGTAATATAGCTTGAAATTAAGAAGTGTGATGCCTCCAGGCTTGTTCT"
print(trained_tokenizer.tokenize(text))
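
# Additional illustration (not in the original script): encode the same sequence
# to token IDs and decode them back to check the round trip.
ids = trained_tokenizer.encode(text)
print(ids)
print(trained_tokenizer.decode(ids, skip_special_tokens=True))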