# t5_gene_model/t5_token_gene_eng.py
import sentencepiece as spm
from transformers import T5Tokenizer
# Corpus file paths (a list, so multiple corpora can be combined)
corpus_files = ["dna_4g.txt", "eng_4g.txt", "protein_4g.txt"]  # add further corpus files here as needed
# Count the total number of sentences (lines) first so a subset can be sampled for training
total_sentences = 0
for file in corpus_files:
    with open(file, 'r', encoding='utf-8') as f:  # assumes UTF-8; adjust encoding if your files differ
        total_sentences += sum(1 for _ in f)
print(f"Total sentences: {total_sentences}")
sampled_sentences = total_sentences // 3  # sample one third of the sentences
# Train a SentencePiece Unigram model
spm.SentencePieceTrainer.train(
    input=corpus_files,                   # list of input files
    model_prefix="spm_gene_eng",          # output prefix: spm_gene_eng.model and spm_gene_eng.vocab
    model_type="unigram",
    vocab_size=90000,                     # target vocab size (larger than T5's usual ~32k to cover DNA, protein, and English)
    pad_id=0,                             # <pad> is id 0
    bos_id=-1,                            # disable BOS (T5 does not use a BOS token)
    eos_id=1,                             # </s> is id 1
    unk_id=2,                             # <unk> is id 2
    user_defined_symbols=",".join([f"<extra_id_{i}>" for i in range(99, -1, -1)]),  # sentinels added in reverse order, so <extra_id_99>..<extra_id_0> take ids 3..102
    input_sentence_size=sampled_sentences,  # number of sentences to sample (0 means use all)
    shuffle_input_sentence=True,          # shuffle before sampling for better coverage
    character_coverage=1.0,               # cover all characters
    train_extremely_large_corpus=True,    # recommended for very large corpora
    num_threads=64,                       # multithreading speeds up training; adjust to your CPU core count
)
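# Optional sanity check (a minimal sketch, assuming the id settings above took effect):
# verify that the trained model assigns the special ids as requested
# (pad=0, eos=1, unk=2, sentinels at 3..102).
sp = spm.SentencePieceProcessor(model_file="spm_gene_eng.model")
print(sp.piece_to_id("<pad>"), sp.piece_to_id("</s>"), sp.piece_to_id("<unk>"))  # expected: 0 1 2
print(sp.piece_to_id("<extra_id_99>"), sp.piece_to_id("<extra_id_0>"))           # expected: 3 102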
# Load the trained model as a T5Tokenizer
trained_tokenizer = T5Tokenizer("spm_gene_eng.model")  # file name must match the model_prefix used above
print(f"Vocabulary size: {trained_tokenizer.vocab_size}")
# Save in Hugging Face format
trained_tokenizer.save_pretrained("trained_t5_gene_eng_tokenizer")
text = "TGGATAACATACGGTATAAGGTTTTGATCACTATAGTTTTGTAATATAGCTTGAAATTAAGAAGTGTGATGCCTCCAGGCTTGTTCT"
print(trained_tokenizer.tokenize(text))
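# Usage sketch (hedged): reload the saved tokenizer directory and round-trip the test
# sequence; "trained_t5_gene_eng_tokenizer" is the directory written by save_pretrained above.
reloaded = T5Tokenizer.from_pretrained("trained_t5_gene_eng_tokenizer")
ids = reloaded.encode(text)  # T5Tokenizer appends </s> to the encoded sequence
print(ids)
print(reloaded.decode(ids, skip_special_tokens=True))   # should reproduce the input sequence
print(reloaded.convert_tokens_to_ids("<extra_id_0>"))   # sentinel id, used for span-corruption pretraining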