# t5_gene_model/t5_token_gene_eng.py
import sentencepiece as spm
from transformers import T5Tokenizer
# Corpus file paths (a list, so multiple corpora can be combined)
corpus_files = ["dna_4g.txt", "eng_4g.txt", "protein_4g.txt"]  # add further corpus files here as needed
# Count the total number of sentences (lines) first so a subset can be sampled for training
total_sentences = 0
for file in corpus_files:
    with open(file, 'r', encoding='utf-8') as f:  # assumes UTF-8; adjust encoding if your files differ
        total_sentences += sum(1 for _ in f)
print(f"Total sentences: {total_sentences}")
sampled_sentences = total_sentences // 3  # sample one third of the sentences
# Train a SentencePiece Unigram model
spm.SentencePieceTrainer.train(
    input=corpus_files,                   # list of input files
    model_prefix="spm_gene_eng",          # output prefix: spm_gene_eng.model and spm_gene_eng.vocab
    model_type="unigram",
    vocab_size=90000,                     # target vocab size (larger than T5's usual ~32k to cover DNA, protein, and English)
    pad_id=0,                             # <pad> is id 0
    bos_id=-1,                            # disable BOS (T5 does not use a BOS token)
    eos_id=1,                             # </s> is id 1
    unk_id=2,                             # <unk> is id 2
    user_defined_symbols=",".join([f"<extra_id_{i}>" for i in range(99, -1, -1)]),  # sentinels added in reverse order, so <extra_id_99>..<extra_id_0> take ids 3..102
    input_sentence_size=sampled_sentences,  # number of sentences to sample (0 means use all)
    shuffle_input_sentence=True,          # shuffle before sampling for better coverage
    character_coverage=1.0,               # cover all characters
    train_extremely_large_corpus=True,    # recommended for very large corpora
    num_threads=64,                       # multithreading speeds up training; adjust to your CPU core count
)
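# Optional sanity check (a minimal sketch, assuming the id settings above took effect):
# verify that the trained model assigns the special ids as requested
# (pad=0, eos=1, unk=2, sentinels at 3..102).
sp = spm.SentencePieceProcessor(model_file="spm_gene_eng.model")
print(sp.piece_to_id("<pad>"), sp.piece_to_id("</s>"), sp.piece_to_id("<unk>"))  # expected: 0 1 2
print(sp.piece_to_id("<extra_id_99>"), sp.piece_to_id("<extra_id_0>"))           # expected: 3 102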
# Load the trained model as a T5Tokenizer
trained_tokenizer = T5Tokenizer("spm_gene_eng.model")  # file name must match the model_prefix used above
print(f"Vocabulary size: {trained_tokenizer.vocab_size}")
# Save in Hugging Face format
trained_tokenizer.save_pretrained("trained_t5_gene_eng_tokenizer")
text = "TGGATAACATACGGTATAAGGTTTTGATCACTATAGTTTTGTAATATAGCTTGAAATTAAGAAGTGTGATGCCTCCAGGCTTGTTCT"
print(trained_tokenizer.tokenize(text))
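# Usage sketch (hedged): reload the saved tokenizer directory and round-trip the test
# sequence; "trained_t5_gene_eng_tokenizer" is the directory written by save_pretrained above.
reloaded = T5Tokenizer.from_pretrained("trained_t5_gene_eng_tokenizer")
ids = reloaded.encode(text)  # T5Tokenizer appends </s> to the encoded sequence
print(ids)
print(reloaded.decode(ids, skip_special_tokens=True))   # should reproduce the input sequence
print(reloaded.convert_tokens_to_ids("<extra_id_0>"))   # sentinel id, used for span-corruption pretraining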