#! /usr/bin/python3
# Build modernbert-base-ukrainian: copy an existing Ukrainian tokenizer,
# generate a DeepSpeed training script, and run it
tgt="KoichiYasuoka/modernbert-base-ukrainian"
import os
from transformers import LlamaTokenizerFast
# Reuse the tokenizer already packaged with the large Ukrainian UD model
tkz=LlamaTokenizerFast.from_pretrained("KoichiYasuoka/modernbert-large-ukrainian-ud-embeds")
tkz.save_pretrained(tgt)
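# Optional inspection, a sketch added for illustration (not in the original
# release): check how the copied tokenizer splits an arbitrary Ukrainian phrase
#   print(tkz.tokenize("Добрий день"))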
with open("train.py","w",encoding="utf-8") as w:
  print(f'#! /usr/bin/env deepspeed\ntgt="{tgt}"'+'''
from transformers import LlamaTokenizerFast,ModernBertForMaskedLM,AutoConfig,DataCollatorForLanguageModeling,TrainingArguments,Trainer
tkz=LlamaTokenizerFast.from_pretrained(tgt)
c={"vocab_size":len(tkz),"tokenizer_class":type(tkz).__name__}
for k,v in tkz.special_tokens_map.items():
c[k+"_id"]=tkz.convert_tokens_to_ids(v)
cfg=AutoConfig.from_pretrained("KoichiYasuoka/modernbert-base-classical-chinese",**c)
arg=TrainingArguments(num_train_epochs=1,per_device_train_batch_size=1,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,save_safetensors=False)
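# One long sequence per device per step; the generated train.py runs under
# the deepspeed launcher via its shebang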
class trainTextDS(object):
  # Chunk a Hugging Face text corpus into groups of 32 raw lines, each
  # tokenized as a single long training example
  def __init__(self,dataset,tokenizer):
    from datasets import load_dataset
    self.tokenizer=tokenizer
    self.dataset=load_dataset(dataset)["train"]
  __len__=lambda self:int(len(self.dataset)/32)
  __getitem__=lambda self,i:self.tokenizer(" ".join(self.dataset[i*32:i*32+32]["text"]).replace("\\n"," "),truncation=True,add_special_tokens=True,max_length=8190)
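# Masked-LM training over the Goader/kobza Ukrainian corpus;
# DataCollatorForLanguageModeling applies random masking (15% by default)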
trn=Trainer(args=arg,data_collator=DataCollatorForLanguageModeling(tkz),model=ModernBertForMaskedLM(cfg),train_dataset=trainTextDS("Goader/kobza",tkz))
trn.train()
trn.save_model(tgt)''',file=w)
# Make the generated script executable and run it (the deepspeed launcher is
# invoked through train.py's shebang)
os.system("chmod 755 train.py ; ./train.py")
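
# Optional sanity check, a minimal sketch added for illustration (not part of
# the original release): load the freshly trained model with a fill-mask
# pipeline and complete an arbitrary Ukrainian sentence, assuming the saved
# tokenizer defines a mask token
from transformers import pipeline
fmp=pipeline("fill-mask",model=tgt)
print(fmp("Зараз ми тестуємо "+fmp.tokenizer.mask_token+" модель."))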