#! /usr/bin/python3
"""Prepare a Ukrainian ModernBERT pretraining run.

Copies the tokenizer from an existing Ukrainian UD model into the target
repository directory, writes a DeepSpeed training script (train.py) that
pretrains a ModernBERT masked-LM on the Goader/kobza corpus, then launches it.
"""
import os
import subprocess

from transformers import LlamaTokenizerFast

# Target output directory / model id for the new base model.
tgt = "KoichiYasuoka/modernbert-base-ukrainian"

# Reuse the tokenizer of the large Ukrainian UD model for the new base model.
tkz = LlamaTokenizerFast.from_pretrained(
    "KoichiYasuoka/modernbert-large-ukrainian-ud-embeds"
)
tkz.save_pretrained(tgt)

# Emit the training script. It runs under DeepSpeed (see its shebang).
# The generated text is written verbatim; "\\n" below must stay escaped so
# that train.py itself contains the two-character sequence "\n" to strip
# newlines from the corpus text at training time.
with open("train.py", "w", encoding="utf-8") as w:
    print(f'#! /usr/bin/env deepspeed\ntgt="{tgt}"'+'''
from transformers import LlamaTokenizerFast,ModernBertForMaskedLM,AutoConfig,DataCollatorForLanguageModeling,TrainingArguments,Trainer
tkz=LlamaTokenizerFast.from_pretrained(tgt)
c={"vocab_size":len(tkz),"tokenizer_class":type(tkz).__name__}
for k,v in tkz.special_tokens_map.items():
  c[k+"_id"]=tkz.convert_tokens_to_ids(v)
cfg=AutoConfig.from_pretrained("KoichiYasuoka/modernbert-base-classical-chinese",**c)
arg=TrainingArguments(num_train_epochs=1,per_device_train_batch_size=1,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,save_safetensors=False)
class trainTextDS(object):
  def __init__(self,dataset,tokenizer):
    from datasets import load_dataset
    self.tokenizer=tokenizer
    self.dataset=load_dataset(dataset)["train"]
  __len__=lambda self:int(len(self.dataset)/32)
  __getitem__=lambda self,i:self.tokenizer(" ".join(self.dataset[i*32:i*32+32]["text"]).replace("\\n"," "),truncation=True,add_special_tokens=True,max_length=8190)
trn=Trainer(args=arg,data_collator=DataCollatorForLanguageModeling(tkz),model=ModernBertForMaskedLM(cfg),train_dataset=trainTextDS("Goader/kobza",tkz))
trn.train()
trn.save_model(tgt)''',file=w)

# Make the generated script executable and run it. This replaces the former
# shell pipeline os.system("chmod 755 train.py ; ./train.py") with stdlib
# calls and a no-shell argument list; check=False keeps the original
# behavior of ignoring the child's exit status.
os.chmod("train.py", 0o755)
subprocess.run(["./train.py"], check=False)