tgt="KoichiYasuoka/modernbert-base-ukrainian" |
|
import os |
|
from transformers import LlamaTokenizerFast |
|
tkz=LlamaTokenizerFast.from_pretrained("KoichiYasuoka/modernbert-large-ukrainian-ud-embeds") |
|
tkz.save_pretrained(tgt) |
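
# Optional round-trip check (a sketch; the Ukrainian sample text is
# illustrative only): reload the saved tokenizer and inspect its subwords
chk=LlamaTokenizerFast.from_pretrained(tgt)
print(chk.convert_ids_to_tokens(chk("Добрий день")["input_ids"]))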
|
with open("train.py","w",encoding="utf-8") as w: |
|
print(f'#! /usr/bin/env deepspeed\ntgt="{tgt}"'+''' |
|
from transformers import LlamaTokenizerFast,ModernBertForMaskedLM,AutoConfig,DataCollatorForLanguageModeling,TrainingArguments,Trainer |
|
tkz=LlamaTokenizerFast.from_pretrained(tgt) |
|
c={"vocab_size":len(tkz),"tokenizer_class":type(tkz).__name__} |
|
for k,v in tkz.special_tokens_map.items(): |
|
c[k+"_id"]=tkz.convert_tokens_to_ids(v) |
|
cfg=AutoConfig.from_pretrained("KoichiYasuoka/modernbert-base-classical-chinese",**c) |
|
arg=TrainingArguments(num_train_epochs=1,per_device_train_batch_size=1,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,save_safetensors=False) |
|
class trainTextDS(object): |
|
def __init__(self,dataset,tokenizer): |
|
from datasets import load_dataset |
|
self.tokenizer=tokenizer |
|
self.dataset=load_dataset(dataset)["train"] |
|
__len__=lambda self:int(len(self.dataset)/32) |
|
__getitem__=lambda self,i:self.tokenizer(" ".join(self.dataset[i*32:i*32+32]["text"]).replace("\\n"," "),truncation=True,add_special_tokens=True,max_length=8190) |
|
trn=Trainer(args=arg,data_collator=DataCollatorForLanguageModeling(tkz),model=ModernBertForMaskedLM(cfg),train_dataset=trainTextDS("Goader/kobza",tkz)) |
|
trn.train() |
|
trn.save_model(tgt)''',file=w) |
|
os.system("chmod 755 train.py ; ./train.py") |
|
|