#! /usr/bin/python3 src="hplt_bert_base_be" tgt="KoichiYasuoka/ltgbert-base-belarusian-upos" url="https://github.com/UniversalDependencies/UD_Belarusian-HSE" import os from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DataCollatorForTokenClassification,TrainingArguments,Trainer os.system(f"test -d {src} || ( curl -L https://data.hplt-project.org/one/models/encoder/{src}.tar.gz | tar xvzf - )") d=os.path.basename(url) os.system(f"test -d {d} || git clone --depth=1 {url}") os.system("for F in train dev test ; do cp "+d+"/*-$F.conllu $F.conllu ; done") class UPOSFileDataset(object): def __init__(self,conllu,tokenizer): self.conllu=open(conllu,"r",encoding="utf-8") self.tokenizer=tokenizer self.seeks=[0] label=set(["SYM"]) s=self.conllu.readline() while s!="": if s=="\n": self.seeks.append(self.conllu.tell()) else: w=s.split("\t") if len(w)==10: if w[0].isdecimal(): label.add(w[3] if w[5]=="_" else w[3]+"|"+w[5]) s=self.conllu.readline() lid={} for i,l in enumerate(sorted(label)): lid[l],lid["B-"+l],lid["I-"+l]=i*3,i*3+1,i*3+2 self.label2id=lid def __call__(*args): lid={l:i for i,l in enumerate(sorted(set(sum([list(t.label2id) for t in args],[]))))} for t in args: t.label2id=lid return lid def __del__(self): self.conllu.close() __len__=lambda self:len(self.seeks)-1 def __getitem__(self,i): self.conllu.seek(self.seeks[i]) form,upos,space=[],[],[True] while self.conllu.tell()