#! /usr/bin/python3 src="HPLT/hplt_bert_base_tr" tgt="KoichiYasuoka/ltgbert-base-turkish-ud-goeswith" url="https://github.com/UniversalDependencies/UD_Turkish-" import os for e in ["Kenet","Penn","BOUN","Tourism","IMST","Atis","FrameNet"]: u=url+e d=os.path.basename(u) os.system("test -d "+d+" || git clone --depth=1 "+u) os.system("for F in train dev test ; do cat UD_Turkish-*/*-$F.conllu > $F.conllu ; done") class UDgoeswithDataset(object): def __init__(self,conllu,tokenizer): self.ids,self.tags,label=[],[],set() with open(conllu,"r",encoding="utf-8") as r: cls,sep,msk=tokenizer.cls_token_id,tokenizer.sep_token_id,tokenizer.mask_token_id dep,c,m="-|_|dep",[],[0,1] for s in r: t=s.split("\t") if len(t)==10: if t[0].isdecimal(): i=int(t[0]) if not m[0]=0: m=[i,i+1] elif t[0].find("-")>0: m=[int(i) for i in t[0].split("-")] elif c!=[]: v=tokenizer([t[1] for t in c],add_special_tokens=False)["input_ids"] for i in range(len(v)-1,-1,-1): for j in range(1,len(v[i])): c.insert(i+1,[c[i][0],"_","_","X","_","_",c[i][0],"goeswith","_","_"]) y=["0"]+[t[0] for t in c] h=[i if t[6]=="0" else y.index(t[6]) for i,t in enumerate(c,1)] p,v=[t[3]+"|"+t[5]+"|"+t[7] for t in c],sum(v,[]) if len(v)
# NOTE(review): the line above is the file's original content reproduced verbatim.
# It is an entire Python training script whose newlines were collapsed into spaces,
# so as it stands it is NOT valid Python and cannot be executed or safely restyled.
# What the tokens show it logically does:
#   1. For each UD_Turkish-* treebank name in the list (Kenet, Penn, BOUN, Tourism,
#      IMST, Atis, FrameNet), shallow-clone the matching UniversalDependencies
#      GitHub repo via os.system() unless a directory of that name already exists.
#   2. Concatenate every repo's *-train/dev/test.conllu into train.conllu,
#      dev.conllu and test.conllu via a shell "for" loop.
#   3. Define class UDgoeswithDataset, whose __init__(conllu, tokenizer) reads a
#      CoNLL-U file (10-column tab-separated rows), runs the surface forms through
#      the tokenizer with add_special_tokens=False, and inserts extra rows with
#      deprel "goeswith" after any word that tokenizes into multiple subword
#      pieces; it also builds head indices (h) and "UPOS|FEATS|DEPREL" tags (p).
# NOTE(review): the fragment "if not m[0]=0:" is not valid Python even before the
# newline collapse — given m=[0,1] and the later "m=[int(i) for i in t[0].split('-')]"
# multiword-token handling, this was presumably a range comparison (something like
# "m[0] <= i < m[1]") that lost characters; recover it from the original script
# rather than guessing a replacement here.
# NOTE(review): the chunk is truncated mid-expression at "if len(v)" — the remainder
# of __init__ and everything after the class are outside this view, so no behavioral
# reconstruction of the mangled code is attempted in this edit; only these comments
# were added and the code bytes are untouched.