src="KoichiYasuoka/modernbert-small-classical-chinese-traditional"
tgt="KoichiYasuoka/modernbert-small-classical-chinese"
import torch
from transformers import BertTokenizerFast,AutoModelForMaskedLM
from esupar.tradify import tradify
from tokenizers.pre_tokenizers import Sequence,Whitespace,Split
from tokenizers import Regex

# Load the traditional-character model and its tokenizer
tkz=BertTokenizerFast.from_pretrained(src)
mdl=AutoModelForMaskedLM.from_pretrained(src)

# Add missing characters from esupar's simplified/traditional table,
# keeping the pairs in which exactly one member was newly added
c=[(k,v) for k,v in tradify.items() if tkz.add_tokens([k,v])==1]

# Enlarge the embedding matrix and copy each lower-ID member's embedding
# onto its higher-ID (newly added) counterpart
e=mdl.resize_token_embeddings(len(tkz))
with torch.no_grad():
  for k,v in c:
    t=sorted(tkz.convert_tokens_to_ids([k,v]))
    e.weight[t[1],:]=e.weight[t[0],:]
mdl.set_input_embeddings(e)
mdl.save_pretrained(tgt,safe_serialization=False)

# Write out the enlarged vocabulary and rebuild a character-level tokenizer on it
with open(tgt+"/vocab.txt","w",encoding="utf-8") as w:
  print("\n".join(tkz.convert_ids_to_tokens(range(len(tkz)))),file=w)
s=["[CLS]","[PAD]","[SEP]","[UNK]","[MASK]"]
tkz=BertTokenizerFast(vocab_file=tgt+"/vocab.txt",never_split=s,do_lower_case=False,strip_accents=False,tokenize_chinese_chars=True)

# Pre-tokenize every non-space character in isolation, drop the "##"
# continuing-subword prefix, and save the tokenizer alongside the model
tkz.backend_tokenizer.pre_tokenizer=Sequence([Whitespace(),Split(Regex("."),"isolated")])
tkz.backend_tokenizer.decoder.prefix=tkz.backend_tokenizer.model.continuing_subword_prefix=""
tkz.save_pretrained(tgt)
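
The derived model and tokenizer can then be loaded like any other masked-language model. A minimal usage sketch follows (the example sentence and top_k value are illustrative, and a transformers release with ModernBERT support is assumed):

# Usage sketch: predict a masked character with the derived model
from transformers import AutoTokenizer,AutoModelForMaskedLM,FillMaskPipeline
tkz=AutoTokenizer.from_pretrained("KoichiYasuoka/modernbert-small-classical-chinese")
mdl=AutoModelForMaskedLM.from_pretrained("KoichiYasuoka/modernbert-small-classical-chinese")
fmp=FillMaskPipeline(model=mdl,tokenizer=tkz)
print(fmp("孟子[MASK]梁惠王",top_k=5))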
|