#! /usr/bin/python3
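# Derive a Classical Chinese masked-LM that accepts both traditional and
# simplified characters from the traditional-only source checkpoint below.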
src="KoichiYasuoka/modernbert-large-classical-chinese-traditional"
tgt="KoichiYasuoka/modernbert-large-classical-chinese"
import torch
from transformers import BertTokenizerFast,AutoModelForMaskedLM
from esupar.tradify import tradify
from tokenizers.pre_tokenizers import Sequence,Whitespace,Split
from tokenizers import Regex
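# Load the source tokenizer and masked-LM model to be extended.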
tkz=BertTokenizerFast.from_pretrained(src)
mdl=AutoModelForMaskedLM.from_pretrained(src)
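# esupar's tradify table pairs simplified and traditional character forms;
# add_tokens returns how many of the two forms were actually new, so c keeps
# only the pairs where one form was already in the vocabulary and the other was not.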
c=[(k,v) for k,v in tradify.items() if tkz.add_tokens([k,v])==1]
e=mdl.resize_token_embeddings(len(tkz))
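# Copy the embedding row of the pre-existing character (lower token id) onto its
# newly added counterpart (appended at the end of the vocabulary, hence the higher id).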
with torch.no_grad():
  for k,v in c:
    t=sorted(tkz.convert_tokens_to_ids([k,v]))
    e.weight[t[1],:]=e.weight[t[0],:]
mdl.set_input_embeddings(e)
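# Save the resized model; safe_serialization=False writes pytorch_model.bin
# instead of safetensors.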
mdl.save_pretrained(tgt,safe_serialization=False)
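# Dump the enlarged vocabulary so a fresh BertTokenizerFast can be rebuilt from it.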
with open(tgt+"/vocab.txt","w",encoding="utf-8") as w:
  print("\n".join(tkz.convert_ids_to_tokens(range(len(tkz)))),file=w)
s=["[CLS]","[PAD]","[SEP]","[UNK]","[MASK]"]
tkz=BertTokenizerFast(vocab_file=tgt+"/vocab.txt",never_split=s,do_lower_case=False,strip_accents=False,tokenize_chinese_chars=True)
tkz.backend_tokenizer.pre_tokenizer=Sequence([Whitespace(),Split(Regex("."),"isolated")])
tkz.backend_tokenizer.decoder.prefix=tkz.backend_tokenizer.model.continuing_subword_prefix=""
tkz.save_pretrained(tgt)
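# Quick check of the converted model, kept as comments so the conversion script
# itself is unchanged (the example sentence is only an illustration; any
# Classical Chinese sentence containing [MASK] would do):
#   from transformers import pipeline
#   fmp=pipeline("fill-mask",model=tgt)
#   print(fmp("孟子[MASK]梁惠王"))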