#! /usr/bin/python3
src="KoichiYasuoka/modernbert-small-classical-chinese-traditional"
tgt="KoichiYasuoka/modernbert-small-classical-chinese"
import torch
from transformers import BertTokenizerFast,AutoModelForMaskedLM
from esupar.tradify import tradify
from tokenizers.pre_tokenizers import Sequence,Whitespace,Split
from tokenizers import Regex

# Load the traditional-character source model and its tokenizer
tkz=BertTokenizerFast.from_pretrained(src)
mdl=AutoModelForMaskedLM.from_pretrained(src)

# From esupar's tradify table, keep the simplified/traditional pairs in which
# exactly one of the two characters is newly added to the vocabulary
c=[(k,v) for k,v in tradify.items() if tkz.add_tokens([k,v])==1]

# Enlarge the embedding matrix, then copy the embedding of the already-known
# character (lower token id) onto its newly added counterpart (higher token id)
e=mdl.resize_token_embeddings(len(tkz))
with torch.no_grad():
  for k,v in c:
    t=sorted(tkz.convert_tokens_to_ids([k,v]))
    e.weight[t[1],:]=e.weight[t[0],:]
mdl.set_input_embeddings(e)
mdl.save_pretrained(tgt,safe_serialization=False)

# Write out the enlarged vocabulary and rebuild the tokenizer on top of it
with open(tgt+"/vocab.txt","w",encoding="utf-8") as w:
  print("\n".join(tkz.convert_ids_to_tokens(range(len(tkz)))),file=w)
s=["[CLS]","[PAD]","[SEP]","[UNK]","[MASK]"]
tkz=BertTokenizerFast(vocab_file=tgt+"/vocab.txt",never_split=s,do_lower_case=False,strip_accents=False,tokenize_chinese_chars=True)

# Tokenize character by character: split on whitespace, isolate every remaining
# character, and drop the "##" continuing-subword prefix
tkz.backend_tokenizer.pre_tokenizer=Sequence([Whitespace(),Split(Regex("."),"isolated")])
tkz.backend_tokenizer.decoder.prefix=tkz.backend_tokenizer.model.continuing_subword_prefix=""
tkz.save_pretrained(tgt)
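
After the script finishes, the converted model and tokenizer sit together under the tgt directory, so they can be probed with a standard fill-mask pipeline. The snippet below is a minimal sanity-check sketch, not part of the original script; the test sentence is only illustrative.

#! /usr/bin/python3
# Sanity-check sketch (assumption: run after the conversion script above)
from transformers import pipeline
tgt="KoichiYasuoka/modernbert-small-classical-chinese"
fmp=pipeline("fill-mask",model=tgt)  # loads the model and tokenizer saved by the script
print(fmp("孟子[MASK]梁惠王"))  # candidate characters for the masked position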