Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3 | |
import os | |
from data import MultilingualBatchingCachingDataset | |
from aux import log, CmdlineArgs | |
from legacy.langconv import lang_set_maybe_smugri | |
from modelops import to_cpl_spec | |
from tokops import load_tokenizer | |
""" | |
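# NOTE: disabled legacy helper, kept for reference only; the script now calls
# `load_tokenizer` instead. `AutoTokenizer` and `hf_tok` are not imported or
# defined in the visible part of this file.
def load_hf_tok(mdl_id, tok_id=None, verbose=False):
    if tok_id is None:
        tok_id = mdl_id
    tokenizer = AutoTokenizer.from_pretrained(tok_id, token=hf_tok)
    return tokenizer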
""" | |
def _cmdline_args():
    """Parse and validate the command-line arguments of the caching script.

    Positional arguments are ``mdl_id``, ``train_file``, ``langs`` and
    ``cache_path``; ``langs`` is declared with ``lang_set_maybe_smugri`` as
    its type. The optional keyword arguments and their defaults are listed in
    ``kw_args`` below.

    Returns:
        The ``CmdlineArgs`` object. If ``anchor_langs`` was supplied, it has
        been replaced by the result of ``lang_set_maybe_smugri``.

    Raises:
        FileExistsError: if ``cache_path`` already exists, so that a
            previously written cache is never overwritten.
    """
    description = """Pre-tokenize data and cache the results"""

    pos_args = ["mdl_id", "train_file", "langs", "cache_path"]
    pos_types = [str, str, lang_set_maybe_smugri, str]

    kw_args = {
        "anchor_mdl_id": None,
        "anchor_langs": None,
        "batch_size": 16,
        "shard_size": 100000,
        "exclude_set": None,
        "max_snt_len": 1024,
        "sort_by_len": False,
    }

    args = CmdlineArgs(description, pos_arg_list=pos_args, pos_arg_types=pos_types, kw_arg_dict=kw_args)

    # Post-process the arguments: anchor_langs defaults to None, so it is only
    # converted when the user actually passed a value.
    if args.anchor_langs is not None:
        args.anchor_langs = lang_set_maybe_smugri(args.anchor_langs)

    # Refuse to continue if args.cache_path already exists. FileExistsError is
    # an Exception subclass, so callers catching Exception are unaffected.
    if os.path.exists(args.cache_path):
        raise FileExistsError(f"Save location '{args.cache_path}' already exists, don't want to overwrite")

    log(f"Launched as {args}")

    return args
def oh_look_another_do_main_function():
    """Script entry point: tokenize the training data and cache the result.

    Steps, in order:
      1. parse the command line via ``_cmdline_args``;
      2. load the tokenizer for ``mdl_id`` and build its coupling specs with
         ``to_cpl_spec``;
      3. if ``anchor_mdl_id`` is given, load that tokenizer too and append the
         specs built for ``anchor_langs``;
      4. create a ``MultilingualBatchingCachingDataset`` over ``train_file``
         and call ``load_and_cache_data`` with ``cache_path``.
    """
    cli = _cmdline_args()

    log("loading tokenizer")
    tok, postok = load_tokenizer(cli.mdl_id)
    specs = to_cpl_spec(cli.langs, None, tok, postok, None)

    use_anchor = cli.anchor_mdl_id is not None
    if use_anchor:
        log("loading anchor model tokenizer")
        anchor_tok, anchor_postok = load_tokenizer(cli.anchor_mdl_id)
        anchor_specs = to_cpl_spec(cli.anchor_langs, None, anchor_tok, anchor_postok, None)
        # Augmented assignment on purpose: keeps the original in-place append.
        specs += anchor_specs

    dataset = MultilingualBatchingCachingDataset(cli.train_file, specs, cli)
    dataset.load_and_cache_data(cli.cache_path)
# Run the caching pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    oh_look_another_do_main_function()