Rasmus Lellep
add loader
76b1ec5
raw
history blame
2.09 kB
#!/usr/bin/env python3
import os
from data import MultilingualBatchingCachingDataset
from aux import log, CmdlineArgs
from legacy.langconv import lang_set_maybe_smugri
from modelops import to_cpl_spec
from tokops import load_tokenizer
"""
def load_hf_tok(mdl_id, tok_id=None, verbose=False):
if tok_id is None:
tok_id = mdl_id
tokenizer = AutoTokenizer.from_pretrained(tok_id, token=hf_tok)
return tokenizer
"""
def _cmdline_args():
    """Parse and validate the command line of the pre-tokenization script.

    Positional arguments, in order: ``mdl_id``, ``train_file``, ``langs``
    (declared with type ``lang_set_maybe_smugri``) and ``cache_path``.
    Optional keyword arguments and their defaults are listed in
    ``optional`` below.

    Returns:
        The ``CmdlineArgs`` object; ``anchor_langs`` is replaced by the
        result of ``lang_set_maybe_smugri`` when it was supplied.

    Raises:
        Exception: if ``cache_path`` already exists on disk.
    """
    positional = (
        ("mdl_id", str),
        ("train_file", str),
        ("langs", lang_set_maybe_smugri),
        ("cache_path", str),
    )
    optional = {
        "anchor_mdl_id": None,
        "anchor_langs": None,
        "batch_size": 16,
        "shard_size": 100000,
        "exclude_set": None,
        "max_snt_len": 1024,
        "sort_by_len": False,
    }

    parsed = CmdlineArgs(
        "Pre-tokenize data and cache the results",
        pos_arg_list=[name for name, _ in positional],
        pos_arg_types=[kind for _, kind in positional],
        kw_arg_dict=optional,
    )

    # Post-processing: anchor_langs has no declared type above, so it is
    # converted here, and only when the user actually passed it.
    if parsed.anchor_langs is not None:
        parsed.anchor_langs = lang_set_maybe_smugri(parsed.anchor_langs)

    # Refuse to continue if the cache location exists, to avoid overwriting it.
    if os.path.exists(parsed.cache_path):
        raise Exception(f"Save location '{parsed.cache_path}' already exists, don't want to overwrite")

    log(f"Launched as {parsed}")
    return parsed
def _load_cpl_specs(mdl_id, langs, log_msg):
    """Log ``log_msg``, load the tokenizer of ``mdl_id`` and build its coupling specs.

    Returns whatever ``to_cpl_spec`` produces for ``langs`` and the loaded
    tokenizer / post-tokenizer pair.
    """
    log(log_msg)
    tokenizer, postok = load_tokenizer(mdl_id)
    return to_cpl_spec(langs, None, tokenizer, postok, None)


def oh_look_another_do_main_function():
    """Script entry point: tokenize the training file and cache the result.

    Builds coupling specs for the main model, appends those of the anchor
    model when ``anchor_mdl_id`` was given, then hands everything to
    ``MultilingualBatchingCachingDataset`` and asks it to write the cache
    to ``args.cache_path``.
    """
    args = _cmdline_args()

    coupling_specs = _load_cpl_specs(args.mdl_id, args.langs, "loading tokenizer")

    # The anchor model is optional; its specs are appended to the main ones.
    if args.anchor_mdl_id is not None:
        coupling_specs += _load_cpl_specs(
            args.anchor_mdl_id, args.anchor_langs, "loading anchor model tokenizer"
        )

    dataset = MultilingualBatchingCachingDataset(args.train_file, coupling_specs, args)
    dataset.load_and_cache_data(args.cache_path)
# Run the caching pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    oh_look_another_do_main_function()