"""Tokenize a text file line by line with an existing tokenizer.

Each input line is stripped, encoded with the tokenizer loaded via
``load_asm_tok``, and written to the output file as a single line of
space-separated tokens. The largest per-line token count is printed at
the end.

Because of the relative import below, this file has to be executed as a
module of its package (``python -m <package>.<module> -t ... -i ... -o ...``).
"""
import argparse

from tqdm import tqdm

from .bpe import load_asm_tok


def main():
    """Parse the command line, tokenize the input file and report the max length."""
    parser = argparse.ArgumentParser("Tokenize using existing tokenizer")
    parser.add_argument("-t", "--tokenizer", required=True, help="existing tokenizer")
    parser.add_argument("-i", "--input", required=True, help="input file")
    parser.add_argument("-o", "--output", required=True, help="output file")
    args = parser.parse_args()

    asm_tok = load_asm_tok(args.tokenizer)
    max_asm_toks = 0

    # An explicit encoding keeps reads and writes independent of the platform
    # locale; without it, tokens containing non-ASCII characters could fail to
    # encode (or be written differently) on systems whose default is not UTF-8.
    with open(args.input, "r", encoding="utf-8") as asmf, open(
        args.output, "w", encoding="utf-8"
    ) as asmtokf:
        for asm in tqdm(asmf, desc="Tokenizing"):
            # The result of encode() is only used through its ``tokens``
            # attribute, a sequence of token strings.
            tokens = asm_tok.encode(asm.strip()).tokens
            max_asm_toks = max(max_asm_toks, len(tokens))
            # Exactly one output line is written per input line (blank lines
            # included), so both files stay aligned line by line.
            asmtokf.write(" ".join(tokens) + "\n")

    print("Maximum tokens:", max_asm_toks)


if __name__ == "__main__":
    main()