REMEND / remend /bpe_apply.py
udiboy1209's picture
Add REMEND python module
7145fd6
raw
history blame contribute delete
909 Bytes
from tqdm import tqdm
from .bpe import load_asm_tok
if __name__ == "__main__":
    # Command-line entry point: encode every line of an input file with an
    # existing tokenizer and write the resulting tokens, space-separated,
    # one output line per input line. Reports the longest token sequence seen.
    import argparse

    # The text is passed as `description`; ArgumentParser's first positional
    # parameter is `prog`, which would put this sentence in the usage line.
    parser = argparse.ArgumentParser(
        description="Tokenize using existing tokenizer"
    )
    parser.add_argument(
        "-t", "--tokenizer", required=True, help="existing tokenizer"
    )
    parser.add_argument("-i", "--input", required=True, help="input file")
    parser.add_argument("-o", "--output", required=True, help="output file")
    args = parser.parse_args()

    asm_tok = load_asm_tok(args.tokenizer)
    max_asm_toks = 0

    # Explicit UTF-8 so reading and writing do not depend on the platform's
    # locale encoding (token strings are not guaranteed to be ASCII).
    with open(args.input, "r", encoding="utf-8") as asmf, \
            open(args.output, "w", encoding="utf-8") as asmtokf:
        for asm in tqdm(asmf, desc="Tokenizing"):
            # Drop the trailing newline and surrounding whitespace before
            # encoding.
            asm_enc = asm_tok.encode(asm.strip())
            tokens = asm_enc.tokens
            max_asm_toks = max(max_asm_toks, len(tokens))
            # Always emit a line, even for blank input, so output line i
            # corresponds to input line i.
            asmtokf.write(" ".join(tokens) + "\n")

    print("Maximum tokens:", max_asm_toks)