|
from tqdm import tqdm |
|
|
|
from .bpe import load_asm_tok |
|
|
|
if __name__ == "__main__":
    # Command-line entry point: encode every line of the input file with an
    # already-existing tokenizer, write the resulting tokens (space-separated,
    # one output line per input line) to the output file, and report the
    # largest number of tokens produced for any single line.
    import argparse

    parser = argparse.ArgumentParser("Tokenize using existing tokenizer")
    parser.add_argument("-t", "--tokenizer", required=True, help="existing tokenizer")
    parser.add_argument("-i", "--input", required=True, help="input file")
    parser.add_argument("-o", "--output", required=True, help="output file")
    args = parser.parse_args()

    # Running maximum of the per-line token counts seen so far.
    max_asm_toks = 0

    # NOTE(review): load_asm_tok is defined in the sibling `bpe` module; the
    # only thing this script relies on is that the returned object exposes
    # `encode(str)` whose result has a `.tokens` sequence of strings.
    asm_tok = load_asm_tok(args.tokenizer)

    # Pin the encoding explicitly: without it both files use the platform
    # default, so decoding the input and writing tokens that contain
    # non-ASCII characters would behave differently (or fail) across locales.
    with open(args.input, "r", encoding="utf-8") as asmf, \
            open(args.output, "w", encoding="utf-8") as asmtokf:
        # The file is streamed line by line, so memory use does not grow with
        # the size of the input.
        for asm in tqdm(asmf, desc="Tokenizing"):
            # Drop the trailing newline and any surrounding whitespace before
            # encoding.
            asm = asm.strip()
            asm_enc = asm_tok.encode(asm)
            max_asm_toks = max(max_asm_toks, len(asm_enc.tokens))

            # One output line per input line keeps both files line-aligned.
            asm_seq = " ".join(asm_enc.tokens)
            asmtokf.write(asm_seq + "\n")

    print("Maximum tokens:", max_asm_toks)
|
|
|
|