REMEND / tokenize.sh
udiboy1209's picture
Add REMEND synthetic data and models :tada:
9c19cff
raw
history blame contribute delete
504 Bytes
#!/bin/bash
# Tokenize and binarize the per-architecture datasets.
#
# For every architecture listed in ARCHS this script:
#   1. runs remend.bpe with the raw dataset directory as input (-i) and the
#      tokenized directory as output (-o),
#   2. copies the <split>.eqn files for every split in SPLITS next to that
#      output,
#   3. runs fairseq-preprocess (source lang "asm", target lang "eqn") on the
#      train/valid/test prefixes, writing into the same tokenized directory.
#
# DS and TOK are relative paths, so run this from the directory that
# contains dataset/.
#
# Each step consumes the previous step's output, so abort on the first
# failure instead of preprocessing missing or stale files.
set -euo pipefail

readonly DS=dataset/
readonly TOK=tokenized/
readonly -a ARCHS=(arm32 aarch64 x64)
readonly -a SPLITS=(train valid test)

for arch in "${ARCHS[@]}"; do
  # DS/TOK already end in '/', so these contain a harmless double slash;
  # kept so the tools receive exactly the same path strings as before.
  src_dir="${DS}/${arch}"
  out_dir="${TOK}/${arch}"

  mkdir -p -- "${out_dir}"

  # Train the BPE tokenizer
  # NOTE(review): this runs "remend.bpe" as a script path relative to the
  # current directory; if it is meant to be a Python module, this probably
  # needs `python3 -m remend.bpe` -- confirm.
  python3 remend.bpe -i "${src_dir}" -o "${out_dir}"

  # Copy the .eqn file of every split; fairseq-preprocess below is given
  # prefixes inside out_dir, so the target-side files have to live there.
  eqn_files=()
  for split in "${SPLITS[@]}"; do
    eqn_files+=("${src_dir}/${split}.eqn")
  done
  cp -- "${eqn_files[@]}" "${out_dir}"

  fairseq-preprocess -s asm -t eqn \
    --trainpref "${out_dir}/train" \
    --testpref "${out_dir}/test" \
    --validpref "${out_dir}/valid" \
    --destdir "${out_dir}/"
done