# Tokenize and binarize the per-architecture datasets.
#
# For every architecture listed in ARCHS this script:
#   1. creates ${TOK}/<arch>,
#   2. runs the remend BPE step with ${DS}/<arch> as -i and ${TOK}/<arch> as -o,
#   3. copies <split>.eqn for every split in SPLITS from ${DS}/<arch> into
#      ${TOK}/<arch>,
#   4. runs fairseq-preprocess (source suffix "asm", target suffix "eqn") on
#      the train/valid/test prefixes inside ${TOK}/<arch>, writing the result
#      back into the same directory.
#
# Paths are relative, so the script must be started from the directory that
# contains the dataset/ folder.

# Abort on the first failing command, on unset variables, and on failures
# inside pipelines, so a broken tokenizer run is never fed to fairseq.
set -euo pipefail

DS=dataset/
TOK=tokenized/
ARCHS=(arm32 aarch64 x64)
SPLITS=(train valid test)

for arch in "${ARCHS[@]}"; do
  # The variables above already end in "/", so these paths contain a doubled
  # slash (e.g. dataset//arm32); that is harmless on POSIX file systems.
  src_dir="${DS}/${arch}"
  out_dir="${TOK}/${arch}"

  mkdir -p -- "${out_dir}"

  # Train the BPE tokenizer
  # NOTE(review): this executes a file literally named "remend.bpe" in the
  # current directory. If remend is a Python package, "python3 -m remend.bpe"
  # may have been intended -- confirm before changing.
  python3 remend.bpe -i "${src_dir}" -o "${out_dir}"

  # The target-side (.eqn) files are copied unchanged next to the tokenizer
  # output so that both sides share the same prefix for fairseq-preprocess.
  # Presumably the BPE step writes the matching <split>.asm files -- confirm.
  for split in "${SPLITS[@]}"; do
    cp -- "${src_dir}/${split}.eqn" "${out_dir}"
  done

  fairseq-preprocess -s asm -t eqn \
    --trainpref "${out_dir}/train" \
    --testpref "${out_dir}/test" \
    --validpref "${out_dir}/valid" \
    --destdir "${out_dir}/"
done