Muthukumaran's picture
Initial Upload
d7d2441 verified
/opt/share/exec/jbsub8 -cores 8+2 -require a100_80gb && hname!=cccxc607 -queue nonstandard -proj embeddings-kd -name NASA-ft -mem 100G -out /dccstor/aashka1/sentence_embeddings/output/NASA_finetuning_v2/Stagewise_stage1/NASA-WatBERT-300M-NASA-BioMedical-300K/2GPU/FinetuneNASA_pooling_mean_qlen64_plen256_freeze_embedding_true_lr_2e-5_bsz_500_100000steps/sent_embed.out -err /dccstor/aashka1/sentence_embeddings/output/NASA_finetuning_v2/Stagewise_stage1/NASA-WatBERT-300M-NASA-BioMedical-300K/2GPU/FinetuneNASA_pooling_mean_qlen64_plen256_freeze_embedding_true_lr_2e-5_bsz_500_100000steps/sent_embed.err PYTHONPATH=/dccstor/aashka1/onekd deepspeed --master_port 29519 /dccstor/aashka1/onekd/onekd/distillation/text_embeddings/simlm/distil_similarity_scores/train_simlm.py --deepspeed /dccstor/aashka1/onekd/onekd/distillation/text_embeddings/simlm/ds_config.json --model_name_or_path /dccstor/aashka1/sentence_embeddings/output/NASA_finetuning_v2/NASA-WatBERT-300M-NASA-BioMedical/2GPU/FinetuneNASA_pooling_mean_qlen64_plen256_freeze_embedding_true_lr_2e-5_bsz_500_300000steps/checkpoint-300000 --freeze_pos_emb true --pooling_source mean --per_device_train_batch_size 500 --per_device_eval_batch_size 32 --add_pooler False --t 0.02 --seed 42 --do_train --data_dir /dccstor/aashka1/sentence_embeddings/data/sent_embed_data/ --train_data_config //dccstor/phalanx/aashka/sentence-embeddings-NASA/configs/data_config_20M_NASA_stage1_nogooaq_nohotpot_nofever.json --validation_file /dccstor/retrieve-rerank2/data/embeddings-training-data/simlm/dev_w_docs.jsonl --fp16 --q_max_len 64 --p_max_len 256 --train_n_passages 1 --dataloader_num_workers 2 --num_train_epochs 1 --learning_rate 2e-5 --use_scaled_loss True --warmup_steps 1000 --share_encoder True --logging_steps 1000 --output_dir /dccstor/aashka1/sentence_embeddings/output/NASA_finetuning_v2/Stagewise_stage1/NASA-WatBERT-300M-NASA-BioMedical-300K/2GPU/FinetuneNASA_pooling_mean_qlen64_plen256_freeze_embedding_true_lr_2e-5_bsz_500_100000steps --save_total_limit 3 --save_strategy steps --save_steps 100000 --remove_unused_columns False --overwrite_output_dir --disable_tqdm False --max_steps 100000 --report_to none
# bsub -q nonstandard -g /aashka/_/embeddings-kd -J NASA-ft -M 102400 -hl -n 8 -R "select[a100_80gb && hname!=cccxc607] rusage[mem=112640] span[ptile=8] affinity[core(1)]" -oo /dccstor/aashka1/sentence_embeddings/output/NASA_finetuning_v2/Stagewise_stage1/NASA-WatBERT-300M-NASA-BioMedical-300K/2GPU/FinetuneNASA_pooling_mean_qlen64_plen256_freeze_embedding_true_lr_2e-5_bsz_500_100000steps/sent_embed.out -eo /dccstor/aashka1/sentence_embeddings/output/NASA_finetuning_v2/Stagewise_stage1/NASA-WatBERT-300M-NASA-BioMedical-300K/2GPU/FinetuneNASA_pooling_mean_qlen64_plen256_freeze_embedding_true_lr_2e-5_bsz_500_100000steps/sent_embed.err -gpu num=2:mode=exclusive_process PYTHONPATH=/dccstor/aashka1/onekd deepspeed --master_port 29519 /dccstor/aashka1/onekd/onekd/distillation/text_embeddings/simlm/distil_similarity_scores/train_simlm.py --deepspeed /dccstor/aashka1/onekd/onekd/distillation/text_embeddings/simlm/ds_config.json --model_name_or_path /dccstor/aashka1/sentence_embeddings/output/NASA_finetuning_v2/NASA-WatBERT-300M-NASA-BioMedical/2GPU/FinetuneNASA_pooling_mean_qlen64_plen256_freeze_embedding_true_lr_2e-5_bsz_500_300000steps/checkpoint-300000 --freeze_pos_emb true --pooling_source mean --per_device_train_batch_size 500 --per_device_eval_batch_size 32 --add_pooler False --t 0.02 --seed 42 --do_train --data_dir /dccstor/aashka1/sentence_embeddings/data/sent_embed_data/ --train_data_config //dccstor/phalanx/aashka/sentence-embeddings-NASA/configs/data_config_20M_NASA_stage1_nogooaq_nohotpot_nofever.json --validation_file /dccstor/retrieve-rerank2/data/embeddings-training-data/simlm/dev_w_docs.jsonl --fp16 --q_max_len 64 --p_max_len 256 --train_n_passages 1 --dataloader_num_workers 2 --num_train_epochs 1 --learning_rate 2e-5 --use_scaled_loss True --warmup_steps 1000 --share_encoder True --logging_steps 1000 --output_dir /dccstor/aashka1/sentence_embeddings/output/NASA_finetuning_v2/Stagewise_stage1/NASA-WatBERT-300M-NASA-BioMedical-300K/2GPU/FinetuneNASA_pooling_mean_qlen64_plen256_freeze_embedding_true_lr_2e-5_bsz_500_100000steps --save_total_limit 3 --save_strategy steps --save_steps 100000 --remove_unused_columns False --overwrite_output_dir --disable_tqdm False --max_steps 100000 --report_to none
Job <722256> is submitted to queue <nonstandard>.