{ "save_data": "zh_en/data_spm", "src_vocab": "zh-en-benchmark/src.eole.vocab", "report_every": 100, "share_vocab": false, "tgt_vocab": "zh-en-benchmark/tgt.eole.vocab", "vocab_size_multiple": 8, "tensorboard_log_dir_dated": "tensorboard/Feb-12_13-34-26", "src_vocab_size": 32000, "tensorboard": true, "n_sample": 0, "tgt_vocab_size": 32000, "valid_metrics": [ "BLEU" ], "tensorboard_log_dir": "tensorboard", "seed": 1234, "overwrite": true, "transforms": [ "sentencepiece", "filtertoolong" ], "training": { "accum_count": [ 16 ], "train_steps": 100000, "gpu_ranks": [ 0 ], "save_checkpoint_steps": 2000, "decay_method": "noam", "bucket_size": 128000, "world_size": 1, "accum_steps": [ 0 ], "optim": "pagedadamw8bit", "prefetch_factor": 100, "compute_dtype": "torch.bfloat16", "normalization": "tokens", "label_smoothing": 0.1, "batch_size_multiple": 8, "dropout_steps": [ 0 ], "average_decay": 0.0001, "dropout": [ 0.1 ], "batch_type": "tokens", "valid_batch_size": 8192, "param_init_method": "xavier_uniform", "adam_beta2": 0.998, "model_path": "zh-en-benchmark/model", "keep_checkpoint": 4, "num_workers": 0, "batch_size": 8192, "attention_dropout": [ 0.1 ], "warmup_steps": 10000, "valid_steps": 2000, "max_grad_norm": 2.0, "learning_rate": 2.0 }, "data": { "corpus_1": { "path_align": null, "path_src": "zh-en/train.ready.zh", "path_tgt": "zh-en/train.ready.en", "transforms": [ "sentencepiece", "filtertoolong" ] }, "valid": { "path_align": null, "path_src": "zh-en-benchmark/dev.zho", "path_tgt": "zh-en-benchmark/dev.eng", "transforms": [ "sentencepiece", "filtertoolong" ] } }, "transforms_configs": { "sentencepiece": { "tgt_subword_model": "${MODEL_PATH}/tgt.spm.model", "src_subword_model": "${MODEL_PATH}/src.spm.model" }, "filtertoolong": { "tgt_seq_length": 256, "src_seq_length": 256 } }, "model": { "share_decoder_embeddings": true, "position_encoding_type": "SinusoidalInterleaved", "add_qkvbias": false, "architecture": "transformer", "add_ffnbias": true, "hidden_size": 1024, "transformer_ff": 4096, "mlp_activation_fn": "gelu", "norm_eps": 1e-06, "layer_norm": "standard", "heads": 16, "add_estimator": false, "share_embeddings": false, "decoder": { "heads": 16, "decoder_type": "transformer", "position_encoding_type": "SinusoidalInterleaved", "add_qkvbias": false, "layers": 2, "add_ffnbias": true, "hidden_size": 1024, "n_positions": null, "transformer_ff": 4096, "rope_config": null, "mlp_activation_fn": "gelu", "norm_eps": 1e-06, "layer_norm": "standard", "tgt_word_vec_size": 1024 }, "embeddings": { "word_vec_size": 1024, "position_encoding_type": "SinusoidalInterleaved", "tgt_word_vec_size": 1024, "src_word_vec_size": 1024 }, "encoder": { "heads": 16, "position_encoding_type": "SinusoidalInterleaved", "add_qkvbias": false, "layers": 8, "add_ffnbias": true, "hidden_size": 1024, "n_positions": null, "src_word_vec_size": 1024, "transformer_ff": 4096, "rope_config": null, "mlp_activation_fn": "gelu", "norm_eps": 1e-06, "layer_norm": "standard", "encoder_type": "transformer" } } }