diff --git "a/ablations/postfix/trained_arm32_best_postfix/training.log" "b/ablations/postfix/trained_arm32_best_postfix/training.log" new file mode 100644--- /dev/null +++ "b/ablations/postfix/trained_arm32_best_postfix/training.log" @@ -0,0 +1,737 @@ +{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': 'arm32/trained_arm32_best_postfix/training.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 1, 'distributed_num_procs': 1, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': None, 'distributed_port': -1, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 1, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': False, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 1, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 20000, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 1, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 20000, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 100000, 'stop_time_hours': 0.0, 'clip_norm': 0.0, 'sentence_avg': False, 'update_freq': [1], 'lr': [0.0005], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False}, 'checkpoint': {'_name': None, 'save_dir': 'arm32/trained_arm32_best_postfix', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 1, 'save_interval_updates': 0, 'keep_interval_updates': -1, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': 3, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': 4, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 1}, 'generation': {'_name': None, 'beam': 5, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(_name='transformer', activation_dropout=0.0, activation_fn='relu', adam_betas=(0.9, 0.999), adam_eps=1e-08, adaptive_input=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, aim_repo=None, aim_run_hash=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, arch='transformer', attention_dropout=0.0, azureml_logging=False, batch_size=None, batch_size_valid=None, best_checkpoint_metric='loss', bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_activations=False, checkpoint_shard_count=1, checkpoint_suffix='', clip_norm=0.0, combine_valid_subsets=None, continue_once=None, cpu=False, cpu_offload=False, criterion='cross_entropy', cross_self_attention=False, curriculum=0, data='arm32/tokenized_dlsm_arm32_postfix', data_buffer_size=10, dataset_impl=None, ddp_backend='pytorch_ddp', ddp_comm_hook='none', decoder_attention_heads=8, decoder_embed_dim=96, decoder_embed_path=None, decoder_ffn_embed_dim=384, decoder_input_dim=96, decoder_layerdrop=0, decoder_layers=4, decoder_layers_to_keep=None, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim='96', device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_num_procs=1, distributed_port=-1, distributed_rank=0, distributed_world_size=1, dropout=0.05, ema_decay=0.9999, ema_fp32=False, ema_seed_model=None, ema_start_update=0, ema_update_freq=1, empty_cache_freq=0, encoder_attention_heads=8, encoder_embed_dim=384, encoder_embed_path=None, encoder_ffn_embed_dim=1536, encoder_layerdrop=0, encoder_layers=4, encoder_layers_to_keep=None, encoder_learned_pos=False, encoder_normalize_before=False, eos=2, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_bleu_print_samples=False, eval_bleu_remove_bpe=None, eval_tokenized_bleu=False, fast_stat_sync=False, find_unused_parameters=False, finetune_from_model=None, fix_batches_to_gpus=False, fixed_validation_seed=None, fp16=False, fp16_adam_stats=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, fp32_reduce_scatter=False, gen_subset='test', gradient_as_bucket_view=False, grouped_shuffling=False, heartbeat_timeout=-1, ignore_unused_valid_subsets=False, keep_best_checkpoints=3, keep_interval_updates=-1, keep_interval_updates_pattern=-1, keep_last_epochs=-1, layernorm_embedding=False, left_pad_source=True, left_pad_target=False, load_alignments=False, load_checkpoint_on_all_dp_ranks=False, localsgd_frequency=3, log_file='arm32/trained_arm32_best_postfix/training.log', log_format=None, log_interval=100, lr=[0.0005], lr_scheduler='inverse_sqrt', max_epoch=0, max_source_positions=1024, max_target_positions=1024, max_tokens=20000, max_tokens_valid=20000, max_update=100000, max_valid_steps=None, maximize_best_checkpoint_metric=False, memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_cross_attention=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_progress_bar=False, no_reshard_after_forward=False, no_save=False, no_save_optimizer_state=False, no_scale_embedding=False, no_seed_provided=False, no_token_positional_embeddings=False, not_fsdp_flatten_parameters=False, nprocs_per_node=1, num_batch_buckets=0, num_shards=1, num_workers=1, offload_activations=False, on_cpu_convert_precision=False, optimizer='adam', optimizer_overrides='{}', pad=1, patience=4, pipeline_balance=None, pipeline_checkpoint='never', pipeline_chunks=0, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_devices=None, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_model_parallel=False, plasma_path='/tmp/plasma', profile=False, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, quantization_config_path=None, required_batch_size_multiple=8, required_seq_len_multiple=1, reset_dataloader=False, reset_logging=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, restore_file='checkpoint_last.pt', save_dir='arm32/trained_arm32_best_postfix', save_interval=1, save_interval_updates=0, scoring='bleu', seed=1, sentence_avg=False, shard_id=0, share_all_embeddings=False, share_decoder_input_output_embed=False, skip_invalid_size_inputs_valid_test=False, skip_remainder_batch=False, slowmo_base_algorithm='localsgd', slowmo_momentum=None, source_lang=None, stop_min_lr=-1.0, stop_time_hours=0, store_ema=False, suppress_crashes=False, target_lang=None, task='translation', tensorboard_logdir=None, threshold_loss_scale=None, tie_adaptive_weights=False, tokenizer=None, tpu=False, train_subset='train', truncate_source=False, unk=3, update_epoch_batch_itr=False, update_freq=[1], update_ordered_indices_seed=False, upsample_primary=-1, use_bmuf=False, use_old_adam=False, use_plasma_view=False, use_sharded_state=False, user_dir=None, valid_subset='valid', validate_after_updates=0, validate_interval=1, validate_interval_updates=0, wandb_project=None, warmup_init_lr=-1, warmup_updates=4000, weight_decay=0.001, write_checkpoints_asynchronously=False, zero_sharding='none'), 'task': {'_name': 'translation', 'data': 'arm32/tokenized_dlsm_arm32_postfix', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'cross_entropy', 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': [0.9, 0.999], 'adam_eps': 1e-08, 'weight_decay': 0.001, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.0005]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.0005]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}} +TransformerModel( + (encoder): TransformerEncoderBase( + (dropout_module): FairseqDropout() + (embed_tokens): Embedding(224, 384, padding_idx=1) + (embed_positions): SinusoidalPositionalEmbedding() + (layers): ModuleList( + (0-3): 4 x TransformerEncoderLayerBase( + (self_attn): MultiheadAttention( + (dropout_module): FairseqDropout() + (k_proj): Linear(in_features=384, out_features=384, bias=True) + (v_proj): Linear(in_features=384, out_features=384, bias=True) + (q_proj): Linear(in_features=384, out_features=384, bias=True) + (out_proj): Linear(in_features=384, out_features=384, bias=True) + ) + (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True) + (dropout_module): FairseqDropout() + (activation_dropout_module): FairseqDropout() + (fc1): Linear(in_features=384, out_features=1536, bias=True) + (fc2): Linear(in_features=1536, out_features=384, bias=True) + (final_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (decoder): TransformerDecoderBase( + (dropout_module): FairseqDropout() + (embed_tokens): Embedding(40, 96, padding_idx=1) + (embed_positions): SinusoidalPositionalEmbedding() + (layers): ModuleList( + (0-3): 4 x TransformerDecoderLayerBase( + (dropout_module): FairseqDropout() + (self_attn): MultiheadAttention( + (dropout_module): FairseqDropout() + (k_proj): Linear(in_features=96, out_features=96, bias=True) + (v_proj): Linear(in_features=96, out_features=96, bias=True) + (q_proj): Linear(in_features=96, out_features=96, bias=True) + (out_proj): Linear(in_features=96, out_features=96, bias=True) + ) + (activation_dropout_module): FairseqDropout() + (self_attn_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True) + (encoder_attn): MultiheadAttention( + (dropout_module): FairseqDropout() + (k_proj): Linear(in_features=384, out_features=96, bias=True) + (v_proj): Linear(in_features=384, out_features=96, bias=True) + (q_proj): Linear(in_features=96, out_features=96, bias=True) + (out_proj): Linear(in_features=96, out_features=96, bias=True) + ) + (encoder_attn_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True) + (fc1): Linear(in_features=96, out_features=384, bias=True) + (fc2): Linear(in_features=384, out_features=96, bias=True) + (final_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True) + ) + ) + (output_projection): Linear(in_features=96, out_features=40, bias=False) + ) +) +task: TranslationTask +model: TransformerModel +criterion: CrossEntropyCriterion +num. shared model params: 15,101,568 (num. trained: 15,101,568) +num. expert model params: 0 (num. trained: 0) +training on 1 devices (GPUs/TPUs) +max tokens per device = 20000 and max sentences per device = None +Start iterating over samples +begin validation on "valid" subset +epoch 001 | valid on 'valid' subset | loss 0.998 | ppl 2 | wps 34699.2 | wpb 2108.3 | bsz 101.7 | num_updates 4152 +end of epoch 1 (average epoch stats below) +epoch 001 | loss 1.68 | ppl 3.2 | wps 10390.8 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 4152 | lr 0.000490762 | gnorm 2.962 | train_wall 839 | gb_free 13.7 | wall 855 +Start iterating over samples +begin validation on "valid" subset +epoch 002 | valid on 'valid' subset | loss 0.532 | ppl 1.45 | wps 34600.5 | wpb 2108.3 | bsz 101.7 | num_updates 8304 | best_loss 0.532 +epoch 002 | valid on 'valid' subset | loss 0.532 | ppl 1.45 | wps 34600.5 | wpb 2108.3 | bsz 101.7 | num_updates 8304 | best_loss 0.532 +end of epoch 2 (average epoch stats below) +epoch 002 | loss 0.708 | ppl 1.63 | wps 10412.1 | ups 4.87 | wpb 2137.6 | bsz 103.2 | num_updates 8304 | lr 0.000347021 | gnorm 1.369 | train_wall 837 | gb_free 13.5 | wall 1707 +epoch 002 | loss 0.708 | ppl 1.63 | wps 10412.1 | ups 4.87 | wpb 2137.6 | bsz 103.2 | num_updates 8304 | lr 0.000347021 | gnorm 1.369 | train_wall 837 | gb_free 13.5 | wall 1707 +Start iterating over samples +begin validation on "valid" subset +epoch 003 | valid on 'valid' subset | loss 0.318 | ppl 1.25 | wps 34450.1 | wpb 2108.3 | bsz 101.7 | num_updates 12456 | best_loss 0.318 +epoch 003 | valid on 'valid' subset | loss 0.318 | ppl 1.25 | wps 34450.1 | wpb 2108.3 | bsz 101.7 | num_updates 12456 | best_loss 0.318 +epoch 003 | valid on 'valid' subset | loss 0.318 | ppl 1.25 | wps 34450.1 | wpb 2108.3 | bsz 101.7 | num_updates 12456 | best_loss 0.318 +end of epoch 3 (average epoch stats below) +epoch 003 | loss 0.405 | ppl 1.32 | wps 10428.4 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 12456 | lr 0.000283342 | gnorm 0.993 | train_wall 835 | gb_free 13.5 | wall 2558 +epoch 003 | loss 0.405 | ppl 1.32 | wps 10428.4 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 12456 | lr 0.000283342 | gnorm 0.993 | train_wall 835 | gb_free 13.5 | wall 2558 +epoch 003 | loss 0.405 | ppl 1.32 | wps 10428.4 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 12456 | lr 0.000283342 | gnorm 0.993 | train_wall 835 | gb_free 13.5 | wall 2558 +Start iterating over samples +begin validation on "valid" subset +epoch 004 | valid on 'valid' subset | loss 0.245 | ppl 1.19 | wps 34586.7 | wpb 2108.3 | bsz 101.7 | num_updates 16608 | best_loss 0.245 +epoch 004 | valid on 'valid' subset | loss 0.245 | ppl 1.19 | wps 34586.7 | wpb 2108.3 | bsz 101.7 | num_updates 16608 | best_loss 0.245 +epoch 004 | valid on 'valid' subset | loss 0.245 | ppl 1.19 | wps 34586.7 | wpb 2108.3 | bsz 101.7 | num_updates 16608 | best_loss 0.245 +epoch 004 | valid on 'valid' subset | loss 0.245 | ppl 1.19 | wps 34586.7 | wpb 2108.3 | bsz 101.7 | num_updates 16608 | best_loss 0.245 +end of epoch 4 (average epoch stats below) +epoch 004 | loss 0.256 | ppl 1.19 | wps 10432.4 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 16608 | lr 0.000245381 | gnorm 0.818 | train_wall 835 | gb_free 13.9 | wall 3409 +epoch 004 | loss 0.256 | ppl 1.19 | wps 10432.4 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 16608 | lr 0.000245381 | gnorm 0.818 | train_wall 835 | gb_free 13.9 | wall 3409 +epoch 004 | loss 0.256 | ppl 1.19 | wps 10432.4 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 16608 | lr 0.000245381 | gnorm 0.818 | train_wall 835 | gb_free 13.9 | wall 3409 +epoch 004 | loss 0.256 | ppl 1.19 | wps 10432.4 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 16608 | lr 0.000245381 | gnorm 0.818 | train_wall 835 | gb_free 13.9 | wall 3409 +Start iterating over samples +begin validation on "valid" subset +epoch 005 | valid on 'valid' subset | loss 0.191 | ppl 1.14 | wps 34430.3 | wpb 2108.3 | bsz 101.7 | num_updates 20760 | best_loss 0.191 +epoch 005 | valid on 'valid' subset | loss 0.191 | ppl 1.14 | wps 34430.3 | wpb 2108.3 | bsz 101.7 | num_updates 20760 | best_loss 0.191 +epoch 005 | valid on 'valid' subset | loss 0.191 | ppl 1.14 | wps 34430.3 | wpb 2108.3 | bsz 101.7 | num_updates 20760 | best_loss 0.191 +epoch 005 | valid on 'valid' subset | loss 0.191 | ppl 1.14 | wps 34430.3 | wpb 2108.3 | bsz 101.7 | num_updates 20760 | best_loss 0.191 +epoch 005 | valid on 'valid' subset | loss 0.191 | ppl 1.14 | wps 34430.3 | wpb 2108.3 | bsz 101.7 | num_updates 20760 | best_loss 0.191 +end of epoch 5 (average epoch stats below) +epoch 005 | loss 0.162 | ppl 1.12 | wps 10432.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 20760 | lr 0.000219476 | gnorm 0.714 | train_wall 835 | gb_free 13.7 | wall 4260 +epoch 005 | loss 0.162 | ppl 1.12 | wps 10432.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 20760 | lr 0.000219476 | gnorm 0.714 | train_wall 835 | gb_free 13.7 | wall 4260 +epoch 005 | loss 0.162 | ppl 1.12 | wps 10432.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 20760 | lr 0.000219476 | gnorm 0.714 | train_wall 835 | gb_free 13.7 | wall 4260 +epoch 005 | loss 0.162 | ppl 1.12 | wps 10432.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 20760 | lr 0.000219476 | gnorm 0.714 | train_wall 835 | gb_free 13.7 | wall 4260 +epoch 005 | loss 0.162 | ppl 1.12 | wps 10432.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 20760 | lr 0.000219476 | gnorm 0.714 | train_wall 835 | gb_free 13.7 | wall 4260 +Start iterating over samples +begin validation on "valid" subset +epoch 006 | valid on 'valid' subset | loss 0.131 | ppl 1.09 | wps 34782.5 | wpb 2108.3 | bsz 101.7 | num_updates 24912 | best_loss 0.131 +epoch 006 | valid on 'valid' subset | loss 0.131 | ppl 1.09 | wps 34782.5 | wpb 2108.3 | bsz 101.7 | num_updates 24912 | best_loss 0.131 +epoch 006 | valid on 'valid' subset | loss 0.131 | ppl 1.09 | wps 34782.5 | wpb 2108.3 | bsz 101.7 | num_updates 24912 | best_loss 0.131 +epoch 006 | valid on 'valid' subset | loss 0.131 | ppl 1.09 | wps 34782.5 | wpb 2108.3 | bsz 101.7 | num_updates 24912 | best_loss 0.131 +epoch 006 | valid on 'valid' subset | loss 0.131 | ppl 1.09 | wps 34782.5 | wpb 2108.3 | bsz 101.7 | num_updates 24912 | best_loss 0.131 +epoch 006 | valid on 'valid' subset | loss 0.131 | ppl 1.09 | wps 34782.5 | wpb 2108.3 | bsz 101.7 | num_updates 24912 | best_loss 0.131 +end of epoch 6 (average epoch stats below) +epoch 006 | loss 0.103 | ppl 1.07 | wps 10434.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 24912 | lr 0.000200353 | gnorm 0.626 | train_wall 835 | gb_free 13.8 | wall 5111 +epoch 006 | loss 0.103 | ppl 1.07 | wps 10434.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 24912 | lr 0.000200353 | gnorm 0.626 | train_wall 835 | gb_free 13.8 | wall 5111 +epoch 006 | loss 0.103 | ppl 1.07 | wps 10434.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 24912 | lr 0.000200353 | gnorm 0.626 | train_wall 835 | gb_free 13.8 | wall 5111 +epoch 006 | loss 0.103 | ppl 1.07 | wps 10434.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 24912 | lr 0.000200353 | gnorm 0.626 | train_wall 835 | gb_free 13.8 | wall 5111 +epoch 006 | loss 0.103 | ppl 1.07 | wps 10434.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 24912 | lr 0.000200353 | gnorm 0.626 | train_wall 835 | gb_free 13.8 | wall 5111 +epoch 006 | loss 0.103 | ppl 1.07 | wps 10434.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 24912 | lr 0.000200353 | gnorm 0.626 | train_wall 835 | gb_free 13.8 | wall 5111 +Start iterating over samples +begin validation on "valid" subset +epoch 007 | valid on 'valid' subset | loss 0.109 | ppl 1.08 | wps 34656.1 | wpb 2108.3 | bsz 101.7 | num_updates 29064 | best_loss 0.109 +epoch 007 | valid on 'valid' subset | loss 0.109 | ppl 1.08 | wps 34656.1 | wpb 2108.3 | bsz 101.7 | num_updates 29064 | best_loss 0.109 +epoch 007 | valid on 'valid' subset | loss 0.109 | ppl 1.08 | wps 34656.1 | wpb 2108.3 | bsz 101.7 | num_updates 29064 | best_loss 0.109 +epoch 007 | valid on 'valid' subset | loss 0.109 | ppl 1.08 | wps 34656.1 | wpb 2108.3 | bsz 101.7 | num_updates 29064 | best_loss 0.109 +epoch 007 | valid on 'valid' subset | loss 0.109 | ppl 1.08 | wps 34656.1 | wpb 2108.3 | bsz 101.7 | num_updates 29064 | best_loss 0.109 +epoch 007 | valid on 'valid' subset | loss 0.109 | ppl 1.08 | wps 34656.1 | wpb 2108.3 | bsz 101.7 | num_updates 29064 | best_loss 0.109 +epoch 007 | valid on 'valid' subset | loss 0.109 | ppl 1.08 | wps 34656.1 | wpb 2108.3 | bsz 101.7 | num_updates 29064 | best_loss 0.109 +end of epoch 7 (average epoch stats below) +epoch 007 | loss 0.068 | ppl 1.05 | wps 10439.9 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 29064 | lr 0.000185491 | gnorm 0.57 | train_wall 834 | gb_free 13.7 | wall 5961 +epoch 007 | loss 0.068 | ppl 1.05 | wps 10439.9 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 29064 | lr 0.000185491 | gnorm 0.57 | train_wall 834 | gb_free 13.7 | wall 5961 +epoch 007 | loss 0.068 | ppl 1.05 | wps 10439.9 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 29064 | lr 0.000185491 | gnorm 0.57 | train_wall 834 | gb_free 13.7 | wall 5961 +epoch 007 | loss 0.068 | ppl 1.05 | wps 10439.9 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 29064 | lr 0.000185491 | gnorm 0.57 | train_wall 834 | gb_free 13.7 | wall 5961 +epoch 007 | loss 0.068 | ppl 1.05 | wps 10439.9 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 29064 | lr 0.000185491 | gnorm 0.57 | train_wall 834 | gb_free 13.7 | wall 5961 +epoch 007 | loss 0.068 | ppl 1.05 | wps 10439.9 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 29064 | lr 0.000185491 | gnorm 0.57 | train_wall 834 | gb_free 13.7 | wall 5961 +epoch 007 | loss 0.068 | ppl 1.05 | wps 10439.9 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 29064 | lr 0.000185491 | gnorm 0.57 | train_wall 834 | gb_free 13.7 | wall 5961 +Start iterating over samples +begin validation on "valid" subset +epoch 008 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 34735.3 | wpb 2108.3 | bsz 101.7 | num_updates 33216 | best_loss 0.098 +epoch 008 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 34735.3 | wpb 2108.3 | bsz 101.7 | num_updates 33216 | best_loss 0.098 +epoch 008 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 34735.3 | wpb 2108.3 | bsz 101.7 | num_updates 33216 | best_loss 0.098 +epoch 008 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 34735.3 | wpb 2108.3 | bsz 101.7 | num_updates 33216 | best_loss 0.098 +epoch 008 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 34735.3 | wpb 2108.3 | bsz 101.7 | num_updates 33216 | best_loss 0.098 +epoch 008 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 34735.3 | wpb 2108.3 | bsz 101.7 | num_updates 33216 | best_loss 0.098 +epoch 008 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 34735.3 | wpb 2108.3 | bsz 101.7 | num_updates 33216 | best_loss 0.098 +epoch 008 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 34735.3 | wpb 2108.3 | bsz 101.7 | num_updates 33216 | best_loss 0.098 +end of epoch 8 (average epoch stats below) +epoch 008 | loss 0.048 | ppl 1.03 | wps 10442.4 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 33216 | lr 0.000173511 | gnorm 0.517 | train_wall 834 | gb_free 13.7 | wall 6811 +epoch 008 | loss 0.048 | ppl 1.03 | wps 10442.4 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 33216 | lr 0.000173511 | gnorm 0.517 | train_wall 834 | gb_free 13.7 | wall 6811 +epoch 008 | loss 0.048 | ppl 1.03 | wps 10442.4 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 33216 | lr 0.000173511 | gnorm 0.517 | train_wall 834 | gb_free 13.7 | wall 6811 +epoch 008 | loss 0.048 | ppl 1.03 | wps 10442.4 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 33216 | lr 0.000173511 | gnorm 0.517 | train_wall 834 | gb_free 13.7 | wall 6811 +epoch 008 | loss 0.048 | ppl 1.03 | wps 10442.4 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 33216 | lr 0.000173511 | gnorm 0.517 | train_wall 834 | gb_free 13.7 | wall 6811 +epoch 008 | loss 0.048 | ppl 1.03 | wps 10442.4 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 33216 | lr 0.000173511 | gnorm 0.517 | train_wall 834 | gb_free 13.7 | wall 6811 +epoch 008 | loss 0.048 | ppl 1.03 | wps 10442.4 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 33216 | lr 0.000173511 | gnorm 0.517 | train_wall 834 | gb_free 13.7 | wall 6811 +epoch 008 | loss 0.048 | ppl 1.03 | wps 10442.4 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 33216 | lr 0.000173511 | gnorm 0.517 | train_wall 834 | gb_free 13.7 | wall 6811 +Start iterating over samples +begin validation on "valid" subset +epoch 009 | valid on 'valid' subset | loss 0.088 | ppl 1.06 | wps 34729.9 | wpb 2108.3 | bsz 101.7 | num_updates 37368 | best_loss 0.088 +epoch 009 | valid on 'valid' subset | loss 0.088 | ppl 1.06 | wps 34729.9 | wpb 2108.3 | bsz 101.7 | num_updates 37368 | best_loss 0.088 +epoch 009 | valid on 'valid' subset | loss 0.088 | ppl 1.06 | wps 34729.9 | wpb 2108.3 | bsz 101.7 | num_updates 37368 | best_loss 0.088 +epoch 009 | valid on 'valid' subset | loss 0.088 | ppl 1.06 | wps 34729.9 | wpb 2108.3 | bsz 101.7 | num_updates 37368 | best_loss 0.088 +epoch 009 | valid on 'valid' subset | loss 0.088 | ppl 1.06 | wps 34729.9 | wpb 2108.3 | bsz 101.7 | num_updates 37368 | best_loss 0.088 +epoch 009 | valid on 'valid' subset | loss 0.088 | ppl 1.06 | wps 34729.9 | wpb 2108.3 | bsz 101.7 | num_updates 37368 | best_loss 0.088 +epoch 009 | valid on 'valid' subset | loss 0.088 | ppl 1.06 | wps 34729.9 | wpb 2108.3 | bsz 101.7 | num_updates 37368 | best_loss 0.088 +epoch 009 | valid on 'valid' subset | loss 0.088 | ppl 1.06 | wps 34729.9 | wpb 2108.3 | bsz 101.7 | num_updates 37368 | best_loss 0.088 +epoch 009 | valid on 'valid' subset | loss 0.088 | ppl 1.06 | wps 34729.9 | wpb 2108.3 | bsz 101.7 | num_updates 37368 | best_loss 0.088 +end of epoch 9 (average epoch stats below) +epoch 009 | loss 0.037 | ppl 1.03 | wps 10443.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 37368 | lr 0.000163587 | gnorm 0.482 | train_wall 834 | gb_free 13.7 | wall 7660 +epoch 009 | loss 0.037 | ppl 1.03 | wps 10443.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 37368 | lr 0.000163587 | gnorm 0.482 | train_wall 834 | gb_free 13.7 | wall 7660 +epoch 009 | loss 0.037 | ppl 1.03 | wps 10443.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 37368 | lr 0.000163587 | gnorm 0.482 | train_wall 834 | gb_free 13.7 | wall 7660 +epoch 009 | loss 0.037 | ppl 1.03 | wps 10443.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 37368 | lr 0.000163587 | gnorm 0.482 | train_wall 834 | gb_free 13.7 | wall 7660 +epoch 009 | loss 0.037 | ppl 1.03 | wps 10443.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 37368 | lr 0.000163587 | gnorm 0.482 | train_wall 834 | gb_free 13.7 | wall 7660 +epoch 009 | loss 0.037 | ppl 1.03 | wps 10443.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 37368 | lr 0.000163587 | gnorm 0.482 | train_wall 834 | gb_free 13.7 | wall 7660 +epoch 009 | loss 0.037 | ppl 1.03 | wps 10443.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 37368 | lr 0.000163587 | gnorm 0.482 | train_wall 834 | gb_free 13.7 | wall 7660 +epoch 009 | loss 0.037 | ppl 1.03 | wps 10443.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 37368 | lr 0.000163587 | gnorm 0.482 | train_wall 834 | gb_free 13.7 | wall 7660 +epoch 009 | loss 0.037 | ppl 1.03 | wps 10443.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 37368 | lr 0.000163587 | gnorm 0.482 | train_wall 834 | gb_free 13.7 | wall 7660 +Start iterating over samples +begin validation on "valid" subset +epoch 010 | valid on 'valid' subset | loss 0.094 | ppl 1.07 | wps 34542.7 | wpb 2108.3 | bsz 101.7 | num_updates 41520 | best_loss 0.088 +epoch 010 | valid on 'valid' subset | loss 0.094 | ppl 1.07 | wps 34542.7 | wpb 2108.3 | bsz 101.7 | num_updates 41520 | best_loss 0.088 +epoch 010 | valid on 'valid' subset | loss 0.094 | ppl 1.07 | wps 34542.7 | wpb 2108.3 | bsz 101.7 | num_updates 41520 | best_loss 0.088 +epoch 010 | valid on 'valid' subset | loss 0.094 | ppl 1.07 | wps 34542.7 | wpb 2108.3 | bsz 101.7 | num_updates 41520 | best_loss 0.088 +epoch 010 | valid on 'valid' subset | loss 0.094 | ppl 1.07 | wps 34542.7 | wpb 2108.3 | bsz 101.7 | num_updates 41520 | best_loss 0.088 +epoch 010 | valid on 'valid' subset | loss 0.094 | ppl 1.07 | wps 34542.7 | wpb 2108.3 | bsz 101.7 | num_updates 41520 | best_loss 0.088 +epoch 010 | valid on 'valid' subset | loss 0.094 | ppl 1.07 | wps 34542.7 | wpb 2108.3 | bsz 101.7 | num_updates 41520 | best_loss 0.088 +epoch 010 | valid on 'valid' subset | loss 0.094 | ppl 1.07 | wps 34542.7 | wpb 2108.3 | bsz 101.7 | num_updates 41520 | best_loss 0.088 +epoch 010 | valid on 'valid' subset | loss 0.094 | ppl 1.07 | wps 34542.7 | wpb 2108.3 | bsz 101.7 | num_updates 41520 | best_loss 0.088 +epoch 010 | valid on 'valid' subset | loss 0.094 | ppl 1.07 | wps 34542.7 | wpb 2108.3 | bsz 101.7 | num_updates 41520 | best_loss 0.088 +end of epoch 10 (average epoch stats below) +epoch 010 | loss 0.028 | ppl 1.02 | wps 10433.3 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 41520 | lr 0.000155193 | gnorm 0.432 | train_wall 835 | gb_free 13.8 | wall 8511 +epoch 010 | loss 0.028 | ppl 1.02 | wps 10433.3 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 41520 | lr 0.000155193 | gnorm 0.432 | train_wall 835 | gb_free 13.8 | wall 8511 +epoch 010 | loss 0.028 | ppl 1.02 | wps 10433.3 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 41520 | lr 0.000155193 | gnorm 0.432 | train_wall 835 | gb_free 13.8 | wall 8511 +epoch 010 | loss 0.028 | ppl 1.02 | wps 10433.3 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 41520 | lr 0.000155193 | gnorm 0.432 | train_wall 835 | gb_free 13.8 | wall 8511 +epoch 010 | loss 0.028 | ppl 1.02 | wps 10433.3 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 41520 | lr 0.000155193 | gnorm 0.432 | train_wall 835 | gb_free 13.8 | wall 8511 +epoch 010 | loss 0.028 | ppl 1.02 | wps 10433.3 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 41520 | lr 0.000155193 | gnorm 0.432 | train_wall 835 | gb_free 13.8 | wall 8511 +epoch 010 | loss 0.028 | ppl 1.02 | wps 10433.3 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 41520 | lr 0.000155193 | gnorm 0.432 | train_wall 835 | gb_free 13.8 | wall 8511 +epoch 010 | loss 0.028 | ppl 1.02 | wps 10433.3 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 41520 | lr 0.000155193 | gnorm 0.432 | train_wall 835 | gb_free 13.8 | wall 8511 +epoch 010 | loss 0.028 | ppl 1.02 | wps 10433.3 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 41520 | lr 0.000155193 | gnorm 0.432 | train_wall 835 | gb_free 13.8 | wall 8511 +epoch 010 | loss 0.028 | ppl 1.02 | wps 10433.3 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 41520 | lr 0.000155193 | gnorm 0.432 | train_wall 835 | gb_free 13.8 | wall 8511 +Start iterating over samples +begin validation on "valid" subset +epoch 011 | valid on 'valid' subset | loss 0.081 | ppl 1.06 | wps 34443.5 | wpb 2108.3 | bsz 101.7 | num_updates 45672 | best_loss 0.081 +epoch 011 | valid on 'valid' subset | loss 0.081 | ppl 1.06 | wps 34443.5 | wpb 2108.3 | bsz 101.7 | num_updates 45672 | best_loss 0.081 +epoch 011 | valid on 'valid' subset | loss 0.081 | ppl 1.06 | wps 34443.5 | wpb 2108.3 | bsz 101.7 | num_updates 45672 | best_loss 0.081 +epoch 011 | valid on 'valid' subset | loss 0.081 | ppl 1.06 | wps 34443.5 | wpb 2108.3 | bsz 101.7 | num_updates 45672 | best_loss 0.081 +epoch 011 | valid on 'valid' subset | loss 0.081 | ppl 1.06 | wps 34443.5 | wpb 2108.3 | bsz 101.7 | num_updates 45672 | best_loss 0.081 +epoch 011 | valid on 'valid' subset | loss 0.081 | ppl 1.06 | wps 34443.5 | wpb 2108.3 | bsz 101.7 | num_updates 45672 | best_loss 0.081 +epoch 011 | valid on 'valid' subset | loss 0.081 | ppl 1.06 | wps 34443.5 | wpb 2108.3 | bsz 101.7 | num_updates 45672 | best_loss 0.081 +epoch 011 | valid on 'valid' subset | loss 0.081 | ppl 1.06 | wps 34443.5 | wpb 2108.3 | bsz 101.7 | num_updates 45672 | best_loss 0.081 +epoch 011 | valid on 'valid' subset | loss 0.081 | ppl 1.06 | wps 34443.5 | wpb 2108.3 | bsz 101.7 | num_updates 45672 | best_loss 0.081 +epoch 011 | valid on 'valid' subset | loss 0.081 | ppl 1.06 | wps 34443.5 | wpb 2108.3 | bsz 101.7 | num_updates 45672 | best_loss 0.081 +epoch 011 | valid on 'valid' subset | loss 0.081 | ppl 1.06 | wps 34443.5 | wpb 2108.3 | bsz 101.7 | num_updates 45672 | best_loss 0.081 +end of epoch 11 (average epoch stats below) +epoch 011 | loss 0.023 | ppl 1.02 | wps 10433.8 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 45672 | lr 0.00014797 | gnorm 0.407 | train_wall 835 | gb_free 13.7 | wall 9362 +epoch 011 | loss 0.023 | ppl 1.02 | wps 10433.8 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 45672 | lr 0.00014797 | gnorm 0.407 | train_wall 835 | gb_free 13.7 | wall 9362 +epoch 011 | loss 0.023 | ppl 1.02 | wps 10433.8 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 45672 | lr 0.00014797 | gnorm 0.407 | train_wall 835 | gb_free 13.7 | wall 9362 +epoch 011 | loss 0.023 | ppl 1.02 | wps 10433.8 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 45672 | lr 0.00014797 | gnorm 0.407 | train_wall 835 | gb_free 13.7 | wall 9362 +epoch 011 | loss 0.023 | ppl 1.02 | wps 10433.8 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 45672 | lr 0.00014797 | gnorm 0.407 | train_wall 835 | gb_free 13.7 | wall 9362 +epoch 011 | loss 0.023 | ppl 1.02 | wps 10433.8 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 45672 | lr 0.00014797 | gnorm 0.407 | train_wall 835 | gb_free 13.7 | wall 9362 +epoch 011 | loss 0.023 | ppl 1.02 | wps 10433.8 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 45672 | lr 0.00014797 | gnorm 0.407 | train_wall 835 | gb_free 13.7 | wall 9362 +epoch 011 | loss 0.023 | ppl 1.02 | wps 10433.8 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 45672 | lr 0.00014797 | gnorm 0.407 | train_wall 835 | gb_free 13.7 | wall 9362 +epoch 011 | loss 0.023 | ppl 1.02 | wps 10433.8 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 45672 | lr 0.00014797 | gnorm 0.407 | train_wall 835 | gb_free 13.7 | wall 9362 +epoch 011 | loss 0.023 | ppl 1.02 | wps 10433.8 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 45672 | lr 0.00014797 | gnorm 0.407 | train_wall 835 | gb_free 13.7 | wall 9362 +epoch 011 | loss 0.023 | ppl 1.02 | wps 10433.8 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 45672 | lr 0.00014797 | gnorm 0.407 | train_wall 835 | gb_free 13.7 | wall 9362 +Start iterating over samples +begin validation on "valid" subset +epoch 012 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 34414.4 | wpb 2108.3 | bsz 101.7 | num_updates 49824 | best_loss 0.081 +epoch 012 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 34414.4 | wpb 2108.3 | bsz 101.7 | num_updates 49824 | best_loss 0.081 +epoch 012 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 34414.4 | wpb 2108.3 | bsz 101.7 | num_updates 49824 | best_loss 0.081 +epoch 012 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 34414.4 | wpb 2108.3 | bsz 101.7 | num_updates 49824 | best_loss 0.081 +epoch 012 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 34414.4 | wpb 2108.3 | bsz 101.7 | num_updates 49824 | best_loss 0.081 +epoch 012 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 34414.4 | wpb 2108.3 | bsz 101.7 | num_updates 49824 | best_loss 0.081 +epoch 012 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 34414.4 | wpb 2108.3 | bsz 101.7 | num_updates 49824 | best_loss 0.081 +epoch 012 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 34414.4 | wpb 2108.3 | bsz 101.7 | num_updates 49824 | best_loss 0.081 +epoch 012 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 34414.4 | wpb 2108.3 | bsz 101.7 | num_updates 49824 | best_loss 0.081 +epoch 012 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 34414.4 | wpb 2108.3 | bsz 101.7 | num_updates 49824 | best_loss 0.081 +epoch 012 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 34414.4 | wpb 2108.3 | bsz 101.7 | num_updates 49824 | best_loss 0.081 +epoch 012 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 34414.4 | wpb 2108.3 | bsz 101.7 | num_updates 49824 | best_loss 0.081 +end of epoch 12 (average epoch stats below) +epoch 012 | loss 0.019 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 49824 | lr 0.000141671 | gnorm 0.374 | train_wall 834 | gb_free 13.7 | wall 10212 +epoch 012 | loss 0.019 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 49824 | lr 0.000141671 | gnorm 0.374 | train_wall 834 | gb_free 13.7 | wall 10212 +epoch 012 | loss 0.019 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 49824 | lr 0.000141671 | gnorm 0.374 | train_wall 834 | gb_free 13.7 | wall 10212 +epoch 012 | loss 0.019 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 49824 | lr 0.000141671 | gnorm 0.374 | train_wall 834 | gb_free 13.7 | wall 10212 +epoch 012 | loss 0.019 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 49824 | lr 0.000141671 | gnorm 0.374 | train_wall 834 | gb_free 13.7 | wall 10212 +epoch 012 | loss 0.019 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 49824 | lr 0.000141671 | gnorm 0.374 | train_wall 834 | gb_free 13.7 | wall 10212 +epoch 012 | loss 0.019 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 49824 | lr 0.000141671 | gnorm 0.374 | train_wall 834 | gb_free 13.7 | wall 10212 +epoch 012 | loss 0.019 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 49824 | lr 0.000141671 | gnorm 0.374 | train_wall 834 | gb_free 13.7 | wall 10212 +epoch 012 | loss 0.019 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 49824 | lr 0.000141671 | gnorm 0.374 | train_wall 834 | gb_free 13.7 | wall 10212 +epoch 012 | loss 0.019 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 49824 | lr 0.000141671 | gnorm 0.374 | train_wall 834 | gb_free 13.7 | wall 10212 +epoch 012 | loss 0.019 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 49824 | lr 0.000141671 | gnorm 0.374 | train_wall 834 | gb_free 13.7 | wall 10212 +epoch 012 | loss 0.019 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 49824 | lr 0.000141671 | gnorm 0.374 | train_wall 834 | gb_free 13.7 | wall 10212 +Start iterating over samples +begin validation on "valid" subset +epoch 013 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34497.8 | wpb 2108.3 | bsz 101.7 | num_updates 53976 | best_loss 0.08 +epoch 013 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34497.8 | wpb 2108.3 | bsz 101.7 | num_updates 53976 | best_loss 0.08 +epoch 013 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34497.8 | wpb 2108.3 | bsz 101.7 | num_updates 53976 | best_loss 0.08 +epoch 013 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34497.8 | wpb 2108.3 | bsz 101.7 | num_updates 53976 | best_loss 0.08 +epoch 013 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34497.8 | wpb 2108.3 | bsz 101.7 | num_updates 53976 | best_loss 0.08 +epoch 013 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34497.8 | wpb 2108.3 | bsz 101.7 | num_updates 53976 | best_loss 0.08 +epoch 013 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34497.8 | wpb 2108.3 | bsz 101.7 | num_updates 53976 | best_loss 0.08 +epoch 013 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34497.8 | wpb 2108.3 | bsz 101.7 | num_updates 53976 | best_loss 0.08 +epoch 013 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34497.8 | wpb 2108.3 | bsz 101.7 | num_updates 53976 | best_loss 0.08 +epoch 013 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34497.8 | wpb 2108.3 | bsz 101.7 | num_updates 53976 | best_loss 0.08 +epoch 013 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34497.8 | wpb 2108.3 | bsz 101.7 | num_updates 53976 | best_loss 0.08 +epoch 013 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34497.8 | wpb 2108.3 | bsz 101.7 | num_updates 53976 | best_loss 0.08 +epoch 013 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34497.8 | wpb 2108.3 | bsz 101.7 | num_updates 53976 | best_loss 0.08 +end of epoch 13 (average epoch stats below) +epoch 013 | loss 0.016 | ppl 1.01 | wps 10436.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 53976 | lr 0.000136113 | gnorm 0.353 | train_wall 835 | gb_free 13.8 | wall 11062 +epoch 013 | loss 0.016 | ppl 1.01 | wps 10436.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 53976 | lr 0.000136113 | gnorm 0.353 | train_wall 835 | gb_free 13.8 | wall 11062 +epoch 013 | loss 0.016 | ppl 1.01 | wps 10436.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 53976 | lr 0.000136113 | gnorm 0.353 | train_wall 835 | gb_free 13.8 | wall 11062 +epoch 013 | loss 0.016 | ppl 1.01 | wps 10436.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 53976 | lr 0.000136113 | gnorm 0.353 | train_wall 835 | gb_free 13.8 | wall 11062 +epoch 013 | loss 0.016 | ppl 1.01 | wps 10436.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 53976 | lr 0.000136113 | gnorm 0.353 | train_wall 835 | gb_free 13.8 | wall 11062 +epoch 013 | loss 0.016 | ppl 1.01 | wps 10436.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 53976 | lr 0.000136113 | gnorm 0.353 | train_wall 835 | gb_free 13.8 | wall 11062 +epoch 013 | loss 0.016 | ppl 1.01 | wps 10436.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 53976 | lr 0.000136113 | gnorm 0.353 | train_wall 835 | gb_free 13.8 | wall 11062 +epoch 013 | loss 0.016 | ppl 1.01 | wps 10436.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 53976 | lr 0.000136113 | gnorm 0.353 | train_wall 835 | gb_free 13.8 | wall 11062 +epoch 013 | loss 0.016 | ppl 1.01 | wps 10436.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 53976 | lr 0.000136113 | gnorm 0.353 | train_wall 835 | gb_free 13.8 | wall 11062 +epoch 013 | loss 0.016 | ppl 1.01 | wps 10436.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 53976 | lr 0.000136113 | gnorm 0.353 | train_wall 835 | gb_free 13.8 | wall 11062 +epoch 013 | loss 0.016 | ppl 1.01 | wps 10436.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 53976 | lr 0.000136113 | gnorm 0.353 | train_wall 835 | gb_free 13.8 | wall 11062 +epoch 013 | loss 0.016 | ppl 1.01 | wps 10436.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 53976 | lr 0.000136113 | gnorm 0.353 | train_wall 835 | gb_free 13.8 | wall 11062 +epoch 013 | loss 0.016 | ppl 1.01 | wps 10436.5 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 53976 | lr 0.000136113 | gnorm 0.353 | train_wall 835 | gb_free 13.8 | wall 11062 +Start iterating over samples +begin validation on "valid" subset +epoch 014 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34802.6 | wpb 2108.3 | bsz 101.7 | num_updates 58128 | best_loss 0.08 +epoch 014 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34802.6 | wpb 2108.3 | bsz 101.7 | num_updates 58128 | best_loss 0.08 +epoch 014 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34802.6 | wpb 2108.3 | bsz 101.7 | num_updates 58128 | best_loss 0.08 +epoch 014 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34802.6 | wpb 2108.3 | bsz 101.7 | num_updates 58128 | best_loss 0.08 +epoch 014 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34802.6 | wpb 2108.3 | bsz 101.7 | num_updates 58128 | best_loss 0.08 +epoch 014 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34802.6 | wpb 2108.3 | bsz 101.7 | num_updates 58128 | best_loss 0.08 +epoch 014 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34802.6 | wpb 2108.3 | bsz 101.7 | num_updates 58128 | best_loss 0.08 +epoch 014 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34802.6 | wpb 2108.3 | bsz 101.7 | num_updates 58128 | best_loss 0.08 +epoch 014 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34802.6 | wpb 2108.3 | bsz 101.7 | num_updates 58128 | best_loss 0.08 +epoch 014 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34802.6 | wpb 2108.3 | bsz 101.7 | num_updates 58128 | best_loss 0.08 +epoch 014 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34802.6 | wpb 2108.3 | bsz 101.7 | num_updates 58128 | best_loss 0.08 +epoch 014 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34802.6 | wpb 2108.3 | bsz 101.7 | num_updates 58128 | best_loss 0.08 +epoch 014 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34802.6 | wpb 2108.3 | bsz 101.7 | num_updates 58128 | best_loss 0.08 +epoch 014 | valid on 'valid' subset | loss 0.08 | ppl 1.06 | wps 34802.6 | wpb 2108.3 | bsz 101.7 | num_updates 58128 | best_loss 0.08 +end of epoch 14 (average epoch stats below) +epoch 014 | loss 0.014 | ppl 1.01 | wps 10452.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 58128 | lr 0.000131162 | gnorm 0.339 | train_wall 834 | gb_free 13.7 | wall 11911 +epoch 014 | loss 0.014 | ppl 1.01 | wps 10452.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 58128 | lr 0.000131162 | gnorm 0.339 | train_wall 834 | gb_free 13.7 | wall 11911 +epoch 014 | loss 0.014 | ppl 1.01 | wps 10452.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 58128 | lr 0.000131162 | gnorm 0.339 | train_wall 834 | gb_free 13.7 | wall 11911 +epoch 014 | loss 0.014 | ppl 1.01 | wps 10452.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 58128 | lr 0.000131162 | gnorm 0.339 | train_wall 834 | gb_free 13.7 | wall 11911 +epoch 014 | loss 0.014 | ppl 1.01 | wps 10452.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 58128 | lr 0.000131162 | gnorm 0.339 | train_wall 834 | gb_free 13.7 | wall 11911 +epoch 014 | loss 0.014 | ppl 1.01 | wps 10452.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 58128 | lr 0.000131162 | gnorm 0.339 | train_wall 834 | gb_free 13.7 | wall 11911 +epoch 014 | loss 0.014 | ppl 1.01 | wps 10452.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 58128 | lr 0.000131162 | gnorm 0.339 | train_wall 834 | gb_free 13.7 | wall 11911 +epoch 014 | loss 0.014 | ppl 1.01 | wps 10452.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 58128 | lr 0.000131162 | gnorm 0.339 | train_wall 834 | gb_free 13.7 | wall 11911 +epoch 014 | loss 0.014 | ppl 1.01 | wps 10452.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 58128 | lr 0.000131162 | gnorm 0.339 | train_wall 834 | gb_free 13.7 | wall 11911 +epoch 014 | loss 0.014 | ppl 1.01 | wps 10452.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 58128 | lr 0.000131162 | gnorm 0.339 | train_wall 834 | gb_free 13.7 | wall 11911 +epoch 014 | loss 0.014 | ppl 1.01 | wps 10452.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 58128 | lr 0.000131162 | gnorm 0.339 | train_wall 834 | gb_free 13.7 | wall 11911 +epoch 014 | loss 0.014 | ppl 1.01 | wps 10452.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 58128 | lr 0.000131162 | gnorm 0.339 | train_wall 834 | gb_free 13.7 | wall 11911 +epoch 014 | loss 0.014 | ppl 1.01 | wps 10452.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 58128 | lr 0.000131162 | gnorm 0.339 | train_wall 834 | gb_free 13.7 | wall 11911 +epoch 014 | loss 0.014 | ppl 1.01 | wps 10452.2 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 58128 | lr 0.000131162 | gnorm 0.339 | train_wall 834 | gb_free 13.7 | wall 11911 +Start iterating over samples +begin validation on "valid" subset +epoch 015 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 34723.3 | wpb 2108.3 | bsz 101.7 | num_updates 62280 | best_loss 0.078 +epoch 015 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 34723.3 | wpb 2108.3 | bsz 101.7 | num_updates 62280 | best_loss 0.078 +epoch 015 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 34723.3 | wpb 2108.3 | bsz 101.7 | num_updates 62280 | best_loss 0.078 +epoch 015 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 34723.3 | wpb 2108.3 | bsz 101.7 | num_updates 62280 | best_loss 0.078 +epoch 015 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 34723.3 | wpb 2108.3 | bsz 101.7 | num_updates 62280 | best_loss 0.078 +epoch 015 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 34723.3 | wpb 2108.3 | bsz 101.7 | num_updates 62280 | best_loss 0.078 +epoch 015 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 34723.3 | wpb 2108.3 | bsz 101.7 | num_updates 62280 | best_loss 0.078 +epoch 015 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 34723.3 | wpb 2108.3 | bsz 101.7 | num_updates 62280 | best_loss 0.078 +epoch 015 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 34723.3 | wpb 2108.3 | bsz 101.7 | num_updates 62280 | best_loss 0.078 +epoch 015 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 34723.3 | wpb 2108.3 | bsz 101.7 | num_updates 62280 | best_loss 0.078 +epoch 015 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 34723.3 | wpb 2108.3 | bsz 101.7 | num_updates 62280 | best_loss 0.078 +epoch 015 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 34723.3 | wpb 2108.3 | bsz 101.7 | num_updates 62280 | best_loss 0.078 +epoch 015 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 34723.3 | wpb 2108.3 | bsz 101.7 | num_updates 62280 | best_loss 0.078 +epoch 015 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 34723.3 | wpb 2108.3 | bsz 101.7 | num_updates 62280 | best_loss 0.078 +epoch 015 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 34723.3 | wpb 2108.3 | bsz 101.7 | num_updates 62280 | best_loss 0.078 +end of epoch 15 (average epoch stats below) +epoch 015 | loss 0.012 | ppl 1.01 | wps 10458.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 62280 | lr 0.000126714 | gnorm 0.321 | train_wall 833 | gb_free 13.7 | wall 12760 +epoch 015 | loss 0.012 | ppl 1.01 | wps 10458.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 62280 | lr 0.000126714 | gnorm 0.321 | train_wall 833 | gb_free 13.7 | wall 12760 +epoch 015 | loss 0.012 | ppl 1.01 | wps 10458.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 62280 | lr 0.000126714 | gnorm 0.321 | train_wall 833 | gb_free 13.7 | wall 12760 +epoch 015 | loss 0.012 | ppl 1.01 | wps 10458.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 62280 | lr 0.000126714 | gnorm 0.321 | train_wall 833 | gb_free 13.7 | wall 12760 +epoch 015 | loss 0.012 | ppl 1.01 | wps 10458.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 62280 | lr 0.000126714 | gnorm 0.321 | train_wall 833 | gb_free 13.7 | wall 12760 +epoch 015 | loss 0.012 | ppl 1.01 | wps 10458.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 62280 | lr 0.000126714 | gnorm 0.321 | train_wall 833 | gb_free 13.7 | wall 12760 +epoch 015 | loss 0.012 | ppl 1.01 | wps 10458.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 62280 | lr 0.000126714 | gnorm 0.321 | train_wall 833 | gb_free 13.7 | wall 12760 +epoch 015 | loss 0.012 | ppl 1.01 | wps 10458.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 62280 | lr 0.000126714 | gnorm 0.321 | train_wall 833 | gb_free 13.7 | wall 12760 +epoch 015 | loss 0.012 | ppl 1.01 | wps 10458.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 62280 | lr 0.000126714 | gnorm 0.321 | train_wall 833 | gb_free 13.7 | wall 12760 +epoch 015 | loss 0.012 | ppl 1.01 | wps 10458.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 62280 | lr 0.000126714 | gnorm 0.321 | train_wall 833 | gb_free 13.7 | wall 12760 +epoch 015 | loss 0.012 | ppl 1.01 | wps 10458.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 62280 | lr 0.000126714 | gnorm 0.321 | train_wall 833 | gb_free 13.7 | wall 12760 +epoch 015 | loss 0.012 | ppl 1.01 | wps 10458.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 62280 | lr 0.000126714 | gnorm 0.321 | train_wall 833 | gb_free 13.7 | wall 12760 +epoch 015 | loss 0.012 | ppl 1.01 | wps 10458.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 62280 | lr 0.000126714 | gnorm 0.321 | train_wall 833 | gb_free 13.7 | wall 12760 +epoch 015 | loss 0.012 | ppl 1.01 | wps 10458.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 62280 | lr 0.000126714 | gnorm 0.321 | train_wall 833 | gb_free 13.7 | wall 12760 +epoch 015 | loss 0.012 | ppl 1.01 | wps 10458.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 62280 | lr 0.000126714 | gnorm 0.321 | train_wall 833 | gb_free 13.7 | wall 12760 +Start iterating over samples +begin validation on "valid" subset +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +epoch 016 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34753.6 | wpb 2108.3 | bsz 101.7 | num_updates 66432 | best_loss 0.071 +end of epoch 16 (average epoch stats below) +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +epoch 016 | loss 0.011 | ppl 1.01 | wps 10457.1 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 66432 | lr 0.000122691 | gnorm 0.306 | train_wall 833 | gb_free 13.7 | wall 13609 +Start iterating over samples +begin validation on "valid" subset +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +epoch 017 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34626.5 | wpb 2108.3 | bsz 101.7 | num_updates 70584 | best_loss 0.071 +end of epoch 17 (average epoch stats below) +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +epoch 017 | loss 0.01 | ppl 1.01 | wps 10299.4 | ups 4.82 | wpb 2137.6 | bsz 103.2 | num_updates 70584 | lr 0.000119027 | gnorm 0.284 | train_wall 833 | gb_free 13.8 | wall 14470 +Start iterating over samples +begin validation on "valid" subset +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +epoch 018 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 34667.3 | wpb 2108.3 | bsz 101.7 | num_updates 74736 | best_loss 0.071 +end of epoch 18 (average epoch stats below) +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +epoch 018 | loss 0.009 | ppl 1.01 | wps 10448.6 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 74736 | lr 0.000115674 | gnorm 0.278 | train_wall 834 | gb_free 13.8 | wall 15320 +Start iterating over samples +begin validation on "valid" subset +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +epoch 019 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34705.2 | wpb 2108.3 | bsz 101.7 | num_updates 78888 | best_loss 0.071 +end of epoch 19 (average epoch stats below) +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +epoch 019 | loss 0.008 | ppl 1.01 | wps 10443.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 78888 | lr 0.000112589 | gnorm 0.274 | train_wall 834 | gb_free 13.9 | wall 16170 +Start iterating over samples +begin validation on "valid" subset +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +epoch 020 | valid on 'valid' subset | loss 0.068 | ppl 1.05 | wps 34737.4 | wpb 2108.3 | bsz 101.7 | num_updates 83040 | best_loss 0.068 +end of epoch 20 (average epoch stats below) +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +epoch 020 | loss 0.007 | ppl 1.01 | wps 10394.4 | ups 4.86 | wpb 2137.6 | bsz 103.2 | num_updates 83040 | lr 0.000109738 | gnorm 0.256 | train_wall 837 | gb_free 13.7 | wall 17023 +Start iterating over samples +begin validation on "valid" subset +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +epoch 021 | valid on 'valid' subset | loss 0.069 | ppl 1.05 | wps 34637.7 | wpb 2108.3 | bsz 101.7 | num_updates 87192 | best_loss 0.068 +end of epoch 21 (average epoch stats below) +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +epoch 021 | loss 0.007 | ppl 1 | wps 10457.9 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 87192 | lr 0.000107093 | gnorm 0.249 | train_wall 833 | gb_free 13.9 | wall 17872 +Start iterating over samples +begin validation on "valid" subset +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +epoch 022 | valid on 'valid' subset | loss 0.082 | ppl 1.06 | wps 34699.9 | wpb 2108.3 | bsz 101.7 | num_updates 91344 | best_loss 0.068 +end of epoch 22 (average epoch stats below) +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +epoch 022 | loss 0.006 | ppl 1 | wps 10449.3 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 91344 | lr 0.000104631 | gnorm 0.232 | train_wall 834 | gb_free 13.7 | wall 18722 +Start iterating over samples +begin validation on "valid" subset +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +epoch 023 | valid on 'valid' subset | loss 0.071 | ppl 1.05 | wps 34572 | wpb 2108.3 | bsz 101.7 | num_updates 95496 | best_loss 0.068 +end of epoch 23 (average epoch stats below) +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +epoch 023 | loss 0.006 | ppl 1 | wps 10439.6 | ups 4.88 | wpb 2137.6 | bsz 103.2 | num_updates 95496 | lr 0.000102331 | gnorm 0.22 | train_wall 835 | gb_free 13.7 | wall 19572 +Start iterating over samples +begin validation on "valid" subset +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +epoch 024 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 34532.5 | wpb 2108.3 | bsz 101.7 | num_updates 99648 | best_loss 0.068 +early stop since valid performance hasn't improved for last 4 runs +end of epoch 24 (average epoch stats below) +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +epoch 024 | loss 0.006 | ppl 1 | wps 10448.8 | ups 4.89 | wpb 2137.6 | bsz 103.2 | num_updates 99648 | lr 0.000100176 | gnorm 0.224 | train_wall 834 | gb_free 13.6 | wall 20421 +done training in 20420.7 seconds