| train: Dataset({ | |
| features: ['id', 'original_id', 'bit_sequence', 'sequence_length', 'negentropy', 'lz_complexity', 'compression_ratio', 'original_length', 'has_parity', 'category', 'original_text', 'pattern_type', 'safety_category', 'target_negentropy', 'target_complexity'], | |
| num_rows: 11670 | |
| }) | |
| validation: Dataset({ | |
| features: ['id', 'original_id', 'bit_sequence', 'sequence_length', 'negentropy', 'lz_complexity', 'compression_ratio', 'original_length', 'has_parity', 'category', 'original_text', 'pattern_type', 'safety_category', 'target_negentropy', 'target_complexity'], | |
| num_rows: 1459 | |
| }) | |
| test: Dataset({ | |
| features: ['id', 'original_id', 'bit_sequence', 'sequence_length', 'negentropy', 'lz_complexity', 'compression_ratio', 'original_length', 'has_parity', 'category', 'original_text', 'pattern_type', 'safety_category', 'target_negentropy', 'target_complexity'], | |
| num_rows: 1459 | |
| }) | |
| }) | |
| 2025-09-04 03:12:47,244 - INFO - Training samples: 11670 | |
| 2025-09-04 03:12:47,245 - INFO - Processing sample 0/11670 | |
| 2025-09-04 03:12:47,561 - INFO - Processing sample 1000/11670 | |
| 2025-09-04 03:12:47,800 - INFO - Processing sample 2000/11670 | |
| 2025-09-04 03:12:48,021 - INFO - Processing sample 3000/11670 | |
| 2025-09-04 03:12:48,245 - INFO - Processing sample 4000/11670 | |
| 2025-09-04 03:12:48,465 - INFO - Processing sample 5000/11670 | |
| 2025-09-04 03:12:48,704 - INFO - Processing sample 6000/11670 | |
| 2025-09-04 03:12:48,951 - INFO - Processing sample 7000/11670 | |
| 2025-09-04 03:12:49,168 - INFO - Processing sample 8000/11670 | |
| 2025-09-04 03:12:49,387 - INFO - Processing sample 9000/11670 | |
| 2025-09-04 03:12:49,643 - INFO - Processing sample 10000/11670 | |
| 2025-09-04 03:12:49,907 - INFO - Processing sample 11000/11670 | |
| 2025-09-04 03:12:50,048 - INFO - Processed 416 valid bit sequences | |
| 2025-09-04 03:12:50,088 - INFO - Created training dataset: torch.Size([2199, 256]) | |
| 2025-09-04 03:12:50,092 - INFO - Loading checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 03:12:50,159 - INFO - Resumed from epoch 0, best loss: inf | |
| 2025-09-04 03:12:50,163 - INFO - ๐ STARTING BREAKTHROUGH BITRANSFORMERLM TRAINING! | |
| 2025-09-04 03:12:50,163 - INFO - Configuration: Fixed RL Adafactor + 16M parameters + CPU training | |
| 2025-09-04 03:12:50,163 - INFO - Starting production training for 50 epochs... | |
| 2025-09-04 03:12:50,163 - INFO - Breakthrough configuration: Fixed RL Adafactor + 16M BitTransformerLM | |
| 2025-09-04 03:12:50,163 - INFO - Starting epoch 1 | |
| 2025-09-04 03:12:57,795 - INFO - Epoch 1, Step 0/549: Loss=6.945019, K=1.000, C=0.000, S=0.481, LR=4.00e-05 | |
| 2025-09-04 03:13:33,916 - INFO - Epoch 1, Step 10/549: Loss=2.408509, K=0.516, C=0.109, S=0.488, LR=4.03e-05 | |
| 2025-09-04 03:14:04,818 - INFO - Epoch 1, Step 20/549: Loss=0.912506, K=0.014, C=0.136, S=0.465, LR=4.11e-05 | |
| 2025-09-04 03:14:34,286 - INFO - Epoch 1, Step 30/549: Loss=0.891918, K=0.021, C=0.062, S=0.465, LR=4.24e-05 | |
| 2025-09-04 03:15:05,491 - INFO - Epoch 1, Step 40/549: Loss=0.888827, K=0.023, C=0.086, S=0.465, LR=4.42e-05 | |
| 2025-09-04 03:15:37,504 - INFO - Epoch 1, Step 50/549: Loss=0.896317, K=0.002, C=0.143, S=0.465, LR=4.64e-05 | |
| 2025-09-04 03:16:11,092 - INFO - Epoch 1, Step 60/549: Loss=0.892713, K=0.004, C=0.156, S=0.465, LR=4.91e-05 | |
| 2025-09-04 03:16:44,201 - INFO - Epoch 1, Step 70/549: Loss=0.945013, K=0.051, C=0.073, S=0.466, LR=5.23e-05 | |
| 2025-09-04 03:17:17,900 - INFO - Epoch 1, Step 80/549: Loss=0.920252, K=0.055, C=0.056, S=0.466, LR=5.59e-05 | |
| 2025-09-04 03:17:50,700 - INFO - Epoch 1, Step 90/549: Loss=0.904275, K=0.043, C=0.065, S=0.466, LR=5.99e-05 | |
| 2025-09-04 03:18:22,812 - INFO - Epoch 1, Step 100/549: Loss=0.899758, K=0.049, C=0.062, S=0.466, LR=6.45e-05 | |
| 2025-09-04 03:18:54,315 - INFO - Epoch 1, Step 110/549: Loss=0.916431, K=0.004, C=0.183, S=0.466, LR=6.95e-05 | |
| 2025-09-04 03:19:27,222 - INFO - Epoch 1, Step 120/549: Loss=0.932081, K=0.002, C=0.203, S=0.466, LR=7.49e-05 | |
| 2025-09-04 03:19:58,408 - INFO - Epoch 1, Step 130/549: Loss=0.949225, K=0.095, C=0.065, S=0.468, LR=8.08e-05 | |
| 2025-09-04 03:20:30,438 - INFO - Epoch 1, Step 140/549: Loss=0.972041, K=0.125, C=0.052, S=0.468, LR=8.71e-05 | |
| 2025-09-04 03:21:02,584 - INFO - Epoch 1, Step 150/549: Loss=0.964834, K=0.124, C=0.077, S=0.469, LR=9.38e-05 | |
| 2025-09-04 03:21:35,118 - INFO - Epoch 1, Step 160/549: Loss=0.984318, K=0.123, C=0.062, S=0.469, LR=1.01e-04 | |
| 2025-09-04 03:22:07,679 - INFO - Epoch 1, Step 170/549: Loss=1.002787, K=0.125, C=0.081, S=0.470, LR=1.09e-04 | |
| 2025-09-04 03:22:38,799 - INFO - Epoch 1, Step 180/549: Loss=1.000302, K=0.096, C=0.096, S=0.469, LR=1.16e-04 | |
| 2025-09-04 03:23:10,657 - INFO - Epoch 1, Step 190/549: Loss=0.963313, K=0.105, C=0.063, S=0.468, LR=1.25e-04 | |
| 2025-09-04 03:23:45,321 - INFO - Epoch 1, Step 200/549: Loss=0.992513, K=0.104, C=0.120, S=0.470, LR=1.34e-04 | |
| 2025-09-04 03:24:18,371 - INFO - Epoch 1, Step 210/549: Loss=1.014876, K=0.133, C=0.105, S=0.471, LR=1.43e-04 | |
| 2025-09-04 03:24:50,586 - INFO - Epoch 1, Step 220/549: Loss=0.998555, K=0.121, C=0.115, S=0.471, LR=1.52e-04 | |
| 2025-09-04 03:25:21,045 - INFO - Epoch 1, Step 230/549: Loss=0.993533, K=0.127, C=0.111, S=0.471, LR=1.62e-04 | |
| 2025-09-04 03:25:52,066 - INFO - Epoch 1, Step 240/549: Loss=1.025823, K=0.158, C=0.047, S=0.470, LR=1.72e-04 | |
| 2025-09-04 03:26:22,809 - INFO - Epoch 1, Step 250/549: Loss=1.034523, K=0.176, C=0.051, S=0.470, LR=1.83e-04 | |
| 2025-09-04 03:26:52,126 - INFO - Epoch 1, Step 260/549: Loss=1.118134, K=0.011, C=0.375, S=0.473, LR=1.94e-04 | |
| 2025-09-04 03:27:22,809 - INFO - Epoch 1, Step 270/549: Loss=1.286714, K=0.008, C=0.416, S=0.474, LR=2.05e-04 | |
| 2025-09-04 03:27:53,127 - INFO - Epoch 1, Step 280/549: Loss=1.321608, K=0.001, C=0.436, S=0.474, LR=2.17e-04 | |
| 2025-09-04 03:28:23,528 - INFO - Epoch 1, Step 290/549: Loss=1.369863, K=0.006, C=0.422, S=0.474, LR=2.29e-04 | |
| 2025-09-04 03:28:55,622 - INFO - Epoch 1, Step 300/549: Loss=1.430617, K=0.008, C=0.426, S=0.475, LR=2.41e-04 | |
| 2025-09-04 03:29:27,634 - INFO - Epoch 1, Step 310/549: Loss=1.406716, K=0.001, C=0.445, S=0.475, LR=2.53e-04 | |
| 2025-09-04 03:30:00,105 - INFO - Epoch 1, Step 320/549: Loss=1.411154, K=0.001, C=0.445, S=0.475, LR=2.66e-04 | |
| 2025-09-04 03:30:31,393 - INFO - Epoch 1, Step 330/549: Loss=1.421272, K=0.002, C=0.438, S=0.475, LR=2.79e-04 | |
| 2025-09-04 03:31:00,422 - INFO - Epoch 1, Step 340/549: Loss=1.330500, K=0.018, C=0.459, S=0.477, LR=2.92e-04 | |
| 2025-09-04 03:31:31,133 - INFO - Epoch 1, Step 350/549: Loss=1.507722, K=0.002, C=0.449, S=0.475, LR=3.05e-04 | |
| 2025-09-04 03:32:03,794 - INFO - Epoch 1, Step 360/549: Loss=1.515965, K=0.007, C=0.480, S=0.477, LR=3.19e-04 | |
| 2025-09-04 03:32:36,023 - INFO - Epoch 1, Step 370/549: Loss=1.598212, K=0.003, C=0.482, S=0.477, LR=3.33e-04 | |
| 2025-09-04 03:33:09,277 - INFO - Epoch 1, Step 380/549: Loss=1.563406, K=0.005, C=0.500, S=0.477, LR=3.47e-04 | |
| 2025-09-04 03:33:42,131 - INFO - Epoch 1, Step 390/549: Loss=1.682776, K=0.069, C=0.377, S=0.481, LR=3.61e-04 | |
| 2025-09-04 03:34:15,234 - INFO - Epoch 1, Step 400/549: Loss=1.837267, K=0.734, C=0.020, S=0.482, LR=3.75e-04 | |
| 2025-09-04 03:34:47,354 - INFO - Epoch 1, Step 410/549: Loss=1.810507, K=0.695, C=0.044, S=0.486, LR=3.90e-04 | |
| 2025-09-04 03:35:18,345 - INFO - Epoch 1, Step 420/549: Loss=1.890824, K=0.727, C=0.030, S=0.484, LR=4.04e-04 | |
| 2025-09-04 03:35:51,069 - INFO - Epoch 1, Step 430/549: Loss=1.854634, K=0.566, C=0.094, S=0.490, LR=4.19e-04 | |
| 2025-09-04 03:36:23,606 - INFO - Epoch 1, Step 440/549: Loss=1.917933, K=0.707, C=0.047, S=0.487, LR=4.34e-04 | |
| 2025-09-04 03:36:57,059 - INFO - Epoch 1, Step 450/549: Loss=1.817696, K=0.555, C=0.098, S=0.490, LR=4.49e-04 | |
| 2025-09-04 03:37:29,606 - INFO - Epoch 1, Step 460/549: Loss=1.942708, K=0.719, C=0.040, S=0.486, LR=4.64e-04 | |
| 2025-09-04 03:38:03,291 - INFO - Epoch 1, Step 470/549: Loss=1.971696, K=0.609, C=0.082, S=0.490, LR=4.79e-04 | |
| 2025-09-04 03:38:36,715 - INFO - Epoch 1, Step 480/549: Loss=2.095640, K=0.680, C=0.057, S=0.488, LR=4.94e-04 | |
| 2025-09-04 03:39:08,709 - INFO - Epoch 1, Step 490/549: Loss=2.007380, K=0.730, C=0.039, S=0.486, LR=5.09e-04 | |
| 2025-09-04 03:39:41,203 - INFO - Epoch 1, Step 500/549: Loss=2.004427, K=0.404, C=0.168, S=0.492, LR=5.24e-04 | |
| 2025-09-04 03:40:12,131 - INFO - Epoch 1, Step 510/549: Loss=2.164621, K=0.766, C=0.036, S=0.486, LR=5.39e-04 | |
| 2025-09-04 03:40:43,654 - INFO - Epoch 1, Step 520/549: Loss=2.156728, K=0.602, C=0.084, S=0.490, LR=5.54e-04 | |
| 2025-09-04 03:41:17,835 - INFO - Epoch 1, Step 530/549: Loss=2.222218, K=0.688, C=0.065, S=0.490, LR=5.69e-04 | |
| 2025-09-04 03:41:51,904 - INFO - Epoch 1, Step 540/549: Loss=2.289957, K=0.730, C=0.050, S=0.488, LR=5.84e-04 | |
| 2025-09-04 03:42:18,802 - INFO - Epoch 1 completed in 1768.6s: Avg Loss=1.491004, K=0.310, C=0.164, S=0.477 | |
| 2025-09-04 03:42:18,992 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 03:42:19,334 - INFO - NEW BEST MODEL! Loss: 1.491004 -> /data/BitTransformerLM/checkpoints/checkpoint_best.pt | |
| 2025-09-04 03:42:19,339 - INFO - === EPOCH 1 COMPLETE === | |
| 2025-09-04 03:42:19,339 - INFO - Loss: 1.491004 (best: 1.491004) | |
| 2025-09-04 03:42:19,339 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 03:42:19,340 - INFO - Starting epoch 2 | |
| 2025-09-04 03:42:22,700 - INFO - Epoch 2, Step 0/549: Loss=2.357343, K=0.879, C=0.017, S=0.485, LR=5.97e-04 | |
| 2025-09-04 03:42:56,102 - INFO - Epoch 2, Step 10/549: Loss=2.197070, K=0.906, C=0.011, S=0.484, LR=6.12e-04 | |
| 2025-09-04 03:43:26,922 - INFO - Epoch 2, Step 20/549: Loss=2.559826, K=0.863, C=0.019, S=0.485, LR=6.27e-04 | |
| 2025-09-04 03:43:55,585 - INFO - Epoch 2, Step 30/549: Loss=2.364161, K=0.875, C=0.017, S=0.485, LR=6.42e-04 | |
| 2025-09-04 03:44:25,465 - INFO - Epoch 2, Step 40/549: Loss=2.334850, K=0.867, C=0.017, S=0.485, LR=6.56e-04 | |
| 2025-09-04 03:44:54,196 - INFO - Epoch 2, Step 50/549: Loss=2.093965, K=0.762, C=0.039, S=0.487, LR=6.71e-04 | |
| 2025-09-04 03:45:23,921 - INFO - Epoch 2, Step 60/549: Loss=2.224668, K=0.762, C=0.040, S=0.487, LR=6.85e-04 | |
| 2025-09-04 03:45:53,075 - INFO - Epoch 2, Step 70/549: Loss=2.304588, K=0.727, C=0.050, S=0.488, LR=6.99e-04 | |
| 2025-09-04 03:46:22,517 - INFO - Epoch 2, Step 80/549: Loss=2.210470, K=0.793, C=0.033, S=0.486, LR=7.13e-04 | |
| 2025-09-04 03:46:51,868 - INFO - Epoch 2, Step 90/549: Loss=2.336373, K=0.750, C=0.042, S=0.487, LR=7.27e-04 | |
| 2025-09-04 03:47:22,728 - INFO - Epoch 2, Step 100/549: Loss=2.409166, K=0.770, C=0.039, S=0.487, LR=7.40e-04 | |
| 2025-09-04 03:47:53,907 - INFO - Epoch 2, Step 110/549: Loss=2.238059, K=0.770, C=0.039, S=0.487, LR=7.53e-04 | |
| 2025-09-04 03:48:25,290 - INFO - Epoch 2, Step 120/549: Loss=2.350143, K=0.156, C=0.332, S=0.488, LR=7.67e-04 | |
| 2025-09-04 03:48:57,272 - INFO - Epoch 2, Step 130/549: Loss=2.518044, K=0.066, C=0.439, S=0.483, LR=7.79e-04 | |
| 2025-09-04 03:49:27,369 - INFO - Epoch 2, Step 140/549: Loss=2.580777, K=0.027, C=0.477, S=0.481, LR=7.92e-04 | |
| 2025-09-04 03:49:56,966 - INFO - Epoch 2, Step 150/549: Loss=2.945192, K=0.013, C=0.531, S=0.481, LR=8.04e-04 | |
| 2025-09-04 03:50:27,361 - INFO - Epoch 2, Step 160/549: Loss=3.147150, K=0.105, C=0.379, S=0.485, LR=8.16e-04 | |
| 2025-09-04 03:50:57,794 - INFO - Epoch 2, Step 170/549: Loss=3.128520, K=0.052, C=0.441, S=0.482, LR=8.28e-04 | |
| 2025-09-04 03:51:27,097 - INFO - Epoch 2, Step 180/549: Loss=3.383422, K=0.073, C=0.395, S=0.482, LR=8.39e-04 | |
| 2025-09-04 03:51:58,965 - INFO - Epoch 2, Step 190/549: Loss=3.726457, K=0.062, C=0.453, S=0.483, LR=8.50e-04 | |
| 2025-09-04 03:52:32,461 - INFO - Epoch 2, Step 200/549: Loss=3.980708, K=0.021, C=0.484, S=0.480, LR=8.61e-04 | |
| 2025-09-04 03:53:03,595 - INFO - Epoch 2, Step 210/549: Loss=4.009552, K=0.020, C=0.498, S=0.480, LR=8.72e-04 | |
| 2025-09-04 03:53:36,889 - INFO - Epoch 2, Step 220/549: Loss=3.895764, K=0.021, C=0.457, S=0.479, LR=8.82e-04 | |
| 2025-09-04 03:54:10,722 - INFO - Epoch 2, Step 230/549: Loss=4.070724, K=0.013, C=0.512, S=0.480, LR=8.92e-04 | |
| 2025-09-04 03:54:44,265 - INFO - Epoch 2, Step 240/549: Loss=3.956797, K=0.017, C=0.494, S=0.480, LR=9.01e-04 | |
| 2025-09-04 03:55:17,701 - INFO - Epoch 2, Step 250/549: Loss=4.001827, K=0.020, C=0.500, S=0.480, LR=9.10e-04 | |
| 2025-09-04 03:55:49,488 - INFO - Epoch 2, Step 260/549: Loss=4.011623, K=0.025, C=0.486, S=0.480, LR=9.19e-04 | |
| 2025-09-04 03:56:22,445 - INFO - Epoch 2, Step 270/549: Loss=4.042419, K=0.016, C=0.492, S=0.480, LR=9.27e-04 | |
| 2025-09-04 03:56:55,993 - INFO - Epoch 2, Step 280/549: Loss=3.862787, K=0.014, C=0.492, S=0.480, LR=9.35e-04 | |
| 2025-09-04 03:57:29,733 - INFO - Epoch 2, Step 290/549: Loss=4.010089, K=0.163, C=0.320, S=0.488, LR=9.42e-04 | |
| 2025-09-04 03:58:04,345 - INFO - Epoch 2, Step 300/549: Loss=4.143061, K=0.037, C=0.484, S=0.482, LR=9.49e-04 | |
| 2025-09-04 03:58:38,407 - INFO - Epoch 2, Step 310/549: Loss=4.109137, K=0.016, C=0.508, S=0.480, LR=9.56e-04 | |
| 2025-09-04 03:59:10,939 - INFO - Epoch 2, Step 320/549: Loss=4.147477, K=0.003, C=0.512, S=0.480, LR=9.62e-04 | |
| 2025-09-04 03:59:44,410 - INFO - Epoch 2, Step 330/549: Loss=4.164157, K=0.083, C=0.408, S=0.484, LR=9.67e-04 | |
| 2025-09-04 04:00:15,850 - INFO - Epoch 2, Step 340/549: Loss=4.186996, K=0.012, C=0.523, S=0.480, LR=9.73e-04 | |
| 2025-09-04 04:00:50,045 - INFO - Epoch 2, Step 350/549: Loss=4.123337, K=0.036, C=0.449, S=0.480, LR=9.77e-04 | |
| 2025-09-04 04:01:23,518 - INFO - Epoch 2, Step 360/549: Loss=4.096387, K=0.012, C=0.500, S=0.480, LR=9.82e-04 | |
| 2025-09-04 04:01:57,463 - INFO - Epoch 2, Step 370/549: Loss=4.103847, K=0.022, C=0.492, S=0.481, LR=9.86e-04 | |
| 2025-09-04 04:02:33,541 - INFO - Epoch 2, Step 380/549: Loss=4.154932, K=0.006, C=0.523, S=0.480, LR=9.89e-04 | |
| 2025-09-04 04:03:10,794 - INFO - Epoch 2, Step 390/549: Loss=4.259106, K=0.011, C=0.523, S=0.480, LR=9.92e-04 | |
| 2025-09-04 04:03:44,734 - INFO - Epoch 2, Step 400/549: Loss=4.099108, K=0.006, C=0.523, S=0.481, LR=9.95e-04 | |
| 2025-09-04 04:04:20,745 - INFO - Epoch 2, Step 410/549: Loss=3.918416, K=0.014, C=0.500, S=0.481, LR=9.97e-04 | |
| 2025-09-04 04:04:57,106 - INFO - Epoch 2, Step 420/549: Loss=4.038072, K=0.019, C=0.484, S=0.480, LR=9.98e-04 | |
| 2025-09-04 04:05:33,517 - INFO - Epoch 2, Step 430/549: Loss=3.934506, K=0.016, C=0.488, S=0.480, LR=9.99e-04 | |
| 2025-09-04 04:06:09,917 - INFO - Epoch 2, Step 440/549: Loss=4.032218, K=0.063, C=0.436, S=0.483, LR=1.00e-03 | |
| 2025-09-04 04:06:44,486 - INFO - Epoch 2, Step 450/549: Loss=4.277268, K=0.009, C=0.531, S=0.481, LR=1.00e-03 | |
| 2025-09-04 04:07:20,133 - INFO - Epoch 2, Step 460/549: Loss=3.914258, K=0.015, C=0.520, S=0.482, LR=1.00e-03 | |
| 2025-09-04 04:07:53,797 - INFO - Epoch 2, Step 470/549: Loss=4.019012, K=0.996, C=0.000, S=0.483, LR=1.00e-03 | |
| 2025-09-04 04:08:25,257 - INFO - Epoch 2, Step 480/549: Loss=3.865450, K=0.969, C=0.004, S=0.484, LR=1.00e-03 | |
| 2025-09-04 04:09:01,238 - INFO - Epoch 2, Step 490/549: Loss=3.422668, K=0.930, C=0.008, S=0.484, LR=1.00e-03 | |
| 2025-09-04 04:09:37,487 - INFO - Epoch 2, Step 500/549: Loss=3.427466, K=0.887, C=0.016, S=0.485, LR=1.00e-03 | |
| 2025-09-04 04:10:13,652 - INFO - Epoch 2, Step 510/549: Loss=3.378032, K=0.855, C=0.021, S=0.485, LR=1.00e-03 | |
| 2025-09-04 04:10:50,044 - INFO - Epoch 2, Step 520/549: Loss=3.316720, K=0.844, C=0.024, S=0.486, LR=1.00e-03 | |
| 2025-09-04 04:11:27,127 - INFO - Epoch 2, Step 530/549: Loss=3.403861, K=0.875, C=0.018, S=0.485, LR=1.00e-03 | |
| 2025-09-04 04:12:01,440 - INFO - Epoch 2, Step 540/549: Loss=3.411167, K=0.875, C=0.018, S=0.485, LR=1.00e-03 | |
| 2025-09-04 04:12:27,053 - INFO - Epoch 2 completed in 1807.7s: Avg Loss=3.291326, K=0.295, C=0.328, S=0.482 | |
| 2025-09-04 04:12:27,246 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 04:12:27,416 - INFO - === EPOCH 2 COMPLETE === | |
| 2025-09-04 04:12:27,416 - INFO - Loss: 3.291326 (best: 1.491004) | |
| 2025-09-04 04:12:27,417 - INFO - Starting epoch 3 | |
| 2025-09-04 04:12:30,641 - INFO - Epoch 3, Step 0/549: Loss=4.510313, K=0.777, C=0.045, S=0.488, LR=1.00e-03 | |
| 2025-09-04 04:13:05,763 - INFO - Epoch 3, Step 10/549: Loss=4.638473, K=0.719, C=0.062, S=0.490, LR=1.00e-03 | |
| 2025-09-04 04:13:42,600 - INFO - Epoch 3, Step 20/549: Loss=4.437867, K=0.762, C=0.050, S=0.489, LR=1.00e-03 | |
| 2025-09-04 04:14:18,698 - INFO - Epoch 3, Step 30/549: Loss=4.490888, K=0.758, C=0.048, S=0.489, LR=9.99e-04 | |
| 2025-09-04 04:14:55,112 - INFO - Epoch 3, Step 40/549: Loss=4.297148, K=0.762, C=0.048, S=0.489, LR=9.99e-04 | |
| 2025-09-04 04:15:28,088 - INFO - Epoch 3, Step 50/549: Loss=4.347990, K=0.789, C=0.040, S=0.488, LR=9.99e-04 | |
| 2025-09-04 04:16:00,343 - INFO - Epoch 3, Step 60/549: Loss=4.330633, K=0.891, C=0.017, S=0.485, LR=9.99e-04 | |
| 2025-09-04 04:16:30,977 - INFO - Epoch 3, Step 70/549: Loss=4.378899, K=0.906, C=0.013, S=0.485, LR=9.99e-04 | |
| 2025-09-04 04:17:02,305 - INFO - Epoch 3, Step 80/549: Loss=4.473257, K=0.883, C=0.021, S=0.486, LR=9.99e-04 | |
| 2025-09-04 04:17:36,217 - INFO - Epoch 3, Step 90/549: Loss=4.553155, K=0.855, C=0.026, S=0.486, LR=9.99e-04 | |
| 2025-09-04 04:18:10,769 - INFO - Epoch 3, Step 100/549: Loss=4.257027, K=0.617, C=0.089, S=0.491, LR=9.99e-04 | |
| 2025-09-04 04:18:44,850 - INFO - Epoch 3, Step 110/549: Loss=4.143906, K=0.734, C=0.054, S=0.489, LR=9.99e-04 | |
| 2025-09-04 04:19:18,700 - INFO - Epoch 3, Step 120/549: Loss=4.387669, K=0.758, C=0.049, S=0.489, LR=9.99e-04 | |
| 2025-09-04 04:19:51,938 - INFO - Epoch 3, Step 130/549: Loss=4.654803, K=0.883, C=0.020, S=0.485, LR=9.98e-04 | |
| 2025-09-04 04:20:24,087 - INFO - Epoch 3, Step 140/549: Loss=4.119586, K=0.844, C=0.027, S=0.486, LR=9.98e-04 | |
| 2025-09-04 04:20:55,013 - INFO - Epoch 3, Step 150/549: Loss=4.222427, K=0.836, C=0.031, S=0.487, LR=9.98e-04 | |
| 2025-09-04 04:21:28,040 - INFO - Epoch 3, Step 160/549: Loss=4.018044, K=0.820, C=0.031, S=0.486, LR=9.98e-04 | |
| 2025-09-04 04:22:00,931 - INFO - Epoch 3, Step 170/549: Loss=4.258954, K=0.934, C=0.011, S=0.484, LR=9.98e-04 | |
| 2025-09-04 04:22:34,217 - INFO - Epoch 3, Step 180/549: Loss=4.425278, K=0.906, C=0.016, S=0.485, LR=9.98e-04 | |
| 2025-09-04 04:23:07,490 - INFO - Epoch 3, Step 190/549: Loss=4.370719, K=0.906, C=0.015, S=0.485, LR=9.97e-04 | |
| 2025-09-04 04:23:40,786 - INFO - Epoch 3, Step 200/549: Loss=3.899398, K=0.820, C=0.033, S=0.487, LR=9.97e-04 | |
| 2025-09-04 04:24:14,429 - INFO - Epoch 3, Step 210/549: Loss=3.938482, K=0.617, C=0.094, S=0.492, LR=9.97e-04 | |
| 2025-09-04 04:24:47,007 - INFO - Epoch 3, Step 220/549: Loss=3.749879, K=0.520, C=0.121, S=0.492, LR=9.97e-04 | |
| 2025-09-04 04:25:19,258 - INFO - Epoch 3, Step 230/549: Loss=3.971390, K=0.766, C=0.045, S=0.488, LR=9.97e-04 | |
| 2025-09-04 04:25:52,459 - INFO - Epoch 3, Step 240/549: Loss=3.832168, K=0.688, C=0.071, S=0.491, LR=9.96e-04 | |
| 2025-09-04 04:26:31,506 - INFO - Epoch 3, Step 250/549: Loss=3.806585, K=0.590, C=0.105, S=0.493, LR=9.96e-04 | |
| 2025-09-04 04:27:13,432 - INFO - Epoch 3, Step 260/549: Loss=3.989101, K=0.676, C=0.074, S=0.491, LR=9.96e-04 | |
| 2025-09-04 04:27:52,596 - INFO - Epoch 3, Step 270/549: Loss=3.604483, K=0.906, C=0.013, S=0.484, LR=9.96e-04 | |
| 2025-09-04 04:28:26,310 - INFO - Epoch 3, Step 280/549: Loss=3.872122, K=0.742, C=0.050, S=0.488, LR=9.96e-04 | |
| 2025-09-04 04:28:58,607 - INFO - Epoch 3, Step 290/549: Loss=3.821562, K=0.418, C=0.170, S=0.493, LR=9.95e-04 | |
| 2025-09-04 04:29:30,731 - INFO - Epoch 3, Step 300/549: Loss=4.247278, K=0.926, C=0.012, S=0.484, LR=9.95e-04 | |
| 2025-09-04 04:30:05,600 - INFO - Epoch 3, Step 310/549: Loss=3.885312, K=0.758, C=0.047, S=0.488, LR=9.95e-04 | |
| 2025-09-04 04:30:41,229 - INFO - Epoch 3, Step 320/549: Loss=3.897176, K=0.797, C=0.039, S=0.487, LR=9.95e-04 | |
| 2025-09-04 04:31:16,002 - INFO - Epoch 3, Step 330/549: Loss=4.264693, K=0.005, C=0.523, S=0.480, LR=9.94e-04 | |
| 2025-09-04 04:31:50,875 - INFO - Epoch 3, Step 340/549: Loss=4.161576, K=0.009, C=0.520, S=0.481, LR=9.94e-04 | |
| 2025-09-04 04:32:25,731 - INFO - Epoch 3, Step 350/549: Loss=3.956192, K=0.009, C=0.535, S=0.481, LR=9.94e-04 | |
| 2025-09-04 04:32:58,920 - INFO - Epoch 3, Step 360/549: Loss=3.937343, K=0.007, C=0.531, S=0.481, LR=9.94e-04 | |
| 2025-09-04 04:33:29,920 - INFO - Epoch 3, Step 370/549: Loss=3.832789, K=0.005, C=0.531, S=0.480, LR=9.93e-04 | |
| 2025-09-04 04:34:02,741 - INFO - Epoch 3, Step 380/549: Loss=3.840409, K=0.005, C=0.531, S=0.480, LR=9.93e-04 | |
| 2025-09-04 04:34:34,919 - INFO - Epoch 3, Step 390/549: Loss=3.349227, K=0.011, C=0.531, S=0.480, LR=9.93e-04 | |
| 2025-09-04 04:35:07,155 - INFO - Epoch 3, Step 400/549: Loss=2.835211, K=0.007, C=0.451, S=0.476, LR=9.92e-04 | |
| 2025-09-04 04:35:40,046 - INFO - Epoch 3, Step 410/549: Loss=2.987521, K=0.005, C=0.463, S=0.476, LR=9.92e-04 | |
| 2025-09-04 04:36:11,947 - INFO - Epoch 3, Step 420/549: Loss=2.680848, K=0.003, C=0.473, S=0.476, LR=9.92e-04 | |
| 2025-09-04 04:36:44,895 - INFO - Epoch 3, Step 430/549: Loss=2.629719, K=0.003, C=0.504, S=0.477, LR=9.91e-04 | |
| 2025-09-04 04:37:16,540 - INFO - Epoch 3, Step 440/549: Loss=2.832083, K=0.004, C=0.484, S=0.477, LR=9.91e-04 | |
| 2025-09-04 04:37:46,832 - INFO - Epoch 3, Step 450/549: Loss=2.895039, K=0.002, C=0.475, S=0.476, LR=9.91e-04 | |
| 2025-09-04 04:38:18,435 - INFO - Epoch 3, Step 460/549: Loss=2.835785, K=0.003, C=0.480, S=0.477, LR=9.90e-04 | |
| 2025-09-04 04:38:50,095 - INFO - Epoch 3, Step 470/549: Loss=2.770210, K=0.003, C=0.500, S=0.477, LR=9.90e-04 | |
| 2025-09-04 04:39:22,065 - INFO - Epoch 3, Step 480/549: Loss=2.583940, K=0.003, C=0.496, S=0.477, LR=9.90e-04 | |
| 2025-09-04 04:39:54,006 - INFO - Epoch 3, Step 490/549: Loss=2.539876, K=0.000, C=0.492, S=0.476, LR=9.89e-04 | |
| 2025-09-04 04:40:25,706 - INFO - Epoch 3, Step 500/549: Loss=2.670730, K=0.004, C=0.480, S=0.477, LR=9.89e-04 | |
| 2025-09-04 04:40:57,505 - INFO - Epoch 3, Step 510/549: Loss=2.636506, K=0.003, C=0.455, S=0.476, LR=9.89e-04 | |
| 2025-09-04 04:41:28,933 - INFO - Epoch 3, Step 520/549: Loss=2.490695, K=0.004, C=0.475, S=0.476, LR=9.88e-04 | |
| 2025-09-04 04:41:59,476 - INFO - Epoch 3, Step 530/549: Loss=2.409058, K=0.004, C=0.445, S=0.475, LR=9.88e-04 | |
| 2025-09-04 04:42:31,243 - INFO - Epoch 3, Step 540/549: Loss=2.279923, K=0.006, C=0.465, S=0.475, LR=9.88e-04 | |
| 2025-09-04 04:42:57,313 - INFO - Epoch 3 completed in 1829.9s: Avg Loss=3.433525, K=0.476, C=0.221, S=0.484 | |
| 2025-09-04 04:42:57,489 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 04:42:57,737 - INFO - === EPOCH 3 COMPLETE === | |
| 2025-09-04 04:42:57,737 - INFO - Loss: 3.433525 (best: 1.491004) | |
| 2025-09-04 04:42:57,737 - INFO - Starting epoch 4 | |
| 2025-09-04 04:43:01,207 - INFO - Epoch 4, Step 0/549: Loss=4.161745, K=0.020, C=0.512, S=0.480, LR=9.87e-04 | |
| 2025-09-04 04:43:33,441 - INFO - Epoch 4, Step 10/549: Loss=3.935947, K=0.013, C=0.516, S=0.480, LR=9.87e-04 | |
| 2025-09-04 04:44:06,590 - INFO - Epoch 4, Step 20/549: Loss=3.837452, K=0.034, C=0.451, S=0.480, LR=9.86e-04 | |
| 2025-09-04 04:44:39,511 - INFO - Epoch 4, Step 30/549: Loss=3.871685, K=0.008, C=0.512, S=0.479, LR=9.86e-04 | |
| 2025-09-04 04:45:11,602 - INFO - Epoch 4, Step 40/549: Loss=3.957943, K=0.106, C=0.373, S=0.484, LR=9.86e-04 | |
| 2025-09-04 04:45:42,343 - INFO - Epoch 4, Step 50/549: Loss=4.187918, K=0.014, C=0.535, S=0.480, LR=9.85e-04 | |
| 2025-09-04 04:46:11,466 - INFO - Epoch 4, Step 60/549: Loss=3.873549, K=0.008, C=0.496, S=0.478, LR=9.85e-04 | |
| 2025-09-04 04:46:42,335 - INFO - Epoch 4, Step 70/549: Loss=3.863484, K=0.018, C=0.504, S=0.480, LR=9.84e-04 | |
| 2025-09-04 04:47:13,569 - INFO - Epoch 4, Step 80/549: Loss=3.708131, K=0.016, C=0.488, S=0.479, LR=9.84e-04 | |
| 2025-09-04 04:47:44,601 - INFO - Epoch 4, Step 90/549: Loss=3.833086, K=0.030, C=0.490, S=0.481, LR=9.83e-04 | |
| 2025-09-04 04:48:19,458 - INFO - Epoch 4, Step 100/549: Loss=3.819363, K=0.014, C=0.508, S=0.479, LR=9.83e-04 | |
| 2025-09-04 04:48:51,327 - INFO - Epoch 4, Step 110/549: Loss=3.850780, K=0.006, C=0.531, S=0.480, LR=9.83e-04 | |
| 2025-09-04 04:49:22,812 - INFO - Epoch 4, Step 120/549: Loss=3.685854, K=0.049, C=0.424, S=0.480, LR=9.82e-04 | |
| 2025-09-04 04:49:54,820 - INFO - Epoch 4, Step 130/549: Loss=3.700493, K=0.039, C=0.457, S=0.480, LR=9.82e-04 | |
| 2025-09-04 04:50:25,012 - INFO - Epoch 4, Step 140/549: Loss=3.763500, K=0.025, C=0.480, S=0.479, LR=9.81e-04 | |
| 2025-09-04 04:50:57,166 - INFO - Epoch 4, Step 150/549: Loss=3.848339, K=0.017, C=0.496, S=0.479, LR=9.81e-04 | |
| 2025-09-04 04:51:31,825 - INFO - Epoch 4, Step 160/549: Loss=3.809134, K=0.015, C=0.535, S=0.481, LR=9.80e-04 | |
| 2025-09-04 04:52:06,507 - INFO - Epoch 4, Step 170/549: Loss=3.714677, K=0.013, C=0.523, S=0.480, LR=9.80e-04 | |
| 2025-09-04 04:52:38,177 - INFO - Epoch 4, Step 180/549: Loss=3.646334, K=0.012, C=0.520, S=0.480, LR=9.79e-04 | |
| 2025-09-04 04:53:08,970 - INFO - Epoch 4, Step 190/549: Loss=3.504847, K=0.024, C=0.480, S=0.480, LR=9.79e-04 | |
| 2025-09-04 04:53:41,760 - INFO - Epoch 4, Step 200/549: Loss=3.574586, K=0.024, C=0.492, S=0.480, LR=9.78e-04 | |
| 2025-09-04 04:54:15,200 - INFO - Epoch 4, Step 210/549: Loss=3.626187, K=0.014, C=0.508, S=0.480, LR=9.78e-04 | |
| 2025-09-04 04:54:47,135 - INFO - Epoch 4, Step 220/549: Loss=3.596747, K=0.020, C=0.480, S=0.479, LR=9.77e-04 | |
| 2025-09-04 04:55:21,507 - INFO - Epoch 4, Step 230/549: Loss=3.637091, K=0.075, C=0.428, S=0.483, LR=9.77e-04 | |
| 2025-09-04 04:55:54,368 - INFO - Epoch 4, Step 240/549: Loss=3.661911, K=0.018, C=0.512, S=0.480, LR=9.76e-04 | |
| 2025-09-04 04:56:28,227 - INFO - Epoch 4, Step 250/549: Loss=3.607255, K=0.017, C=0.500, S=0.479, LR=9.76e-04 | |
| 2025-09-04 04:57:01,845 - INFO - Epoch 4, Step 260/549: Loss=3.539376, K=0.014, C=0.508, S=0.480, LR=9.75e-04 | |
| 2025-09-04 04:57:35,639 - INFO - Epoch 4, Step 270/549: Loss=3.581644, K=0.096, C=0.371, S=0.482, LR=9.74e-04 | |
| 2025-09-04 04:58:08,282 - INFO - Epoch 4, Step 280/549: Loss=3.509186, K=0.055, C=0.430, S=0.481, LR=9.74e-04 | |
| 2025-09-04 04:58:39,068 - INFO - Epoch 4, Step 290/549: Loss=3.592500, K=0.034, C=0.490, S=0.481, LR=9.73e-04 | |
| 2025-09-04 04:59:10,999 - INFO - Epoch 4, Step 300/549: Loss=3.595861, K=0.005, C=0.531, S=0.480, LR=9.73e-04 | |
| 2025-09-04 04:59:44,595 - INFO - Epoch 4, Step 310/549: Loss=3.412889, K=0.031, C=0.463, S=0.480, LR=9.72e-04 | |
| 2025-09-04 05:00:17,445 - INFO - Epoch 4, Step 320/549: Loss=3.481847, K=0.013, C=0.498, S=0.479, LR=9.72e-04 | |
| 2025-09-04 05:00:48,701 - INFO - Epoch 4, Step 330/549: Loss=3.444653, K=0.020, C=0.490, S=0.479, LR=9.71e-04 | |
| 2025-09-04 05:01:21,392 - INFO - Epoch 4, Step 340/549: Loss=3.366562, K=0.018, C=0.494, S=0.480, LR=9.70e-04 | |
| 2025-09-04 05:01:54,944 - INFO - Epoch 4, Step 350/549: Loss=3.560143, K=0.022, C=0.496, S=0.480, LR=9.70e-04 | |
| 2025-09-04 05:02:27,934 - INFO - Epoch 4, Step 360/549: Loss=3.436127, K=0.037, C=0.471, S=0.481, LR=9.69e-04 | |
| 2025-09-04 05:02:59,459 - INFO - Epoch 4, Step 370/549: Loss=3.437571, K=0.102, C=0.379, S=0.484, LR=9.69e-04 | |
| 2025-09-04 05:03:31,308 - INFO - Epoch 4, Step 380/549: Loss=3.435496, K=0.025, C=0.482, S=0.480, LR=9.68e-04 | |
| 2025-09-04 05:04:03,622 - INFO - Epoch 4, Step 390/549: Loss=3.391290, K=0.016, C=0.504, S=0.479, LR=9.67e-04 | |
| 2025-09-04 05:04:38,137 - INFO - Epoch 4, Step 400/549: Loss=3.373513, K=0.035, C=0.457, S=0.479, LR=9.67e-04 | |
| 2025-09-04 05:05:13,609 - INFO - Epoch 4, Step 410/549: Loss=3.378079, K=0.012, C=0.523, S=0.480, LR=9.66e-04 | |
| 2025-09-04 05:05:47,575 - INFO - Epoch 4, Step 420/549: Loss=3.455843, K=0.007, C=0.527, S=0.479, LR=9.66e-04 | |
| 2025-09-04 05:06:22,133 - INFO - Epoch 4, Step 430/549: Loss=3.372872, K=0.104, C=0.367, S=0.483, LR=9.65e-04 | |
| 2025-09-04 05:06:55,122 - INFO - Epoch 4, Step 440/549: Loss=3.399441, K=0.992, C=0.000, S=0.482, LR=9.64e-04 | |
| 2025-09-04 05:07:26,921 - INFO - Epoch 4, Step 450/549: Loss=3.045933, K=0.953, C=0.006, S=0.483, LR=9.64e-04 | |
| 2025-09-04 05:08:03,044 - INFO - Epoch 4, Step 460/549: Loss=3.114237, K=0.957, C=0.005, S=0.482, LR=9.63e-04 | |
| 2025-09-04 05:08:35,147 - INFO - Epoch 4, Step 470/549: Loss=2.946796, K=0.953, C=0.005, S=0.483, LR=9.62e-04 | |
| 2025-09-04 05:09:10,730 - INFO - Epoch 4, Step 480/549: Loss=2.838197, K=0.930, C=0.008, S=0.483, LR=9.62e-04 | |
| 2025-09-04 05:09:43,184 - INFO - Epoch 4, Step 490/549: Loss=2.920776, K=0.930, C=0.009, S=0.483, LR=9.61e-04 | |
| 2025-09-04 05:10:17,483 - INFO - Epoch 4, Step 500/549: Loss=2.660730, K=0.926, C=0.009, S=0.483, LR=9.60e-04 | |
| 2025-09-04 05:10:51,262 - INFO - Epoch 4, Step 510/549: Loss=2.766702, K=0.930, C=0.009, S=0.483, LR=9.60e-04 | |
| 2025-09-04 05:11:22,942 - INFO - Epoch 4, Step 520/549: Loss=2.672099, K=0.883, C=0.016, S=0.484, LR=9.59e-04 | |
| 2025-09-04 05:11:56,022 - INFO - Epoch 4, Step 530/549: Loss=2.550283, K=0.867, C=0.019, S=0.484, LR=9.58e-04 | |
| 2025-09-04 05:12:29,524 - INFO - Epoch 4, Step 540/549: Loss=2.751794, K=0.859, C=0.021, S=0.484, LR=9.57e-04 | |
| 2025-09-04 05:12:56,403 - INFO - Epoch 4 completed in 1798.7s: Avg Loss=2.950807, K=0.184, C=0.389, S=0.480 | |
| 2025-09-04 05:12:56,641 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 05:12:56,880 - INFO - === EPOCH 4 COMPLETE === | |
| 2025-09-04 05:12:56,883 - INFO - Loss: 2.950807 (best: 1.491004) | |
| 2025-09-04 05:12:56,883 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 05:12:56,883 - INFO - Starting epoch 5 | |
| 2025-09-04 05:13:00,206 - INFO - Epoch 5, Step 0/549: Loss=3.772434, K=0.844, C=0.027, S=0.485, LR=9.57e-04 | |
| 2025-09-04 05:13:33,661 - INFO - Epoch 5, Step 10/549: Loss=3.737430, K=0.820, C=0.033, S=0.486, LR=9.56e-04 | |
| 2025-09-04 05:14:06,803 - INFO - Epoch 5, Step 20/549: Loss=3.709211, K=0.812, C=0.036, S=0.486, LR=9.55e-04 | |
| 2025-09-04 05:14:39,458 - INFO - Epoch 5, Step 30/549: Loss=3.757019, K=0.820, C=0.031, S=0.485, LR=9.55e-04 | |
| 2025-09-04 05:15:12,451 - INFO - Epoch 5, Step 40/549: Loss=3.707255, K=0.730, C=0.053, S=0.487, LR=9.54e-04 | |
| 2025-09-04 05:15:47,515 - INFO - Epoch 5, Step 50/549: Loss=3.716178, K=0.762, C=0.046, S=0.487, LR=9.53e-04 | |
| 2025-09-04 05:16:27,115 - INFO - Epoch 5, Step 60/549: Loss=3.739704, K=0.621, C=0.094, S=0.491, LR=9.52e-04 | |
| 2025-09-04 05:17:05,386 - INFO - Epoch 5, Step 70/549: Loss=3.595889, K=0.773, C=0.046, S=0.487, LR=9.52e-04 | |
| 2025-09-04 05:17:41,771 - INFO - Epoch 5, Step 80/549: Loss=3.779707, K=0.797, C=0.038, S=0.486, LR=9.51e-04 | |
| 2025-09-04 05:18:14,636 - INFO - Epoch 5, Step 90/549: Loss=3.684982, K=0.871, C=0.021, S=0.484, LR=9.50e-04 | |
| 2025-09-04 05:18:46,627 - INFO - Epoch 5, Step 100/549: Loss=3.761366, K=0.812, C=0.036, S=0.486, LR=9.49e-04 | |
| 2025-09-04 05:19:18,546 - INFO - Epoch 5, Step 110/549: Loss=3.630908, K=0.891, C=0.017, S=0.484, LR=9.49e-04 | |
| 2025-09-04 05:19:48,338 - INFO - Epoch 5, Step 120/549: Loss=3.571817, K=0.652, C=0.082, S=0.490, LR=9.48e-04 | |
| 2025-09-04 05:20:21,208 - INFO - Epoch 5, Step 130/549: Loss=3.384194, K=0.855, C=0.024, S=0.485, LR=9.47e-04 | |
| 2025-09-04 05:20:53,729 - INFO - Epoch 5, Step 140/549: Loss=3.611512, K=0.727, C=0.057, S=0.488, LR=9.46e-04 | |
| 2025-09-04 05:21:26,902 - INFO - Epoch 5, Step 150/549: Loss=3.479022, K=0.867, C=0.021, S=0.484, LR=9.46e-04 | |
| 2025-09-04 05:21:59,428 - INFO - Epoch 5, Step 160/549: Loss=3.627770, K=0.719, C=0.062, S=0.489, LR=9.45e-04 | |
| 2025-09-04 05:22:30,613 - INFO - Epoch 5, Step 170/549: Loss=3.393934, K=0.840, C=0.031, S=0.486, LR=9.44e-04 | |
| 2025-09-04 05:23:03,095 - INFO - Epoch 5, Step 180/549: Loss=3.421275, K=0.812, C=0.033, S=0.485, LR=9.43e-04 | |
| 2025-09-04 05:23:35,237 - INFO - Epoch 5, Step 190/549: Loss=3.542532, K=0.883, C=0.019, S=0.484, LR=9.42e-04 | |
| 2025-09-04 05:24:06,526 - INFO - Epoch 5, Step 200/549: Loss=3.475889, K=0.781, C=0.042, S=0.486, LR=9.42e-04 | |
| 2025-09-04 05:24:40,102 - INFO - Epoch 5, Step 210/549: Loss=3.398713, K=0.871, C=0.021, S=0.484, LR=9.41e-04 | |
| 2025-09-04 05:25:13,651 - INFO - Epoch 5, Step 220/549: Loss=3.406515, K=0.730, C=0.060, S=0.488, LR=9.40e-04 | |
| 2025-09-04 05:25:44,686 - INFO - Epoch 5, Step 230/549: Loss=3.256667, K=0.789, C=0.041, S=0.486, LR=9.39e-04 | |
| 2025-09-04 05:26:15,513 - INFO - Epoch 5, Step 240/549: Loss=3.202529, K=0.730, C=0.055, S=0.488, LR=9.38e-04 | |
| 2025-09-04 05:26:48,062 - INFO - Epoch 5, Step 250/549: Loss=3.121066, K=0.785, C=0.040, S=0.486, LR=9.37e-04 | |
| 2025-09-04 05:27:21,788 - INFO - Epoch 5, Step 260/549: Loss=3.387870, K=0.852, C=0.027, S=0.485, LR=9.37e-04 | |
| 2025-09-04 05:27:54,724 - INFO - Epoch 5, Step 270/549: Loss=3.151522, K=0.836, C=0.029, S=0.485, LR=9.36e-04 | |
| 2025-09-04 05:28:26,266 - INFO - Epoch 5, Step 280/549: Loss=3.322486, K=0.906, C=0.014, S=0.483, LR=9.35e-04 | |
| 2025-09-04 05:29:02,353 - INFO - Epoch 5, Step 290/549: Loss=3.135036, K=0.934, C=0.008, S=0.482, LR=9.34e-04 | |
| 2025-09-04 05:29:38,391 - INFO - Epoch 5, Step 300/549: Loss=3.255552, K=0.590, C=0.098, S=0.490, LR=9.33e-04 | |
| 2025-09-04 05:30:12,459 - INFO - Epoch 5, Step 310/549: Loss=3.268042, K=0.859, C=0.023, S=0.484, LR=9.32e-04 | |
| 2025-09-04 05:30:46,554 - INFO - Epoch 5, Step 320/549: Loss=3.315834, K=0.934, C=0.011, S=0.483, LR=9.31e-04 | |
| 2025-09-04 05:31:21,151 - INFO - Epoch 5, Step 330/549: Loss=3.195767, K=0.801, C=0.041, S=0.486, LR=9.30e-04 | |
| 2025-09-04 05:31:57,446 - INFO - Epoch 5, Step 340/549: Loss=3.182183, K=0.883, C=0.018, S=0.483, LR=9.30e-04 | |
| 2025-09-04 05:32:30,790 - INFO - Epoch 5, Step 350/549: Loss=3.167413, K=0.828, C=0.030, S=0.485, LR=9.29e-04 | |
| 2025-09-04 05:33:06,450 - INFO - Epoch 5, Step 360/549: Loss=3.127076, K=0.812, C=0.035, S=0.485, LR=9.28e-04 | |
| 2025-09-04 05:33:42,127 - INFO - Epoch 5, Step 370/549: Loss=3.056758, K=0.859, C=0.023, S=0.484, LR=9.27e-04 | |
| 2025-09-04 05:34:17,745 - INFO - Epoch 5, Step 380/549: Loss=3.109424, K=0.660, C=0.080, S=0.490, LR=9.26e-04 | |
| 2025-09-04 05:34:55,201 - INFO - Epoch 5, Step 390/549: Loss=3.061376, K=0.746, C=0.051, S=0.487, LR=9.25e-04 | |
| 2025-09-04 05:35:31,079 - INFO - Epoch 5, Step 400/549: Loss=3.131551, K=0.750, C=0.054, S=0.488, LR=9.24e-04 | |
| 2025-09-04 05:36:05,749 - INFO - Epoch 5, Step 410/549: Loss=3.060396, K=0.539, C=0.119, S=0.491, LR=9.23e-04 | |
| 2025-09-04 05:36:41,398 - INFO - Epoch 5, Step 420/549: Loss=3.216417, K=0.445, C=0.159, S=0.492, LR=9.22e-04 | |
| 2025-09-04 05:37:13,896 - INFO - Epoch 5, Step 430/549: Loss=3.205225, K=0.883, C=0.022, S=0.484, LR=9.21e-04 | |
| 2025-09-04 05:37:50,409 - INFO - Epoch 5, Step 440/549: Loss=3.217103, K=0.680, C=0.077, S=0.490, LR=9.20e-04 | |
| 2025-09-04 05:38:27,050 - INFO - Epoch 5, Step 450/549: Loss=2.992044, K=0.598, C=0.101, S=0.490, LR=9.19e-04 | |
| 2025-09-04 05:39:03,369 - INFO - Epoch 5, Step 460/549: Loss=2.994077, K=0.906, C=0.016, S=0.483, LR=9.18e-04 | |
| 2025-09-04 05:39:41,307 - INFO - Epoch 5, Step 470/549: Loss=2.851570, K=0.922, C=0.011, S=0.482, LR=9.18e-04 | |
| 2025-09-04 05:40:17,322 - INFO - Epoch 5, Step 480/549: Loss=2.905382, K=0.801, C=0.038, S=0.486, LR=9.17e-04 | |
| 2025-09-04 05:40:52,812 - INFO - Epoch 5, Step 490/549: Loss=3.050818, K=0.754, C=0.050, S=0.487, LR=9.16e-04 | |
| 2025-09-04 05:41:26,322 - INFO - Epoch 5, Step 500/549: Loss=3.007807, K=0.758, C=0.051, S=0.487, LR=9.15e-04 | |
| 2025-09-04 05:42:02,956 - INFO - Epoch 5, Step 510/549: Loss=3.043104, K=0.824, C=0.033, S=0.485, LR=9.14e-04 | |
| 2025-09-04 05:42:37,326 - INFO - Epoch 5, Step 520/549: Loss=2.780913, K=0.002, C=0.547, S=0.478, LR=9.13e-04 | |
| 2025-09-04 05:43:15,958 - INFO - Epoch 5, Step 530/549: Loss=2.687726, K=0.004, C=0.520, S=0.477, LR=9.12e-04 | |
| 2025-09-04 05:43:52,133 - INFO - Epoch 5, Step 540/549: Loss=2.735088, K=0.007, C=0.523, S=0.478, LR=9.11e-04 | |
| 2025-09-04 05:44:19,638 - INFO - Epoch 5 completed in 1882.8s: Avg Loss=2.692697, K=0.725, C=0.068, S=0.485 | |
| 2025-09-04 05:44:19,818 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 05:44:20,042 - INFO - === EPOCH 5 COMPLETE === | |
| 2025-09-04 05:44:20,042 - INFO - Loss: 2.692697 (best: 1.491004) | |
| 2025-09-04 05:44:20,042 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 05:44:20,042 - INFO - Starting epoch 6 | |
| 2025-09-04 05:44:23,629 - INFO - Epoch 6, Step 0/549: Loss=3.208239, K=0.016, C=0.520, S=0.479, LR=9.10e-04 | |
| 2025-09-04 05:45:00,626 - INFO - Epoch 6, Step 10/549: Loss=3.244996, K=0.074, C=0.408, S=0.481, LR=9.09e-04 | |
| 2025-09-04 05:45:40,187 - INFO - Epoch 6, Step 20/549: Loss=3.354598, K=0.038, C=0.461, S=0.479, LR=9.08e-04 | |
| 2025-09-04 05:46:16,012 - INFO - Epoch 6, Step 30/549: Loss=3.770369, K=0.008, C=0.531, S=0.478, LR=9.07e-04 | |
| 2025-09-04 05:46:51,265 - INFO - Epoch 6, Step 40/549: Loss=3.484531, K=0.014, C=0.508, S=0.479, LR=9.06e-04 | |
| 2025-09-04 05:47:25,725 - INFO - Epoch 6, Step 50/549: Loss=3.481783, K=0.010, C=0.512, S=0.478, LR=9.05e-04 | |
| 2025-09-04 05:47:59,469 - INFO - Epoch 6, Step 60/549: Loss=3.485155, K=0.019, C=0.508, S=0.479, LR=9.04e-04 | |
| 2025-09-04 05:48:33,092 - INFO - Epoch 6, Step 70/549: Loss=3.279640, K=0.020, C=0.473, S=0.478, LR=9.03e-04 | |
| 2025-09-04 05:49:07,112 - INFO - Epoch 6, Step 80/549: Loss=3.459909, K=0.034, C=0.477, S=0.479, LR=9.02e-04 | |
| 2025-09-04 05:49:41,217 - INFO - Epoch 6, Step 90/549: Loss=3.419902, K=0.009, C=0.527, S=0.478, LR=9.01e-04 | |
| 2025-09-04 05:50:15,218 - INFO - Epoch 6, Step 100/549: Loss=3.344873, K=0.029, C=0.492, S=0.479, LR=9.00e-04 | |
| 2025-09-04 05:50:51,402 - INFO - Epoch 6, Step 110/549: Loss=3.136264, K=0.020, C=0.482, S=0.478, LR=8.98e-04 | |
| 2025-09-04 05:51:25,327 - INFO - Epoch 6, Step 120/549: Loss=3.192173, K=0.025, C=0.479, S=0.478, LR=8.97e-04 | |
| 2025-09-04 05:51:59,396 - INFO - Epoch 6, Step 130/549: Loss=3.063369, K=0.050, C=0.434, S=0.479, LR=8.96e-04 | |
| 2025-09-04 05:52:37,750 - INFO - Epoch 6, Step 140/549: Loss=3.179179, K=0.046, C=0.455, S=0.480, LR=8.95e-04 | |
| 2025-09-04 05:53:12,488 - INFO - Epoch 6, Step 150/549: Loss=3.233383, K=0.012, C=0.523, S=0.478, LR=8.94e-04 | |
| 2025-09-04 05:53:45,939 - INFO - Epoch 6, Step 160/549: Loss=3.120380, K=0.029, C=0.480, S=0.479, LR=8.93e-04 | |
| 2025-09-04 05:54:17,912 - INFO - Epoch 6, Step 170/549: Loss=3.101419, K=0.033, C=0.461, S=0.478, LR=8.92e-04 | |
| 2025-09-04 05:54:53,690 - INFO - Epoch 6, Step 180/549: Loss=2.960550, K=0.025, C=0.480, S=0.479, LR=8.91e-04 | |
| 2025-09-04 05:55:28,913 - INFO - Epoch 6, Step 190/549: Loss=2.973728, K=0.034, C=0.471, S=0.479, LR=8.90e-04 | |
| 2025-09-04 05:56:06,840 - INFO - Epoch 6, Step 200/549: Loss=2.958010, K=0.027, C=0.465, S=0.478, LR=8.89e-04 | |
| 2025-09-04 05:56:41,209 - INFO - Epoch 6, Step 210/549: Loss=3.191554, K=0.014, C=0.527, S=0.478, LR=8.88e-04 | |
| 2025-09-04 05:57:15,882 - INFO - Epoch 6, Step 220/549: Loss=3.144758, K=0.011, C=0.527, S=0.478, LR=8.87e-04 | |
| 2025-09-04 05:57:47,997 - INFO - Epoch 6, Step 230/549: Loss=2.973946, K=0.020, C=0.482, S=0.478, LR=8.85e-04 | |
| 2025-09-04 05:58:19,059 - INFO - Epoch 6, Step 240/549: Loss=2.924570, K=0.018, C=0.496, S=0.479, LR=8.84e-04 | |
| 2025-09-04 05:58:49,434 - INFO - Epoch 6, Step 250/549: Loss=3.031327, K=0.022, C=0.516, S=0.479, LR=8.83e-04 | |
| 2025-09-04 05:59:21,763 - INFO - Epoch 6, Step 260/549: Loss=3.026892, K=0.011, C=0.527, S=0.478, LR=8.82e-04 | |
| 2025-09-04 05:59:53,763 - INFO - Epoch 6, Step 270/549: Loss=2.950773, K=0.046, C=0.461, S=0.480, LR=8.81e-04 | |
| 2025-09-04 06:00:26,251 - INFO - Epoch 6, Step 280/549: Loss=2.876118, K=0.008, C=0.512, S=0.478, LR=8.80e-04 | |
| 2025-09-04 06:00:58,537 - INFO - Epoch 6, Step 290/549: Loss=2.852496, K=0.053, C=0.438, S=0.480, LR=8.79e-04 | |
| 2025-09-04 06:01:30,897 - INFO - Epoch 6, Step 300/549: Loss=2.917918, K=0.031, C=0.477, S=0.478, LR=8.78e-04 | |
| 2025-09-04 06:02:02,539 - INFO - Epoch 6, Step 310/549: Loss=2.732249, K=0.032, C=0.461, S=0.479, LR=8.76e-04 | |
| 2025-09-04 06:02:34,021 - INFO - Epoch 6, Step 320/549: Loss=2.954464, K=0.021, C=0.504, S=0.478, LR=8.75e-04 | |
| 2025-09-04 06:03:04,914 - INFO - Epoch 6, Step 330/549: Loss=2.827548, K=0.015, C=0.500, S=0.478, LR=8.74e-04 | |
| 2025-09-04 06:03:37,105 - INFO - Epoch 6, Step 340/549: Loss=2.849366, K=0.018, C=0.492, S=0.478, LR=8.73e-04 | |
| 2025-09-04 06:04:08,705 - INFO - Epoch 6, Step 350/549: Loss=2.941721, K=0.011, C=0.523, S=0.478, LR=8.72e-04 | |
| 2025-09-04 06:04:41,206 - INFO - Epoch 6, Step 360/549: Loss=2.818981, K=0.068, C=0.418, S=0.480, LR=8.71e-04 | |
| 2025-09-04 06:05:13,742 - INFO - Epoch 6, Step 370/549: Loss=2.864442, K=0.014, C=0.520, S=0.479, LR=8.69e-04 | |
| 2025-09-04 06:05:45,288 - INFO - Epoch 6, Step 380/549: Loss=2.842515, K=0.010, C=0.520, S=0.478, LR=8.68e-04 | |
| 2025-09-04 06:06:17,857 - INFO - Epoch 6, Step 390/549: Loss=2.945434, K=0.020, C=0.523, S=0.478, LR=8.67e-04 | |
| 2025-09-04 06:06:49,441 - INFO - Epoch 6, Step 400/549: Loss=2.713139, K=0.050, C=0.434, S=0.479, LR=8.66e-04 | |
| 2025-09-04 06:07:19,411 - INFO - Epoch 6, Step 410/549: Loss=2.831551, K=0.020, C=0.508, S=0.478, LR=8.65e-04 | |
| 2025-09-04 06:07:51,811 - INFO - Epoch 6, Step 420/549: Loss=2.751174, K=0.054, C=0.453, S=0.480, LR=8.64e-04 | |
| 2025-09-04 06:08:23,446 - INFO - Epoch 6, Step 430/549: Loss=2.725388, K=0.013, C=0.488, S=0.477, LR=8.62e-04 | |
| 2025-09-04 06:08:55,712 - INFO - Epoch 6, Step 440/549: Loss=2.739136, K=0.082, C=0.396, S=0.481, LR=8.61e-04 | |
| 2025-09-04 06:09:29,872 - INFO - Epoch 6, Step 450/549: Loss=2.695577, K=0.016, C=0.496, S=0.478, LR=8.60e-04 | |
| 2025-09-04 06:10:04,924 - INFO - Epoch 6, Step 460/549: Loss=2.685907, K=0.017, C=0.508, S=0.479, LR=8.59e-04 | |
| 2025-09-04 06:10:38,410 - INFO - Epoch 6, Step 470/549: Loss=2.698966, K=0.076, C=0.410, S=0.481, LR=8.57e-04 | |
| 2025-09-04 06:11:11,593 - INFO - Epoch 6, Step 480/549: Loss=2.778441, K=0.013, C=0.523, S=0.478, LR=8.56e-04 | |
| 2025-09-04 06:11:44,503 - INFO - Epoch 6, Step 490/549: Loss=2.654497, K=0.034, C=0.459, S=0.478, LR=8.55e-04 | |
| 2025-09-04 06:12:20,513 - INFO - Epoch 6, Step 500/549: Loss=2.653401, K=0.017, C=0.496, S=0.478, LR=8.54e-04 | |
| 2025-09-04 06:12:55,305 - INFO - Epoch 6, Step 510/549: Loss=2.724613, K=0.111, C=0.375, S=0.483, LR=8.53e-04 | |
| 2025-09-04 06:13:27,720 - INFO - Epoch 6, Step 520/549: Loss=2.806928, K=0.014, C=0.539, S=0.479, LR=8.51e-04 | |
| 2025-09-04 06:14:01,602 - INFO - Epoch 6, Step 530/549: Loss=2.715862, K=0.062, C=0.430, S=0.480, LR=8.50e-04 | |
| 2025-09-04 06:14:35,687 - INFO - Epoch 6, Step 540/549: Loss=2.657403, K=0.018, C=0.504, S=0.478, LR=8.49e-04 | |
| 2025-09-04 06:15:02,604 - INFO - Epoch 6 completed in 1842.6s: Avg Loss=2.377974, K=0.015, C=0.478, S=0.476 | |
| 2025-09-04 06:15:02,803 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 06:15:02,983 - INFO - === EPOCH 6 COMPLETE === | |
| 2025-09-04 06:15:02,984 - INFO - Loss: 2.377974 (best: 1.491004) | |
| 2025-09-04 06:15:02,984 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 06:15:02,984 - INFO - Starting epoch 7 | |
| 2025-09-04 06:15:06,503 - INFO - Epoch 7, Step 0/549: Loss=1.377352, K=0.000, C=0.471, S=0.473, LR=8.48e-04 | |
| 2025-09-04 06:15:37,960 - INFO - Epoch 7, Step 10/549: Loss=1.834314, K=0.551, C=0.103, S=0.488, LR=8.46e-04 | |
| 2025-09-04 06:16:09,817 - INFO - Epoch 7, Step 20/549: Loss=1.852824, K=0.617, C=0.079, S=0.487, LR=8.45e-04 | |
| 2025-09-04 06:16:43,684 - INFO - Epoch 7, Step 30/549: Loss=2.098953, K=0.750, C=0.041, S=0.484, LR=8.44e-04 | |
| 2025-09-04 06:17:15,427 - INFO - Epoch 7, Step 40/549: Loss=1.991240, K=0.562, C=0.107, S=0.489, LR=8.43e-04 | |
| 2025-09-04 06:17:48,144 - INFO - Epoch 7, Step 50/549: Loss=2.128836, K=0.609, C=0.088, S=0.488, LR=8.41e-04 | |
| 2025-09-04 06:18:21,186 - INFO - Epoch 7, Step 60/549: Loss=2.036095, K=0.602, C=0.089, S=0.488, LR=8.40e-04 | |
| 2025-09-04 06:18:54,606 - INFO - Epoch 7, Step 70/549: Loss=2.035474, K=0.574, C=0.096, S=0.488, LR=8.39e-04 | |
| 2025-09-04 06:19:29,306 - INFO - Epoch 7, Step 80/549: Loss=2.051978, K=0.688, C=0.061, S=0.486, LR=8.38e-04 | |
| 2025-09-04 06:20:03,410 - INFO - Epoch 7, Step 90/549: Loss=2.140609, K=0.641, C=0.078, S=0.488, LR=8.36e-04 | |
| 2025-09-04 06:20:37,314 - INFO - Epoch 7, Step 100/549: Loss=2.052162, K=0.715, C=0.052, S=0.485, LR=8.35e-04 | |
| 2025-09-04 06:21:09,868 - INFO - Epoch 7, Step 110/549: Loss=2.057183, K=0.746, C=0.040, S=0.484, LR=8.34e-04 | |
| 2025-09-04 06:21:42,090 - INFO - Epoch 7, Step 120/549: Loss=1.978107, K=0.645, C=0.074, S=0.487, LR=8.32e-04 | |
| 2025-09-04 06:22:14,645 - INFO - Epoch 7, Step 130/549: Loss=2.104367, K=0.539, C=0.113, S=0.489, LR=8.31e-04 | |
| 2025-09-04 06:22:46,943 - INFO - Epoch 7, Step 140/549: Loss=2.138752, K=0.707, C=0.058, S=0.486, LR=8.30e-04 | |
| 2025-09-04 06:23:19,403 - INFO - Epoch 7, Step 150/549: Loss=2.197870, K=0.730, C=0.049, S=0.485, LR=8.28e-04 | |
| 2025-09-04 06:23:51,463 - INFO - Epoch 7, Step 160/549: Loss=2.223576, K=0.648, C=0.077, S=0.488, LR=8.27e-04 | |
| 2025-09-04 06:24:22,190 - INFO - Epoch 7, Step 170/549: Loss=2.535425, K=0.750, C=0.046, S=0.485, LR=8.26e-04 | |
| 2025-09-04 06:24:55,428 - INFO - Epoch 7, Step 180/549: Loss=2.333754, K=0.367, C=0.200, S=0.491, LR=8.24e-04 | |
| 2025-09-04 06:25:30,342 - INFO - Epoch 7, Step 190/549: Loss=2.396807, K=0.605, C=0.097, S=0.490, LR=8.23e-04 | |
| 2025-09-04 06:26:05,418 - INFO - Epoch 7, Step 200/549: Loss=2.367197, K=0.699, C=0.061, S=0.487, LR=8.22e-04 | |
| 2025-09-04 06:26:37,915 - INFO - Epoch 7, Step 210/549: Loss=2.428536, K=0.605, C=0.094, S=0.489, LR=8.20e-04 | |
| 2025-09-04 06:27:09,711 - INFO - Epoch 7, Step 220/549: Loss=2.398775, K=0.723, C=0.052, S=0.486, LR=8.19e-04 | |
| 2025-09-04 06:27:42,438 - INFO - Epoch 7, Step 230/549: Loss=2.494458, K=0.602, C=0.099, S=0.490, LR=8.18e-04 | |
| 2025-09-04 06:28:14,019 - INFO - Epoch 7, Step 240/549: Loss=2.361668, K=0.766, C=0.043, S=0.485, LR=8.16e-04 | |
| 2025-09-04 06:28:44,073 - INFO - Epoch 7, Step 250/549: Loss=2.370581, K=0.578, C=0.103, S=0.489, LR=8.15e-04 | |
| 2025-09-04 06:29:15,566 - INFO - Epoch 7, Step 260/549: Loss=2.464354, K=0.711, C=0.060, S=0.487, LR=8.14e-04 | |
| 2025-09-04 06:29:47,158 - INFO - Epoch 7, Step 270/549: Loss=2.462771, K=0.691, C=0.070, S=0.488, LR=8.12e-04 | |
| 2025-09-04 06:30:20,406 - INFO - Epoch 7, Step 280/549: Loss=2.411393, K=0.691, C=0.067, S=0.487, LR=8.11e-04 | |
| 2025-09-04 06:30:53,603 - INFO - Epoch 7, Step 290/549: Loss=2.362500, K=0.609, C=0.087, S=0.488, LR=8.10e-04 | |
| 2025-09-04 06:31:28,051 - INFO - Epoch 7, Step 300/549: Loss=2.448665, K=0.664, C=0.074, S=0.488, LR=8.08e-04 | |
| 2025-09-04 06:32:02,450 - INFO - Epoch 7, Step 310/549: Loss=2.413326, K=0.742, C=0.047, S=0.485, LR=8.07e-04 | |
| 2025-09-04 06:32:36,626 - INFO - Epoch 7, Step 320/549: Loss=2.342161, K=0.742, C=0.046, S=0.485, LR=8.05e-04 | |
| 2025-09-04 06:33:07,929 - INFO - Epoch 7, Step 330/549: Loss=2.512723, K=0.629, C=0.089, S=0.489, LR=8.04e-04 | |
| 2025-09-04 06:33:41,821 - INFO - Epoch 7, Step 340/549: Loss=2.426517, K=0.590, C=0.100, S=0.489, LR=8.03e-04 | |
| 2025-09-04 06:34:16,429 - INFO - Epoch 7, Step 350/549: Loss=2.332337, K=0.828, C=0.026, S=0.483, LR=8.01e-04 | |
| 2025-09-04 06:34:51,686 - INFO - Epoch 7, Step 360/549: Loss=2.381895, K=0.723, C=0.056, S=0.486, LR=8.00e-04 | |
| 2025-09-04 06:35:25,898 - INFO - Epoch 7, Step 370/549: Loss=2.396749, K=0.504, C=0.130, S=0.490, LR=7.99e-04 | |
| 2025-09-04 06:36:00,448 - INFO - Epoch 7, Step 380/549: Loss=2.286422, K=0.660, C=0.073, S=0.487, LR=7.97e-04 | |
| 2025-09-04 06:36:34,848 - INFO - Epoch 7, Step 390/549: Loss=2.355178, K=0.691, C=0.063, S=0.487, LR=7.96e-04 | |
| 2025-09-04 06:37:08,330 - INFO - Epoch 7, Step 400/549: Loss=2.297110, K=0.812, C=0.030, S=0.483, LR=7.94e-04 | |
| 2025-09-04 06:37:42,112 - INFO - Epoch 7, Step 410/549: Loss=2.222526, K=0.762, C=0.040, S=0.484, LR=7.93e-04 | |
| 2025-09-04 06:38:15,748 - INFO - Epoch 7, Step 420/549: Loss=2.382048, K=0.758, C=0.045, S=0.485, LR=7.91e-04 | |
| 2025-09-04 06:38:49,757 - INFO - Epoch 7, Step 430/549: Loss=2.133442, K=0.006, C=0.531, S=0.477, LR=7.90e-04 | |
| 2025-09-04 06:39:25,099 - INFO - Epoch 7, Step 440/549: Loss=2.152048, K=0.003, C=0.500, S=0.476, LR=7.89e-04 | |
| 2025-09-04 06:39:58,997 - INFO - Epoch 7, Step 450/549: Loss=2.014617, K=0.002, C=0.508, S=0.475, LR=7.87e-04 | |
| 2025-09-04 06:40:33,608 - INFO - Epoch 7, Step 460/549: Loss=1.968774, K=0.005, C=0.508, S=0.476, LR=7.86e-04 | |
| 2025-09-04 06:41:07,237 - INFO - Epoch 7, Step 470/549: Loss=1.800553, K=0.006, C=0.520, S=0.476, LR=7.84e-04 | |
| 2025-09-04 06:41:39,092 - INFO - Epoch 7, Step 480/549: Loss=1.821048, K=0.004, C=0.508, S=0.475, LR=7.83e-04 | |
| 2025-09-04 06:42:14,107 - INFO - Epoch 7, Step 490/549: Loss=1.821703, K=0.005, C=0.486, S=0.475, LR=7.81e-04 | |
| 2025-09-04 06:42:48,096 - INFO - Epoch 7, Step 500/549: Loss=1.692399, K=0.005, C=0.484, S=0.475, LR=7.80e-04 | |
| 2025-09-04 06:43:22,288 - INFO - Epoch 7, Step 510/549: Loss=1.686641, K=0.002, C=0.463, S=0.474, LR=7.79e-04 | |
| 2025-09-04 06:43:57,268 - INFO - Epoch 7, Step 520/549: Loss=1.507598, K=0.001, C=0.492, S=0.473, LR=7.77e-04 | |
| 2025-09-04 06:44:33,130 - INFO - Epoch 7, Step 530/549: Loss=1.555242, K=0.002, C=0.465, S=0.473, LR=7.76e-04 | |
| 2025-09-04 06:45:08,209 - INFO - Epoch 7, Step 540/549: Loss=1.420585, K=0.005, C=0.480, S=0.473, LR=7.74e-04 | |
| 2025-09-04 06:45:35,084 - INFO - Epoch 7 completed in 1832.1s: Avg Loss=2.125773, K=0.576, C=0.143, S=0.482 | |
| 2025-09-04 06:45:35,262 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 06:45:35,466 - INFO - === EPOCH 7 COMPLETE === | |
| 2025-09-04 06:45:35,466 - INFO - Loss: 2.125773 (best: 1.491004) | |
| 2025-09-04 06:45:35,466 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 06:45:35,466 - INFO - Starting epoch 8 | |
| 2025-09-04 06:45:38,898 - INFO - Epoch 8, Step 0/549: Loss=2.415756, K=0.021, C=0.480, S=0.477, LR=7.73e-04 | |
| 2025-09-04 06:46:14,005 - INFO - Epoch 8, Step 10/549: Loss=2.383639, K=0.029, C=0.479, S=0.478, LR=7.71e-04 | |
| 2025-09-04 06:46:50,170 - INFO - Epoch 8, Step 20/549: Loss=2.508118, K=0.025, C=0.516, S=0.478, LR=7.70e-04 | |
| 2025-09-04 06:47:26,236 - INFO - Epoch 8, Step 30/549: Loss=2.334394, K=0.029, C=0.475, S=0.478, LR=7.69e-04 | |
| 2025-09-04 06:48:02,417 - INFO - Epoch 8, Step 40/549: Loss=2.310066, K=0.025, C=0.484, S=0.478, LR=7.67e-04 | |
| 2025-09-04 06:48:38,762 - INFO - Epoch 8, Step 50/549: Loss=2.308972, K=0.034, C=0.457, S=0.477, LR=7.66e-04 | |
| 2025-09-04 06:49:14,761 - INFO - Epoch 8, Step 60/549: Loss=2.348292, K=0.050, C=0.432, S=0.478, LR=7.64e-04 | |
| 2025-09-04 06:49:49,993 - INFO - Epoch 8, Step 70/549: Loss=2.371967, K=0.019, C=0.482, S=0.476, LR=7.63e-04 | |
| 2025-09-04 06:50:25,503 - INFO - Epoch 8, Step 80/549: Loss=2.304859, K=0.013, C=0.496, S=0.477, LR=7.61e-04 | |
| 2025-09-04 06:51:02,729 - INFO - Epoch 8, Step 90/549: Loss=2.209302, K=0.031, C=0.445, S=0.477, LR=7.60e-04 | |
| 2025-09-04 06:51:37,787 - INFO - Epoch 8, Step 100/549: Loss=2.295153, K=0.035, C=0.471, S=0.478, LR=7.58e-04 | |
| 2025-09-04 06:52:11,314 - INFO - Epoch 8, Step 110/549: Loss=2.167519, K=0.021, C=0.469, S=0.477, LR=7.57e-04 | |
| 2025-09-04 06:52:44,050 - INFO - Epoch 8, Step 120/549: Loss=2.282100, K=0.100, C=0.391, S=0.482, LR=7.55e-04 | |
| 2025-09-04 06:53:19,143 - INFO - Epoch 8, Step 130/549: Loss=2.325567, K=0.023, C=0.488, S=0.477, LR=7.54e-04 | |
| 2025-09-04 06:53:51,491 - INFO - Epoch 8, Step 140/549: Loss=2.285536, K=0.009, C=0.508, S=0.477, LR=7.52e-04 | |
| 2025-09-04 06:54:23,821 - INFO - Epoch 8, Step 150/549: Loss=2.196999, K=0.029, C=0.467, S=0.477, LR=7.51e-04 | |
| 2025-09-04 06:54:56,585 - INFO - Epoch 8, Step 160/549: Loss=2.307340, K=0.017, C=0.520, S=0.477, LR=7.49e-04 | |
| 2025-09-04 06:55:28,691 - INFO - Epoch 8, Step 170/549: Loss=2.230043, K=0.016, C=0.508, S=0.477, LR=7.48e-04 | |
| 2025-09-04 06:56:01,525 - INFO - Epoch 8, Step 180/549: Loss=2.246774, K=0.020, C=0.508, S=0.477, LR=7.46e-04 | |
| 2025-09-04 06:56:34,007 - INFO - Epoch 8, Step 190/549: Loss=2.168950, K=0.057, C=0.418, S=0.478, LR=7.45e-04 | |
| 2025-09-04 06:57:06,697 - INFO - Epoch 8, Step 200/549: Loss=2.499682, K=0.957, C=0.002, S=0.480, LR=7.43e-04 | |
| 2025-09-04 06:57:39,195 - INFO - Epoch 8, Step 210/549: Loss=2.411283, K=0.922, C=0.009, S=0.481, LR=7.41e-04 | |
| 2025-09-04 06:58:11,125 - INFO - Epoch 8, Step 220/549: Loss=2.325681, K=0.918, C=0.007, S=0.480, LR=7.40e-04 | |
| 2025-09-04 06:58:42,217 - INFO - Epoch 8, Step 230/549: Loss=2.277281, K=0.918, C=0.005, S=0.480, LR=7.38e-04 | |
| 2025-09-04 06:59:16,483 - INFO - Epoch 8, Step 240/549: Loss=2.212095, K=0.898, C=0.010, S=0.480, LR=7.37e-04 | |
| 2025-09-04 06:59:50,639 - INFO - Epoch 8, Step 250/549: Loss=2.185850, K=0.867, C=0.017, S=0.481, LR=7.35e-04 | |
| 2025-09-04 07:00:23,405 - INFO - Epoch 8, Step 260/549: Loss=2.069123, K=0.883, C=0.013, S=0.481, LR=7.34e-04 | |
| 2025-09-04 07:00:56,994 - INFO - Epoch 8, Step 270/549: Loss=2.093115, K=0.848, C=0.019, S=0.481, LR=7.32e-04 | |
| 2025-09-04 07:01:29,569 - INFO - Epoch 8, Step 280/549: Loss=2.078769, K=0.875, C=0.011, S=0.480, LR=7.31e-04 | |
| 2025-09-04 07:02:02,523 - INFO - Epoch 8, Step 290/549: Loss=2.009828, K=0.875, C=0.010, S=0.480, LR=7.29e-04 | |
| 2025-09-04 07:02:34,707 - INFO - Epoch 8, Step 300/549: Loss=1.829325, K=0.711, C=0.055, S=0.485, LR=7.28e-04 | |
| 2025-09-04 07:03:05,617 - INFO - Epoch 8, Step 310/549: Loss=1.851745, K=0.820, C=0.017, S=0.480, LR=7.26e-04 | |
| 2025-09-04 07:03:38,749 - INFO - Epoch 8, Step 320/549: Loss=1.792879, K=0.816, C=0.016, S=0.480, LR=7.25e-04 | |
| 2025-09-04 07:04:10,817 - INFO - Epoch 8, Step 330/549: Loss=1.768430, K=0.820, C=0.003, S=0.477, LR=7.23e-04 | |
| 2025-09-04 07:04:44,165 - INFO - Epoch 8, Step 340/549: Loss=1.763450, K=0.750, C=0.032, S=0.482, LR=7.21e-04 | |
| 2025-09-04 07:05:17,330 - INFO - Epoch 8, Step 350/549: Loss=1.723092, K=0.805, C=0.006, S=0.477, LR=7.20e-04 | |
| 2025-09-04 07:05:49,857 - INFO - Epoch 8, Step 360/549: Loss=1.667485, K=0.715, C=0.037, S=0.482, LR=7.18e-04 | |
| 2025-09-04 07:06:22,631 - INFO - Epoch 8, Step 370/549: Loss=1.604710, K=0.773, C=0.006, S=0.477, LR=7.17e-04 | |
| 2025-09-04 07:06:54,608 - INFO - Epoch 8, Step 380/549: Loss=1.584185, K=0.727, C=0.026, S=0.480, LR=7.15e-04 | |
| 2025-09-04 07:07:25,744 - INFO - Epoch 8, Step 390/549: Loss=1.565523, K=0.719, C=0.029, S=0.480, LR=7.14e-04 | |
| 2025-09-04 07:07:58,424 - INFO - Epoch 8, Step 400/549: Loss=1.936129, K=0.044, C=0.449, S=0.477, LR=7.12e-04 | |
| 2025-09-04 07:08:31,099 - INFO - Epoch 8, Step 410/549: Loss=1.795048, K=0.040, C=0.443, S=0.477, LR=7.10e-04 | |
| 2025-09-04 07:09:03,919 - INFO - Epoch 8, Step 420/549: Loss=1.765663, K=0.073, C=0.406, S=0.479, LR=7.09e-04 | |
| 2025-09-04 07:09:38,092 - INFO - Epoch 8, Step 430/549: Loss=1.750247, K=0.058, C=0.416, S=0.478, LR=7.07e-04 | |
| 2025-09-04 07:10:12,470 - INFO - Epoch 8, Step 440/549: Loss=1.794834, K=0.050, C=0.426, S=0.477, LR=7.06e-04 | |
| 2025-09-04 07:10:47,096 - INFO - Epoch 8, Step 450/549: Loss=1.730206, K=0.053, C=0.414, S=0.477, LR=7.04e-04 | |
| 2025-09-04 07:11:22,728 - INFO - Epoch 8, Step 460/549: Loss=1.754562, K=0.066, C=0.400, S=0.478, LR=7.02e-04 | |
| 2025-09-04 07:11:55,921 - INFO - Epoch 8, Step 470/549: Loss=1.685577, K=0.030, C=0.447, S=0.476, LR=7.01e-04 | |
| 2025-09-04 07:12:33,399 - INFO - Epoch 8, Step 480/549: Loss=1.720628, K=0.149, C=0.336, S=0.483, LR=6.99e-04 | |
| 2025-09-04 07:13:10,142 - INFO - Epoch 8, Step 490/549: Loss=1.787863, K=0.492, C=0.132, S=0.489, LR=6.98e-04 | |
| 2025-09-04 07:13:45,572 - INFO - Epoch 8, Step 500/549: Loss=1.768128, K=0.785, C=0.022, S=0.480, LR=6.96e-04 | |
| 2025-09-04 07:14:18,245 - INFO - Epoch 8, Step 510/549: Loss=1.703664, K=0.777, C=0.019, S=0.480, LR=6.94e-04 | |
| 2025-09-04 07:14:51,823 - INFO - Epoch 8, Step 520/549: Loss=1.647775, K=0.707, C=0.035, S=0.481, LR=6.93e-04 | |
| 2025-09-04 07:15:24,936 - INFO - Epoch 8, Step 530/549: Loss=1.611894, K=0.711, C=0.031, S=0.480, LR=6.91e-04 | |
| 2025-09-04 07:15:56,165 - INFO - Epoch 8, Step 540/549: Loss=1.670787, K=0.758, C=0.004, S=0.476, LR=6.90e-04 | |
| 2025-09-04 07:16:21,902 - INFO - Epoch 8 completed in 1846.4s: Avg Loss=1.790950, K=0.324, C=0.269, S=0.479 | |
| 2025-09-04 07:16:22,081 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 07:16:22,282 - INFO - === EPOCH 8 COMPLETE === | |
| 2025-09-04 07:16:22,283 - INFO - Loss: 1.790950 (best: 1.491004) | |
| 2025-09-04 07:16:22,283 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 07:16:22,283 - INFO - Starting epoch 9 | |
| 2025-09-04 07:16:25,526 - INFO - Epoch 9, Step 0/549: Loss=1.602849, K=0.490, C=0.108, S=0.485, LR=6.88e-04 | |
| 2025-09-04 07:16:59,226 - INFO - Epoch 9, Step 10/549: Loss=1.928208, K=0.007, C=0.512, S=0.476, LR=6.86e-04 | |
| 2025-09-04 07:17:31,830 - INFO - Epoch 9, Step 20/549: Loss=1.813220, K=0.011, C=0.500, S=0.476, LR=6.85e-04 | |
| 2025-09-04 07:18:05,268 - INFO - Epoch 9, Step 30/549: Loss=1.762461, K=0.004, C=0.480, S=0.474, LR=6.83e-04 | |
| 2025-09-04 07:18:38,388 - INFO - Epoch 9, Step 40/549: Loss=1.654752, K=0.010, C=0.477, S=0.474, LR=6.82e-04 | |
| 2025-09-04 07:19:11,950 - INFO - Epoch 9, Step 50/549: Loss=1.637223, K=0.014, C=0.467, S=0.474, LR=6.80e-04 | |
| 2025-09-04 07:19:44,352 - INFO - Epoch 9, Step 60/549: Loss=1.638509, K=0.006, C=0.482, S=0.474, LR=6.78e-04 | |
| 2025-09-04 07:20:15,924 - INFO - Epoch 9, Step 70/549: Loss=1.549169, K=0.014, C=0.465, S=0.474, LR=6.77e-04 | |
| 2025-09-04 07:20:47,938 - INFO - Epoch 9, Step 80/549: Loss=1.601761, K=0.007, C=0.473, S=0.474, LR=6.75e-04 | |
| 2025-09-04 07:21:19,487 - INFO - Epoch 9, Step 90/549: Loss=1.485152, K=0.016, C=0.439, S=0.473, LR=6.73e-04 | |
| 2025-09-04 07:21:50,814 - INFO - Epoch 9, Step 100/549: Loss=1.446629, K=0.018, C=0.469, S=0.474, LR=6.72e-04 | |
| 2025-09-04 07:22:22,590 - INFO - Epoch 9, Step 110/549: Loss=1.764403, K=0.547, C=0.097, S=0.486, LR=6.70e-04 | |
| 2025-09-04 07:22:53,518 - INFO - Epoch 9, Step 120/549: Loss=1.643007, K=0.443, C=0.141, S=0.487, LR=6.69e-04 | |
| 2025-09-04 07:23:24,155 - INFO - Epoch 9, Step 130/549: Loss=1.606718, K=0.504, C=0.100, S=0.485, LR=6.67e-04 | |
| 2025-09-04 07:23:55,078 - INFO - Epoch 9, Step 140/549: Loss=1.630130, K=0.578, C=0.066, S=0.482, LR=6.65e-04 | |
| 2025-09-04 07:24:24,542 - INFO - Epoch 9, Step 150/549: Loss=1.603371, K=0.590, C=0.066, S=0.483, LR=6.64e-04 | |
| 2025-09-04 07:24:58,092 - INFO - Epoch 9, Step 160/549: Loss=1.596443, K=0.488, C=0.107, S=0.485, LR=6.62e-04 | |
| 2025-09-04 07:25:32,804 - INFO - Epoch 9, Step 170/549: Loss=1.650675, K=0.477, C=0.121, S=0.486, LR=6.60e-04 | |
| 2025-09-04 07:26:07,096 - INFO - Epoch 9, Step 180/549: Loss=1.758250, K=0.418, C=0.159, S=0.488, LR=6.59e-04 | |
| 2025-09-04 07:26:39,672 - INFO - Epoch 9, Step 190/549: Loss=1.720876, K=0.523, C=0.105, S=0.486, LR=6.57e-04 | |
| 2025-09-04 07:27:11,264 - INFO - Epoch 9, Step 200/549: Loss=1.752523, K=0.512, C=0.107, S=0.486, LR=6.55e-04 | |
| 2025-09-04 07:27:43,852 - INFO - Epoch 9, Step 210/549: Loss=1.684940, K=0.201, C=0.279, S=0.484, LR=6.54e-04 | |
| 2025-09-04 07:28:15,764 - INFO - Epoch 9, Step 220/549: Loss=1.997398, K=0.002, C=0.492, S=0.475, LR=6.52e-04 | |
| 2025-09-04 07:28:46,908 - INFO - Epoch 9, Step 230/549: Loss=1.883976, K=0.007, C=0.496, S=0.475, LR=6.50e-04 | |
| 2025-09-04 07:29:21,124 - INFO - Epoch 9, Step 240/549: Loss=1.856509, K=0.012, C=0.492, S=0.476, LR=6.49e-04 | |
| 2025-09-04 07:29:55,730 - INFO - Epoch 9, Step 250/549: Loss=1.794364, K=0.012, C=0.490, S=0.475, LR=6.47e-04 | |
| 2025-09-04 07:30:28,912 - INFO - Epoch 9, Step 260/549: Loss=1.797725, K=0.012, C=0.479, S=0.475, LR=6.45e-04 | |
| 2025-09-04 07:31:03,209 - INFO - Epoch 9, Step 270/549: Loss=1.748876, K=0.014, C=0.457, S=0.474, LR=6.44e-04 | |
| 2025-09-04 07:31:36,537 - INFO - Epoch 9, Step 280/549: Loss=1.687223, K=0.004, C=0.477, S=0.474, LR=6.42e-04 | |
| 2025-09-04 07:32:09,576 - INFO - Epoch 9, Step 290/549: Loss=1.592814, K=0.139, C=0.309, S=0.481, LR=6.40e-04 | |
| 2025-09-04 07:32:42,215 - INFO - Epoch 9, Step 300/549: Loss=1.637882, K=0.512, C=0.106, S=0.486, LR=6.39e-04 | |
| 2025-09-04 07:33:12,424 - INFO - Epoch 9, Step 310/549: Loss=1.572603, K=0.562, C=0.073, S=0.483, LR=6.37e-04 | |
| 2025-09-04 07:33:44,334 - INFO - Epoch 9, Step 320/549: Loss=1.565689, K=0.410, C=0.144, S=0.486, LR=6.35e-04 | |
| 2025-09-04 07:34:17,061 - INFO - Epoch 9, Step 330/549: Loss=1.538683, K=0.441, C=0.130, S=0.485, LR=6.34e-04 | |
| 2025-09-04 07:34:49,546 - INFO - Epoch 9, Step 340/549: Loss=1.631928, K=0.570, C=0.068, S=0.482, LR=6.32e-04 | |
| 2025-09-04 07:35:23,470 - INFO - Epoch 9, Step 350/549: Loss=1.599241, K=0.355, C=0.175, S=0.486, LR=6.30e-04 | |
| 2025-09-04 07:35:56,448 - INFO - Epoch 9, Step 360/549: Loss=1.563608, K=0.430, C=0.145, S=0.487, LR=6.29e-04 | |
| 2025-09-04 07:36:29,945 - INFO - Epoch 9, Step 370/549: Loss=1.631600, K=0.432, C=0.136, S=0.486, LR=6.27e-04 | |
| 2025-09-04 07:37:02,279 - INFO - Epoch 9, Step 380/549: Loss=1.624941, K=0.441, C=0.137, S=0.486, LR=6.25e-04 | |
| 2025-09-04 07:37:33,662 - INFO - Epoch 9, Step 390/549: Loss=1.688329, K=0.059, C=0.416, S=0.477, LR=6.24e-04 | |
| 2025-09-04 07:38:06,664 - INFO - Epoch 9, Step 400/549: Loss=1.758411, K=0.005, C=0.477, S=0.474, LR=6.22e-04 | |
| 2025-09-04 07:38:38,337 - INFO - Epoch 9, Step 410/549: Loss=1.655838, K=0.011, C=0.480, S=0.475, LR=6.20e-04 | |
| 2025-09-04 07:39:10,521 - INFO - Epoch 9, Step 420/549: Loss=1.512481, K=0.010, C=0.482, S=0.474, LR=6.18e-04 | |
| 2025-09-04 07:39:43,637 - INFO - Epoch 9, Step 430/549: Loss=1.495702, K=0.019, C=0.457, S=0.474, LR=6.17e-04 | |
| 2025-09-04 07:40:16,019 - INFO - Epoch 9, Step 440/549: Loss=1.508449, K=0.008, C=0.471, S=0.473, LR=6.15e-04 | |
| 2025-09-04 07:40:49,331 - INFO - Epoch 9, Step 450/549: Loss=1.450635, K=0.023, C=0.441, S=0.474, LR=6.13e-04 | |
| 2025-09-04 07:41:21,593 - INFO - Epoch 9, Step 460/549: Loss=1.318755, K=0.005, C=0.484, S=0.472, LR=6.12e-04 | |
| 2025-09-04 07:41:51,074 - INFO - Epoch 9, Step 470/549: Loss=1.395628, K=0.005, C=0.449, S=0.472, LR=6.10e-04 | |
| 2025-09-04 07:42:23,015 - INFO - Epoch 9, Step 480/549: Loss=1.343502, K=0.005, C=0.455, S=0.472, LR=6.08e-04 | |
| 2025-09-04 07:42:55,349 - INFO - Epoch 9, Step 490/549: Loss=1.316134, K=0.001, C=0.449, S=0.471, LR=6.07e-04 | |
| 2025-09-04 07:43:27,432 - INFO - Epoch 9, Step 500/549: Loss=1.313864, K=0.018, C=0.428, S=0.472, LR=6.05e-04 | |
| 2025-09-04 07:43:58,388 - INFO - Epoch 9, Step 510/549: Loss=1.280239, K=0.004, C=0.447, S=0.471, LR=6.03e-04 | |
| 2025-09-04 07:44:29,168 - INFO - Epoch 9, Step 520/549: Loss=1.304508, K=0.001, C=0.432, S=0.471, LR=6.01e-04 | |
| 2025-09-04 07:45:00,546 - INFO - Epoch 9, Step 530/549: Loss=1.610348, K=0.633, C=0.017, S=0.475, LR=6.00e-04 | |
| 2025-09-04 07:45:31,127 - INFO - Epoch 9, Step 540/549: Loss=1.416186, K=0.422, C=0.120, S=0.483, LR=5.98e-04 | |
| 2025-09-04 07:45:54,091 - INFO - Epoch 9 completed in 1771.8s: Avg Loss=1.553944, K=0.262, C=0.281, S=0.478 | |
| 2025-09-04 07:45:54,274 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 07:45:54,459 - INFO - === EPOCH 9 COMPLETE === | |
| 2025-09-04 07:45:54,460 - INFO - Loss: 1.553944 (best: 1.491004) | |
| 2025-09-04 07:45:54,460 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 07:45:54,460 - INFO - Starting epoch 10 | |
| 2025-09-04 07:45:57,693 - INFO - Epoch 10, Step 0/549: Loss=1.543520, K=0.695, C=0.029, S=0.479, LR=5.96e-04 | |
| 2025-09-04 07:46:28,072 - INFO - Epoch 10, Step 10/549: Loss=1.413275, K=0.613, C=0.050, S=0.480, LR=5.95e-04 | |
| 2025-09-04 07:46:58,991 - INFO - Epoch 10, Step 20/549: Loss=1.429822, K=0.613, C=0.046, S=0.480, LR=5.93e-04 | |
| 2025-09-04 07:47:29,402 - INFO - Epoch 10, Step 30/549: Loss=1.395326, K=0.621, C=0.041, S=0.479, LR=5.91e-04 | |
| 2025-09-04 07:47:59,957 - INFO - Epoch 10, Step 40/549: Loss=1.411056, K=0.637, C=0.020, S=0.476, LR=5.90e-04 | |
| 2025-09-04 07:48:31,696 - INFO - Epoch 10, Step 50/549: Loss=1.347085, K=0.594, C=0.050, S=0.480, LR=5.88e-04 | |
| 2025-09-04 07:49:02,682 - INFO - Epoch 10, Step 60/549: Loss=1.457874, K=0.059, C=0.391, S=0.476, LR=5.86e-04 | |
| 2025-09-04 07:49:32,968 - INFO - Epoch 10, Step 70/549: Loss=1.434453, K=0.017, C=0.424, S=0.472, LR=5.84e-04 | |
| 2025-09-04 07:50:02,313 - INFO - Epoch 10, Step 80/549: Loss=1.486772, K=0.062, C=0.393, S=0.476, LR=5.83e-04 | |
| 2025-09-04 07:50:31,153 - INFO - Epoch 10, Step 90/549: Loss=1.429005, K=0.073, C=0.365, S=0.476, LR=5.81e-04 | |
| 2025-09-04 07:51:02,030 - INFO - Epoch 10, Step 100/549: Loss=1.452518, K=0.080, C=0.365, S=0.477, LR=5.79e-04 | |
| 2025-09-04 07:51:33,051 - INFO - Epoch 10, Step 110/549: Loss=1.414706, K=0.067, C=0.367, S=0.476, LR=5.78e-04 | |
| 2025-09-04 07:52:04,140 - INFO - Epoch 10, Step 120/549: Loss=1.460027, K=0.035, C=0.412, S=0.474, LR=5.76e-04 | |
| 2025-09-04 07:52:35,605 - INFO - Epoch 10, Step 130/549: Loss=1.393601, K=0.025, C=0.410, S=0.473, LR=5.74e-04 | |
| 2025-09-04 07:53:06,741 - INFO - Epoch 10, Step 140/549: Loss=1.579635, K=0.781, C=0.006, S=0.476, LR=5.72e-04 | |
| 2025-09-04 07:53:38,587 - INFO - Epoch 10, Step 150/549: Loss=1.521360, K=0.727, C=0.008, S=0.476, LR=5.71e-04 | |
| 2025-09-04 07:54:08,334 - INFO - Epoch 10, Step 160/549: Loss=1.475150, K=0.695, C=0.006, S=0.475, LR=5.69e-04 | |
| 2025-09-04 07:54:37,487 - INFO - Epoch 10, Step 170/549: Loss=1.426414, K=0.578, C=0.064, S=0.482, LR=5.67e-04 | |
| 2025-09-04 07:55:08,164 - INFO - Epoch 10, Step 180/549: Loss=1.418341, K=0.633, C=0.040, S=0.479, LR=5.65e-04 | |
| 2025-09-04 07:55:38,396 - INFO - Epoch 10, Step 190/549: Loss=1.397569, K=0.660, C=0.007, S=0.474, LR=5.64e-04 | |
| 2025-09-04 07:56:09,028 - INFO - Epoch 10, Step 200/549: Loss=1.385499, K=0.633, C=0.025, S=0.477, LR=5.62e-04 | |
| 2025-09-04 07:56:40,205 - INFO - Epoch 10, Step 210/549: Loss=1.331737, K=0.449, C=0.116, S=0.484, LR=5.60e-04 | |
| 2025-09-04 07:57:10,813 - INFO - Epoch 10, Step 220/549: Loss=1.438372, K=0.050, C=0.391, S=0.475, LR=5.59e-04 | |
| 2025-09-04 07:57:41,471 - INFO - Epoch 10, Step 230/549: Loss=1.420099, K=0.034, C=0.410, S=0.474, LR=5.57e-04 | |
| 2025-09-04 07:58:14,180 - INFO - Epoch 10, Step 240/549: Loss=1.367751, K=0.061, C=0.383, S=0.475, LR=5.55e-04 | |
| 2025-09-04 07:58:44,720 - INFO - Epoch 10, Step 250/549: Loss=1.337871, K=0.117, C=0.314, S=0.478, LR=5.53e-04 | |
| 2025-09-04 07:59:15,201 - INFO - Epoch 10, Step 260/549: Loss=1.377901, K=0.085, C=0.346, S=0.476, LR=5.52e-04 | |
| 2025-09-04 07:59:45,786 - INFO - Epoch 10, Step 270/549: Loss=1.340977, K=0.186, C=0.262, S=0.481, LR=5.50e-04 | |
| 2025-09-04 08:00:16,125 - INFO - Epoch 10, Step 280/549: Loss=1.437757, K=0.664, C=0.015, S=0.476, LR=5.48e-04 | |
| 2025-09-04 08:00:47,332 - INFO - Epoch 10, Step 290/549: Loss=1.361484, K=0.570, C=0.051, S=0.479, LR=5.46e-04 | |
| 2025-09-04 08:01:18,427 - INFO - Epoch 10, Step 300/549: Loss=1.342876, K=0.594, C=0.031, S=0.477, LR=5.45e-04 | |
| 2025-09-04 08:01:49,218 - INFO - Epoch 10, Step 310/549: Loss=1.315475, K=0.609, C=0.023, S=0.476, LR=5.43e-04 | |
| 2025-09-04 08:02:19,552 - INFO - Epoch 10, Step 320/549: Loss=1.295545, K=0.328, C=0.170, S=0.483, LR=5.41e-04 | |
| 2025-09-04 08:02:48,626 - INFO - Epoch 10, Step 330/549: Loss=1.382769, K=0.055, C=0.395, S=0.475, LR=5.39e-04 | |
| 2025-09-04 08:03:18,026 - INFO - Epoch 10, Step 340/549: Loss=1.301119, K=0.072, C=0.344, S=0.475, LR=5.38e-04 | |
| 2025-09-04 08:03:49,200 - INFO - Epoch 10, Step 350/549: Loss=1.347241, K=0.034, C=0.400, S=0.473, LR=5.36e-04 | |
| 2025-09-04 08:04:20,569 - INFO - Epoch 10, Step 360/549: Loss=1.297386, K=0.128, C=0.307, S=0.479, LR=5.34e-04 | |
| 2025-09-04 08:04:51,156 - INFO - Epoch 10, Step 370/549: Loss=1.409346, K=0.645, C=0.022, S=0.476, LR=5.32e-04 | |
| 2025-09-04 08:05:22,359 - INFO - Epoch 10, Step 380/549: Loss=1.340227, K=0.574, C=0.046, S=0.478, LR=5.31e-04 | |
| 2025-09-04 08:05:54,166 - INFO - Epoch 10, Step 390/549: Loss=1.322983, K=0.492, C=0.081, S=0.481, LR=5.29e-04 | |
| 2025-09-04 08:06:26,374 - INFO - Epoch 10, Step 400/549: Loss=1.306375, K=0.562, C=0.035, S=0.476, LR=5.27e-04 | |
| 2025-09-04 08:06:58,782 - INFO - Epoch 10, Step 410/549: Loss=1.258540, K=0.516, C=0.064, S=0.479, LR=5.25e-04 | |
| 2025-09-04 08:07:28,635 - INFO - Epoch 10, Step 420/549: Loss=1.358310, K=0.085, C=0.346, S=0.476, LR=5.24e-04 | |
| 2025-09-04 08:08:00,463 - INFO - Epoch 10, Step 430/549: Loss=1.303838, K=0.082, C=0.336, S=0.476, LR=5.22e-04 | |
| 2025-09-04 08:08:33,016 - INFO - Epoch 10, Step 440/549: Loss=1.306520, K=0.083, C=0.332, S=0.475, LR=5.20e-04 | |
| 2025-09-04 08:09:06,252 - INFO - Epoch 10, Step 450/549: Loss=1.343450, K=0.021, C=0.414, S=0.472, LR=5.18e-04 | |
| 2025-09-04 08:09:42,064 - INFO - Epoch 10, Step 460/549: Loss=1.284159, K=0.062, C=0.350, S=0.474, LR=5.17e-04 | |
| 2025-09-04 08:10:17,178 - INFO - Epoch 10, Step 470/549: Loss=1.339769, K=0.043, C=0.389, S=0.473, LR=5.15e-04 | |
| 2025-09-04 08:10:49,073 - INFO - Epoch 10, Step 480/549: Loss=1.348410, K=0.172, C=0.283, S=0.481, LR=5.13e-04 | |
| 2025-09-04 08:11:19,530 - INFO - Epoch 10, Step 490/549: Loss=1.402361, K=0.664, C=0.012, S=0.475, LR=5.12e-04 | |
| 2025-09-04 08:11:49,381 - INFO - Epoch 10, Step 500/549: Loss=1.374814, K=0.578, C=0.044, S=0.478, LR=5.10e-04 | |
| 2025-09-04 08:12:21,425 - INFO - Epoch 10, Step 510/549: Loss=1.329129, K=0.449, C=0.108, S=0.483, LR=5.08e-04 | |
| 2025-09-04 08:12:54,229 - INFO - Epoch 10, Step 520/549: Loss=1.331139, K=0.157, C=0.289, S=0.480, LR=5.06e-04 | |
| 2025-09-04 08:13:26,838 - INFO - Epoch 10, Step 530/549: Loss=1.365656, K=0.043, C=0.393, S=0.473, LR=5.05e-04 | |
| 2025-09-04 08:13:59,068 - INFO - Epoch 10, Step 540/549: Loss=1.356993, K=0.068, C=0.361, S=0.475, LR=5.03e-04 | |
| 2025-09-04 08:14:23,322 - INFO - Epoch 10 completed in 1708.9s: Avg Loss=1.347093, K=0.252, C=0.248, S=0.476 | |
| 2025-09-04 08:14:23,509 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 08:14:23,859 - INFO - NEW BEST MODEL! Loss: 1.347093 -> /data/BitTransformerLM/checkpoints/checkpoint_best.pt | |
| 2025-09-04 08:14:23,863 - INFO - === EPOCH 10 COMPLETE === | |
| 2025-09-04 08:14:23,863 - INFO - Loss: 1.347093 (best: 1.347093) | |
| 2025-09-04 08:14:23,863 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 08:14:23,863 - INFO - Starting epoch 11 | |
| 2025-09-04 08:14:27,000 - INFO - Epoch 11, Step 0/549: Loss=1.128921, K=0.004, C=0.428, S=0.469, LR=5.01e-04 | |
| 2025-09-04 08:14:57,815 - INFO - Epoch 11, Step 10/549: Loss=1.163665, K=0.003, C=0.402, S=0.469, LR=4.99e-04 | |
| 2025-09-04 08:15:28,762 - INFO - Epoch 11, Step 20/549: Loss=1.153064, K=0.011, C=0.377, S=0.469, LR=4.98e-04 | |
| 2025-09-04 08:15:57,941 - INFO - Epoch 11, Step 30/549: Loss=1.294581, K=0.420, C=0.062, S=0.475, LR=4.96e-04 | |
| 2025-09-04 08:16:29,966 - INFO - Epoch 11, Step 40/549: Loss=1.167952, K=0.305, C=0.113, S=0.476, LR=4.94e-04 | |
| 2025-09-04 08:17:00,959 - INFO - Epoch 11, Step 50/549: Loss=1.184807, K=0.266, C=0.146, S=0.477, LR=4.92e-04 | |
| 2025-09-04 08:17:32,050 - INFO - Epoch 11, Step 60/549: Loss=1.187424, K=0.316, C=0.107, S=0.476, LR=4.91e-04 | |
| 2025-09-04 08:18:03,357 - INFO - Epoch 11, Step 70/549: Loss=1.196470, K=0.270, C=0.139, S=0.477, LR=4.89e-04 | |
| 2025-09-04 08:18:34,140 - INFO - Epoch 11, Step 80/549: Loss=1.186155, K=0.102, C=0.281, S=0.474, LR=4.87e-04 | |
| 2025-09-04 08:19:05,610 - INFO - Epoch 11, Step 90/549: Loss=1.183390, K=0.004, C=0.434, S=0.470, LR=4.86e-04 | |
| 2025-09-04 08:19:36,831 - INFO - Epoch 11, Step 100/549: Loss=1.209812, K=0.003, C=0.406, S=0.469, LR=4.84e-04 | |
| 2025-09-04 08:20:05,540 - INFO - Epoch 11, Step 110/549: Loss=1.184296, K=0.005, C=0.387, S=0.469, LR=4.82e-04 | |
| 2025-09-04 08:20:36,332 - INFO - Epoch 11, Step 120/549: Loss=1.165297, K=0.202, C=0.197, S=0.477, LR=4.80e-04 | |
| 2025-09-04 08:21:09,422 - INFO - Epoch 11, Step 130/549: Loss=1.142168, K=0.340, C=0.067, S=0.473, LR=4.79e-04 | |
| 2025-09-04 08:21:42,172 - INFO - Epoch 11, Step 140/549: Loss=1.176476, K=0.229, C=0.174, S=0.477, LR=4.77e-04 | |
| 2025-09-04 08:22:12,990 - INFO - Epoch 11, Step 150/549: Loss=1.194135, K=0.297, C=0.117, S=0.476, LR=4.75e-04 | |
| 2025-09-04 08:22:43,590 - INFO - Epoch 11, Step 160/549: Loss=1.119312, K=0.135, C=0.256, S=0.476, LR=4.73e-04 | |
| 2025-09-04 08:23:13,998 - INFO - Epoch 11, Step 170/549: Loss=1.249950, K=0.002, C=0.406, S=0.469, LR=4.72e-04 | |
| 2025-09-04 08:23:47,566 - INFO - Epoch 11, Step 180/549: Loss=1.185361, K=0.005, C=0.404, S=0.469, LR=4.70e-04 | |
| 2025-09-04 08:24:18,069 - INFO - Epoch 11, Step 190/549: Loss=1.178665, K=0.009, C=0.398, S=0.470, LR=4.68e-04 | |
| 2025-09-04 08:24:49,051 - INFO - Epoch 11, Step 200/549: Loss=1.151106, K=0.081, C=0.295, S=0.473, LR=4.66e-04 | |
| 2025-09-04 08:25:19,856 - INFO - Epoch 11, Step 210/549: Loss=1.200058, K=0.309, C=0.115, S=0.476, LR=4.65e-04 | |
| 2025-09-04 08:25:51,701 - INFO - Epoch 11, Step 220/549: Loss=1.191276, K=0.211, C=0.201, S=0.478, LR=4.63e-04 | |
| 2025-09-04 08:26:23,941 - INFO - Epoch 11, Step 230/549: Loss=1.167594, K=0.210, C=0.191, S=0.477, LR=4.61e-04 | |
| 2025-09-04 08:26:55,804 - INFO - Epoch 11, Step 240/549: Loss=1.177340, K=0.247, C=0.155, S=0.477, LR=4.59e-04 | |
| 2025-09-04 08:27:27,543 - INFO - Epoch 11, Step 250/549: Loss=1.251748, K=0.005, C=0.430, S=0.470, LR=4.58e-04 | |
| 2025-09-04 08:27:58,969 - INFO - Epoch 11, Step 260/549: Loss=1.193401, K=0.007, C=0.410, S=0.470, LR=4.56e-04 | |
| 2025-09-04 08:28:28,121 - INFO - Epoch 11, Step 270/549: Loss=1.187478, K=0.023, C=0.369, S=0.470, LR=4.54e-04 | |
| 2025-09-04 08:28:58,401 - INFO - Epoch 11, Step 280/549: Loss=1.136803, K=0.017, C=0.396, S=0.470, LR=4.52e-04 | |
| 2025-09-04 08:29:28,804 - INFO - Epoch 11, Step 290/549: Loss=1.137807, K=0.055, C=0.336, S=0.472, LR=4.51e-04 | |
| 2025-09-04 08:29:59,023 - INFO - Epoch 11, Step 300/549: Loss=1.167481, K=0.207, C=0.189, S=0.477, LR=4.49e-04 | |
| 2025-09-04 08:30:29,550 - INFO - Epoch 11, Step 310/549: Loss=1.184783, K=0.277, C=0.133, S=0.476, LR=4.47e-04 | |
| 2025-09-04 08:30:59,924 - INFO - Epoch 11, Step 320/549: Loss=1.153422, K=0.246, C=0.158, S=0.477, LR=4.45e-04 | |
| 2025-09-04 08:31:30,726 - INFO - Epoch 11, Step 330/549: Loss=1.192094, K=0.145, C=0.236, S=0.475, LR=4.44e-04 | |
| 2025-09-04 08:32:01,883 - INFO - Epoch 11, Step 340/549: Loss=1.178942, K=0.007, C=0.408, S=0.469, LR=4.42e-04 | |
| 2025-09-04 08:32:31,197 - INFO - Epoch 11, Step 350/549: Loss=1.157690, K=0.008, C=0.398, S=0.469, LR=4.40e-04 | |
| 2025-09-04 08:33:00,044 - INFO - Epoch 11, Step 360/549: Loss=1.114241, K=0.021, C=0.387, S=0.470, LR=4.39e-04 | |
| 2025-09-04 08:33:30,626 - INFO - Epoch 11, Step 370/549: Loss=1.081933, K=0.040, C=0.338, S=0.470, LR=4.37e-04 | |
| 2025-09-04 08:34:00,196 - INFO - Epoch 11, Step 380/549: Loss=1.097263, K=0.025, C=0.352, S=0.469, LR=4.35e-04 | |
| 2025-09-04 08:34:31,157 - INFO - Epoch 11, Step 390/549: Loss=1.134251, K=0.260, C=0.119, S=0.474, LR=4.33e-04 | |
| 2025-09-04 08:35:02,293 - INFO - Epoch 11, Step 400/549: Loss=1.118939, K=0.229, C=0.144, S=0.475, LR=4.32e-04 | |
| 2025-09-04 08:35:33,736 - INFO - Epoch 11, Step 410/549: Loss=1.128599, K=0.260, C=0.113, S=0.474, LR=4.30e-04 | |
| 2025-09-04 08:36:05,415 - INFO - Epoch 11, Step 420/549: Loss=1.066705, K=0.139, C=0.222, S=0.474, LR=4.28e-04 | |
| 2025-09-04 08:36:40,096 - INFO - Epoch 11, Step 430/549: Loss=1.102898, K=0.006, C=0.402, S=0.469, LR=4.26e-04 | |
| 2025-09-04 08:37:12,736 - INFO - Epoch 11, Step 440/549: Loss=1.110210, K=0.007, C=0.387, S=0.469, LR=4.25e-04 | |
| 2025-09-04 08:37:43,798 - INFO - Epoch 11, Step 450/549: Loss=1.103575, K=0.003, C=0.383, S=0.468, LR=4.23e-04 | |
| 2025-09-04 08:38:14,163 - INFO - Epoch 11, Step 460/549: Loss=1.129327, K=0.008, C=0.359, S=0.468, LR=4.21e-04 | |
| 2025-09-04 08:38:45,649 - INFO - Epoch 11, Step 470/549: Loss=1.154011, K=0.318, C=0.072, S=0.473, LR=4.20e-04 | |
| 2025-09-04 08:39:16,537 - INFO - Epoch 11, Step 480/549: Loss=1.104337, K=0.262, C=0.105, S=0.473, LR=4.18e-04 | |
| 2025-09-04 08:39:47,357 - INFO - Epoch 11, Step 490/549: Loss=1.103019, K=0.249, C=0.120, S=0.474, LR=4.16e-04 | |
| 2025-09-04 08:40:18,507 - INFO - Epoch 11, Step 500/549: Loss=1.090783, K=0.244, C=0.120, S=0.474, LR=4.14e-04 | |
| 2025-09-04 08:40:48,673 - INFO - Epoch 11, Step 510/549: Loss=1.106962, K=0.205, C=0.170, S=0.475, LR=4.13e-04 | |
| 2025-09-04 08:41:18,813 - INFO - Epoch 11, Step 520/549: Loss=1.062847, K=0.175, C=0.174, S=0.473, LR=4.11e-04 | |
| 2025-09-04 08:41:52,521 - INFO - Epoch 11, Step 530/549: Loss=1.169632, K=0.002, C=0.402, S=0.469, LR=4.09e-04 | |
| 2025-09-04 08:42:26,594 - INFO - Epoch 11, Step 540/549: Loss=1.089396, K=0.003, C=0.408, S=0.468, LR=4.08e-04 | |
| 2025-09-04 08:42:52,994 - INFO - Epoch 11 completed in 1709.1s: Avg Loss=1.177518, K=0.200, C=0.228, S=0.473 | |
| 2025-09-04 08:42:53,195 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 08:42:53,610 - INFO - NEW BEST MODEL! Loss: 1.177518 -> /data/BitTransformerLM/checkpoints/checkpoint_best.pt | |
| 2025-09-04 08:42:53,615 - INFO - === EPOCH 11 COMPLETE === | |
| 2025-09-04 08:42:53,615 - INFO - Loss: 1.177518 (best: 1.177518) | |
| 2025-09-04 08:42:53,615 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 08:42:53,615 - INFO - Starting epoch 12 | |
| 2025-09-04 08:42:57,144 - INFO - Epoch 12, Step 0/549: Loss=1.100482, K=0.093, C=0.273, S=0.473, LR=4.06e-04 | |
| 2025-09-04 08:43:32,770 - INFO - Epoch 12, Step 10/549: Loss=1.100609, K=0.066, C=0.289, S=0.471, LR=4.04e-04 | |
| 2025-09-04 08:44:06,268 - INFO - Epoch 12, Step 20/549: Loss=1.116179, K=0.044, C=0.311, S=0.470, LR=4.03e-04 | |
| 2025-09-04 08:44:39,528 - INFO - Epoch 12, Step 30/549: Loss=1.067337, K=0.066, C=0.279, S=0.471, LR=4.01e-04 | |
| 2025-09-04 08:45:10,725 - INFO - Epoch 12, Step 40/549: Loss=1.140722, K=0.457, C=0.013, S=0.470, LR=3.99e-04 | |
| 2025-09-04 08:45:40,043 - INFO - Epoch 12, Step 50/549: Loss=1.126642, K=0.414, C=0.036, S=0.472, LR=3.97e-04 | |
| 2025-09-04 08:46:10,906 - INFO - Epoch 12, Step 60/549: Loss=1.095711, K=0.348, C=0.075, S=0.474, LR=3.96e-04 | |
| 2025-09-04 08:46:42,346 - INFO - Epoch 12, Step 70/549: Loss=1.052853, K=0.312, C=0.095, S=0.475, LR=3.94e-04 | |
| 2025-09-04 08:47:14,030 - INFO - Epoch 12, Step 80/549: Loss=1.080968, K=0.179, C=0.191, S=0.475, LR=3.92e-04 | |
| 2025-09-04 08:47:44,307 - INFO - Epoch 12, Step 90/549: Loss=1.092029, K=0.034, C=0.320, S=0.469, LR=3.91e-04 | |
| 2025-09-04 08:48:15,446 - INFO - Epoch 12, Step 100/549: Loss=1.063589, K=0.062, C=0.281, S=0.470, LR=3.89e-04 | |
| 2025-09-04 08:48:46,110 - INFO - Epoch 12, Step 110/549: Loss=1.073579, K=0.057, C=0.283, S=0.470, LR=3.87e-04 | |
| 2025-09-04 08:49:16,244 - INFO - Epoch 12, Step 120/549: Loss=1.060793, K=0.181, C=0.188, S=0.475, LR=3.85e-04 | |
| 2025-09-04 08:49:45,229 - INFO - Epoch 12, Step 130/549: Loss=1.088574, K=0.430, C=0.015, S=0.470, LR=3.84e-04 | |
| 2025-09-04 08:50:15,540 - INFO - Epoch 12, Step 140/549: Loss=1.080757, K=0.387, C=0.041, S=0.472, LR=3.82e-04 | |
| 2025-09-04 08:50:46,026 - INFO - Epoch 12, Step 150/549: Loss=1.069091, K=0.229, C=0.149, S=0.475, LR=3.80e-04 | |
| 2025-09-04 08:51:16,311 - INFO - Epoch 12, Step 160/549: Loss=1.100613, K=0.031, C=0.324, S=0.469, LR=3.79e-04 | |
| 2025-09-04 08:51:47,351 - INFO - Epoch 12, Step 170/549: Loss=1.072200, K=0.086, C=0.273, S=0.472, LR=3.77e-04 | |
| 2025-09-04 08:52:17,491 - INFO - Epoch 12, Step 180/549: Loss=1.100729, K=0.050, C=0.303, S=0.470, LR=3.75e-04 | |
| 2025-09-04 08:52:48,937 - INFO - Epoch 12, Step 190/549: Loss=1.085282, K=0.025, C=0.322, S=0.468, LR=3.74e-04 | |
| 2025-09-04 08:53:19,559 - INFO - Epoch 12, Step 200/549: Loss=1.065065, K=0.103, C=0.252, S=0.472, LR=3.72e-04 | |
| 2025-09-04 08:53:49,164 - INFO - Epoch 12, Step 210/549: Loss=1.091933, K=0.379, C=0.075, S=0.475, LR=3.70e-04 | |
| 2025-09-04 08:54:18,563 - INFO - Epoch 12, Step 220/549: Loss=1.084297, K=0.410, C=0.040, S=0.472, LR=3.69e-04 | |
| 2025-09-04 08:54:48,934 - INFO - Epoch 12, Step 230/549: Loss=1.087231, K=0.324, C=0.083, S=0.474, LR=3.67e-04 | |
| 2025-09-04 08:55:19,422 - INFO - Epoch 12, Step 240/549: Loss=1.033675, K=0.316, C=0.092, S=0.474, LR=3.65e-04 | |
| 2025-09-04 08:55:51,042 - INFO - Epoch 12, Step 250/549: Loss=1.102804, K=0.072, C=0.297, S=0.471, LR=3.64e-04 | |
| 2025-09-04 08:56:25,110 - INFO - Epoch 12, Step 260/549: Loss=1.046524, K=0.081, C=0.260, S=0.471, LR=3.62e-04 | |
| 2025-09-04 08:56:59,072 - INFO - Epoch 12, Step 270/549: Loss=1.098616, K=0.035, C=0.316, S=0.469, LR=3.60e-04 | |
| 2025-09-04 08:57:33,044 - INFO - Epoch 12, Step 280/549: Loss=1.052449, K=0.242, C=0.143, S=0.475, LR=3.58e-04 | |
| 2025-09-04 08:58:04,649 - INFO - Epoch 12, Step 290/549: Loss=1.089074, K=0.373, C=0.044, S=0.472, LR=3.57e-04 | |
| 2025-09-04 08:58:35,046 - INFO - Epoch 12, Step 300/549: Loss=1.055546, K=0.383, C=0.026, S=0.470, LR=3.55e-04 | |
| 2025-09-04 08:59:06,175 - INFO - Epoch 12, Step 310/549: Loss=1.030473, K=0.316, C=0.070, S=0.472, LR=3.53e-04 | |
| 2025-09-04 08:59:36,726 - INFO - Epoch 12, Step 320/549: Loss=1.053085, K=0.348, C=0.031, S=0.470, LR=3.52e-04 | |
| 2025-09-04 09:00:11,207 - INFO - Epoch 12, Step 330/549: Loss=1.016486, K=0.124, C=0.210, S=0.472, LR=3.50e-04 | |
| 2025-09-04 09:00:45,748 - INFO - Epoch 12, Step 340/549: Loss=1.085325, K=0.034, C=0.312, S=0.468, LR=3.48e-04 | |
| 2025-09-04 09:01:19,717 - INFO - Epoch 12, Step 350/549: Loss=1.032531, K=0.060, C=0.266, S=0.469, LR=3.47e-04 | |
| 2025-09-04 09:01:54,054 - INFO - Epoch 12, Step 360/549: Loss=1.042160, K=0.193, C=0.164, S=0.474, LR=3.45e-04 | |
| 2025-09-04 09:02:27,908 - INFO - Epoch 12, Step 370/549: Loss=1.035462, K=0.192, C=0.169, S=0.474, LR=3.44e-04 | |
| 2025-09-04 09:03:02,980 - INFO - Epoch 12, Step 380/549: Loss=1.029811, K=0.324, C=0.059, S=0.472, LR=3.42e-04 | |
| 2025-09-04 09:03:37,345 - INFO - Epoch 12, Step 390/549: Loss=1.022570, K=0.230, C=0.125, S=0.473, LR=3.40e-04 | |
| 2025-09-04 09:04:13,099 - INFO - Epoch 12, Step 400/549: Loss=1.042086, K=0.055, C=0.271, S=0.469, LR=3.39e-04 | |
| 2025-09-04 09:04:48,606 - INFO - Epoch 12, Step 410/549: Loss=1.072626, K=0.068, C=0.281, S=0.470, LR=3.37e-04 | |
| 2025-09-04 09:05:26,746 - INFO - Epoch 12, Step 420/549: Loss=1.048563, K=0.043, C=0.285, S=0.468, LR=3.35e-04 | |
| 2025-09-04 09:06:02,522 - INFO - Epoch 12, Step 430/549: Loss=1.027567, K=0.158, C=0.194, S=0.473, LR=3.34e-04 | |
| 2025-09-04 09:06:33,161 - INFO - Epoch 12, Step 440/549: Loss=1.044817, K=0.375, C=0.018, S=0.469, LR=3.32e-04 | |
| 2025-09-04 09:07:02,709 - INFO - Epoch 12, Step 450/549: Loss=1.036247, K=0.320, C=0.061, S=0.472, LR=3.30e-04 | |
| 2025-09-04 09:07:36,230 - INFO - Epoch 12, Step 460/549: Loss=1.038428, K=0.291, C=0.079, S=0.472, LR=3.29e-04 | |
| 2025-09-04 09:08:11,038 - INFO - Epoch 12, Step 470/549: Loss=1.015239, K=0.090, C=0.238, S=0.471, LR=3.27e-04 | |
| 2025-09-04 09:08:47,039 - INFO - Epoch 12, Step 480/549: Loss=1.011713, K=0.055, C=0.271, S=0.469, LR=3.25e-04 | |
| 2025-09-04 09:09:21,142 - INFO - Epoch 12, Step 490/549: Loss=1.064295, K=0.037, C=0.301, S=0.468, LR=3.24e-04 | |
| 2025-09-04 09:09:56,160 - INFO - Epoch 12, Step 500/549: Loss=1.041557, K=0.023, C=0.295, S=0.467, LR=3.22e-04 | |
| 2025-09-04 09:10:29,598 - INFO - Epoch 12, Step 510/549: Loss=0.998173, K=0.076, C=0.239, S=0.470, LR=3.20e-04 | |
| 2025-09-04 09:11:01,532 - INFO - Epoch 12, Step 520/549: Loss=1.021140, K=0.079, C=0.246, S=0.470, LR=3.19e-04 | |
| 2025-09-04 09:11:35,347 - INFO - Epoch 12, Step 530/549: Loss=1.026406, K=0.346, C=0.046, S=0.471, LR=3.17e-04 | |
| 2025-09-04 09:12:08,307 - INFO - Epoch 12, Step 540/549: Loss=1.050349, K=0.344, C=0.032, S=0.470, LR=3.16e-04 | |
| 2025-09-04 09:12:35,681 - INFO - Epoch 12 completed in 1782.1s: Avg Loss=1.041487, K=0.136, C=0.212, S=0.470 | |
| 2025-09-04 09:12:35,886 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 09:12:36,329 - INFO - NEW BEST MODEL! Loss: 1.041487 -> /data/BitTransformerLM/checkpoints/checkpoint_best.pt | |
| 2025-09-04 09:12:36,334 - INFO - === EPOCH 12 COMPLETE === | |
| 2025-09-04 09:12:36,335 - INFO - Loss: 1.041487 (best: 1.041487) | |
| 2025-09-04 09:12:36,335 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 09:12:36,335 - INFO - Starting epoch 13 | |
| 2025-09-04 09:12:39,894 - INFO - Epoch 13, Step 0/549: Loss=0.954399, K=0.057, C=0.218, S=0.467, LR=3.14e-04 | |
| 2025-09-04 09:13:13,635 - INFO - Epoch 13, Step 10/549: Loss=0.971194, K=0.100, C=0.147, S=0.467, LR=3.13e-04 | |
| 2025-09-04 09:13:45,545 - INFO - Epoch 13, Step 20/549: Loss=0.956478, K=0.007, C=0.295, S=0.465, LR=3.11e-04 | |
| 2025-09-04 09:14:18,398 - INFO - Epoch 13, Step 30/549: Loss=0.945213, K=0.025, C=0.262, S=0.466, LR=3.09e-04 | |
| 2025-09-04 09:14:50,414 - INFO - Epoch 13, Step 40/549: Loss=0.984780, K=0.002, C=0.289, S=0.465, LR=3.08e-04 | |
| 2025-09-04 09:15:19,492 - INFO - Epoch 13, Step 50/549: Loss=0.898008, K=0.004, C=0.301, S=0.465, LR=3.06e-04 | |
| 2025-09-04 09:15:49,994 - INFO - Epoch 13, Step 60/549: Loss=0.906105, K=0.020, C=0.252, S=0.465, LR=3.04e-04 | |
| 2025-09-04 09:16:20,706 - INFO - Epoch 13, Step 70/549: Loss=0.952071, K=0.108, C=0.116, S=0.466, LR=3.03e-04 | |
| 2025-09-04 09:16:53,189 - INFO - Epoch 13, Step 80/549: Loss=0.921995, K=0.105, C=0.128, S=0.467, LR=3.01e-04 | |
| 2025-09-04 09:17:25,496 - INFO - Epoch 13, Step 90/549: Loss=0.946287, K=0.119, C=0.100, S=0.466, LR=3.00e-04 | |
| 2025-09-04 09:17:57,513 - INFO - Epoch 13, Step 100/549: Loss=0.932612, K=0.098, C=0.131, S=0.467, LR=2.98e-04 | |
| 2025-09-04 09:18:30,092 - INFO - Epoch 13, Step 110/549: Loss=0.949693, K=0.125, C=0.089, S=0.466, LR=2.96e-04 | |
| 2025-09-04 09:19:01,599 - INFO - Epoch 13, Step 120/549: Loss=0.945234, K=0.099, C=0.145, S=0.467, LR=2.95e-04 | |
| 2025-09-04 09:19:30,767 - INFO - Epoch 13, Step 130/549: Loss=0.950690, K=0.002, C=0.320, S=0.465, LR=2.93e-04 | |
| 2025-09-04 09:20:00,821 - INFO - Epoch 13, Step 140/549: Loss=0.948834, K=0.008, C=0.277, S=0.465, LR=2.92e-04 | |
| 2025-09-04 09:20:31,004 - INFO - Epoch 13, Step 150/549: Loss=0.934094, K=0.001, C=0.301, S=0.465, LR=2.90e-04 | |
| 2025-09-04 09:21:01,819 - INFO - Epoch 13, Step 160/549: Loss=0.934276, K=0.003, C=0.301, S=0.465, LR=2.89e-04 | |
| 2025-09-04 09:21:35,486 - INFO - Epoch 13, Step 170/549: Loss=0.925745, K=0.002, C=0.295, S=0.465, LR=2.87e-04 | |
| 2025-09-04 09:22:09,448 - INFO - Epoch 13, Step 180/549: Loss=0.927130, K=0.046, C=0.195, S=0.466, LR=2.85e-04 | |
| 2025-09-04 09:22:45,254 - INFO - Epoch 13, Step 190/549: Loss=0.944417, K=0.123, C=0.080, S=0.466, LR=2.84e-04 | |
| 2025-09-04 09:23:20,752 - INFO - Epoch 13, Step 200/549: Loss=0.958885, K=0.106, C=0.120, S=0.467, LR=2.82e-04 | |
| 2025-09-04 09:23:54,117 - INFO - Epoch 13, Step 210/549: Loss=0.918934, K=0.016, C=0.250, S=0.465, LR=2.81e-04 | |
| 2025-09-04 09:24:28,735 - INFO - Epoch 13, Step 220/549: Loss=0.918668, K=0.014, C=0.260, S=0.465, LR=2.79e-04 | |
| 2025-09-04 09:25:03,866 - INFO - Epoch 13, Step 230/549: Loss=0.934403, K=0.083, C=0.145, S=0.466, LR=2.78e-04 | |
| 2025-09-04 09:25:38,905 - INFO - Epoch 13, Step 240/549: Loss=0.945165, K=0.119, C=0.101, S=0.466, LR=2.76e-04 | |
| 2025-09-04 09:26:16,445 - INFO - Epoch 13, Step 250/549: Loss=0.938872, K=0.069, C=0.168, S=0.466, LR=2.74e-04 | |
| 2025-09-04 09:26:54,301 - INFO - Epoch 13, Step 260/549: Loss=0.937755, K=0.003, C=0.289, S=0.465, LR=2.73e-04 | |
| 2025-09-04 09:27:29,620 - INFO - Epoch 13, Step 270/549: Loss=0.943534, K=0.004, C=0.285, S=0.465, LR=2.71e-04 | |
| 2025-09-04 09:28:05,274 - INFO - Epoch 13, Step 280/549: Loss=0.940875, K=0.124, C=0.104, S=0.467, LR=2.70e-04 | |
| 2025-09-04 09:28:42,390 - INFO - Epoch 13, Step 290/549: Loss=0.939664, K=0.111, C=0.116, S=0.467, LR=2.68e-04 | |
| 2025-09-04 09:29:24,232 - INFO - Epoch 13, Step 300/549: Loss=0.928142, K=0.082, C=0.159, S=0.467, LR=2.67e-04 | |
| 2025-09-04 09:30:02,307 - INFO - Epoch 13, Step 310/549: Loss=0.932519, K=0.072, C=0.186, S=0.467, LR=2.65e-04 | |
| 2025-09-04 09:30:35,702 - INFO - Epoch 13, Step 320/549: Loss=0.953126, K=0.003, C=0.305, S=0.465, LR=2.64e-04 | |
| 2025-09-04 09:31:06,739 - INFO - Epoch 13, Step 330/549: Loss=0.920605, K=0.026, C=0.240, S=0.465, LR=2.62e-04 | |
| 2025-09-04 09:31:38,499 - INFO - Epoch 13, Step 340/549: Loss=0.902232, K=0.000, C=0.285, S=0.464, LR=2.61e-04 | |
| 2025-09-04 09:32:09,168 - INFO - Epoch 13, Step 350/549: Loss=0.908309, K=0.002, C=0.266, S=0.464, LR=2.59e-04 | |
| 2025-09-04 09:32:38,809 - INFO - Epoch 13, Step 360/549: Loss=0.924793, K=0.004, C=0.262, S=0.464, LR=2.57e-04 | |
| 2025-09-04 09:33:11,897 - INFO - Epoch 13, Step 370/549: Loss=0.889726, K=0.007, C=0.243, S=0.464, LR=2.56e-04 | |
| 2025-09-04 09:33:46,058 - INFO - Epoch 13, Step 380/549: Loss=0.910812, K=0.019, C=0.207, S=0.464, LR=2.54e-04 | |
| 2025-09-04 09:34:21,256 - INFO - Epoch 13, Step 390/549: Loss=0.917069, K=0.064, C=0.124, S=0.465, LR=2.53e-04 | |
| 2025-09-04 09:34:54,682 - INFO - Epoch 13, Step 400/549: Loss=0.883151, K=0.027, C=0.189, S=0.464, LR=2.51e-04 | |
| 2025-09-04 09:35:28,908 - INFO - Epoch 13, Step 410/549: Loss=0.891941, K=0.008, C=0.232, S=0.464, LR=2.50e-04 | |
| 2025-09-04 09:36:06,448 - INFO - Epoch 13, Step 420/549: Loss=0.903110, K=0.014, C=0.214, S=0.464, LR=2.48e-04 | |
| 2025-09-04 09:36:41,380 - INFO - Epoch 13, Step 430/549: Loss=0.879222, K=0.017, C=0.213, S=0.464, LR=2.47e-04 | |
| 2025-09-04 09:37:17,899 - INFO - Epoch 13, Step 440/549: Loss=0.899931, K=0.073, C=0.107, S=0.464, LR=2.45e-04 | |
| 2025-09-04 09:37:53,557 - INFO - Epoch 13, Step 450/549: Loss=0.891824, K=0.041, C=0.172, S=0.465, LR=2.44e-04 | |
| 2025-09-04 09:38:30,456 - INFO - Epoch 13, Step 460/549: Loss=0.913455, K=0.068, C=0.127, S=0.465, LR=2.42e-04 | |
| 2025-09-04 09:39:11,941 - INFO - Epoch 13, Step 470/549: Loss=0.906748, K=0.074, C=0.104, S=0.464, LR=2.41e-04 | |
| 2025-09-04 09:39:49,858 - INFO - Epoch 13, Step 480/549: Loss=0.917027, K=0.013, C=0.238, S=0.464, LR=2.39e-04 | |
| 2025-09-04 09:40:30,018 - INFO - Epoch 13, Step 490/549: Loss=0.900388, K=0.004, C=0.260, S=0.464, LR=2.38e-04 | |
| 2025-09-04 09:41:04,673 - INFO - Epoch 13, Step 500/549: Loss=0.885526, K=0.001, C=0.258, S=0.464, LR=2.36e-04 | |
| 2025-09-04 09:41:41,752 - INFO - Epoch 13, Step 510/549: Loss=0.908080, K=0.001, C=0.250, S=0.464, LR=2.35e-04 | |
| 2025-09-04 09:42:18,291 - INFO - Epoch 13, Step 520/549: Loss=0.882926, K=0.017, C=0.246, S=0.465, LR=2.33e-04 | |
| 2025-09-04 09:42:58,459 - INFO - Epoch 13, Step 530/549: Loss=0.877363, K=0.000, C=0.244, S=0.463, LR=2.32e-04 | |
| 2025-09-04 09:43:37,343 - INFO - Epoch 13, Step 540/549: Loss=0.878414, K=0.007, C=0.201, S=0.463, LR=2.30e-04 | |
| 2025-09-04 09:44:08,304 - INFO - Epoch 13 completed in 1892.0s: Avg Loss=0.947074, K=0.090, C=0.180, S=0.467 | |
| 2025-09-04 09:44:08,544 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 09:44:08,944 - INFO - NEW BEST MODEL! Loss: 0.947074 -> /data/BitTransformerLM/checkpoints/checkpoint_best.pt | |
| 2025-09-04 09:44:08,951 - INFO - === EPOCH 13 COMPLETE === | |
| 2025-09-04 09:44:08,951 - INFO - Loss: 0.947074 (best: 0.947074) | |
| 2025-09-04 09:44:08,951 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 09:44:08,951 - INFO - Starting epoch 14 | |
| 2025-09-04 09:44:12,996 - INFO - Epoch 14, Step 0/549: Loss=0.935362, K=0.162, C=0.111, S=0.469, LR=2.29e-04 | |
| 2025-09-04 09:44:48,757 - INFO - Epoch 14, Step 10/549: Loss=0.924006, K=0.150, C=0.113, S=0.468, LR=2.28e-04 | |
| 2025-09-04 09:45:25,906 - INFO - Epoch 14, Step 20/549: Loss=0.923493, K=0.093, C=0.158, S=0.467, LR=2.26e-04 | |
| 2025-09-04 09:45:59,930 - INFO - Epoch 14, Step 30/549: Loss=0.911258, K=0.081, C=0.169, S=0.467, LR=2.25e-04 | |
| 2025-09-04 09:46:34,637 - INFO - Epoch 14, Step 40/549: Loss=0.919861, K=0.114, C=0.140, S=0.468, LR=2.23e-04 | |
| 2025-09-04 09:47:09,007 - INFO - Epoch 14, Step 50/549: Loss=0.920501, K=0.191, C=0.059, S=0.467, LR=2.22e-04 | |
| 2025-09-04 09:47:44,272 - INFO - Epoch 14, Step 60/549: Loss=0.907952, K=0.139, C=0.112, S=0.468, LR=2.20e-04 | |
| 2025-09-04 09:48:17,933 - INFO - Epoch 14, Step 70/549: Loss=0.914429, K=0.028, C=0.201, S=0.465, LR=2.19e-04 | |
| 2025-09-04 09:48:50,507 - INFO - Epoch 14, Step 80/549: Loss=0.911535, K=0.037, C=0.192, S=0.465, LR=2.18e-04 | |
| 2025-09-04 09:49:22,819 - INFO - Epoch 14, Step 90/549: Loss=0.924511, K=0.149, C=0.118, S=0.468, LR=2.16e-04 | |
| 2025-09-04 09:49:53,641 - INFO - Epoch 14, Step 100/549: Loss=0.936632, K=0.068, C=0.182, S=0.467, LR=2.15e-04 | |
| 2025-09-04 09:50:27,248 - INFO - Epoch 14, Step 110/549: Loss=0.924409, K=0.071, C=0.172, S=0.466, LR=2.13e-04 | |
| 2025-09-04 09:51:00,533 - INFO - Epoch 14, Step 120/549: Loss=0.917497, K=0.031, C=0.205, S=0.465, LR=2.12e-04 | |
| 2025-09-04 09:51:34,101 - INFO - Epoch 14, Step 130/549: Loss=0.930762, K=0.188, C=0.070, S=0.467, LR=2.10e-04 | |
| 2025-09-04 09:52:09,867 - INFO - Epoch 14, Step 140/549: Loss=0.934426, K=0.225, C=0.021, S=0.466, LR=2.09e-04 | |
| 2025-09-04 09:52:49,257 - INFO - Epoch 14, Step 150/549: Loss=0.909612, K=0.088, C=0.150, S=0.467, LR=2.08e-04 | |
| 2025-09-04 09:53:25,035 - INFO - Epoch 14, Step 160/549: Loss=0.921523, K=0.082, C=0.152, S=0.466, LR=2.06e-04 | |
| 2025-09-04 09:54:00,190 - INFO - Epoch 14, Step 170/549: Loss=0.931221, K=0.177, C=0.064, S=0.467, LR=2.05e-04 | |
| 2025-09-04 09:54:29,231 - INFO - Epoch 14, Step 180/549: Loss=0.923912, K=0.215, C=0.037, S=0.466, LR=2.03e-04 | |
| 2025-09-04 09:55:00,297 - INFO - Epoch 14, Step 190/549: Loss=0.910629, K=0.139, C=0.099, S=0.467, LR=2.02e-04 | |
| 2025-09-04 09:55:33,367 - INFO - Epoch 14, Step 200/549: Loss=0.910329, K=0.012, C=0.205, S=0.463, LR=2.00e-04 | |
| 2025-09-04 09:56:05,669 - INFO - Epoch 14, Step 210/549: Loss=0.895459, K=0.037, C=0.182, S=0.465, LR=1.99e-04 | |
| 2025-09-04 09:56:39,798 - INFO - Epoch 14, Step 220/549: Loss=0.897376, K=0.056, C=0.168, S=0.465, LR=1.98e-04 | |
| 2025-09-04 09:57:14,648 - INFO - Epoch 14, Step 230/549: Loss=0.926665, K=0.020, C=0.209, S=0.464, LR=1.96e-04 | |
| 2025-09-04 09:57:50,104 - INFO - Epoch 14, Step 240/549: Loss=0.912471, K=0.043, C=0.183, S=0.465, LR=1.95e-04 | |
| 2025-09-04 09:58:23,865 - INFO - Epoch 14, Step 250/549: Loss=0.907300, K=0.047, C=0.175, S=0.465, LR=1.94e-04 | |
| 2025-09-04 09:58:54,133 - INFO - Epoch 14, Step 260/549: Loss=0.933851, K=0.184, C=0.060, S=0.467, LR=1.92e-04 | |
| 2025-09-04 09:59:24,926 - INFO - Epoch 14, Step 270/549: Loss=0.903470, K=0.143, C=0.087, S=0.467, LR=1.91e-04 | |
| 2025-09-04 09:59:57,831 - INFO - Epoch 14, Step 280/549: Loss=0.908406, K=0.139, C=0.092, S=0.467, LR=1.89e-04 | |
| 2025-09-04 10:00:31,730 - INFO - Epoch 14, Step 290/549: Loss=0.901292, K=0.079, C=0.136, S=0.466, LR=1.88e-04 | |
| 2025-09-04 10:01:03,751 - INFO - Epoch 14, Step 300/549: Loss=0.910901, K=0.187, C=0.027, S=0.465, LR=1.87e-04 | |
| 2025-09-04 10:01:35,390 - INFO - Epoch 14, Step 310/549: Loss=0.925357, K=0.196, C=0.021, S=0.465, LR=1.85e-04 | |
| 2025-09-04 10:02:06,821 - INFO - Epoch 14, Step 320/549: Loss=0.888620, K=0.084, C=0.109, S=0.465, LR=1.84e-04 | |
| 2025-09-04 10:02:36,790 - INFO - Epoch 14, Step 330/549: Loss=0.883064, K=0.090, C=0.094, S=0.465, LR=1.83e-04 | |
| 2025-09-04 10:03:07,360 - INFO - Epoch 14, Step 340/549: Loss=0.871016, K=0.037, C=0.148, S=0.464, LR=1.81e-04 | |
| 2025-09-04 10:03:41,053 - INFO - Epoch 14, Step 350/549: Loss=0.860965, K=0.028, C=0.146, S=0.463, LR=1.80e-04 | |
| 2025-09-04 10:04:13,590 - INFO - Epoch 14, Step 360/549: Loss=0.873784, K=0.085, C=0.100, S=0.465, LR=1.79e-04 | |
| 2025-09-04 10:04:46,528 - INFO - Epoch 14, Step 370/549: Loss=0.873712, K=0.136, C=0.038, S=0.464, LR=1.77e-04 | |
| 2025-09-04 10:05:20,565 - INFO - Epoch 14, Step 380/549: Loss=0.880419, K=0.053, C=0.123, S=0.464, LR=1.76e-04 | |
| 2025-09-04 10:05:54,751 - INFO - Epoch 14, Step 390/549: Loss=0.864731, K=0.093, C=0.083, S=0.464, LR=1.75e-04 | |
| 2025-09-04 10:06:30,347 - INFO - Epoch 14, Step 400/549: Loss=0.867231, K=0.098, C=0.074, S=0.464, LR=1.73e-04 | |
| 2025-09-04 10:07:04,671 - INFO - Epoch 14, Step 410/549: Loss=0.879796, K=0.010, C=0.163, S=0.462, LR=1.72e-04 | |
| 2025-09-04 10:07:40,510 - INFO - Epoch 14, Step 420/549: Loss=0.872540, K=0.102, C=0.078, S=0.465, LR=1.71e-04 | |
| 2025-09-04 10:08:15,555 - INFO - Epoch 14, Step 430/549: Loss=0.869573, K=0.094, C=0.082, S=0.464, LR=1.69e-04 | |
| 2025-09-04 10:08:51,346 - INFO - Epoch 14, Step 440/549: Loss=0.865877, K=0.122, C=0.046, S=0.464, LR=1.68e-04 | |
| 2025-09-04 10:09:26,805 - INFO - Epoch 14, Step 450/549: Loss=0.889435, K=0.125, C=0.026, S=0.463, LR=1.67e-04 | |
| 2025-09-04 10:10:00,343 - INFO - Epoch 14, Step 460/549: Loss=0.861273, K=0.031, C=0.135, S=0.463, LR=1.65e-04 | |
| 2025-09-04 10:10:33,861 - INFO - Epoch 14, Step 470/549: Loss=0.873421, K=0.060, C=0.109, S=0.464, LR=1.64e-04 | |
| 2025-09-04 10:11:06,875 - INFO - Epoch 14, Step 480/549: Loss=0.873727, K=0.018, C=0.152, S=0.463, LR=1.63e-04 | |
| 2025-09-04 10:11:41,123 - INFO - Epoch 14, Step 490/549: Loss=0.868233, K=0.115, C=0.043, S=0.464, LR=1.62e-04 | |
| 2025-09-04 10:12:14,940 - INFO - Epoch 14, Step 500/549: Loss=0.855146, K=0.090, C=0.067, S=0.464, LR=1.60e-04 | |
| 2025-09-04 10:12:47,957 - INFO - Epoch 14, Step 510/549: Loss=0.857319, K=0.088, C=0.056, S=0.463, LR=1.59e-04 | |
| 2025-09-04 10:13:22,369 - INFO - Epoch 14, Step 520/549: Loss=0.869246, K=0.019, C=0.137, S=0.462, LR=1.58e-04 | |
| 2025-09-04 10:13:55,341 - INFO - Epoch 14, Step 530/549: Loss=0.861319, K=0.069, C=0.085, S=0.464, LR=1.56e-04 | |
| 2025-09-04 10:14:29,116 - INFO - Epoch 14, Step 540/549: Loss=0.848847, K=0.070, C=0.083, S=0.464, LR=1.55e-04 | |
| 2025-09-04 10:14:55,921 - INFO - Epoch 14 completed in 1847.0s: Avg Loss=0.879385, K=0.057, C=0.137, S=0.464 | |
| 2025-09-04 10:14:56,143 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 10:14:56,547 - INFO - NEW BEST MODEL! Loss: 0.879385 -> /data/BitTransformerLM/checkpoints/checkpoint_best.pt | |
| 2025-09-04 10:14:56,555 - INFO - === EPOCH 14 COMPLETE === | |
| 2025-09-04 10:14:56,555 - INFO - Loss: 0.879385 (best: 0.879385) | |
| 2025-09-04 10:14:56,555 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 10:14:56,555 - INFO - Starting epoch 15 | |
| 2025-09-04 10:14:59,818 - INFO - Epoch 15, Step 0/549: Loss=0.853662, K=0.004, C=0.174, S=0.462, LR=1.54e-04 | |
| 2025-09-04 10:15:31,794 - INFO - Epoch 15, Step 10/549: Loss=0.851136, K=0.008, C=0.155, S=0.462, LR=1.53e-04 | |
| 2025-09-04 10:16:02,249 - INFO - Epoch 15, Step 20/549: Loss=0.865113, K=0.031, C=0.048, S=0.461, LR=1.52e-04 | |
| 2025-09-04 10:16:34,728 - INFO - Epoch 15, Step 30/549: Loss=0.843478, K=0.013, C=0.142, S=0.462, LR=1.50e-04 | |
| 2025-09-04 10:17:09,182 - INFO - Epoch 15, Step 40/549: Loss=0.852049, K=0.015, C=0.129, S=0.462, LR=1.49e-04 | |
| 2025-09-04 10:17:47,383 - INFO - Epoch 15, Step 50/549: Loss=0.850287, K=0.007, C=0.142, S=0.462, LR=1.48e-04 | |
| 2025-09-04 10:18:22,257 - INFO - Epoch 15, Step 60/549: Loss=0.845456, K=0.002, C=0.173, S=0.462, LR=1.47e-04 | |
| 2025-09-04 10:18:59,020 - INFO - Epoch 15, Step 70/549: Loss=0.843317, K=0.000, C=0.164, S=0.462, LR=1.45e-04 | |
| 2025-09-04 10:19:31,575 - INFO - Epoch 15, Step 80/549: Loss=0.843058, K=0.020, C=0.052, S=0.461, LR=1.44e-04 | |
| 2025-09-04 10:20:01,633 - INFO - Epoch 15, Step 90/549: Loss=0.841604, K=0.020, C=0.045, S=0.461, LR=1.43e-04 | |
| 2025-09-04 10:20:37,561 - INFO - Epoch 15, Step 100/549: Loss=0.843079, K=0.018, C=0.072, S=0.461, LR=1.42e-04 | |
| 2025-09-04 10:21:12,956 - INFO - Epoch 15, Step 110/549: Loss=0.845662, K=0.008, C=0.036, S=0.461, LR=1.40e-04 | |
| 2025-09-04 10:21:48,587 - INFO - Epoch 15, Step 120/549: Loss=0.839171, K=0.008, C=0.105, S=0.461, LR=1.39e-04 | |
| 2025-09-04 10:22:23,909 - INFO - Epoch 15, Step 130/549: Loss=0.839368, K=0.008, C=0.106, S=0.461, LR=1.38e-04 | |
| 2025-09-04 10:22:57,911 - INFO - Epoch 15, Step 140/549: Loss=0.840365, K=0.005, C=0.146, S=0.462, LR=1.37e-04 | |
| 2025-09-04 10:23:34,548 - INFO - Epoch 15, Step 150/549: Loss=0.834638, K=0.007, C=0.168, S=0.462, LR=1.36e-04 | |
| 2025-09-04 10:24:08,492 - INFO - Epoch 15, Step 160/549: Loss=0.811811, K=0.002, C=0.197, S=0.462, LR=1.34e-04 | |
| 2025-09-04 10:24:43,904 - INFO - Epoch 15, Step 170/549: Loss=0.849220, K=0.010, C=0.112, S=0.462, LR=1.33e-04 | |
| 2025-09-04 10:25:17,598 - INFO - Epoch 15, Step 180/549: Loss=0.850176, K=0.005, C=0.156, S=0.462, LR=1.32e-04 | |
| 2025-09-04 10:25:51,704 - INFO - Epoch 15, Step 190/549: Loss=0.847402, K=0.019, C=0.102, S=0.462, LR=1.31e-04 | |
| 2025-09-04 10:26:27,838 - INFO - Epoch 15, Step 200/549: Loss=0.855367, K=0.031, C=0.026, S=0.461, LR=1.30e-04 | |
| 2025-09-04 10:27:01,635 - INFO - Epoch 15, Step 210/549: Loss=0.844257, K=0.000, C=0.075, S=0.461, LR=1.29e-04 | |
| 2025-09-04 10:27:33,115 - INFO - Epoch 15, Step 220/549: Loss=0.831329, K=0.020, C=0.084, S=0.462, LR=1.27e-04 | |
| 2025-09-04 10:28:04,262 - INFO - Epoch 15, Step 230/549: Loss=0.831927, K=0.026, C=0.067, S=0.462, LR=1.26e-04 | |
| 2025-09-04 10:28:36,799 - INFO - Epoch 15, Step 240/549: Loss=0.827289, K=0.058, C=0.029, S=0.462, LR=1.25e-04 | |
| 2025-09-04 10:29:11,085 - INFO - Epoch 15, Step 250/549: Loss=0.839956, K=0.056, C=0.029, S=0.462, LR=1.24e-04 | |
| 2025-09-04 10:29:44,941 - INFO - Epoch 15, Step 260/549: Loss=0.832221, K=0.021, C=0.058, S=0.461, LR=1.23e-04 | |
| 2025-09-04 10:30:18,482 - INFO - Epoch 15, Step 270/549: Loss=0.830954, K=0.031, C=0.041, S=0.461, LR=1.22e-04 | |
| 2025-09-04 10:30:51,429 - INFO - Epoch 15, Step 280/549: Loss=0.835595, K=0.060, C=0.042, S=0.462, LR=1.20e-04 | |
| 2025-09-04 10:31:23,906 - INFO - Epoch 15, Step 290/549: Loss=0.822466, K=0.043, C=0.032, S=0.461, LR=1.19e-04 | |
| 2025-09-04 10:31:58,192 - INFO - Epoch 15, Step 300/549: Loss=0.829925, K=0.043, C=0.031, S=0.461, LR=1.18e-04 | |
| 2025-09-04 10:32:31,184 - INFO - Epoch 15, Step 310/549: Loss=0.820592, K=0.029, C=0.040, S=0.461, LR=1.17e-04 | |
| 2025-09-04 10:33:03,791 - INFO - Epoch 15, Step 320/549: Loss=0.835744, K=0.043, C=0.028, S=0.461, LR=1.16e-04 | |
| 2025-09-04 10:33:36,438 - INFO - Epoch 15, Step 330/549: Loss=0.827091, K=0.043, C=0.028, S=0.461, LR=1.15e-04 | |
| 2025-09-04 10:34:09,535 - INFO - Epoch 15, Step 340/549: Loss=0.827883, K=0.034, C=0.034, S=0.461, LR=1.14e-04 | |
| 2025-09-04 10:34:43,235 - INFO - Epoch 15, Step 350/549: Loss=0.821158, K=0.005, C=0.066, S=0.461, LR=1.13e-04 | |
| 2025-09-04 10:35:18,093 - INFO - Epoch 15, Step 360/549: Loss=0.819553, K=0.025, C=0.041, S=0.461, LR=1.12e-04 | |
| 2025-09-04 10:35:50,532 - INFO - Epoch 15, Step 370/549: Loss=0.837331, K=0.003, C=0.083, S=0.461, LR=1.10e-04 | |
| 2025-09-04 10:36:22,515 - INFO - Epoch 15, Step 380/549: Loss=0.825801, K=0.020, C=0.061, S=0.461, LR=1.09e-04 | |
| 2025-09-04 10:36:54,152 - INFO - Epoch 15, Step 390/549: Loss=0.834430, K=0.043, C=0.035, S=0.461, LR=1.08e-04 | |
| 2025-09-04 10:37:27,413 - INFO - Epoch 15, Step 400/549: Loss=0.829457, K=0.043, C=0.030, S=0.461, LR=1.07e-04 | |
| 2025-09-04 10:38:03,232 - INFO - Epoch 15, Step 410/549: Loss=0.825395, K=0.043, C=0.035, S=0.461, LR=1.06e-04 | |
| 2025-09-04 10:38:40,163 - INFO - Epoch 15, Step 420/549: Loss=0.829818, K=0.010, C=0.053, S=0.461, LR=1.05e-04 | |
| 2025-09-04 10:39:14,878 - INFO - Epoch 15, Step 430/549: Loss=0.826920, K=0.027, C=0.028, S=0.461, LR=1.04e-04 | |
| 2025-09-04 10:39:48,379 - INFO - Epoch 15, Step 440/549: Loss=0.826286, K=0.008, C=0.053, S=0.461, LR=1.03e-04 | |
| 2025-09-04 10:40:22,160 - INFO - Epoch 15, Step 450/549: Loss=0.823039, K=0.004, C=0.070, S=0.461, LR=1.02e-04 | |
| 2025-09-04 10:40:54,904 - INFO - Epoch 15, Step 460/549: Loss=0.827203, K=0.007, C=0.063, S=0.461, LR=1.01e-04 | |
| 2025-09-04 10:41:28,155 - INFO - Epoch 15, Step 470/549: Loss=0.832595, K=0.010, C=0.065, S=0.461, LR=9.97e-05 | |
| 2025-09-04 10:42:02,303 - INFO - Epoch 15, Step 480/549: Loss=0.825540, K=0.040, C=0.031, S=0.461, LR=9.87e-05 | |
| 2025-09-04 10:42:36,288 - INFO - Epoch 15, Step 490/549: Loss=0.827277, K=0.020, C=0.052, S=0.461, LR=9.77e-05 | |
| 2025-09-04 10:43:11,024 - INFO - Epoch 15, Step 500/549: Loss=0.823167, K=0.029, C=0.039, S=0.461, LR=9.66e-05 | |
| 2025-09-04 10:43:44,053 - INFO - Epoch 15, Step 510/549: Loss=0.829609, K=0.055, C=0.034, S=0.462, LR=9.56e-05 | |
| 2025-09-04 10:44:18,418 - INFO - Epoch 15, Step 520/549: Loss=0.823542, K=0.043, C=0.041, S=0.462, LR=9.46e-05 | |
| 2025-09-04 10:44:52,119 - INFO - Epoch 15, Step 530/549: Loss=0.829527, K=0.043, C=0.039, S=0.462, LR=9.36e-05 | |
| 2025-09-04 10:45:22,306 - INFO - Epoch 15, Step 540/549: Loss=0.831467, K=0.004, C=0.074, S=0.461, LR=9.25e-05 | |
| 2025-09-04 10:45:48,559 - INFO - Epoch 15 completed in 1852.0s: Avg Loss=0.841651, K=0.029, C=0.093, S=0.462 | |
| 2025-09-04 10:45:48,750 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 10:45:49,083 - INFO - NEW BEST MODEL! Loss: 0.841651 -> /data/BitTransformerLM/checkpoints/checkpoint_best.pt | |
| 2025-09-04 10:45:49,088 - INFO - === EPOCH 15 COMPLETE === | |
| 2025-09-04 10:45:49,088 - INFO - Loss: 0.841651 (best: 0.841651) | |
| 2025-09-04 10:45:49,088 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 10:45:49,088 - INFO - Starting epoch 16 | |
| 2025-09-04 10:45:52,543 - INFO - Epoch 16, Step 0/549: Loss=0.822511, K=0.003, C=0.141, S=0.462, LR=9.16e-05 | |
| 2025-09-04 10:46:25,991 - INFO - Epoch 16, Step 10/549: Loss=0.829535, K=0.008, C=0.033, S=0.461, LR=9.06e-05 | |
| 2025-09-04 10:47:00,641 - INFO - Epoch 16, Step 20/549: Loss=0.827016, K=0.008, C=0.056, S=0.461, LR=8.96e-05 | |
| 2025-09-04 10:47:35,856 - INFO - Epoch 16, Step 30/549: Loss=0.823210, K=0.002, C=0.037, S=0.460, LR=8.86e-05 | |
| 2025-09-04 10:48:12,099 - INFO - Epoch 16, Step 40/549: Loss=0.821257, K=0.005, C=0.039, S=0.460, LR=8.76e-05 | |
| 2025-09-04 10:48:47,226 - INFO - Epoch 16, Step 50/549: Loss=0.826760, K=0.014, C=0.045, S=0.461, LR=8.67e-05 | |
| 2025-09-04 10:49:19,799 - INFO - Epoch 16, Step 60/549: Loss=0.820646, K=0.031, C=0.027, S=0.461, LR=8.57e-05 | |
| 2025-09-04 10:49:56,095 - INFO - Epoch 16, Step 70/549: Loss=0.824105, K=0.020, C=0.036, S=0.461, LR=8.47e-05 | |
| 2025-09-04 10:50:31,725 - INFO - Epoch 16, Step 80/549: Loss=0.818535, K=0.027, C=0.028, S=0.461, LR=8.37e-05 | |
| 2025-09-04 10:51:04,718 - INFO - Epoch 16, Step 90/549: Loss=0.822558, K=0.020, C=0.035, S=0.461, LR=8.28e-05 | |
| 2025-09-04 10:51:37,227 - INFO - Epoch 16, Step 100/549: Loss=0.826607, K=0.040, C=0.040, S=0.462, LR=8.18e-05 | |
| 2025-09-04 10:52:10,204 - INFO - Epoch 16, Step 110/549: Loss=0.827157, K=0.008, C=0.057, S=0.461, LR=8.09e-05 | |
| 2025-09-04 10:52:42,806 - INFO - Epoch 16, Step 120/549: Loss=0.827890, K=0.027, C=0.046, S=0.461, LR=7.99e-05 | |
| 2025-09-04 10:53:13,637 - INFO - Epoch 16, Step 130/549: Loss=0.813257, K=0.037, C=0.086, S=0.462, LR=7.90e-05 | |
| 2025-09-04 10:53:44,348 - INFO - Epoch 16, Step 140/549: Loss=0.821901, K=0.031, C=0.033, S=0.461, LR=7.80e-05 | |
| 2025-09-04 10:54:19,855 - INFO - Epoch 16, Step 150/549: Loss=0.822600, K=0.031, C=0.028, S=0.461, LR=7.71e-05 | |
| 2025-09-04 10:54:53,364 - INFO - Epoch 16, Step 160/549: Loss=0.817798, K=0.031, C=0.032, S=0.461, LR=7.62e-05 | |
| 2025-09-04 10:55:27,683 - INFO - Epoch 16, Step 170/549: Loss=0.825417, K=0.043, C=0.028, S=0.461, LR=7.52e-05 | |
| 2025-09-04 10:56:05,958 - INFO - Epoch 16, Step 180/549: Loss=0.824862, K=0.020, C=0.044, S=0.461, LR=7.43e-05 | |
| 2025-09-04 10:56:40,793 - INFO - Epoch 16, Step 190/549: Loss=0.824415, K=0.012, C=0.046, S=0.461, LR=7.34e-05 | |
| 2025-09-04 10:57:14,819 - INFO - Epoch 16, Step 200/549: Loss=0.822188, K=0.005, C=0.044, S=0.461, LR=7.25e-05 | |
| 2025-09-04 10:57:47,630 - INFO - Epoch 16, Step 210/549: Loss=0.823174, K=0.001, C=0.031, S=0.460, LR=7.16e-05 | |
| 2025-09-04 10:58:23,351 - INFO - Epoch 16, Step 220/549: Loss=0.824704, K=0.000, C=0.055, S=0.461, LR=7.07e-05 | |
| 2025-09-04 10:58:59,635 - INFO - Epoch 16, Step 230/549: Loss=0.821573, K=0.001, C=0.103, S=0.461, LR=6.98e-05 | |
| 2025-09-04 10:59:36,138 - INFO - Epoch 16, Step 240/549: Loss=0.825838, K=0.003, C=0.101, S=0.461, LR=6.89e-05 | |
| 2025-09-04 11:00:13,612 - INFO - Epoch 16, Step 250/549: Loss=0.825868, K=0.002, C=0.083, S=0.461, LR=6.80e-05 | |
| 2025-09-04 11:00:59,953 - INFO - Epoch 16, Step 260/549: Loss=0.822894, K=0.000, C=0.108, S=0.461, LR=6.72e-05 | |
| 2025-09-04 11:01:41,794 - INFO - Epoch 16, Step 270/549: Loss=0.814951, K=0.004, C=0.077, S=0.461, LR=6.63e-05 | |
| 2025-09-04 11:02:23,435 - INFO - Epoch 16, Step 280/549: Loss=0.817939, K=0.004, C=0.028, S=0.460, LR=6.54e-05 | |
| 2025-09-04 11:03:02,036 - INFO - Epoch 16, Step 290/549: Loss=0.816068, K=0.010, C=0.028, S=0.461, LR=6.46e-05 | |
| 2025-09-04 11:03:42,086 - INFO - Epoch 16, Step 300/549: Loss=0.820991, K=0.012, C=0.030, S=0.461, LR=6.37e-05 | |
| 2025-09-04 11:04:21,717 - INFO - Epoch 16, Step 310/549: Loss=0.825303, K=0.031, C=0.036, S=0.461, LR=6.29e-05 | |
| 2025-09-04 11:04:59,998 - INFO - Epoch 16, Step 320/549: Loss=0.810465, K=0.025, C=0.027, S=0.461, LR=6.20e-05 | |
| 2025-09-04 11:05:38,677 - INFO - Epoch 16, Step 330/549: Loss=0.815960, K=0.008, C=0.029, S=0.460, LR=6.12e-05 | |
| 2025-09-04 11:06:16,013 - INFO - Epoch 16, Step 340/549: Loss=0.823374, K=0.004, C=0.052, S=0.461, LR=6.03e-05 | |
| 2025-09-04 11:06:53,695 - INFO - Epoch 16, Step 350/549: Loss=0.819839, K=0.020, C=0.035, S=0.461, LR=5.95e-05 | |
| 2025-09-04 11:07:27,786 - INFO - Epoch 16, Step 360/549: Loss=0.818380, K=0.012, C=0.030, S=0.461, LR=5.87e-05 | |
| 2025-09-04 11:08:02,736 - INFO - Epoch 16, Step 370/549: Loss=0.815134, K=0.008, C=0.029, S=0.461, LR=5.79e-05 | |
| 2025-09-04 11:08:37,313 - INFO - Epoch 16, Step 380/549: Loss=0.830299, K=0.031, C=0.054, S=0.462, LR=5.71e-05 | |
| 2025-09-04 11:09:13,058 - INFO - Epoch 16, Step 390/549: Loss=0.814367, K=0.020, C=0.098, S=0.462, LR=5.63e-05 | |
| 2025-09-04 11:09:52,905 - INFO - Epoch 16, Step 400/549: Loss=0.822448, K=0.005, C=0.107, S=0.461, LR=5.55e-05 | |
| 2025-09-04 11:10:35,774 - INFO - Epoch 16, Step 410/549: Loss=0.805928, K=0.000, C=0.096, S=0.461, LR=5.47e-05 | |
| 2025-09-04 11:11:11,024 - INFO - Epoch 16, Step 420/549: Loss=0.813309, K=0.010, C=0.029, S=0.461, LR=5.39e-05 | |
| 2025-09-04 11:11:47,407 - INFO - Epoch 16, Step 430/549: Loss=0.817531, K=0.031, C=0.037, S=0.461, LR=5.31e-05 | |
| 2025-09-04 11:12:22,859 - INFO - Epoch 16, Step 440/549: Loss=0.821777, K=0.025, C=0.035, S=0.461, LR=5.23e-05 | |
| 2025-09-04 11:12:57,000 - INFO - Epoch 16, Step 450/549: Loss=0.811249, K=0.020, C=0.033, S=0.461, LR=5.15e-05 | |
| 2025-09-04 11:13:31,877 - INFO - Epoch 16, Step 460/549: Loss=0.814315, K=0.016, C=0.028, S=0.461, LR=5.08e-05 | |
| 2025-09-04 11:14:07,864 - INFO - Epoch 16, Step 470/549: Loss=0.822468, K=0.031, C=0.080, S=0.462, LR=5.00e-05 | |
| 2025-09-04 11:14:41,164 - INFO - Epoch 16, Step 480/549: Loss=0.821247, K=0.008, C=0.038, S=0.461, LR=4.92e-05 | |
| 2025-09-04 11:15:14,191 - INFO - Epoch 16, Step 490/549: Loss=0.820951, K=0.010, C=0.069, S=0.461, LR=4.85e-05 | |
| 2025-09-04 11:15:46,520 - INFO - Epoch 16, Step 500/549: Loss=0.816407, K=0.008, C=0.030, S=0.460, LR=4.77e-05 | |
| 2025-09-04 11:16:23,292 - INFO - Epoch 16, Step 510/549: Loss=0.810973, K=0.020, C=0.065, S=0.461, LR=4.70e-05 | |
| 2025-09-04 11:17:00,334 - INFO - Epoch 16, Step 520/549: Loss=0.822527, K=0.014, C=0.034, S=0.461, LR=4.63e-05 | |
| 2025-09-04 11:17:37,859 - INFO - Epoch 16, Step 530/549: Loss=0.817438, K=0.012, C=0.028, S=0.461, LR=4.55e-05 | |
| 2025-09-04 11:18:16,125 - INFO - Epoch 16, Step 540/549: Loss=0.810308, K=0.014, C=0.052, S=0.461, LR=4.48e-05 | |
| 2025-09-04 11:18:43,403 - INFO - Epoch 16 completed in 1974.3s: Avg Loss=0.820893, K=0.013, C=0.066, S=0.461 | |
| 2025-09-04 11:18:43,728 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 11:18:44,135 - INFO - NEW BEST MODEL! Loss: 0.820893 -> /data/BitTransformerLM/checkpoints/checkpoint_best.pt | |
| 2025-09-04 11:18:44,151 - INFO - === EPOCH 16 COMPLETE === | |
| 2025-09-04 11:18:44,151 - INFO - Loss: 0.820893 (best: 0.820893) | |
| 2025-09-04 11:18:44,152 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 11:18:44,152 - INFO - Starting epoch 17 | |
| 2025-09-04 11:18:47,864 - INFO - Epoch 17, Step 0/549: Loss=0.807558, K=0.000, C=0.084, S=0.461, LR=4.42e-05 | |
| 2025-09-04 11:19:20,714 - INFO - Epoch 17, Step 10/549: Loss=0.814024, K=0.004, C=0.033, S=0.460, LR=4.34e-05 | |
| 2025-09-04 11:19:50,967 - INFO - Epoch 17, Step 20/549: Loss=0.800918, K=0.004, C=0.069, S=0.461, LR=4.27e-05 | |
| 2025-09-04 11:20:22,701 - INFO - Epoch 17, Step 30/549: Loss=0.818978, K=0.008, C=0.038, S=0.461, LR=4.20e-05 | |
| 2025-09-04 11:20:55,008 - INFO - Epoch 17, Step 40/549: Loss=0.809655, K=0.020, C=0.063, S=0.461, LR=4.13e-05 | |
| 2025-09-04 11:21:28,823 - INFO - Epoch 17, Step 50/549: Loss=0.810181, K=0.014, C=0.037, S=0.461, LR=4.06e-05 | |
| 2025-09-04 11:22:03,849 - INFO - Epoch 17, Step 60/549: Loss=0.818246, K=0.031, C=0.060, S=0.462, LR=4.00e-05 | |
| 2025-09-04 11:22:35,899 - INFO - Epoch 17, Step 70/549: Loss=0.822136, K=0.016, C=0.031, S=0.461, LR=3.93e-05 | |
| 2025-09-04 11:23:07,394 - INFO - Epoch 17, Step 80/549: Loss=0.812792, K=0.007, C=0.063, S=0.461, LR=3.86e-05 | |
| 2025-09-04 11:23:39,877 - INFO - Epoch 17, Step 90/549: Loss=0.811681, K=0.001, C=0.091, S=0.461, LR=3.79e-05 | |
| 2025-09-04 11:24:12,035 - INFO - Epoch 17, Step 100/549: Loss=0.805407, K=0.017, C=0.053, S=0.461, LR=3.73e-05 | |
| 2025-09-04 11:24:46,034 - INFO - Epoch 17, Step 110/549: Loss=0.815489, K=0.006, C=0.056, S=0.461, LR=3.66e-05 | |
| 2025-09-04 11:25:20,864 - INFO - Epoch 17, Step 120/549: Loss=0.804230, K=0.016, C=0.044, S=0.461, LR=3.60e-05 | |
| 2025-09-04 11:25:53,654 - INFO - Epoch 17, Step 130/549: Loss=0.814091, K=0.016, C=0.028, S=0.461, LR=3.53e-05 | |
| 2025-09-04 11:26:25,529 - INFO - Epoch 17, Step 140/549: Loss=0.822470, K=0.007, C=0.091, S=0.461, LR=3.47e-05 | |
| 2025-09-04 11:27:00,815 - INFO - Epoch 17, Step 150/549: Loss=0.817330, K=0.000, C=0.034, S=0.460, LR=3.40e-05 | |
| 2025-09-04 11:27:37,655 - INFO - Epoch 17, Step 160/549: Loss=0.813553, K=0.000, C=0.069, S=0.461, LR=3.34e-05 | |
| 2025-09-04 11:28:10,661 - INFO - Epoch 17, Step 170/549: Loss=0.811337, K=0.020, C=0.054, S=0.461, LR=3.28e-05 | |
| 2025-09-04 11:28:44,018 - INFO - Epoch 17, Step 180/549: Loss=0.811751, K=0.006, C=0.033, S=0.461, LR=3.22e-05 | |
| 2025-09-04 11:29:15,047 - INFO - Epoch 17, Step 190/549: Loss=0.813382, K=0.000, C=0.061, S=0.461, LR=3.15e-05 | |
| 2025-09-04 11:29:46,256 - INFO - Epoch 17, Step 200/549: Loss=0.816229, K=0.010, C=0.062, S=0.461, LR=3.09e-05 | |
| 2025-09-04 11:30:17,993 - INFO - Epoch 17, Step 210/549: Loss=0.814024, K=0.000, C=0.041, S=0.460, LR=3.03e-05 | |
| 2025-09-04 11:30:49,500 - INFO - Epoch 17, Step 220/549: Loss=0.810270, K=0.002, C=0.077, S=0.461, LR=2.97e-05 | |
| 2025-09-04 11:31:21,156 - INFO - Epoch 17, Step 230/549: Loss=0.810279, K=0.010, C=0.042, S=0.461, LR=2.92e-05 | |
| 2025-09-04 11:31:55,646 - INFO - Epoch 17, Step 240/549: Loss=0.820203, K=0.004, C=0.029, S=0.460, LR=2.86e-05 | |
| 2025-09-04 11:32:25,421 - INFO - Epoch 17, Step 250/549: Loss=0.824366, K=0.006, C=0.034, S=0.460, LR=2.80e-05 | |
| 2025-09-04 11:32:56,844 - INFO - Epoch 17, Step 260/549: Loss=0.823138, K=0.000, C=0.062, S=0.461, LR=2.74e-05 | |
| 2025-09-04 11:33:28,481 - INFO - Epoch 17, Step 270/549: Loss=0.810317, K=0.020, C=0.040, S=0.461, LR=2.68e-05 | |
| 2025-09-04 11:34:00,430 - INFO - Epoch 17, Step 280/549: Loss=0.814977, K=0.010, C=0.055, S=0.461, LR=2.63e-05 | |
| 2025-09-04 11:34:32,838 - INFO - Epoch 17, Step 290/549: Loss=0.805986, K=0.020, C=0.050, S=0.461, LR=2.57e-05 | |
| 2025-09-04 11:35:05,124 - INFO - Epoch 17, Step 300/549: Loss=0.812902, K=0.000, C=0.028, S=0.460, LR=2.52e-05 | |
| 2025-09-04 11:35:37,079 - INFO - Epoch 17, Step 310/549: Loss=0.814139, K=0.000, C=0.049, S=0.460, LR=2.46e-05 | |
| 2025-09-04 11:36:07,867 - INFO - Epoch 17, Step 320/549: Loss=0.811949, K=0.008, C=0.041, S=0.461, LR=2.41e-05 | |
| 2025-09-04 11:36:37,451 - INFO - Epoch 17, Step 330/549: Loss=0.805453, K=0.020, C=0.048, S=0.461, LR=2.36e-05 | |
| 2025-09-04 11:37:07,653 - INFO - Epoch 17, Step 340/549: Loss=0.821453, K=0.005, C=0.046, S=0.461, LR=2.30e-05 | |
| 2025-09-04 11:37:39,355 - INFO - Epoch 17, Step 350/549: Loss=0.817538, K=0.005, C=0.065, S=0.461, LR=2.25e-05 | |
| 2025-09-04 11:38:10,923 - INFO - Epoch 17, Step 360/549: Loss=0.811746, K=0.000, C=0.054, S=0.461, LR=2.20e-05 | |
| 2025-09-04 11:38:42,301 - INFO - Epoch 17, Step 370/549: Loss=0.817912, K=0.002, C=0.059, S=0.461, LR=2.15e-05 | |
| 2025-09-04 11:39:14,184 - INFO - Epoch 17, Step 380/549: Loss=0.812139, K=0.010, C=0.039, S=0.461, LR=2.10e-05 | |
| 2025-09-04 11:39:49,450 - INFO - Epoch 17, Step 390/549: Loss=0.823998, K=0.005, C=0.028, S=0.460, LR=2.05e-05 | |
| 2025-09-04 11:40:22,531 - INFO - Epoch 17, Step 400/549: Loss=0.811971, K=0.008, C=0.084, S=0.461, LR=2.00e-05 | |
| 2025-09-04 11:40:53,448 - INFO - Epoch 17, Step 410/549: Loss=0.813046, K=0.000, C=0.084, S=0.461, LR=1.95e-05 | |
| 2025-09-04 11:41:24,974 - INFO - Epoch 17, Step 420/549: Loss=0.806025, K=0.012, C=0.055, S=0.461, LR=1.90e-05 | |
| 2025-09-04 11:41:56,615 - INFO - Epoch 17, Step 430/549: Loss=0.816503, K=0.006, C=0.043, S=0.461, LR=1.86e-05 | |
| 2025-09-04 11:42:27,970 - INFO - Epoch 17, Step 440/549: Loss=0.814109, K=0.008, C=0.046, S=0.461, LR=1.81e-05 | |
| 2025-09-04 11:42:59,032 - INFO - Epoch 17, Step 450/549: Loss=0.809269, K=0.007, C=0.051, S=0.461, LR=1.76e-05 | |
| 2025-09-04 11:43:30,838 - INFO - Epoch 17, Step 460/549: Loss=0.809703, K=0.003, C=0.083, S=0.461, LR=1.72e-05 | |
| 2025-09-04 11:44:03,352 - INFO - Epoch 17, Step 470/549: Loss=0.816679, K=0.008, C=0.058, S=0.461, LR=1.67e-05 | |
| 2025-09-04 11:44:35,100 - INFO - Epoch 17, Step 480/549: Loss=0.816326, K=0.007, C=0.060, S=0.461, LR=1.63e-05 | |
| 2025-09-04 11:45:06,322 - INFO - Epoch 17, Step 490/549: Loss=0.809298, K=0.008, C=0.062, S=0.461, LR=1.58e-05 | |
| 2025-09-04 11:45:39,601 - INFO - Epoch 17, Step 500/549: Loss=0.818030, K=0.008, C=0.047, S=0.461, LR=1.54e-05 | |
| 2025-09-04 11:46:11,121 - INFO - Epoch 17, Step 510/549: Loss=0.812435, K=0.008, C=0.037, S=0.461, LR=1.50e-05 | |
| 2025-09-04 11:46:42,212 - INFO - Epoch 17, Step 520/549: Loss=0.807743, K=0.008, C=0.047, S=0.461, LR=1.46e-05 | |
| 2025-09-04 11:47:13,481 - INFO - Epoch 17, Step 530/549: Loss=0.817652, K=0.007, C=0.049, S=0.461, LR=1.41e-05 | |
| 2025-09-04 11:47:45,437 - INFO - Epoch 17, Step 540/549: Loss=0.808471, K=0.006, C=0.074, S=0.461, LR=1.37e-05 | |
| 2025-09-04 11:48:11,299 - INFO - Epoch 17 completed in 1767.1s: Avg Loss=0.813790, K=0.007, C=0.053, S=0.461 | |
| 2025-09-04 11:48:11,486 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 11:48:11,852 - INFO - NEW BEST MODEL! Loss: 0.813790 -> /data/BitTransformerLM/checkpoints/checkpoint_best.pt | |
| 2025-09-04 11:48:11,857 - INFO - === EPOCH 17 COMPLETE === | |
| 2025-09-04 11:48:11,858 - INFO - Loss: 0.813790 (best: 0.813790) | |
| 2025-09-04 11:48:11,858 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 11:48:11,858 - INFO - Starting epoch 18 | |
| 2025-09-04 11:48:15,416 - INFO - Epoch 18, Step 0/549: Loss=0.813287, K=0.007, C=0.048, S=0.461, LR=1.34e-05 | |
| 2025-09-04 11:48:50,675 - INFO - Epoch 18, Step 10/549: Loss=0.815183, K=0.006, C=0.048, S=0.461, LR=1.30e-05 | |
| 2025-09-04 11:49:22,343 - INFO - Epoch 18, Step 20/549: Loss=0.815260, K=0.000, C=0.042, S=0.460, LR=1.26e-05 | |
| 2025-09-04 11:49:54,351 - INFO - Epoch 18, Step 30/549: Loss=0.813207, K=0.007, C=0.040, S=0.461, LR=1.22e-05 | |
| 2025-09-04 11:50:26,932 - INFO - Epoch 18, Step 40/549: Loss=0.810523, K=0.002, C=0.043, S=0.460, LR=1.18e-05 | |
| 2025-09-04 11:50:59,645 - INFO - Epoch 18, Step 50/549: Loss=0.816084, K=0.004, C=0.064, S=0.461, LR=1.14e-05 | |
| 2025-09-04 11:51:32,598 - INFO - Epoch 18, Step 60/549: Loss=0.806345, K=0.010, C=0.050, S=0.461, LR=1.11e-05 | |
| 2025-09-04 11:52:06,711 - INFO - Epoch 18, Step 70/549: Loss=0.811512, K=0.005, C=0.046, S=0.461, LR=1.07e-05 | |
| 2025-09-04 11:52:41,491 - INFO - Epoch 18, Step 80/549: Loss=0.809333, K=0.008, C=0.046, S=0.461, LR=1.04e-05 | |
| 2025-09-04 11:53:12,855 - INFO - Epoch 18, Step 90/549: Loss=0.811572, K=0.005, C=0.050, S=0.461, LR=1.00e-05 | |
| 2025-09-04 11:53:43,554 - INFO - Epoch 18, Step 100/549: Loss=0.812184, K=0.008, C=0.057, S=0.461, LR=9.66e-06 | |
| 2025-09-04 11:54:15,485 - INFO - Epoch 18, Step 110/549: Loss=0.813912, K=0.010, C=0.044, S=0.461, LR=9.32e-06 | |
| 2025-09-04 11:54:46,423 - INFO - Epoch 18, Step 120/549: Loss=0.813472, K=0.008, C=0.053, S=0.461, LR=8.99e-06 | |
| 2025-09-04 11:55:17,113 - INFO - Epoch 18, Step 130/549: Loss=0.817152, K=0.004, C=0.049, S=0.461, LR=8.67e-06 | |
| 2025-09-04 11:55:48,788 - INFO - Epoch 18, Step 140/549: Loss=0.820724, K=0.005, C=0.049, S=0.461, LR=8.34e-06 | |
| 2025-09-04 11:56:20,625 - INFO - Epoch 18, Step 150/549: Loss=0.817962, K=0.007, C=0.055, S=0.461, LR=8.03e-06 | |
| 2025-09-04 11:56:52,379 - INFO - Epoch 18, Step 160/549: Loss=0.815428, K=0.005, C=0.063, S=0.461, LR=7.72e-06 | |
| 2025-09-04 11:57:23,777 - INFO - Epoch 18, Step 170/549: Loss=0.813844, K=0.008, C=0.051, S=0.461, LR=7.42e-06 | |
| 2025-09-04 11:57:54,269 - INFO - Epoch 18, Step 180/549: Loss=0.803238, K=0.008, C=0.060, S=0.461, LR=7.12e-06 | |
| 2025-09-04 11:58:25,347 - INFO - Epoch 18, Step 190/549: Loss=0.812024, K=0.008, C=0.048, S=0.461, LR=6.83e-06 | |
| 2025-09-04 11:58:57,202 - INFO - Epoch 18, Step 200/549: Loss=0.814483, K=0.008, C=0.052, S=0.461, LR=6.55e-06 | |
| 2025-09-04 11:59:31,134 - INFO - Epoch 18, Step 210/549: Loss=0.811044, K=0.008, C=0.052, S=0.461, LR=6.27e-06 | |
| 2025-09-04 12:00:02,838 - INFO - Epoch 18, Step 220/549: Loss=0.816624, K=0.007, C=0.056, S=0.461, LR=6.00e-06 | |
| 2025-09-04 12:00:34,052 - INFO - Epoch 18, Step 230/549: Loss=0.808674, K=0.007, C=0.051, S=0.461, LR=5.73e-06 | |
| 2025-09-04 12:01:06,055 - INFO - Epoch 18, Step 240/549: Loss=0.812694, K=0.008, C=0.046, S=0.461, LR=5.47e-06 | |
| 2025-09-04 12:01:41,216 - INFO - Epoch 18, Step 250/549: Loss=0.810793, K=0.008, C=0.060, S=0.461, LR=5.22e-06 | |
| 2025-09-04 12:02:12,913 - INFO - Epoch 18, Step 260/549: Loss=0.811579, K=0.008, C=0.044, S=0.461, LR=4.97e-06 | |
| 2025-09-04 12:02:44,183 - INFO - Epoch 18, Step 270/549: Loss=0.820372, K=0.005, C=0.053, S=0.461, LR=4.73e-06 | |
| 2025-09-04 12:03:18,455 - INFO - Epoch 18, Step 280/549: Loss=0.808545, K=0.007, C=0.043, S=0.461, LR=4.49e-06 | |
| 2025-09-04 12:03:52,581 - INFO - Epoch 18, Step 290/549: Loss=0.808770, K=0.008, C=0.058, S=0.461, LR=4.26e-06 | |
| 2025-09-04 12:04:25,006 - INFO - Epoch 18, Step 300/549: Loss=0.821251, K=0.008, C=0.050, S=0.461, LR=4.03e-06 | |
| 2025-09-04 12:04:59,012 - INFO - Epoch 18, Step 310/549: Loss=0.817162, K=0.008, C=0.057, S=0.461, LR=3.82e-06 | |
| 2025-09-04 12:05:32,252 - INFO - Epoch 18, Step 320/549: Loss=0.813141, K=0.007, C=0.052, S=0.461, LR=3.60e-06 | |
| 2025-09-04 12:06:06,285 - INFO - Epoch 18, Step 330/549: Loss=0.809896, K=0.005, C=0.064, S=0.461, LR=3.40e-06 | |
| 2025-09-04 12:06:41,062 - INFO - Epoch 18, Step 340/549: Loss=0.815352, K=0.004, C=0.047, S=0.461, LR=3.20e-06 | |
| 2025-09-04 12:07:17,016 - INFO - Epoch 18, Step 350/549: Loss=0.818691, K=0.005, C=0.049, S=0.461, LR=3.00e-06 | |
| 2025-09-04 12:07:52,840 - INFO - Epoch 18, Step 360/549: Loss=0.812978, K=0.005, C=0.042, S=0.461, LR=2.82e-06 | |
| 2025-09-04 12:08:27,822 - INFO - Epoch 18, Step 370/549: Loss=0.814032, K=0.004, C=0.055, S=0.461, LR=2.63e-06 | |
| 2025-09-04 12:09:00,934 - INFO - Epoch 18, Step 380/549: Loss=0.815932, K=0.008, C=0.043, S=0.461, LR=2.46e-06 | |
| 2025-09-04 12:09:32,758 - INFO - Epoch 18, Step 390/549: Loss=0.812755, K=0.006, C=0.053, S=0.461, LR=2.29e-06 | |
| 2025-09-04 12:10:04,718 - INFO - Epoch 18, Step 400/549: Loss=0.811580, K=0.005, C=0.059, S=0.461, LR=2.13e-06 | |
| 2025-09-04 12:10:35,262 - INFO - Epoch 18, Step 410/549: Loss=0.805872, K=0.008, C=0.048, S=0.461, LR=1.97e-06 | |
| 2025-09-04 12:11:05,505 - INFO - Epoch 18, Step 420/549: Loss=0.812275, K=0.004, C=0.052, S=0.461, LR=1.82e-06 | |
| 2025-09-04 12:11:37,209 - INFO - Epoch 18, Step 430/549: Loss=0.808113, K=0.005, C=0.047, S=0.461, LR=1.67e-06 | |
| 2025-09-04 12:12:09,731 - INFO - Epoch 18, Step 440/549: Loss=0.813275, K=0.007, C=0.051, S=0.461, LR=1.53e-06 | |
| 2025-09-04 12:12:41,746 - INFO - Epoch 18, Step 450/549: Loss=0.812143, K=0.007, C=0.047, S=0.461, LR=1.40e-06 | |
| 2025-09-04 12:13:13,550 - INFO - Epoch 18, Step 460/549: Loss=0.810323, K=0.005, C=0.050, S=0.461, LR=1.27e-06 | |
| 2025-09-04 12:13:44,647 - INFO - Epoch 18, Step 470/549: Loss=0.811157, K=0.005, C=0.050, S=0.461, LR=1.15e-06 | |
| 2025-09-04 12:14:16,194 - INFO - Epoch 18, Step 480/549: Loss=0.809499, K=0.007, C=0.051, S=0.461, LR=1.03e-06 | |
| 2025-09-04 12:14:47,364 - INFO - Epoch 18, Step 490/549: Loss=0.813120, K=0.008, C=0.056, S=0.461, LR=9.26e-07 | |
| 2025-09-04 12:15:20,704 - INFO - Epoch 18, Step 500/549: Loss=0.810548, K=0.004, C=0.054, S=0.461, LR=8.23e-07 | |
| 2025-09-04 12:15:53,750 - INFO - Epoch 18, Step 510/549: Loss=0.812020, K=0.007, C=0.058, S=0.461, LR=7.26e-07 | |
| 2025-09-04 12:16:25,435 - INFO - Epoch 18, Step 520/549: Loss=0.812569, K=0.005, C=0.054, S=0.461, LR=6.36e-07 | |
| 2025-09-04 12:16:57,415 - INFO - Epoch 18, Step 530/549: Loss=0.811905, K=0.004, C=0.052, S=0.461, LR=5.51e-07 | |
| 2025-09-04 12:17:29,381 - INFO - Epoch 18, Step 540/549: Loss=0.808056, K=0.006, C=0.054, S=0.461, LR=4.72e-07 | |
| 2025-09-04 12:17:55,498 - INFO - Epoch 18 completed in 1783.6s: Avg Loss=0.812449, K=0.006, C=0.051, S=0.461 | |
| 2025-09-04 12:17:55,739 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |
| 2025-09-04 12:17:56,142 - INFO - NEW BEST MODEL! Loss: 0.812449 -> /data/BitTransformerLM/checkpoints/checkpoint_best.pt | |
| 2025-09-04 12:17:56,150 - INFO - === EPOCH 18 COMPLETE === | |
| 2025-09-04 12:17:56,150 - INFO - Loss: 0.812449 (best: 0.812449) | |
| 2025-09-04 12:17:56,150 - INFO - ๐ BREAKTHROUGH PERFORMANCE ACHIEVED! Loss < 3.0! | |
| 2025-09-04 12:17:56,150 - INFO - Starting epoch 19 | |
| 2025-09-04 12:17:59,385 - INFO - Epoch 19, Step 0/549: Loss=0.811162, K=0.005, C=0.052, S=0.461, LR=4.07e-07 | |
| 2025-09-04 12:18:30,768 - INFO - Epoch 19, Step 10/549: Loss=0.809241, K=0.006, C=0.052, S=0.461, LR=3.40e-07 | |
| 2025-09-04 12:19:04,072 - INFO - Epoch 19, Step 20/549: Loss=0.820437, K=0.006, C=0.052, S=0.461, LR=2.79e-07 | |
| 2025-09-04 12:19:35,434 - INFO - Epoch 19, Step 30/549: Loss=0.811057, K=0.005, C=0.051, S=0.461, LR=2.24e-07 | |
| 2025-09-04 12:20:10,302 - INFO - Epoch 19, Step 40/549: Loss=0.811471, K=0.005, C=0.054, S=0.461, LR=1.75e-07 | |
| 2025-09-04 12:20:44,940 - INFO - Epoch 19, Step 50/549: Loss=0.808459, K=0.005, C=0.053, S=0.461, LR=1.33e-07 | |
| 2025-09-04 12:21:19,184 - INFO - Epoch 19, Step 60/549: Loss=0.819490, K=0.005, C=0.050, S=0.461, LR=9.61e-08 | |
| 2025-09-04 12:21:54,205 - INFO - Epoch 19, Step 70/549: Loss=0.807116, K=0.005, C=0.053, S=0.461, LR=6.57e-08 | |
| 2025-09-04 12:22:28,047 - INFO - Epoch 19, Step 80/549: Loss=0.809450, K=0.005, C=0.053, S=0.461, LR=4.13e-08 | |
| 2025-09-04 12:23:00,225 - INFO - Epoch 19, Step 90/549: Loss=0.814382, K=0.005, C=0.052, S=0.461, LR=2.30e-08 | |
| 2025-09-04 12:23:31,683 - INFO - Epoch 19, Step 100/549: Loss=0.809265, K=0.006, C=0.052, S=0.461, LR=1.09e-08 | |
| 2025-09-04 12:24:02,794 - INFO - Epoch 19, Step 110/549: Loss=0.810947, K=0.004, C=0.053, S=0.461, LR=4.76e-09 | |
| 2025-09-04 12:24:24,928 - ERROR - Error in epoch 19: Tried to step 10001 times. The specified number of total steps is 10000 | |
| 2025-09-04 12:24:25,133 - INFO - Saved checkpoint: /data/BitTransformerLM/checkpoints/checkpoint_latest.pt | |