diff --git a/README.md b/README.md
index 7da79263d0e701a53a0d662d245c4fc3db39b287..41c4f61dad0f5bf885b21cb8bfab0aec03dea384 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,78 @@
----
-license: cc-by-4.0
----
+---
+library_name: transformers
+tags:
+- generated_from_trainer
+metrics:
+- accuracy
+model-index:
+- name: dna_model
+  results: []
+---
+
+
+
+# dna_model
+
+This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.0299
+- Accuracy: 0.5324
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed. (Minimal, illustrative usage sketches are appended at the end of this diff.)
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 0.0003
+- train_batch_size: 64
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- total_train_batch_size: 256
+- total_eval_batch_size: 32
+- optimizer: AdamW (torch) with betas=(0.9,0.999) and epsilon=1e-08; no additional optimizer arguments
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_steps: 1000
+- num_epochs: 10.0
+- mixed_precision_training: Native AMP
+
+### Training results
+
+| Training Loss | Epoch | Step | Validation Loss | Accuracy |
+|:-------------:|:------:|:-----:|:---------------:|:--------:|
+| 1.1252 | 0.6908 | 5000 | 1.1206 | 0.4745 |
+| 1.0835 | 1.3816 | 10000 | 1.0814 | 0.4991 |
+| 1.0641 | 2.0724 | 15000 | 1.0639 | 0.5103 |
+| 1.0563 | 2.7632 | 20000 | 1.0547 | 0.5163 |
+| 1.0504 | 3.4540 | 25000 | 1.0486 | 0.5204 |
+| 1.0439 | 4.1448 | 30000 | 1.0439 | 0.5233 |
+| 1.0425 | 4.8356 | 35000 | 1.0407 | 0.5254 |
+| 1.0365 | 5.5264 | 40000 | 1.0380 | 0.5271 |
+| 1.0325 | 6.2172 | 45000 | 1.0361 | 0.5284 |
+| 1.0322 | 6.9080 | 50000 | 1.0341 | 0.5296 |
+| 1.0307 | 7.5988 | 55000 | 1.0328 | 0.5305 |
+| 1.0267 | 8.2896 | 60000 | 1.0316 | 0.5313 |
+| 1.0273 | 8.9804 | 65000 | 1.0306 | 0.5320 |
+| 1.027 | 9.6712 | 70000 | 1.0299 | 0.5324 |
+
+
+### Framework versions
+
+- Transformers 4.52.0.dev0
+- Pytorch 2.3.0+cu121
+- Datasets 3.0.0
+- Tokenizers 0.21.1
diff --git a/all_results.json b/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca2097f016bb156a70338923a11008f1bd6ce667
--- /dev/null
+++ b/all_results.json
@@ -0,0 +1,16 @@
+{
+ "epoch": 10.0,
+ "eval_accuracy": 0.5323623139821072,
+ "eval_loss": 1.029943823814392,
+ "eval_runtime": 721.9591,
+ "eval_samples": 205861,
+ "eval_samples_per_second": 285.142,
+ "eval_steps_per_second": 8.912,
+ "perplexity": 2.8009084859245172,
+ "total_flos": 9.683106445125485e+18,
+ "train_loss": 1.0569831334908848,
+ "train_runtime": 55085.3166,
+ "train_samples": 1852919,
+ "train_samples_per_second": 336.373,
+ "train_steps_per_second": 1.314
+}
\ No newline at end of file
diff --git a/checkpoint-65000/config.json b/checkpoint-65000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..192ba05a8714569e728cced45eaebf4106596353
--- /dev/null
+++ b/checkpoint-65000/config.json
@@ -0,0 +1,31 @@
+{
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2LMHeadModel"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 50256,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 50256,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_embd": 768,
+ "n_head": 12,
+ "n_inner": null,
+ "n_layer": 12,
+ "n_positions":
1024, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.52.0.dev0", + "use_cache": true, + "vocab_size": 5 +} diff --git a/checkpoint-65000/generation_config.json b/checkpoint-65000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c829fa47bd90bfe00fdb37ed6d41324f6fb81f63 --- /dev/null +++ b/checkpoint-65000/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 50256, + "eos_token_id": 50256, + "transformers_version": "4.52.0.dev0" +} diff --git a/checkpoint-65000/merges.txt b/checkpoint-65000/merges.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e7f1fd94996c8e2b65adea828af1b398eace61f --- /dev/null +++ b/checkpoint-65000/merges.txt @@ -0,0 +1 @@ +#version: 0.2 diff --git a/checkpoint-65000/model.safetensors b/checkpoint-65000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..efb7ff00dccb5ebf1259f8d6e1d708c291559bbc --- /dev/null +++ b/checkpoint-65000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0b2555f6669e4ccea28c5ca35ca80703169550c0089256aef6f76de2fb02c03 +size 343400064 diff --git a/checkpoint-65000/optimizer.pt b/checkpoint-65000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9302ad7809ed281843c862080f11b9d1b3f825c2 --- /dev/null +++ b/checkpoint-65000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90b816e703a34fa1d32bb6a8ab69151a6094d3d559265e1abde89cacb4d87f62 +size 686894010 diff --git a/checkpoint-65000/rng_state_0.pth b/checkpoint-65000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c997380b6b198035d90575a17acbd81d31c692cf --- /dev/null +++ b/checkpoint-65000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a73a83b1c17f13cda5530062a0df264cb7442b67bef02fa6d887e5c28bad11ae +size 14960 diff --git a/checkpoint-65000/rng_state_1.pth b/checkpoint-65000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..95fd3ba6aa8dca62a75a322194237d81db34f3b0 --- /dev/null +++ b/checkpoint-65000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b193f708cbd330ff0c9344e6b41e847f8fc3944ac5894b34157b1867d2df2b72 +size 14960 diff --git a/checkpoint-65000/rng_state_2.pth b/checkpoint-65000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..71c1af02db2d685c296335c62b810d5491384940 --- /dev/null +++ b/checkpoint-65000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38febdeb5d9aabe5b12704ae9ba82c3a576d9abd99c1ad82f0d4f1b455f9ac53 +size 14960 diff --git a/checkpoint-65000/rng_state_3.pth b/checkpoint-65000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ab03cf8eb068a0f694ffa38755d3e37bd19e4f1b --- /dev/null +++ b/checkpoint-65000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:777f88144775fe8de99b4817ca0fdff9b09cd701f79f6d84d35f15d064018e27 +size 14960 diff --git a/checkpoint-65000/scaler.pt b/checkpoint-65000/scaler.pt new file mode 100644 index 
0000000000000000000000000000000000000000..6faef85be36059e5312afd9e26e5e44cd6007ed5 --- /dev/null +++ b/checkpoint-65000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fecf213c35ff38444ea6916370882432e71bfe60a14e22fef3af0107af356e7 +size 988 diff --git a/checkpoint-65000/scheduler.pt b/checkpoint-65000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5bcd7cd981fe5973577aac902f4f3670f5ca39a6 --- /dev/null +++ b/checkpoint-65000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37b25fb6861357a59b677028c1ca6445fb652f33d7914a60e59280a0be350c36 +size 1064 diff --git a/checkpoint-65000/special_tokens_map.json b/checkpoint-65000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..773bd68cf0900427f8d69dd974724e3abb9a08a9 --- /dev/null +++ b/checkpoint-65000/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-65000/tokenizer.json b/checkpoint-65000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..ac40039af791f0fd130b3d36c3677a156b2de089 --- /dev/null +++ b/checkpoint-65000/tokenizer.json @@ -0,0 +1,53 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": false, + "use_regex": true + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": "", + "end_of_word_suffix": "", + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "<|endoftext|>": 0, + "A": 1, + "C": 2, + "G": 3, + "T": 4 + }, + "merges": [] + } +} \ No newline at end of file diff --git a/checkpoint-65000/tokenizer_config.json b/checkpoint-65000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7c4e19588fa8b4faceab450a1d7e8dae1ce87f7c --- /dev/null +++ b/checkpoint-65000/tokenizer_config.json @@ -0,0 +1,21 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|endoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "<|endoftext|>" +} diff --git a/checkpoint-65000/trainer_state.json b/checkpoint-65000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d3873bb777a01e031116b0d26c74892b71636e54 --- /dev/null 
+++ b/checkpoint-65000/trainer_state.json @@ -0,0 +1,4708 @@ +{ + "best_global_step": 65000, + "best_metric": 1.0305662155151367, + "best_model_checkpoint": "./dna_model/checkpoint-65000", + "epoch": 8.980381320806853, + "eval_steps": 5000, + "global_step": 65000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00013815971262779773, + "grad_norm": 42.580928802490234, + "learning_rate": 0.0, + "loss": 1.6625, + "step": 1 + }, + { + "epoch": 0.013815971262779773, + "grad_norm": 2.4457767009735107, + "learning_rate": 2.97e-05, + "loss": 1.36, + "step": 100 + }, + { + "epoch": 0.027631942525559547, + "grad_norm": 0.5432274341583252, + "learning_rate": 5.97e-05, + "loss": 1.3309, + "step": 200 + }, + { + "epoch": 0.04144791378833932, + "grad_norm": 0.825528621673584, + "learning_rate": 8.969999999999998e-05, + "loss": 1.3234, + "step": 300 + }, + { + "epoch": 0.055263885051119094, + "grad_norm": 0.4912604093551636, + "learning_rate": 0.0001197, + "loss": 1.3249, + "step": 400 + }, + { + "epoch": 0.06907985631389886, + "grad_norm": 0.9077563881874084, + "learning_rate": 0.00014969999999999998, + "loss": 1.3153, + "step": 500 + }, + { + "epoch": 0.08289582757667864, + "grad_norm": 0.8954246640205383, + "learning_rate": 0.00017969999999999998, + "loss": 1.3123, + "step": 600 + }, + { + "epoch": 0.09671179883945841, + "grad_norm": 0.5876831412315369, + "learning_rate": 0.00020969999999999997, + "loss": 1.3098, + "step": 700 + }, + { + "epoch": 0.11052777010223819, + "grad_norm": 0.426789253950119, + "learning_rate": 0.0002397, + "loss": 1.3072, + "step": 800 + }, + { + "epoch": 0.12434374136501795, + "grad_norm": 0.3324718177318573, + "learning_rate": 0.0002697, + "loss": 1.3037, + "step": 900 + }, + { + "epoch": 0.13815971262779772, + "grad_norm": 0.23672613501548767, + "learning_rate": 0.00029969999999999997, + "loss": 1.2991, + "step": 1000 + }, + { + "epoch": 0.1519756838905775, + "grad_norm": 0.4699796438217163, + "learning_rate": 0.00029958391706360325, + "loss": 1.2923, + "step": 1100 + }, + { + "epoch": 0.16579165515335728, + "grad_norm": 0.684186577796936, + "learning_rate": 0.00029916363126926307, + "loss": 1.2825, + "step": 1200 + }, + { + "epoch": 0.17960762641613706, + "grad_norm": 0.3944641649723053, + "learning_rate": 0.00029874334547492294, + "loss": 1.2678, + "step": 1300 + }, + { + "epoch": 0.19342359767891681, + "grad_norm": 1.1556001901626587, + "learning_rate": 0.00029832305968058276, + "loss": 1.2541, + "step": 1400 + }, + { + "epoch": 0.2072395689416966, + "grad_norm": 0.39745599031448364, + "learning_rate": 0.0002979027738862426, + "loss": 1.2439, + "step": 1500 + }, + { + "epoch": 0.22105554020447638, + "grad_norm": 0.5201444029808044, + "learning_rate": 0.00029748248809190246, + "loss": 1.2329, + "step": 1600 + }, + { + "epoch": 0.23487151146725616, + "grad_norm": 0.2168777734041214, + "learning_rate": 0.00029706220229756234, + "loss": 1.2268, + "step": 1700 + }, + { + "epoch": 0.2486874827300359, + "grad_norm": 0.30599427223205566, + "learning_rate": 0.00029664191650322216, + "loss": 1.2199, + "step": 1800 + }, + { + "epoch": 0.2625034539928157, + "grad_norm": 0.32062044739723206, + "learning_rate": 0.00029622163070888203, + "loss": 1.2131, + "step": 1900 + }, + { + "epoch": 0.27631942525559544, + "grad_norm": 0.13411013782024384, + "learning_rate": 0.00029580134491454186, + "loss": 1.2074, + "step": 2000 + }, + { + "epoch": 0.2901353965183752, + "grad_norm": 0.3672633767127991, + 
"learning_rate": 0.00029538105912020173, + "loss": 1.2022, + "step": 2100 + }, + { + "epoch": 0.303951367781155, + "grad_norm": 0.41515815258026123, + "learning_rate": 0.00029496077332586155, + "loss": 1.1949, + "step": 2200 + }, + { + "epoch": 0.3177673390439348, + "grad_norm": 0.18381068110466003, + "learning_rate": 0.0002945404875315214, + "loss": 1.1887, + "step": 2300 + }, + { + "epoch": 0.33158331030671456, + "grad_norm": 0.3080751895904541, + "learning_rate": 0.00029412020173718125, + "loss": 1.1844, + "step": 2400 + }, + { + "epoch": 0.34539928156949434, + "grad_norm": 0.38037416338920593, + "learning_rate": 0.0002936999159428411, + "loss": 1.1804, + "step": 2500 + }, + { + "epoch": 0.3592152528322741, + "grad_norm": 0.23272989690303802, + "learning_rate": 0.00029327963014850095, + "loss": 1.1753, + "step": 2600 + }, + { + "epoch": 0.3730312240950539, + "grad_norm": 0.1149936243891716, + "learning_rate": 0.0002928593443541608, + "loss": 1.1739, + "step": 2700 + }, + { + "epoch": 0.38684719535783363, + "grad_norm": 0.28469276428222656, + "learning_rate": 0.00029243905855982064, + "loss": 1.1671, + "step": 2800 + }, + { + "epoch": 0.4006631666206134, + "grad_norm": 0.25204166769981384, + "learning_rate": 0.0002920187727654805, + "loss": 1.1633, + "step": 2900 + }, + { + "epoch": 0.4144791378833932, + "grad_norm": 0.3945861756801605, + "learning_rate": 0.00029159848697114034, + "loss": 1.1608, + "step": 3000 + }, + { + "epoch": 0.42829510914617297, + "grad_norm": 0.2578865587711334, + "learning_rate": 0.00029117820117680016, + "loss": 1.1622, + "step": 3100 + }, + { + "epoch": 0.44211108040895275, + "grad_norm": 0.16060177981853485, + "learning_rate": 0.00029075791538246004, + "loss": 1.1577, + "step": 3200 + }, + { + "epoch": 0.45592705167173253, + "grad_norm": 0.1980718970298767, + "learning_rate": 0.0002903376295881199, + "loss": 1.155, + "step": 3300 + }, + { + "epoch": 0.4697430229345123, + "grad_norm": 0.12515653669834137, + "learning_rate": 0.00028991734379377974, + "loss": 1.1519, + "step": 3400 + }, + { + "epoch": 0.4835589941972921, + "grad_norm": 0.26255738735198975, + "learning_rate": 0.0002894970579994396, + "loss": 1.1523, + "step": 3500 + }, + { + "epoch": 0.4973749654600718, + "grad_norm": 0.281464546918869, + "learning_rate": 0.00028907677220509943, + "loss": 1.1511, + "step": 3600 + }, + { + "epoch": 0.5111909367228517, + "grad_norm": 0.11816036701202393, + "learning_rate": 0.0002886564864107593, + "loss": 1.1469, + "step": 3700 + }, + { + "epoch": 0.5250069079856314, + "grad_norm": 0.25923675298690796, + "learning_rate": 0.00028823620061641913, + "loss": 1.1456, + "step": 3800 + }, + { + "epoch": 0.5388228792484112, + "grad_norm": 0.2766472399234772, + "learning_rate": 0.00028781591482207895, + "loss": 1.1442, + "step": 3900 + }, + { + "epoch": 0.5526388505111909, + "grad_norm": 0.1701624095439911, + "learning_rate": 0.00028739562902773883, + "loss": 1.1445, + "step": 4000 + }, + { + "epoch": 0.5664548217739707, + "grad_norm": 0.3141656219959259, + "learning_rate": 0.0002869753432333987, + "loss": 1.1392, + "step": 4100 + }, + { + "epoch": 0.5802707930367504, + "grad_norm": 0.11816743016242981, + "learning_rate": 0.0002865550574390585, + "loss": 1.1406, + "step": 4200 + }, + { + "epoch": 0.5940867642995302, + "grad_norm": 0.12762723863124847, + "learning_rate": 0.0002861347716447184, + "loss": 1.1361, + "step": 4300 + }, + { + "epoch": 0.60790273556231, + "grad_norm": 0.09322622418403625, + "learning_rate": 0.0002857144858503782, + "loss": 1.134, + "step": 4400 + }, 
+ { + "epoch": 0.6217187068250898, + "grad_norm": 0.1586735099554062, + "learning_rate": 0.0002852942000560381, + "loss": 1.1336, + "step": 4500 + }, + { + "epoch": 0.6355346780878696, + "grad_norm": 0.13594642281532288, + "learning_rate": 0.0002848739142616979, + "loss": 1.1328, + "step": 4600 + }, + { + "epoch": 0.6493506493506493, + "grad_norm": 0.21865279972553253, + "learning_rate": 0.00028445362846735774, + "loss": 1.1311, + "step": 4700 + }, + { + "epoch": 0.6631666206134291, + "grad_norm": 0.22787001729011536, + "learning_rate": 0.0002840333426730176, + "loss": 1.1271, + "step": 4800 + }, + { + "epoch": 0.6769825918762089, + "grad_norm": 0.2334531843662262, + "learning_rate": 0.0002836130568786775, + "loss": 1.1291, + "step": 4900 + }, + { + "epoch": 0.6907985631389887, + "grad_norm": 0.11103236675262451, + "learning_rate": 0.0002831927710843373, + "loss": 1.1252, + "step": 5000 + }, + { + "epoch": 0.6907985631389887, + "eval_accuracy": 0.4745045939970608, + "eval_loss": 1.1205766201019287, + "eval_runtime": 1027.9902, + "eval_samples_per_second": 200.256, + "eval_steps_per_second": 6.259, + "step": 5000 + }, + { + "epoch": 0.7046145344017685, + "grad_norm": 0.21742330491542816, + "learning_rate": 0.0002827724852899972, + "loss": 1.1235, + "step": 5100 + }, + { + "epoch": 0.7184305056645482, + "grad_norm": 0.23728515207767487, + "learning_rate": 0.000282352199495657, + "loss": 1.1233, + "step": 5200 + }, + { + "epoch": 0.732246476927328, + "grad_norm": 0.21022765338420868, + "learning_rate": 0.0002819319137013169, + "loss": 1.1236, + "step": 5300 + }, + { + "epoch": 0.7460624481901078, + "grad_norm": 0.0924484059214592, + "learning_rate": 0.0002815116279069767, + "loss": 1.1215, + "step": 5400 + }, + { + "epoch": 0.7598784194528876, + "grad_norm": 0.1716778427362442, + "learning_rate": 0.00028109134211263653, + "loss": 1.1238, + "step": 5500 + }, + { + "epoch": 0.7736943907156673, + "grad_norm": 0.13049638271331787, + "learning_rate": 0.0002806710563182964, + "loss": 1.1185, + "step": 5600 + }, + { + "epoch": 0.787510361978447, + "grad_norm": 0.16255174577236176, + "learning_rate": 0.0002802507705239563, + "loss": 1.1169, + "step": 5700 + }, + { + "epoch": 0.8013263332412268, + "grad_norm": 0.10065080225467682, + "learning_rate": 0.0002798304847296161, + "loss": 1.1184, + "step": 5800 + }, + { + "epoch": 0.8151423045040066, + "grad_norm": 0.1182553768157959, + "learning_rate": 0.000279410198935276, + "loss": 1.1141, + "step": 5900 + }, + { + "epoch": 0.8289582757667864, + "grad_norm": 0.14556263387203217, + "learning_rate": 0.0002789899131409358, + "loss": 1.1154, + "step": 6000 + }, + { + "epoch": 0.8427742470295662, + "grad_norm": 0.1383764147758484, + "learning_rate": 0.00027857383020453907, + "loss": 1.1118, + "step": 6100 + }, + { + "epoch": 0.8565902182923459, + "grad_norm": 0.2821154296398163, + "learning_rate": 0.00027815354441019895, + "loss": 1.1104, + "step": 6200 + }, + { + "epoch": 0.8704061895551257, + "grad_norm": 0.22286450862884521, + "learning_rate": 0.00027773325861585877, + "loss": 1.1109, + "step": 6300 + }, + { + "epoch": 0.8842221608179055, + "grad_norm": 0.2058987319469452, + "learning_rate": 0.0002773129728215186, + "loss": 1.1093, + "step": 6400 + }, + { + "epoch": 0.8980381320806853, + "grad_norm": 0.21338045597076416, + "learning_rate": 0.00027689268702717847, + "loss": 1.1091, + "step": 6500 + }, + { + "epoch": 0.9118541033434651, + "grad_norm": 0.0900028795003891, + "learning_rate": 0.0002764724012328383, + "loss": 1.1067, + "step": 6600 + }, + { + 
"epoch": 0.9256700746062448, + "grad_norm": 0.10679551959037781, + "learning_rate": 0.00027605211543849816, + "loss": 1.108, + "step": 6700 + }, + { + "epoch": 0.9394860458690246, + "grad_norm": 0.07972779124975204, + "learning_rate": 0.000275631829644158, + "loss": 1.1057, + "step": 6800 + }, + { + "epoch": 0.9533020171318044, + "grad_norm": 0.24500218033790588, + "learning_rate": 0.00027521154384981786, + "loss": 1.105, + "step": 6900 + }, + { + "epoch": 0.9671179883945842, + "grad_norm": 0.11576998978853226, + "learning_rate": 0.00027479125805547774, + "loss": 1.1029, + "step": 7000 + }, + { + "epoch": 0.980933959657364, + "grad_norm": 0.10553757101297379, + "learning_rate": 0.00027437097226113756, + "loss": 1.1041, + "step": 7100 + }, + { + "epoch": 0.9947499309201436, + "grad_norm": 0.15332186222076416, + "learning_rate": 0.0002739506864667974, + "loss": 1.0982, + "step": 7200 + }, + { + "epoch": 1.0085659021829234, + "grad_norm": 0.11897014081478119, + "learning_rate": 0.00027353040067245725, + "loss": 1.0996, + "step": 7300 + }, + { + "epoch": 1.0223818734457033, + "grad_norm": 0.1156444102525711, + "learning_rate": 0.0002731101148781171, + "loss": 1.1032, + "step": 7400 + }, + { + "epoch": 1.036197844708483, + "grad_norm": 0.06223931908607483, + "learning_rate": 0.00027268982908377695, + "loss": 1.0982, + "step": 7500 + }, + { + "epoch": 1.0500138159712629, + "grad_norm": 0.14377152919769287, + "learning_rate": 0.00027226954328943677, + "loss": 1.1003, + "step": 7600 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 0.12667153775691986, + "learning_rate": 0.00027184925749509665, + "loss": 1.0989, + "step": 7700 + }, + { + "epoch": 1.0776457584968224, + "grad_norm": 0.16101804375648499, + "learning_rate": 0.0002714289717007565, + "loss": 1.0968, + "step": 7800 + }, + { + "epoch": 1.091461729759602, + "grad_norm": 0.06424383819103241, + "learning_rate": 0.00027100868590641635, + "loss": 1.0955, + "step": 7900 + }, + { + "epoch": 1.105277701022382, + "grad_norm": 0.09638939052820206, + "learning_rate": 0.00027058840011207617, + "loss": 1.095, + "step": 8000 + }, + { + "epoch": 1.1190936722851617, + "grad_norm": 0.08098015189170837, + "learning_rate": 0.00027016811431773604, + "loss": 1.0969, + "step": 8100 + }, + { + "epoch": 1.1329096435479413, + "grad_norm": 0.10837887227535248, + "learning_rate": 0.00026974782852339586, + "loss": 1.096, + "step": 8200 + }, + { + "epoch": 1.1467256148107212, + "grad_norm": 0.05644046515226364, + "learning_rate": 0.00026932754272905574, + "loss": 1.0944, + "step": 8300 + }, + { + "epoch": 1.1605415860735009, + "grad_norm": 0.12965446710586548, + "learning_rate": 0.00026890725693471556, + "loss": 1.0953, + "step": 8400 + }, + { + "epoch": 1.1743575573362808, + "grad_norm": 0.12333771586418152, + "learning_rate": 0.00026848697114037544, + "loss": 1.095, + "step": 8500 + }, + { + "epoch": 1.1881735285990604, + "grad_norm": 0.1270703673362732, + "learning_rate": 0.0002680666853460353, + "loss": 1.0929, + "step": 8600 + }, + { + "epoch": 1.2019894998618403, + "grad_norm": 0.16918766498565674, + "learning_rate": 0.00026764639955169513, + "loss": 1.0918, + "step": 8700 + }, + { + "epoch": 1.21580547112462, + "grad_norm": 0.08776108920574188, + "learning_rate": 0.00026722611375735496, + "loss": 1.0952, + "step": 8800 + }, + { + "epoch": 1.2296214423874, + "grad_norm": 0.08252176642417908, + "learning_rate": 0.00026680582796301483, + "loss": 1.09, + "step": 8900 + }, + { + "epoch": 1.2434374136501796, + "grad_norm": 0.16331979632377625, + 
"learning_rate": 0.00026638554216867465, + "loss": 1.0898, + "step": 9000 + }, + { + "epoch": 1.2572533849129595, + "grad_norm": 0.17065368592739105, + "learning_rate": 0.00026596525637433453, + "loss": 1.0907, + "step": 9100 + }, + { + "epoch": 1.2710693561757391, + "grad_norm": 0.12038784474134445, + "learning_rate": 0.00026554497057999435, + "loss": 1.0856, + "step": 9200 + }, + { + "epoch": 1.284885327438519, + "grad_norm": 0.11924347281455994, + "learning_rate": 0.0002651246847856542, + "loss": 1.0895, + "step": 9300 + }, + { + "epoch": 1.2987012987012987, + "grad_norm": 0.1443828046321869, + "learning_rate": 0.0002647043989913141, + "loss": 1.0874, + "step": 9400 + }, + { + "epoch": 1.3125172699640784, + "grad_norm": 0.14472317695617676, + "learning_rate": 0.0002642841131969739, + "loss": 1.0879, + "step": 9500 + }, + { + "epoch": 1.3263332412268583, + "grad_norm": 0.15847088396549225, + "learning_rate": 0.00026386382740263374, + "loss": 1.0873, + "step": 9600 + }, + { + "epoch": 1.3401492124896381, + "grad_norm": 0.17960332334041595, + "learning_rate": 0.0002634435416082936, + "loss": 1.0887, + "step": 9700 + }, + { + "epoch": 1.3539651837524178, + "grad_norm": 0.1566227227449417, + "learning_rate": 0.00026302325581395344, + "loss": 1.0884, + "step": 9800 + }, + { + "epoch": 1.3677811550151975, + "grad_norm": 0.1431213617324829, + "learning_rate": 0.0002626029700196133, + "loss": 1.0864, + "step": 9900 + }, + { + "epoch": 1.3815971262779774, + "grad_norm": 0.10321222990751266, + "learning_rate": 0.0002621826842252732, + "loss": 1.0835, + "step": 10000 + }, + { + "epoch": 1.3815971262779774, + "eval_accuracy": 0.49913821881815945, + "eval_loss": 1.081355094909668, + "eval_runtime": 748.8314, + "eval_samples_per_second": 274.91, + "eval_steps_per_second": 8.592, + "step": 10000 + }, + { + "epoch": 1.395413097540757, + "grad_norm": 0.10260605067014694, + "learning_rate": 0.0002617666012888764, + "loss": 1.0843, + "step": 10100 + }, + { + "epoch": 1.409229068803537, + "grad_norm": 0.1076885387301445, + "learning_rate": 0.0002613463154945363, + "loss": 1.0845, + "step": 10200 + }, + { + "epoch": 1.4230450400663166, + "grad_norm": 0.0723571702837944, + "learning_rate": 0.0002609260297001961, + "loss": 1.0814, + "step": 10300 + }, + { + "epoch": 1.4368610113290965, + "grad_norm": 0.10695687681436539, + "learning_rate": 0.00026050574390585593, + "loss": 1.0842, + "step": 10400 + }, + { + "epoch": 1.4506769825918762, + "grad_norm": 0.11008185893297195, + "learning_rate": 0.0002600854581115158, + "loss": 1.0832, + "step": 10500 + }, + { + "epoch": 1.464492953854656, + "grad_norm": 0.12239653617143631, + "learning_rate": 0.0002596651723171756, + "loss": 1.0813, + "step": 10600 + }, + { + "epoch": 1.4783089251174357, + "grad_norm": 0.11045056581497192, + "learning_rate": 0.0002592448865228355, + "loss": 1.0848, + "step": 10700 + }, + { + "epoch": 1.4921248963802154, + "grad_norm": 0.07234488427639008, + "learning_rate": 0.0002588246007284954, + "loss": 1.0826, + "step": 10800 + }, + { + "epoch": 1.5059408676429953, + "grad_norm": 0.11086778342723846, + "learning_rate": 0.0002584043149341552, + "loss": 1.0804, + "step": 10900 + }, + { + "epoch": 1.5197568389057752, + "grad_norm": 0.10693442821502686, + "learning_rate": 0.0002579840291398151, + "loss": 1.0784, + "step": 11000 + }, + { + "epoch": 1.5335728101685548, + "grad_norm": 0.11604110896587372, + "learning_rate": 0.0002575637433454749, + "loss": 1.0792, + "step": 11100 + }, + { + "epoch": 1.5473887814313345, + "grad_norm": 
0.0809662714600563, + "learning_rate": 0.0002571434575511347, + "loss": 1.083, + "step": 11200 + }, + { + "epoch": 1.5612047526941144, + "grad_norm": 0.1850002408027649, + "learning_rate": 0.0002567231717567946, + "loss": 1.0802, + "step": 11300 + }, + { + "epoch": 1.5750207239568943, + "grad_norm": 0.0779227465391159, + "learning_rate": 0.0002563028859624544, + "loss": 1.0811, + "step": 11400 + }, + { + "epoch": 1.588836695219674, + "grad_norm": 0.16764625906944275, + "learning_rate": 0.0002558826001681143, + "loss": 1.0763, + "step": 11500 + }, + { + "epoch": 1.6026526664824536, + "grad_norm": 0.11104313284158707, + "learning_rate": 0.00025546231437377417, + "loss": 1.0782, + "step": 11600 + }, + { + "epoch": 1.6164686377452335, + "grad_norm": 0.16667212545871735, + "learning_rate": 0.000255042028579434, + "loss": 1.0781, + "step": 11700 + }, + { + "epoch": 1.6302846090080134, + "grad_norm": 0.2246047705411911, + "learning_rate": 0.00025462174278509386, + "loss": 1.08, + "step": 11800 + }, + { + "epoch": 1.644100580270793, + "grad_norm": 0.2305343896150589, + "learning_rate": 0.0002542014569907537, + "loss": 1.0756, + "step": 11900 + }, + { + "epoch": 1.6579165515335728, + "grad_norm": 0.13618823885917664, + "learning_rate": 0.0002537811711964135, + "loss": 1.076, + "step": 12000 + }, + { + "epoch": 1.6717325227963524, + "grad_norm": 0.15795475244522095, + "learning_rate": 0.0002533608854020734, + "loss": 1.0749, + "step": 12100 + }, + { + "epoch": 1.6855484940591323, + "grad_norm": 0.20267115533351898, + "learning_rate": 0.00025294480246567665, + "loss": 1.077, + "step": 12200 + }, + { + "epoch": 1.6993644653219122, + "grad_norm": 0.08052489906549454, + "learning_rate": 0.0002525245166713365, + "loss": 1.073, + "step": 12300 + }, + { + "epoch": 1.7131804365846919, + "grad_norm": 0.11914093047380447, + "learning_rate": 0.00025210423087699635, + "loss": 1.0755, + "step": 12400 + }, + { + "epoch": 1.7269964078474715, + "grad_norm": 0.12703542411327362, + "learning_rate": 0.00025168394508265617, + "loss": 1.0765, + "step": 12500 + }, + { + "epoch": 1.7408123791102514, + "grad_norm": 0.12948518991470337, + "learning_rate": 0.00025126365928831605, + "loss": 1.0748, + "step": 12600 + }, + { + "epoch": 1.7546283503730313, + "grad_norm": 0.1027710810303688, + "learning_rate": 0.00025084337349397587, + "loss": 1.0745, + "step": 12700 + }, + { + "epoch": 1.768444321635811, + "grad_norm": 0.20131652057170868, + "learning_rate": 0.0002504230876996357, + "loss": 1.0731, + "step": 12800 + }, + { + "epoch": 1.7822602928985907, + "grad_norm": 0.0673370212316513, + "learning_rate": 0.00025000280190529557, + "loss": 1.0721, + "step": 12900 + }, + { + "epoch": 1.7960762641613706, + "grad_norm": 0.10322799533605576, + "learning_rate": 0.00024958251611095544, + "loss": 1.0731, + "step": 13000 + }, + { + "epoch": 1.8098922354241505, + "grad_norm": 0.08498311042785645, + "learning_rate": 0.00024916223031661526, + "loss": 1.0722, + "step": 13100 + }, + { + "epoch": 1.8237082066869301, + "grad_norm": 0.07025079429149628, + "learning_rate": 0.00024874194452227514, + "loss": 1.0725, + "step": 13200 + }, + { + "epoch": 1.8375241779497098, + "grad_norm": 0.13933932781219482, + "learning_rate": 0.00024832165872793496, + "loss": 1.0714, + "step": 13300 + }, + { + "epoch": 1.8513401492124897, + "grad_norm": 0.10513993352651596, + "learning_rate": 0.00024790137293359484, + "loss": 1.0725, + "step": 13400 + }, + { + "epoch": 1.8651561204752696, + "grad_norm": 0.1704607903957367, + "learning_rate": 0.0002474810871392547, 
+ "loss": 1.0712, + "step": 13500 + }, + { + "epoch": 1.8789720917380492, + "grad_norm": 0.08315689861774445, + "learning_rate": 0.0002470608013449145, + "loss": 1.0697, + "step": 13600 + }, + { + "epoch": 1.892788063000829, + "grad_norm": 0.09900273382663727, + "learning_rate": 0.00024664051555057436, + "loss": 1.0735, + "step": 13700 + }, + { + "epoch": 1.9066040342636086, + "grad_norm": 0.05560864508152008, + "learning_rate": 0.00024622022975623423, + "loss": 1.0711, + "step": 13800 + }, + { + "epoch": 1.9204200055263885, + "grad_norm": 0.13863462209701538, + "learning_rate": 0.00024579994396189405, + "loss": 1.0681, + "step": 13900 + }, + { + "epoch": 1.9342359767891684, + "grad_norm": 0.07841744273900986, + "learning_rate": 0.00024537965816755393, + "loss": 1.0711, + "step": 14000 + }, + { + "epoch": 1.948051948051948, + "grad_norm": 0.058312736451625824, + "learning_rate": 0.00024495937237321375, + "loss": 1.0709, + "step": 14100 + }, + { + "epoch": 1.9618679193147277, + "grad_norm": 0.11208023875951767, + "learning_rate": 0.000244543289436817, + "loss": 1.0686, + "step": 14200 + }, + { + "epoch": 1.9756838905775076, + "grad_norm": 0.10133163630962372, + "learning_rate": 0.00024412300364247687, + "loss": 1.0683, + "step": 14300 + }, + { + "epoch": 1.9894998618402875, + "grad_norm": 0.08370282500982285, + "learning_rate": 0.0002437027178481367, + "loss": 1.0709, + "step": 14400 + }, + { + "epoch": 2.003315833103067, + "grad_norm": 0.09476770460605621, + "learning_rate": 0.00024328243205379654, + "loss": 1.0697, + "step": 14500 + }, + { + "epoch": 2.017131804365847, + "grad_norm": 0.0733637660741806, + "learning_rate": 0.0002428621462594564, + "loss": 1.0681, + "step": 14600 + }, + { + "epoch": 2.0309477756286265, + "grad_norm": 0.09925834089517593, + "learning_rate": 0.00024244186046511627, + "loss": 1.0702, + "step": 14700 + }, + { + "epoch": 2.0447637468914066, + "grad_norm": 0.15911750495433807, + "learning_rate": 0.00024202157467077611, + "loss": 1.0665, + "step": 14800 + }, + { + "epoch": 2.0585797181541863, + "grad_norm": 0.13638247549533844, + "learning_rate": 0.00024160128887643596, + "loss": 1.0696, + "step": 14900 + }, + { + "epoch": 2.072395689416966, + "grad_norm": 0.16883982717990875, + "learning_rate": 0.0002411810030820958, + "loss": 1.0641, + "step": 15000 + }, + { + "epoch": 2.072395689416966, + "eval_accuracy": 0.5102966510685876, + "eval_loss": 1.0638896226882935, + "eval_runtime": 924.2494, + "eval_samples_per_second": 222.733, + "eval_steps_per_second": 6.961, + "step": 15000 + }, + { + "epoch": 2.0862116606797456, + "grad_norm": 0.09925784170627594, + "learning_rate": 0.00024076071728775566, + "loss": 1.0683, + "step": 15100 + }, + { + "epoch": 2.1000276319425257, + "grad_norm": 0.06180203706026077, + "learning_rate": 0.00024034043149341548, + "loss": 1.066, + "step": 15200 + }, + { + "epoch": 2.1138436032053054, + "grad_norm": 0.10063247382640839, + "learning_rate": 0.00023992014569907533, + "loss": 1.0668, + "step": 15300 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 0.11476041376590729, + "learning_rate": 0.0002394998599047352, + "loss": 1.0644, + "step": 15400 + }, + { + "epoch": 2.1414755457308647, + "grad_norm": 0.11798429489135742, + "learning_rate": 0.00023907957411039505, + "loss": 1.0626, + "step": 15500 + }, + { + "epoch": 2.155291516993645, + "grad_norm": 0.13165287673473358, + "learning_rate": 0.0002386592883160549, + "loss": 1.0648, + "step": 15600 + }, + { + "epoch": 2.1691074882564245, + "grad_norm": 0.1705123484134674, + "learning_rate": 
0.00023823900252171475, + "loss": 1.0639, + "step": 15700 + }, + { + "epoch": 2.182923459519204, + "grad_norm": 0.13375049829483032, + "learning_rate": 0.0002378187167273746, + "loss": 1.062, + "step": 15800 + }, + { + "epoch": 2.196739430781984, + "grad_norm": 0.09405038505792618, + "learning_rate": 0.00023739843093303445, + "loss": 1.0634, + "step": 15900 + }, + { + "epoch": 2.210555402044764, + "grad_norm": 0.11285752803087234, + "learning_rate": 0.00023697814513869427, + "loss": 1.0667, + "step": 16000 + }, + { + "epoch": 2.2243713733075436, + "grad_norm": 0.12377699464559555, + "learning_rate": 0.00023655785934435412, + "loss": 1.064, + "step": 16100 + }, + { + "epoch": 2.2381873445703233, + "grad_norm": 0.0979316234588623, + "learning_rate": 0.000236137573550014, + "loss": 1.0621, + "step": 16200 + }, + { + "epoch": 2.252003315833103, + "grad_norm": 0.11494515091180801, + "learning_rate": 0.00023572149061361724, + "loss": 1.0645, + "step": 16300 + }, + { + "epoch": 2.2658192870958827, + "grad_norm": 0.07066236436367035, + "learning_rate": 0.0002353012048192771, + "loss": 1.063, + "step": 16400 + }, + { + "epoch": 2.2796352583586628, + "grad_norm": 0.08686563372612, + "learning_rate": 0.00023488091902493694, + "loss": 1.066, + "step": 16500 + }, + { + "epoch": 2.2934512296214424, + "grad_norm": 0.058148209005594254, + "learning_rate": 0.00023446063323059678, + "loss": 1.0643, + "step": 16600 + }, + { + "epoch": 2.307267200884222, + "grad_norm": 0.14033359289169312, + "learning_rate": 0.00023404034743625666, + "loss": 1.0634, + "step": 16700 + }, + { + "epoch": 2.3210831721470018, + "grad_norm": 0.09940097481012344, + "learning_rate": 0.00023362006164191645, + "loss": 1.0629, + "step": 16800 + }, + { + "epoch": 2.334899143409782, + "grad_norm": 0.08228994905948639, + "learning_rate": 0.00023319977584757633, + "loss": 1.0626, + "step": 16900 + }, + { + "epoch": 2.3487151146725616, + "grad_norm": 0.05418753623962402, + "learning_rate": 0.00023277949005323618, + "loss": 1.0611, + "step": 17000 + }, + { + "epoch": 2.3625310859353412, + "grad_norm": 0.09691222757101059, + "learning_rate": 0.00023235920425889603, + "loss": 1.0626, + "step": 17100 + }, + { + "epoch": 2.376347057198121, + "grad_norm": 0.1607312560081482, + "learning_rate": 0.00023193891846455588, + "loss": 1.0623, + "step": 17200 + }, + { + "epoch": 2.3901630284609006, + "grad_norm": 0.1193649098277092, + "learning_rate": 0.00023151863267021572, + "loss": 1.0627, + "step": 17300 + }, + { + "epoch": 2.4039789997236807, + "grad_norm": 0.05427398905158043, + "learning_rate": 0.00023109834687587557, + "loss": 1.0609, + "step": 17400 + }, + { + "epoch": 2.4177949709864603, + "grad_norm": 0.10591702163219452, + "learning_rate": 0.00023067806108153545, + "loss": 1.0637, + "step": 17500 + }, + { + "epoch": 2.43161094224924, + "grad_norm": 0.057032886892557144, + "learning_rate": 0.00023025777528719524, + "loss": 1.0612, + "step": 17600 + }, + { + "epoch": 2.44542691351202, + "grad_norm": 0.08455175161361694, + "learning_rate": 0.00022983748949285512, + "loss": 1.0606, + "step": 17700 + }, + { + "epoch": 2.4592428847748, + "grad_norm": 0.13975144922733307, + "learning_rate": 0.00022941720369851497, + "loss": 1.0624, + "step": 17800 + }, + { + "epoch": 2.4730588560375795, + "grad_norm": 0.11535393446683884, + "learning_rate": 0.00022899691790417482, + "loss": 1.0603, + "step": 17900 + }, + { + "epoch": 2.486874827300359, + "grad_norm": 0.10047648102045059, + "learning_rate": 0.00022857663210983466, + "loss": 1.0607, + "step": 18000 + }, 
+ { + "epoch": 2.500690798563139, + "grad_norm": 0.08474704623222351, + "learning_rate": 0.0002281563463154945, + "loss": 1.062, + "step": 18100 + }, + { + "epoch": 2.514506769825919, + "grad_norm": 0.15308576822280884, + "learning_rate": 0.00022773606052115436, + "loss": 1.0603, + "step": 18200 + }, + { + "epoch": 2.5283227410886986, + "grad_norm": 0.05684039369225502, + "learning_rate": 0.00022731577472681424, + "loss": 1.0589, + "step": 18300 + }, + { + "epoch": 2.5421387123514783, + "grad_norm": 0.10712555050849915, + "learning_rate": 0.00022689548893247409, + "loss": 1.0592, + "step": 18400 + }, + { + "epoch": 2.555954683614258, + "grad_norm": 0.0800655260682106, + "learning_rate": 0.0002264794059960773, + "loss": 1.0603, + "step": 18500 + }, + { + "epoch": 2.569770654877038, + "grad_norm": 0.05980188027024269, + "learning_rate": 0.00022605912020173715, + "loss": 1.0608, + "step": 18600 + }, + { + "epoch": 2.5835866261398177, + "grad_norm": 0.052051473408937454, + "learning_rate": 0.000225638834407397, + "loss": 1.0603, + "step": 18700 + }, + { + "epoch": 2.5974025974025974, + "grad_norm": 0.11966883391141891, + "learning_rate": 0.00022521854861305685, + "loss": 1.057, + "step": 18800 + }, + { + "epoch": 2.611218568665377, + "grad_norm": 0.08861220628023148, + "learning_rate": 0.00022479826281871673, + "loss": 1.0603, + "step": 18900 + }, + { + "epoch": 2.6250345399281567, + "grad_norm": 0.12264814227819443, + "learning_rate": 0.00022437797702437657, + "loss": 1.0602, + "step": 19000 + }, + { + "epoch": 2.638850511190937, + "grad_norm": 0.08384163677692413, + "learning_rate": 0.00022395769123003642, + "loss": 1.057, + "step": 19100 + }, + { + "epoch": 2.6526664824537165, + "grad_norm": 0.11168386787176132, + "learning_rate": 0.00022353740543569624, + "loss": 1.0572, + "step": 19200 + }, + { + "epoch": 2.666482453716496, + "grad_norm": 0.12558519840240479, + "learning_rate": 0.0002231171196413561, + "loss": 1.0592, + "step": 19300 + }, + { + "epoch": 2.6802984249792763, + "grad_norm": 0.06810207664966583, + "learning_rate": 0.00022269683384701594, + "loss": 1.055, + "step": 19400 + }, + { + "epoch": 2.694114396242056, + "grad_norm": 0.16571113467216492, + "learning_rate": 0.0002222765480526758, + "loss": 1.0599, + "step": 19500 + }, + { + "epoch": 2.7079303675048356, + "grad_norm": 0.07613151520490646, + "learning_rate": 0.00022185626225833564, + "loss": 1.0564, + "step": 19600 + }, + { + "epoch": 2.7217463387676153, + "grad_norm": 0.08713393658399582, + "learning_rate": 0.00022143597646399551, + "loss": 1.0582, + "step": 19700 + }, + { + "epoch": 2.735562310030395, + "grad_norm": 0.11707925796508789, + "learning_rate": 0.00022101569066965536, + "loss": 1.056, + "step": 19800 + }, + { + "epoch": 2.749378281293175, + "grad_norm": 0.1053171455860138, + "learning_rate": 0.0002205954048753152, + "loss": 1.0608, + "step": 19900 + }, + { + "epoch": 2.7631942525559547, + "grad_norm": 0.056531500071287155, + "learning_rate": 0.00022017511908097506, + "loss": 1.0563, + "step": 20000 + }, + { + "epoch": 2.7631942525559547, + "eval_accuracy": 0.516310033016185, + "eval_loss": 1.054749608039856, + "eval_runtime": 731.5154, + "eval_samples_per_second": 281.417, + "eval_steps_per_second": 8.795, + "step": 20000 + }, + { + "epoch": 2.7770102238187344, + "grad_norm": 0.10811367630958557, + "learning_rate": 0.00021975483328663488, + "loss": 1.0556, + "step": 20100 + }, + { + "epoch": 2.790826195081514, + "grad_norm": 0.06601472198963165, + "learning_rate": 0.00021933454749229473, + "loss": 1.0578, + 
"step": 20200 + }, + { + "epoch": 2.804642166344294, + "grad_norm": 0.06906837224960327, + "learning_rate": 0.00021891426169795458, + "loss": 1.06, + "step": 20300 + }, + { + "epoch": 2.818458137607074, + "grad_norm": 0.08911406248807907, + "learning_rate": 0.00021849397590361443, + "loss": 1.0583, + "step": 20400 + }, + { + "epoch": 2.8322741088698535, + "grad_norm": 0.06497912108898163, + "learning_rate": 0.0002180778929672177, + "loss": 1.0575, + "step": 20500 + }, + { + "epoch": 2.846090080132633, + "grad_norm": 0.0886107012629509, + "learning_rate": 0.00021765760717287755, + "loss": 1.0552, + "step": 20600 + }, + { + "epoch": 2.859906051395413, + "grad_norm": 0.05942055955529213, + "learning_rate": 0.0002172373213785374, + "loss": 1.0533, + "step": 20700 + }, + { + "epoch": 2.873722022658193, + "grad_norm": 0.13015809655189514, + "learning_rate": 0.00021681703558419725, + "loss": 1.0549, + "step": 20800 + }, + { + "epoch": 2.8875379939209727, + "grad_norm": 0.06085093691945076, + "learning_rate": 0.00021639674978985707, + "loss": 1.057, + "step": 20900 + }, + { + "epoch": 2.9013539651837523, + "grad_norm": 0.17039401829242706, + "learning_rate": 0.00021597646399551692, + "loss": 1.0571, + "step": 21000 + }, + { + "epoch": 2.9151699364465324, + "grad_norm": 0.07950026541948318, + "learning_rate": 0.00021555617820117676, + "loss": 1.0535, + "step": 21100 + }, + { + "epoch": 2.928985907709312, + "grad_norm": 0.1195695698261261, + "learning_rate": 0.00021513589240683664, + "loss": 1.0535, + "step": 21200 + }, + { + "epoch": 2.942801878972092, + "grad_norm": 0.0896124541759491, + "learning_rate": 0.0002147156066124965, + "loss": 1.0534, + "step": 21300 + }, + { + "epoch": 2.9566178502348714, + "grad_norm": 0.07629978656768799, + "learning_rate": 0.00021429532081815634, + "loss": 1.0564, + "step": 21400 + }, + { + "epoch": 2.970433821497651, + "grad_norm": 0.07431907206773758, + "learning_rate": 0.00021387503502381618, + "loss": 1.0559, + "step": 21500 + }, + { + "epoch": 2.984249792760431, + "grad_norm": 0.0771278440952301, + "learning_rate": 0.00021345474922947603, + "loss": 1.0562, + "step": 21600 + }, + { + "epoch": 2.998065764023211, + "grad_norm": 0.11643990874290466, + "learning_rate": 0.00021303446343513585, + "loss": 1.0525, + "step": 21700 + }, + { + "epoch": 3.0118817352859906, + "grad_norm": 0.058162059634923935, + "learning_rate": 0.0002126141776407957, + "loss": 1.0509, + "step": 21800 + }, + { + "epoch": 3.0256977065487702, + "grad_norm": 0.12037301808595657, + "learning_rate": 0.00021219389184645558, + "loss": 1.0513, + "step": 21900 + }, + { + "epoch": 3.0395136778115504, + "grad_norm": 0.052515506744384766, + "learning_rate": 0.00021177360605211543, + "loss": 1.051, + "step": 22000 + }, + { + "epoch": 3.05332964907433, + "grad_norm": 0.10646827518939972, + "learning_rate": 0.00021135332025777528, + "loss": 1.0542, + "step": 22100 + }, + { + "epoch": 3.0671456203371097, + "grad_norm": 0.1113181784749031, + "learning_rate": 0.00021093303446343512, + "loss": 1.0531, + "step": 22200 + }, + { + "epoch": 3.0809615915998894, + "grad_norm": 0.07355222851037979, + "learning_rate": 0.00021051274866909497, + "loss": 1.0524, + "step": 22300 + }, + { + "epoch": 3.094777562862669, + "grad_norm": 0.06925370544195175, + "learning_rate": 0.00021009246287475482, + "loss": 1.0535, + "step": 22400 + }, + { + "epoch": 3.108593534125449, + "grad_norm": 0.048475924879312515, + "learning_rate": 0.00020967217708041464, + "loss": 1.0564, + "step": 22500 + }, + { + "epoch": 3.122409505388229, + 
"grad_norm": 0.08578319102525711, + "learning_rate": 0.0002092518912860745, + "loss": 1.0519, + "step": 22600 + }, + { + "epoch": 3.1362254766510085, + "grad_norm": 0.08585724979639053, + "learning_rate": 0.00020883160549173437, + "loss": 1.0525, + "step": 22700 + }, + { + "epoch": 3.150041447913788, + "grad_norm": 0.06518802791833878, + "learning_rate": 0.00020841131969739422, + "loss": 1.0543, + "step": 22800 + }, + { + "epoch": 3.1638574191765683, + "grad_norm": 0.046030618250370026, + "learning_rate": 0.00020799103390305406, + "loss": 1.0525, + "step": 22900 + }, + { + "epoch": 3.177673390439348, + "grad_norm": 0.04972764104604721, + "learning_rate": 0.0002075707481087139, + "loss": 1.0512, + "step": 23000 + }, + { + "epoch": 3.1914893617021276, + "grad_norm": 0.11977583914995193, + "learning_rate": 0.00020715046231437376, + "loss": 1.052, + "step": 23100 + }, + { + "epoch": 3.2053053329649073, + "grad_norm": 0.08040472120046616, + "learning_rate": 0.0002067301765200336, + "loss": 1.0491, + "step": 23200 + }, + { + "epoch": 3.2191213042276874, + "grad_norm": 0.10473213344812393, + "learning_rate": 0.00020630989072569343, + "loss": 1.0525, + "step": 23300 + }, + { + "epoch": 3.232937275490467, + "grad_norm": 0.0790744498372078, + "learning_rate": 0.00020588960493135328, + "loss": 1.0508, + "step": 23400 + }, + { + "epoch": 3.2467532467532467, + "grad_norm": 0.12807689607143402, + "learning_rate": 0.00020547352199495655, + "loss": 1.0485, + "step": 23500 + }, + { + "epoch": 3.2605692180160264, + "grad_norm": 0.10298227518796921, + "learning_rate": 0.0002050532362006164, + "loss": 1.049, + "step": 23600 + }, + { + "epoch": 3.2743851892788065, + "grad_norm": 0.11504103243350983, + "learning_rate": 0.00020463295040627625, + "loss": 1.0511, + "step": 23700 + }, + { + "epoch": 3.288201160541586, + "grad_norm": 0.05548229441046715, + "learning_rate": 0.0002042126646119361, + "loss": 1.0499, + "step": 23800 + }, + { + "epoch": 3.302017131804366, + "grad_norm": 0.06242981553077698, + "learning_rate": 0.00020379237881759595, + "loss": 1.0543, + "step": 23900 + }, + { + "epoch": 3.3158331030671455, + "grad_norm": 0.12101748585700989, + "learning_rate": 0.00020337209302325582, + "loss": 1.0482, + "step": 24000 + }, + { + "epoch": 3.329649074329925, + "grad_norm": 0.09176388382911682, + "learning_rate": 0.00020295180722891562, + "loss": 1.0514, + "step": 24100 + }, + { + "epoch": 3.3434650455927053, + "grad_norm": 0.08758760988712311, + "learning_rate": 0.0002025315214345755, + "loss": 1.0505, + "step": 24200 + }, + { + "epoch": 3.357281016855485, + "grad_norm": 0.06818066537380219, + "learning_rate": 0.00020211123564023534, + "loss": 1.0511, + "step": 24300 + }, + { + "epoch": 3.3710969881182646, + "grad_norm": 0.10384306311607361, + "learning_rate": 0.0002016909498458952, + "loss": 1.0513, + "step": 24400 + }, + { + "epoch": 3.3849129593810443, + "grad_norm": 0.12452493607997894, + "learning_rate": 0.00020127066405155504, + "loss": 1.0502, + "step": 24500 + }, + { + "epoch": 3.3987289306438244, + "grad_norm": 0.07460072636604309, + "learning_rate": 0.0002008503782572149, + "loss": 1.0526, + "step": 24600 + }, + { + "epoch": 3.412544901906604, + "grad_norm": 0.1017543151974678, + "learning_rate": 0.00020043009246287474, + "loss": 1.0501, + "step": 24700 + }, + { + "epoch": 3.4263608731693838, + "grad_norm": 0.0900358185172081, + "learning_rate": 0.0002000098066685346, + "loss": 1.0512, + "step": 24800 + }, + { + "epoch": 3.4401768444321634, + "grad_norm": 0.10934050381183624, + "learning_rate": 
0.00019958952087419443, + "loss": 1.0495, + "step": 24900 + }, + { + "epoch": 3.4539928156949435, + "grad_norm": 0.0656353011727333, + "learning_rate": 0.00019916923507985428, + "loss": 1.0504, + "step": 25000 + }, + { + "epoch": 3.4539928156949435, + "eval_accuracy": 0.520419659075542, + "eval_loss": 1.0485948324203491, + "eval_runtime": 728.0613, + "eval_samples_per_second": 282.752, + "eval_steps_per_second": 8.837, + "step": 25000 + }, + { + "epoch": 3.467808786957723, + "grad_norm": 0.07246037572622299, + "learning_rate": 0.00019874894928551413, + "loss": 1.0493, + "step": 25100 + }, + { + "epoch": 3.481624758220503, + "grad_norm": 0.14033739268779755, + "learning_rate": 0.00019832866349117398, + "loss": 1.05, + "step": 25200 + }, + { + "epoch": 3.4954407294832825, + "grad_norm": 0.05688853561878204, + "learning_rate": 0.00019790837769683383, + "loss": 1.0509, + "step": 25300 + }, + { + "epoch": 3.5092567007460627, + "grad_norm": 0.053916674107313156, + "learning_rate": 0.00019748809190249368, + "loss": 1.0503, + "step": 25400 + }, + { + "epoch": 3.5230726720088423, + "grad_norm": 0.12233688682317734, + "learning_rate": 0.00019706780610815352, + "loss": 1.05, + "step": 25500 + }, + { + "epoch": 3.536888643271622, + "grad_norm": 0.10314755886793137, + "learning_rate": 0.0001966475203138134, + "loss": 1.0501, + "step": 25600 + }, + { + "epoch": 3.5507046145344017, + "grad_norm": 0.05037887394428253, + "learning_rate": 0.00019623143737741662, + "loss": 1.0468, + "step": 25700 + }, + { + "epoch": 3.5645205857971813, + "grad_norm": 0.13344399631023407, + "learning_rate": 0.00019581115158307647, + "loss": 1.0477, + "step": 25800 + }, + { + "epoch": 3.5783365570599615, + "grad_norm": 0.07191654294729233, + "learning_rate": 0.00019539086578873632, + "loss": 1.0498, + "step": 25900 + }, + { + "epoch": 3.592152528322741, + "grad_norm": 0.05592725798487663, + "learning_rate": 0.00019497057999439616, + "loss": 1.0506, + "step": 26000 + }, + { + "epoch": 3.605968499585521, + "grad_norm": 0.10346696525812149, + "learning_rate": 0.000194550294200056, + "loss": 1.0499, + "step": 26100 + }, + { + "epoch": 3.619784470848301, + "grad_norm": 0.09233855456113815, + "learning_rate": 0.0001941300084057159, + "loss": 1.0456, + "step": 26200 + }, + { + "epoch": 3.6336004421110806, + "grad_norm": 0.060603220015764236, + "learning_rate": 0.00019370972261137574, + "loss": 1.0475, + "step": 26300 + }, + { + "epoch": 3.6474164133738602, + "grad_norm": 0.11710167676210403, + "learning_rate": 0.00019328943681703559, + "loss": 1.0497, + "step": 26400 + }, + { + "epoch": 3.66123238463664, + "grad_norm": 0.16325397789478302, + "learning_rate": 0.0001928691510226954, + "loss": 1.0487, + "step": 26500 + }, + { + "epoch": 3.6750483558994196, + "grad_norm": 0.08937475085258484, + "learning_rate": 0.00019244886522835526, + "loss": 1.0468, + "step": 26600 + }, + { + "epoch": 3.6888643271621993, + "grad_norm": 0.07486152648925781, + "learning_rate": 0.0001920285794340151, + "loss": 1.0479, + "step": 26700 + }, + { + "epoch": 3.7026802984249794, + "grad_norm": 0.1263752579689026, + "learning_rate": 0.00019160829363967495, + "loss": 1.0449, + "step": 26800 + }, + { + "epoch": 3.716496269687759, + "grad_norm": 0.11803583055734634, + "learning_rate": 0.0001911880078453348, + "loss": 1.0512, + "step": 26900 + }, + { + "epoch": 3.7303122409505387, + "grad_norm": 0.07918773591518402, + "learning_rate": 0.00019076772205099468, + "loss": 1.0486, + "step": 27000 + }, + { + "epoch": 3.744128212213319, + "grad_norm": 0.11923271417617798, 
+ "learning_rate": 0.00019034743625665453, + "loss": 1.0465, + "step": 27100 + }, + { + "epoch": 3.7579441834760985, + "grad_norm": 0.12752223014831543, + "learning_rate": 0.00018992715046231437, + "loss": 1.0472, + "step": 27200 + }, + { + "epoch": 3.771760154738878, + "grad_norm": 0.07391146570444107, + "learning_rate": 0.0001895068646679742, + "loss": 1.0493, + "step": 27300 + }, + { + "epoch": 3.785576126001658, + "grad_norm": 0.06606881320476532, + "learning_rate": 0.00018908657887363404, + "loss": 1.0485, + "step": 27400 + }, + { + "epoch": 3.7993920972644375, + "grad_norm": 0.04949864745140076, + "learning_rate": 0.0001886662930792939, + "loss": 1.0481, + "step": 27500 + }, + { + "epoch": 3.8132080685272176, + "grad_norm": 0.05234380066394806, + "learning_rate": 0.00018824600728495374, + "loss": 1.0476, + "step": 27600 + }, + { + "epoch": 3.8270240397899973, + "grad_norm": 0.04995539411902428, + "learning_rate": 0.0001878257214906136, + "loss": 1.0466, + "step": 27700 + }, + { + "epoch": 3.840840011052777, + "grad_norm": 0.09871330112218857, + "learning_rate": 0.00018740543569627347, + "loss": 1.0501, + "step": 27800 + }, + { + "epoch": 3.8546559823155566, + "grad_norm": 0.06254375725984573, + "learning_rate": 0.00018698514990193331, + "loss": 1.0467, + "step": 27900 + }, + { + "epoch": 3.8684719535783367, + "grad_norm": 0.07971449941396713, + "learning_rate": 0.00018656486410759316, + "loss": 1.0502, + "step": 28000 + }, + { + "epoch": 3.8822879248411164, + "grad_norm": 0.12627951800823212, + "learning_rate": 0.000186144578313253, + "loss": 1.0446, + "step": 28100 + }, + { + "epoch": 3.896103896103896, + "grad_norm": 0.08057064563035965, + "learning_rate": 0.00018572429251891283, + "loss": 1.0468, + "step": 28200 + }, + { + "epoch": 3.9099198673666757, + "grad_norm": 0.0501413568854332, + "learning_rate": 0.00018530400672457268, + "loss": 1.0453, + "step": 28300 + }, + { + "epoch": 3.9237358386294554, + "grad_norm": 0.09999352693557739, + "learning_rate": 0.00018488372093023253, + "loss": 1.0502, + "step": 28400 + }, + { + "epoch": 3.9375518098922355, + "grad_norm": 0.12323564291000366, + "learning_rate": 0.00018446343513589238, + "loss": 1.0478, + "step": 28500 + }, + { + "epoch": 3.951367781155015, + "grad_norm": 0.0877193808555603, + "learning_rate": 0.00018404314934155225, + "loss": 1.049, + "step": 28600 + }, + { + "epoch": 3.965183752417795, + "grad_norm": 0.09397170692682266, + "learning_rate": 0.0001836228635472121, + "loss": 1.0474, + "step": 28700 + }, + { + "epoch": 3.978999723680575, + "grad_norm": 0.09532420337200165, + "learning_rate": 0.00018320257775287195, + "loss": 1.0496, + "step": 28800 + }, + { + "epoch": 3.9928156949433546, + "grad_norm": 0.0442403182387352, + "learning_rate": 0.0001827822919585318, + "loss": 1.0466, + "step": 28900 + }, + { + "epoch": 4.006631666206134, + "grad_norm": 0.06309514492750168, + "learning_rate": 0.00018236200616419162, + "loss": 1.0479, + "step": 29000 + }, + { + "epoch": 4.020447637468914, + "grad_norm": 0.06191420555114746, + "learning_rate": 0.00018194172036985147, + "loss": 1.0442, + "step": 29100 + }, + { + "epoch": 4.034263608731694, + "grad_norm": 0.06752864271402359, + "learning_rate": 0.00018152143457551132, + "loss": 1.045, + "step": 29200 + }, + { + "epoch": 4.048079579994473, + "grad_norm": 0.07383009046316147, + "learning_rate": 0.00018110114878117117, + "loss": 1.0429, + "step": 29300 + }, + { + "epoch": 4.061895551257253, + "grad_norm": 0.11942852288484573, + "learning_rate": 0.00018068086298683104, + "loss": 
1.0433, + "step": 29400 + }, + { + "epoch": 4.0757115225200335, + "grad_norm": 0.0840003713965416, + "learning_rate": 0.0001802605771924909, + "loss": 1.0434, + "step": 29500 + }, + { + "epoch": 4.089527493782813, + "grad_norm": 0.07768476754426956, + "learning_rate": 0.00017984029139815074, + "loss": 1.0421, + "step": 29600 + }, + { + "epoch": 4.103343465045593, + "grad_norm": 0.07166603952646255, + "learning_rate": 0.00017942420846175398, + "loss": 1.0443, + "step": 29700 + }, + { + "epoch": 4.1171594363083726, + "grad_norm": 0.07380765676498413, + "learning_rate": 0.0001790039226674138, + "loss": 1.0448, + "step": 29800 + }, + { + "epoch": 4.130975407571152, + "grad_norm": 0.1263025552034378, + "learning_rate": 0.00017858363687307365, + "loss": 1.0437, + "step": 29900 + }, + { + "epoch": 4.144791378833932, + "grad_norm": 0.09632286429405212, + "learning_rate": 0.00017816335107873353, + "loss": 1.0439, + "step": 30000 + }, + { + "epoch": 4.144791378833932, + "eval_accuracy": 0.5233148259844476, + "eval_loss": 1.0439139604568481, + "eval_runtime": 787.8404, + "eval_samples_per_second": 261.298, + "eval_steps_per_second": 8.167, + "step": 30000 + }, + { + "epoch": 4.158607350096712, + "grad_norm": 0.09395026415586472, + "learning_rate": 0.00017774306528439338, + "loss": 1.0447, + "step": 30100 + }, + { + "epoch": 4.172423321359491, + "grad_norm": 0.07320912927389145, + "learning_rate": 0.00017732277949005323, + "loss": 1.0477, + "step": 30200 + }, + { + "epoch": 4.186239292622272, + "grad_norm": 0.05703623965382576, + "learning_rate": 0.00017690249369571308, + "loss": 1.0443, + "step": 30300 + }, + { + "epoch": 4.2000552638850515, + "grad_norm": 0.04885410889983177, + "learning_rate": 0.00017648220790137292, + "loss": 1.0467, + "step": 30400 + }, + { + "epoch": 4.213871235147831, + "grad_norm": 0.10649748146533966, + "learning_rate": 0.00017606192210703277, + "loss": 1.0448, + "step": 30500 + }, + { + "epoch": 4.227687206410611, + "grad_norm": 0.05844441428780556, + "learning_rate": 0.0001756416363126926, + "loss": 1.044, + "step": 30600 + }, + { + "epoch": 4.2415031776733905, + "grad_norm": 0.07287675887346268, + "learning_rate": 0.00017522135051835244, + "loss": 1.0428, + "step": 30700 + }, + { + "epoch": 4.25531914893617, + "grad_norm": 0.05190150439739227, + "learning_rate": 0.00017480106472401232, + "loss": 1.0413, + "step": 30800 + }, + { + "epoch": 4.26913512019895, + "grad_norm": 0.06985218822956085, + "learning_rate": 0.00017438077892967217, + "loss": 1.0455, + "step": 30900 + }, + { + "epoch": 4.2829510914617295, + "grad_norm": 0.06930764764547348, + "learning_rate": 0.00017396049313533202, + "loss": 1.0444, + "step": 31000 + }, + { + "epoch": 4.296767062724509, + "grad_norm": 0.07905230671167374, + "learning_rate": 0.00017354020734099186, + "loss": 1.0445, + "step": 31100 + }, + { + "epoch": 4.31058303398729, + "grad_norm": 0.04994554817676544, + "learning_rate": 0.0001731199215466517, + "loss": 1.0432, + "step": 31200 + }, + { + "epoch": 4.324399005250069, + "grad_norm": 0.08036911487579346, + "learning_rate": 0.00017269963575231156, + "loss": 1.0424, + "step": 31300 + }, + { + "epoch": 4.338214976512849, + "grad_norm": 0.07251475006341934, + "learning_rate": 0.00017227934995797138, + "loss": 1.0465, + "step": 31400 + }, + { + "epoch": 4.352030947775629, + "grad_norm": 0.09622683376073837, + "learning_rate": 0.00017185906416363123, + "loss": 1.0441, + "step": 31500 + }, + { + "epoch": 4.365846919038408, + "grad_norm": 0.07545050978660583, + "learning_rate": 
0.0001714387783692911, + "loss": 1.0423, + "step": 31600 + }, + { + "epoch": 4.379662890301188, + "grad_norm": 0.07171428948640823, + "learning_rate": 0.00017102269543289435, + "loss": 1.0434, + "step": 31700 + }, + { + "epoch": 4.393478861563968, + "grad_norm": 0.06658755987882614, + "learning_rate": 0.0001706024096385542, + "loss": 1.0415, + "step": 31800 + }, + { + "epoch": 4.407294832826747, + "grad_norm": 0.10734014213085175, + "learning_rate": 0.00017018212384421405, + "loss": 1.0406, + "step": 31900 + }, + { + "epoch": 4.421110804089528, + "grad_norm": 0.06358776986598969, + "learning_rate": 0.0001697618380498739, + "loss": 1.0405, + "step": 32000 + }, + { + "epoch": 4.434926775352308, + "grad_norm": 0.06078578904271126, + "learning_rate": 0.00016934155225553377, + "loss": 1.0458, + "step": 32100 + }, + { + "epoch": 4.448742746615087, + "grad_norm": 0.09674441814422607, + "learning_rate": 0.000168925469319137, + "loss": 1.0433, + "step": 32200 + }, + { + "epoch": 4.462558717877867, + "grad_norm": 0.11840452253818512, + "learning_rate": 0.00016850518352479684, + "loss": 1.0448, + "step": 32300 + }, + { + "epoch": 4.476374689140647, + "grad_norm": 0.08742488920688629, + "learning_rate": 0.0001680848977304567, + "loss": 1.0409, + "step": 32400 + }, + { + "epoch": 4.490190660403426, + "grad_norm": 0.09082327783107758, + "learning_rate": 0.00016766461193611654, + "loss": 1.0432, + "step": 32500 + }, + { + "epoch": 4.504006631666206, + "grad_norm": 0.06259270012378693, + "learning_rate": 0.0001672443261417764, + "loss": 1.0406, + "step": 32600 + }, + { + "epoch": 4.517822602928986, + "grad_norm": 0.06466669589281082, + "learning_rate": 0.00016682404034743626, + "loss": 1.0404, + "step": 32700 + }, + { + "epoch": 4.531638574191765, + "grad_norm": 0.07167832553386688, + "learning_rate": 0.0001664037545530961, + "loss": 1.0457, + "step": 32800 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.055970191955566406, + "learning_rate": 0.00016598346875875596, + "loss": 1.0433, + "step": 32900 + }, + { + "epoch": 4.5592705167173255, + "grad_norm": 0.05038364604115486, + "learning_rate": 0.00016556318296441578, + "loss": 1.0414, + "step": 33000 + }, + { + "epoch": 4.573086487980105, + "grad_norm": 0.11647244542837143, + "learning_rate": 0.00016514289717007563, + "loss": 1.0408, + "step": 33100 + }, + { + "epoch": 4.586902459242885, + "grad_norm": 0.08881094306707382, + "learning_rate": 0.00016472261137573548, + "loss": 1.0468, + "step": 33200 + }, + { + "epoch": 4.6007184305056645, + "grad_norm": 0.0706004872918129, + "learning_rate": 0.00016430232558139533, + "loss": 1.0433, + "step": 33300 + }, + { + "epoch": 4.614534401768444, + "grad_norm": 0.07594550400972366, + "learning_rate": 0.00016388203978705518, + "loss": 1.0401, + "step": 33400 + }, + { + "epoch": 4.628350373031224, + "grad_norm": 0.06709697842597961, + "learning_rate": 0.00016346175399271505, + "loss": 1.0406, + "step": 33500 + }, + { + "epoch": 4.6421663442940035, + "grad_norm": 0.055218733847141266, + "learning_rate": 0.0001630414681983749, + "loss": 1.0439, + "step": 33600 + }, + { + "epoch": 4.655982315556784, + "grad_norm": 0.09484557062387466, + "learning_rate": 0.00016262118240403475, + "loss": 1.0445, + "step": 33700 + }, + { + "epoch": 4.669798286819564, + "grad_norm": 0.08181110769510269, + "learning_rate": 0.00016220089660969457, + "loss": 1.0404, + "step": 33800 + }, + { + "epoch": 4.683614258082343, + "grad_norm": 0.07101566344499588, + "learning_rate": 0.00016178061081535442, + "loss": 1.0418, + "step": 33900 + }, 
+ { + "epoch": 4.697430229345123, + "grad_norm": 0.07521411031484604, + "learning_rate": 0.00016136032502101427, + "loss": 1.0413, + "step": 34000 + }, + { + "epoch": 4.711246200607903, + "grad_norm": 0.06438640505075455, + "learning_rate": 0.00016094003922667412, + "loss": 1.0413, + "step": 34100 + }, + { + "epoch": 4.7250621718706824, + "grad_norm": 0.0852956548333168, + "learning_rate": 0.00016051975343233396, + "loss": 1.0411, + "step": 34200 + }, + { + "epoch": 4.738878143133462, + "grad_norm": 0.041669171303510666, + "learning_rate": 0.00016009946763799384, + "loss": 1.043, + "step": 34300 + }, + { + "epoch": 4.752694114396242, + "grad_norm": 0.07866424322128296, + "learning_rate": 0.0001596791818436537, + "loss": 1.0416, + "step": 34400 + }, + { + "epoch": 4.7665100856590215, + "grad_norm": 0.06820093840360641, + "learning_rate": 0.00015925889604931354, + "loss": 1.0419, + "step": 34500 + }, + { + "epoch": 4.780326056921801, + "grad_norm": 0.08769433945417404, + "learning_rate": 0.00015883861025497336, + "loss": 1.0436, + "step": 34600 + }, + { + "epoch": 4.794142028184582, + "grad_norm": 0.11472765356302261, + "learning_rate": 0.0001584183244606332, + "loss": 1.0448, + "step": 34700 + }, + { + "epoch": 4.807957999447361, + "grad_norm": 0.10286398231983185, + "learning_rate": 0.00015799803866629305, + "loss": 1.0396, + "step": 34800 + }, + { + "epoch": 4.821773970710141, + "grad_norm": 0.08412828296422958, + "learning_rate": 0.0001575777528719529, + "loss": 1.0432, + "step": 34900 + }, + { + "epoch": 4.835589941972921, + "grad_norm": 0.06536369025707245, + "learning_rate": 0.00015715746707761275, + "loss": 1.0425, + "step": 35000 + }, + { + "epoch": 4.835589941972921, + "eval_accuracy": 0.5253784900927014, + "eval_loss": 1.0407328605651855, + "eval_runtime": 804.3369, + "eval_samples_per_second": 255.939, + "eval_steps_per_second": 7.999, + "step": 35000 + }, + { + "epoch": 4.8494059132357, + "grad_norm": 0.05366332083940506, + "learning_rate": 0.00015673718128327263, + "loss": 1.0401, + "step": 35100 + }, + { + "epoch": 4.86322188449848, + "grad_norm": 0.05627182498574257, + "learning_rate": 0.00015631689548893248, + "loss": 1.0413, + "step": 35200 + }, + { + "epoch": 4.87703785576126, + "grad_norm": 0.06880544126033783, + "learning_rate": 0.00015589660969459232, + "loss": 1.0399, + "step": 35300 + }, + { + "epoch": 4.89085382702404, + "grad_norm": 0.06326279044151306, + "learning_rate": 0.00015547632390025215, + "loss": 1.0424, + "step": 35400 + }, + { + "epoch": 4.90466979828682, + "grad_norm": 0.050615083426237106, + "learning_rate": 0.000155056038105912, + "loss": 1.0419, + "step": 35500 + }, + { + "epoch": 4.9184857695496, + "grad_norm": 0.09092865139245987, + "learning_rate": 0.00015463575231157184, + "loss": 1.0417, + "step": 35600 + }, + { + "epoch": 4.932301740812379, + "grad_norm": 0.10828616470098495, + "learning_rate": 0.0001542154665172317, + "loss": 1.0461, + "step": 35700 + }, + { + "epoch": 4.946117712075159, + "grad_norm": 0.10398013889789581, + "learning_rate": 0.00015379518072289154, + "loss": 1.0402, + "step": 35800 + }, + { + "epoch": 4.959933683337939, + "grad_norm": 0.060978490859270096, + "learning_rate": 0.00015337489492855142, + "loss": 1.0428, + "step": 35900 + }, + { + "epoch": 4.973749654600718, + "grad_norm": 0.09474412351846695, + "learning_rate": 0.00015295460913421126, + "loss": 1.0426, + "step": 36000 + }, + { + "epoch": 4.987565625863498, + "grad_norm": 0.055337630212306976, + "learning_rate": 0.0001525343233398711, + "loss": 1.0424, + "step": 
36100 + }, + { + "epoch": 5.001381597126278, + "grad_norm": 0.062282662838697433, + "learning_rate": 0.00015211824040347433, + "loss": 1.0408, + "step": 36200 + }, + { + "epoch": 5.015197568389058, + "grad_norm": 0.08418793976306915, + "learning_rate": 0.00015169795460913418, + "loss": 1.0423, + "step": 36300 + }, + { + "epoch": 5.029013539651838, + "grad_norm": 0.056806761771440506, + "learning_rate": 0.00015127766881479403, + "loss": 1.0397, + "step": 36400 + }, + { + "epoch": 5.0428295109146175, + "grad_norm": 0.050782449543476105, + "learning_rate": 0.0001508573830204539, + "loss": 1.0397, + "step": 36500 + }, + { + "epoch": 5.056645482177397, + "grad_norm": 0.04436805471777916, + "learning_rate": 0.00015043709722611375, + "loss": 1.0372, + "step": 36600 + }, + { + "epoch": 5.070461453440177, + "grad_norm": 0.056697145104408264, + "learning_rate": 0.0001500168114317736, + "loss": 1.0396, + "step": 36700 + }, + { + "epoch": 5.0842774247029565, + "grad_norm": 0.0936078131198883, + "learning_rate": 0.00014959652563743342, + "loss": 1.0366, + "step": 36800 + }, + { + "epoch": 5.098093395965736, + "grad_norm": 0.058340467512607574, + "learning_rate": 0.0001491762398430933, + "loss": 1.038, + "step": 36900 + }, + { + "epoch": 5.111909367228516, + "grad_norm": 0.07920562475919724, + "learning_rate": 0.00014875595404875315, + "loss": 1.0389, + "step": 37000 + }, + { + "epoch": 5.1257253384912955, + "grad_norm": 0.054546140134334564, + "learning_rate": 0.000148335668254413, + "loss": 1.0352, + "step": 37100 + }, + { + "epoch": 5.139541309754076, + "grad_norm": 0.0779619961977005, + "learning_rate": 0.00014791538246007282, + "loss": 1.0362, + "step": 37200 + }, + { + "epoch": 5.153357281016856, + "grad_norm": 0.06077539920806885, + "learning_rate": 0.0001474950966657327, + "loss": 1.0395, + "step": 37300 + }, + { + "epoch": 5.167173252279635, + "grad_norm": 0.07015964388847351, + "learning_rate": 0.00014707481087139254, + "loss": 1.0378, + "step": 37400 + }, + { + "epoch": 5.180989223542415, + "grad_norm": 0.07821048051118851, + "learning_rate": 0.0001466545250770524, + "loss": 1.0358, + "step": 37500 + }, + { + "epoch": 5.194805194805195, + "grad_norm": 0.06446918845176697, + "learning_rate": 0.0001462342392827122, + "loss": 1.0401, + "step": 37600 + }, + { + "epoch": 5.208621166067974, + "grad_norm": 0.0754179060459137, + "learning_rate": 0.0001458139534883721, + "loss": 1.0372, + "step": 37700 + }, + { + "epoch": 5.222437137330754, + "grad_norm": 0.06225774064660072, + "learning_rate": 0.00014539366769403194, + "loss": 1.0396, + "step": 37800 + }, + { + "epoch": 5.236253108593534, + "grad_norm": 0.09567879885435104, + "learning_rate": 0.00014497338189969178, + "loss": 1.0427, + "step": 37900 + }, + { + "epoch": 5.250069079856313, + "grad_norm": 0.0810612216591835, + "learning_rate": 0.00014455309610535163, + "loss": 1.0368, + "step": 38000 + }, + { + "epoch": 5.263885051119094, + "grad_norm": 0.058250732719898224, + "learning_rate": 0.00014413281031101148, + "loss": 1.039, + "step": 38100 + }, + { + "epoch": 5.277701022381874, + "grad_norm": 0.07354842871427536, + "learning_rate": 0.00014371252451667133, + "loss": 1.0393, + "step": 38200 + }, + { + "epoch": 5.291516993644653, + "grad_norm": 0.04756517335772514, + "learning_rate": 0.00014329223872233118, + "loss": 1.0369, + "step": 38300 + }, + { + "epoch": 5.305332964907433, + "grad_norm": 0.05551883205771446, + "learning_rate": 0.00014287195292799103, + "loss": 1.038, + "step": 38400 + }, + { + "epoch": 5.319148936170213, + "grad_norm": 
0.05476289987564087, + "learning_rate": 0.00014245166713365088, + "loss": 1.0391, + "step": 38500 + }, + { + "epoch": 5.332964907432992, + "grad_norm": 0.041929882019758224, + "learning_rate": 0.00014203138133931072, + "loss": 1.0377, + "step": 38600 + }, + { + "epoch": 5.346780878695772, + "grad_norm": 0.05916072428226471, + "learning_rate": 0.00014161109554497057, + "loss": 1.0417, + "step": 38700 + }, + { + "epoch": 5.360596849958552, + "grad_norm": 0.0609772689640522, + "learning_rate": 0.00014119080975063042, + "loss": 1.0386, + "step": 38800 + }, + { + "epoch": 5.374412821221332, + "grad_norm": 0.06430498510599136, + "learning_rate": 0.00014077052395629027, + "loss": 1.0397, + "step": 38900 + }, + { + "epoch": 5.388228792484112, + "grad_norm": 0.07042800635099411, + "learning_rate": 0.00014035023816195012, + "loss": 1.038, + "step": 39000 + }, + { + "epoch": 5.402044763746892, + "grad_norm": 0.05623612925410271, + "learning_rate": 0.00013992995236760997, + "loss": 1.0405, + "step": 39100 + }, + { + "epoch": 5.415860735009671, + "grad_norm": 0.04936366528272629, + "learning_rate": 0.00013950966657326982, + "loss": 1.0404, + "step": 39200 + }, + { + "epoch": 5.429676706272451, + "grad_norm": 0.05738508701324463, + "learning_rate": 0.00013908938077892966, + "loss": 1.0364, + "step": 39300 + }, + { + "epoch": 5.443492677535231, + "grad_norm": 0.09567712992429733, + "learning_rate": 0.0001386690949845895, + "loss": 1.0381, + "step": 39400 + }, + { + "epoch": 5.45730864879801, + "grad_norm": 0.07306545972824097, + "learning_rate": 0.00013824880919024936, + "loss": 1.0394, + "step": 39500 + }, + { + "epoch": 5.47112462006079, + "grad_norm": 0.060108475387096405, + "learning_rate": 0.0001378285233959092, + "loss": 1.0379, + "step": 39600 + }, + { + "epoch": 5.48494059132357, + "grad_norm": 0.08150669932365417, + "learning_rate": 0.00013740823760156906, + "loss": 1.0391, + "step": 39700 + }, + { + "epoch": 5.49875656258635, + "grad_norm": 0.06265643239021301, + "learning_rate": 0.0001369879518072289, + "loss": 1.0419, + "step": 39800 + }, + { + "epoch": 5.51257253384913, + "grad_norm": 0.09023050218820572, + "learning_rate": 0.00013656766601288876, + "loss": 1.0374, + "step": 39900 + }, + { + "epoch": 5.5263885051119095, + "grad_norm": 0.06600885838270187, + "learning_rate": 0.0001361473802185486, + "loss": 1.0365, + "step": 40000 + }, + { + "epoch": 5.5263885051119095, + "eval_accuracy": 0.52706640122358, + "eval_loss": 1.0380040407180786, + "eval_runtime": 773.4583, + "eval_samples_per_second": 266.157, + "eval_steps_per_second": 8.318, + "step": 40000 + }, + { + "epoch": 5.540204476374689, + "grad_norm": 0.07041644304990768, + "learning_rate": 0.00013572709442420845, + "loss": 1.038, + "step": 40100 + }, + { + "epoch": 5.554020447637469, + "grad_norm": 0.0819341391324997, + "learning_rate": 0.0001353110114878117, + "loss": 1.0383, + "step": 40200 + }, + { + "epoch": 5.5678364189002485, + "grad_norm": 0.04390214383602142, + "learning_rate": 0.00013489072569347155, + "loss": 1.0381, + "step": 40300 + }, + { + "epoch": 5.581652390163028, + "grad_norm": 0.0681944414973259, + "learning_rate": 0.0001344704398991314, + "loss": 1.0368, + "step": 40400 + }, + { + "epoch": 5.595468361425809, + "grad_norm": 0.0888848677277565, + "learning_rate": 0.00013405015410479124, + "loss": 1.0369, + "step": 40500 + }, + { + "epoch": 5.609284332688588, + "grad_norm": 0.07275230437517166, + "learning_rate": 0.0001336298683104511, + "loss": 1.0353, + "step": 40600 + }, + { + "epoch": 5.623100303951368, + 
"grad_norm": 0.10200846940279007, + "learning_rate": 0.00013320958251611094, + "loss": 1.0381, + "step": 40700 + }, + { + "epoch": 5.636916275214148, + "grad_norm": 0.056480832397937775, + "learning_rate": 0.0001327892967217708, + "loss": 1.0383, + "step": 40800 + }, + { + "epoch": 5.650732246476927, + "grad_norm": 0.0845484584569931, + "learning_rate": 0.00013236901092743064, + "loss": 1.0385, + "step": 40900 + }, + { + "epoch": 5.664548217739707, + "grad_norm": 0.05990500748157501, + "learning_rate": 0.0001319487251330905, + "loss": 1.0381, + "step": 41000 + }, + { + "epoch": 5.678364189002487, + "grad_norm": 0.04566818103194237, + "learning_rate": 0.00013152843933875034, + "loss": 1.0409, + "step": 41100 + }, + { + "epoch": 5.692180160265266, + "grad_norm": 0.05529521405696869, + "learning_rate": 0.00013110815354441018, + "loss": 1.039, + "step": 41200 + }, + { + "epoch": 5.705996131528046, + "grad_norm": 0.08812158554792404, + "learning_rate": 0.00013068786775007003, + "loss": 1.0393, + "step": 41300 + }, + { + "epoch": 5.719812102790826, + "grad_norm": 0.0714721605181694, + "learning_rate": 0.00013026758195572988, + "loss": 1.0365, + "step": 41400 + }, + { + "epoch": 5.733628074053606, + "grad_norm": 0.050889432430267334, + "learning_rate": 0.00012984729616138973, + "loss": 1.0399, + "step": 41500 + }, + { + "epoch": 5.747444045316386, + "grad_norm": 0.05863107368350029, + "learning_rate": 0.00012942701036704958, + "loss": 1.0401, + "step": 41600 + }, + { + "epoch": 5.761260016579166, + "grad_norm": 0.05279000476002693, + "learning_rate": 0.00012900672457270943, + "loss": 1.0368, + "step": 41700 + }, + { + "epoch": 5.775075987841945, + "grad_norm": 0.06430874019861221, + "learning_rate": 0.00012858643877836928, + "loss": 1.0347, + "step": 41800 + }, + { + "epoch": 5.788891959104725, + "grad_norm": 0.1187288910150528, + "learning_rate": 0.00012816615298402912, + "loss": 1.0372, + "step": 41900 + }, + { + "epoch": 5.802707930367505, + "grad_norm": 0.05984746664762497, + "learning_rate": 0.00012774586718968897, + "loss": 1.036, + "step": 42000 + }, + { + "epoch": 5.816523901630284, + "grad_norm": 0.047202371060848236, + "learning_rate": 0.00012732558139534882, + "loss": 1.0341, + "step": 42100 + }, + { + "epoch": 5.830339872893065, + "grad_norm": 0.0888022631406784, + "learning_rate": 0.00012690949845895207, + "loss": 1.0358, + "step": 42200 + }, + { + "epoch": 5.8441558441558445, + "grad_norm": 0.071753591299057, + "learning_rate": 0.00012648921266461191, + "loss": 1.0356, + "step": 42300 + }, + { + "epoch": 5.857971815418624, + "grad_norm": 0.06311481446027756, + "learning_rate": 0.0001260689268702718, + "loss": 1.0381, + "step": 42400 + }, + { + "epoch": 5.871787786681404, + "grad_norm": 0.05733519420027733, + "learning_rate": 0.0001256486410759316, + "loss": 1.0366, + "step": 42500 + }, + { + "epoch": 5.885603757944184, + "grad_norm": 0.05296749621629715, + "learning_rate": 0.00012522835528159146, + "loss": 1.0391, + "step": 42600 + }, + { + "epoch": 5.899419729206963, + "grad_norm": 0.05728083476424217, + "learning_rate": 0.0001248080694872513, + "loss": 1.0393, + "step": 42700 + }, + { + "epoch": 5.913235700469743, + "grad_norm": 0.10918726772069931, + "learning_rate": 0.00012438778369291118, + "loss": 1.0375, + "step": 42800 + }, + { + "epoch": 5.927051671732523, + "grad_norm": 0.043641045689582825, + "learning_rate": 0.000123967497898571, + "loss": 1.0342, + "step": 42900 + }, + { + "epoch": 5.940867642995302, + "grad_norm": 0.07793564349412918, + "learning_rate": 
0.00012354721210423085, + "loss": 1.037, + "step": 43000 + }, + { + "epoch": 5.954683614258082, + "grad_norm": 0.10596407949924469, + "learning_rate": 0.0001231269263098907, + "loss": 1.0361, + "step": 43100 + }, + { + "epoch": 5.9684995855208625, + "grad_norm": 0.05018968880176544, + "learning_rate": 0.00012270664051555058, + "loss": 1.0352, + "step": 43200 + }, + { + "epoch": 5.982315556783642, + "grad_norm": 0.06663347035646439, + "learning_rate": 0.0001222863547212104, + "loss": 1.0379, + "step": 43300 + }, + { + "epoch": 5.996131528046422, + "grad_norm": 0.05061174929141998, + "learning_rate": 0.00012186606892687026, + "loss": 1.0378, + "step": 43400 + }, + { + "epoch": 6.0099474993092015, + "grad_norm": 0.07496211677789688, + "learning_rate": 0.00012144578313253011, + "loss": 1.0357, + "step": 43500 + }, + { + "epoch": 6.023763470571981, + "grad_norm": 0.058973684906959534, + "learning_rate": 0.00012102549733818996, + "loss": 1.0336, + "step": 43600 + }, + { + "epoch": 6.037579441834761, + "grad_norm": 0.07304850965738297, + "learning_rate": 0.0001206052115438498, + "loss": 1.0366, + "step": 43700 + }, + { + "epoch": 6.0513954130975405, + "grad_norm": 0.05964922904968262, + "learning_rate": 0.00012018492574950966, + "loss": 1.0358, + "step": 43800 + }, + { + "epoch": 6.06521138436032, + "grad_norm": 0.10107408463954926, + "learning_rate": 0.0001197646399551695, + "loss": 1.0363, + "step": 43900 + }, + { + "epoch": 6.079027355623101, + "grad_norm": 0.05830320343375206, + "learning_rate": 0.00011934435416082935, + "loss": 1.0374, + "step": 44000 + }, + { + "epoch": 6.09284332688588, + "grad_norm": 0.06493101269006729, + "learning_rate": 0.00011892406836648919, + "loss": 1.0358, + "step": 44100 + }, + { + "epoch": 6.10665929814866, + "grad_norm": 0.06381756067276001, + "learning_rate": 0.00011850798543009245, + "loss": 1.0345, + "step": 44200 + }, + { + "epoch": 6.12047526941144, + "grad_norm": 0.057328786700963974, + "learning_rate": 0.0001180876996357523, + "loss": 1.0347, + "step": 44300 + }, + { + "epoch": 6.134291240674219, + "grad_norm": 0.09036822617053986, + "learning_rate": 0.00011766741384141216, + "loss": 1.0352, + "step": 44400 + }, + { + "epoch": 6.148107211936999, + "grad_norm": 0.05485937371850014, + "learning_rate": 0.000117247128047072, + "loss": 1.0371, + "step": 44500 + }, + { + "epoch": 6.161923183199779, + "grad_norm": 0.06304465979337692, + "learning_rate": 0.00011682684225273184, + "loss": 1.0302, + "step": 44600 + }, + { + "epoch": 6.175739154462558, + "grad_norm": 0.045126065611839294, + "learning_rate": 0.0001164065564583917, + "loss": 1.0338, + "step": 44700 + }, + { + "epoch": 6.189555125725338, + "grad_norm": 0.06636038422584534, + "learning_rate": 0.00011598627066405155, + "loss": 1.0353, + "step": 44800 + }, + { + "epoch": 6.203371096988119, + "grad_norm": 0.05977385491132736, + "learning_rate": 0.00011556598486971139, + "loss": 1.0346, + "step": 44900 + }, + { + "epoch": 6.217187068250898, + "grad_norm": 0.07459376752376556, + "learning_rate": 0.00011514569907537124, + "loss": 1.0325, + "step": 45000 + }, + { + "epoch": 6.217187068250898, + "eval_accuracy": 0.5284276106869993, + "eval_loss": 1.0360603332519531, + "eval_runtime": 770.702, + "eval_samples_per_second": 267.108, + "eval_steps_per_second": 8.348, + "step": 45000 + }, + { + "epoch": 6.231003039513678, + "grad_norm": 0.050757069140672684, + "learning_rate": 0.0001147254132810311, + "loss": 1.0337, + "step": 45100 + }, + { + "epoch": 6.244819010776458, + "grad_norm": 0.065644271671772, + 
"learning_rate": 0.00011430512748669095, + "loss": 1.035, + "step": 45200 + }, + { + "epoch": 6.258634982039237, + "grad_norm": 0.06008651480078697, + "learning_rate": 0.00011388484169235078, + "loss": 1.0323, + "step": 45300 + }, + { + "epoch": 6.272450953302017, + "grad_norm": 0.050868868827819824, + "learning_rate": 0.00011346455589801063, + "loss": 1.0341, + "step": 45400 + }, + { + "epoch": 6.286266924564797, + "grad_norm": 0.0535401850938797, + "learning_rate": 0.00011304427010367049, + "loss": 1.0349, + "step": 45500 + }, + { + "epoch": 6.300082895827576, + "grad_norm": 0.07083383947610855, + "learning_rate": 0.00011262398430933034, + "loss": 1.0327, + "step": 45600 + }, + { + "epoch": 6.313898867090357, + "grad_norm": 0.06998474150896072, + "learning_rate": 0.00011220369851499018, + "loss": 1.035, + "step": 45700 + }, + { + "epoch": 6.3277148383531365, + "grad_norm": 0.06696050614118576, + "learning_rate": 0.00011178341272065002, + "loss": 1.0342, + "step": 45800 + }, + { + "epoch": 6.341530809615916, + "grad_norm": 0.050143785774707794, + "learning_rate": 0.00011136312692630989, + "loss": 1.0342, + "step": 45900 + }, + { + "epoch": 6.355346780878696, + "grad_norm": 0.066258005797863, + "learning_rate": 0.00011094284113196974, + "loss": 1.0368, + "step": 46000 + }, + { + "epoch": 6.3691627521414755, + "grad_norm": 0.057613175362348557, + "learning_rate": 0.00011052255533762957, + "loss": 1.0357, + "step": 46100 + }, + { + "epoch": 6.382978723404255, + "grad_norm": 0.07405593246221542, + "learning_rate": 0.00011010647240123283, + "loss": 1.033, + "step": 46200 + }, + { + "epoch": 6.396794694667035, + "grad_norm": 0.07005150616168976, + "learning_rate": 0.00010968618660689268, + "loss": 1.0329, + "step": 46300 + }, + { + "epoch": 6.4106106659298145, + "grad_norm": 0.057546067982912064, + "learning_rate": 0.00010926590081255253, + "loss": 1.033, + "step": 46400 + }, + { + "epoch": 6.424426637192594, + "grad_norm": 0.08016248792409897, + "learning_rate": 0.00010884561501821236, + "loss": 1.0389, + "step": 46500 + }, + { + "epoch": 6.438242608455375, + "grad_norm": 0.08346617966890335, + "learning_rate": 0.00010842532922387222, + "loss": 1.0332, + "step": 46600 + }, + { + "epoch": 6.452058579718154, + "grad_norm": 0.048157453536987305, + "learning_rate": 0.00010800504342953207, + "loss": 1.0342, + "step": 46700 + }, + { + "epoch": 6.465874550980934, + "grad_norm": 0.06816009432077408, + "learning_rate": 0.00010758475763519192, + "loss": 1.0357, + "step": 46800 + }, + { + "epoch": 6.479690522243714, + "grad_norm": 0.05210613086819649, + "learning_rate": 0.00010716447184085176, + "loss": 1.0345, + "step": 46900 + }, + { + "epoch": 6.4935064935064934, + "grad_norm": 0.08138227462768555, + "learning_rate": 0.00010674418604651162, + "loss": 1.035, + "step": 47000 + }, + { + "epoch": 6.507322464769273, + "grad_norm": 0.07494477927684784, + "learning_rate": 0.00010632390025217147, + "loss": 1.0361, + "step": 47100 + }, + { + "epoch": 6.521138436032053, + "grad_norm": 0.07473413646221161, + "learning_rate": 0.00010590361445783132, + "loss": 1.0339, + "step": 47200 + }, + { + "epoch": 6.5349544072948325, + "grad_norm": 0.07200802862644196, + "learning_rate": 0.00010548332866349115, + "loss": 1.0333, + "step": 47300 + }, + { + "epoch": 6.548770378557613, + "grad_norm": 0.06346756964921951, + "learning_rate": 0.00010506304286915101, + "loss": 1.0345, + "step": 47400 + }, + { + "epoch": 6.562586349820393, + "grad_norm": 0.06382066756486893, + "learning_rate": 0.00010464275707481086, + "loss": 
1.0352, + "step": 47500 + }, + { + "epoch": 6.576402321083172, + "grad_norm": 0.1000475063920021, + "learning_rate": 0.00010422247128047071, + "loss": 1.0344, + "step": 47600 + }, + { + "epoch": 6.590218292345952, + "grad_norm": 0.06456384807825089, + "learning_rate": 0.00010380218548613057, + "loss": 1.0356, + "step": 47700 + }, + { + "epoch": 6.604034263608732, + "grad_norm": 0.052929963916540146, + "learning_rate": 0.0001033818996917904, + "loss": 1.0343, + "step": 47800 + }, + { + "epoch": 6.617850234871511, + "grad_norm": 0.07275223731994629, + "learning_rate": 0.00010296161389745025, + "loss": 1.033, + "step": 47900 + }, + { + "epoch": 6.631666206134291, + "grad_norm": 0.060610584914684296, + "learning_rate": 0.0001025413281031101, + "loss": 1.0334, + "step": 48000 + }, + { + "epoch": 6.645482177397071, + "grad_norm": 0.0514766089618206, + "learning_rate": 0.00010212104230876997, + "loss": 1.0351, + "step": 48100 + }, + { + "epoch": 6.65929814865985, + "grad_norm": 0.08950326591730118, + "learning_rate": 0.0001017049593723732, + "loss": 1.0341, + "step": 48200 + }, + { + "epoch": 6.673114119922631, + "grad_norm": 0.052268847823143005, + "learning_rate": 0.00010128467357803306, + "loss": 1.0342, + "step": 48300 + }, + { + "epoch": 6.686930091185411, + "grad_norm": 0.059182267636060715, + "learning_rate": 0.00010086438778369291, + "loss": 1.0303, + "step": 48400 + }, + { + "epoch": 6.70074606244819, + "grad_norm": 0.06220945715904236, + "learning_rate": 0.00010044410198935274, + "loss": 1.032, + "step": 48500 + }, + { + "epoch": 6.71456203371097, + "grad_norm": 0.0486241914331913, + "learning_rate": 0.00010002381619501259, + "loss": 1.0338, + "step": 48600 + }, + { + "epoch": 6.72837800497375, + "grad_norm": 0.04813262075185776, + "learning_rate": 9.960353040067245e-05, + "loss": 1.0344, + "step": 48700 + }, + { + "epoch": 6.742193976236529, + "grad_norm": 0.04981222748756409, + "learning_rate": 9.91832446063323e-05, + "loss": 1.0347, + "step": 48800 + }, + { + "epoch": 6.756009947499309, + "grad_norm": 0.050560541450977325, + "learning_rate": 9.876295881199214e-05, + "loss": 1.0338, + "step": 48900 + }, + { + "epoch": 6.769825918762089, + "grad_norm": 0.05338674411177635, + "learning_rate": 9.834267301765199e-05, + "loss": 1.0369, + "step": 49000 + }, + { + "epoch": 6.783641890024869, + "grad_norm": 0.042156435549259186, + "learning_rate": 9.792238722331185e-05, + "loss": 1.0345, + "step": 49100 + }, + { + "epoch": 6.797457861287649, + "grad_norm": 0.0622396394610405, + "learning_rate": 9.75021014289717e-05, + "loss": 1.0321, + "step": 49200 + }, + { + "epoch": 6.8112738325504285, + "grad_norm": 0.08523661643266678, + "learning_rate": 9.708181563463155e-05, + "loss": 1.0317, + "step": 49300 + }, + { + "epoch": 6.825089803813208, + "grad_norm": 0.055176641792058945, + "learning_rate": 9.666152984029138e-05, + "loss": 1.0368, + "step": 49400 + }, + { + "epoch": 6.838905775075988, + "grad_norm": 0.07358380407094955, + "learning_rate": 9.624124404595124e-05, + "loss": 1.0318, + "step": 49500 + }, + { + "epoch": 6.8527217463387675, + "grad_norm": 0.055568769574165344, + "learning_rate": 9.582095825161109e-05, + "loss": 1.0343, + "step": 49600 + }, + { + "epoch": 6.866537717601547, + "grad_norm": 0.04249552637338638, + "learning_rate": 9.540067245727094e-05, + "loss": 1.0331, + "step": 49700 + }, + { + "epoch": 6.880353688864327, + "grad_norm": 0.05274058133363724, + "learning_rate": 9.498038666293077e-05, + "loss": 1.0351, + "step": 49800 + }, + { + "epoch": 6.8941696601271065, + 
"grad_norm": 0.04792112484574318, + "learning_rate": 9.456010086859064e-05, + "loss": 1.0333, + "step": 49900 + }, + { + "epoch": 6.907985631389887, + "grad_norm": 0.05513302981853485, + "learning_rate": 9.413981507425049e-05, + "loss": 1.0322, + "step": 50000 + }, + { + "epoch": 6.907985631389887, + "eval_accuracy": 0.5296076152096916, + "eval_loss": 1.0341060161590576, + "eval_runtime": 725.8939, + "eval_samples_per_second": 283.597, + "eval_steps_per_second": 8.864, + "step": 50000 + }, + { + "epoch": 6.921801602652667, + "grad_norm": 0.05296773836016655, + "learning_rate": 9.371952927991033e-05, + "loss": 1.031, + "step": 50100 + }, + { + "epoch": 6.935617573915446, + "grad_norm": 0.062248583883047104, + "learning_rate": 9.330344634351358e-05, + "loss": 1.0341, + "step": 50200 + }, + { + "epoch": 6.949433545178226, + "grad_norm": 0.07751675695180893, + "learning_rate": 9.288316054917343e-05, + "loss": 1.0352, + "step": 50300 + }, + { + "epoch": 6.963249516441006, + "grad_norm": 0.04984898492693901, + "learning_rate": 9.246287475483328e-05, + "loss": 1.0302, + "step": 50400 + }, + { + "epoch": 6.977065487703785, + "grad_norm": 0.04315504804253578, + "learning_rate": 9.204258896049314e-05, + "loss": 1.0327, + "step": 50500 + }, + { + "epoch": 6.990881458966565, + "grad_norm": 0.053620435297489166, + "learning_rate": 9.162230316615297e-05, + "loss": 1.0328, + "step": 50600 + }, + { + "epoch": 7.004697430229345, + "grad_norm": 0.04611975699663162, + "learning_rate": 9.120201737181282e-05, + "loss": 1.0336, + "step": 50700 + }, + { + "epoch": 7.018513401492125, + "grad_norm": 0.04269848018884659, + "learning_rate": 9.078173157747267e-05, + "loss": 1.0282, + "step": 50800 + }, + { + "epoch": 7.032329372754905, + "grad_norm": 0.055365532636642456, + "learning_rate": 9.036144578313253e-05, + "loss": 1.0339, + "step": 50900 + }, + { + "epoch": 7.046145344017685, + "grad_norm": 0.06129321828484535, + "learning_rate": 8.994115998879237e-05, + "loss": 1.0304, + "step": 51000 + }, + { + "epoch": 7.059961315280464, + "grad_norm": 0.06094348803162575, + "learning_rate": 8.952507705239563e-05, + "loss": 1.0288, + "step": 51100 + }, + { + "epoch": 7.073777286543244, + "grad_norm": 0.048849135637283325, + "learning_rate": 8.910479125805548e-05, + "loss": 1.0322, + "step": 51200 + }, + { + "epoch": 7.087593257806024, + "grad_norm": 0.05081125721335411, + "learning_rate": 8.868450546371531e-05, + "loss": 1.0303, + "step": 51300 + }, + { + "epoch": 7.101409229068803, + "grad_norm": 0.07727497071027756, + "learning_rate": 8.826421966937516e-05, + "loss": 1.03, + "step": 51400 + }, + { + "epoch": 7.115225200331583, + "grad_norm": 0.06357153505086899, + "learning_rate": 8.784393387503502e-05, + "loss": 1.0342, + "step": 51500 + }, + { + "epoch": 7.129041171594363, + "grad_norm": 0.05598052963614464, + "learning_rate": 8.742364808069487e-05, + "loss": 1.0312, + "step": 51600 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.06753697246313095, + "learning_rate": 8.70033622863547e-05, + "loss": 1.0306, + "step": 51700 + }, + { + "epoch": 7.156673114119923, + "grad_norm": 0.06586912274360657, + "learning_rate": 8.658307649201455e-05, + "loss": 1.0311, + "step": 51800 + }, + { + "epoch": 7.170489085382703, + "grad_norm": 0.10361455380916595, + "learning_rate": 8.616279069767442e-05, + "loss": 1.0326, + "step": 51900 + }, + { + "epoch": 7.184305056645482, + "grad_norm": 0.09442713856697083, + "learning_rate": 8.574250490333426e-05, + "loss": 1.0339, + "step": 52000 + }, + { + "epoch": 7.198121027908262, + 
"grad_norm": 0.08114325255155563, + "learning_rate": 8.532221910899411e-05, + "loss": 1.0335, + "step": 52100 + }, + { + "epoch": 7.211936999171042, + "grad_norm": 0.054252710193395615, + "learning_rate": 8.490193331465395e-05, + "loss": 1.0316, + "step": 52200 + }, + { + "epoch": 7.225752970433821, + "grad_norm": 0.059643086045980453, + "learning_rate": 8.448164752031381e-05, + "loss": 1.027, + "step": 52300 + }, + { + "epoch": 7.239568941696601, + "grad_norm": 0.045472096651792526, + "learning_rate": 8.406136172597366e-05, + "loss": 1.0311, + "step": 52400 + }, + { + "epoch": 7.2533849129593815, + "grad_norm": 0.0669686570763588, + "learning_rate": 8.36410759316335e-05, + "loss": 1.0309, + "step": 52500 + }, + { + "epoch": 7.267200884222161, + "grad_norm": 0.0454520583152771, + "learning_rate": 8.322079013729334e-05, + "loss": 1.0327, + "step": 52600 + }, + { + "epoch": 7.281016855484941, + "grad_norm": 0.05776028707623482, + "learning_rate": 8.28005043429532e-05, + "loss": 1.0318, + "step": 52700 + }, + { + "epoch": 7.2948328267477205, + "grad_norm": 0.051905229687690735, + "learning_rate": 8.238021854861305e-05, + "loss": 1.0313, + "step": 52800 + }, + { + "epoch": 7.3086487980105, + "grad_norm": 0.056912437081336975, + "learning_rate": 8.19599327542729e-05, + "loss": 1.0325, + "step": 52900 + }, + { + "epoch": 7.32246476927328, + "grad_norm": 0.04940250515937805, + "learning_rate": 8.153964695993274e-05, + "loss": 1.0323, + "step": 53000 + }, + { + "epoch": 7.3362807405360595, + "grad_norm": 0.04186444729566574, + "learning_rate": 8.11193611655926e-05, + "loss": 1.0285, + "step": 53100 + }, + { + "epoch": 7.350096711798839, + "grad_norm": 0.041809357702732086, + "learning_rate": 8.069907537125245e-05, + "loss": 1.0289, + "step": 53200 + }, + { + "epoch": 7.363912683061619, + "grad_norm": 0.05794375389814377, + "learning_rate": 8.02787895769123e-05, + "loss": 1.031, + "step": 53300 + }, + { + "epoch": 7.377728654324399, + "grad_norm": 0.08333911001682281, + "learning_rate": 7.985850378257213e-05, + "loss": 1.0316, + "step": 53400 + }, + { + "epoch": 7.391544625587179, + "grad_norm": 0.06473658233880997, + "learning_rate": 7.943821798823199e-05, + "loss": 1.0317, + "step": 53500 + }, + { + "epoch": 7.405360596849959, + "grad_norm": 0.05173886939883232, + "learning_rate": 7.901793219389184e-05, + "loss": 1.0308, + "step": 53600 + }, + { + "epoch": 7.419176568112738, + "grad_norm": 0.06362345069646835, + "learning_rate": 7.859764639955169e-05, + "loss": 1.0324, + "step": 53700 + }, + { + "epoch": 7.432992539375518, + "grad_norm": 0.054053716361522675, + "learning_rate": 7.817736060521152e-05, + "loss": 1.0303, + "step": 53800 + }, + { + "epoch": 7.446808510638298, + "grad_norm": 0.048420459032058716, + "learning_rate": 7.775707481087139e-05, + "loss": 1.0299, + "step": 53900 + }, + { + "epoch": 7.460624481901077, + "grad_norm": 0.0606950968503952, + "learning_rate": 7.733678901653123e-05, + "loss": 1.0317, + "step": 54000 + }, + { + "epoch": 7.474440453163857, + "grad_norm": 0.06072583049535751, + "learning_rate": 7.691650322219108e-05, + "loss": 1.033, + "step": 54100 + }, + { + "epoch": 7.488256424426638, + "grad_norm": 0.05064817890524864, + "learning_rate": 7.649621742785093e-05, + "loss": 1.0287, + "step": 54200 + }, + { + "epoch": 7.502072395689417, + "grad_norm": 0.09318757057189941, + "learning_rate": 7.607593163351078e-05, + "loss": 1.0296, + "step": 54300 + }, + { + "epoch": 7.515888366952197, + "grad_norm": 0.0935215950012207, + "learning_rate": 7.565564583917063e-05, + "loss": 
1.0322, + "step": 54400 + }, + { + "epoch": 7.529704338214977, + "grad_norm": 0.07255256175994873, + "learning_rate": 7.523536004483048e-05, + "loss": 1.0333, + "step": 54500 + }, + { + "epoch": 7.543520309477756, + "grad_norm": 0.05486008897423744, + "learning_rate": 7.481507425049033e-05, + "loss": 1.032, + "step": 54600 + }, + { + "epoch": 7.557336280740536, + "grad_norm": 0.0525212287902832, + "learning_rate": 7.439478845615017e-05, + "loss": 1.0293, + "step": 54700 + }, + { + "epoch": 7.571152252003316, + "grad_norm": 0.047569695860147476, + "learning_rate": 7.397450266181002e-05, + "loss": 1.0282, + "step": 54800 + }, + { + "epoch": 7.584968223266095, + "grad_norm": 0.06165711581707001, + "learning_rate": 7.355421686746987e-05, + "loss": 1.0312, + "step": 54900 + }, + { + "epoch": 7.598784194528875, + "grad_norm": 0.0578945092856884, + "learning_rate": 7.313393107312972e-05, + "loss": 1.0307, + "step": 55000 + }, + { + "epoch": 7.598784194528875, + "eval_accuracy": 0.5305025000901846, + "eval_loss": 1.0327985286712646, + "eval_runtime": 731.5754, + "eval_samples_per_second": 281.394, + "eval_steps_per_second": 8.795, + "step": 55000 + }, + { + "epoch": 7.612600165791655, + "grad_norm": 0.0795338973402977, + "learning_rate": 7.271784813673297e-05, + "loss": 1.0294, + "step": 55100 + }, + { + "epoch": 7.626416137054435, + "grad_norm": 0.06103779003024101, + "learning_rate": 7.229756234239283e-05, + "loss": 1.033, + "step": 55200 + }, + { + "epoch": 7.640232108317215, + "grad_norm": 0.0635315552353859, + "learning_rate": 7.187727654805266e-05, + "loss": 1.0296, + "step": 55300 + }, + { + "epoch": 7.654048079579995, + "grad_norm": 0.05289231240749359, + "learning_rate": 7.145699075371253e-05, + "loss": 1.034, + "step": 55400 + }, + { + "epoch": 7.667864050842774, + "grad_norm": 0.07801427692174911, + "learning_rate": 7.103670495937236e-05, + "loss": 1.0332, + "step": 55500 + }, + { + "epoch": 7.681680022105554, + "grad_norm": 0.07564268261194229, + "learning_rate": 7.061641916503222e-05, + "loss": 1.0299, + "step": 55600 + }, + { + "epoch": 7.695495993368334, + "grad_norm": 0.04168133810162544, + "learning_rate": 7.019613337069206e-05, + "loss": 1.03, + "step": 55700 + }, + { + "epoch": 7.709311964631113, + "grad_norm": 0.11210035532712936, + "learning_rate": 6.977584757635192e-05, + "loss": 1.0301, + "step": 55800 + }, + { + "epoch": 7.723127935893894, + "grad_norm": 0.09023060649633408, + "learning_rate": 6.935556178201175e-05, + "loss": 1.0285, + "step": 55900 + }, + { + "epoch": 7.7369439071566735, + "grad_norm": 0.05271260067820549, + "learning_rate": 6.893527598767162e-05, + "loss": 1.0315, + "step": 56000 + }, + { + "epoch": 7.750759878419453, + "grad_norm": 0.06293012201786041, + "learning_rate": 6.851499019333145e-05, + "loss": 1.0286, + "step": 56100 + }, + { + "epoch": 7.764575849682233, + "grad_norm": 0.04555558040738106, + "learning_rate": 6.809470439899131e-05, + "loss": 1.0308, + "step": 56200 + }, + { + "epoch": 7.7783918209450125, + "grad_norm": 0.042364273220300674, + "learning_rate": 6.767441860465115e-05, + "loss": 1.0311, + "step": 56300 + }, + { + "epoch": 7.792207792207792, + "grad_norm": 0.05084213241934776, + "learning_rate": 6.725413281031101e-05, + "loss": 1.0298, + "step": 56400 + }, + { + "epoch": 7.806023763470572, + "grad_norm": 0.059168051928281784, + "learning_rate": 6.683384701597085e-05, + "loss": 1.0303, + "step": 56500 + }, + { + "epoch": 7.8198397347333515, + "grad_norm": 0.05535740405321121, + "learning_rate": 6.641356122163071e-05, + "loss": 
1.0306, + "step": 56600 + }, + { + "epoch": 7.833655705996131, + "grad_norm": 0.06625715643167496, + "learning_rate": 6.599327542729054e-05, + "loss": 1.0283, + "step": 56700 + }, + { + "epoch": 7.847471677258911, + "grad_norm": 0.04644458368420601, + "learning_rate": 6.55729896329504e-05, + "loss": 1.0289, + "step": 56800 + }, + { + "epoch": 7.861287648521691, + "grad_norm": 0.05319574847817421, + "learning_rate": 6.515270383861024e-05, + "loss": 1.0303, + "step": 56900 + }, + { + "epoch": 7.875103619784471, + "grad_norm": 0.06394356489181519, + "learning_rate": 6.47324180442701e-05, + "loss": 1.0315, + "step": 57000 + }, + { + "epoch": 7.888919591047251, + "grad_norm": 0.0535539835691452, + "learning_rate": 6.431633510787335e-05, + "loss": 1.0323, + "step": 57100 + }, + { + "epoch": 7.90273556231003, + "grad_norm": 0.05220150947570801, + "learning_rate": 6.38960493135332e-05, + "loss": 1.032, + "step": 57200 + }, + { + "epoch": 7.91655153357281, + "grad_norm": 0.04795517399907112, + "learning_rate": 6.347576351919304e-05, + "loss": 1.03, + "step": 57300 + }, + { + "epoch": 7.93036750483559, + "grad_norm": 0.0748489499092102, + "learning_rate": 6.30554777248529e-05, + "loss": 1.0338, + "step": 57400 + }, + { + "epoch": 7.944183476098369, + "grad_norm": 0.08164035528898239, + "learning_rate": 6.263519193051274e-05, + "loss": 1.0318, + "step": 57500 + }, + { + "epoch": 7.95799944736115, + "grad_norm": 0.0764247477054596, + "learning_rate": 6.221490613617259e-05, + "loss": 1.0278, + "step": 57600 + }, + { + "epoch": 7.97181541862393, + "grad_norm": 0.05609816685318947, + "learning_rate": 6.179462034183244e-05, + "loss": 1.0307, + "step": 57700 + }, + { + "epoch": 7.985631389886709, + "grad_norm": 0.05001819133758545, + "learning_rate": 6.137433454749229e-05, + "loss": 1.0297, + "step": 57800 + }, + { + "epoch": 7.999447361149489, + "grad_norm": 0.10084258019924164, + "learning_rate": 6.0954048753152136e-05, + "loss": 1.0339, + "step": 57900 + }, + { + "epoch": 8.013263332412269, + "grad_norm": 0.07571733742952347, + "learning_rate": 6.0533762958811985e-05, + "loss": 1.0305, + "step": 58000 + }, + { + "epoch": 8.027079303675048, + "grad_norm": 0.059294216334819794, + "learning_rate": 6.011347716447183e-05, + "loss": 1.026, + "step": 58100 + }, + { + "epoch": 8.040895274937828, + "grad_norm": 0.04530787095427513, + "learning_rate": 5.969319137013168e-05, + "loss": 1.0282, + "step": 58200 + }, + { + "epoch": 8.054711246200608, + "grad_norm": 0.05052864924073219, + "learning_rate": 5.927290557579153e-05, + "loss": 1.0271, + "step": 58300 + }, + { + "epoch": 8.068527217463387, + "grad_norm": 0.04923342168331146, + "learning_rate": 5.885261978145138e-05, + "loss": 1.029, + "step": 58400 + }, + { + "epoch": 8.082343188726167, + "grad_norm": 0.04905908182263374, + "learning_rate": 5.843233398711123e-05, + "loss": 1.0277, + "step": 58500 + }, + { + "epoch": 8.096159159988947, + "grad_norm": 0.046151451766490936, + "learning_rate": 5.801204819277108e-05, + "loss": 1.0289, + "step": 58600 + }, + { + "epoch": 8.109975131251726, + "grad_norm": 0.06011873856186867, + "learning_rate": 5.7591762398430925e-05, + "loss": 1.0245, + "step": 58700 + }, + { + "epoch": 8.123791102514506, + "grad_norm": 0.06879663467407227, + "learning_rate": 5.717147660409078e-05, + "loss": 1.0271, + "step": 58800 + }, + { + "epoch": 8.137607073777286, + "grad_norm": 0.04675479233264923, + "learning_rate": 5.675119080975063e-05, + "loss": 1.0263, + "step": 58900 + }, + { + "epoch": 8.151423045040067, + "grad_norm": 
0.08497285097837448, + "learning_rate": 5.633090501541048e-05, + "loss": 1.0287, + "step": 59000 + }, + { + "epoch": 8.165239016302847, + "grad_norm": 0.07600156217813492, + "learning_rate": 5.5910619221070326e-05, + "loss": 1.0262, + "step": 59100 + }, + { + "epoch": 8.179054987565626, + "grad_norm": 0.04951677843928337, + "learning_rate": 5.549453628467357e-05, + "loss": 1.0283, + "step": 59200 + }, + { + "epoch": 8.192870958828406, + "grad_norm": 0.05662324279546738, + "learning_rate": 5.507425049033342e-05, + "loss": 1.0295, + "step": 59300 + }, + { + "epoch": 8.206686930091186, + "grad_norm": 0.05791959911584854, + "learning_rate": 5.465396469599327e-05, + "loss": 1.0285, + "step": 59400 + }, + { + "epoch": 8.220502901353965, + "grad_norm": 0.058768805116415024, + "learning_rate": 5.423367890165312e-05, + "loss": 1.0272, + "step": 59500 + }, + { + "epoch": 8.234318872616745, + "grad_norm": 0.05399869754910469, + "learning_rate": 5.381339310731297e-05, + "loss": 1.0301, + "step": 59600 + }, + { + "epoch": 8.248134843879525, + "grad_norm": 0.06434085965156555, + "learning_rate": 5.3393107312972814e-05, + "loss": 1.0277, + "step": 59700 + }, + { + "epoch": 8.261950815142304, + "grad_norm": 0.054656483232975006, + "learning_rate": 5.297282151863267e-05, + "loss": 1.0295, + "step": 59800 + }, + { + "epoch": 8.275766786405084, + "grad_norm": 0.04396641626954079, + "learning_rate": 5.255253572429251e-05, + "loss": 1.0276, + "step": 59900 + }, + { + "epoch": 8.289582757667864, + "grad_norm": 0.058395449072122574, + "learning_rate": 5.2132249929952366e-05, + "loss": 1.0267, + "step": 60000 + }, + { + "epoch": 8.289582757667864, + "eval_accuracy": 0.5312832658873073, + "eval_loss": 1.0315501689910889, + "eval_runtime": 729.415, + "eval_samples_per_second": 282.228, + "eval_steps_per_second": 8.821, + "step": 60000 + }, + { + "epoch": 8.303398728930643, + "grad_norm": 0.06770013272762299, + "learning_rate": 5.171196413561221e-05, + "loss": 1.029, + "step": 60100 + }, + { + "epoch": 8.317214700193423, + "grad_norm": 0.06161688268184662, + "learning_rate": 5.1291678341272063e-05, + "loss": 1.0242, + "step": 60200 + }, + { + "epoch": 8.331030671456203, + "grad_norm": 0.04140911623835564, + "learning_rate": 5.087139254693191e-05, + "loss": 1.029, + "step": 60300 + }, + { + "epoch": 8.344846642718982, + "grad_norm": 0.07091998308897018, + "learning_rate": 5.045110675259176e-05, + "loss": 1.0268, + "step": 60400 + }, + { + "epoch": 8.358662613981762, + "grad_norm": 0.05135732889175415, + "learning_rate": 5.003082095825161e-05, + "loss": 1.0264, + "step": 60500 + }, + { + "epoch": 8.372478585244544, + "grad_norm": 0.05828474089503288, + "learning_rate": 4.961053516391146e-05, + "loss": 1.0271, + "step": 60600 + }, + { + "epoch": 8.386294556507323, + "grad_norm": 0.05920015275478363, + "learning_rate": 4.9190249369571306e-05, + "loss": 1.0263, + "step": 60700 + }, + { + "epoch": 8.400110527770103, + "grad_norm": 0.048502273857593536, + "learning_rate": 4.8769963575231155e-05, + "loss": 1.029, + "step": 60800 + }, + { + "epoch": 8.413926499032883, + "grad_norm": 0.049063604325056076, + "learning_rate": 4.8349677780891e-05, + "loss": 1.0294, + "step": 60900 + }, + { + "epoch": 8.427742470295662, + "grad_norm": 0.05672093480825424, + "learning_rate": 4.792939198655085e-05, + "loss": 1.0297, + "step": 61000 + }, + { + "epoch": 8.441558441558442, + "grad_norm": 0.06934633105993271, + "learning_rate": 4.75091061922107e-05, + "loss": 1.0261, + "step": 61100 + }, + { + "epoch": 8.455374412821222, + "grad_norm": 
0.04098910838365555, + "learning_rate": 4.709302325581395e-05, + "loss": 1.0292, + "step": 61200 + }, + { + "epoch": 8.469190384084001, + "grad_norm": 0.06421385705471039, + "learning_rate": 4.6672737461473794e-05, + "loss": 1.0315, + "step": 61300 + }, + { + "epoch": 8.483006355346781, + "grad_norm": 0.05238828435540199, + "learning_rate": 4.625245166713365e-05, + "loss": 1.0309, + "step": 61400 + }, + { + "epoch": 8.49682232660956, + "grad_norm": 0.049910806119441986, + "learning_rate": 4.583216587279349e-05, + "loss": 1.0257, + "step": 61500 + }, + { + "epoch": 8.51063829787234, + "grad_norm": 0.06672196090221405, + "learning_rate": 4.541188007845335e-05, + "loss": 1.0328, + "step": 61600 + }, + { + "epoch": 8.52445426913512, + "grad_norm": 0.05466538295149803, + "learning_rate": 4.4991594284113195e-05, + "loss": 1.0284, + "step": 61700 + }, + { + "epoch": 8.5382702403979, + "grad_norm": 0.05218784883618355, + "learning_rate": 4.4571308489773044e-05, + "loss": 1.0285, + "step": 61800 + }, + { + "epoch": 8.55208621166068, + "grad_norm": 0.04263923689723015, + "learning_rate": 4.415102269543289e-05, + "loss": 1.0307, + "step": 61900 + }, + { + "epoch": 8.565902182923459, + "grad_norm": 0.054478637874126434, + "learning_rate": 4.373073690109274e-05, + "loss": 1.0291, + "step": 62000 + }, + { + "epoch": 8.579718154186239, + "grad_norm": 0.05667020007967949, + "learning_rate": 4.331045110675259e-05, + "loss": 1.0296, + "step": 62100 + }, + { + "epoch": 8.593534125449018, + "grad_norm": 0.0490160770714283, + "learning_rate": 4.289016531241244e-05, + "loss": 1.029, + "step": 62200 + }, + { + "epoch": 8.607350096711798, + "grad_norm": 0.049655403941869736, + "learning_rate": 4.246987951807229e-05, + "loss": 1.0298, + "step": 62300 + }, + { + "epoch": 8.62116606797458, + "grad_norm": 0.047429408878088, + "learning_rate": 4.2049593723732135e-05, + "loss": 1.0277, + "step": 62400 + }, + { + "epoch": 8.634982039237359, + "grad_norm": 0.05222218483686447, + "learning_rate": 4.1629307929391984e-05, + "loss": 1.0292, + "step": 62500 + }, + { + "epoch": 8.648798010500139, + "grad_norm": 0.05841238424181938, + "learning_rate": 4.120902213505183e-05, + "loss": 1.029, + "step": 62600 + }, + { + "epoch": 8.662613981762918, + "grad_norm": 0.0452195480465889, + "learning_rate": 4.078873634071168e-05, + "loss": 1.0265, + "step": 62700 + }, + { + "epoch": 8.676429953025698, + "grad_norm": 0.049306340515613556, + "learning_rate": 4.036845054637153e-05, + "loss": 1.0308, + "step": 62800 + }, + { + "epoch": 8.690245924288478, + "grad_norm": 0.050401389598846436, + "learning_rate": 3.994816475203138e-05, + "loss": 1.0294, + "step": 62900 + }, + { + "epoch": 8.704061895551257, + "grad_norm": 0.04503024369478226, + "learning_rate": 3.952787895769123e-05, + "loss": 1.0291, + "step": 63000 + }, + { + "epoch": 8.717877866814037, + "grad_norm": 0.0738733783364296, + "learning_rate": 3.9107593163351075e-05, + "loss": 1.0279, + "step": 63100 + }, + { + "epoch": 8.731693838076817, + "grad_norm": 0.04586975276470184, + "learning_rate": 3.869151022695433e-05, + "loss": 1.026, + "step": 63200 + }, + { + "epoch": 8.745509809339596, + "grad_norm": 0.04988343268632889, + "learning_rate": 3.8271224432614176e-05, + "loss": 1.0257, + "step": 63300 + }, + { + "epoch": 8.759325780602376, + "grad_norm": 0.07822008430957794, + "learning_rate": 3.7850938638274025e-05, + "loss": 1.0254, + "step": 63400 + }, + { + "epoch": 8.773141751865156, + "grad_norm": 0.058496229350566864, + "learning_rate": 3.743065284393387e-05, + "loss": 1.0263, + 
"step": 63500 + }, + { + "epoch": 8.786957723127935, + "grad_norm": 0.04458677023649216, + "learning_rate": 3.701036704959372e-05, + "loss": 1.0292, + "step": 63600 + }, + { + "epoch": 8.800773694390715, + "grad_norm": 0.06616061180830002, + "learning_rate": 3.659008125525357e-05, + "loss": 1.0309, + "step": 63700 + }, + { + "epoch": 8.814589665653495, + "grad_norm": 0.06473194807767868, + "learning_rate": 3.616979546091342e-05, + "loss": 1.0265, + "step": 63800 + }, + { + "epoch": 8.828405636916274, + "grad_norm": 0.047700874507427216, + "learning_rate": 3.574950966657327e-05, + "loss": 1.0303, + "step": 63900 + }, + { + "epoch": 8.842221608179056, + "grad_norm": 0.055733323097229004, + "learning_rate": 3.5329223872233116e-05, + "loss": 1.0279, + "step": 64000 + }, + { + "epoch": 8.856037579441836, + "grad_norm": 0.04398791491985321, + "learning_rate": 3.4908938077892965e-05, + "loss": 1.0284, + "step": 64100 + }, + { + "epoch": 8.869853550704615, + "grad_norm": 0.08901511132717133, + "learning_rate": 3.448865228355281e-05, + "loss": 1.0283, + "step": 64200 + }, + { + "epoch": 8.883669521967395, + "grad_norm": 0.05853118374943733, + "learning_rate": 3.406836648921266e-05, + "loss": 1.0291, + "step": 64300 + }, + { + "epoch": 8.897485493230175, + "grad_norm": 0.043922308832407, + "learning_rate": 3.364808069487251e-05, + "loss": 1.0294, + "step": 64400 + }, + { + "epoch": 8.911301464492954, + "grad_norm": 0.04332153871655464, + "learning_rate": 3.322779490053236e-05, + "loss": 1.0277, + "step": 64500 + }, + { + "epoch": 8.925117435755734, + "grad_norm": 0.09197825193405151, + "learning_rate": 3.280750910619221e-05, + "loss": 1.0295, + "step": 64600 + }, + { + "epoch": 8.938933407018514, + "grad_norm": 0.05589272826910019, + "learning_rate": 3.2387223311852056e-05, + "loss": 1.0274, + "step": 64700 + }, + { + "epoch": 8.952749378281293, + "grad_norm": 0.06028933823108673, + "learning_rate": 3.1966937517511904e-05, + "loss": 1.0285, + "step": 64800 + }, + { + "epoch": 8.966565349544073, + "grad_norm": 0.05357721447944641, + "learning_rate": 3.154665172317175e-05, + "loss": 1.027, + "step": 64900 + }, + { + "epoch": 8.980381320806853, + "grad_norm": 0.07362578809261322, + "learning_rate": 3.11263659288316e-05, + "loss": 1.0273, + "step": 65000 + }, + { + "epoch": 8.980381320806853, + "eval_accuracy": 0.5319501927585898, + "eval_loss": 1.0305662155151367, + "eval_runtime": 722.9505, + "eval_samples_per_second": 284.751, + "eval_steps_per_second": 8.9, + "step": 65000 + } + ], + "logging_steps": 100, + "max_steps": 72380, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.695798824691565e+18, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-65000/training_args.bin b/checkpoint-65000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..45f2f12b913e85908e1565ce4b13c8763ea7a1ca --- /dev/null +++ b/checkpoint-65000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19e8fb7657adc13bdcaf635b1c6fb616dd082a6870cdd6aecd3b669d8cac0873 +size 5304 diff --git a/checkpoint-65000/vocab.json b/checkpoint-65000/vocab.json new file mode 100644 index 
0000000000000000000000000000000000000000..d0809a2e3e28811023f05ed415122e24681bc9d1 --- /dev/null +++ b/checkpoint-65000/vocab.json @@ -0,0 +1 @@ +{"<|endoftext|>":0,"A":1,"C":2,"G":3,"T":4} \ No newline at end of file diff --git a/checkpoint-70000/config.json b/checkpoint-70000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..192ba05a8714569e728cced45eaebf4106596353 --- /dev/null +++ b/checkpoint-70000/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 50256, + "embd_pdrop": 0.1, + "eos_token_id": 50256, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 768, + "n_head": 12, + "n_inner": null, + "n_layer": 12, + "n_positions": 1024, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.52.0.dev0", + "use_cache": true, + "vocab_size": 5 +} diff --git a/checkpoint-70000/generation_config.json b/checkpoint-70000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c829fa47bd90bfe00fdb37ed6d41324f6fb81f63 --- /dev/null +++ b/checkpoint-70000/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 50256, + "eos_token_id": 50256, + "transformers_version": "4.52.0.dev0" +} diff --git a/checkpoint-70000/merges.txt b/checkpoint-70000/merges.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e7f1fd94996c8e2b65adea828af1b398eace61f --- /dev/null +++ b/checkpoint-70000/merges.txt @@ -0,0 +1 @@ +#version: 0.2 diff --git a/checkpoint-70000/model.safetensors b/checkpoint-70000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c09e2b5644e78b2f38d42b31f00f85b094fe2fe0 --- /dev/null +++ b/checkpoint-70000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7694d2d4deca5552713678b7ba3a98015adc907157087d06374e8759397a9704 +size 343400064 diff --git a/checkpoint-70000/optimizer.pt b/checkpoint-70000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea726eca95ae8fdc39261a9f13a5772f8004b6dd --- /dev/null +++ b/checkpoint-70000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:044069d2fe8a5554e5399ab1cbb2bf808bff35ff185a0141569bfcb3bd777b95 +size 686894010 diff --git a/checkpoint-70000/rng_state_0.pth b/checkpoint-70000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ad5c4273a673027492b04abaddad6c7bc8a9bea9 --- /dev/null +++ b/checkpoint-70000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dec82c95e8817f8b4a9d63e77c0f6b5b52a0e039a9397117bc95a18374acfb7 +size 14960 diff --git a/checkpoint-70000/rng_state_1.pth b/checkpoint-70000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7c2d1db19f0fea5f367fded8e1f213d79a09188d --- /dev/null +++ b/checkpoint-70000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdd996c2bdddb8f92975bb79265833d12d6478a586ba1b97e4281358d5dca6f3 +size 14960 diff --git a/checkpoint-70000/rng_state_2.pth b/checkpoint-70000/rng_state_2.pth new file 
mode 100644 index 0000000000000000000000000000000000000000..bd02202424dd3e5fe69deae095fadc3c8cd415ce --- /dev/null +++ b/checkpoint-70000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f89b14efe50937d062b0eb38eb7917537ab35011b36cf6050f8736af97eda57c +size 14960 diff --git a/checkpoint-70000/rng_state_3.pth b/checkpoint-70000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..607902da0b54d0631bc2d82d2f6cab03dfbc145e --- /dev/null +++ b/checkpoint-70000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acbb453cfd80542114994513f8203a5b3b87f5757bf87f3caf54bee3ab7609eb +size 14960 diff --git a/checkpoint-70000/scaler.pt b/checkpoint-70000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7390ee9dc612c03ee1029a2abbaed8ec232173e --- /dev/null +++ b/checkpoint-70000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73c747ac36afcda69f34e348853d519b2859b2dbf2765547178b534ad53d9215 +size 988 diff --git a/checkpoint-70000/scheduler.pt b/checkpoint-70000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5bfac160ca086f3d06d6f3b42cdeb3a15650bdc --- /dev/null +++ b/checkpoint-70000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac1e915b201c2bf73c557406f586b44f202512a9f35a7541fc5a357a0622ce3f +size 1064 diff --git a/checkpoint-70000/special_tokens_map.json b/checkpoint-70000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..773bd68cf0900427f8d69dd974724e3abb9a08a9 --- /dev/null +++ b/checkpoint-70000/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-70000/tokenizer.json b/checkpoint-70000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..ac40039af791f0fd130b3d36c3677a156b2de089 --- /dev/null +++ b/checkpoint-70000/tokenizer.json @@ -0,0 +1,53 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": false, + "use_regex": true + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": "", + "end_of_word_suffix": "", + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "<|endoftext|>": 0, + "A": 1, + "C": 2, + "G": 3, + "T": 4 + }, + "merges": [] + } +} \ No newline at end of file diff --git a/checkpoint-70000/tokenizer_config.json b/checkpoint-70000/tokenizer_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..7c4e19588fa8b4faceab450a1d7e8dae1ce87f7c --- /dev/null +++ b/checkpoint-70000/tokenizer_config.json @@ -0,0 +1,21 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|endoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "<|endoftext|>" +} diff --git a/checkpoint-70000/trainer_state.json b/checkpoint-70000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c5b26380b1809cf9d1690fdf2db57c9fc31c6af0 --- /dev/null +++ b/checkpoint-70000/trainer_state.json @@ -0,0 +1,5067 @@ +{ + "best_global_step": 70000, + "best_metric": 1.029943823814392, + "best_model_checkpoint": "./dna_model/checkpoint-70000", + "epoch": 9.671179883945841, + "eval_steps": 5000, + "global_step": 70000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00013815971262779773, + "grad_norm": 42.580928802490234, + "learning_rate": 0.0, + "loss": 1.6625, + "step": 1 + }, + { + "epoch": 0.013815971262779773, + "grad_norm": 2.4457767009735107, + "learning_rate": 2.97e-05, + "loss": 1.36, + "step": 100 + }, + { + "epoch": 0.027631942525559547, + "grad_norm": 0.5432274341583252, + "learning_rate": 5.97e-05, + "loss": 1.3309, + "step": 200 + }, + { + "epoch": 0.04144791378833932, + "grad_norm": 0.825528621673584, + "learning_rate": 8.969999999999998e-05, + "loss": 1.3234, + "step": 300 + }, + { + "epoch": 0.055263885051119094, + "grad_norm": 0.4912604093551636, + "learning_rate": 0.0001197, + "loss": 1.3249, + "step": 400 + }, + { + "epoch": 0.06907985631389886, + "grad_norm": 0.9077563881874084, + "learning_rate": 0.00014969999999999998, + "loss": 1.3153, + "step": 500 + }, + { + "epoch": 0.08289582757667864, + "grad_norm": 0.8954246640205383, + "learning_rate": 0.00017969999999999998, + "loss": 1.3123, + "step": 600 + }, + { + "epoch": 0.09671179883945841, + "grad_norm": 0.5876831412315369, + "learning_rate": 0.00020969999999999997, + "loss": 1.3098, + "step": 700 + }, + { + "epoch": 0.11052777010223819, + "grad_norm": 0.426789253950119, + "learning_rate": 0.0002397, + "loss": 1.3072, + "step": 800 + }, + { + "epoch": 0.12434374136501795, + "grad_norm": 0.3324718177318573, + "learning_rate": 0.0002697, + "loss": 1.3037, + "step": 900 + }, + { + "epoch": 0.13815971262779772, + "grad_norm": 0.23672613501548767, + "learning_rate": 0.00029969999999999997, + "loss": 1.2991, + "step": 1000 + }, + { + "epoch": 0.1519756838905775, + "grad_norm": 0.4699796438217163, + "learning_rate": 0.00029958391706360325, + "loss": 1.2923, + "step": 1100 + }, + { + "epoch": 0.16579165515335728, + "grad_norm": 0.684186577796936, + "learning_rate": 0.00029916363126926307, + "loss": 1.2825, + "step": 1200 + }, + { + "epoch": 0.17960762641613706, + "grad_norm": 0.3944641649723053, + "learning_rate": 0.00029874334547492294, + "loss": 1.2678, + "step": 1300 + }, + { + "epoch": 0.19342359767891681, + "grad_norm": 1.1556001901626587, + "learning_rate": 0.00029832305968058276, + "loss": 1.2541, + "step": 1400 + }, + { + "epoch": 0.2072395689416966, + "grad_norm": 0.39745599031448364, + "learning_rate": 0.0002979027738862426, + "loss": 
1.2439, + "step": 1500 + }, + { + "epoch": 0.22105554020447638, + "grad_norm": 0.5201444029808044, + "learning_rate": 0.00029748248809190246, + "loss": 1.2329, + "step": 1600 + }, + { + "epoch": 0.23487151146725616, + "grad_norm": 0.2168777734041214, + "learning_rate": 0.00029706220229756234, + "loss": 1.2268, + "step": 1700 + }, + { + "epoch": 0.2486874827300359, + "grad_norm": 0.30599427223205566, + "learning_rate": 0.00029664191650322216, + "loss": 1.2199, + "step": 1800 + }, + { + "epoch": 0.2625034539928157, + "grad_norm": 0.32062044739723206, + "learning_rate": 0.00029622163070888203, + "loss": 1.2131, + "step": 1900 + }, + { + "epoch": 0.27631942525559544, + "grad_norm": 0.13411013782024384, + "learning_rate": 0.00029580134491454186, + "loss": 1.2074, + "step": 2000 + }, + { + "epoch": 0.2901353965183752, + "grad_norm": 0.3672633767127991, + "learning_rate": 0.00029538105912020173, + "loss": 1.2022, + "step": 2100 + }, + { + "epoch": 0.303951367781155, + "grad_norm": 0.41515815258026123, + "learning_rate": 0.00029496077332586155, + "loss": 1.1949, + "step": 2200 + }, + { + "epoch": 0.3177673390439348, + "grad_norm": 0.18381068110466003, + "learning_rate": 0.0002945404875315214, + "loss": 1.1887, + "step": 2300 + }, + { + "epoch": 0.33158331030671456, + "grad_norm": 0.3080751895904541, + "learning_rate": 0.00029412020173718125, + "loss": 1.1844, + "step": 2400 + }, + { + "epoch": 0.34539928156949434, + "grad_norm": 0.38037416338920593, + "learning_rate": 0.0002936999159428411, + "loss": 1.1804, + "step": 2500 + }, + { + "epoch": 0.3592152528322741, + "grad_norm": 0.23272989690303802, + "learning_rate": 0.00029327963014850095, + "loss": 1.1753, + "step": 2600 + }, + { + "epoch": 0.3730312240950539, + "grad_norm": 0.1149936243891716, + "learning_rate": 0.0002928593443541608, + "loss": 1.1739, + "step": 2700 + }, + { + "epoch": 0.38684719535783363, + "grad_norm": 0.28469276428222656, + "learning_rate": 0.00029243905855982064, + "loss": 1.1671, + "step": 2800 + }, + { + "epoch": 0.4006631666206134, + "grad_norm": 0.25204166769981384, + "learning_rate": 0.0002920187727654805, + "loss": 1.1633, + "step": 2900 + }, + { + "epoch": 0.4144791378833932, + "grad_norm": 0.3945861756801605, + "learning_rate": 0.00029159848697114034, + "loss": 1.1608, + "step": 3000 + }, + { + "epoch": 0.42829510914617297, + "grad_norm": 0.2578865587711334, + "learning_rate": 0.00029117820117680016, + "loss": 1.1622, + "step": 3100 + }, + { + "epoch": 0.44211108040895275, + "grad_norm": 0.16060177981853485, + "learning_rate": 0.00029075791538246004, + "loss": 1.1577, + "step": 3200 + }, + { + "epoch": 0.45592705167173253, + "grad_norm": 0.1980718970298767, + "learning_rate": 0.0002903376295881199, + "loss": 1.155, + "step": 3300 + }, + { + "epoch": 0.4697430229345123, + "grad_norm": 0.12515653669834137, + "learning_rate": 0.00028991734379377974, + "loss": 1.1519, + "step": 3400 + }, + { + "epoch": 0.4835589941972921, + "grad_norm": 0.26255738735198975, + "learning_rate": 0.0002894970579994396, + "loss": 1.1523, + "step": 3500 + }, + { + "epoch": 0.4973749654600718, + "grad_norm": 0.281464546918869, + "learning_rate": 0.00028907677220509943, + "loss": 1.1511, + "step": 3600 + }, + { + "epoch": 0.5111909367228517, + "grad_norm": 0.11816036701202393, + "learning_rate": 0.0002886564864107593, + "loss": 1.1469, + "step": 3700 + }, + { + "epoch": 0.5250069079856314, + "grad_norm": 0.25923675298690796, + "learning_rate": 0.00028823620061641913, + "loss": 1.1456, + "step": 3800 + }, + { + "epoch": 0.5388228792484112, + 
"grad_norm": 0.2766472399234772, + "learning_rate": 0.00028781591482207895, + "loss": 1.1442, + "step": 3900 + }, + { + "epoch": 0.5526388505111909, + "grad_norm": 0.1701624095439911, + "learning_rate": 0.00028739562902773883, + "loss": 1.1445, + "step": 4000 + }, + { + "epoch": 0.5664548217739707, + "grad_norm": 0.3141656219959259, + "learning_rate": 0.0002869753432333987, + "loss": 1.1392, + "step": 4100 + }, + { + "epoch": 0.5802707930367504, + "grad_norm": 0.11816743016242981, + "learning_rate": 0.0002865550574390585, + "loss": 1.1406, + "step": 4200 + }, + { + "epoch": 0.5940867642995302, + "grad_norm": 0.12762723863124847, + "learning_rate": 0.0002861347716447184, + "loss": 1.1361, + "step": 4300 + }, + { + "epoch": 0.60790273556231, + "grad_norm": 0.09322622418403625, + "learning_rate": 0.0002857144858503782, + "loss": 1.134, + "step": 4400 + }, + { + "epoch": 0.6217187068250898, + "grad_norm": 0.1586735099554062, + "learning_rate": 0.0002852942000560381, + "loss": 1.1336, + "step": 4500 + }, + { + "epoch": 0.6355346780878696, + "grad_norm": 0.13594642281532288, + "learning_rate": 0.0002848739142616979, + "loss": 1.1328, + "step": 4600 + }, + { + "epoch": 0.6493506493506493, + "grad_norm": 0.21865279972553253, + "learning_rate": 0.00028445362846735774, + "loss": 1.1311, + "step": 4700 + }, + { + "epoch": 0.6631666206134291, + "grad_norm": 0.22787001729011536, + "learning_rate": 0.0002840333426730176, + "loss": 1.1271, + "step": 4800 + }, + { + "epoch": 0.6769825918762089, + "grad_norm": 0.2334531843662262, + "learning_rate": 0.0002836130568786775, + "loss": 1.1291, + "step": 4900 + }, + { + "epoch": 0.6907985631389887, + "grad_norm": 0.11103236675262451, + "learning_rate": 0.0002831927710843373, + "loss": 1.1252, + "step": 5000 + }, + { + "epoch": 0.6907985631389887, + "eval_accuracy": 0.4745045939970608, + "eval_loss": 1.1205766201019287, + "eval_runtime": 1027.9902, + "eval_samples_per_second": 200.256, + "eval_steps_per_second": 6.259, + "step": 5000 + }, + { + "epoch": 0.7046145344017685, + "grad_norm": 0.21742330491542816, + "learning_rate": 0.0002827724852899972, + "loss": 1.1235, + "step": 5100 + }, + { + "epoch": 0.7184305056645482, + "grad_norm": 0.23728515207767487, + "learning_rate": 0.000282352199495657, + "loss": 1.1233, + "step": 5200 + }, + { + "epoch": 0.732246476927328, + "grad_norm": 0.21022765338420868, + "learning_rate": 0.0002819319137013169, + "loss": 1.1236, + "step": 5300 + }, + { + "epoch": 0.7460624481901078, + "grad_norm": 0.0924484059214592, + "learning_rate": 0.0002815116279069767, + "loss": 1.1215, + "step": 5400 + }, + { + "epoch": 0.7598784194528876, + "grad_norm": 0.1716778427362442, + "learning_rate": 0.00028109134211263653, + "loss": 1.1238, + "step": 5500 + }, + { + "epoch": 0.7736943907156673, + "grad_norm": 0.13049638271331787, + "learning_rate": 0.0002806710563182964, + "loss": 1.1185, + "step": 5600 + }, + { + "epoch": 0.787510361978447, + "grad_norm": 0.16255174577236176, + "learning_rate": 0.0002802507705239563, + "loss": 1.1169, + "step": 5700 + }, + { + "epoch": 0.8013263332412268, + "grad_norm": 0.10065080225467682, + "learning_rate": 0.0002798304847296161, + "loss": 1.1184, + "step": 5800 + }, + { + "epoch": 0.8151423045040066, + "grad_norm": 0.1182553768157959, + "learning_rate": 0.000279410198935276, + "loss": 1.1141, + "step": 5900 + }, + { + "epoch": 0.8289582757667864, + "grad_norm": 0.14556263387203217, + "learning_rate": 0.0002789899131409358, + "loss": 1.1154, + "step": 6000 + }, + { + "epoch": 0.8427742470295662, + "grad_norm": 
0.1383764147758484, + "learning_rate": 0.00027857383020453907, + "loss": 1.1118, + "step": 6100 + }, + { + "epoch": 0.8565902182923459, + "grad_norm": 0.2821154296398163, + "learning_rate": 0.00027815354441019895, + "loss": 1.1104, + "step": 6200 + }, + { + "epoch": 0.8704061895551257, + "grad_norm": 0.22286450862884521, + "learning_rate": 0.00027773325861585877, + "loss": 1.1109, + "step": 6300 + }, + { + "epoch": 0.8842221608179055, + "grad_norm": 0.2058987319469452, + "learning_rate": 0.0002773129728215186, + "loss": 1.1093, + "step": 6400 + }, + { + "epoch": 0.8980381320806853, + "grad_norm": 0.21338045597076416, + "learning_rate": 0.00027689268702717847, + "loss": 1.1091, + "step": 6500 + }, + { + "epoch": 0.9118541033434651, + "grad_norm": 0.0900028795003891, + "learning_rate": 0.0002764724012328383, + "loss": 1.1067, + "step": 6600 + }, + { + "epoch": 0.9256700746062448, + "grad_norm": 0.10679551959037781, + "learning_rate": 0.00027605211543849816, + "loss": 1.108, + "step": 6700 + }, + { + "epoch": 0.9394860458690246, + "grad_norm": 0.07972779124975204, + "learning_rate": 0.000275631829644158, + "loss": 1.1057, + "step": 6800 + }, + { + "epoch": 0.9533020171318044, + "grad_norm": 0.24500218033790588, + "learning_rate": 0.00027521154384981786, + "loss": 1.105, + "step": 6900 + }, + { + "epoch": 0.9671179883945842, + "grad_norm": 0.11576998978853226, + "learning_rate": 0.00027479125805547774, + "loss": 1.1029, + "step": 7000 + }, + { + "epoch": 0.980933959657364, + "grad_norm": 0.10553757101297379, + "learning_rate": 0.00027437097226113756, + "loss": 1.1041, + "step": 7100 + }, + { + "epoch": 0.9947499309201436, + "grad_norm": 0.15332186222076416, + "learning_rate": 0.0002739506864667974, + "loss": 1.0982, + "step": 7200 + }, + { + "epoch": 1.0085659021829234, + "grad_norm": 0.11897014081478119, + "learning_rate": 0.00027353040067245725, + "loss": 1.0996, + "step": 7300 + }, + { + "epoch": 1.0223818734457033, + "grad_norm": 0.1156444102525711, + "learning_rate": 0.0002731101148781171, + "loss": 1.1032, + "step": 7400 + }, + { + "epoch": 1.036197844708483, + "grad_norm": 0.06223931908607483, + "learning_rate": 0.00027268982908377695, + "loss": 1.0982, + "step": 7500 + }, + { + "epoch": 1.0500138159712629, + "grad_norm": 0.14377152919769287, + "learning_rate": 0.00027226954328943677, + "loss": 1.1003, + "step": 7600 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 0.12667153775691986, + "learning_rate": 0.00027184925749509665, + "loss": 1.0989, + "step": 7700 + }, + { + "epoch": 1.0776457584968224, + "grad_norm": 0.16101804375648499, + "learning_rate": 0.0002714289717007565, + "loss": 1.0968, + "step": 7800 + }, + { + "epoch": 1.091461729759602, + "grad_norm": 0.06424383819103241, + "learning_rate": 0.00027100868590641635, + "loss": 1.0955, + "step": 7900 + }, + { + "epoch": 1.105277701022382, + "grad_norm": 0.09638939052820206, + "learning_rate": 0.00027058840011207617, + "loss": 1.095, + "step": 8000 + }, + { + "epoch": 1.1190936722851617, + "grad_norm": 0.08098015189170837, + "learning_rate": 0.00027016811431773604, + "loss": 1.0969, + "step": 8100 + }, + { + "epoch": 1.1329096435479413, + "grad_norm": 0.10837887227535248, + "learning_rate": 0.00026974782852339586, + "loss": 1.096, + "step": 8200 + }, + { + "epoch": 1.1467256148107212, + "grad_norm": 0.05644046515226364, + "learning_rate": 0.00026932754272905574, + "loss": 1.0944, + "step": 8300 + }, + { + "epoch": 1.1605415860735009, + "grad_norm": 0.12965446710586548, + "learning_rate": 0.00026890725693471556, + "loss": 
1.0953, + "step": 8400 + }, + { + "epoch": 1.1743575573362808, + "grad_norm": 0.12333771586418152, + "learning_rate": 0.00026848697114037544, + "loss": 1.095, + "step": 8500 + }, + { + "epoch": 1.1881735285990604, + "grad_norm": 0.1270703673362732, + "learning_rate": 0.0002680666853460353, + "loss": 1.0929, + "step": 8600 + }, + { + "epoch": 1.2019894998618403, + "grad_norm": 0.16918766498565674, + "learning_rate": 0.00026764639955169513, + "loss": 1.0918, + "step": 8700 + }, + { + "epoch": 1.21580547112462, + "grad_norm": 0.08776108920574188, + "learning_rate": 0.00026722611375735496, + "loss": 1.0952, + "step": 8800 + }, + { + "epoch": 1.2296214423874, + "grad_norm": 0.08252176642417908, + "learning_rate": 0.00026680582796301483, + "loss": 1.09, + "step": 8900 + }, + { + "epoch": 1.2434374136501796, + "grad_norm": 0.16331979632377625, + "learning_rate": 0.00026638554216867465, + "loss": 1.0898, + "step": 9000 + }, + { + "epoch": 1.2572533849129595, + "grad_norm": 0.17065368592739105, + "learning_rate": 0.00026596525637433453, + "loss": 1.0907, + "step": 9100 + }, + { + "epoch": 1.2710693561757391, + "grad_norm": 0.12038784474134445, + "learning_rate": 0.00026554497057999435, + "loss": 1.0856, + "step": 9200 + }, + { + "epoch": 1.284885327438519, + "grad_norm": 0.11924347281455994, + "learning_rate": 0.0002651246847856542, + "loss": 1.0895, + "step": 9300 + }, + { + "epoch": 1.2987012987012987, + "grad_norm": 0.1443828046321869, + "learning_rate": 0.0002647043989913141, + "loss": 1.0874, + "step": 9400 + }, + { + "epoch": 1.3125172699640784, + "grad_norm": 0.14472317695617676, + "learning_rate": 0.0002642841131969739, + "loss": 1.0879, + "step": 9500 + }, + { + "epoch": 1.3263332412268583, + "grad_norm": 0.15847088396549225, + "learning_rate": 0.00026386382740263374, + "loss": 1.0873, + "step": 9600 + }, + { + "epoch": 1.3401492124896381, + "grad_norm": 0.17960332334041595, + "learning_rate": 0.0002634435416082936, + "loss": 1.0887, + "step": 9700 + }, + { + "epoch": 1.3539651837524178, + "grad_norm": 0.1566227227449417, + "learning_rate": 0.00026302325581395344, + "loss": 1.0884, + "step": 9800 + }, + { + "epoch": 1.3677811550151975, + "grad_norm": 0.1431213617324829, + "learning_rate": 0.0002626029700196133, + "loss": 1.0864, + "step": 9900 + }, + { + "epoch": 1.3815971262779774, + "grad_norm": 0.10321222990751266, + "learning_rate": 0.0002621826842252732, + "loss": 1.0835, + "step": 10000 + }, + { + "epoch": 1.3815971262779774, + "eval_accuracy": 0.49913821881815945, + "eval_loss": 1.081355094909668, + "eval_runtime": 748.8314, + "eval_samples_per_second": 274.91, + "eval_steps_per_second": 8.592, + "step": 10000 + }, + { + "epoch": 1.395413097540757, + "grad_norm": 0.10260605067014694, + "learning_rate": 0.0002617666012888764, + "loss": 1.0843, + "step": 10100 + }, + { + "epoch": 1.409229068803537, + "grad_norm": 0.1076885387301445, + "learning_rate": 0.0002613463154945363, + "loss": 1.0845, + "step": 10200 + }, + { + "epoch": 1.4230450400663166, + "grad_norm": 0.0723571702837944, + "learning_rate": 0.0002609260297001961, + "loss": 1.0814, + "step": 10300 + }, + { + "epoch": 1.4368610113290965, + "grad_norm": 0.10695687681436539, + "learning_rate": 0.00026050574390585593, + "loss": 1.0842, + "step": 10400 + }, + { + "epoch": 1.4506769825918762, + "grad_norm": 0.11008185893297195, + "learning_rate": 0.0002600854581115158, + "loss": 1.0832, + "step": 10500 + }, + { + "epoch": 1.464492953854656, + "grad_norm": 0.12239653617143631, + "learning_rate": 0.0002596651723171756, + "loss": 
1.0813, + "step": 10600 + }, + { + "epoch": 1.4783089251174357, + "grad_norm": 0.11045056581497192, + "learning_rate": 0.0002592448865228355, + "loss": 1.0848, + "step": 10700 + }, + { + "epoch": 1.4921248963802154, + "grad_norm": 0.07234488427639008, + "learning_rate": 0.0002588246007284954, + "loss": 1.0826, + "step": 10800 + }, + { + "epoch": 1.5059408676429953, + "grad_norm": 0.11086778342723846, + "learning_rate": 0.0002584043149341552, + "loss": 1.0804, + "step": 10900 + }, + { + "epoch": 1.5197568389057752, + "grad_norm": 0.10693442821502686, + "learning_rate": 0.0002579840291398151, + "loss": 1.0784, + "step": 11000 + }, + { + "epoch": 1.5335728101685548, + "grad_norm": 0.11604110896587372, + "learning_rate": 0.0002575637433454749, + "loss": 1.0792, + "step": 11100 + }, + { + "epoch": 1.5473887814313345, + "grad_norm": 0.0809662714600563, + "learning_rate": 0.0002571434575511347, + "loss": 1.083, + "step": 11200 + }, + { + "epoch": 1.5612047526941144, + "grad_norm": 0.1850002408027649, + "learning_rate": 0.0002567231717567946, + "loss": 1.0802, + "step": 11300 + }, + { + "epoch": 1.5750207239568943, + "grad_norm": 0.0779227465391159, + "learning_rate": 0.0002563028859624544, + "loss": 1.0811, + "step": 11400 + }, + { + "epoch": 1.588836695219674, + "grad_norm": 0.16764625906944275, + "learning_rate": 0.0002558826001681143, + "loss": 1.0763, + "step": 11500 + }, + { + "epoch": 1.6026526664824536, + "grad_norm": 0.11104313284158707, + "learning_rate": 0.00025546231437377417, + "loss": 1.0782, + "step": 11600 + }, + { + "epoch": 1.6164686377452335, + "grad_norm": 0.16667212545871735, + "learning_rate": 0.000255042028579434, + "loss": 1.0781, + "step": 11700 + }, + { + "epoch": 1.6302846090080134, + "grad_norm": 0.2246047705411911, + "learning_rate": 0.00025462174278509386, + "loss": 1.08, + "step": 11800 + }, + { + "epoch": 1.644100580270793, + "grad_norm": 0.2305343896150589, + "learning_rate": 0.0002542014569907537, + "loss": 1.0756, + "step": 11900 + }, + { + "epoch": 1.6579165515335728, + "grad_norm": 0.13618823885917664, + "learning_rate": 0.0002537811711964135, + "loss": 1.076, + "step": 12000 + }, + { + "epoch": 1.6717325227963524, + "grad_norm": 0.15795475244522095, + "learning_rate": 0.0002533608854020734, + "loss": 1.0749, + "step": 12100 + }, + { + "epoch": 1.6855484940591323, + "grad_norm": 0.20267115533351898, + "learning_rate": 0.00025294480246567665, + "loss": 1.077, + "step": 12200 + }, + { + "epoch": 1.6993644653219122, + "grad_norm": 0.08052489906549454, + "learning_rate": 0.0002525245166713365, + "loss": 1.073, + "step": 12300 + }, + { + "epoch": 1.7131804365846919, + "grad_norm": 0.11914093047380447, + "learning_rate": 0.00025210423087699635, + "loss": 1.0755, + "step": 12400 + }, + { + "epoch": 1.7269964078474715, + "grad_norm": 0.12703542411327362, + "learning_rate": 0.00025168394508265617, + "loss": 1.0765, + "step": 12500 + }, + { + "epoch": 1.7408123791102514, + "grad_norm": 0.12948518991470337, + "learning_rate": 0.00025126365928831605, + "loss": 1.0748, + "step": 12600 + }, + { + "epoch": 1.7546283503730313, + "grad_norm": 0.1027710810303688, + "learning_rate": 0.00025084337349397587, + "loss": 1.0745, + "step": 12700 + }, + { + "epoch": 1.768444321635811, + "grad_norm": 0.20131652057170868, + "learning_rate": 0.0002504230876996357, + "loss": 1.0731, + "step": 12800 + }, + { + "epoch": 1.7822602928985907, + "grad_norm": 0.0673370212316513, + "learning_rate": 0.00025000280190529557, + "loss": 1.0721, + "step": 12900 + }, + { + "epoch": 1.7960762641613706, + 
"grad_norm": 0.10322799533605576, + "learning_rate": 0.00024958251611095544, + "loss": 1.0731, + "step": 13000 + }, + { + "epoch": 1.8098922354241505, + "grad_norm": 0.08498311042785645, + "learning_rate": 0.00024916223031661526, + "loss": 1.0722, + "step": 13100 + }, + { + "epoch": 1.8237082066869301, + "grad_norm": 0.07025079429149628, + "learning_rate": 0.00024874194452227514, + "loss": 1.0725, + "step": 13200 + }, + { + "epoch": 1.8375241779497098, + "grad_norm": 0.13933932781219482, + "learning_rate": 0.00024832165872793496, + "loss": 1.0714, + "step": 13300 + }, + { + "epoch": 1.8513401492124897, + "grad_norm": 0.10513993352651596, + "learning_rate": 0.00024790137293359484, + "loss": 1.0725, + "step": 13400 + }, + { + "epoch": 1.8651561204752696, + "grad_norm": 0.1704607903957367, + "learning_rate": 0.0002474810871392547, + "loss": 1.0712, + "step": 13500 + }, + { + "epoch": 1.8789720917380492, + "grad_norm": 0.08315689861774445, + "learning_rate": 0.0002470608013449145, + "loss": 1.0697, + "step": 13600 + }, + { + "epoch": 1.892788063000829, + "grad_norm": 0.09900273382663727, + "learning_rate": 0.00024664051555057436, + "loss": 1.0735, + "step": 13700 + }, + { + "epoch": 1.9066040342636086, + "grad_norm": 0.05560864508152008, + "learning_rate": 0.00024622022975623423, + "loss": 1.0711, + "step": 13800 + }, + { + "epoch": 1.9204200055263885, + "grad_norm": 0.13863462209701538, + "learning_rate": 0.00024579994396189405, + "loss": 1.0681, + "step": 13900 + }, + { + "epoch": 1.9342359767891684, + "grad_norm": 0.07841744273900986, + "learning_rate": 0.00024537965816755393, + "loss": 1.0711, + "step": 14000 + }, + { + "epoch": 1.948051948051948, + "grad_norm": 0.058312736451625824, + "learning_rate": 0.00024495937237321375, + "loss": 1.0709, + "step": 14100 + }, + { + "epoch": 1.9618679193147277, + "grad_norm": 0.11208023875951767, + "learning_rate": 0.000244543289436817, + "loss": 1.0686, + "step": 14200 + }, + { + "epoch": 1.9756838905775076, + "grad_norm": 0.10133163630962372, + "learning_rate": 0.00024412300364247687, + "loss": 1.0683, + "step": 14300 + }, + { + "epoch": 1.9894998618402875, + "grad_norm": 0.08370282500982285, + "learning_rate": 0.0002437027178481367, + "loss": 1.0709, + "step": 14400 + }, + { + "epoch": 2.003315833103067, + "grad_norm": 0.09476770460605621, + "learning_rate": 0.00024328243205379654, + "loss": 1.0697, + "step": 14500 + }, + { + "epoch": 2.017131804365847, + "grad_norm": 0.0733637660741806, + "learning_rate": 0.0002428621462594564, + "loss": 1.0681, + "step": 14600 + }, + { + "epoch": 2.0309477756286265, + "grad_norm": 0.09925834089517593, + "learning_rate": 0.00024244186046511627, + "loss": 1.0702, + "step": 14700 + }, + { + "epoch": 2.0447637468914066, + "grad_norm": 0.15911750495433807, + "learning_rate": 0.00024202157467077611, + "loss": 1.0665, + "step": 14800 + }, + { + "epoch": 2.0585797181541863, + "grad_norm": 0.13638247549533844, + "learning_rate": 0.00024160128887643596, + "loss": 1.0696, + "step": 14900 + }, + { + "epoch": 2.072395689416966, + "grad_norm": 0.16883982717990875, + "learning_rate": 0.0002411810030820958, + "loss": 1.0641, + "step": 15000 + }, + { + "epoch": 2.072395689416966, + "eval_accuracy": 0.5102966510685876, + "eval_loss": 1.0638896226882935, + "eval_runtime": 924.2494, + "eval_samples_per_second": 222.733, + "eval_steps_per_second": 6.961, + "step": 15000 + }, + { + "epoch": 2.0862116606797456, + "grad_norm": 0.09925784170627594, + "learning_rate": 0.00024076071728775566, + "loss": 1.0683, + "step": 15100 + }, + { + 
"epoch": 2.1000276319425257, + "grad_norm": 0.06180203706026077, + "learning_rate": 0.00024034043149341548, + "loss": 1.066, + "step": 15200 + }, + { + "epoch": 2.1138436032053054, + "grad_norm": 0.10063247382640839, + "learning_rate": 0.00023992014569907533, + "loss": 1.0668, + "step": 15300 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 0.11476041376590729, + "learning_rate": 0.0002394998599047352, + "loss": 1.0644, + "step": 15400 + }, + { + "epoch": 2.1414755457308647, + "grad_norm": 0.11798429489135742, + "learning_rate": 0.00023907957411039505, + "loss": 1.0626, + "step": 15500 + }, + { + "epoch": 2.155291516993645, + "grad_norm": 0.13165287673473358, + "learning_rate": 0.0002386592883160549, + "loss": 1.0648, + "step": 15600 + }, + { + "epoch": 2.1691074882564245, + "grad_norm": 0.1705123484134674, + "learning_rate": 0.00023823900252171475, + "loss": 1.0639, + "step": 15700 + }, + { + "epoch": 2.182923459519204, + "grad_norm": 0.13375049829483032, + "learning_rate": 0.0002378187167273746, + "loss": 1.062, + "step": 15800 + }, + { + "epoch": 2.196739430781984, + "grad_norm": 0.09405038505792618, + "learning_rate": 0.00023739843093303445, + "loss": 1.0634, + "step": 15900 + }, + { + "epoch": 2.210555402044764, + "grad_norm": 0.11285752803087234, + "learning_rate": 0.00023697814513869427, + "loss": 1.0667, + "step": 16000 + }, + { + "epoch": 2.2243713733075436, + "grad_norm": 0.12377699464559555, + "learning_rate": 0.00023655785934435412, + "loss": 1.064, + "step": 16100 + }, + { + "epoch": 2.2381873445703233, + "grad_norm": 0.0979316234588623, + "learning_rate": 0.000236137573550014, + "loss": 1.0621, + "step": 16200 + }, + { + "epoch": 2.252003315833103, + "grad_norm": 0.11494515091180801, + "learning_rate": 0.00023572149061361724, + "loss": 1.0645, + "step": 16300 + }, + { + "epoch": 2.2658192870958827, + "grad_norm": 0.07066236436367035, + "learning_rate": 0.0002353012048192771, + "loss": 1.063, + "step": 16400 + }, + { + "epoch": 2.2796352583586628, + "grad_norm": 0.08686563372612, + "learning_rate": 0.00023488091902493694, + "loss": 1.066, + "step": 16500 + }, + { + "epoch": 2.2934512296214424, + "grad_norm": 0.058148209005594254, + "learning_rate": 0.00023446063323059678, + "loss": 1.0643, + "step": 16600 + }, + { + "epoch": 2.307267200884222, + "grad_norm": 0.14033359289169312, + "learning_rate": 0.00023404034743625666, + "loss": 1.0634, + "step": 16700 + }, + { + "epoch": 2.3210831721470018, + "grad_norm": 0.09940097481012344, + "learning_rate": 0.00023362006164191645, + "loss": 1.0629, + "step": 16800 + }, + { + "epoch": 2.334899143409782, + "grad_norm": 0.08228994905948639, + "learning_rate": 0.00023319977584757633, + "loss": 1.0626, + "step": 16900 + }, + { + "epoch": 2.3487151146725616, + "grad_norm": 0.05418753623962402, + "learning_rate": 0.00023277949005323618, + "loss": 1.0611, + "step": 17000 + }, + { + "epoch": 2.3625310859353412, + "grad_norm": 0.09691222757101059, + "learning_rate": 0.00023235920425889603, + "loss": 1.0626, + "step": 17100 + }, + { + "epoch": 2.376347057198121, + "grad_norm": 0.1607312560081482, + "learning_rate": 0.00023193891846455588, + "loss": 1.0623, + "step": 17200 + }, + { + "epoch": 2.3901630284609006, + "grad_norm": 0.1193649098277092, + "learning_rate": 0.00023151863267021572, + "loss": 1.0627, + "step": 17300 + }, + { + "epoch": 2.4039789997236807, + "grad_norm": 0.05427398905158043, + "learning_rate": 0.00023109834687587557, + "loss": 1.0609, + "step": 17400 + }, + { + "epoch": 2.4177949709864603, + "grad_norm": 
0.10591702163219452, + "learning_rate": 0.00023067806108153545, + "loss": 1.0637, + "step": 17500 + }, + { + "epoch": 2.43161094224924, + "grad_norm": 0.057032886892557144, + "learning_rate": 0.00023025777528719524, + "loss": 1.0612, + "step": 17600 + }, + { + "epoch": 2.44542691351202, + "grad_norm": 0.08455175161361694, + "learning_rate": 0.00022983748949285512, + "loss": 1.0606, + "step": 17700 + }, + { + "epoch": 2.4592428847748, + "grad_norm": 0.13975144922733307, + "learning_rate": 0.00022941720369851497, + "loss": 1.0624, + "step": 17800 + }, + { + "epoch": 2.4730588560375795, + "grad_norm": 0.11535393446683884, + "learning_rate": 0.00022899691790417482, + "loss": 1.0603, + "step": 17900 + }, + { + "epoch": 2.486874827300359, + "grad_norm": 0.10047648102045059, + "learning_rate": 0.00022857663210983466, + "loss": 1.0607, + "step": 18000 + }, + { + "epoch": 2.500690798563139, + "grad_norm": 0.08474704623222351, + "learning_rate": 0.0002281563463154945, + "loss": 1.062, + "step": 18100 + }, + { + "epoch": 2.514506769825919, + "grad_norm": 0.15308576822280884, + "learning_rate": 0.00022773606052115436, + "loss": 1.0603, + "step": 18200 + }, + { + "epoch": 2.5283227410886986, + "grad_norm": 0.05684039369225502, + "learning_rate": 0.00022731577472681424, + "loss": 1.0589, + "step": 18300 + }, + { + "epoch": 2.5421387123514783, + "grad_norm": 0.10712555050849915, + "learning_rate": 0.00022689548893247409, + "loss": 1.0592, + "step": 18400 + }, + { + "epoch": 2.555954683614258, + "grad_norm": 0.0800655260682106, + "learning_rate": 0.0002264794059960773, + "loss": 1.0603, + "step": 18500 + }, + { + "epoch": 2.569770654877038, + "grad_norm": 0.05980188027024269, + "learning_rate": 0.00022605912020173715, + "loss": 1.0608, + "step": 18600 + }, + { + "epoch": 2.5835866261398177, + "grad_norm": 0.052051473408937454, + "learning_rate": 0.000225638834407397, + "loss": 1.0603, + "step": 18700 + }, + { + "epoch": 2.5974025974025974, + "grad_norm": 0.11966883391141891, + "learning_rate": 0.00022521854861305685, + "loss": 1.057, + "step": 18800 + }, + { + "epoch": 2.611218568665377, + "grad_norm": 0.08861220628023148, + "learning_rate": 0.00022479826281871673, + "loss": 1.0603, + "step": 18900 + }, + { + "epoch": 2.6250345399281567, + "grad_norm": 0.12264814227819443, + "learning_rate": 0.00022437797702437657, + "loss": 1.0602, + "step": 19000 + }, + { + "epoch": 2.638850511190937, + "grad_norm": 0.08384163677692413, + "learning_rate": 0.00022395769123003642, + "loss": 1.057, + "step": 19100 + }, + { + "epoch": 2.6526664824537165, + "grad_norm": 0.11168386787176132, + "learning_rate": 0.00022353740543569624, + "loss": 1.0572, + "step": 19200 + }, + { + "epoch": 2.666482453716496, + "grad_norm": 0.12558519840240479, + "learning_rate": 0.0002231171196413561, + "loss": 1.0592, + "step": 19300 + }, + { + "epoch": 2.6802984249792763, + "grad_norm": 0.06810207664966583, + "learning_rate": 0.00022269683384701594, + "loss": 1.055, + "step": 19400 + }, + { + "epoch": 2.694114396242056, + "grad_norm": 0.16571113467216492, + "learning_rate": 0.0002222765480526758, + "loss": 1.0599, + "step": 19500 + }, + { + "epoch": 2.7079303675048356, + "grad_norm": 0.07613151520490646, + "learning_rate": 0.00022185626225833564, + "loss": 1.0564, + "step": 19600 + }, + { + "epoch": 2.7217463387676153, + "grad_norm": 0.08713393658399582, + "learning_rate": 0.00022143597646399551, + "loss": 1.0582, + "step": 19700 + }, + { + "epoch": 2.735562310030395, + "grad_norm": 0.11707925796508789, + "learning_rate": 
0.00022101569066965536, + "loss": 1.056, + "step": 19800 + }, + { + "epoch": 2.749378281293175, + "grad_norm": 0.1053171455860138, + "learning_rate": 0.0002205954048753152, + "loss": 1.0608, + "step": 19900 + }, + { + "epoch": 2.7631942525559547, + "grad_norm": 0.056531500071287155, + "learning_rate": 0.00022017511908097506, + "loss": 1.0563, + "step": 20000 + }, + { + "epoch": 2.7631942525559547, + "eval_accuracy": 0.516310033016185, + "eval_loss": 1.054749608039856, + "eval_runtime": 731.5154, + "eval_samples_per_second": 281.417, + "eval_steps_per_second": 8.795, + "step": 20000 + }, + { + "epoch": 2.7770102238187344, + "grad_norm": 0.10811367630958557, + "learning_rate": 0.00021975483328663488, + "loss": 1.0556, + "step": 20100 + }, + { + "epoch": 2.790826195081514, + "grad_norm": 0.06601472198963165, + "learning_rate": 0.00021933454749229473, + "loss": 1.0578, + "step": 20200 + }, + { + "epoch": 2.804642166344294, + "grad_norm": 0.06906837224960327, + "learning_rate": 0.00021891426169795458, + "loss": 1.06, + "step": 20300 + }, + { + "epoch": 2.818458137607074, + "grad_norm": 0.08911406248807907, + "learning_rate": 0.00021849397590361443, + "loss": 1.0583, + "step": 20400 + }, + { + "epoch": 2.8322741088698535, + "grad_norm": 0.06497912108898163, + "learning_rate": 0.0002180778929672177, + "loss": 1.0575, + "step": 20500 + }, + { + "epoch": 2.846090080132633, + "grad_norm": 0.0886107012629509, + "learning_rate": 0.00021765760717287755, + "loss": 1.0552, + "step": 20600 + }, + { + "epoch": 2.859906051395413, + "grad_norm": 0.05942055955529213, + "learning_rate": 0.0002172373213785374, + "loss": 1.0533, + "step": 20700 + }, + { + "epoch": 2.873722022658193, + "grad_norm": 0.13015809655189514, + "learning_rate": 0.00021681703558419725, + "loss": 1.0549, + "step": 20800 + }, + { + "epoch": 2.8875379939209727, + "grad_norm": 0.06085093691945076, + "learning_rate": 0.00021639674978985707, + "loss": 1.057, + "step": 20900 + }, + { + "epoch": 2.9013539651837523, + "grad_norm": 0.17039401829242706, + "learning_rate": 0.00021597646399551692, + "loss": 1.0571, + "step": 21000 + }, + { + "epoch": 2.9151699364465324, + "grad_norm": 0.07950026541948318, + "learning_rate": 0.00021555617820117676, + "loss": 1.0535, + "step": 21100 + }, + { + "epoch": 2.928985907709312, + "grad_norm": 0.1195695698261261, + "learning_rate": 0.00021513589240683664, + "loss": 1.0535, + "step": 21200 + }, + { + "epoch": 2.942801878972092, + "grad_norm": 0.0896124541759491, + "learning_rate": 0.0002147156066124965, + "loss": 1.0534, + "step": 21300 + }, + { + "epoch": 2.9566178502348714, + "grad_norm": 0.07629978656768799, + "learning_rate": 0.00021429532081815634, + "loss": 1.0564, + "step": 21400 + }, + { + "epoch": 2.970433821497651, + "grad_norm": 0.07431907206773758, + "learning_rate": 0.00021387503502381618, + "loss": 1.0559, + "step": 21500 + }, + { + "epoch": 2.984249792760431, + "grad_norm": 0.0771278440952301, + "learning_rate": 0.00021345474922947603, + "loss": 1.0562, + "step": 21600 + }, + { + "epoch": 2.998065764023211, + "grad_norm": 0.11643990874290466, + "learning_rate": 0.00021303446343513585, + "loss": 1.0525, + "step": 21700 + }, + { + "epoch": 3.0118817352859906, + "grad_norm": 0.058162059634923935, + "learning_rate": 0.0002126141776407957, + "loss": 1.0509, + "step": 21800 + }, + { + "epoch": 3.0256977065487702, + "grad_norm": 0.12037301808595657, + "learning_rate": 0.00021219389184645558, + "loss": 1.0513, + "step": 21900 + }, + { + "epoch": 3.0395136778115504, + "grad_norm": 0.052515506744384766, + 
"learning_rate": 0.00021177360605211543, + "loss": 1.051, + "step": 22000 + }, + { + "epoch": 3.05332964907433, + "grad_norm": 0.10646827518939972, + "learning_rate": 0.00021135332025777528, + "loss": 1.0542, + "step": 22100 + }, + { + "epoch": 3.0671456203371097, + "grad_norm": 0.1113181784749031, + "learning_rate": 0.00021093303446343512, + "loss": 1.0531, + "step": 22200 + }, + { + "epoch": 3.0809615915998894, + "grad_norm": 0.07355222851037979, + "learning_rate": 0.00021051274866909497, + "loss": 1.0524, + "step": 22300 + }, + { + "epoch": 3.094777562862669, + "grad_norm": 0.06925370544195175, + "learning_rate": 0.00021009246287475482, + "loss": 1.0535, + "step": 22400 + }, + { + "epoch": 3.108593534125449, + "grad_norm": 0.048475924879312515, + "learning_rate": 0.00020967217708041464, + "loss": 1.0564, + "step": 22500 + }, + { + "epoch": 3.122409505388229, + "grad_norm": 0.08578319102525711, + "learning_rate": 0.0002092518912860745, + "loss": 1.0519, + "step": 22600 + }, + { + "epoch": 3.1362254766510085, + "grad_norm": 0.08585724979639053, + "learning_rate": 0.00020883160549173437, + "loss": 1.0525, + "step": 22700 + }, + { + "epoch": 3.150041447913788, + "grad_norm": 0.06518802791833878, + "learning_rate": 0.00020841131969739422, + "loss": 1.0543, + "step": 22800 + }, + { + "epoch": 3.1638574191765683, + "grad_norm": 0.046030618250370026, + "learning_rate": 0.00020799103390305406, + "loss": 1.0525, + "step": 22900 + }, + { + "epoch": 3.177673390439348, + "grad_norm": 0.04972764104604721, + "learning_rate": 0.0002075707481087139, + "loss": 1.0512, + "step": 23000 + }, + { + "epoch": 3.1914893617021276, + "grad_norm": 0.11977583914995193, + "learning_rate": 0.00020715046231437376, + "loss": 1.052, + "step": 23100 + }, + { + "epoch": 3.2053053329649073, + "grad_norm": 0.08040472120046616, + "learning_rate": 0.0002067301765200336, + "loss": 1.0491, + "step": 23200 + }, + { + "epoch": 3.2191213042276874, + "grad_norm": 0.10473213344812393, + "learning_rate": 0.00020630989072569343, + "loss": 1.0525, + "step": 23300 + }, + { + "epoch": 3.232937275490467, + "grad_norm": 0.0790744498372078, + "learning_rate": 0.00020588960493135328, + "loss": 1.0508, + "step": 23400 + }, + { + "epoch": 3.2467532467532467, + "grad_norm": 0.12807689607143402, + "learning_rate": 0.00020547352199495655, + "loss": 1.0485, + "step": 23500 + }, + { + "epoch": 3.2605692180160264, + "grad_norm": 0.10298227518796921, + "learning_rate": 0.0002050532362006164, + "loss": 1.049, + "step": 23600 + }, + { + "epoch": 3.2743851892788065, + "grad_norm": 0.11504103243350983, + "learning_rate": 0.00020463295040627625, + "loss": 1.0511, + "step": 23700 + }, + { + "epoch": 3.288201160541586, + "grad_norm": 0.05548229441046715, + "learning_rate": 0.0002042126646119361, + "loss": 1.0499, + "step": 23800 + }, + { + "epoch": 3.302017131804366, + "grad_norm": 0.06242981553077698, + "learning_rate": 0.00020379237881759595, + "loss": 1.0543, + "step": 23900 + }, + { + "epoch": 3.3158331030671455, + "grad_norm": 0.12101748585700989, + "learning_rate": 0.00020337209302325582, + "loss": 1.0482, + "step": 24000 + }, + { + "epoch": 3.329649074329925, + "grad_norm": 0.09176388382911682, + "learning_rate": 0.00020295180722891562, + "loss": 1.0514, + "step": 24100 + }, + { + "epoch": 3.3434650455927053, + "grad_norm": 0.08758760988712311, + "learning_rate": 0.0002025315214345755, + "loss": 1.0505, + "step": 24200 + }, + { + "epoch": 3.357281016855485, + "grad_norm": 0.06818066537380219, + "learning_rate": 0.00020211123564023534, + "loss": 
1.0511, + "step": 24300 + }, + { + "epoch": 3.3710969881182646, + "grad_norm": 0.10384306311607361, + "learning_rate": 0.0002016909498458952, + "loss": 1.0513, + "step": 24400 + }, + { + "epoch": 3.3849129593810443, + "grad_norm": 0.12452493607997894, + "learning_rate": 0.00020127066405155504, + "loss": 1.0502, + "step": 24500 + }, + { + "epoch": 3.3987289306438244, + "grad_norm": 0.07460072636604309, + "learning_rate": 0.0002008503782572149, + "loss": 1.0526, + "step": 24600 + }, + { + "epoch": 3.412544901906604, + "grad_norm": 0.1017543151974678, + "learning_rate": 0.00020043009246287474, + "loss": 1.0501, + "step": 24700 + }, + { + "epoch": 3.4263608731693838, + "grad_norm": 0.0900358185172081, + "learning_rate": 0.0002000098066685346, + "loss": 1.0512, + "step": 24800 + }, + { + "epoch": 3.4401768444321634, + "grad_norm": 0.10934050381183624, + "learning_rate": 0.00019958952087419443, + "loss": 1.0495, + "step": 24900 + }, + { + "epoch": 3.4539928156949435, + "grad_norm": 0.0656353011727333, + "learning_rate": 0.00019916923507985428, + "loss": 1.0504, + "step": 25000 + }, + { + "epoch": 3.4539928156949435, + "eval_accuracy": 0.520419659075542, + "eval_loss": 1.0485948324203491, + "eval_runtime": 728.0613, + "eval_samples_per_second": 282.752, + "eval_steps_per_second": 8.837, + "step": 25000 + }, + { + "epoch": 3.467808786957723, + "grad_norm": 0.07246037572622299, + "learning_rate": 0.00019874894928551413, + "loss": 1.0493, + "step": 25100 + }, + { + "epoch": 3.481624758220503, + "grad_norm": 0.14033739268779755, + "learning_rate": 0.00019832866349117398, + "loss": 1.05, + "step": 25200 + }, + { + "epoch": 3.4954407294832825, + "grad_norm": 0.05688853561878204, + "learning_rate": 0.00019790837769683383, + "loss": 1.0509, + "step": 25300 + }, + { + "epoch": 3.5092567007460627, + "grad_norm": 0.053916674107313156, + "learning_rate": 0.00019748809190249368, + "loss": 1.0503, + "step": 25400 + }, + { + "epoch": 3.5230726720088423, + "grad_norm": 0.12233688682317734, + "learning_rate": 0.00019706780610815352, + "loss": 1.05, + "step": 25500 + }, + { + "epoch": 3.536888643271622, + "grad_norm": 0.10314755886793137, + "learning_rate": 0.0001966475203138134, + "loss": 1.0501, + "step": 25600 + }, + { + "epoch": 3.5507046145344017, + "grad_norm": 0.05037887394428253, + "learning_rate": 0.00019623143737741662, + "loss": 1.0468, + "step": 25700 + }, + { + "epoch": 3.5645205857971813, + "grad_norm": 0.13344399631023407, + "learning_rate": 0.00019581115158307647, + "loss": 1.0477, + "step": 25800 + }, + { + "epoch": 3.5783365570599615, + "grad_norm": 0.07191654294729233, + "learning_rate": 0.00019539086578873632, + "loss": 1.0498, + "step": 25900 + }, + { + "epoch": 3.592152528322741, + "grad_norm": 0.05592725798487663, + "learning_rate": 0.00019497057999439616, + "loss": 1.0506, + "step": 26000 + }, + { + "epoch": 3.605968499585521, + "grad_norm": 0.10346696525812149, + "learning_rate": 0.000194550294200056, + "loss": 1.0499, + "step": 26100 + }, + { + "epoch": 3.619784470848301, + "grad_norm": 0.09233855456113815, + "learning_rate": 0.0001941300084057159, + "loss": 1.0456, + "step": 26200 + }, + { + "epoch": 3.6336004421110806, + "grad_norm": 0.060603220015764236, + "learning_rate": 0.00019370972261137574, + "loss": 1.0475, + "step": 26300 + }, + { + "epoch": 3.6474164133738602, + "grad_norm": 0.11710167676210403, + "learning_rate": 0.00019328943681703559, + "loss": 1.0497, + "step": 26400 + }, + { + "epoch": 3.66123238463664, + "grad_norm": 0.16325397789478302, + "learning_rate": 
0.0001928691510226954, + "loss": 1.0487, + "step": 26500 + }, + { + "epoch": 3.6750483558994196, + "grad_norm": 0.08937475085258484, + "learning_rate": 0.00019244886522835526, + "loss": 1.0468, + "step": 26600 + }, + { + "epoch": 3.6888643271621993, + "grad_norm": 0.07486152648925781, + "learning_rate": 0.0001920285794340151, + "loss": 1.0479, + "step": 26700 + }, + { + "epoch": 3.7026802984249794, + "grad_norm": 0.1263752579689026, + "learning_rate": 0.00019160829363967495, + "loss": 1.0449, + "step": 26800 + }, + { + "epoch": 3.716496269687759, + "grad_norm": 0.11803583055734634, + "learning_rate": 0.0001911880078453348, + "loss": 1.0512, + "step": 26900 + }, + { + "epoch": 3.7303122409505387, + "grad_norm": 0.07918773591518402, + "learning_rate": 0.00019076772205099468, + "loss": 1.0486, + "step": 27000 + }, + { + "epoch": 3.744128212213319, + "grad_norm": 0.11923271417617798, + "learning_rate": 0.00019034743625665453, + "loss": 1.0465, + "step": 27100 + }, + { + "epoch": 3.7579441834760985, + "grad_norm": 0.12752223014831543, + "learning_rate": 0.00018992715046231437, + "loss": 1.0472, + "step": 27200 + }, + { + "epoch": 3.771760154738878, + "grad_norm": 0.07391146570444107, + "learning_rate": 0.0001895068646679742, + "loss": 1.0493, + "step": 27300 + }, + { + "epoch": 3.785576126001658, + "grad_norm": 0.06606881320476532, + "learning_rate": 0.00018908657887363404, + "loss": 1.0485, + "step": 27400 + }, + { + "epoch": 3.7993920972644375, + "grad_norm": 0.04949864745140076, + "learning_rate": 0.0001886662930792939, + "loss": 1.0481, + "step": 27500 + }, + { + "epoch": 3.8132080685272176, + "grad_norm": 0.05234380066394806, + "learning_rate": 0.00018824600728495374, + "loss": 1.0476, + "step": 27600 + }, + { + "epoch": 3.8270240397899973, + "grad_norm": 0.04995539411902428, + "learning_rate": 0.0001878257214906136, + "loss": 1.0466, + "step": 27700 + }, + { + "epoch": 3.840840011052777, + "grad_norm": 0.09871330112218857, + "learning_rate": 0.00018740543569627347, + "loss": 1.0501, + "step": 27800 + }, + { + "epoch": 3.8546559823155566, + "grad_norm": 0.06254375725984573, + "learning_rate": 0.00018698514990193331, + "loss": 1.0467, + "step": 27900 + }, + { + "epoch": 3.8684719535783367, + "grad_norm": 0.07971449941396713, + "learning_rate": 0.00018656486410759316, + "loss": 1.0502, + "step": 28000 + }, + { + "epoch": 3.8822879248411164, + "grad_norm": 0.12627951800823212, + "learning_rate": 0.000186144578313253, + "loss": 1.0446, + "step": 28100 + }, + { + "epoch": 3.896103896103896, + "grad_norm": 0.08057064563035965, + "learning_rate": 0.00018572429251891283, + "loss": 1.0468, + "step": 28200 + }, + { + "epoch": 3.9099198673666757, + "grad_norm": 0.0501413568854332, + "learning_rate": 0.00018530400672457268, + "loss": 1.0453, + "step": 28300 + }, + { + "epoch": 3.9237358386294554, + "grad_norm": 0.09999352693557739, + "learning_rate": 0.00018488372093023253, + "loss": 1.0502, + "step": 28400 + }, + { + "epoch": 3.9375518098922355, + "grad_norm": 0.12323564291000366, + "learning_rate": 0.00018446343513589238, + "loss": 1.0478, + "step": 28500 + }, + { + "epoch": 3.951367781155015, + "grad_norm": 0.0877193808555603, + "learning_rate": 0.00018404314934155225, + "loss": 1.049, + "step": 28600 + }, + { + "epoch": 3.965183752417795, + "grad_norm": 0.09397170692682266, + "learning_rate": 0.0001836228635472121, + "loss": 1.0474, + "step": 28700 + }, + { + "epoch": 3.978999723680575, + "grad_norm": 0.09532420337200165, + "learning_rate": 0.00018320257775287195, + "loss": 1.0496, + "step": 28800 
+ }, + { + "epoch": 3.9928156949433546, + "grad_norm": 0.0442403182387352, + "learning_rate": 0.0001827822919585318, + "loss": 1.0466, + "step": 28900 + }, + { + "epoch": 4.006631666206134, + "grad_norm": 0.06309514492750168, + "learning_rate": 0.00018236200616419162, + "loss": 1.0479, + "step": 29000 + }, + { + "epoch": 4.020447637468914, + "grad_norm": 0.06191420555114746, + "learning_rate": 0.00018194172036985147, + "loss": 1.0442, + "step": 29100 + }, + { + "epoch": 4.034263608731694, + "grad_norm": 0.06752864271402359, + "learning_rate": 0.00018152143457551132, + "loss": 1.045, + "step": 29200 + }, + { + "epoch": 4.048079579994473, + "grad_norm": 0.07383009046316147, + "learning_rate": 0.00018110114878117117, + "loss": 1.0429, + "step": 29300 + }, + { + "epoch": 4.061895551257253, + "grad_norm": 0.11942852288484573, + "learning_rate": 0.00018068086298683104, + "loss": 1.0433, + "step": 29400 + }, + { + "epoch": 4.0757115225200335, + "grad_norm": 0.0840003713965416, + "learning_rate": 0.0001802605771924909, + "loss": 1.0434, + "step": 29500 + }, + { + "epoch": 4.089527493782813, + "grad_norm": 0.07768476754426956, + "learning_rate": 0.00017984029139815074, + "loss": 1.0421, + "step": 29600 + }, + { + "epoch": 4.103343465045593, + "grad_norm": 0.07166603952646255, + "learning_rate": 0.00017942420846175398, + "loss": 1.0443, + "step": 29700 + }, + { + "epoch": 4.1171594363083726, + "grad_norm": 0.07380765676498413, + "learning_rate": 0.0001790039226674138, + "loss": 1.0448, + "step": 29800 + }, + { + "epoch": 4.130975407571152, + "grad_norm": 0.1263025552034378, + "learning_rate": 0.00017858363687307365, + "loss": 1.0437, + "step": 29900 + }, + { + "epoch": 4.144791378833932, + "grad_norm": 0.09632286429405212, + "learning_rate": 0.00017816335107873353, + "loss": 1.0439, + "step": 30000 + }, + { + "epoch": 4.144791378833932, + "eval_accuracy": 0.5233148259844476, + "eval_loss": 1.0439139604568481, + "eval_runtime": 787.8404, + "eval_samples_per_second": 261.298, + "eval_steps_per_second": 8.167, + "step": 30000 + }, + { + "epoch": 4.158607350096712, + "grad_norm": 0.09395026415586472, + "learning_rate": 0.00017774306528439338, + "loss": 1.0447, + "step": 30100 + }, + { + "epoch": 4.172423321359491, + "grad_norm": 0.07320912927389145, + "learning_rate": 0.00017732277949005323, + "loss": 1.0477, + "step": 30200 + }, + { + "epoch": 4.186239292622272, + "grad_norm": 0.05703623965382576, + "learning_rate": 0.00017690249369571308, + "loss": 1.0443, + "step": 30300 + }, + { + "epoch": 4.2000552638850515, + "grad_norm": 0.04885410889983177, + "learning_rate": 0.00017648220790137292, + "loss": 1.0467, + "step": 30400 + }, + { + "epoch": 4.213871235147831, + "grad_norm": 0.10649748146533966, + "learning_rate": 0.00017606192210703277, + "loss": 1.0448, + "step": 30500 + }, + { + "epoch": 4.227687206410611, + "grad_norm": 0.05844441428780556, + "learning_rate": 0.0001756416363126926, + "loss": 1.044, + "step": 30600 + }, + { + "epoch": 4.2415031776733905, + "grad_norm": 0.07287675887346268, + "learning_rate": 0.00017522135051835244, + "loss": 1.0428, + "step": 30700 + }, + { + "epoch": 4.25531914893617, + "grad_norm": 0.05190150439739227, + "learning_rate": 0.00017480106472401232, + "loss": 1.0413, + "step": 30800 + }, + { + "epoch": 4.26913512019895, + "grad_norm": 0.06985218822956085, + "learning_rate": 0.00017438077892967217, + "loss": 1.0455, + "step": 30900 + }, + { + "epoch": 4.2829510914617295, + "grad_norm": 0.06930764764547348, + "learning_rate": 0.00017396049313533202, + "loss": 1.0444, + 
"step": 31000 + }, + { + "epoch": 4.296767062724509, + "grad_norm": 0.07905230671167374, + "learning_rate": 0.00017354020734099186, + "loss": 1.0445, + "step": 31100 + }, + { + "epoch": 4.31058303398729, + "grad_norm": 0.04994554817676544, + "learning_rate": 0.0001731199215466517, + "loss": 1.0432, + "step": 31200 + }, + { + "epoch": 4.324399005250069, + "grad_norm": 0.08036911487579346, + "learning_rate": 0.00017269963575231156, + "loss": 1.0424, + "step": 31300 + }, + { + "epoch": 4.338214976512849, + "grad_norm": 0.07251475006341934, + "learning_rate": 0.00017227934995797138, + "loss": 1.0465, + "step": 31400 + }, + { + "epoch": 4.352030947775629, + "grad_norm": 0.09622683376073837, + "learning_rate": 0.00017185906416363123, + "loss": 1.0441, + "step": 31500 + }, + { + "epoch": 4.365846919038408, + "grad_norm": 0.07545050978660583, + "learning_rate": 0.0001714387783692911, + "loss": 1.0423, + "step": 31600 + }, + { + "epoch": 4.379662890301188, + "grad_norm": 0.07171428948640823, + "learning_rate": 0.00017102269543289435, + "loss": 1.0434, + "step": 31700 + }, + { + "epoch": 4.393478861563968, + "grad_norm": 0.06658755987882614, + "learning_rate": 0.0001706024096385542, + "loss": 1.0415, + "step": 31800 + }, + { + "epoch": 4.407294832826747, + "grad_norm": 0.10734014213085175, + "learning_rate": 0.00017018212384421405, + "loss": 1.0406, + "step": 31900 + }, + { + "epoch": 4.421110804089528, + "grad_norm": 0.06358776986598969, + "learning_rate": 0.0001697618380498739, + "loss": 1.0405, + "step": 32000 + }, + { + "epoch": 4.434926775352308, + "grad_norm": 0.06078578904271126, + "learning_rate": 0.00016934155225553377, + "loss": 1.0458, + "step": 32100 + }, + { + "epoch": 4.448742746615087, + "grad_norm": 0.09674441814422607, + "learning_rate": 0.000168925469319137, + "loss": 1.0433, + "step": 32200 + }, + { + "epoch": 4.462558717877867, + "grad_norm": 0.11840452253818512, + "learning_rate": 0.00016850518352479684, + "loss": 1.0448, + "step": 32300 + }, + { + "epoch": 4.476374689140647, + "grad_norm": 0.08742488920688629, + "learning_rate": 0.0001680848977304567, + "loss": 1.0409, + "step": 32400 + }, + { + "epoch": 4.490190660403426, + "grad_norm": 0.09082327783107758, + "learning_rate": 0.00016766461193611654, + "loss": 1.0432, + "step": 32500 + }, + { + "epoch": 4.504006631666206, + "grad_norm": 0.06259270012378693, + "learning_rate": 0.0001672443261417764, + "loss": 1.0406, + "step": 32600 + }, + { + "epoch": 4.517822602928986, + "grad_norm": 0.06466669589281082, + "learning_rate": 0.00016682404034743626, + "loss": 1.0404, + "step": 32700 + }, + { + "epoch": 4.531638574191765, + "grad_norm": 0.07167832553386688, + "learning_rate": 0.0001664037545530961, + "loss": 1.0457, + "step": 32800 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.055970191955566406, + "learning_rate": 0.00016598346875875596, + "loss": 1.0433, + "step": 32900 + }, + { + "epoch": 4.5592705167173255, + "grad_norm": 0.05038364604115486, + "learning_rate": 0.00016556318296441578, + "loss": 1.0414, + "step": 33000 + }, + { + "epoch": 4.573086487980105, + "grad_norm": 0.11647244542837143, + "learning_rate": 0.00016514289717007563, + "loss": 1.0408, + "step": 33100 + }, + { + "epoch": 4.586902459242885, + "grad_norm": 0.08881094306707382, + "learning_rate": 0.00016472261137573548, + "loss": 1.0468, + "step": 33200 + }, + { + "epoch": 4.6007184305056645, + "grad_norm": 0.0706004872918129, + "learning_rate": 0.00016430232558139533, + "loss": 1.0433, + "step": 33300 + }, + { + "epoch": 4.614534401768444, + 
"grad_norm": 0.07594550400972366, + "learning_rate": 0.00016388203978705518, + "loss": 1.0401, + "step": 33400 + }, + { + "epoch": 4.628350373031224, + "grad_norm": 0.06709697842597961, + "learning_rate": 0.00016346175399271505, + "loss": 1.0406, + "step": 33500 + }, + { + "epoch": 4.6421663442940035, + "grad_norm": 0.055218733847141266, + "learning_rate": 0.0001630414681983749, + "loss": 1.0439, + "step": 33600 + }, + { + "epoch": 4.655982315556784, + "grad_norm": 0.09484557062387466, + "learning_rate": 0.00016262118240403475, + "loss": 1.0445, + "step": 33700 + }, + { + "epoch": 4.669798286819564, + "grad_norm": 0.08181110769510269, + "learning_rate": 0.00016220089660969457, + "loss": 1.0404, + "step": 33800 + }, + { + "epoch": 4.683614258082343, + "grad_norm": 0.07101566344499588, + "learning_rate": 0.00016178061081535442, + "loss": 1.0418, + "step": 33900 + }, + { + "epoch": 4.697430229345123, + "grad_norm": 0.07521411031484604, + "learning_rate": 0.00016136032502101427, + "loss": 1.0413, + "step": 34000 + }, + { + "epoch": 4.711246200607903, + "grad_norm": 0.06438640505075455, + "learning_rate": 0.00016094003922667412, + "loss": 1.0413, + "step": 34100 + }, + { + "epoch": 4.7250621718706824, + "grad_norm": 0.0852956548333168, + "learning_rate": 0.00016051975343233396, + "loss": 1.0411, + "step": 34200 + }, + { + "epoch": 4.738878143133462, + "grad_norm": 0.041669171303510666, + "learning_rate": 0.00016009946763799384, + "loss": 1.043, + "step": 34300 + }, + { + "epoch": 4.752694114396242, + "grad_norm": 0.07866424322128296, + "learning_rate": 0.0001596791818436537, + "loss": 1.0416, + "step": 34400 + }, + { + "epoch": 4.7665100856590215, + "grad_norm": 0.06820093840360641, + "learning_rate": 0.00015925889604931354, + "loss": 1.0419, + "step": 34500 + }, + { + "epoch": 4.780326056921801, + "grad_norm": 0.08769433945417404, + "learning_rate": 0.00015883861025497336, + "loss": 1.0436, + "step": 34600 + }, + { + "epoch": 4.794142028184582, + "grad_norm": 0.11472765356302261, + "learning_rate": 0.0001584183244606332, + "loss": 1.0448, + "step": 34700 + }, + { + "epoch": 4.807957999447361, + "grad_norm": 0.10286398231983185, + "learning_rate": 0.00015799803866629305, + "loss": 1.0396, + "step": 34800 + }, + { + "epoch": 4.821773970710141, + "grad_norm": 0.08412828296422958, + "learning_rate": 0.0001575777528719529, + "loss": 1.0432, + "step": 34900 + }, + { + "epoch": 4.835589941972921, + "grad_norm": 0.06536369025707245, + "learning_rate": 0.00015715746707761275, + "loss": 1.0425, + "step": 35000 + }, + { + "epoch": 4.835589941972921, + "eval_accuracy": 0.5253784900927014, + "eval_loss": 1.0407328605651855, + "eval_runtime": 804.3369, + "eval_samples_per_second": 255.939, + "eval_steps_per_second": 7.999, + "step": 35000 + }, + { + "epoch": 4.8494059132357, + "grad_norm": 0.05366332083940506, + "learning_rate": 0.00015673718128327263, + "loss": 1.0401, + "step": 35100 + }, + { + "epoch": 4.86322188449848, + "grad_norm": 0.05627182498574257, + "learning_rate": 0.00015631689548893248, + "loss": 1.0413, + "step": 35200 + }, + { + "epoch": 4.87703785576126, + "grad_norm": 0.06880544126033783, + "learning_rate": 0.00015589660969459232, + "loss": 1.0399, + "step": 35300 + }, + { + "epoch": 4.89085382702404, + "grad_norm": 0.06326279044151306, + "learning_rate": 0.00015547632390025215, + "loss": 1.0424, + "step": 35400 + }, + { + "epoch": 4.90466979828682, + "grad_norm": 0.050615083426237106, + "learning_rate": 0.000155056038105912, + "loss": 1.0419, + "step": 35500 + }, + { + "epoch": 
4.9184857695496, + "grad_norm": 0.09092865139245987, + "learning_rate": 0.00015463575231157184, + "loss": 1.0417, + "step": 35600 + }, + { + "epoch": 4.932301740812379, + "grad_norm": 0.10828616470098495, + "learning_rate": 0.0001542154665172317, + "loss": 1.0461, + "step": 35700 + }, + { + "epoch": 4.946117712075159, + "grad_norm": 0.10398013889789581, + "learning_rate": 0.00015379518072289154, + "loss": 1.0402, + "step": 35800 + }, + { + "epoch": 4.959933683337939, + "grad_norm": 0.060978490859270096, + "learning_rate": 0.00015337489492855142, + "loss": 1.0428, + "step": 35900 + }, + { + "epoch": 4.973749654600718, + "grad_norm": 0.09474412351846695, + "learning_rate": 0.00015295460913421126, + "loss": 1.0426, + "step": 36000 + }, + { + "epoch": 4.987565625863498, + "grad_norm": 0.055337630212306976, + "learning_rate": 0.0001525343233398711, + "loss": 1.0424, + "step": 36100 + }, + { + "epoch": 5.001381597126278, + "grad_norm": 0.062282662838697433, + "learning_rate": 0.00015211824040347433, + "loss": 1.0408, + "step": 36200 + }, + { + "epoch": 5.015197568389058, + "grad_norm": 0.08418793976306915, + "learning_rate": 0.00015169795460913418, + "loss": 1.0423, + "step": 36300 + }, + { + "epoch": 5.029013539651838, + "grad_norm": 0.056806761771440506, + "learning_rate": 0.00015127766881479403, + "loss": 1.0397, + "step": 36400 + }, + { + "epoch": 5.0428295109146175, + "grad_norm": 0.050782449543476105, + "learning_rate": 0.0001508573830204539, + "loss": 1.0397, + "step": 36500 + }, + { + "epoch": 5.056645482177397, + "grad_norm": 0.04436805471777916, + "learning_rate": 0.00015043709722611375, + "loss": 1.0372, + "step": 36600 + }, + { + "epoch": 5.070461453440177, + "grad_norm": 0.056697145104408264, + "learning_rate": 0.0001500168114317736, + "loss": 1.0396, + "step": 36700 + }, + { + "epoch": 5.0842774247029565, + "grad_norm": 0.0936078131198883, + "learning_rate": 0.00014959652563743342, + "loss": 1.0366, + "step": 36800 + }, + { + "epoch": 5.098093395965736, + "grad_norm": 0.058340467512607574, + "learning_rate": 0.0001491762398430933, + "loss": 1.038, + "step": 36900 + }, + { + "epoch": 5.111909367228516, + "grad_norm": 0.07920562475919724, + "learning_rate": 0.00014875595404875315, + "loss": 1.0389, + "step": 37000 + }, + { + "epoch": 5.1257253384912955, + "grad_norm": 0.054546140134334564, + "learning_rate": 0.000148335668254413, + "loss": 1.0352, + "step": 37100 + }, + { + "epoch": 5.139541309754076, + "grad_norm": 0.0779619961977005, + "learning_rate": 0.00014791538246007282, + "loss": 1.0362, + "step": 37200 + }, + { + "epoch": 5.153357281016856, + "grad_norm": 0.06077539920806885, + "learning_rate": 0.0001474950966657327, + "loss": 1.0395, + "step": 37300 + }, + { + "epoch": 5.167173252279635, + "grad_norm": 0.07015964388847351, + "learning_rate": 0.00014707481087139254, + "loss": 1.0378, + "step": 37400 + }, + { + "epoch": 5.180989223542415, + "grad_norm": 0.07821048051118851, + "learning_rate": 0.0001466545250770524, + "loss": 1.0358, + "step": 37500 + }, + { + "epoch": 5.194805194805195, + "grad_norm": 0.06446918845176697, + "learning_rate": 0.0001462342392827122, + "loss": 1.0401, + "step": 37600 + }, + { + "epoch": 5.208621166067974, + "grad_norm": 0.0754179060459137, + "learning_rate": 0.0001458139534883721, + "loss": 1.0372, + "step": 37700 + }, + { + "epoch": 5.222437137330754, + "grad_norm": 0.06225774064660072, + "learning_rate": 0.00014539366769403194, + "loss": 1.0396, + "step": 37800 + }, + { + "epoch": 5.236253108593534, + "grad_norm": 0.09567879885435104, + 
"learning_rate": 0.00014497338189969178, + "loss": 1.0427, + "step": 37900 + }, + { + "epoch": 5.250069079856313, + "grad_norm": 0.0810612216591835, + "learning_rate": 0.00014455309610535163, + "loss": 1.0368, + "step": 38000 + }, + { + "epoch": 5.263885051119094, + "grad_norm": 0.058250732719898224, + "learning_rate": 0.00014413281031101148, + "loss": 1.039, + "step": 38100 + }, + { + "epoch": 5.277701022381874, + "grad_norm": 0.07354842871427536, + "learning_rate": 0.00014371252451667133, + "loss": 1.0393, + "step": 38200 + }, + { + "epoch": 5.291516993644653, + "grad_norm": 0.04756517335772514, + "learning_rate": 0.00014329223872233118, + "loss": 1.0369, + "step": 38300 + }, + { + "epoch": 5.305332964907433, + "grad_norm": 0.05551883205771446, + "learning_rate": 0.00014287195292799103, + "loss": 1.038, + "step": 38400 + }, + { + "epoch": 5.319148936170213, + "grad_norm": 0.05476289987564087, + "learning_rate": 0.00014245166713365088, + "loss": 1.0391, + "step": 38500 + }, + { + "epoch": 5.332964907432992, + "grad_norm": 0.041929882019758224, + "learning_rate": 0.00014203138133931072, + "loss": 1.0377, + "step": 38600 + }, + { + "epoch": 5.346780878695772, + "grad_norm": 0.05916072428226471, + "learning_rate": 0.00014161109554497057, + "loss": 1.0417, + "step": 38700 + }, + { + "epoch": 5.360596849958552, + "grad_norm": 0.0609772689640522, + "learning_rate": 0.00014119080975063042, + "loss": 1.0386, + "step": 38800 + }, + { + "epoch": 5.374412821221332, + "grad_norm": 0.06430498510599136, + "learning_rate": 0.00014077052395629027, + "loss": 1.0397, + "step": 38900 + }, + { + "epoch": 5.388228792484112, + "grad_norm": 0.07042800635099411, + "learning_rate": 0.00014035023816195012, + "loss": 1.038, + "step": 39000 + }, + { + "epoch": 5.402044763746892, + "grad_norm": 0.05623612925410271, + "learning_rate": 0.00013992995236760997, + "loss": 1.0405, + "step": 39100 + }, + { + "epoch": 5.415860735009671, + "grad_norm": 0.04936366528272629, + "learning_rate": 0.00013950966657326982, + "loss": 1.0404, + "step": 39200 + }, + { + "epoch": 5.429676706272451, + "grad_norm": 0.05738508701324463, + "learning_rate": 0.00013908938077892966, + "loss": 1.0364, + "step": 39300 + }, + { + "epoch": 5.443492677535231, + "grad_norm": 0.09567712992429733, + "learning_rate": 0.0001386690949845895, + "loss": 1.0381, + "step": 39400 + }, + { + "epoch": 5.45730864879801, + "grad_norm": 0.07306545972824097, + "learning_rate": 0.00013824880919024936, + "loss": 1.0394, + "step": 39500 + }, + { + "epoch": 5.47112462006079, + "grad_norm": 0.060108475387096405, + "learning_rate": 0.0001378285233959092, + "loss": 1.0379, + "step": 39600 + }, + { + "epoch": 5.48494059132357, + "grad_norm": 0.08150669932365417, + "learning_rate": 0.00013740823760156906, + "loss": 1.0391, + "step": 39700 + }, + { + "epoch": 5.49875656258635, + "grad_norm": 0.06265643239021301, + "learning_rate": 0.0001369879518072289, + "loss": 1.0419, + "step": 39800 + }, + { + "epoch": 5.51257253384913, + "grad_norm": 0.09023050218820572, + "learning_rate": 0.00013656766601288876, + "loss": 1.0374, + "step": 39900 + }, + { + "epoch": 5.5263885051119095, + "grad_norm": 0.06600885838270187, + "learning_rate": 0.0001361473802185486, + "loss": 1.0365, + "step": 40000 + }, + { + "epoch": 5.5263885051119095, + "eval_accuracy": 0.52706640122358, + "eval_loss": 1.0380040407180786, + "eval_runtime": 773.4583, + "eval_samples_per_second": 266.157, + "eval_steps_per_second": 8.318, + "step": 40000 + }, + { + "epoch": 5.540204476374689, + "grad_norm": 
0.07041644304990768, + "learning_rate": 0.00013572709442420845, + "loss": 1.038, + "step": 40100 + }, + { + "epoch": 5.554020447637469, + "grad_norm": 0.0819341391324997, + "learning_rate": 0.0001353110114878117, + "loss": 1.0383, + "step": 40200 + }, + { + "epoch": 5.5678364189002485, + "grad_norm": 0.04390214383602142, + "learning_rate": 0.00013489072569347155, + "loss": 1.0381, + "step": 40300 + }, + { + "epoch": 5.581652390163028, + "grad_norm": 0.0681944414973259, + "learning_rate": 0.0001344704398991314, + "loss": 1.0368, + "step": 40400 + }, + { + "epoch": 5.595468361425809, + "grad_norm": 0.0888848677277565, + "learning_rate": 0.00013405015410479124, + "loss": 1.0369, + "step": 40500 + }, + { + "epoch": 5.609284332688588, + "grad_norm": 0.07275230437517166, + "learning_rate": 0.0001336298683104511, + "loss": 1.0353, + "step": 40600 + }, + { + "epoch": 5.623100303951368, + "grad_norm": 0.10200846940279007, + "learning_rate": 0.00013320958251611094, + "loss": 1.0381, + "step": 40700 + }, + { + "epoch": 5.636916275214148, + "grad_norm": 0.056480832397937775, + "learning_rate": 0.0001327892967217708, + "loss": 1.0383, + "step": 40800 + }, + { + "epoch": 5.650732246476927, + "grad_norm": 0.0845484584569931, + "learning_rate": 0.00013236901092743064, + "loss": 1.0385, + "step": 40900 + }, + { + "epoch": 5.664548217739707, + "grad_norm": 0.05990500748157501, + "learning_rate": 0.0001319487251330905, + "loss": 1.0381, + "step": 41000 + }, + { + "epoch": 5.678364189002487, + "grad_norm": 0.04566818103194237, + "learning_rate": 0.00013152843933875034, + "loss": 1.0409, + "step": 41100 + }, + { + "epoch": 5.692180160265266, + "grad_norm": 0.05529521405696869, + "learning_rate": 0.00013110815354441018, + "loss": 1.039, + "step": 41200 + }, + { + "epoch": 5.705996131528046, + "grad_norm": 0.08812158554792404, + "learning_rate": 0.00013068786775007003, + "loss": 1.0393, + "step": 41300 + }, + { + "epoch": 5.719812102790826, + "grad_norm": 0.0714721605181694, + "learning_rate": 0.00013026758195572988, + "loss": 1.0365, + "step": 41400 + }, + { + "epoch": 5.733628074053606, + "grad_norm": 0.050889432430267334, + "learning_rate": 0.00012984729616138973, + "loss": 1.0399, + "step": 41500 + }, + { + "epoch": 5.747444045316386, + "grad_norm": 0.05863107368350029, + "learning_rate": 0.00012942701036704958, + "loss": 1.0401, + "step": 41600 + }, + { + "epoch": 5.761260016579166, + "grad_norm": 0.05279000476002693, + "learning_rate": 0.00012900672457270943, + "loss": 1.0368, + "step": 41700 + }, + { + "epoch": 5.775075987841945, + "grad_norm": 0.06430874019861221, + "learning_rate": 0.00012858643877836928, + "loss": 1.0347, + "step": 41800 + }, + { + "epoch": 5.788891959104725, + "grad_norm": 0.1187288910150528, + "learning_rate": 0.00012816615298402912, + "loss": 1.0372, + "step": 41900 + }, + { + "epoch": 5.802707930367505, + "grad_norm": 0.05984746664762497, + "learning_rate": 0.00012774586718968897, + "loss": 1.036, + "step": 42000 + }, + { + "epoch": 5.816523901630284, + "grad_norm": 0.047202371060848236, + "learning_rate": 0.00012732558139534882, + "loss": 1.0341, + "step": 42100 + }, + { + "epoch": 5.830339872893065, + "grad_norm": 0.0888022631406784, + "learning_rate": 0.00012690949845895207, + "loss": 1.0358, + "step": 42200 + }, + { + "epoch": 5.8441558441558445, + "grad_norm": 0.071753591299057, + "learning_rate": 0.00012648921266461191, + "loss": 1.0356, + "step": 42300 + }, + { + "epoch": 5.857971815418624, + "grad_norm": 0.06311481446027756, + "learning_rate": 0.0001260689268702718, + 
"loss": 1.0381, + "step": 42400 + }, + { + "epoch": 5.871787786681404, + "grad_norm": 0.05733519420027733, + "learning_rate": 0.0001256486410759316, + "loss": 1.0366, + "step": 42500 + }, + { + "epoch": 5.885603757944184, + "grad_norm": 0.05296749621629715, + "learning_rate": 0.00012522835528159146, + "loss": 1.0391, + "step": 42600 + }, + { + "epoch": 5.899419729206963, + "grad_norm": 0.05728083476424217, + "learning_rate": 0.0001248080694872513, + "loss": 1.0393, + "step": 42700 + }, + { + "epoch": 5.913235700469743, + "grad_norm": 0.10918726772069931, + "learning_rate": 0.00012438778369291118, + "loss": 1.0375, + "step": 42800 + }, + { + "epoch": 5.927051671732523, + "grad_norm": 0.043641045689582825, + "learning_rate": 0.000123967497898571, + "loss": 1.0342, + "step": 42900 + }, + { + "epoch": 5.940867642995302, + "grad_norm": 0.07793564349412918, + "learning_rate": 0.00012354721210423085, + "loss": 1.037, + "step": 43000 + }, + { + "epoch": 5.954683614258082, + "grad_norm": 0.10596407949924469, + "learning_rate": 0.0001231269263098907, + "loss": 1.0361, + "step": 43100 + }, + { + "epoch": 5.9684995855208625, + "grad_norm": 0.05018968880176544, + "learning_rate": 0.00012270664051555058, + "loss": 1.0352, + "step": 43200 + }, + { + "epoch": 5.982315556783642, + "grad_norm": 0.06663347035646439, + "learning_rate": 0.0001222863547212104, + "loss": 1.0379, + "step": 43300 + }, + { + "epoch": 5.996131528046422, + "grad_norm": 0.05061174929141998, + "learning_rate": 0.00012186606892687026, + "loss": 1.0378, + "step": 43400 + }, + { + "epoch": 6.0099474993092015, + "grad_norm": 0.07496211677789688, + "learning_rate": 0.00012144578313253011, + "loss": 1.0357, + "step": 43500 + }, + { + "epoch": 6.023763470571981, + "grad_norm": 0.058973684906959534, + "learning_rate": 0.00012102549733818996, + "loss": 1.0336, + "step": 43600 + }, + { + "epoch": 6.037579441834761, + "grad_norm": 0.07304850965738297, + "learning_rate": 0.0001206052115438498, + "loss": 1.0366, + "step": 43700 + }, + { + "epoch": 6.0513954130975405, + "grad_norm": 0.05964922904968262, + "learning_rate": 0.00012018492574950966, + "loss": 1.0358, + "step": 43800 + }, + { + "epoch": 6.06521138436032, + "grad_norm": 0.10107408463954926, + "learning_rate": 0.0001197646399551695, + "loss": 1.0363, + "step": 43900 + }, + { + "epoch": 6.079027355623101, + "grad_norm": 0.05830320343375206, + "learning_rate": 0.00011934435416082935, + "loss": 1.0374, + "step": 44000 + }, + { + "epoch": 6.09284332688588, + "grad_norm": 0.06493101269006729, + "learning_rate": 0.00011892406836648919, + "loss": 1.0358, + "step": 44100 + }, + { + "epoch": 6.10665929814866, + "grad_norm": 0.06381756067276001, + "learning_rate": 0.00011850798543009245, + "loss": 1.0345, + "step": 44200 + }, + { + "epoch": 6.12047526941144, + "grad_norm": 0.057328786700963974, + "learning_rate": 0.0001180876996357523, + "loss": 1.0347, + "step": 44300 + }, + { + "epoch": 6.134291240674219, + "grad_norm": 0.09036822617053986, + "learning_rate": 0.00011766741384141216, + "loss": 1.0352, + "step": 44400 + }, + { + "epoch": 6.148107211936999, + "grad_norm": 0.05485937371850014, + "learning_rate": 0.000117247128047072, + "loss": 1.0371, + "step": 44500 + }, + { + "epoch": 6.161923183199779, + "grad_norm": 0.06304465979337692, + "learning_rate": 0.00011682684225273184, + "loss": 1.0302, + "step": 44600 + }, + { + "epoch": 6.175739154462558, + "grad_norm": 0.045126065611839294, + "learning_rate": 0.0001164065564583917, + "loss": 1.0338, + "step": 44700 + }, + { + "epoch": 
6.189555125725338, + "grad_norm": 0.06636038422584534, + "learning_rate": 0.00011598627066405155, + "loss": 1.0353, + "step": 44800 + }, + { + "epoch": 6.203371096988119, + "grad_norm": 0.05977385491132736, + "learning_rate": 0.00011556598486971139, + "loss": 1.0346, + "step": 44900 + }, + { + "epoch": 6.217187068250898, + "grad_norm": 0.07459376752376556, + "learning_rate": 0.00011514569907537124, + "loss": 1.0325, + "step": 45000 + }, + { + "epoch": 6.217187068250898, + "eval_accuracy": 0.5284276106869993, + "eval_loss": 1.0360603332519531, + "eval_runtime": 770.702, + "eval_samples_per_second": 267.108, + "eval_steps_per_second": 8.348, + "step": 45000 + }, + { + "epoch": 6.231003039513678, + "grad_norm": 0.050757069140672684, + "learning_rate": 0.0001147254132810311, + "loss": 1.0337, + "step": 45100 + }, + { + "epoch": 6.244819010776458, + "grad_norm": 0.065644271671772, + "learning_rate": 0.00011430512748669095, + "loss": 1.035, + "step": 45200 + }, + { + "epoch": 6.258634982039237, + "grad_norm": 0.06008651480078697, + "learning_rate": 0.00011388484169235078, + "loss": 1.0323, + "step": 45300 + }, + { + "epoch": 6.272450953302017, + "grad_norm": 0.050868868827819824, + "learning_rate": 0.00011346455589801063, + "loss": 1.0341, + "step": 45400 + }, + { + "epoch": 6.286266924564797, + "grad_norm": 0.0535401850938797, + "learning_rate": 0.00011304427010367049, + "loss": 1.0349, + "step": 45500 + }, + { + "epoch": 6.300082895827576, + "grad_norm": 0.07083383947610855, + "learning_rate": 0.00011262398430933034, + "loss": 1.0327, + "step": 45600 + }, + { + "epoch": 6.313898867090357, + "grad_norm": 0.06998474150896072, + "learning_rate": 0.00011220369851499018, + "loss": 1.035, + "step": 45700 + }, + { + "epoch": 6.3277148383531365, + "grad_norm": 0.06696050614118576, + "learning_rate": 0.00011178341272065002, + "loss": 1.0342, + "step": 45800 + }, + { + "epoch": 6.341530809615916, + "grad_norm": 0.050143785774707794, + "learning_rate": 0.00011136312692630989, + "loss": 1.0342, + "step": 45900 + }, + { + "epoch": 6.355346780878696, + "grad_norm": 0.066258005797863, + "learning_rate": 0.00011094284113196974, + "loss": 1.0368, + "step": 46000 + }, + { + "epoch": 6.3691627521414755, + "grad_norm": 0.057613175362348557, + "learning_rate": 0.00011052255533762957, + "loss": 1.0357, + "step": 46100 + }, + { + "epoch": 6.382978723404255, + "grad_norm": 0.07405593246221542, + "learning_rate": 0.00011010647240123283, + "loss": 1.033, + "step": 46200 + }, + { + "epoch": 6.396794694667035, + "grad_norm": 0.07005150616168976, + "learning_rate": 0.00010968618660689268, + "loss": 1.0329, + "step": 46300 + }, + { + "epoch": 6.4106106659298145, + "grad_norm": 0.057546067982912064, + "learning_rate": 0.00010926590081255253, + "loss": 1.033, + "step": 46400 + }, + { + "epoch": 6.424426637192594, + "grad_norm": 0.08016248792409897, + "learning_rate": 0.00010884561501821236, + "loss": 1.0389, + "step": 46500 + }, + { + "epoch": 6.438242608455375, + "grad_norm": 0.08346617966890335, + "learning_rate": 0.00010842532922387222, + "loss": 1.0332, + "step": 46600 + }, + { + "epoch": 6.452058579718154, + "grad_norm": 0.048157453536987305, + "learning_rate": 0.00010800504342953207, + "loss": 1.0342, + "step": 46700 + }, + { + "epoch": 6.465874550980934, + "grad_norm": 0.06816009432077408, + "learning_rate": 0.00010758475763519192, + "loss": 1.0357, + "step": 46800 + }, + { + "epoch": 6.479690522243714, + "grad_norm": 0.05210613086819649, + "learning_rate": 0.00010716447184085176, + "loss": 1.0345, + "step": 46900 + 
}, + { + "epoch": 6.4935064935064934, + "grad_norm": 0.08138227462768555, + "learning_rate": 0.00010674418604651162, + "loss": 1.035, + "step": 47000 + }, + { + "epoch": 6.507322464769273, + "grad_norm": 0.07494477927684784, + "learning_rate": 0.00010632390025217147, + "loss": 1.0361, + "step": 47100 + }, + { + "epoch": 6.521138436032053, + "grad_norm": 0.07473413646221161, + "learning_rate": 0.00010590361445783132, + "loss": 1.0339, + "step": 47200 + }, + { + "epoch": 6.5349544072948325, + "grad_norm": 0.07200802862644196, + "learning_rate": 0.00010548332866349115, + "loss": 1.0333, + "step": 47300 + }, + { + "epoch": 6.548770378557613, + "grad_norm": 0.06346756964921951, + "learning_rate": 0.00010506304286915101, + "loss": 1.0345, + "step": 47400 + }, + { + "epoch": 6.562586349820393, + "grad_norm": 0.06382066756486893, + "learning_rate": 0.00010464275707481086, + "loss": 1.0352, + "step": 47500 + }, + { + "epoch": 6.576402321083172, + "grad_norm": 0.1000475063920021, + "learning_rate": 0.00010422247128047071, + "loss": 1.0344, + "step": 47600 + }, + { + "epoch": 6.590218292345952, + "grad_norm": 0.06456384807825089, + "learning_rate": 0.00010380218548613057, + "loss": 1.0356, + "step": 47700 + }, + { + "epoch": 6.604034263608732, + "grad_norm": 0.052929963916540146, + "learning_rate": 0.0001033818996917904, + "loss": 1.0343, + "step": 47800 + }, + { + "epoch": 6.617850234871511, + "grad_norm": 0.07275223731994629, + "learning_rate": 0.00010296161389745025, + "loss": 1.033, + "step": 47900 + }, + { + "epoch": 6.631666206134291, + "grad_norm": 0.060610584914684296, + "learning_rate": 0.0001025413281031101, + "loss": 1.0334, + "step": 48000 + }, + { + "epoch": 6.645482177397071, + "grad_norm": 0.0514766089618206, + "learning_rate": 0.00010212104230876997, + "loss": 1.0351, + "step": 48100 + }, + { + "epoch": 6.65929814865985, + "grad_norm": 0.08950326591730118, + "learning_rate": 0.0001017049593723732, + "loss": 1.0341, + "step": 48200 + }, + { + "epoch": 6.673114119922631, + "grad_norm": 0.052268847823143005, + "learning_rate": 0.00010128467357803306, + "loss": 1.0342, + "step": 48300 + }, + { + "epoch": 6.686930091185411, + "grad_norm": 0.059182267636060715, + "learning_rate": 0.00010086438778369291, + "loss": 1.0303, + "step": 48400 + }, + { + "epoch": 6.70074606244819, + "grad_norm": 0.06220945715904236, + "learning_rate": 0.00010044410198935274, + "loss": 1.032, + "step": 48500 + }, + { + "epoch": 6.71456203371097, + "grad_norm": 0.0486241914331913, + "learning_rate": 0.00010002381619501259, + "loss": 1.0338, + "step": 48600 + }, + { + "epoch": 6.72837800497375, + "grad_norm": 0.04813262075185776, + "learning_rate": 9.960353040067245e-05, + "loss": 1.0344, + "step": 48700 + }, + { + "epoch": 6.742193976236529, + "grad_norm": 0.04981222748756409, + "learning_rate": 9.91832446063323e-05, + "loss": 1.0347, + "step": 48800 + }, + { + "epoch": 6.756009947499309, + "grad_norm": 0.050560541450977325, + "learning_rate": 9.876295881199214e-05, + "loss": 1.0338, + "step": 48900 + }, + { + "epoch": 6.769825918762089, + "grad_norm": 0.05338674411177635, + "learning_rate": 9.834267301765199e-05, + "loss": 1.0369, + "step": 49000 + }, + { + "epoch": 6.783641890024869, + "grad_norm": 0.042156435549259186, + "learning_rate": 9.792238722331185e-05, + "loss": 1.0345, + "step": 49100 + }, + { + "epoch": 6.797457861287649, + "grad_norm": 0.0622396394610405, + "learning_rate": 9.75021014289717e-05, + "loss": 1.0321, + "step": 49200 + }, + { + "epoch": 6.8112738325504285, + "grad_norm": 
0.08523661643266678, + "learning_rate": 9.708181563463155e-05, + "loss": 1.0317, + "step": 49300 + }, + { + "epoch": 6.825089803813208, + "grad_norm": 0.055176641792058945, + "learning_rate": 9.666152984029138e-05, + "loss": 1.0368, + "step": 49400 + }, + { + "epoch": 6.838905775075988, + "grad_norm": 0.07358380407094955, + "learning_rate": 9.624124404595124e-05, + "loss": 1.0318, + "step": 49500 + }, + { + "epoch": 6.8527217463387675, + "grad_norm": 0.055568769574165344, + "learning_rate": 9.582095825161109e-05, + "loss": 1.0343, + "step": 49600 + }, + { + "epoch": 6.866537717601547, + "grad_norm": 0.04249552637338638, + "learning_rate": 9.540067245727094e-05, + "loss": 1.0331, + "step": 49700 + }, + { + "epoch": 6.880353688864327, + "grad_norm": 0.05274058133363724, + "learning_rate": 9.498038666293077e-05, + "loss": 1.0351, + "step": 49800 + }, + { + "epoch": 6.8941696601271065, + "grad_norm": 0.04792112484574318, + "learning_rate": 9.456010086859064e-05, + "loss": 1.0333, + "step": 49900 + }, + { + "epoch": 6.907985631389887, + "grad_norm": 0.05513302981853485, + "learning_rate": 9.413981507425049e-05, + "loss": 1.0322, + "step": 50000 + }, + { + "epoch": 6.907985631389887, + "eval_accuracy": 0.5296076152096916, + "eval_loss": 1.0341060161590576, + "eval_runtime": 725.8939, + "eval_samples_per_second": 283.597, + "eval_steps_per_second": 8.864, + "step": 50000 + }, + { + "epoch": 6.921801602652667, + "grad_norm": 0.05296773836016655, + "learning_rate": 9.371952927991033e-05, + "loss": 1.031, + "step": 50100 + }, + { + "epoch": 6.935617573915446, + "grad_norm": 0.062248583883047104, + "learning_rate": 9.330344634351358e-05, + "loss": 1.0341, + "step": 50200 + }, + { + "epoch": 6.949433545178226, + "grad_norm": 0.07751675695180893, + "learning_rate": 9.288316054917343e-05, + "loss": 1.0352, + "step": 50300 + }, + { + "epoch": 6.963249516441006, + "grad_norm": 0.04984898492693901, + "learning_rate": 9.246287475483328e-05, + "loss": 1.0302, + "step": 50400 + }, + { + "epoch": 6.977065487703785, + "grad_norm": 0.04315504804253578, + "learning_rate": 9.204258896049314e-05, + "loss": 1.0327, + "step": 50500 + }, + { + "epoch": 6.990881458966565, + "grad_norm": 0.053620435297489166, + "learning_rate": 9.162230316615297e-05, + "loss": 1.0328, + "step": 50600 + }, + { + "epoch": 7.004697430229345, + "grad_norm": 0.04611975699663162, + "learning_rate": 9.120201737181282e-05, + "loss": 1.0336, + "step": 50700 + }, + { + "epoch": 7.018513401492125, + "grad_norm": 0.04269848018884659, + "learning_rate": 9.078173157747267e-05, + "loss": 1.0282, + "step": 50800 + }, + { + "epoch": 7.032329372754905, + "grad_norm": 0.055365532636642456, + "learning_rate": 9.036144578313253e-05, + "loss": 1.0339, + "step": 50900 + }, + { + "epoch": 7.046145344017685, + "grad_norm": 0.06129321828484535, + "learning_rate": 8.994115998879237e-05, + "loss": 1.0304, + "step": 51000 + }, + { + "epoch": 7.059961315280464, + "grad_norm": 0.06094348803162575, + "learning_rate": 8.952507705239563e-05, + "loss": 1.0288, + "step": 51100 + }, + { + "epoch": 7.073777286543244, + "grad_norm": 0.048849135637283325, + "learning_rate": 8.910479125805548e-05, + "loss": 1.0322, + "step": 51200 + }, + { + "epoch": 7.087593257806024, + "grad_norm": 0.05081125721335411, + "learning_rate": 8.868450546371531e-05, + "loss": 1.0303, + "step": 51300 + }, + { + "epoch": 7.101409229068803, + "grad_norm": 0.07727497071027756, + "learning_rate": 8.826421966937516e-05, + "loss": 1.03, + "step": 51400 + }, + { + "epoch": 7.115225200331583, + 
"grad_norm": 0.06357153505086899, + "learning_rate": 8.784393387503502e-05, + "loss": 1.0342, + "step": 51500 + }, + { + "epoch": 7.129041171594363, + "grad_norm": 0.05598052963614464, + "learning_rate": 8.742364808069487e-05, + "loss": 1.0312, + "step": 51600 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.06753697246313095, + "learning_rate": 8.70033622863547e-05, + "loss": 1.0306, + "step": 51700 + }, + { + "epoch": 7.156673114119923, + "grad_norm": 0.06586912274360657, + "learning_rate": 8.658307649201455e-05, + "loss": 1.0311, + "step": 51800 + }, + { + "epoch": 7.170489085382703, + "grad_norm": 0.10361455380916595, + "learning_rate": 8.616279069767442e-05, + "loss": 1.0326, + "step": 51900 + }, + { + "epoch": 7.184305056645482, + "grad_norm": 0.09442713856697083, + "learning_rate": 8.574250490333426e-05, + "loss": 1.0339, + "step": 52000 + }, + { + "epoch": 7.198121027908262, + "grad_norm": 0.08114325255155563, + "learning_rate": 8.532221910899411e-05, + "loss": 1.0335, + "step": 52100 + }, + { + "epoch": 7.211936999171042, + "grad_norm": 0.054252710193395615, + "learning_rate": 8.490193331465395e-05, + "loss": 1.0316, + "step": 52200 + }, + { + "epoch": 7.225752970433821, + "grad_norm": 0.059643086045980453, + "learning_rate": 8.448164752031381e-05, + "loss": 1.027, + "step": 52300 + }, + { + "epoch": 7.239568941696601, + "grad_norm": 0.045472096651792526, + "learning_rate": 8.406136172597366e-05, + "loss": 1.0311, + "step": 52400 + }, + { + "epoch": 7.2533849129593815, + "grad_norm": 0.0669686570763588, + "learning_rate": 8.36410759316335e-05, + "loss": 1.0309, + "step": 52500 + }, + { + "epoch": 7.267200884222161, + "grad_norm": 0.0454520583152771, + "learning_rate": 8.322079013729334e-05, + "loss": 1.0327, + "step": 52600 + }, + { + "epoch": 7.281016855484941, + "grad_norm": 0.05776028707623482, + "learning_rate": 8.28005043429532e-05, + "loss": 1.0318, + "step": 52700 + }, + { + "epoch": 7.2948328267477205, + "grad_norm": 0.051905229687690735, + "learning_rate": 8.238021854861305e-05, + "loss": 1.0313, + "step": 52800 + }, + { + "epoch": 7.3086487980105, + "grad_norm": 0.056912437081336975, + "learning_rate": 8.19599327542729e-05, + "loss": 1.0325, + "step": 52900 + }, + { + "epoch": 7.32246476927328, + "grad_norm": 0.04940250515937805, + "learning_rate": 8.153964695993274e-05, + "loss": 1.0323, + "step": 53000 + }, + { + "epoch": 7.3362807405360595, + "grad_norm": 0.04186444729566574, + "learning_rate": 8.11193611655926e-05, + "loss": 1.0285, + "step": 53100 + }, + { + "epoch": 7.350096711798839, + "grad_norm": 0.041809357702732086, + "learning_rate": 8.069907537125245e-05, + "loss": 1.0289, + "step": 53200 + }, + { + "epoch": 7.363912683061619, + "grad_norm": 0.05794375389814377, + "learning_rate": 8.02787895769123e-05, + "loss": 1.031, + "step": 53300 + }, + { + "epoch": 7.377728654324399, + "grad_norm": 0.08333911001682281, + "learning_rate": 7.985850378257213e-05, + "loss": 1.0316, + "step": 53400 + }, + { + "epoch": 7.391544625587179, + "grad_norm": 0.06473658233880997, + "learning_rate": 7.943821798823199e-05, + "loss": 1.0317, + "step": 53500 + }, + { + "epoch": 7.405360596849959, + "grad_norm": 0.05173886939883232, + "learning_rate": 7.901793219389184e-05, + "loss": 1.0308, + "step": 53600 + }, + { + "epoch": 7.419176568112738, + "grad_norm": 0.06362345069646835, + "learning_rate": 7.859764639955169e-05, + "loss": 1.0324, + "step": 53700 + }, + { + "epoch": 7.432992539375518, + "grad_norm": 0.054053716361522675, + "learning_rate": 7.817736060521152e-05, + 
"loss": 1.0303, + "step": 53800 + }, + { + "epoch": 7.446808510638298, + "grad_norm": 0.048420459032058716, + "learning_rate": 7.775707481087139e-05, + "loss": 1.0299, + "step": 53900 + }, + { + "epoch": 7.460624481901077, + "grad_norm": 0.0606950968503952, + "learning_rate": 7.733678901653123e-05, + "loss": 1.0317, + "step": 54000 + }, + { + "epoch": 7.474440453163857, + "grad_norm": 0.06072583049535751, + "learning_rate": 7.691650322219108e-05, + "loss": 1.033, + "step": 54100 + }, + { + "epoch": 7.488256424426638, + "grad_norm": 0.05064817890524864, + "learning_rate": 7.649621742785093e-05, + "loss": 1.0287, + "step": 54200 + }, + { + "epoch": 7.502072395689417, + "grad_norm": 0.09318757057189941, + "learning_rate": 7.607593163351078e-05, + "loss": 1.0296, + "step": 54300 + }, + { + "epoch": 7.515888366952197, + "grad_norm": 0.0935215950012207, + "learning_rate": 7.565564583917063e-05, + "loss": 1.0322, + "step": 54400 + }, + { + "epoch": 7.529704338214977, + "grad_norm": 0.07255256175994873, + "learning_rate": 7.523536004483048e-05, + "loss": 1.0333, + "step": 54500 + }, + { + "epoch": 7.543520309477756, + "grad_norm": 0.05486008897423744, + "learning_rate": 7.481507425049033e-05, + "loss": 1.032, + "step": 54600 + }, + { + "epoch": 7.557336280740536, + "grad_norm": 0.0525212287902832, + "learning_rate": 7.439478845615017e-05, + "loss": 1.0293, + "step": 54700 + }, + { + "epoch": 7.571152252003316, + "grad_norm": 0.047569695860147476, + "learning_rate": 7.397450266181002e-05, + "loss": 1.0282, + "step": 54800 + }, + { + "epoch": 7.584968223266095, + "grad_norm": 0.06165711581707001, + "learning_rate": 7.355421686746987e-05, + "loss": 1.0312, + "step": 54900 + }, + { + "epoch": 7.598784194528875, + "grad_norm": 0.0578945092856884, + "learning_rate": 7.313393107312972e-05, + "loss": 1.0307, + "step": 55000 + }, + { + "epoch": 7.598784194528875, + "eval_accuracy": 0.5305025000901846, + "eval_loss": 1.0327985286712646, + "eval_runtime": 731.5754, + "eval_samples_per_second": 281.394, + "eval_steps_per_second": 8.795, + "step": 55000 + }, + { + "epoch": 7.612600165791655, + "grad_norm": 0.0795338973402977, + "learning_rate": 7.271784813673297e-05, + "loss": 1.0294, + "step": 55100 + }, + { + "epoch": 7.626416137054435, + "grad_norm": 0.06103779003024101, + "learning_rate": 7.229756234239283e-05, + "loss": 1.033, + "step": 55200 + }, + { + "epoch": 7.640232108317215, + "grad_norm": 0.0635315552353859, + "learning_rate": 7.187727654805266e-05, + "loss": 1.0296, + "step": 55300 + }, + { + "epoch": 7.654048079579995, + "grad_norm": 0.05289231240749359, + "learning_rate": 7.145699075371253e-05, + "loss": 1.034, + "step": 55400 + }, + { + "epoch": 7.667864050842774, + "grad_norm": 0.07801427692174911, + "learning_rate": 7.103670495937236e-05, + "loss": 1.0332, + "step": 55500 + }, + { + "epoch": 7.681680022105554, + "grad_norm": 0.07564268261194229, + "learning_rate": 7.061641916503222e-05, + "loss": 1.0299, + "step": 55600 + }, + { + "epoch": 7.695495993368334, + "grad_norm": 0.04168133810162544, + "learning_rate": 7.019613337069206e-05, + "loss": 1.03, + "step": 55700 + }, + { + "epoch": 7.709311964631113, + "grad_norm": 0.11210035532712936, + "learning_rate": 6.977584757635192e-05, + "loss": 1.0301, + "step": 55800 + }, + { + "epoch": 7.723127935893894, + "grad_norm": 0.09023060649633408, + "learning_rate": 6.935556178201175e-05, + "loss": 1.0285, + "step": 55900 + }, + { + "epoch": 7.7369439071566735, + "grad_norm": 0.05271260067820549, + "learning_rate": 6.893527598767162e-05, + "loss": 
1.0315, + "step": 56000 + }, + { + "epoch": 7.750759878419453, + "grad_norm": 0.06293012201786041, + "learning_rate": 6.851499019333145e-05, + "loss": 1.0286, + "step": 56100 + }, + { + "epoch": 7.764575849682233, + "grad_norm": 0.04555558040738106, + "learning_rate": 6.809470439899131e-05, + "loss": 1.0308, + "step": 56200 + }, + { + "epoch": 7.7783918209450125, + "grad_norm": 0.042364273220300674, + "learning_rate": 6.767441860465115e-05, + "loss": 1.0311, + "step": 56300 + }, + { + "epoch": 7.792207792207792, + "grad_norm": 0.05084213241934776, + "learning_rate": 6.725413281031101e-05, + "loss": 1.0298, + "step": 56400 + }, + { + "epoch": 7.806023763470572, + "grad_norm": 0.059168051928281784, + "learning_rate": 6.683384701597085e-05, + "loss": 1.0303, + "step": 56500 + }, + { + "epoch": 7.8198397347333515, + "grad_norm": 0.05535740405321121, + "learning_rate": 6.641356122163071e-05, + "loss": 1.0306, + "step": 56600 + }, + { + "epoch": 7.833655705996131, + "grad_norm": 0.06625715643167496, + "learning_rate": 6.599327542729054e-05, + "loss": 1.0283, + "step": 56700 + }, + { + "epoch": 7.847471677258911, + "grad_norm": 0.04644458368420601, + "learning_rate": 6.55729896329504e-05, + "loss": 1.0289, + "step": 56800 + }, + { + "epoch": 7.861287648521691, + "grad_norm": 0.05319574847817421, + "learning_rate": 6.515270383861024e-05, + "loss": 1.0303, + "step": 56900 + }, + { + "epoch": 7.875103619784471, + "grad_norm": 0.06394356489181519, + "learning_rate": 6.47324180442701e-05, + "loss": 1.0315, + "step": 57000 + }, + { + "epoch": 7.888919591047251, + "grad_norm": 0.0535539835691452, + "learning_rate": 6.431633510787335e-05, + "loss": 1.0323, + "step": 57100 + }, + { + "epoch": 7.90273556231003, + "grad_norm": 0.05220150947570801, + "learning_rate": 6.38960493135332e-05, + "loss": 1.032, + "step": 57200 + }, + { + "epoch": 7.91655153357281, + "grad_norm": 0.04795517399907112, + "learning_rate": 6.347576351919304e-05, + "loss": 1.03, + "step": 57300 + }, + { + "epoch": 7.93036750483559, + "grad_norm": 0.0748489499092102, + "learning_rate": 6.30554777248529e-05, + "loss": 1.0338, + "step": 57400 + }, + { + "epoch": 7.944183476098369, + "grad_norm": 0.08164035528898239, + "learning_rate": 6.263519193051274e-05, + "loss": 1.0318, + "step": 57500 + }, + { + "epoch": 7.95799944736115, + "grad_norm": 0.0764247477054596, + "learning_rate": 6.221490613617259e-05, + "loss": 1.0278, + "step": 57600 + }, + { + "epoch": 7.97181541862393, + "grad_norm": 0.05609816685318947, + "learning_rate": 6.179462034183244e-05, + "loss": 1.0307, + "step": 57700 + }, + { + "epoch": 7.985631389886709, + "grad_norm": 0.05001819133758545, + "learning_rate": 6.137433454749229e-05, + "loss": 1.0297, + "step": 57800 + }, + { + "epoch": 7.999447361149489, + "grad_norm": 0.10084258019924164, + "learning_rate": 6.0954048753152136e-05, + "loss": 1.0339, + "step": 57900 + }, + { + "epoch": 8.013263332412269, + "grad_norm": 0.07571733742952347, + "learning_rate": 6.0533762958811985e-05, + "loss": 1.0305, + "step": 58000 + }, + { + "epoch": 8.027079303675048, + "grad_norm": 0.059294216334819794, + "learning_rate": 6.011347716447183e-05, + "loss": 1.026, + "step": 58100 + }, + { + "epoch": 8.040895274937828, + "grad_norm": 0.04530787095427513, + "learning_rate": 5.969319137013168e-05, + "loss": 1.0282, + "step": 58200 + }, + { + "epoch": 8.054711246200608, + "grad_norm": 0.05052864924073219, + "learning_rate": 5.927290557579153e-05, + "loss": 1.0271, + "step": 58300 + }, + { + "epoch": 8.068527217463387, + "grad_norm": 
0.04923342168331146, + "learning_rate": 5.885261978145138e-05, + "loss": 1.029, + "step": 58400 + }, + { + "epoch": 8.082343188726167, + "grad_norm": 0.04905908182263374, + "learning_rate": 5.843233398711123e-05, + "loss": 1.0277, + "step": 58500 + }, + { + "epoch": 8.096159159988947, + "grad_norm": 0.046151451766490936, + "learning_rate": 5.801204819277108e-05, + "loss": 1.0289, + "step": 58600 + }, + { + "epoch": 8.109975131251726, + "grad_norm": 0.06011873856186867, + "learning_rate": 5.7591762398430925e-05, + "loss": 1.0245, + "step": 58700 + }, + { + "epoch": 8.123791102514506, + "grad_norm": 0.06879663467407227, + "learning_rate": 5.717147660409078e-05, + "loss": 1.0271, + "step": 58800 + }, + { + "epoch": 8.137607073777286, + "grad_norm": 0.04675479233264923, + "learning_rate": 5.675119080975063e-05, + "loss": 1.0263, + "step": 58900 + }, + { + "epoch": 8.151423045040067, + "grad_norm": 0.08497285097837448, + "learning_rate": 5.633090501541048e-05, + "loss": 1.0287, + "step": 59000 + }, + { + "epoch": 8.165239016302847, + "grad_norm": 0.07600156217813492, + "learning_rate": 5.5910619221070326e-05, + "loss": 1.0262, + "step": 59100 + }, + { + "epoch": 8.179054987565626, + "grad_norm": 0.04951677843928337, + "learning_rate": 5.549453628467357e-05, + "loss": 1.0283, + "step": 59200 + }, + { + "epoch": 8.192870958828406, + "grad_norm": 0.05662324279546738, + "learning_rate": 5.507425049033342e-05, + "loss": 1.0295, + "step": 59300 + }, + { + "epoch": 8.206686930091186, + "grad_norm": 0.05791959911584854, + "learning_rate": 5.465396469599327e-05, + "loss": 1.0285, + "step": 59400 + }, + { + "epoch": 8.220502901353965, + "grad_norm": 0.058768805116415024, + "learning_rate": 5.423367890165312e-05, + "loss": 1.0272, + "step": 59500 + }, + { + "epoch": 8.234318872616745, + "grad_norm": 0.05399869754910469, + "learning_rate": 5.381339310731297e-05, + "loss": 1.0301, + "step": 59600 + }, + { + "epoch": 8.248134843879525, + "grad_norm": 0.06434085965156555, + "learning_rate": 5.3393107312972814e-05, + "loss": 1.0277, + "step": 59700 + }, + { + "epoch": 8.261950815142304, + "grad_norm": 0.054656483232975006, + "learning_rate": 5.297282151863267e-05, + "loss": 1.0295, + "step": 59800 + }, + { + "epoch": 8.275766786405084, + "grad_norm": 0.04396641626954079, + "learning_rate": 5.255253572429251e-05, + "loss": 1.0276, + "step": 59900 + }, + { + "epoch": 8.289582757667864, + "grad_norm": 0.058395449072122574, + "learning_rate": 5.2132249929952366e-05, + "loss": 1.0267, + "step": 60000 + }, + { + "epoch": 8.289582757667864, + "eval_accuracy": 0.5312832658873073, + "eval_loss": 1.0315501689910889, + "eval_runtime": 729.415, + "eval_samples_per_second": 282.228, + "eval_steps_per_second": 8.821, + "step": 60000 + }, + { + "epoch": 8.303398728930643, + "grad_norm": 0.06770013272762299, + "learning_rate": 5.171196413561221e-05, + "loss": 1.029, + "step": 60100 + }, + { + "epoch": 8.317214700193423, + "grad_norm": 0.06161688268184662, + "learning_rate": 5.1291678341272063e-05, + "loss": 1.0242, + "step": 60200 + }, + { + "epoch": 8.331030671456203, + "grad_norm": 0.04140911623835564, + "learning_rate": 5.087139254693191e-05, + "loss": 1.029, + "step": 60300 + }, + { + "epoch": 8.344846642718982, + "grad_norm": 0.07091998308897018, + "learning_rate": 5.045110675259176e-05, + "loss": 1.0268, + "step": 60400 + }, + { + "epoch": 8.358662613981762, + "grad_norm": 0.05135732889175415, + "learning_rate": 5.003082095825161e-05, + "loss": 1.0264, + "step": 60500 + }, + { + "epoch": 8.372478585244544, + 
"grad_norm": 0.05828474089503288, + "learning_rate": 4.961053516391146e-05, + "loss": 1.0271, + "step": 60600 + }, + { + "epoch": 8.386294556507323, + "grad_norm": 0.05920015275478363, + "learning_rate": 4.9190249369571306e-05, + "loss": 1.0263, + "step": 60700 + }, + { + "epoch": 8.400110527770103, + "grad_norm": 0.048502273857593536, + "learning_rate": 4.8769963575231155e-05, + "loss": 1.029, + "step": 60800 + }, + { + "epoch": 8.413926499032883, + "grad_norm": 0.049063604325056076, + "learning_rate": 4.8349677780891e-05, + "loss": 1.0294, + "step": 60900 + }, + { + "epoch": 8.427742470295662, + "grad_norm": 0.05672093480825424, + "learning_rate": 4.792939198655085e-05, + "loss": 1.0297, + "step": 61000 + }, + { + "epoch": 8.441558441558442, + "grad_norm": 0.06934633105993271, + "learning_rate": 4.75091061922107e-05, + "loss": 1.0261, + "step": 61100 + }, + { + "epoch": 8.455374412821222, + "grad_norm": 0.04098910838365555, + "learning_rate": 4.709302325581395e-05, + "loss": 1.0292, + "step": 61200 + }, + { + "epoch": 8.469190384084001, + "grad_norm": 0.06421385705471039, + "learning_rate": 4.6672737461473794e-05, + "loss": 1.0315, + "step": 61300 + }, + { + "epoch": 8.483006355346781, + "grad_norm": 0.05238828435540199, + "learning_rate": 4.625245166713365e-05, + "loss": 1.0309, + "step": 61400 + }, + { + "epoch": 8.49682232660956, + "grad_norm": 0.049910806119441986, + "learning_rate": 4.583216587279349e-05, + "loss": 1.0257, + "step": 61500 + }, + { + "epoch": 8.51063829787234, + "grad_norm": 0.06672196090221405, + "learning_rate": 4.541188007845335e-05, + "loss": 1.0328, + "step": 61600 + }, + { + "epoch": 8.52445426913512, + "grad_norm": 0.05466538295149803, + "learning_rate": 4.4991594284113195e-05, + "loss": 1.0284, + "step": 61700 + }, + { + "epoch": 8.5382702403979, + "grad_norm": 0.05218784883618355, + "learning_rate": 4.4571308489773044e-05, + "loss": 1.0285, + "step": 61800 + }, + { + "epoch": 8.55208621166068, + "grad_norm": 0.04263923689723015, + "learning_rate": 4.415102269543289e-05, + "loss": 1.0307, + "step": 61900 + }, + { + "epoch": 8.565902182923459, + "grad_norm": 0.054478637874126434, + "learning_rate": 4.373073690109274e-05, + "loss": 1.0291, + "step": 62000 + }, + { + "epoch": 8.579718154186239, + "grad_norm": 0.05667020007967949, + "learning_rate": 4.331045110675259e-05, + "loss": 1.0296, + "step": 62100 + }, + { + "epoch": 8.593534125449018, + "grad_norm": 0.0490160770714283, + "learning_rate": 4.289016531241244e-05, + "loss": 1.029, + "step": 62200 + }, + { + "epoch": 8.607350096711798, + "grad_norm": 0.049655403941869736, + "learning_rate": 4.246987951807229e-05, + "loss": 1.0298, + "step": 62300 + }, + { + "epoch": 8.62116606797458, + "grad_norm": 0.047429408878088, + "learning_rate": 4.2049593723732135e-05, + "loss": 1.0277, + "step": 62400 + }, + { + "epoch": 8.634982039237359, + "grad_norm": 0.05222218483686447, + "learning_rate": 4.1629307929391984e-05, + "loss": 1.0292, + "step": 62500 + }, + { + "epoch": 8.648798010500139, + "grad_norm": 0.05841238424181938, + "learning_rate": 4.120902213505183e-05, + "loss": 1.029, + "step": 62600 + }, + { + "epoch": 8.662613981762918, + "grad_norm": 0.0452195480465889, + "learning_rate": 4.078873634071168e-05, + "loss": 1.0265, + "step": 62700 + }, + { + "epoch": 8.676429953025698, + "grad_norm": 0.049306340515613556, + "learning_rate": 4.036845054637153e-05, + "loss": 1.0308, + "step": 62800 + }, + { + "epoch": 8.690245924288478, + "grad_norm": 0.050401389598846436, + "learning_rate": 3.994816475203138e-05, + 
"loss": 1.0294, + "step": 62900 + }, + { + "epoch": 8.704061895551257, + "grad_norm": 0.04503024369478226, + "learning_rate": 3.952787895769123e-05, + "loss": 1.0291, + "step": 63000 + }, + { + "epoch": 8.717877866814037, + "grad_norm": 0.0738733783364296, + "learning_rate": 3.9107593163351075e-05, + "loss": 1.0279, + "step": 63100 + }, + { + "epoch": 8.731693838076817, + "grad_norm": 0.04586975276470184, + "learning_rate": 3.869151022695433e-05, + "loss": 1.026, + "step": 63200 + }, + { + "epoch": 8.745509809339596, + "grad_norm": 0.04988343268632889, + "learning_rate": 3.8271224432614176e-05, + "loss": 1.0257, + "step": 63300 + }, + { + "epoch": 8.759325780602376, + "grad_norm": 0.07822008430957794, + "learning_rate": 3.7850938638274025e-05, + "loss": 1.0254, + "step": 63400 + }, + { + "epoch": 8.773141751865156, + "grad_norm": 0.058496229350566864, + "learning_rate": 3.743065284393387e-05, + "loss": 1.0263, + "step": 63500 + }, + { + "epoch": 8.786957723127935, + "grad_norm": 0.04458677023649216, + "learning_rate": 3.701036704959372e-05, + "loss": 1.0292, + "step": 63600 + }, + { + "epoch": 8.800773694390715, + "grad_norm": 0.06616061180830002, + "learning_rate": 3.659008125525357e-05, + "loss": 1.0309, + "step": 63700 + }, + { + "epoch": 8.814589665653495, + "grad_norm": 0.06473194807767868, + "learning_rate": 3.616979546091342e-05, + "loss": 1.0265, + "step": 63800 + }, + { + "epoch": 8.828405636916274, + "grad_norm": 0.047700874507427216, + "learning_rate": 3.574950966657327e-05, + "loss": 1.0303, + "step": 63900 + }, + { + "epoch": 8.842221608179056, + "grad_norm": 0.055733323097229004, + "learning_rate": 3.5329223872233116e-05, + "loss": 1.0279, + "step": 64000 + }, + { + "epoch": 8.856037579441836, + "grad_norm": 0.04398791491985321, + "learning_rate": 3.4908938077892965e-05, + "loss": 1.0284, + "step": 64100 + }, + { + "epoch": 8.869853550704615, + "grad_norm": 0.08901511132717133, + "learning_rate": 3.448865228355281e-05, + "loss": 1.0283, + "step": 64200 + }, + { + "epoch": 8.883669521967395, + "grad_norm": 0.05853118374943733, + "learning_rate": 3.406836648921266e-05, + "loss": 1.0291, + "step": 64300 + }, + { + "epoch": 8.897485493230175, + "grad_norm": 0.043922308832407, + "learning_rate": 3.364808069487251e-05, + "loss": 1.0294, + "step": 64400 + }, + { + "epoch": 8.911301464492954, + "grad_norm": 0.04332153871655464, + "learning_rate": 3.322779490053236e-05, + "loss": 1.0277, + "step": 64500 + }, + { + "epoch": 8.925117435755734, + "grad_norm": 0.09197825193405151, + "learning_rate": 3.280750910619221e-05, + "loss": 1.0295, + "step": 64600 + }, + { + "epoch": 8.938933407018514, + "grad_norm": 0.05589272826910019, + "learning_rate": 3.2387223311852056e-05, + "loss": 1.0274, + "step": 64700 + }, + { + "epoch": 8.952749378281293, + "grad_norm": 0.06028933823108673, + "learning_rate": 3.1966937517511904e-05, + "loss": 1.0285, + "step": 64800 + }, + { + "epoch": 8.966565349544073, + "grad_norm": 0.05357721447944641, + "learning_rate": 3.154665172317175e-05, + "loss": 1.027, + "step": 64900 + }, + { + "epoch": 8.980381320806853, + "grad_norm": 0.07362578809261322, + "learning_rate": 3.11263659288316e-05, + "loss": 1.0273, + "step": 65000 + }, + { + "epoch": 8.980381320806853, + "eval_accuracy": 0.5319501927585898, + "eval_loss": 1.0305662155151367, + "eval_runtime": 722.9505, + "eval_samples_per_second": 284.751, + "eval_steps_per_second": 8.9, + "step": 65000 + }, + { + "epoch": 8.994197292069632, + "grad_norm": 0.04831722378730774, + "learning_rate": 3.070608013449145e-05, + 
"loss": 1.0294, + "step": 65100 + }, + { + "epoch": 9.008013263332412, + "grad_norm": 0.06001870334148407, + "learning_rate": 3.0289997198094702e-05, + "loss": 1.0306, + "step": 65200 + }, + { + "epoch": 9.021829234595192, + "grad_norm": 0.04466562718153, + "learning_rate": 2.986971140375455e-05, + "loss": 1.0267, + "step": 65300 + }, + { + "epoch": 9.035645205857971, + "grad_norm": 0.059990085661411285, + "learning_rate": 2.94494256094144e-05, + "loss": 1.0248, + "step": 65400 + }, + { + "epoch": 9.049461177120751, + "grad_norm": 0.05244195833802223, + "learning_rate": 2.9029139815074248e-05, + "loss": 1.0282, + "step": 65500 + }, + { + "epoch": 9.06327714838353, + "grad_norm": 0.060148317366838455, + "learning_rate": 2.8608854020734097e-05, + "loss": 1.0266, + "step": 65600 + }, + { + "epoch": 9.07709311964631, + "grad_norm": 0.051530059427022934, + "learning_rate": 2.8188568226393945e-05, + "loss": 1.0257, + "step": 65700 + }, + { + "epoch": 9.090909090909092, + "grad_norm": 0.06650034338235855, + "learning_rate": 2.7768282432053794e-05, + "loss": 1.0276, + "step": 65800 + }, + { + "epoch": 9.104725062171871, + "grad_norm": 0.04850700497627258, + "learning_rate": 2.7347996637713642e-05, + "loss": 1.0249, + "step": 65900 + }, + { + "epoch": 9.118541033434651, + "grad_norm": 0.057128727436065674, + "learning_rate": 2.692771084337349e-05, + "loss": 1.0264, + "step": 66000 + }, + { + "epoch": 9.13235700469743, + "grad_norm": 0.056875213980674744, + "learning_rate": 2.650742504903334e-05, + "loss": 1.0285, + "step": 66100 + }, + { + "epoch": 9.14617297596021, + "grad_norm": 0.05632421374320984, + "learning_rate": 2.6087139254693188e-05, + "loss": 1.0286, + "step": 66200 + }, + { + "epoch": 9.15998894722299, + "grad_norm": 0.04903789609670639, + "learning_rate": 2.5666853460353037e-05, + "loss": 1.0233, + "step": 66300 + }, + { + "epoch": 9.17380491848577, + "grad_norm": 0.04932420328259468, + "learning_rate": 2.5246567666012885e-05, + "loss": 1.0273, + "step": 66400 + }, + { + "epoch": 9.18762088974855, + "grad_norm": 0.0668862909078598, + "learning_rate": 2.4826281871672734e-05, + "loss": 1.0264, + "step": 66500 + }, + { + "epoch": 9.201436861011329, + "grad_norm": 0.05283021926879883, + "learning_rate": 2.4405996077332586e-05, + "loss": 1.0278, + "step": 66600 + }, + { + "epoch": 9.215252832274109, + "grad_norm": 0.04914732649922371, + "learning_rate": 2.3985710282992434e-05, + "loss": 1.0276, + "step": 66700 + }, + { + "epoch": 9.229068803536888, + "grad_norm": 0.06511181592941284, + "learning_rate": 2.3565424488652283e-05, + "loss": 1.0268, + "step": 66800 + }, + { + "epoch": 9.242884774799668, + "grad_norm": 0.06101306900382042, + "learning_rate": 2.314513869431213e-05, + "loss": 1.0267, + "step": 66900 + }, + { + "epoch": 9.256700746062448, + "grad_norm": 0.05272289365530014, + "learning_rate": 2.272485289997198e-05, + "loss": 1.0242, + "step": 67000 + }, + { + "epoch": 9.270516717325227, + "grad_norm": 0.04828105494379997, + "learning_rate": 2.230456710563183e-05, + "loss": 1.0258, + "step": 67100 + }, + { + "epoch": 9.284332688588007, + "grad_norm": 0.054294098168611526, + "learning_rate": 2.1888484169235077e-05, + "loss": 1.0262, + "step": 67200 + }, + { + "epoch": 9.298148659850787, + "grad_norm": 0.04951765388250351, + "learning_rate": 2.1468198374894926e-05, + "loss": 1.0254, + "step": 67300 + }, + { + "epoch": 9.311964631113566, + "grad_norm": 0.047647446393966675, + "learning_rate": 2.1047912580554774e-05, + "loss": 1.0262, + "step": 67400 + }, + { + "epoch": 9.325780602376348, 
+ "grad_norm": 0.062047079205513, + "learning_rate": 2.0627626786214623e-05, + "loss": 1.0287, + "step": 67500 + }, + { + "epoch": 9.339596573639128, + "grad_norm": 0.05751033127307892, + "learning_rate": 2.020734099187447e-05, + "loss": 1.027, + "step": 67600 + }, + { + "epoch": 9.353412544901907, + "grad_norm": 0.058642346411943436, + "learning_rate": 1.978705519753432e-05, + "loss": 1.0276, + "step": 67700 + }, + { + "epoch": 9.367228516164687, + "grad_norm": 0.050882838666439056, + "learning_rate": 1.936676940319417e-05, + "loss": 1.0223, + "step": 67800 + }, + { + "epoch": 9.381044487427467, + "grad_norm": 0.053814638406038284, + "learning_rate": 1.8946483608854017e-05, + "loss": 1.0271, + "step": 67900 + }, + { + "epoch": 9.394860458690246, + "grad_norm": 0.05407179519534111, + "learning_rate": 1.852619781451387e-05, + "loss": 1.0242, + "step": 68000 + }, + { + "epoch": 9.408676429953026, + "grad_norm": 0.05431421846151352, + "learning_rate": 1.8105912020173718e-05, + "loss": 1.0246, + "step": 68100 + }, + { + "epoch": 9.422492401215806, + "grad_norm": 0.05826635658740997, + "learning_rate": 1.7685626225833566e-05, + "loss": 1.024, + "step": 68200 + }, + { + "epoch": 9.436308372478585, + "grad_norm": 0.043603766709566116, + "learning_rate": 1.7265340431493415e-05, + "loss": 1.025, + "step": 68300 + }, + { + "epoch": 9.450124343741365, + "grad_norm": 0.0555894561111927, + "learning_rate": 1.6845054637153263e-05, + "loss": 1.0267, + "step": 68400 + }, + { + "epoch": 9.463940315004145, + "grad_norm": 0.046029891818761826, + "learning_rate": 1.6424768842813112e-05, + "loss": 1.0247, + "step": 68500 + }, + { + "epoch": 9.477756286266924, + "grad_norm": 0.04906938225030899, + "learning_rate": 1.600448304847296e-05, + "loss": 1.0233, + "step": 68600 + }, + { + "epoch": 9.491572257529704, + "grad_norm": 0.07827210426330566, + "learning_rate": 1.558419725413281e-05, + "loss": 1.0262, + "step": 68700 + }, + { + "epoch": 9.505388228792484, + "grad_norm": 0.04391390085220337, + "learning_rate": 1.5163911459792658e-05, + "loss": 1.0255, + "step": 68800 + }, + { + "epoch": 9.519204200055263, + "grad_norm": 0.05310402810573578, + "learning_rate": 1.4743625665452506e-05, + "loss": 1.0268, + "step": 68900 + }, + { + "epoch": 9.533020171318043, + "grad_norm": 0.060242168605327606, + "learning_rate": 1.4323339871112355e-05, + "loss": 1.0257, + "step": 69000 + }, + { + "epoch": 9.546836142580823, + "grad_norm": 0.04949665814638138, + "learning_rate": 1.3903054076772205e-05, + "loss": 1.0294, + "step": 69100 + }, + { + "epoch": 9.560652113843604, + "grad_norm": 0.05413687229156494, + "learning_rate": 1.3482768282432054e-05, + "loss": 1.0272, + "step": 69200 + }, + { + "epoch": 9.574468085106384, + "grad_norm": 0.05380227789282799, + "learning_rate": 1.3062482488091902e-05, + "loss": 1.025, + "step": 69300 + }, + { + "epoch": 9.588284056369163, + "grad_norm": 0.04961249604821205, + "learning_rate": 1.2646399551695151e-05, + "loss": 1.0289, + "step": 69400 + }, + { + "epoch": 9.602100027631943, + "grad_norm": 0.045629873871803284, + "learning_rate": 1.2226113757355e-05, + "loss": 1.0269, + "step": 69500 + }, + { + "epoch": 9.615915998894723, + "grad_norm": 0.04661751165986061, + "learning_rate": 1.1805827963014848e-05, + "loss": 1.0277, + "step": 69600 + }, + { + "epoch": 9.629731970157502, + "grad_norm": 0.06289409101009369, + "learning_rate": 1.1385542168674697e-05, + "loss": 1.0246, + "step": 69700 + }, + { + "epoch": 9.643547941420282, + "grad_norm": 0.061526406556367874, + "learning_rate": 
1.0965256374334547e-05, + "loss": 1.0252, + "step": 69800 + }, + { + "epoch": 9.657363912683062, + "grad_norm": 0.05611636862158775, + "learning_rate": 1.0544970579994395e-05, + "loss": 1.0281, + "step": 69900 + }, + { + "epoch": 9.671179883945841, + "grad_norm": 0.05305150896310806, + "learning_rate": 1.0124684785654244e-05, + "loss": 1.027, + "step": 70000 + }, + { + "epoch": 9.671179883945841, + "eval_accuracy": 0.5323623139821072, + "eval_loss": 1.029943823814392, + "eval_runtime": 726.2479, + "eval_samples_per_second": 283.458, + "eval_steps_per_second": 8.859, + "step": 70000 + } + ], + "logging_steps": 100, + "max_steps": 72380, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.364706426611565e+18, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-70000/training_args.bin b/checkpoint-70000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..45f2f12b913e85908e1565ce4b13c8763ea7a1ca --- /dev/null +++ b/checkpoint-70000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19e8fb7657adc13bdcaf635b1c6fb616dd082a6870cdd6aecd3b669d8cac0873 +size 5304 diff --git a/checkpoint-70000/vocab.json b/checkpoint-70000/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..d0809a2e3e28811023f05ed415122e24681bc9d1 --- /dev/null +++ b/checkpoint-70000/vocab.json @@ -0,0 +1 @@ +{"<|endoftext|>":0,"A":1,"C":2,"G":3,"T":4} \ No newline at end of file diff --git a/checkpoint-72380/config.json b/checkpoint-72380/config.json new file mode 100644 index 0000000000000000000000000000000000000000..192ba05a8714569e728cced45eaebf4106596353 --- /dev/null +++ b/checkpoint-72380/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 50256, + "embd_pdrop": 0.1, + "eos_token_id": 50256, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 768, + "n_head": 12, + "n_inner": null, + "n_layer": 12, + "n_positions": 1024, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.52.0.dev0", + "use_cache": true, + "vocab_size": 5 +} diff --git a/checkpoint-72380/generation_config.json b/checkpoint-72380/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c829fa47bd90bfe00fdb37ed6d41324f6fb81f63 --- /dev/null +++ b/checkpoint-72380/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 50256, + "eos_token_id": 50256, + "transformers_version": "4.52.0.dev0" +} diff --git a/checkpoint-72380/merges.txt b/checkpoint-72380/merges.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e7f1fd94996c8e2b65adea828af1b398eace61f --- /dev/null +++ b/checkpoint-72380/merges.txt @@ -0,0 +1 @@ +#version: 0.2 diff --git a/checkpoint-72380/model.safetensors b/checkpoint-72380/model.safetensors new file mode 100644 
index 0000000000000000000000000000000000000000..c10ea73f1c094db8c30e99d38dc77b1cc091209f --- /dev/null +++ b/checkpoint-72380/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f92899f4db195c8aeb912724ff986896242eb0429b37aee778d302e21091c9de +size 343400064 diff --git a/checkpoint-72380/optimizer.pt b/checkpoint-72380/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..68737bca32fae63c25f2d8df0ba6fbcb9abdf1ef --- /dev/null +++ b/checkpoint-72380/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18b7cdf65c9d9b0e5f7cc5a0eefff555fa183b157836316032bf3cb608879251 +size 686894010 diff --git a/checkpoint-72380/rng_state_0.pth b/checkpoint-72380/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..745d1f064d0bc97e31e79c2b22fcd0c7d433b219 --- /dev/null +++ b/checkpoint-72380/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c25515ce626b4abe5a87054259fda00aa4607c6b55105ce503dbf9685168014 +size 14960 diff --git a/checkpoint-72380/rng_state_1.pth b/checkpoint-72380/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef02af8ac80765c3a5cde179cbb6233995383a4a --- /dev/null +++ b/checkpoint-72380/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a36cd014a4704101a9bf0bc9cd1ea746333e0872d4713feebc6f28b65046aaf6 +size 14960 diff --git a/checkpoint-72380/rng_state_2.pth b/checkpoint-72380/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..a55c9dfab9434a0050b4d45eff0269bbc8e9ef74 --- /dev/null +++ b/checkpoint-72380/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d066faeaef510c6df230bdde787d1a47cc41fa100a5cc9b52c590ddb31b737b0 +size 14960 diff --git a/checkpoint-72380/rng_state_3.pth b/checkpoint-72380/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..7320a08c846ae6a4562dc8fd86482b01d987c44a --- /dev/null +++ b/checkpoint-72380/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12b7e7097bc6610ab93b5b8495ffb37a797cf3694d6cacb86f7a320518ca5b80 +size 14960 diff --git a/checkpoint-72380/scaler.pt b/checkpoint-72380/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f112d4a399b01f95e32c6c17d75f4234b4e1051c --- /dev/null +++ b/checkpoint-72380/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aa1e803a71b8195096e7e8434c59a48b7e733ca69258660b62b3f2d5128a2e0 +size 988 diff --git a/checkpoint-72380/scheduler.pt b/checkpoint-72380/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5efa4ec9ee5443a8f255969078cbf8d2c0aa4916 --- /dev/null +++ b/checkpoint-72380/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:134bc570b6bcd834acbb92457d0f89b92c86b5647b77c08c7ea37da87f1eeeb1 +size 1064 diff --git a/checkpoint-72380/special_tokens_map.json b/checkpoint-72380/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..773bd68cf0900427f8d69dd974724e3abb9a08a9 --- /dev/null +++ b/checkpoint-72380/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + 
"single_word": false + }, + "unk_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-72380/tokenizer.json b/checkpoint-72380/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..ac40039af791f0fd130b3d36c3677a156b2de089 --- /dev/null +++ b/checkpoint-72380/tokenizer.json @@ -0,0 +1,53 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": false, + "use_regex": true + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": "", + "end_of_word_suffix": "", + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "<|endoftext|>": 0, + "A": 1, + "C": 2, + "G": 3, + "T": 4 + }, + "merges": [] + } +} \ No newline at end of file diff --git a/checkpoint-72380/tokenizer_config.json b/checkpoint-72380/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7c4e19588fa8b4faceab450a1d7e8dae1ce87f7c --- /dev/null +++ b/checkpoint-72380/tokenizer_config.json @@ -0,0 +1,21 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|endoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "<|endoftext|>" +} diff --git a/checkpoint-72380/trainer_state.json b/checkpoint-72380/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bcd33d0b41ea2d7b6a76892e58f2812341c08420 --- /dev/null +++ b/checkpoint-72380/trainer_state.json @@ -0,0 +1,5228 @@ +{ + "best_global_step": 70000, + "best_metric": 1.029943823814392, + "best_model_checkpoint": "./dna_model/checkpoint-70000", + "epoch": 10.0, + "eval_steps": 5000, + "global_step": 72380, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00013815971262779773, + "grad_norm": 42.580928802490234, + "learning_rate": 0.0, + "loss": 1.6625, + "step": 1 + }, + { + "epoch": 0.013815971262779773, + "grad_norm": 2.4457767009735107, + "learning_rate": 2.97e-05, + "loss": 1.36, + "step": 100 + }, + { + "epoch": 0.027631942525559547, + "grad_norm": 0.5432274341583252, + "learning_rate": 5.97e-05, + "loss": 1.3309, + "step": 200 + }, + { + "epoch": 0.04144791378833932, + "grad_norm": 0.825528621673584, + "learning_rate": 8.969999999999998e-05, + "loss": 1.3234, + "step": 300 + }, + { + "epoch": 0.055263885051119094, + "grad_norm": 0.4912604093551636, + "learning_rate": 0.0001197, + "loss": 1.3249, + "step": 400 + }, + { + "epoch": 0.06907985631389886, + "grad_norm": 0.9077563881874084, + "learning_rate": 0.00014969999999999998, + "loss": 
1.3153, + "step": 500 + }, + { + "epoch": 0.08289582757667864, + "grad_norm": 0.8954246640205383, + "learning_rate": 0.00017969999999999998, + "loss": 1.3123, + "step": 600 + }, + { + "epoch": 0.09671179883945841, + "grad_norm": 0.5876831412315369, + "learning_rate": 0.00020969999999999997, + "loss": 1.3098, + "step": 700 + }, + { + "epoch": 0.11052777010223819, + "grad_norm": 0.426789253950119, + "learning_rate": 0.0002397, + "loss": 1.3072, + "step": 800 + }, + { + "epoch": 0.12434374136501795, + "grad_norm": 0.3324718177318573, + "learning_rate": 0.0002697, + "loss": 1.3037, + "step": 900 + }, + { + "epoch": 0.13815971262779772, + "grad_norm": 0.23672613501548767, + "learning_rate": 0.00029969999999999997, + "loss": 1.2991, + "step": 1000 + }, + { + "epoch": 0.1519756838905775, + "grad_norm": 0.4699796438217163, + "learning_rate": 0.00029958391706360325, + "loss": 1.2923, + "step": 1100 + }, + { + "epoch": 0.16579165515335728, + "grad_norm": 0.684186577796936, + "learning_rate": 0.00029916363126926307, + "loss": 1.2825, + "step": 1200 + }, + { + "epoch": 0.17960762641613706, + "grad_norm": 0.3944641649723053, + "learning_rate": 0.00029874334547492294, + "loss": 1.2678, + "step": 1300 + }, + { + "epoch": 0.19342359767891681, + "grad_norm": 1.1556001901626587, + "learning_rate": 0.00029832305968058276, + "loss": 1.2541, + "step": 1400 + }, + { + "epoch": 0.2072395689416966, + "grad_norm": 0.39745599031448364, + "learning_rate": 0.0002979027738862426, + "loss": 1.2439, + "step": 1500 + }, + { + "epoch": 0.22105554020447638, + "grad_norm": 0.5201444029808044, + "learning_rate": 0.00029748248809190246, + "loss": 1.2329, + "step": 1600 + }, + { + "epoch": 0.23487151146725616, + "grad_norm": 0.2168777734041214, + "learning_rate": 0.00029706220229756234, + "loss": 1.2268, + "step": 1700 + }, + { + "epoch": 0.2486874827300359, + "grad_norm": 0.30599427223205566, + "learning_rate": 0.00029664191650322216, + "loss": 1.2199, + "step": 1800 + }, + { + "epoch": 0.2625034539928157, + "grad_norm": 0.32062044739723206, + "learning_rate": 0.00029622163070888203, + "loss": 1.2131, + "step": 1900 + }, + { + "epoch": 0.27631942525559544, + "grad_norm": 0.13411013782024384, + "learning_rate": 0.00029580134491454186, + "loss": 1.2074, + "step": 2000 + }, + { + "epoch": 0.2901353965183752, + "grad_norm": 0.3672633767127991, + "learning_rate": 0.00029538105912020173, + "loss": 1.2022, + "step": 2100 + }, + { + "epoch": 0.303951367781155, + "grad_norm": 0.41515815258026123, + "learning_rate": 0.00029496077332586155, + "loss": 1.1949, + "step": 2200 + }, + { + "epoch": 0.3177673390439348, + "grad_norm": 0.18381068110466003, + "learning_rate": 0.0002945404875315214, + "loss": 1.1887, + "step": 2300 + }, + { + "epoch": 0.33158331030671456, + "grad_norm": 0.3080751895904541, + "learning_rate": 0.00029412020173718125, + "loss": 1.1844, + "step": 2400 + }, + { + "epoch": 0.34539928156949434, + "grad_norm": 0.38037416338920593, + "learning_rate": 0.0002936999159428411, + "loss": 1.1804, + "step": 2500 + }, + { + "epoch": 0.3592152528322741, + "grad_norm": 0.23272989690303802, + "learning_rate": 0.00029327963014850095, + "loss": 1.1753, + "step": 2600 + }, + { + "epoch": 0.3730312240950539, + "grad_norm": 0.1149936243891716, + "learning_rate": 0.0002928593443541608, + "loss": 1.1739, + "step": 2700 + }, + { + "epoch": 0.38684719535783363, + "grad_norm": 0.28469276428222656, + "learning_rate": 0.00029243905855982064, + "loss": 1.1671, + "step": 2800 + }, + { + "epoch": 0.4006631666206134, + "grad_norm": 
0.25204166769981384, + "learning_rate": 0.0002920187727654805, + "loss": 1.1633, + "step": 2900 + }, + { + "epoch": 0.4144791378833932, + "grad_norm": 0.3945861756801605, + "learning_rate": 0.00029159848697114034, + "loss": 1.1608, + "step": 3000 + }, + { + "epoch": 0.42829510914617297, + "grad_norm": 0.2578865587711334, + "learning_rate": 0.00029117820117680016, + "loss": 1.1622, + "step": 3100 + }, + { + "epoch": 0.44211108040895275, + "grad_norm": 0.16060177981853485, + "learning_rate": 0.00029075791538246004, + "loss": 1.1577, + "step": 3200 + }, + { + "epoch": 0.45592705167173253, + "grad_norm": 0.1980718970298767, + "learning_rate": 0.0002903376295881199, + "loss": 1.155, + "step": 3300 + }, + { + "epoch": 0.4697430229345123, + "grad_norm": 0.12515653669834137, + "learning_rate": 0.00028991734379377974, + "loss": 1.1519, + "step": 3400 + }, + { + "epoch": 0.4835589941972921, + "grad_norm": 0.26255738735198975, + "learning_rate": 0.0002894970579994396, + "loss": 1.1523, + "step": 3500 + }, + { + "epoch": 0.4973749654600718, + "grad_norm": 0.281464546918869, + "learning_rate": 0.00028907677220509943, + "loss": 1.1511, + "step": 3600 + }, + { + "epoch": 0.5111909367228517, + "grad_norm": 0.11816036701202393, + "learning_rate": 0.0002886564864107593, + "loss": 1.1469, + "step": 3700 + }, + { + "epoch": 0.5250069079856314, + "grad_norm": 0.25923675298690796, + "learning_rate": 0.00028823620061641913, + "loss": 1.1456, + "step": 3800 + }, + { + "epoch": 0.5388228792484112, + "grad_norm": 0.2766472399234772, + "learning_rate": 0.00028781591482207895, + "loss": 1.1442, + "step": 3900 + }, + { + "epoch": 0.5526388505111909, + "grad_norm": 0.1701624095439911, + "learning_rate": 0.00028739562902773883, + "loss": 1.1445, + "step": 4000 + }, + { + "epoch": 0.5664548217739707, + "grad_norm": 0.3141656219959259, + "learning_rate": 0.0002869753432333987, + "loss": 1.1392, + "step": 4100 + }, + { + "epoch": 0.5802707930367504, + "grad_norm": 0.11816743016242981, + "learning_rate": 0.0002865550574390585, + "loss": 1.1406, + "step": 4200 + }, + { + "epoch": 0.5940867642995302, + "grad_norm": 0.12762723863124847, + "learning_rate": 0.0002861347716447184, + "loss": 1.1361, + "step": 4300 + }, + { + "epoch": 0.60790273556231, + "grad_norm": 0.09322622418403625, + "learning_rate": 0.0002857144858503782, + "loss": 1.134, + "step": 4400 + }, + { + "epoch": 0.6217187068250898, + "grad_norm": 0.1586735099554062, + "learning_rate": 0.0002852942000560381, + "loss": 1.1336, + "step": 4500 + }, + { + "epoch": 0.6355346780878696, + "grad_norm": 0.13594642281532288, + "learning_rate": 0.0002848739142616979, + "loss": 1.1328, + "step": 4600 + }, + { + "epoch": 0.6493506493506493, + "grad_norm": 0.21865279972553253, + "learning_rate": 0.00028445362846735774, + "loss": 1.1311, + "step": 4700 + }, + { + "epoch": 0.6631666206134291, + "grad_norm": 0.22787001729011536, + "learning_rate": 0.0002840333426730176, + "loss": 1.1271, + "step": 4800 + }, + { + "epoch": 0.6769825918762089, + "grad_norm": 0.2334531843662262, + "learning_rate": 0.0002836130568786775, + "loss": 1.1291, + "step": 4900 + }, + { + "epoch": 0.6907985631389887, + "grad_norm": 0.11103236675262451, + "learning_rate": 0.0002831927710843373, + "loss": 1.1252, + "step": 5000 + }, + { + "epoch": 0.6907985631389887, + "eval_accuracy": 0.4745045939970608, + "eval_loss": 1.1205766201019287, + "eval_runtime": 1027.9902, + "eval_samples_per_second": 200.256, + "eval_steps_per_second": 6.259, + "step": 5000 + }, + { + "epoch": 0.7046145344017685, + "grad_norm": 
0.21742330491542816, + "learning_rate": 0.0002827724852899972, + "loss": 1.1235, + "step": 5100 + }, + { + "epoch": 0.7184305056645482, + "grad_norm": 0.23728515207767487, + "learning_rate": 0.000282352199495657, + "loss": 1.1233, + "step": 5200 + }, + { + "epoch": 0.732246476927328, + "grad_norm": 0.21022765338420868, + "learning_rate": 0.0002819319137013169, + "loss": 1.1236, + "step": 5300 + }, + { + "epoch": 0.7460624481901078, + "grad_norm": 0.0924484059214592, + "learning_rate": 0.0002815116279069767, + "loss": 1.1215, + "step": 5400 + }, + { + "epoch": 0.7598784194528876, + "grad_norm": 0.1716778427362442, + "learning_rate": 0.00028109134211263653, + "loss": 1.1238, + "step": 5500 + }, + { + "epoch": 0.7736943907156673, + "grad_norm": 0.13049638271331787, + "learning_rate": 0.0002806710563182964, + "loss": 1.1185, + "step": 5600 + }, + { + "epoch": 0.787510361978447, + "grad_norm": 0.16255174577236176, + "learning_rate": 0.0002802507705239563, + "loss": 1.1169, + "step": 5700 + }, + { + "epoch": 0.8013263332412268, + "grad_norm": 0.10065080225467682, + "learning_rate": 0.0002798304847296161, + "loss": 1.1184, + "step": 5800 + }, + { + "epoch": 0.8151423045040066, + "grad_norm": 0.1182553768157959, + "learning_rate": 0.000279410198935276, + "loss": 1.1141, + "step": 5900 + }, + { + "epoch": 0.8289582757667864, + "grad_norm": 0.14556263387203217, + "learning_rate": 0.0002789899131409358, + "loss": 1.1154, + "step": 6000 + }, + { + "epoch": 0.8427742470295662, + "grad_norm": 0.1383764147758484, + "learning_rate": 0.00027857383020453907, + "loss": 1.1118, + "step": 6100 + }, + { + "epoch": 0.8565902182923459, + "grad_norm": 0.2821154296398163, + "learning_rate": 0.00027815354441019895, + "loss": 1.1104, + "step": 6200 + }, + { + "epoch": 0.8704061895551257, + "grad_norm": 0.22286450862884521, + "learning_rate": 0.00027773325861585877, + "loss": 1.1109, + "step": 6300 + }, + { + "epoch": 0.8842221608179055, + "grad_norm": 0.2058987319469452, + "learning_rate": 0.0002773129728215186, + "loss": 1.1093, + "step": 6400 + }, + { + "epoch": 0.8980381320806853, + "grad_norm": 0.21338045597076416, + "learning_rate": 0.00027689268702717847, + "loss": 1.1091, + "step": 6500 + }, + { + "epoch": 0.9118541033434651, + "grad_norm": 0.0900028795003891, + "learning_rate": 0.0002764724012328383, + "loss": 1.1067, + "step": 6600 + }, + { + "epoch": 0.9256700746062448, + "grad_norm": 0.10679551959037781, + "learning_rate": 0.00027605211543849816, + "loss": 1.108, + "step": 6700 + }, + { + "epoch": 0.9394860458690246, + "grad_norm": 0.07972779124975204, + "learning_rate": 0.000275631829644158, + "loss": 1.1057, + "step": 6800 + }, + { + "epoch": 0.9533020171318044, + "grad_norm": 0.24500218033790588, + "learning_rate": 0.00027521154384981786, + "loss": 1.105, + "step": 6900 + }, + { + "epoch": 0.9671179883945842, + "grad_norm": 0.11576998978853226, + "learning_rate": 0.00027479125805547774, + "loss": 1.1029, + "step": 7000 + }, + { + "epoch": 0.980933959657364, + "grad_norm": 0.10553757101297379, + "learning_rate": 0.00027437097226113756, + "loss": 1.1041, + "step": 7100 + }, + { + "epoch": 0.9947499309201436, + "grad_norm": 0.15332186222076416, + "learning_rate": 0.0002739506864667974, + "loss": 1.0982, + "step": 7200 + }, + { + "epoch": 1.0085659021829234, + "grad_norm": 0.11897014081478119, + "learning_rate": 0.00027353040067245725, + "loss": 1.0996, + "step": 7300 + }, + { + "epoch": 1.0223818734457033, + "grad_norm": 0.1156444102525711, + "learning_rate": 0.0002731101148781171, + "loss": 1.1032, + 
"step": 7400 + }, + { + "epoch": 1.036197844708483, + "grad_norm": 0.06223931908607483, + "learning_rate": 0.00027268982908377695, + "loss": 1.0982, + "step": 7500 + }, + { + "epoch": 1.0500138159712629, + "grad_norm": 0.14377152919769287, + "learning_rate": 0.00027226954328943677, + "loss": 1.1003, + "step": 7600 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 0.12667153775691986, + "learning_rate": 0.00027184925749509665, + "loss": 1.0989, + "step": 7700 + }, + { + "epoch": 1.0776457584968224, + "grad_norm": 0.16101804375648499, + "learning_rate": 0.0002714289717007565, + "loss": 1.0968, + "step": 7800 + }, + { + "epoch": 1.091461729759602, + "grad_norm": 0.06424383819103241, + "learning_rate": 0.00027100868590641635, + "loss": 1.0955, + "step": 7900 + }, + { + "epoch": 1.105277701022382, + "grad_norm": 0.09638939052820206, + "learning_rate": 0.00027058840011207617, + "loss": 1.095, + "step": 8000 + }, + { + "epoch": 1.1190936722851617, + "grad_norm": 0.08098015189170837, + "learning_rate": 0.00027016811431773604, + "loss": 1.0969, + "step": 8100 + }, + { + "epoch": 1.1329096435479413, + "grad_norm": 0.10837887227535248, + "learning_rate": 0.00026974782852339586, + "loss": 1.096, + "step": 8200 + }, + { + "epoch": 1.1467256148107212, + "grad_norm": 0.05644046515226364, + "learning_rate": 0.00026932754272905574, + "loss": 1.0944, + "step": 8300 + }, + { + "epoch": 1.1605415860735009, + "grad_norm": 0.12965446710586548, + "learning_rate": 0.00026890725693471556, + "loss": 1.0953, + "step": 8400 + }, + { + "epoch": 1.1743575573362808, + "grad_norm": 0.12333771586418152, + "learning_rate": 0.00026848697114037544, + "loss": 1.095, + "step": 8500 + }, + { + "epoch": 1.1881735285990604, + "grad_norm": 0.1270703673362732, + "learning_rate": 0.0002680666853460353, + "loss": 1.0929, + "step": 8600 + }, + { + "epoch": 1.2019894998618403, + "grad_norm": 0.16918766498565674, + "learning_rate": 0.00026764639955169513, + "loss": 1.0918, + "step": 8700 + }, + { + "epoch": 1.21580547112462, + "grad_norm": 0.08776108920574188, + "learning_rate": 0.00026722611375735496, + "loss": 1.0952, + "step": 8800 + }, + { + "epoch": 1.2296214423874, + "grad_norm": 0.08252176642417908, + "learning_rate": 0.00026680582796301483, + "loss": 1.09, + "step": 8900 + }, + { + "epoch": 1.2434374136501796, + "grad_norm": 0.16331979632377625, + "learning_rate": 0.00026638554216867465, + "loss": 1.0898, + "step": 9000 + }, + { + "epoch": 1.2572533849129595, + "grad_norm": 0.17065368592739105, + "learning_rate": 0.00026596525637433453, + "loss": 1.0907, + "step": 9100 + }, + { + "epoch": 1.2710693561757391, + "grad_norm": 0.12038784474134445, + "learning_rate": 0.00026554497057999435, + "loss": 1.0856, + "step": 9200 + }, + { + "epoch": 1.284885327438519, + "grad_norm": 0.11924347281455994, + "learning_rate": 0.0002651246847856542, + "loss": 1.0895, + "step": 9300 + }, + { + "epoch": 1.2987012987012987, + "grad_norm": 0.1443828046321869, + "learning_rate": 0.0002647043989913141, + "loss": 1.0874, + "step": 9400 + }, + { + "epoch": 1.3125172699640784, + "grad_norm": 0.14472317695617676, + "learning_rate": 0.0002642841131969739, + "loss": 1.0879, + "step": 9500 + }, + { + "epoch": 1.3263332412268583, + "grad_norm": 0.15847088396549225, + "learning_rate": 0.00026386382740263374, + "loss": 1.0873, + "step": 9600 + }, + { + "epoch": 1.3401492124896381, + "grad_norm": 0.17960332334041595, + "learning_rate": 0.0002634435416082936, + "loss": 1.0887, + "step": 9700 + }, + { + "epoch": 1.3539651837524178, + "grad_norm": 
0.1566227227449417, + "learning_rate": 0.00026302325581395344, + "loss": 1.0884, + "step": 9800 + }, + { + "epoch": 1.3677811550151975, + "grad_norm": 0.1431213617324829, + "learning_rate": 0.0002626029700196133, + "loss": 1.0864, + "step": 9900 + }, + { + "epoch": 1.3815971262779774, + "grad_norm": 0.10321222990751266, + "learning_rate": 0.0002621826842252732, + "loss": 1.0835, + "step": 10000 + }, + { + "epoch": 1.3815971262779774, + "eval_accuracy": 0.49913821881815945, + "eval_loss": 1.081355094909668, + "eval_runtime": 748.8314, + "eval_samples_per_second": 274.91, + "eval_steps_per_second": 8.592, + "step": 10000 + }, + { + "epoch": 1.395413097540757, + "grad_norm": 0.10260605067014694, + "learning_rate": 0.0002617666012888764, + "loss": 1.0843, + "step": 10100 + }, + { + "epoch": 1.409229068803537, + "grad_norm": 0.1076885387301445, + "learning_rate": 0.0002613463154945363, + "loss": 1.0845, + "step": 10200 + }, + { + "epoch": 1.4230450400663166, + "grad_norm": 0.0723571702837944, + "learning_rate": 0.0002609260297001961, + "loss": 1.0814, + "step": 10300 + }, + { + "epoch": 1.4368610113290965, + "grad_norm": 0.10695687681436539, + "learning_rate": 0.00026050574390585593, + "loss": 1.0842, + "step": 10400 + }, + { + "epoch": 1.4506769825918762, + "grad_norm": 0.11008185893297195, + "learning_rate": 0.0002600854581115158, + "loss": 1.0832, + "step": 10500 + }, + { + "epoch": 1.464492953854656, + "grad_norm": 0.12239653617143631, + "learning_rate": 0.0002596651723171756, + "loss": 1.0813, + "step": 10600 + }, + { + "epoch": 1.4783089251174357, + "grad_norm": 0.11045056581497192, + "learning_rate": 0.0002592448865228355, + "loss": 1.0848, + "step": 10700 + }, + { + "epoch": 1.4921248963802154, + "grad_norm": 0.07234488427639008, + "learning_rate": 0.0002588246007284954, + "loss": 1.0826, + "step": 10800 + }, + { + "epoch": 1.5059408676429953, + "grad_norm": 0.11086778342723846, + "learning_rate": 0.0002584043149341552, + "loss": 1.0804, + "step": 10900 + }, + { + "epoch": 1.5197568389057752, + "grad_norm": 0.10693442821502686, + "learning_rate": 0.0002579840291398151, + "loss": 1.0784, + "step": 11000 + }, + { + "epoch": 1.5335728101685548, + "grad_norm": 0.11604110896587372, + "learning_rate": 0.0002575637433454749, + "loss": 1.0792, + "step": 11100 + }, + { + "epoch": 1.5473887814313345, + "grad_norm": 0.0809662714600563, + "learning_rate": 0.0002571434575511347, + "loss": 1.083, + "step": 11200 + }, + { + "epoch": 1.5612047526941144, + "grad_norm": 0.1850002408027649, + "learning_rate": 0.0002567231717567946, + "loss": 1.0802, + "step": 11300 + }, + { + "epoch": 1.5750207239568943, + "grad_norm": 0.0779227465391159, + "learning_rate": 0.0002563028859624544, + "loss": 1.0811, + "step": 11400 + }, + { + "epoch": 1.588836695219674, + "grad_norm": 0.16764625906944275, + "learning_rate": 0.0002558826001681143, + "loss": 1.0763, + "step": 11500 + }, + { + "epoch": 1.6026526664824536, + "grad_norm": 0.11104313284158707, + "learning_rate": 0.00025546231437377417, + "loss": 1.0782, + "step": 11600 + }, + { + "epoch": 1.6164686377452335, + "grad_norm": 0.16667212545871735, + "learning_rate": 0.000255042028579434, + "loss": 1.0781, + "step": 11700 + }, + { + "epoch": 1.6302846090080134, + "grad_norm": 0.2246047705411911, + "learning_rate": 0.00025462174278509386, + "loss": 1.08, + "step": 11800 + }, + { + "epoch": 1.644100580270793, + "grad_norm": 0.2305343896150589, + "learning_rate": 0.0002542014569907537, + "loss": 1.0756, + "step": 11900 + }, + { + "epoch": 1.6579165515335728, + 
"grad_norm": 0.13618823885917664, + "learning_rate": 0.0002537811711964135, + "loss": 1.076, + "step": 12000 + }, + { + "epoch": 1.6717325227963524, + "grad_norm": 0.15795475244522095, + "learning_rate": 0.0002533608854020734, + "loss": 1.0749, + "step": 12100 + }, + { + "epoch": 1.6855484940591323, + "grad_norm": 0.20267115533351898, + "learning_rate": 0.00025294480246567665, + "loss": 1.077, + "step": 12200 + }, + { + "epoch": 1.6993644653219122, + "grad_norm": 0.08052489906549454, + "learning_rate": 0.0002525245166713365, + "loss": 1.073, + "step": 12300 + }, + { + "epoch": 1.7131804365846919, + "grad_norm": 0.11914093047380447, + "learning_rate": 0.00025210423087699635, + "loss": 1.0755, + "step": 12400 + }, + { + "epoch": 1.7269964078474715, + "grad_norm": 0.12703542411327362, + "learning_rate": 0.00025168394508265617, + "loss": 1.0765, + "step": 12500 + }, + { + "epoch": 1.7408123791102514, + "grad_norm": 0.12948518991470337, + "learning_rate": 0.00025126365928831605, + "loss": 1.0748, + "step": 12600 + }, + { + "epoch": 1.7546283503730313, + "grad_norm": 0.1027710810303688, + "learning_rate": 0.00025084337349397587, + "loss": 1.0745, + "step": 12700 + }, + { + "epoch": 1.768444321635811, + "grad_norm": 0.20131652057170868, + "learning_rate": 0.0002504230876996357, + "loss": 1.0731, + "step": 12800 + }, + { + "epoch": 1.7822602928985907, + "grad_norm": 0.0673370212316513, + "learning_rate": 0.00025000280190529557, + "loss": 1.0721, + "step": 12900 + }, + { + "epoch": 1.7960762641613706, + "grad_norm": 0.10322799533605576, + "learning_rate": 0.00024958251611095544, + "loss": 1.0731, + "step": 13000 + }, + { + "epoch": 1.8098922354241505, + "grad_norm": 0.08498311042785645, + "learning_rate": 0.00024916223031661526, + "loss": 1.0722, + "step": 13100 + }, + { + "epoch": 1.8237082066869301, + "grad_norm": 0.07025079429149628, + "learning_rate": 0.00024874194452227514, + "loss": 1.0725, + "step": 13200 + }, + { + "epoch": 1.8375241779497098, + "grad_norm": 0.13933932781219482, + "learning_rate": 0.00024832165872793496, + "loss": 1.0714, + "step": 13300 + }, + { + "epoch": 1.8513401492124897, + "grad_norm": 0.10513993352651596, + "learning_rate": 0.00024790137293359484, + "loss": 1.0725, + "step": 13400 + }, + { + "epoch": 1.8651561204752696, + "grad_norm": 0.1704607903957367, + "learning_rate": 0.0002474810871392547, + "loss": 1.0712, + "step": 13500 + }, + { + "epoch": 1.8789720917380492, + "grad_norm": 0.08315689861774445, + "learning_rate": 0.0002470608013449145, + "loss": 1.0697, + "step": 13600 + }, + { + "epoch": 1.892788063000829, + "grad_norm": 0.09900273382663727, + "learning_rate": 0.00024664051555057436, + "loss": 1.0735, + "step": 13700 + }, + { + "epoch": 1.9066040342636086, + "grad_norm": 0.05560864508152008, + "learning_rate": 0.00024622022975623423, + "loss": 1.0711, + "step": 13800 + }, + { + "epoch": 1.9204200055263885, + "grad_norm": 0.13863462209701538, + "learning_rate": 0.00024579994396189405, + "loss": 1.0681, + "step": 13900 + }, + { + "epoch": 1.9342359767891684, + "grad_norm": 0.07841744273900986, + "learning_rate": 0.00024537965816755393, + "loss": 1.0711, + "step": 14000 + }, + { + "epoch": 1.948051948051948, + "grad_norm": 0.058312736451625824, + "learning_rate": 0.00024495937237321375, + "loss": 1.0709, + "step": 14100 + }, + { + "epoch": 1.9618679193147277, + "grad_norm": 0.11208023875951767, + "learning_rate": 0.000244543289436817, + "loss": 1.0686, + "step": 14200 + }, + { + "epoch": 1.9756838905775076, + "grad_norm": 0.10133163630962372, + 
"learning_rate": 0.00024412300364247687, + "loss": 1.0683, + "step": 14300 + }, + { + "epoch": 1.9894998618402875, + "grad_norm": 0.08370282500982285, + "learning_rate": 0.0002437027178481367, + "loss": 1.0709, + "step": 14400 + }, + { + "epoch": 2.003315833103067, + "grad_norm": 0.09476770460605621, + "learning_rate": 0.00024328243205379654, + "loss": 1.0697, + "step": 14500 + }, + { + "epoch": 2.017131804365847, + "grad_norm": 0.0733637660741806, + "learning_rate": 0.0002428621462594564, + "loss": 1.0681, + "step": 14600 + }, + { + "epoch": 2.0309477756286265, + "grad_norm": 0.09925834089517593, + "learning_rate": 0.00024244186046511627, + "loss": 1.0702, + "step": 14700 + }, + { + "epoch": 2.0447637468914066, + "grad_norm": 0.15911750495433807, + "learning_rate": 0.00024202157467077611, + "loss": 1.0665, + "step": 14800 + }, + { + "epoch": 2.0585797181541863, + "grad_norm": 0.13638247549533844, + "learning_rate": 0.00024160128887643596, + "loss": 1.0696, + "step": 14900 + }, + { + "epoch": 2.072395689416966, + "grad_norm": 0.16883982717990875, + "learning_rate": 0.0002411810030820958, + "loss": 1.0641, + "step": 15000 + }, + { + "epoch": 2.072395689416966, + "eval_accuracy": 0.5102966510685876, + "eval_loss": 1.0638896226882935, + "eval_runtime": 924.2494, + "eval_samples_per_second": 222.733, + "eval_steps_per_second": 6.961, + "step": 15000 + }, + { + "epoch": 2.0862116606797456, + "grad_norm": 0.09925784170627594, + "learning_rate": 0.00024076071728775566, + "loss": 1.0683, + "step": 15100 + }, + { + "epoch": 2.1000276319425257, + "grad_norm": 0.06180203706026077, + "learning_rate": 0.00024034043149341548, + "loss": 1.066, + "step": 15200 + }, + { + "epoch": 2.1138436032053054, + "grad_norm": 0.10063247382640839, + "learning_rate": 0.00023992014569907533, + "loss": 1.0668, + "step": 15300 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 0.11476041376590729, + "learning_rate": 0.0002394998599047352, + "loss": 1.0644, + "step": 15400 + }, + { + "epoch": 2.1414755457308647, + "grad_norm": 0.11798429489135742, + "learning_rate": 0.00023907957411039505, + "loss": 1.0626, + "step": 15500 + }, + { + "epoch": 2.155291516993645, + "grad_norm": 0.13165287673473358, + "learning_rate": 0.0002386592883160549, + "loss": 1.0648, + "step": 15600 + }, + { + "epoch": 2.1691074882564245, + "grad_norm": 0.1705123484134674, + "learning_rate": 0.00023823900252171475, + "loss": 1.0639, + "step": 15700 + }, + { + "epoch": 2.182923459519204, + "grad_norm": 0.13375049829483032, + "learning_rate": 0.0002378187167273746, + "loss": 1.062, + "step": 15800 + }, + { + "epoch": 2.196739430781984, + "grad_norm": 0.09405038505792618, + "learning_rate": 0.00023739843093303445, + "loss": 1.0634, + "step": 15900 + }, + { + "epoch": 2.210555402044764, + "grad_norm": 0.11285752803087234, + "learning_rate": 0.00023697814513869427, + "loss": 1.0667, + "step": 16000 + }, + { + "epoch": 2.2243713733075436, + "grad_norm": 0.12377699464559555, + "learning_rate": 0.00023655785934435412, + "loss": 1.064, + "step": 16100 + }, + { + "epoch": 2.2381873445703233, + "grad_norm": 0.0979316234588623, + "learning_rate": 0.000236137573550014, + "loss": 1.0621, + "step": 16200 + }, + { + "epoch": 2.252003315833103, + "grad_norm": 0.11494515091180801, + "learning_rate": 0.00023572149061361724, + "loss": 1.0645, + "step": 16300 + }, + { + "epoch": 2.2658192870958827, + "grad_norm": 0.07066236436367035, + "learning_rate": 0.0002353012048192771, + "loss": 1.063, + "step": 16400 + }, + { + "epoch": 2.2796352583586628, + "grad_norm": 
0.08686563372612, + "learning_rate": 0.00023488091902493694, + "loss": 1.066, + "step": 16500 + }, + { + "epoch": 2.2934512296214424, + "grad_norm": 0.058148209005594254, + "learning_rate": 0.00023446063323059678, + "loss": 1.0643, + "step": 16600 + }, + { + "epoch": 2.307267200884222, + "grad_norm": 0.14033359289169312, + "learning_rate": 0.00023404034743625666, + "loss": 1.0634, + "step": 16700 + }, + { + "epoch": 2.3210831721470018, + "grad_norm": 0.09940097481012344, + "learning_rate": 0.00023362006164191645, + "loss": 1.0629, + "step": 16800 + }, + { + "epoch": 2.334899143409782, + "grad_norm": 0.08228994905948639, + "learning_rate": 0.00023319977584757633, + "loss": 1.0626, + "step": 16900 + }, + { + "epoch": 2.3487151146725616, + "grad_norm": 0.05418753623962402, + "learning_rate": 0.00023277949005323618, + "loss": 1.0611, + "step": 17000 + }, + { + "epoch": 2.3625310859353412, + "grad_norm": 0.09691222757101059, + "learning_rate": 0.00023235920425889603, + "loss": 1.0626, + "step": 17100 + }, + { + "epoch": 2.376347057198121, + "grad_norm": 0.1607312560081482, + "learning_rate": 0.00023193891846455588, + "loss": 1.0623, + "step": 17200 + }, + { + "epoch": 2.3901630284609006, + "grad_norm": 0.1193649098277092, + "learning_rate": 0.00023151863267021572, + "loss": 1.0627, + "step": 17300 + }, + { + "epoch": 2.4039789997236807, + "grad_norm": 0.05427398905158043, + "learning_rate": 0.00023109834687587557, + "loss": 1.0609, + "step": 17400 + }, + { + "epoch": 2.4177949709864603, + "grad_norm": 0.10591702163219452, + "learning_rate": 0.00023067806108153545, + "loss": 1.0637, + "step": 17500 + }, + { + "epoch": 2.43161094224924, + "grad_norm": 0.057032886892557144, + "learning_rate": 0.00023025777528719524, + "loss": 1.0612, + "step": 17600 + }, + { + "epoch": 2.44542691351202, + "grad_norm": 0.08455175161361694, + "learning_rate": 0.00022983748949285512, + "loss": 1.0606, + "step": 17700 + }, + { + "epoch": 2.4592428847748, + "grad_norm": 0.13975144922733307, + "learning_rate": 0.00022941720369851497, + "loss": 1.0624, + "step": 17800 + }, + { + "epoch": 2.4730588560375795, + "grad_norm": 0.11535393446683884, + "learning_rate": 0.00022899691790417482, + "loss": 1.0603, + "step": 17900 + }, + { + "epoch": 2.486874827300359, + "grad_norm": 0.10047648102045059, + "learning_rate": 0.00022857663210983466, + "loss": 1.0607, + "step": 18000 + }, + { + "epoch": 2.500690798563139, + "grad_norm": 0.08474704623222351, + "learning_rate": 0.0002281563463154945, + "loss": 1.062, + "step": 18100 + }, + { + "epoch": 2.514506769825919, + "grad_norm": 0.15308576822280884, + "learning_rate": 0.00022773606052115436, + "loss": 1.0603, + "step": 18200 + }, + { + "epoch": 2.5283227410886986, + "grad_norm": 0.05684039369225502, + "learning_rate": 0.00022731577472681424, + "loss": 1.0589, + "step": 18300 + }, + { + "epoch": 2.5421387123514783, + "grad_norm": 0.10712555050849915, + "learning_rate": 0.00022689548893247409, + "loss": 1.0592, + "step": 18400 + }, + { + "epoch": 2.555954683614258, + "grad_norm": 0.0800655260682106, + "learning_rate": 0.0002264794059960773, + "loss": 1.0603, + "step": 18500 + }, + { + "epoch": 2.569770654877038, + "grad_norm": 0.05980188027024269, + "learning_rate": 0.00022605912020173715, + "loss": 1.0608, + "step": 18600 + }, + { + "epoch": 2.5835866261398177, + "grad_norm": 0.052051473408937454, + "learning_rate": 0.000225638834407397, + "loss": 1.0603, + "step": 18700 + }, + { + "epoch": 2.5974025974025974, + "grad_norm": 0.11966883391141891, + "learning_rate": 
0.00022521854861305685, + "loss": 1.057, + "step": 18800 + }, + { + "epoch": 2.611218568665377, + "grad_norm": 0.08861220628023148, + "learning_rate": 0.00022479826281871673, + "loss": 1.0603, + "step": 18900 + }, + { + "epoch": 2.6250345399281567, + "grad_norm": 0.12264814227819443, + "learning_rate": 0.00022437797702437657, + "loss": 1.0602, + "step": 19000 + }, + { + "epoch": 2.638850511190937, + "grad_norm": 0.08384163677692413, + "learning_rate": 0.00022395769123003642, + "loss": 1.057, + "step": 19100 + }, + { + "epoch": 2.6526664824537165, + "grad_norm": 0.11168386787176132, + "learning_rate": 0.00022353740543569624, + "loss": 1.0572, + "step": 19200 + }, + { + "epoch": 2.666482453716496, + "grad_norm": 0.12558519840240479, + "learning_rate": 0.0002231171196413561, + "loss": 1.0592, + "step": 19300 + }, + { + "epoch": 2.6802984249792763, + "grad_norm": 0.06810207664966583, + "learning_rate": 0.00022269683384701594, + "loss": 1.055, + "step": 19400 + }, + { + "epoch": 2.694114396242056, + "grad_norm": 0.16571113467216492, + "learning_rate": 0.0002222765480526758, + "loss": 1.0599, + "step": 19500 + }, + { + "epoch": 2.7079303675048356, + "grad_norm": 0.07613151520490646, + "learning_rate": 0.00022185626225833564, + "loss": 1.0564, + "step": 19600 + }, + { + "epoch": 2.7217463387676153, + "grad_norm": 0.08713393658399582, + "learning_rate": 0.00022143597646399551, + "loss": 1.0582, + "step": 19700 + }, + { + "epoch": 2.735562310030395, + "grad_norm": 0.11707925796508789, + "learning_rate": 0.00022101569066965536, + "loss": 1.056, + "step": 19800 + }, + { + "epoch": 2.749378281293175, + "grad_norm": 0.1053171455860138, + "learning_rate": 0.0002205954048753152, + "loss": 1.0608, + "step": 19900 + }, + { + "epoch": 2.7631942525559547, + "grad_norm": 0.056531500071287155, + "learning_rate": 0.00022017511908097506, + "loss": 1.0563, + "step": 20000 + }, + { + "epoch": 2.7631942525559547, + "eval_accuracy": 0.516310033016185, + "eval_loss": 1.054749608039856, + "eval_runtime": 731.5154, + "eval_samples_per_second": 281.417, + "eval_steps_per_second": 8.795, + "step": 20000 + }, + { + "epoch": 2.7770102238187344, + "grad_norm": 0.10811367630958557, + "learning_rate": 0.00021975483328663488, + "loss": 1.0556, + "step": 20100 + }, + { + "epoch": 2.790826195081514, + "grad_norm": 0.06601472198963165, + "learning_rate": 0.00021933454749229473, + "loss": 1.0578, + "step": 20200 + }, + { + "epoch": 2.804642166344294, + "grad_norm": 0.06906837224960327, + "learning_rate": 0.00021891426169795458, + "loss": 1.06, + "step": 20300 + }, + { + "epoch": 2.818458137607074, + "grad_norm": 0.08911406248807907, + "learning_rate": 0.00021849397590361443, + "loss": 1.0583, + "step": 20400 + }, + { + "epoch": 2.8322741088698535, + "grad_norm": 0.06497912108898163, + "learning_rate": 0.0002180778929672177, + "loss": 1.0575, + "step": 20500 + }, + { + "epoch": 2.846090080132633, + "grad_norm": 0.0886107012629509, + "learning_rate": 0.00021765760717287755, + "loss": 1.0552, + "step": 20600 + }, + { + "epoch": 2.859906051395413, + "grad_norm": 0.05942055955529213, + "learning_rate": 0.0002172373213785374, + "loss": 1.0533, + "step": 20700 + }, + { + "epoch": 2.873722022658193, + "grad_norm": 0.13015809655189514, + "learning_rate": 0.00021681703558419725, + "loss": 1.0549, + "step": 20800 + }, + { + "epoch": 2.8875379939209727, + "grad_norm": 0.06085093691945076, + "learning_rate": 0.00021639674978985707, + "loss": 1.057, + "step": 20900 + }, + { + "epoch": 2.9013539651837523, + "grad_norm": 0.17039401829242706, + 
"learning_rate": 0.00021597646399551692, + "loss": 1.0571, + "step": 21000 + }, + { + "epoch": 2.9151699364465324, + "grad_norm": 0.07950026541948318, + "learning_rate": 0.00021555617820117676, + "loss": 1.0535, + "step": 21100 + }, + { + "epoch": 2.928985907709312, + "grad_norm": 0.1195695698261261, + "learning_rate": 0.00021513589240683664, + "loss": 1.0535, + "step": 21200 + }, + { + "epoch": 2.942801878972092, + "grad_norm": 0.0896124541759491, + "learning_rate": 0.0002147156066124965, + "loss": 1.0534, + "step": 21300 + }, + { + "epoch": 2.9566178502348714, + "grad_norm": 0.07629978656768799, + "learning_rate": 0.00021429532081815634, + "loss": 1.0564, + "step": 21400 + }, + { + "epoch": 2.970433821497651, + "grad_norm": 0.07431907206773758, + "learning_rate": 0.00021387503502381618, + "loss": 1.0559, + "step": 21500 + }, + { + "epoch": 2.984249792760431, + "grad_norm": 0.0771278440952301, + "learning_rate": 0.00021345474922947603, + "loss": 1.0562, + "step": 21600 + }, + { + "epoch": 2.998065764023211, + "grad_norm": 0.11643990874290466, + "learning_rate": 0.00021303446343513585, + "loss": 1.0525, + "step": 21700 + }, + { + "epoch": 3.0118817352859906, + "grad_norm": 0.058162059634923935, + "learning_rate": 0.0002126141776407957, + "loss": 1.0509, + "step": 21800 + }, + { + "epoch": 3.0256977065487702, + "grad_norm": 0.12037301808595657, + "learning_rate": 0.00021219389184645558, + "loss": 1.0513, + "step": 21900 + }, + { + "epoch": 3.0395136778115504, + "grad_norm": 0.052515506744384766, + "learning_rate": 0.00021177360605211543, + "loss": 1.051, + "step": 22000 + }, + { + "epoch": 3.05332964907433, + "grad_norm": 0.10646827518939972, + "learning_rate": 0.00021135332025777528, + "loss": 1.0542, + "step": 22100 + }, + { + "epoch": 3.0671456203371097, + "grad_norm": 0.1113181784749031, + "learning_rate": 0.00021093303446343512, + "loss": 1.0531, + "step": 22200 + }, + { + "epoch": 3.0809615915998894, + "grad_norm": 0.07355222851037979, + "learning_rate": 0.00021051274866909497, + "loss": 1.0524, + "step": 22300 + }, + { + "epoch": 3.094777562862669, + "grad_norm": 0.06925370544195175, + "learning_rate": 0.00021009246287475482, + "loss": 1.0535, + "step": 22400 + }, + { + "epoch": 3.108593534125449, + "grad_norm": 0.048475924879312515, + "learning_rate": 0.00020967217708041464, + "loss": 1.0564, + "step": 22500 + }, + { + "epoch": 3.122409505388229, + "grad_norm": 0.08578319102525711, + "learning_rate": 0.0002092518912860745, + "loss": 1.0519, + "step": 22600 + }, + { + "epoch": 3.1362254766510085, + "grad_norm": 0.08585724979639053, + "learning_rate": 0.00020883160549173437, + "loss": 1.0525, + "step": 22700 + }, + { + "epoch": 3.150041447913788, + "grad_norm": 0.06518802791833878, + "learning_rate": 0.00020841131969739422, + "loss": 1.0543, + "step": 22800 + }, + { + "epoch": 3.1638574191765683, + "grad_norm": 0.046030618250370026, + "learning_rate": 0.00020799103390305406, + "loss": 1.0525, + "step": 22900 + }, + { + "epoch": 3.177673390439348, + "grad_norm": 0.04972764104604721, + "learning_rate": 0.0002075707481087139, + "loss": 1.0512, + "step": 23000 + }, + { + "epoch": 3.1914893617021276, + "grad_norm": 0.11977583914995193, + "learning_rate": 0.00020715046231437376, + "loss": 1.052, + "step": 23100 + }, + { + "epoch": 3.2053053329649073, + "grad_norm": 0.08040472120046616, + "learning_rate": 0.0002067301765200336, + "loss": 1.0491, + "step": 23200 + }, + { + "epoch": 3.2191213042276874, + "grad_norm": 0.10473213344812393, + "learning_rate": 0.00020630989072569343, + "loss": 
1.0525, + "step": 23300 + }, + { + "epoch": 3.232937275490467, + "grad_norm": 0.0790744498372078, + "learning_rate": 0.00020588960493135328, + "loss": 1.0508, + "step": 23400 + }, + { + "epoch": 3.2467532467532467, + "grad_norm": 0.12807689607143402, + "learning_rate": 0.00020547352199495655, + "loss": 1.0485, + "step": 23500 + }, + { + "epoch": 3.2605692180160264, + "grad_norm": 0.10298227518796921, + "learning_rate": 0.0002050532362006164, + "loss": 1.049, + "step": 23600 + }, + { + "epoch": 3.2743851892788065, + "grad_norm": 0.11504103243350983, + "learning_rate": 0.00020463295040627625, + "loss": 1.0511, + "step": 23700 + }, + { + "epoch": 3.288201160541586, + "grad_norm": 0.05548229441046715, + "learning_rate": 0.0002042126646119361, + "loss": 1.0499, + "step": 23800 + }, + { + "epoch": 3.302017131804366, + "grad_norm": 0.06242981553077698, + "learning_rate": 0.00020379237881759595, + "loss": 1.0543, + "step": 23900 + }, + { + "epoch": 3.3158331030671455, + "grad_norm": 0.12101748585700989, + "learning_rate": 0.00020337209302325582, + "loss": 1.0482, + "step": 24000 + }, + { + "epoch": 3.329649074329925, + "grad_norm": 0.09176388382911682, + "learning_rate": 0.00020295180722891562, + "loss": 1.0514, + "step": 24100 + }, + { + "epoch": 3.3434650455927053, + "grad_norm": 0.08758760988712311, + "learning_rate": 0.0002025315214345755, + "loss": 1.0505, + "step": 24200 + }, + { + "epoch": 3.357281016855485, + "grad_norm": 0.06818066537380219, + "learning_rate": 0.00020211123564023534, + "loss": 1.0511, + "step": 24300 + }, + { + "epoch": 3.3710969881182646, + "grad_norm": 0.10384306311607361, + "learning_rate": 0.0002016909498458952, + "loss": 1.0513, + "step": 24400 + }, + { + "epoch": 3.3849129593810443, + "grad_norm": 0.12452493607997894, + "learning_rate": 0.00020127066405155504, + "loss": 1.0502, + "step": 24500 + }, + { + "epoch": 3.3987289306438244, + "grad_norm": 0.07460072636604309, + "learning_rate": 0.0002008503782572149, + "loss": 1.0526, + "step": 24600 + }, + { + "epoch": 3.412544901906604, + "grad_norm": 0.1017543151974678, + "learning_rate": 0.00020043009246287474, + "loss": 1.0501, + "step": 24700 + }, + { + "epoch": 3.4263608731693838, + "grad_norm": 0.0900358185172081, + "learning_rate": 0.0002000098066685346, + "loss": 1.0512, + "step": 24800 + }, + { + "epoch": 3.4401768444321634, + "grad_norm": 0.10934050381183624, + "learning_rate": 0.00019958952087419443, + "loss": 1.0495, + "step": 24900 + }, + { + "epoch": 3.4539928156949435, + "grad_norm": 0.0656353011727333, + "learning_rate": 0.00019916923507985428, + "loss": 1.0504, + "step": 25000 + }, + { + "epoch": 3.4539928156949435, + "eval_accuracy": 0.520419659075542, + "eval_loss": 1.0485948324203491, + "eval_runtime": 728.0613, + "eval_samples_per_second": 282.752, + "eval_steps_per_second": 8.837, + "step": 25000 + }, + { + "epoch": 3.467808786957723, + "grad_norm": 0.07246037572622299, + "learning_rate": 0.00019874894928551413, + "loss": 1.0493, + "step": 25100 + }, + { + "epoch": 3.481624758220503, + "grad_norm": 0.14033739268779755, + "learning_rate": 0.00019832866349117398, + "loss": 1.05, + "step": 25200 + }, + { + "epoch": 3.4954407294832825, + "grad_norm": 0.05688853561878204, + "learning_rate": 0.00019790837769683383, + "loss": 1.0509, + "step": 25300 + }, + { + "epoch": 3.5092567007460627, + "grad_norm": 0.053916674107313156, + "learning_rate": 0.00019748809190249368, + "loss": 1.0503, + "step": 25400 + }, + { + "epoch": 3.5230726720088423, + "grad_norm": 0.12233688682317734, + "learning_rate": 
0.00019706780610815352, + "loss": 1.05, + "step": 25500 + }, + { + "epoch": 3.536888643271622, + "grad_norm": 0.10314755886793137, + "learning_rate": 0.0001966475203138134, + "loss": 1.0501, + "step": 25600 + }, + { + "epoch": 3.5507046145344017, + "grad_norm": 0.05037887394428253, + "learning_rate": 0.00019623143737741662, + "loss": 1.0468, + "step": 25700 + }, + { + "epoch": 3.5645205857971813, + "grad_norm": 0.13344399631023407, + "learning_rate": 0.00019581115158307647, + "loss": 1.0477, + "step": 25800 + }, + { + "epoch": 3.5783365570599615, + "grad_norm": 0.07191654294729233, + "learning_rate": 0.00019539086578873632, + "loss": 1.0498, + "step": 25900 + }, + { + "epoch": 3.592152528322741, + "grad_norm": 0.05592725798487663, + "learning_rate": 0.00019497057999439616, + "loss": 1.0506, + "step": 26000 + }, + { + "epoch": 3.605968499585521, + "grad_norm": 0.10346696525812149, + "learning_rate": 0.000194550294200056, + "loss": 1.0499, + "step": 26100 + }, + { + "epoch": 3.619784470848301, + "grad_norm": 0.09233855456113815, + "learning_rate": 0.0001941300084057159, + "loss": 1.0456, + "step": 26200 + }, + { + "epoch": 3.6336004421110806, + "grad_norm": 0.060603220015764236, + "learning_rate": 0.00019370972261137574, + "loss": 1.0475, + "step": 26300 + }, + { + "epoch": 3.6474164133738602, + "grad_norm": 0.11710167676210403, + "learning_rate": 0.00019328943681703559, + "loss": 1.0497, + "step": 26400 + }, + { + "epoch": 3.66123238463664, + "grad_norm": 0.16325397789478302, + "learning_rate": 0.0001928691510226954, + "loss": 1.0487, + "step": 26500 + }, + { + "epoch": 3.6750483558994196, + "grad_norm": 0.08937475085258484, + "learning_rate": 0.00019244886522835526, + "loss": 1.0468, + "step": 26600 + }, + { + "epoch": 3.6888643271621993, + "grad_norm": 0.07486152648925781, + "learning_rate": 0.0001920285794340151, + "loss": 1.0479, + "step": 26700 + }, + { + "epoch": 3.7026802984249794, + "grad_norm": 0.1263752579689026, + "learning_rate": 0.00019160829363967495, + "loss": 1.0449, + "step": 26800 + }, + { + "epoch": 3.716496269687759, + "grad_norm": 0.11803583055734634, + "learning_rate": 0.0001911880078453348, + "loss": 1.0512, + "step": 26900 + }, + { + "epoch": 3.7303122409505387, + "grad_norm": 0.07918773591518402, + "learning_rate": 0.00019076772205099468, + "loss": 1.0486, + "step": 27000 + }, + { + "epoch": 3.744128212213319, + "grad_norm": 0.11923271417617798, + "learning_rate": 0.00019034743625665453, + "loss": 1.0465, + "step": 27100 + }, + { + "epoch": 3.7579441834760985, + "grad_norm": 0.12752223014831543, + "learning_rate": 0.00018992715046231437, + "loss": 1.0472, + "step": 27200 + }, + { + "epoch": 3.771760154738878, + "grad_norm": 0.07391146570444107, + "learning_rate": 0.0001895068646679742, + "loss": 1.0493, + "step": 27300 + }, + { + "epoch": 3.785576126001658, + "grad_norm": 0.06606881320476532, + "learning_rate": 0.00018908657887363404, + "loss": 1.0485, + "step": 27400 + }, + { + "epoch": 3.7993920972644375, + "grad_norm": 0.04949864745140076, + "learning_rate": 0.0001886662930792939, + "loss": 1.0481, + "step": 27500 + }, + { + "epoch": 3.8132080685272176, + "grad_norm": 0.05234380066394806, + "learning_rate": 0.00018824600728495374, + "loss": 1.0476, + "step": 27600 + }, + { + "epoch": 3.8270240397899973, + "grad_norm": 0.04995539411902428, + "learning_rate": 0.0001878257214906136, + "loss": 1.0466, + "step": 27700 + }, + { + "epoch": 3.840840011052777, + "grad_norm": 0.09871330112218857, + "learning_rate": 0.00018740543569627347, + "loss": 1.0501, + "step": 27800 
+ }, + { + "epoch": 3.8546559823155566, + "grad_norm": 0.06254375725984573, + "learning_rate": 0.00018698514990193331, + "loss": 1.0467, + "step": 27900 + }, + { + "epoch": 3.8684719535783367, + "grad_norm": 0.07971449941396713, + "learning_rate": 0.00018656486410759316, + "loss": 1.0502, + "step": 28000 + }, + { + "epoch": 3.8822879248411164, + "grad_norm": 0.12627951800823212, + "learning_rate": 0.000186144578313253, + "loss": 1.0446, + "step": 28100 + }, + { + "epoch": 3.896103896103896, + "grad_norm": 0.08057064563035965, + "learning_rate": 0.00018572429251891283, + "loss": 1.0468, + "step": 28200 + }, + { + "epoch": 3.9099198673666757, + "grad_norm": 0.0501413568854332, + "learning_rate": 0.00018530400672457268, + "loss": 1.0453, + "step": 28300 + }, + { + "epoch": 3.9237358386294554, + "grad_norm": 0.09999352693557739, + "learning_rate": 0.00018488372093023253, + "loss": 1.0502, + "step": 28400 + }, + { + "epoch": 3.9375518098922355, + "grad_norm": 0.12323564291000366, + "learning_rate": 0.00018446343513589238, + "loss": 1.0478, + "step": 28500 + }, + { + "epoch": 3.951367781155015, + "grad_norm": 0.0877193808555603, + "learning_rate": 0.00018404314934155225, + "loss": 1.049, + "step": 28600 + }, + { + "epoch": 3.965183752417795, + "grad_norm": 0.09397170692682266, + "learning_rate": 0.0001836228635472121, + "loss": 1.0474, + "step": 28700 + }, + { + "epoch": 3.978999723680575, + "grad_norm": 0.09532420337200165, + "learning_rate": 0.00018320257775287195, + "loss": 1.0496, + "step": 28800 + }, + { + "epoch": 3.9928156949433546, + "grad_norm": 0.0442403182387352, + "learning_rate": 0.0001827822919585318, + "loss": 1.0466, + "step": 28900 + }, + { + "epoch": 4.006631666206134, + "grad_norm": 0.06309514492750168, + "learning_rate": 0.00018236200616419162, + "loss": 1.0479, + "step": 29000 + }, + { + "epoch": 4.020447637468914, + "grad_norm": 0.06191420555114746, + "learning_rate": 0.00018194172036985147, + "loss": 1.0442, + "step": 29100 + }, + { + "epoch": 4.034263608731694, + "grad_norm": 0.06752864271402359, + "learning_rate": 0.00018152143457551132, + "loss": 1.045, + "step": 29200 + }, + { + "epoch": 4.048079579994473, + "grad_norm": 0.07383009046316147, + "learning_rate": 0.00018110114878117117, + "loss": 1.0429, + "step": 29300 + }, + { + "epoch": 4.061895551257253, + "grad_norm": 0.11942852288484573, + "learning_rate": 0.00018068086298683104, + "loss": 1.0433, + "step": 29400 + }, + { + "epoch": 4.0757115225200335, + "grad_norm": 0.0840003713965416, + "learning_rate": 0.0001802605771924909, + "loss": 1.0434, + "step": 29500 + }, + { + "epoch": 4.089527493782813, + "grad_norm": 0.07768476754426956, + "learning_rate": 0.00017984029139815074, + "loss": 1.0421, + "step": 29600 + }, + { + "epoch": 4.103343465045593, + "grad_norm": 0.07166603952646255, + "learning_rate": 0.00017942420846175398, + "loss": 1.0443, + "step": 29700 + }, + { + "epoch": 4.1171594363083726, + "grad_norm": 0.07380765676498413, + "learning_rate": 0.0001790039226674138, + "loss": 1.0448, + "step": 29800 + }, + { + "epoch": 4.130975407571152, + "grad_norm": 0.1263025552034378, + "learning_rate": 0.00017858363687307365, + "loss": 1.0437, + "step": 29900 + }, + { + "epoch": 4.144791378833932, + "grad_norm": 0.09632286429405212, + "learning_rate": 0.00017816335107873353, + "loss": 1.0439, + "step": 30000 + }, + { + "epoch": 4.144791378833932, + "eval_accuracy": 0.5233148259844476, + "eval_loss": 1.0439139604568481, + "eval_runtime": 787.8404, + "eval_samples_per_second": 261.298, + "eval_steps_per_second": 8.167, + 
"step": 30000 + }, + { + "epoch": 4.158607350096712, + "grad_norm": 0.09395026415586472, + "learning_rate": 0.00017774306528439338, + "loss": 1.0447, + "step": 30100 + }, + { + "epoch": 4.172423321359491, + "grad_norm": 0.07320912927389145, + "learning_rate": 0.00017732277949005323, + "loss": 1.0477, + "step": 30200 + }, + { + "epoch": 4.186239292622272, + "grad_norm": 0.05703623965382576, + "learning_rate": 0.00017690249369571308, + "loss": 1.0443, + "step": 30300 + }, + { + "epoch": 4.2000552638850515, + "grad_norm": 0.04885410889983177, + "learning_rate": 0.00017648220790137292, + "loss": 1.0467, + "step": 30400 + }, + { + "epoch": 4.213871235147831, + "grad_norm": 0.10649748146533966, + "learning_rate": 0.00017606192210703277, + "loss": 1.0448, + "step": 30500 + }, + { + "epoch": 4.227687206410611, + "grad_norm": 0.05844441428780556, + "learning_rate": 0.0001756416363126926, + "loss": 1.044, + "step": 30600 + }, + { + "epoch": 4.2415031776733905, + "grad_norm": 0.07287675887346268, + "learning_rate": 0.00017522135051835244, + "loss": 1.0428, + "step": 30700 + }, + { + "epoch": 4.25531914893617, + "grad_norm": 0.05190150439739227, + "learning_rate": 0.00017480106472401232, + "loss": 1.0413, + "step": 30800 + }, + { + "epoch": 4.26913512019895, + "grad_norm": 0.06985218822956085, + "learning_rate": 0.00017438077892967217, + "loss": 1.0455, + "step": 30900 + }, + { + "epoch": 4.2829510914617295, + "grad_norm": 0.06930764764547348, + "learning_rate": 0.00017396049313533202, + "loss": 1.0444, + "step": 31000 + }, + { + "epoch": 4.296767062724509, + "grad_norm": 0.07905230671167374, + "learning_rate": 0.00017354020734099186, + "loss": 1.0445, + "step": 31100 + }, + { + "epoch": 4.31058303398729, + "grad_norm": 0.04994554817676544, + "learning_rate": 0.0001731199215466517, + "loss": 1.0432, + "step": 31200 + }, + { + "epoch": 4.324399005250069, + "grad_norm": 0.08036911487579346, + "learning_rate": 0.00017269963575231156, + "loss": 1.0424, + "step": 31300 + }, + { + "epoch": 4.338214976512849, + "grad_norm": 0.07251475006341934, + "learning_rate": 0.00017227934995797138, + "loss": 1.0465, + "step": 31400 + }, + { + "epoch": 4.352030947775629, + "grad_norm": 0.09622683376073837, + "learning_rate": 0.00017185906416363123, + "loss": 1.0441, + "step": 31500 + }, + { + "epoch": 4.365846919038408, + "grad_norm": 0.07545050978660583, + "learning_rate": 0.0001714387783692911, + "loss": 1.0423, + "step": 31600 + }, + { + "epoch": 4.379662890301188, + "grad_norm": 0.07171428948640823, + "learning_rate": 0.00017102269543289435, + "loss": 1.0434, + "step": 31700 + }, + { + "epoch": 4.393478861563968, + "grad_norm": 0.06658755987882614, + "learning_rate": 0.0001706024096385542, + "loss": 1.0415, + "step": 31800 + }, + { + "epoch": 4.407294832826747, + "grad_norm": 0.10734014213085175, + "learning_rate": 0.00017018212384421405, + "loss": 1.0406, + "step": 31900 + }, + { + "epoch": 4.421110804089528, + "grad_norm": 0.06358776986598969, + "learning_rate": 0.0001697618380498739, + "loss": 1.0405, + "step": 32000 + }, + { + "epoch": 4.434926775352308, + "grad_norm": 0.06078578904271126, + "learning_rate": 0.00016934155225553377, + "loss": 1.0458, + "step": 32100 + }, + { + "epoch": 4.448742746615087, + "grad_norm": 0.09674441814422607, + "learning_rate": 0.000168925469319137, + "loss": 1.0433, + "step": 32200 + }, + { + "epoch": 4.462558717877867, + "grad_norm": 0.11840452253818512, + "learning_rate": 0.00016850518352479684, + "loss": 1.0448, + "step": 32300 + }, + { + "epoch": 4.476374689140647, + 
"grad_norm": 0.08742488920688629, + "learning_rate": 0.0001680848977304567, + "loss": 1.0409, + "step": 32400 + }, + { + "epoch": 4.490190660403426, + "grad_norm": 0.09082327783107758, + "learning_rate": 0.00016766461193611654, + "loss": 1.0432, + "step": 32500 + }, + { + "epoch": 4.504006631666206, + "grad_norm": 0.06259270012378693, + "learning_rate": 0.0001672443261417764, + "loss": 1.0406, + "step": 32600 + }, + { + "epoch": 4.517822602928986, + "grad_norm": 0.06466669589281082, + "learning_rate": 0.00016682404034743626, + "loss": 1.0404, + "step": 32700 + }, + { + "epoch": 4.531638574191765, + "grad_norm": 0.07167832553386688, + "learning_rate": 0.0001664037545530961, + "loss": 1.0457, + "step": 32800 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.055970191955566406, + "learning_rate": 0.00016598346875875596, + "loss": 1.0433, + "step": 32900 + }, + { + "epoch": 4.5592705167173255, + "grad_norm": 0.05038364604115486, + "learning_rate": 0.00016556318296441578, + "loss": 1.0414, + "step": 33000 + }, + { + "epoch": 4.573086487980105, + "grad_norm": 0.11647244542837143, + "learning_rate": 0.00016514289717007563, + "loss": 1.0408, + "step": 33100 + }, + { + "epoch": 4.586902459242885, + "grad_norm": 0.08881094306707382, + "learning_rate": 0.00016472261137573548, + "loss": 1.0468, + "step": 33200 + }, + { + "epoch": 4.6007184305056645, + "grad_norm": 0.0706004872918129, + "learning_rate": 0.00016430232558139533, + "loss": 1.0433, + "step": 33300 + }, + { + "epoch": 4.614534401768444, + "grad_norm": 0.07594550400972366, + "learning_rate": 0.00016388203978705518, + "loss": 1.0401, + "step": 33400 + }, + { + "epoch": 4.628350373031224, + "grad_norm": 0.06709697842597961, + "learning_rate": 0.00016346175399271505, + "loss": 1.0406, + "step": 33500 + }, + { + "epoch": 4.6421663442940035, + "grad_norm": 0.055218733847141266, + "learning_rate": 0.0001630414681983749, + "loss": 1.0439, + "step": 33600 + }, + { + "epoch": 4.655982315556784, + "grad_norm": 0.09484557062387466, + "learning_rate": 0.00016262118240403475, + "loss": 1.0445, + "step": 33700 + }, + { + "epoch": 4.669798286819564, + "grad_norm": 0.08181110769510269, + "learning_rate": 0.00016220089660969457, + "loss": 1.0404, + "step": 33800 + }, + { + "epoch": 4.683614258082343, + "grad_norm": 0.07101566344499588, + "learning_rate": 0.00016178061081535442, + "loss": 1.0418, + "step": 33900 + }, + { + "epoch": 4.697430229345123, + "grad_norm": 0.07521411031484604, + "learning_rate": 0.00016136032502101427, + "loss": 1.0413, + "step": 34000 + }, + { + "epoch": 4.711246200607903, + "grad_norm": 0.06438640505075455, + "learning_rate": 0.00016094003922667412, + "loss": 1.0413, + "step": 34100 + }, + { + "epoch": 4.7250621718706824, + "grad_norm": 0.0852956548333168, + "learning_rate": 0.00016051975343233396, + "loss": 1.0411, + "step": 34200 + }, + { + "epoch": 4.738878143133462, + "grad_norm": 0.041669171303510666, + "learning_rate": 0.00016009946763799384, + "loss": 1.043, + "step": 34300 + }, + { + "epoch": 4.752694114396242, + "grad_norm": 0.07866424322128296, + "learning_rate": 0.0001596791818436537, + "loss": 1.0416, + "step": 34400 + }, + { + "epoch": 4.7665100856590215, + "grad_norm": 0.06820093840360641, + "learning_rate": 0.00015925889604931354, + "loss": 1.0419, + "step": 34500 + }, + { + "epoch": 4.780326056921801, + "grad_norm": 0.08769433945417404, + "learning_rate": 0.00015883861025497336, + "loss": 1.0436, + "step": 34600 + }, + { + "epoch": 4.794142028184582, + "grad_norm": 0.11472765356302261, + "learning_rate": 
0.0001584183244606332, + "loss": 1.0448, + "step": 34700 + }, + { + "epoch": 4.807957999447361, + "grad_norm": 0.10286398231983185, + "learning_rate": 0.00015799803866629305, + "loss": 1.0396, + "step": 34800 + }, + { + "epoch": 4.821773970710141, + "grad_norm": 0.08412828296422958, + "learning_rate": 0.0001575777528719529, + "loss": 1.0432, + "step": 34900 + }, + { + "epoch": 4.835589941972921, + "grad_norm": 0.06536369025707245, + "learning_rate": 0.00015715746707761275, + "loss": 1.0425, + "step": 35000 + }, + { + "epoch": 4.835589941972921, + "eval_accuracy": 0.5253784900927014, + "eval_loss": 1.0407328605651855, + "eval_runtime": 804.3369, + "eval_samples_per_second": 255.939, + "eval_steps_per_second": 7.999, + "step": 35000 + }, + { + "epoch": 4.8494059132357, + "grad_norm": 0.05366332083940506, + "learning_rate": 0.00015673718128327263, + "loss": 1.0401, + "step": 35100 + }, + { + "epoch": 4.86322188449848, + "grad_norm": 0.05627182498574257, + "learning_rate": 0.00015631689548893248, + "loss": 1.0413, + "step": 35200 + }, + { + "epoch": 4.87703785576126, + "grad_norm": 0.06880544126033783, + "learning_rate": 0.00015589660969459232, + "loss": 1.0399, + "step": 35300 + }, + { + "epoch": 4.89085382702404, + "grad_norm": 0.06326279044151306, + "learning_rate": 0.00015547632390025215, + "loss": 1.0424, + "step": 35400 + }, + { + "epoch": 4.90466979828682, + "grad_norm": 0.050615083426237106, + "learning_rate": 0.000155056038105912, + "loss": 1.0419, + "step": 35500 + }, + { + "epoch": 4.9184857695496, + "grad_norm": 0.09092865139245987, + "learning_rate": 0.00015463575231157184, + "loss": 1.0417, + "step": 35600 + }, + { + "epoch": 4.932301740812379, + "grad_norm": 0.10828616470098495, + "learning_rate": 0.0001542154665172317, + "loss": 1.0461, + "step": 35700 + }, + { + "epoch": 4.946117712075159, + "grad_norm": 0.10398013889789581, + "learning_rate": 0.00015379518072289154, + "loss": 1.0402, + "step": 35800 + }, + { + "epoch": 4.959933683337939, + "grad_norm": 0.060978490859270096, + "learning_rate": 0.00015337489492855142, + "loss": 1.0428, + "step": 35900 + }, + { + "epoch": 4.973749654600718, + "grad_norm": 0.09474412351846695, + "learning_rate": 0.00015295460913421126, + "loss": 1.0426, + "step": 36000 + }, + { + "epoch": 4.987565625863498, + "grad_norm": 0.055337630212306976, + "learning_rate": 0.0001525343233398711, + "loss": 1.0424, + "step": 36100 + }, + { + "epoch": 5.001381597126278, + "grad_norm": 0.062282662838697433, + "learning_rate": 0.00015211824040347433, + "loss": 1.0408, + "step": 36200 + }, + { + "epoch": 5.015197568389058, + "grad_norm": 0.08418793976306915, + "learning_rate": 0.00015169795460913418, + "loss": 1.0423, + "step": 36300 + }, + { + "epoch": 5.029013539651838, + "grad_norm": 0.056806761771440506, + "learning_rate": 0.00015127766881479403, + "loss": 1.0397, + "step": 36400 + }, + { + "epoch": 5.0428295109146175, + "grad_norm": 0.050782449543476105, + "learning_rate": 0.0001508573830204539, + "loss": 1.0397, + "step": 36500 + }, + { + "epoch": 5.056645482177397, + "grad_norm": 0.04436805471777916, + "learning_rate": 0.00015043709722611375, + "loss": 1.0372, + "step": 36600 + }, + { + "epoch": 5.070461453440177, + "grad_norm": 0.056697145104408264, + "learning_rate": 0.0001500168114317736, + "loss": 1.0396, + "step": 36700 + }, + { + "epoch": 5.0842774247029565, + "grad_norm": 0.0936078131198883, + "learning_rate": 0.00014959652563743342, + "loss": 1.0366, + "step": 36800 + }, + { + "epoch": 5.098093395965736, + "grad_norm": 0.058340467512607574, + 
"learning_rate": 0.0001491762398430933, + "loss": 1.038, + "step": 36900 + }, + { + "epoch": 5.111909367228516, + "grad_norm": 0.07920562475919724, + "learning_rate": 0.00014875595404875315, + "loss": 1.0389, + "step": 37000 + }, + { + "epoch": 5.1257253384912955, + "grad_norm": 0.054546140134334564, + "learning_rate": 0.000148335668254413, + "loss": 1.0352, + "step": 37100 + }, + { + "epoch": 5.139541309754076, + "grad_norm": 0.0779619961977005, + "learning_rate": 0.00014791538246007282, + "loss": 1.0362, + "step": 37200 + }, + { + "epoch": 5.153357281016856, + "grad_norm": 0.06077539920806885, + "learning_rate": 0.0001474950966657327, + "loss": 1.0395, + "step": 37300 + }, + { + "epoch": 5.167173252279635, + "grad_norm": 0.07015964388847351, + "learning_rate": 0.00014707481087139254, + "loss": 1.0378, + "step": 37400 + }, + { + "epoch": 5.180989223542415, + "grad_norm": 0.07821048051118851, + "learning_rate": 0.0001466545250770524, + "loss": 1.0358, + "step": 37500 + }, + { + "epoch": 5.194805194805195, + "grad_norm": 0.06446918845176697, + "learning_rate": 0.0001462342392827122, + "loss": 1.0401, + "step": 37600 + }, + { + "epoch": 5.208621166067974, + "grad_norm": 0.0754179060459137, + "learning_rate": 0.0001458139534883721, + "loss": 1.0372, + "step": 37700 + }, + { + "epoch": 5.222437137330754, + "grad_norm": 0.06225774064660072, + "learning_rate": 0.00014539366769403194, + "loss": 1.0396, + "step": 37800 + }, + { + "epoch": 5.236253108593534, + "grad_norm": 0.09567879885435104, + "learning_rate": 0.00014497338189969178, + "loss": 1.0427, + "step": 37900 + }, + { + "epoch": 5.250069079856313, + "grad_norm": 0.0810612216591835, + "learning_rate": 0.00014455309610535163, + "loss": 1.0368, + "step": 38000 + }, + { + "epoch": 5.263885051119094, + "grad_norm": 0.058250732719898224, + "learning_rate": 0.00014413281031101148, + "loss": 1.039, + "step": 38100 + }, + { + "epoch": 5.277701022381874, + "grad_norm": 0.07354842871427536, + "learning_rate": 0.00014371252451667133, + "loss": 1.0393, + "step": 38200 + }, + { + "epoch": 5.291516993644653, + "grad_norm": 0.04756517335772514, + "learning_rate": 0.00014329223872233118, + "loss": 1.0369, + "step": 38300 + }, + { + "epoch": 5.305332964907433, + "grad_norm": 0.05551883205771446, + "learning_rate": 0.00014287195292799103, + "loss": 1.038, + "step": 38400 + }, + { + "epoch": 5.319148936170213, + "grad_norm": 0.05476289987564087, + "learning_rate": 0.00014245166713365088, + "loss": 1.0391, + "step": 38500 + }, + { + "epoch": 5.332964907432992, + "grad_norm": 0.041929882019758224, + "learning_rate": 0.00014203138133931072, + "loss": 1.0377, + "step": 38600 + }, + { + "epoch": 5.346780878695772, + "grad_norm": 0.05916072428226471, + "learning_rate": 0.00014161109554497057, + "loss": 1.0417, + "step": 38700 + }, + { + "epoch": 5.360596849958552, + "grad_norm": 0.0609772689640522, + "learning_rate": 0.00014119080975063042, + "loss": 1.0386, + "step": 38800 + }, + { + "epoch": 5.374412821221332, + "grad_norm": 0.06430498510599136, + "learning_rate": 0.00014077052395629027, + "loss": 1.0397, + "step": 38900 + }, + { + "epoch": 5.388228792484112, + "grad_norm": 0.07042800635099411, + "learning_rate": 0.00014035023816195012, + "loss": 1.038, + "step": 39000 + }, + { + "epoch": 5.402044763746892, + "grad_norm": 0.05623612925410271, + "learning_rate": 0.00013992995236760997, + "loss": 1.0405, + "step": 39100 + }, + { + "epoch": 5.415860735009671, + "grad_norm": 0.04936366528272629, + "learning_rate": 0.00013950966657326982, + "loss": 1.0404, + "step": 
39200 + }, + { + "epoch": 5.429676706272451, + "grad_norm": 0.05738508701324463, + "learning_rate": 0.00013908938077892966, + "loss": 1.0364, + "step": 39300 + }, + { + "epoch": 5.443492677535231, + "grad_norm": 0.09567712992429733, + "learning_rate": 0.0001386690949845895, + "loss": 1.0381, + "step": 39400 + }, + { + "epoch": 5.45730864879801, + "grad_norm": 0.07306545972824097, + "learning_rate": 0.00013824880919024936, + "loss": 1.0394, + "step": 39500 + }, + { + "epoch": 5.47112462006079, + "grad_norm": 0.060108475387096405, + "learning_rate": 0.0001378285233959092, + "loss": 1.0379, + "step": 39600 + }, + { + "epoch": 5.48494059132357, + "grad_norm": 0.08150669932365417, + "learning_rate": 0.00013740823760156906, + "loss": 1.0391, + "step": 39700 + }, + { + "epoch": 5.49875656258635, + "grad_norm": 0.06265643239021301, + "learning_rate": 0.0001369879518072289, + "loss": 1.0419, + "step": 39800 + }, + { + "epoch": 5.51257253384913, + "grad_norm": 0.09023050218820572, + "learning_rate": 0.00013656766601288876, + "loss": 1.0374, + "step": 39900 + }, + { + "epoch": 5.5263885051119095, + "grad_norm": 0.06600885838270187, + "learning_rate": 0.0001361473802185486, + "loss": 1.0365, + "step": 40000 + }, + { + "epoch": 5.5263885051119095, + "eval_accuracy": 0.52706640122358, + "eval_loss": 1.0380040407180786, + "eval_runtime": 773.4583, + "eval_samples_per_second": 266.157, + "eval_steps_per_second": 8.318, + "step": 40000 + }, + { + "epoch": 5.540204476374689, + "grad_norm": 0.07041644304990768, + "learning_rate": 0.00013572709442420845, + "loss": 1.038, + "step": 40100 + }, + { + "epoch": 5.554020447637469, + "grad_norm": 0.0819341391324997, + "learning_rate": 0.0001353110114878117, + "loss": 1.0383, + "step": 40200 + }, + { + "epoch": 5.5678364189002485, + "grad_norm": 0.04390214383602142, + "learning_rate": 0.00013489072569347155, + "loss": 1.0381, + "step": 40300 + }, + { + "epoch": 5.581652390163028, + "grad_norm": 0.0681944414973259, + "learning_rate": 0.0001344704398991314, + "loss": 1.0368, + "step": 40400 + }, + { + "epoch": 5.595468361425809, + "grad_norm": 0.0888848677277565, + "learning_rate": 0.00013405015410479124, + "loss": 1.0369, + "step": 40500 + }, + { + "epoch": 5.609284332688588, + "grad_norm": 0.07275230437517166, + "learning_rate": 0.0001336298683104511, + "loss": 1.0353, + "step": 40600 + }, + { + "epoch": 5.623100303951368, + "grad_norm": 0.10200846940279007, + "learning_rate": 0.00013320958251611094, + "loss": 1.0381, + "step": 40700 + }, + { + "epoch": 5.636916275214148, + "grad_norm": 0.056480832397937775, + "learning_rate": 0.0001327892967217708, + "loss": 1.0383, + "step": 40800 + }, + { + "epoch": 5.650732246476927, + "grad_norm": 0.0845484584569931, + "learning_rate": 0.00013236901092743064, + "loss": 1.0385, + "step": 40900 + }, + { + "epoch": 5.664548217739707, + "grad_norm": 0.05990500748157501, + "learning_rate": 0.0001319487251330905, + "loss": 1.0381, + "step": 41000 + }, + { + "epoch": 5.678364189002487, + "grad_norm": 0.04566818103194237, + "learning_rate": 0.00013152843933875034, + "loss": 1.0409, + "step": 41100 + }, + { + "epoch": 5.692180160265266, + "grad_norm": 0.05529521405696869, + "learning_rate": 0.00013110815354441018, + "loss": 1.039, + "step": 41200 + }, + { + "epoch": 5.705996131528046, + "grad_norm": 0.08812158554792404, + "learning_rate": 0.00013068786775007003, + "loss": 1.0393, + "step": 41300 + }, + { + "epoch": 5.719812102790826, + "grad_norm": 0.0714721605181694, + "learning_rate": 0.00013026758195572988, + "loss": 1.0365, + "step": 
41400 + }, + { + "epoch": 5.733628074053606, + "grad_norm": 0.050889432430267334, + "learning_rate": 0.00012984729616138973, + "loss": 1.0399, + "step": 41500 + }, + { + "epoch": 5.747444045316386, + "grad_norm": 0.05863107368350029, + "learning_rate": 0.00012942701036704958, + "loss": 1.0401, + "step": 41600 + }, + { + "epoch": 5.761260016579166, + "grad_norm": 0.05279000476002693, + "learning_rate": 0.00012900672457270943, + "loss": 1.0368, + "step": 41700 + }, + { + "epoch": 5.775075987841945, + "grad_norm": 0.06430874019861221, + "learning_rate": 0.00012858643877836928, + "loss": 1.0347, + "step": 41800 + }, + { + "epoch": 5.788891959104725, + "grad_norm": 0.1187288910150528, + "learning_rate": 0.00012816615298402912, + "loss": 1.0372, + "step": 41900 + }, + { + "epoch": 5.802707930367505, + "grad_norm": 0.05984746664762497, + "learning_rate": 0.00012774586718968897, + "loss": 1.036, + "step": 42000 + }, + { + "epoch": 5.816523901630284, + "grad_norm": 0.047202371060848236, + "learning_rate": 0.00012732558139534882, + "loss": 1.0341, + "step": 42100 + }, + { + "epoch": 5.830339872893065, + "grad_norm": 0.0888022631406784, + "learning_rate": 0.00012690949845895207, + "loss": 1.0358, + "step": 42200 + }, + { + "epoch": 5.8441558441558445, + "grad_norm": 0.071753591299057, + "learning_rate": 0.00012648921266461191, + "loss": 1.0356, + "step": 42300 + }, + { + "epoch": 5.857971815418624, + "grad_norm": 0.06311481446027756, + "learning_rate": 0.0001260689268702718, + "loss": 1.0381, + "step": 42400 + }, + { + "epoch": 5.871787786681404, + "grad_norm": 0.05733519420027733, + "learning_rate": 0.0001256486410759316, + "loss": 1.0366, + "step": 42500 + }, + { + "epoch": 5.885603757944184, + "grad_norm": 0.05296749621629715, + "learning_rate": 0.00012522835528159146, + "loss": 1.0391, + "step": 42600 + }, + { + "epoch": 5.899419729206963, + "grad_norm": 0.05728083476424217, + "learning_rate": 0.0001248080694872513, + "loss": 1.0393, + "step": 42700 + }, + { + "epoch": 5.913235700469743, + "grad_norm": 0.10918726772069931, + "learning_rate": 0.00012438778369291118, + "loss": 1.0375, + "step": 42800 + }, + { + "epoch": 5.927051671732523, + "grad_norm": 0.043641045689582825, + "learning_rate": 0.000123967497898571, + "loss": 1.0342, + "step": 42900 + }, + { + "epoch": 5.940867642995302, + "grad_norm": 0.07793564349412918, + "learning_rate": 0.00012354721210423085, + "loss": 1.037, + "step": 43000 + }, + { + "epoch": 5.954683614258082, + "grad_norm": 0.10596407949924469, + "learning_rate": 0.0001231269263098907, + "loss": 1.0361, + "step": 43100 + }, + { + "epoch": 5.9684995855208625, + "grad_norm": 0.05018968880176544, + "learning_rate": 0.00012270664051555058, + "loss": 1.0352, + "step": 43200 + }, + { + "epoch": 5.982315556783642, + "grad_norm": 0.06663347035646439, + "learning_rate": 0.0001222863547212104, + "loss": 1.0379, + "step": 43300 + }, + { + "epoch": 5.996131528046422, + "grad_norm": 0.05061174929141998, + "learning_rate": 0.00012186606892687026, + "loss": 1.0378, + "step": 43400 + }, + { + "epoch": 6.0099474993092015, + "grad_norm": 0.07496211677789688, + "learning_rate": 0.00012144578313253011, + "loss": 1.0357, + "step": 43500 + }, + { + "epoch": 6.023763470571981, + "grad_norm": 0.058973684906959534, + "learning_rate": 0.00012102549733818996, + "loss": 1.0336, + "step": 43600 + }, + { + "epoch": 6.037579441834761, + "grad_norm": 0.07304850965738297, + "learning_rate": 0.0001206052115438498, + "loss": 1.0366, + "step": 43700 + }, + { + "epoch": 6.0513954130975405, + "grad_norm": 
0.05964922904968262, + "learning_rate": 0.00012018492574950966, + "loss": 1.0358, + "step": 43800 + }, + { + "epoch": 6.06521138436032, + "grad_norm": 0.10107408463954926, + "learning_rate": 0.0001197646399551695, + "loss": 1.0363, + "step": 43900 + }, + { + "epoch": 6.079027355623101, + "grad_norm": 0.05830320343375206, + "learning_rate": 0.00011934435416082935, + "loss": 1.0374, + "step": 44000 + }, + { + "epoch": 6.09284332688588, + "grad_norm": 0.06493101269006729, + "learning_rate": 0.00011892406836648919, + "loss": 1.0358, + "step": 44100 + }, + { + "epoch": 6.10665929814866, + "grad_norm": 0.06381756067276001, + "learning_rate": 0.00011850798543009245, + "loss": 1.0345, + "step": 44200 + }, + { + "epoch": 6.12047526941144, + "grad_norm": 0.057328786700963974, + "learning_rate": 0.0001180876996357523, + "loss": 1.0347, + "step": 44300 + }, + { + "epoch": 6.134291240674219, + "grad_norm": 0.09036822617053986, + "learning_rate": 0.00011766741384141216, + "loss": 1.0352, + "step": 44400 + }, + { + "epoch": 6.148107211936999, + "grad_norm": 0.05485937371850014, + "learning_rate": 0.000117247128047072, + "loss": 1.0371, + "step": 44500 + }, + { + "epoch": 6.161923183199779, + "grad_norm": 0.06304465979337692, + "learning_rate": 0.00011682684225273184, + "loss": 1.0302, + "step": 44600 + }, + { + "epoch": 6.175739154462558, + "grad_norm": 0.045126065611839294, + "learning_rate": 0.0001164065564583917, + "loss": 1.0338, + "step": 44700 + }, + { + "epoch": 6.189555125725338, + "grad_norm": 0.06636038422584534, + "learning_rate": 0.00011598627066405155, + "loss": 1.0353, + "step": 44800 + }, + { + "epoch": 6.203371096988119, + "grad_norm": 0.05977385491132736, + "learning_rate": 0.00011556598486971139, + "loss": 1.0346, + "step": 44900 + }, + { + "epoch": 6.217187068250898, + "grad_norm": 0.07459376752376556, + "learning_rate": 0.00011514569907537124, + "loss": 1.0325, + "step": 45000 + }, + { + "epoch": 6.217187068250898, + "eval_accuracy": 0.5284276106869993, + "eval_loss": 1.0360603332519531, + "eval_runtime": 770.702, + "eval_samples_per_second": 267.108, + "eval_steps_per_second": 8.348, + "step": 45000 + }, + { + "epoch": 6.231003039513678, + "grad_norm": 0.050757069140672684, + "learning_rate": 0.0001147254132810311, + "loss": 1.0337, + "step": 45100 + }, + { + "epoch": 6.244819010776458, + "grad_norm": 0.065644271671772, + "learning_rate": 0.00011430512748669095, + "loss": 1.035, + "step": 45200 + }, + { + "epoch": 6.258634982039237, + "grad_norm": 0.06008651480078697, + "learning_rate": 0.00011388484169235078, + "loss": 1.0323, + "step": 45300 + }, + { + "epoch": 6.272450953302017, + "grad_norm": 0.050868868827819824, + "learning_rate": 0.00011346455589801063, + "loss": 1.0341, + "step": 45400 + }, + { + "epoch": 6.286266924564797, + "grad_norm": 0.0535401850938797, + "learning_rate": 0.00011304427010367049, + "loss": 1.0349, + "step": 45500 + }, + { + "epoch": 6.300082895827576, + "grad_norm": 0.07083383947610855, + "learning_rate": 0.00011262398430933034, + "loss": 1.0327, + "step": 45600 + }, + { + "epoch": 6.313898867090357, + "grad_norm": 0.06998474150896072, + "learning_rate": 0.00011220369851499018, + "loss": 1.035, + "step": 45700 + }, + { + "epoch": 6.3277148383531365, + "grad_norm": 0.06696050614118576, + "learning_rate": 0.00011178341272065002, + "loss": 1.0342, + "step": 45800 + }, + { + "epoch": 6.341530809615916, + "grad_norm": 0.050143785774707794, + "learning_rate": 0.00011136312692630989, + "loss": 1.0342, + "step": 45900 + }, + { + "epoch": 6.355346780878696, + 
"grad_norm": 0.066258005797863, + "learning_rate": 0.00011094284113196974, + "loss": 1.0368, + "step": 46000 + }, + { + "epoch": 6.3691627521414755, + "grad_norm": 0.057613175362348557, + "learning_rate": 0.00011052255533762957, + "loss": 1.0357, + "step": 46100 + }, + { + "epoch": 6.382978723404255, + "grad_norm": 0.07405593246221542, + "learning_rate": 0.00011010647240123283, + "loss": 1.033, + "step": 46200 + }, + { + "epoch": 6.396794694667035, + "grad_norm": 0.07005150616168976, + "learning_rate": 0.00010968618660689268, + "loss": 1.0329, + "step": 46300 + }, + { + "epoch": 6.4106106659298145, + "grad_norm": 0.057546067982912064, + "learning_rate": 0.00010926590081255253, + "loss": 1.033, + "step": 46400 + }, + { + "epoch": 6.424426637192594, + "grad_norm": 0.08016248792409897, + "learning_rate": 0.00010884561501821236, + "loss": 1.0389, + "step": 46500 + }, + { + "epoch": 6.438242608455375, + "grad_norm": 0.08346617966890335, + "learning_rate": 0.00010842532922387222, + "loss": 1.0332, + "step": 46600 + }, + { + "epoch": 6.452058579718154, + "grad_norm": 0.048157453536987305, + "learning_rate": 0.00010800504342953207, + "loss": 1.0342, + "step": 46700 + }, + { + "epoch": 6.465874550980934, + "grad_norm": 0.06816009432077408, + "learning_rate": 0.00010758475763519192, + "loss": 1.0357, + "step": 46800 + }, + { + "epoch": 6.479690522243714, + "grad_norm": 0.05210613086819649, + "learning_rate": 0.00010716447184085176, + "loss": 1.0345, + "step": 46900 + }, + { + "epoch": 6.4935064935064934, + "grad_norm": 0.08138227462768555, + "learning_rate": 0.00010674418604651162, + "loss": 1.035, + "step": 47000 + }, + { + "epoch": 6.507322464769273, + "grad_norm": 0.07494477927684784, + "learning_rate": 0.00010632390025217147, + "loss": 1.0361, + "step": 47100 + }, + { + "epoch": 6.521138436032053, + "grad_norm": 0.07473413646221161, + "learning_rate": 0.00010590361445783132, + "loss": 1.0339, + "step": 47200 + }, + { + "epoch": 6.5349544072948325, + "grad_norm": 0.07200802862644196, + "learning_rate": 0.00010548332866349115, + "loss": 1.0333, + "step": 47300 + }, + { + "epoch": 6.548770378557613, + "grad_norm": 0.06346756964921951, + "learning_rate": 0.00010506304286915101, + "loss": 1.0345, + "step": 47400 + }, + { + "epoch": 6.562586349820393, + "grad_norm": 0.06382066756486893, + "learning_rate": 0.00010464275707481086, + "loss": 1.0352, + "step": 47500 + }, + { + "epoch": 6.576402321083172, + "grad_norm": 0.1000475063920021, + "learning_rate": 0.00010422247128047071, + "loss": 1.0344, + "step": 47600 + }, + { + "epoch": 6.590218292345952, + "grad_norm": 0.06456384807825089, + "learning_rate": 0.00010380218548613057, + "loss": 1.0356, + "step": 47700 + }, + { + "epoch": 6.604034263608732, + "grad_norm": 0.052929963916540146, + "learning_rate": 0.0001033818996917904, + "loss": 1.0343, + "step": 47800 + }, + { + "epoch": 6.617850234871511, + "grad_norm": 0.07275223731994629, + "learning_rate": 0.00010296161389745025, + "loss": 1.033, + "step": 47900 + }, + { + "epoch": 6.631666206134291, + "grad_norm": 0.060610584914684296, + "learning_rate": 0.0001025413281031101, + "loss": 1.0334, + "step": 48000 + }, + { + "epoch": 6.645482177397071, + "grad_norm": 0.0514766089618206, + "learning_rate": 0.00010212104230876997, + "loss": 1.0351, + "step": 48100 + }, + { + "epoch": 6.65929814865985, + "grad_norm": 0.08950326591730118, + "learning_rate": 0.0001017049593723732, + "loss": 1.0341, + "step": 48200 + }, + { + "epoch": 6.673114119922631, + "grad_norm": 0.052268847823143005, + "learning_rate": 
0.00010128467357803306, + "loss": 1.0342, + "step": 48300 + }, + { + "epoch": 6.686930091185411, + "grad_norm": 0.059182267636060715, + "learning_rate": 0.00010086438778369291, + "loss": 1.0303, + "step": 48400 + }, + { + "epoch": 6.70074606244819, + "grad_norm": 0.06220945715904236, + "learning_rate": 0.00010044410198935274, + "loss": 1.032, + "step": 48500 + }, + { + "epoch": 6.71456203371097, + "grad_norm": 0.0486241914331913, + "learning_rate": 0.00010002381619501259, + "loss": 1.0338, + "step": 48600 + }, + { + "epoch": 6.72837800497375, + "grad_norm": 0.04813262075185776, + "learning_rate": 9.960353040067245e-05, + "loss": 1.0344, + "step": 48700 + }, + { + "epoch": 6.742193976236529, + "grad_norm": 0.04981222748756409, + "learning_rate": 9.91832446063323e-05, + "loss": 1.0347, + "step": 48800 + }, + { + "epoch": 6.756009947499309, + "grad_norm": 0.050560541450977325, + "learning_rate": 9.876295881199214e-05, + "loss": 1.0338, + "step": 48900 + }, + { + "epoch": 6.769825918762089, + "grad_norm": 0.05338674411177635, + "learning_rate": 9.834267301765199e-05, + "loss": 1.0369, + "step": 49000 + }, + { + "epoch": 6.783641890024869, + "grad_norm": 0.042156435549259186, + "learning_rate": 9.792238722331185e-05, + "loss": 1.0345, + "step": 49100 + }, + { + "epoch": 6.797457861287649, + "grad_norm": 0.0622396394610405, + "learning_rate": 9.75021014289717e-05, + "loss": 1.0321, + "step": 49200 + }, + { + "epoch": 6.8112738325504285, + "grad_norm": 0.08523661643266678, + "learning_rate": 9.708181563463155e-05, + "loss": 1.0317, + "step": 49300 + }, + { + "epoch": 6.825089803813208, + "grad_norm": 0.055176641792058945, + "learning_rate": 9.666152984029138e-05, + "loss": 1.0368, + "step": 49400 + }, + { + "epoch": 6.838905775075988, + "grad_norm": 0.07358380407094955, + "learning_rate": 9.624124404595124e-05, + "loss": 1.0318, + "step": 49500 + }, + { + "epoch": 6.8527217463387675, + "grad_norm": 0.055568769574165344, + "learning_rate": 9.582095825161109e-05, + "loss": 1.0343, + "step": 49600 + }, + { + "epoch": 6.866537717601547, + "grad_norm": 0.04249552637338638, + "learning_rate": 9.540067245727094e-05, + "loss": 1.0331, + "step": 49700 + }, + { + "epoch": 6.880353688864327, + "grad_norm": 0.05274058133363724, + "learning_rate": 9.498038666293077e-05, + "loss": 1.0351, + "step": 49800 + }, + { + "epoch": 6.8941696601271065, + "grad_norm": 0.04792112484574318, + "learning_rate": 9.456010086859064e-05, + "loss": 1.0333, + "step": 49900 + }, + { + "epoch": 6.907985631389887, + "grad_norm": 0.05513302981853485, + "learning_rate": 9.413981507425049e-05, + "loss": 1.0322, + "step": 50000 + }, + { + "epoch": 6.907985631389887, + "eval_accuracy": 0.5296076152096916, + "eval_loss": 1.0341060161590576, + "eval_runtime": 725.8939, + "eval_samples_per_second": 283.597, + "eval_steps_per_second": 8.864, + "step": 50000 + }, + { + "epoch": 6.921801602652667, + "grad_norm": 0.05296773836016655, + "learning_rate": 9.371952927991033e-05, + "loss": 1.031, + "step": 50100 + }, + { + "epoch": 6.935617573915446, + "grad_norm": 0.062248583883047104, + "learning_rate": 9.330344634351358e-05, + "loss": 1.0341, + "step": 50200 + }, + { + "epoch": 6.949433545178226, + "grad_norm": 0.07751675695180893, + "learning_rate": 9.288316054917343e-05, + "loss": 1.0352, + "step": 50300 + }, + { + "epoch": 6.963249516441006, + "grad_norm": 0.04984898492693901, + "learning_rate": 9.246287475483328e-05, + "loss": 1.0302, + "step": 50400 + }, + { + "epoch": 6.977065487703785, + "grad_norm": 0.04315504804253578, + "learning_rate": 
9.204258896049314e-05, + "loss": 1.0327, + "step": 50500 + }, + { + "epoch": 6.990881458966565, + "grad_norm": 0.053620435297489166, + "learning_rate": 9.162230316615297e-05, + "loss": 1.0328, + "step": 50600 + }, + { + "epoch": 7.004697430229345, + "grad_norm": 0.04611975699663162, + "learning_rate": 9.120201737181282e-05, + "loss": 1.0336, + "step": 50700 + }, + { + "epoch": 7.018513401492125, + "grad_norm": 0.04269848018884659, + "learning_rate": 9.078173157747267e-05, + "loss": 1.0282, + "step": 50800 + }, + { + "epoch": 7.032329372754905, + "grad_norm": 0.055365532636642456, + "learning_rate": 9.036144578313253e-05, + "loss": 1.0339, + "step": 50900 + }, + { + "epoch": 7.046145344017685, + "grad_norm": 0.06129321828484535, + "learning_rate": 8.994115998879237e-05, + "loss": 1.0304, + "step": 51000 + }, + { + "epoch": 7.059961315280464, + "grad_norm": 0.06094348803162575, + "learning_rate": 8.952507705239563e-05, + "loss": 1.0288, + "step": 51100 + }, + { + "epoch": 7.073777286543244, + "grad_norm": 0.048849135637283325, + "learning_rate": 8.910479125805548e-05, + "loss": 1.0322, + "step": 51200 + }, + { + "epoch": 7.087593257806024, + "grad_norm": 0.05081125721335411, + "learning_rate": 8.868450546371531e-05, + "loss": 1.0303, + "step": 51300 + }, + { + "epoch": 7.101409229068803, + "grad_norm": 0.07727497071027756, + "learning_rate": 8.826421966937516e-05, + "loss": 1.03, + "step": 51400 + }, + { + "epoch": 7.115225200331583, + "grad_norm": 0.06357153505086899, + "learning_rate": 8.784393387503502e-05, + "loss": 1.0342, + "step": 51500 + }, + { + "epoch": 7.129041171594363, + "grad_norm": 0.05598052963614464, + "learning_rate": 8.742364808069487e-05, + "loss": 1.0312, + "step": 51600 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.06753697246313095, + "learning_rate": 8.70033622863547e-05, + "loss": 1.0306, + "step": 51700 + }, + { + "epoch": 7.156673114119923, + "grad_norm": 0.06586912274360657, + "learning_rate": 8.658307649201455e-05, + "loss": 1.0311, + "step": 51800 + }, + { + "epoch": 7.170489085382703, + "grad_norm": 0.10361455380916595, + "learning_rate": 8.616279069767442e-05, + "loss": 1.0326, + "step": 51900 + }, + { + "epoch": 7.184305056645482, + "grad_norm": 0.09442713856697083, + "learning_rate": 8.574250490333426e-05, + "loss": 1.0339, + "step": 52000 + }, + { + "epoch": 7.198121027908262, + "grad_norm": 0.08114325255155563, + "learning_rate": 8.532221910899411e-05, + "loss": 1.0335, + "step": 52100 + }, + { + "epoch": 7.211936999171042, + "grad_norm": 0.054252710193395615, + "learning_rate": 8.490193331465395e-05, + "loss": 1.0316, + "step": 52200 + }, + { + "epoch": 7.225752970433821, + "grad_norm": 0.059643086045980453, + "learning_rate": 8.448164752031381e-05, + "loss": 1.027, + "step": 52300 + }, + { + "epoch": 7.239568941696601, + "grad_norm": 0.045472096651792526, + "learning_rate": 8.406136172597366e-05, + "loss": 1.0311, + "step": 52400 + }, + { + "epoch": 7.2533849129593815, + "grad_norm": 0.0669686570763588, + "learning_rate": 8.36410759316335e-05, + "loss": 1.0309, + "step": 52500 + }, + { + "epoch": 7.267200884222161, + "grad_norm": 0.0454520583152771, + "learning_rate": 8.322079013729334e-05, + "loss": 1.0327, + "step": 52600 + }, + { + "epoch": 7.281016855484941, + "grad_norm": 0.05776028707623482, + "learning_rate": 8.28005043429532e-05, + "loss": 1.0318, + "step": 52700 + }, + { + "epoch": 7.2948328267477205, + "grad_norm": 0.051905229687690735, + "learning_rate": 8.238021854861305e-05, + "loss": 1.0313, + "step": 52800 + }, + { + "epoch": 
7.3086487980105, + "grad_norm": 0.056912437081336975, + "learning_rate": 8.19599327542729e-05, + "loss": 1.0325, + "step": 52900 + }, + { + "epoch": 7.32246476927328, + "grad_norm": 0.04940250515937805, + "learning_rate": 8.153964695993274e-05, + "loss": 1.0323, + "step": 53000 + }, + { + "epoch": 7.3362807405360595, + "grad_norm": 0.04186444729566574, + "learning_rate": 8.11193611655926e-05, + "loss": 1.0285, + "step": 53100 + }, + { + "epoch": 7.350096711798839, + "grad_norm": 0.041809357702732086, + "learning_rate": 8.069907537125245e-05, + "loss": 1.0289, + "step": 53200 + }, + { + "epoch": 7.363912683061619, + "grad_norm": 0.05794375389814377, + "learning_rate": 8.02787895769123e-05, + "loss": 1.031, + "step": 53300 + }, + { + "epoch": 7.377728654324399, + "grad_norm": 0.08333911001682281, + "learning_rate": 7.985850378257213e-05, + "loss": 1.0316, + "step": 53400 + }, + { + "epoch": 7.391544625587179, + "grad_norm": 0.06473658233880997, + "learning_rate": 7.943821798823199e-05, + "loss": 1.0317, + "step": 53500 + }, + { + "epoch": 7.405360596849959, + "grad_norm": 0.05173886939883232, + "learning_rate": 7.901793219389184e-05, + "loss": 1.0308, + "step": 53600 + }, + { + "epoch": 7.419176568112738, + "grad_norm": 0.06362345069646835, + "learning_rate": 7.859764639955169e-05, + "loss": 1.0324, + "step": 53700 + }, + { + "epoch": 7.432992539375518, + "grad_norm": 0.054053716361522675, + "learning_rate": 7.817736060521152e-05, + "loss": 1.0303, + "step": 53800 + }, + { + "epoch": 7.446808510638298, + "grad_norm": 0.048420459032058716, + "learning_rate": 7.775707481087139e-05, + "loss": 1.0299, + "step": 53900 + }, + { + "epoch": 7.460624481901077, + "grad_norm": 0.0606950968503952, + "learning_rate": 7.733678901653123e-05, + "loss": 1.0317, + "step": 54000 + }, + { + "epoch": 7.474440453163857, + "grad_norm": 0.06072583049535751, + "learning_rate": 7.691650322219108e-05, + "loss": 1.033, + "step": 54100 + }, + { + "epoch": 7.488256424426638, + "grad_norm": 0.05064817890524864, + "learning_rate": 7.649621742785093e-05, + "loss": 1.0287, + "step": 54200 + }, + { + "epoch": 7.502072395689417, + "grad_norm": 0.09318757057189941, + "learning_rate": 7.607593163351078e-05, + "loss": 1.0296, + "step": 54300 + }, + { + "epoch": 7.515888366952197, + "grad_norm": 0.0935215950012207, + "learning_rate": 7.565564583917063e-05, + "loss": 1.0322, + "step": 54400 + }, + { + "epoch": 7.529704338214977, + "grad_norm": 0.07255256175994873, + "learning_rate": 7.523536004483048e-05, + "loss": 1.0333, + "step": 54500 + }, + { + "epoch": 7.543520309477756, + "grad_norm": 0.05486008897423744, + "learning_rate": 7.481507425049033e-05, + "loss": 1.032, + "step": 54600 + }, + { + "epoch": 7.557336280740536, + "grad_norm": 0.0525212287902832, + "learning_rate": 7.439478845615017e-05, + "loss": 1.0293, + "step": 54700 + }, + { + "epoch": 7.571152252003316, + "grad_norm": 0.047569695860147476, + "learning_rate": 7.397450266181002e-05, + "loss": 1.0282, + "step": 54800 + }, + { + "epoch": 7.584968223266095, + "grad_norm": 0.06165711581707001, + "learning_rate": 7.355421686746987e-05, + "loss": 1.0312, + "step": 54900 + }, + { + "epoch": 7.598784194528875, + "grad_norm": 0.0578945092856884, + "learning_rate": 7.313393107312972e-05, + "loss": 1.0307, + "step": 55000 + }, + { + "epoch": 7.598784194528875, + "eval_accuracy": 0.5305025000901846, + "eval_loss": 1.0327985286712646, + "eval_runtime": 731.5754, + "eval_samples_per_second": 281.394, + "eval_steps_per_second": 8.795, + "step": 55000 + }, + { + "epoch": 
7.612600165791655, + "grad_norm": 0.0795338973402977, + "learning_rate": 7.271784813673297e-05, + "loss": 1.0294, + "step": 55100 + }, + { + "epoch": 7.626416137054435, + "grad_norm": 0.06103779003024101, + "learning_rate": 7.229756234239283e-05, + "loss": 1.033, + "step": 55200 + }, + { + "epoch": 7.640232108317215, + "grad_norm": 0.0635315552353859, + "learning_rate": 7.187727654805266e-05, + "loss": 1.0296, + "step": 55300 + }, + { + "epoch": 7.654048079579995, + "grad_norm": 0.05289231240749359, + "learning_rate": 7.145699075371253e-05, + "loss": 1.034, + "step": 55400 + }, + { + "epoch": 7.667864050842774, + "grad_norm": 0.07801427692174911, + "learning_rate": 7.103670495937236e-05, + "loss": 1.0332, + "step": 55500 + }, + { + "epoch": 7.681680022105554, + "grad_norm": 0.07564268261194229, + "learning_rate": 7.061641916503222e-05, + "loss": 1.0299, + "step": 55600 + }, + { + "epoch": 7.695495993368334, + "grad_norm": 0.04168133810162544, + "learning_rate": 7.019613337069206e-05, + "loss": 1.03, + "step": 55700 + }, + { + "epoch": 7.709311964631113, + "grad_norm": 0.11210035532712936, + "learning_rate": 6.977584757635192e-05, + "loss": 1.0301, + "step": 55800 + }, + { + "epoch": 7.723127935893894, + "grad_norm": 0.09023060649633408, + "learning_rate": 6.935556178201175e-05, + "loss": 1.0285, + "step": 55900 + }, + { + "epoch": 7.7369439071566735, + "grad_norm": 0.05271260067820549, + "learning_rate": 6.893527598767162e-05, + "loss": 1.0315, + "step": 56000 + }, + { + "epoch": 7.750759878419453, + "grad_norm": 0.06293012201786041, + "learning_rate": 6.851499019333145e-05, + "loss": 1.0286, + "step": 56100 + }, + { + "epoch": 7.764575849682233, + "grad_norm": 0.04555558040738106, + "learning_rate": 6.809470439899131e-05, + "loss": 1.0308, + "step": 56200 + }, + { + "epoch": 7.7783918209450125, + "grad_norm": 0.042364273220300674, + "learning_rate": 6.767441860465115e-05, + "loss": 1.0311, + "step": 56300 + }, + { + "epoch": 7.792207792207792, + "grad_norm": 0.05084213241934776, + "learning_rate": 6.725413281031101e-05, + "loss": 1.0298, + "step": 56400 + }, + { + "epoch": 7.806023763470572, + "grad_norm": 0.059168051928281784, + "learning_rate": 6.683384701597085e-05, + "loss": 1.0303, + "step": 56500 + }, + { + "epoch": 7.8198397347333515, + "grad_norm": 0.05535740405321121, + "learning_rate": 6.641356122163071e-05, + "loss": 1.0306, + "step": 56600 + }, + { + "epoch": 7.833655705996131, + "grad_norm": 0.06625715643167496, + "learning_rate": 6.599327542729054e-05, + "loss": 1.0283, + "step": 56700 + }, + { + "epoch": 7.847471677258911, + "grad_norm": 0.04644458368420601, + "learning_rate": 6.55729896329504e-05, + "loss": 1.0289, + "step": 56800 + }, + { + "epoch": 7.861287648521691, + "grad_norm": 0.05319574847817421, + "learning_rate": 6.515270383861024e-05, + "loss": 1.0303, + "step": 56900 + }, + { + "epoch": 7.875103619784471, + "grad_norm": 0.06394356489181519, + "learning_rate": 6.47324180442701e-05, + "loss": 1.0315, + "step": 57000 + }, + { + "epoch": 7.888919591047251, + "grad_norm": 0.0535539835691452, + "learning_rate": 6.431633510787335e-05, + "loss": 1.0323, + "step": 57100 + }, + { + "epoch": 7.90273556231003, + "grad_norm": 0.05220150947570801, + "learning_rate": 6.38960493135332e-05, + "loss": 1.032, + "step": 57200 + }, + { + "epoch": 7.91655153357281, + "grad_norm": 0.04795517399907112, + "learning_rate": 6.347576351919304e-05, + "loss": 1.03, + "step": 57300 + }, + { + "epoch": 7.93036750483559, + "grad_norm": 0.0748489499092102, + "learning_rate": 
6.30554777248529e-05, + "loss": 1.0338, + "step": 57400 + }, + { + "epoch": 7.944183476098369, + "grad_norm": 0.08164035528898239, + "learning_rate": 6.263519193051274e-05, + "loss": 1.0318, + "step": 57500 + }, + { + "epoch": 7.95799944736115, + "grad_norm": 0.0764247477054596, + "learning_rate": 6.221490613617259e-05, + "loss": 1.0278, + "step": 57600 + }, + { + "epoch": 7.97181541862393, + "grad_norm": 0.05609816685318947, + "learning_rate": 6.179462034183244e-05, + "loss": 1.0307, + "step": 57700 + }, + { + "epoch": 7.985631389886709, + "grad_norm": 0.05001819133758545, + "learning_rate": 6.137433454749229e-05, + "loss": 1.0297, + "step": 57800 + }, + { + "epoch": 7.999447361149489, + "grad_norm": 0.10084258019924164, + "learning_rate": 6.0954048753152136e-05, + "loss": 1.0339, + "step": 57900 + }, + { + "epoch": 8.013263332412269, + "grad_norm": 0.07571733742952347, + "learning_rate": 6.0533762958811985e-05, + "loss": 1.0305, + "step": 58000 + }, + { + "epoch": 8.027079303675048, + "grad_norm": 0.059294216334819794, + "learning_rate": 6.011347716447183e-05, + "loss": 1.026, + "step": 58100 + }, + { + "epoch": 8.040895274937828, + "grad_norm": 0.04530787095427513, + "learning_rate": 5.969319137013168e-05, + "loss": 1.0282, + "step": 58200 + }, + { + "epoch": 8.054711246200608, + "grad_norm": 0.05052864924073219, + "learning_rate": 5.927290557579153e-05, + "loss": 1.0271, + "step": 58300 + }, + { + "epoch": 8.068527217463387, + "grad_norm": 0.04923342168331146, + "learning_rate": 5.885261978145138e-05, + "loss": 1.029, + "step": 58400 + }, + { + "epoch": 8.082343188726167, + "grad_norm": 0.04905908182263374, + "learning_rate": 5.843233398711123e-05, + "loss": 1.0277, + "step": 58500 + }, + { + "epoch": 8.096159159988947, + "grad_norm": 0.046151451766490936, + "learning_rate": 5.801204819277108e-05, + "loss": 1.0289, + "step": 58600 + }, + { + "epoch": 8.109975131251726, + "grad_norm": 0.06011873856186867, + "learning_rate": 5.7591762398430925e-05, + "loss": 1.0245, + "step": 58700 + }, + { + "epoch": 8.123791102514506, + "grad_norm": 0.06879663467407227, + "learning_rate": 5.717147660409078e-05, + "loss": 1.0271, + "step": 58800 + }, + { + "epoch": 8.137607073777286, + "grad_norm": 0.04675479233264923, + "learning_rate": 5.675119080975063e-05, + "loss": 1.0263, + "step": 58900 + }, + { + "epoch": 8.151423045040067, + "grad_norm": 0.08497285097837448, + "learning_rate": 5.633090501541048e-05, + "loss": 1.0287, + "step": 59000 + }, + { + "epoch": 8.165239016302847, + "grad_norm": 0.07600156217813492, + "learning_rate": 5.5910619221070326e-05, + "loss": 1.0262, + "step": 59100 + }, + { + "epoch": 8.179054987565626, + "grad_norm": 0.04951677843928337, + "learning_rate": 5.549453628467357e-05, + "loss": 1.0283, + "step": 59200 + }, + { + "epoch": 8.192870958828406, + "grad_norm": 0.05662324279546738, + "learning_rate": 5.507425049033342e-05, + "loss": 1.0295, + "step": 59300 + }, + { + "epoch": 8.206686930091186, + "grad_norm": 0.05791959911584854, + "learning_rate": 5.465396469599327e-05, + "loss": 1.0285, + "step": 59400 + }, + { + "epoch": 8.220502901353965, + "grad_norm": 0.058768805116415024, + "learning_rate": 5.423367890165312e-05, + "loss": 1.0272, + "step": 59500 + }, + { + "epoch": 8.234318872616745, + "grad_norm": 0.05399869754910469, + "learning_rate": 5.381339310731297e-05, + "loss": 1.0301, + "step": 59600 + }, + { + "epoch": 8.248134843879525, + "grad_norm": 0.06434085965156555, + "learning_rate": 5.3393107312972814e-05, + "loss": 1.0277, + "step": 59700 + }, + { + "epoch": 
8.261950815142304, + "grad_norm": 0.054656483232975006, + "learning_rate": 5.297282151863267e-05, + "loss": 1.0295, + "step": 59800 + }, + { + "epoch": 8.275766786405084, + "grad_norm": 0.04396641626954079, + "learning_rate": 5.255253572429251e-05, + "loss": 1.0276, + "step": 59900 + }, + { + "epoch": 8.289582757667864, + "grad_norm": 0.058395449072122574, + "learning_rate": 5.2132249929952366e-05, + "loss": 1.0267, + "step": 60000 + }, + { + "epoch": 8.289582757667864, + "eval_accuracy": 0.5312832658873073, + "eval_loss": 1.0315501689910889, + "eval_runtime": 729.415, + "eval_samples_per_second": 282.228, + "eval_steps_per_second": 8.821, + "step": 60000 + }, + { + "epoch": 8.303398728930643, + "grad_norm": 0.06770013272762299, + "learning_rate": 5.171196413561221e-05, + "loss": 1.029, + "step": 60100 + }, + { + "epoch": 8.317214700193423, + "grad_norm": 0.06161688268184662, + "learning_rate": 5.1291678341272063e-05, + "loss": 1.0242, + "step": 60200 + }, + { + "epoch": 8.331030671456203, + "grad_norm": 0.04140911623835564, + "learning_rate": 5.087139254693191e-05, + "loss": 1.029, + "step": 60300 + }, + { + "epoch": 8.344846642718982, + "grad_norm": 0.07091998308897018, + "learning_rate": 5.045110675259176e-05, + "loss": 1.0268, + "step": 60400 + }, + { + "epoch": 8.358662613981762, + "grad_norm": 0.05135732889175415, + "learning_rate": 5.003082095825161e-05, + "loss": 1.0264, + "step": 60500 + }, + { + "epoch": 8.372478585244544, + "grad_norm": 0.05828474089503288, + "learning_rate": 4.961053516391146e-05, + "loss": 1.0271, + "step": 60600 + }, + { + "epoch": 8.386294556507323, + "grad_norm": 0.05920015275478363, + "learning_rate": 4.9190249369571306e-05, + "loss": 1.0263, + "step": 60700 + }, + { + "epoch": 8.400110527770103, + "grad_norm": 0.048502273857593536, + "learning_rate": 4.8769963575231155e-05, + "loss": 1.029, + "step": 60800 + }, + { + "epoch": 8.413926499032883, + "grad_norm": 0.049063604325056076, + "learning_rate": 4.8349677780891e-05, + "loss": 1.0294, + "step": 60900 + }, + { + "epoch": 8.427742470295662, + "grad_norm": 0.05672093480825424, + "learning_rate": 4.792939198655085e-05, + "loss": 1.0297, + "step": 61000 + }, + { + "epoch": 8.441558441558442, + "grad_norm": 0.06934633105993271, + "learning_rate": 4.75091061922107e-05, + "loss": 1.0261, + "step": 61100 + }, + { + "epoch": 8.455374412821222, + "grad_norm": 0.04098910838365555, + "learning_rate": 4.709302325581395e-05, + "loss": 1.0292, + "step": 61200 + }, + { + "epoch": 8.469190384084001, + "grad_norm": 0.06421385705471039, + "learning_rate": 4.6672737461473794e-05, + "loss": 1.0315, + "step": 61300 + }, + { + "epoch": 8.483006355346781, + "grad_norm": 0.05238828435540199, + "learning_rate": 4.625245166713365e-05, + "loss": 1.0309, + "step": 61400 + }, + { + "epoch": 8.49682232660956, + "grad_norm": 0.049910806119441986, + "learning_rate": 4.583216587279349e-05, + "loss": 1.0257, + "step": 61500 + }, + { + "epoch": 8.51063829787234, + "grad_norm": 0.06672196090221405, + "learning_rate": 4.541188007845335e-05, + "loss": 1.0328, + "step": 61600 + }, + { + "epoch": 8.52445426913512, + "grad_norm": 0.05466538295149803, + "learning_rate": 4.4991594284113195e-05, + "loss": 1.0284, + "step": 61700 + }, + { + "epoch": 8.5382702403979, + "grad_norm": 0.05218784883618355, + "learning_rate": 4.4571308489773044e-05, + "loss": 1.0285, + "step": 61800 + }, + { + "epoch": 8.55208621166068, + "grad_norm": 0.04263923689723015, + "learning_rate": 4.415102269543289e-05, + "loss": 1.0307, + "step": 61900 + }, + { + "epoch": 
8.565902182923459, + "grad_norm": 0.054478637874126434, + "learning_rate": 4.373073690109274e-05, + "loss": 1.0291, + "step": 62000 + }, + { + "epoch": 8.579718154186239, + "grad_norm": 0.05667020007967949, + "learning_rate": 4.331045110675259e-05, + "loss": 1.0296, + "step": 62100 + }, + { + "epoch": 8.593534125449018, + "grad_norm": 0.0490160770714283, + "learning_rate": 4.289016531241244e-05, + "loss": 1.029, + "step": 62200 + }, + { + "epoch": 8.607350096711798, + "grad_norm": 0.049655403941869736, + "learning_rate": 4.246987951807229e-05, + "loss": 1.0298, + "step": 62300 + }, + { + "epoch": 8.62116606797458, + "grad_norm": 0.047429408878088, + "learning_rate": 4.2049593723732135e-05, + "loss": 1.0277, + "step": 62400 + }, + { + "epoch": 8.634982039237359, + "grad_norm": 0.05222218483686447, + "learning_rate": 4.1629307929391984e-05, + "loss": 1.0292, + "step": 62500 + }, + { + "epoch": 8.648798010500139, + "grad_norm": 0.05841238424181938, + "learning_rate": 4.120902213505183e-05, + "loss": 1.029, + "step": 62600 + }, + { + "epoch": 8.662613981762918, + "grad_norm": 0.0452195480465889, + "learning_rate": 4.078873634071168e-05, + "loss": 1.0265, + "step": 62700 + }, + { + "epoch": 8.676429953025698, + "grad_norm": 0.049306340515613556, + "learning_rate": 4.036845054637153e-05, + "loss": 1.0308, + "step": 62800 + }, + { + "epoch": 8.690245924288478, + "grad_norm": 0.050401389598846436, + "learning_rate": 3.994816475203138e-05, + "loss": 1.0294, + "step": 62900 + }, + { + "epoch": 8.704061895551257, + "grad_norm": 0.04503024369478226, + "learning_rate": 3.952787895769123e-05, + "loss": 1.0291, + "step": 63000 + }, + { + "epoch": 8.717877866814037, + "grad_norm": 0.0738733783364296, + "learning_rate": 3.9107593163351075e-05, + "loss": 1.0279, + "step": 63100 + }, + { + "epoch": 8.731693838076817, + "grad_norm": 0.04586975276470184, + "learning_rate": 3.869151022695433e-05, + "loss": 1.026, + "step": 63200 + }, + { + "epoch": 8.745509809339596, + "grad_norm": 0.04988343268632889, + "learning_rate": 3.8271224432614176e-05, + "loss": 1.0257, + "step": 63300 + }, + { + "epoch": 8.759325780602376, + "grad_norm": 0.07822008430957794, + "learning_rate": 3.7850938638274025e-05, + "loss": 1.0254, + "step": 63400 + }, + { + "epoch": 8.773141751865156, + "grad_norm": 0.058496229350566864, + "learning_rate": 3.743065284393387e-05, + "loss": 1.0263, + "step": 63500 + }, + { + "epoch": 8.786957723127935, + "grad_norm": 0.04458677023649216, + "learning_rate": 3.701036704959372e-05, + "loss": 1.0292, + "step": 63600 + }, + { + "epoch": 8.800773694390715, + "grad_norm": 0.06616061180830002, + "learning_rate": 3.659008125525357e-05, + "loss": 1.0309, + "step": 63700 + }, + { + "epoch": 8.814589665653495, + "grad_norm": 0.06473194807767868, + "learning_rate": 3.616979546091342e-05, + "loss": 1.0265, + "step": 63800 + }, + { + "epoch": 8.828405636916274, + "grad_norm": 0.047700874507427216, + "learning_rate": 3.574950966657327e-05, + "loss": 1.0303, + "step": 63900 + }, + { + "epoch": 8.842221608179056, + "grad_norm": 0.055733323097229004, + "learning_rate": 3.5329223872233116e-05, + "loss": 1.0279, + "step": 64000 + }, + { + "epoch": 8.856037579441836, + "grad_norm": 0.04398791491985321, + "learning_rate": 3.4908938077892965e-05, + "loss": 1.0284, + "step": 64100 + }, + { + "epoch": 8.869853550704615, + "grad_norm": 0.08901511132717133, + "learning_rate": 3.448865228355281e-05, + "loss": 1.0283, + "step": 64200 + }, + { + "epoch": 8.883669521967395, + "grad_norm": 0.05853118374943733, + "learning_rate": 
3.406836648921266e-05, + "loss": 1.0291, + "step": 64300 + }, + { + "epoch": 8.897485493230175, + "grad_norm": 0.043922308832407, + "learning_rate": 3.364808069487251e-05, + "loss": 1.0294, + "step": 64400 + }, + { + "epoch": 8.911301464492954, + "grad_norm": 0.04332153871655464, + "learning_rate": 3.322779490053236e-05, + "loss": 1.0277, + "step": 64500 + }, + { + "epoch": 8.925117435755734, + "grad_norm": 0.09197825193405151, + "learning_rate": 3.280750910619221e-05, + "loss": 1.0295, + "step": 64600 + }, + { + "epoch": 8.938933407018514, + "grad_norm": 0.05589272826910019, + "learning_rate": 3.2387223311852056e-05, + "loss": 1.0274, + "step": 64700 + }, + { + "epoch": 8.952749378281293, + "grad_norm": 0.06028933823108673, + "learning_rate": 3.1966937517511904e-05, + "loss": 1.0285, + "step": 64800 + }, + { + "epoch": 8.966565349544073, + "grad_norm": 0.05357721447944641, + "learning_rate": 3.154665172317175e-05, + "loss": 1.027, + "step": 64900 + }, + { + "epoch": 8.980381320806853, + "grad_norm": 0.07362578809261322, + "learning_rate": 3.11263659288316e-05, + "loss": 1.0273, + "step": 65000 + }, + { + "epoch": 8.980381320806853, + "eval_accuracy": 0.5319501927585898, + "eval_loss": 1.0305662155151367, + "eval_runtime": 722.9505, + "eval_samples_per_second": 284.751, + "eval_steps_per_second": 8.9, + "step": 65000 + }, + { + "epoch": 8.994197292069632, + "grad_norm": 0.04831722378730774, + "learning_rate": 3.070608013449145e-05, + "loss": 1.0294, + "step": 65100 + }, + { + "epoch": 9.008013263332412, + "grad_norm": 0.06001870334148407, + "learning_rate": 3.0289997198094702e-05, + "loss": 1.0306, + "step": 65200 + }, + { + "epoch": 9.021829234595192, + "grad_norm": 0.04466562718153, + "learning_rate": 2.986971140375455e-05, + "loss": 1.0267, + "step": 65300 + }, + { + "epoch": 9.035645205857971, + "grad_norm": 0.059990085661411285, + "learning_rate": 2.94494256094144e-05, + "loss": 1.0248, + "step": 65400 + }, + { + "epoch": 9.049461177120751, + "grad_norm": 0.05244195833802223, + "learning_rate": 2.9029139815074248e-05, + "loss": 1.0282, + "step": 65500 + }, + { + "epoch": 9.06327714838353, + "grad_norm": 0.060148317366838455, + "learning_rate": 2.8608854020734097e-05, + "loss": 1.0266, + "step": 65600 + }, + { + "epoch": 9.07709311964631, + "grad_norm": 0.051530059427022934, + "learning_rate": 2.8188568226393945e-05, + "loss": 1.0257, + "step": 65700 + }, + { + "epoch": 9.090909090909092, + "grad_norm": 0.06650034338235855, + "learning_rate": 2.7768282432053794e-05, + "loss": 1.0276, + "step": 65800 + }, + { + "epoch": 9.104725062171871, + "grad_norm": 0.04850700497627258, + "learning_rate": 2.7347996637713642e-05, + "loss": 1.0249, + "step": 65900 + }, + { + "epoch": 9.118541033434651, + "grad_norm": 0.057128727436065674, + "learning_rate": 2.692771084337349e-05, + "loss": 1.0264, + "step": 66000 + }, + { + "epoch": 9.13235700469743, + "grad_norm": 0.056875213980674744, + "learning_rate": 2.650742504903334e-05, + "loss": 1.0285, + "step": 66100 + }, + { + "epoch": 9.14617297596021, + "grad_norm": 0.05632421374320984, + "learning_rate": 2.6087139254693188e-05, + "loss": 1.0286, + "step": 66200 + }, + { + "epoch": 9.15998894722299, + "grad_norm": 0.04903789609670639, + "learning_rate": 2.5666853460353037e-05, + "loss": 1.0233, + "step": 66300 + }, + { + "epoch": 9.17380491848577, + "grad_norm": 0.04932420328259468, + "learning_rate": 2.5246567666012885e-05, + "loss": 1.0273, + "step": 66400 + }, + { + "epoch": 9.18762088974855, + "grad_norm": 0.0668862909078598, + "learning_rate": 
2.4826281871672734e-05, + "loss": 1.0264, + "step": 66500 + }, + { + "epoch": 9.201436861011329, + "grad_norm": 0.05283021926879883, + "learning_rate": 2.4405996077332586e-05, + "loss": 1.0278, + "step": 66600 + }, + { + "epoch": 9.215252832274109, + "grad_norm": 0.04914732649922371, + "learning_rate": 2.3985710282992434e-05, + "loss": 1.0276, + "step": 66700 + }, + { + "epoch": 9.229068803536888, + "grad_norm": 0.06511181592941284, + "learning_rate": 2.3565424488652283e-05, + "loss": 1.0268, + "step": 66800 + }, + { + "epoch": 9.242884774799668, + "grad_norm": 0.06101306900382042, + "learning_rate": 2.314513869431213e-05, + "loss": 1.0267, + "step": 66900 + }, + { + "epoch": 9.256700746062448, + "grad_norm": 0.05272289365530014, + "learning_rate": 2.272485289997198e-05, + "loss": 1.0242, + "step": 67000 + }, + { + "epoch": 9.270516717325227, + "grad_norm": 0.04828105494379997, + "learning_rate": 2.230456710563183e-05, + "loss": 1.0258, + "step": 67100 + }, + { + "epoch": 9.284332688588007, + "grad_norm": 0.054294098168611526, + "learning_rate": 2.1888484169235077e-05, + "loss": 1.0262, + "step": 67200 + }, + { + "epoch": 9.298148659850787, + "grad_norm": 0.04951765388250351, + "learning_rate": 2.1468198374894926e-05, + "loss": 1.0254, + "step": 67300 + }, + { + "epoch": 9.311964631113566, + "grad_norm": 0.047647446393966675, + "learning_rate": 2.1047912580554774e-05, + "loss": 1.0262, + "step": 67400 + }, + { + "epoch": 9.325780602376348, + "grad_norm": 0.062047079205513, + "learning_rate": 2.0627626786214623e-05, + "loss": 1.0287, + "step": 67500 + }, + { + "epoch": 9.339596573639128, + "grad_norm": 0.05751033127307892, + "learning_rate": 2.020734099187447e-05, + "loss": 1.027, + "step": 67600 + }, + { + "epoch": 9.353412544901907, + "grad_norm": 0.058642346411943436, + "learning_rate": 1.978705519753432e-05, + "loss": 1.0276, + "step": 67700 + }, + { + "epoch": 9.367228516164687, + "grad_norm": 0.050882838666439056, + "learning_rate": 1.936676940319417e-05, + "loss": 1.0223, + "step": 67800 + }, + { + "epoch": 9.381044487427467, + "grad_norm": 0.053814638406038284, + "learning_rate": 1.8946483608854017e-05, + "loss": 1.0271, + "step": 67900 + }, + { + "epoch": 9.394860458690246, + "grad_norm": 0.05407179519534111, + "learning_rate": 1.852619781451387e-05, + "loss": 1.0242, + "step": 68000 + }, + { + "epoch": 9.408676429953026, + "grad_norm": 0.05431421846151352, + "learning_rate": 1.8105912020173718e-05, + "loss": 1.0246, + "step": 68100 + }, + { + "epoch": 9.422492401215806, + "grad_norm": 0.05826635658740997, + "learning_rate": 1.7685626225833566e-05, + "loss": 1.024, + "step": 68200 + }, + { + "epoch": 9.436308372478585, + "grad_norm": 0.043603766709566116, + "learning_rate": 1.7265340431493415e-05, + "loss": 1.025, + "step": 68300 + }, + { + "epoch": 9.450124343741365, + "grad_norm": 0.0555894561111927, + "learning_rate": 1.6845054637153263e-05, + "loss": 1.0267, + "step": 68400 + }, + { + "epoch": 9.463940315004145, + "grad_norm": 0.046029891818761826, + "learning_rate": 1.6424768842813112e-05, + "loss": 1.0247, + "step": 68500 + }, + { + "epoch": 9.477756286266924, + "grad_norm": 0.04906938225030899, + "learning_rate": 1.600448304847296e-05, + "loss": 1.0233, + "step": 68600 + }, + { + "epoch": 9.491572257529704, + "grad_norm": 0.07827210426330566, + "learning_rate": 1.558419725413281e-05, + "loss": 1.0262, + "step": 68700 + }, + { + "epoch": 9.505388228792484, + "grad_norm": 0.04391390085220337, + "learning_rate": 1.5163911459792658e-05, + "loss": 1.0255, + "step": 68800 + }, + { 
+ "epoch": 9.519204200055263, + "grad_norm": 0.05310402810573578, + "learning_rate": 1.4743625665452506e-05, + "loss": 1.0268, + "step": 68900 + }, + { + "epoch": 9.533020171318043, + "grad_norm": 0.060242168605327606, + "learning_rate": 1.4323339871112355e-05, + "loss": 1.0257, + "step": 69000 + }, + { + "epoch": 9.546836142580823, + "grad_norm": 0.04949665814638138, + "learning_rate": 1.3903054076772205e-05, + "loss": 1.0294, + "step": 69100 + }, + { + "epoch": 9.560652113843604, + "grad_norm": 0.05413687229156494, + "learning_rate": 1.3482768282432054e-05, + "loss": 1.0272, + "step": 69200 + }, + { + "epoch": 9.574468085106384, + "grad_norm": 0.05380227789282799, + "learning_rate": 1.3062482488091902e-05, + "loss": 1.025, + "step": 69300 + }, + { + "epoch": 9.588284056369163, + "grad_norm": 0.04961249604821205, + "learning_rate": 1.2646399551695151e-05, + "loss": 1.0289, + "step": 69400 + }, + { + "epoch": 9.602100027631943, + "grad_norm": 0.045629873871803284, + "learning_rate": 1.2226113757355e-05, + "loss": 1.0269, + "step": 69500 + }, + { + "epoch": 9.615915998894723, + "grad_norm": 0.04661751165986061, + "learning_rate": 1.1805827963014848e-05, + "loss": 1.0277, + "step": 69600 + }, + { + "epoch": 9.629731970157502, + "grad_norm": 0.06289409101009369, + "learning_rate": 1.1385542168674697e-05, + "loss": 1.0246, + "step": 69700 + }, + { + "epoch": 9.643547941420282, + "grad_norm": 0.061526406556367874, + "learning_rate": 1.0965256374334547e-05, + "loss": 1.0252, + "step": 69800 + }, + { + "epoch": 9.657363912683062, + "grad_norm": 0.05611636862158775, + "learning_rate": 1.0544970579994395e-05, + "loss": 1.0281, + "step": 69900 + }, + { + "epoch": 9.671179883945841, + "grad_norm": 0.05305150896310806, + "learning_rate": 1.0124684785654244e-05, + "loss": 1.027, + "step": 70000 + }, + { + "epoch": 9.671179883945841, + "eval_accuracy": 0.5323623139821072, + "eval_loss": 1.029943823814392, + "eval_runtime": 726.2479, + "eval_samples_per_second": 283.458, + "eval_steps_per_second": 8.859, + "step": 70000 + }, + { + "epoch": 9.684995855208621, + "grad_norm": 0.06483161449432373, + "learning_rate": 9.704398991314093e-06, + "loss": 1.0262, + "step": 70100 + }, + { + "epoch": 9.6988118264714, + "grad_norm": 0.05063271522521973, + "learning_rate": 9.284113196973941e-06, + "loss": 1.0246, + "step": 70200 + }, + { + "epoch": 9.71262779773418, + "grad_norm": 0.04985768347978592, + "learning_rate": 8.86382740263379e-06, + "loss": 1.03, + "step": 70300 + }, + { + "epoch": 9.72644376899696, + "grad_norm": 0.04751725122332573, + "learning_rate": 8.443541608293638e-06, + "loss": 1.0272, + "step": 70400 + }, + { + "epoch": 9.74025974025974, + "grad_norm": 0.042586106806993484, + "learning_rate": 8.023255813953487e-06, + "loss": 1.0254, + "step": 70500 + }, + { + "epoch": 9.75407571152252, + "grad_norm": 0.059688206762075424, + "learning_rate": 7.602970019613336e-06, + "loss": 1.0255, + "step": 70600 + }, + { + "epoch": 9.767891682785299, + "grad_norm": 0.04823042452335358, + "learning_rate": 7.182684225273185e-06, + "loss": 1.028, + "step": 70700 + }, + { + "epoch": 9.78170765404808, + "grad_norm": 0.0480177104473114, + "learning_rate": 6.762398430933033e-06, + "loss": 1.025, + "step": 70800 + }, + { + "epoch": 9.79552362531086, + "grad_norm": 0.045797545462846756, + "learning_rate": 6.342112636592882e-06, + "loss": 1.023, + "step": 70900 + }, + { + "epoch": 9.80933959657364, + "grad_norm": 0.04858710244297981, + "learning_rate": 5.921826842252732e-06, + "loss": 1.0216, + "step": 71000 + }, + { + 
"epoch": 9.82315556783642, + "grad_norm": 0.05248698219656944, + "learning_rate": 5.501541047912581e-06, + "loss": 1.0252, + "step": 71100 + }, + { + "epoch": 9.8369715390992, + "grad_norm": 0.045856546610593796, + "learning_rate": 5.081255253572429e-06, + "loss": 1.0249, + "step": 71200 + }, + { + "epoch": 9.850787510361979, + "grad_norm": 0.047852564603090286, + "learning_rate": 4.660969459232278e-06, + "loss": 1.0238, + "step": 71300 + }, + { + "epoch": 9.864603481624759, + "grad_norm": 0.044457610696554184, + "learning_rate": 4.240683664892126e-06, + "loss": 1.0245, + "step": 71400 + }, + { + "epoch": 9.878419452887538, + "grad_norm": 0.06768154352903366, + "learning_rate": 3.824600728495377e-06, + "loss": 1.0256, + "step": 71500 + }, + { + "epoch": 9.892235424150318, + "grad_norm": 0.050749246031045914, + "learning_rate": 3.4043149341552255e-06, + "loss": 1.0224, + "step": 71600 + }, + { + "epoch": 9.906051395413098, + "grad_norm": 0.04643206670880318, + "learning_rate": 2.984029139815074e-06, + "loss": 1.0279, + "step": 71700 + }, + { + "epoch": 9.919867366675877, + "grad_norm": 0.05305636674165726, + "learning_rate": 2.5637433454749226e-06, + "loss": 1.0257, + "step": 71800 + }, + { + "epoch": 9.933683337938657, + "grad_norm": 0.06888972967863083, + "learning_rate": 2.1434575511347716e-06, + "loss": 1.0301, + "step": 71900 + }, + { + "epoch": 9.947499309201437, + "grad_norm": 0.04633474349975586, + "learning_rate": 1.7231717567946201e-06, + "loss": 1.0228, + "step": 72000 + }, + { + "epoch": 9.961315280464216, + "grad_norm": 0.05391710251569748, + "learning_rate": 1.302885962454469e-06, + "loss": 1.0284, + "step": 72100 + }, + { + "epoch": 9.975131251726996, + "grad_norm": 0.048064954578876495, + "learning_rate": 8.826001681143176e-07, + "loss": 1.0257, + "step": 72200 + }, + { + "epoch": 9.988947222989776, + "grad_norm": 0.04865507408976555, + "learning_rate": 4.623143737741664e-07, + "loss": 1.0274, + "step": 72300 + } + ], + "logging_steps": 100, + "max_steps": 72380, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.683106445125485e+18, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-72380/training_args.bin b/checkpoint-72380/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..45f2f12b913e85908e1565ce4b13c8763ea7a1ca --- /dev/null +++ b/checkpoint-72380/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19e8fb7657adc13bdcaf635b1c6fb616dd082a6870cdd6aecd3b669d8cac0873 +size 5304 diff --git a/checkpoint-72380/vocab.json b/checkpoint-72380/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..d0809a2e3e28811023f05ed415122e24681bc9d1 --- /dev/null +++ b/checkpoint-72380/vocab.json @@ -0,0 +1 @@ +{"<|endoftext|>":0,"A":1,"C":2,"G":3,"T":4} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..192ba05a8714569e728cced45eaebf4106596353 --- /dev/null +++ b/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 50256, + "embd_pdrop": 0.1, + "eos_token_id": 50256, + "initializer_range": 0.02, + 
"layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 768, + "n_head": 12, + "n_inner": null, + "n_layer": 12, + "n_positions": 1024, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.52.0.dev0", + "use_cache": true, + "vocab_size": 5 +} diff --git a/eval_results.json b/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..02fbfa8fff8ade1cc30d8104426012b5fc6b014a --- /dev/null +++ b/eval_results.json @@ -0,0 +1,10 @@ +{ + "epoch": 10.0, + "eval_accuracy": 0.5323623139821072, + "eval_loss": 1.029943823814392, + "eval_runtime": 721.9591, + "eval_samples": 205861, + "eval_samples_per_second": 285.142, + "eval_steps_per_second": 8.912, + "perplexity": 2.8009084859245172 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c829fa47bd90bfe00fdb37ed6d41324f6fb81f63 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 50256, + "eos_token_id": 50256, + "transformers_version": "4.52.0.dev0" +} diff --git a/merges.txt b/merges.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e7f1fd94996c8e2b65adea828af1b398eace61f --- /dev/null +++ b/merges.txt @@ -0,0 +1 @@ +#version: 0.2 diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c09e2b5644e78b2f38d42b31f00f85b094fe2fe0 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7694d2d4deca5552713678b7ba3a98015adc907157087d06374e8759397a9704 +size 343400064 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..773bd68cf0900427f8d69dd974724e3abb9a08a9 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..ac40039af791f0fd130b3d36c3677a156b2de089 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,53 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": false, + "use_regex": true + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": "", + 
"end_of_word_suffix": "", + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "<|endoftext|>": 0, + "A": 1, + "C": 2, + "G": 3, + "T": 4 + }, + "merges": [] + } +} \ No newline at end of file diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7c4e19588fa8b4faceab450a1d7e8dae1ce87f7c --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,21 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|endoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "<|endoftext|>" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..25b12c700e47a31490fffa65dca4beb3189efa59 --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 10.0, + "total_flos": 9.683106445125485e+18, + "train_loss": 1.0569831334908848, + "train_runtime": 55085.3166, + "train_samples": 1852919, + "train_samples_per_second": 336.373, + "train_steps_per_second": 1.314 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..36bfc47059d88591a3957a29925baf1431f2a762 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,5237 @@ +{ + "best_global_step": 70000, + "best_metric": 1.029943823814392, + "best_model_checkpoint": "./dna_model/checkpoint-70000", + "epoch": 10.0, + "eval_steps": 5000, + "global_step": 72380, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00013815971262779773, + "grad_norm": 42.580928802490234, + "learning_rate": 0.0, + "loss": 1.6625, + "step": 1 + }, + { + "epoch": 0.013815971262779773, + "grad_norm": 2.4457767009735107, + "learning_rate": 2.97e-05, + "loss": 1.36, + "step": 100 + }, + { + "epoch": 0.027631942525559547, + "grad_norm": 0.5432274341583252, + "learning_rate": 5.97e-05, + "loss": 1.3309, + "step": 200 + }, + { + "epoch": 0.04144791378833932, + "grad_norm": 0.825528621673584, + "learning_rate": 8.969999999999998e-05, + "loss": 1.3234, + "step": 300 + }, + { + "epoch": 0.055263885051119094, + "grad_norm": 0.4912604093551636, + "learning_rate": 0.0001197, + "loss": 1.3249, + "step": 400 + }, + { + "epoch": 0.06907985631389886, + "grad_norm": 0.9077563881874084, + "learning_rate": 0.00014969999999999998, + "loss": 1.3153, + "step": 500 + }, + { + "epoch": 0.08289582757667864, + "grad_norm": 0.8954246640205383, + "learning_rate": 0.00017969999999999998, + "loss": 1.3123, + "step": 600 + }, + { + "epoch": 0.09671179883945841, + "grad_norm": 0.5876831412315369, + "learning_rate": 0.00020969999999999997, + "loss": 1.3098, + "step": 700 + }, + { + "epoch": 0.11052777010223819, + "grad_norm": 0.426789253950119, + "learning_rate": 0.0002397, + "loss": 1.3072, + "step": 800 + }, + { + "epoch": 0.12434374136501795, + "grad_norm": 0.3324718177318573, + "learning_rate": 0.0002697, + "loss": 1.3037, + "step": 900 + }, + { + "epoch": 0.13815971262779772, + "grad_norm": 0.23672613501548767, + "learning_rate": 0.00029969999999999997, + "loss": 1.2991, + "step": 1000 + }, + { + "epoch": 
0.1519756838905775, + "grad_norm": 0.4699796438217163, + "learning_rate": 0.00029958391706360325, + "loss": 1.2923, + "step": 1100 + }, + { + "epoch": 0.16579165515335728, + "grad_norm": 0.684186577796936, + "learning_rate": 0.00029916363126926307, + "loss": 1.2825, + "step": 1200 + }, + { + "epoch": 0.17960762641613706, + "grad_norm": 0.3944641649723053, + "learning_rate": 0.00029874334547492294, + "loss": 1.2678, + "step": 1300 + }, + { + "epoch": 0.19342359767891681, + "grad_norm": 1.1556001901626587, + "learning_rate": 0.00029832305968058276, + "loss": 1.2541, + "step": 1400 + }, + { + "epoch": 0.2072395689416966, + "grad_norm": 0.39745599031448364, + "learning_rate": 0.0002979027738862426, + "loss": 1.2439, + "step": 1500 + }, + { + "epoch": 0.22105554020447638, + "grad_norm": 0.5201444029808044, + "learning_rate": 0.00029748248809190246, + "loss": 1.2329, + "step": 1600 + }, + { + "epoch": 0.23487151146725616, + "grad_norm": 0.2168777734041214, + "learning_rate": 0.00029706220229756234, + "loss": 1.2268, + "step": 1700 + }, + { + "epoch": 0.2486874827300359, + "grad_norm": 0.30599427223205566, + "learning_rate": 0.00029664191650322216, + "loss": 1.2199, + "step": 1800 + }, + { + "epoch": 0.2625034539928157, + "grad_norm": 0.32062044739723206, + "learning_rate": 0.00029622163070888203, + "loss": 1.2131, + "step": 1900 + }, + { + "epoch": 0.27631942525559544, + "grad_norm": 0.13411013782024384, + "learning_rate": 0.00029580134491454186, + "loss": 1.2074, + "step": 2000 + }, + { + "epoch": 0.2901353965183752, + "grad_norm": 0.3672633767127991, + "learning_rate": 0.00029538105912020173, + "loss": 1.2022, + "step": 2100 + }, + { + "epoch": 0.303951367781155, + "grad_norm": 0.41515815258026123, + "learning_rate": 0.00029496077332586155, + "loss": 1.1949, + "step": 2200 + }, + { + "epoch": 0.3177673390439348, + "grad_norm": 0.18381068110466003, + "learning_rate": 0.0002945404875315214, + "loss": 1.1887, + "step": 2300 + }, + { + "epoch": 0.33158331030671456, + "grad_norm": 0.3080751895904541, + "learning_rate": 0.00029412020173718125, + "loss": 1.1844, + "step": 2400 + }, + { + "epoch": 0.34539928156949434, + "grad_norm": 0.38037416338920593, + "learning_rate": 0.0002936999159428411, + "loss": 1.1804, + "step": 2500 + }, + { + "epoch": 0.3592152528322741, + "grad_norm": 0.23272989690303802, + "learning_rate": 0.00029327963014850095, + "loss": 1.1753, + "step": 2600 + }, + { + "epoch": 0.3730312240950539, + "grad_norm": 0.1149936243891716, + "learning_rate": 0.0002928593443541608, + "loss": 1.1739, + "step": 2700 + }, + { + "epoch": 0.38684719535783363, + "grad_norm": 0.28469276428222656, + "learning_rate": 0.00029243905855982064, + "loss": 1.1671, + "step": 2800 + }, + { + "epoch": 0.4006631666206134, + "grad_norm": 0.25204166769981384, + "learning_rate": 0.0002920187727654805, + "loss": 1.1633, + "step": 2900 + }, + { + "epoch": 0.4144791378833932, + "grad_norm": 0.3945861756801605, + "learning_rate": 0.00029159848697114034, + "loss": 1.1608, + "step": 3000 + }, + { + "epoch": 0.42829510914617297, + "grad_norm": 0.2578865587711334, + "learning_rate": 0.00029117820117680016, + "loss": 1.1622, + "step": 3100 + }, + { + "epoch": 0.44211108040895275, + "grad_norm": 0.16060177981853485, + "learning_rate": 0.00029075791538246004, + "loss": 1.1577, + "step": 3200 + }, + { + "epoch": 0.45592705167173253, + "grad_norm": 0.1980718970298767, + "learning_rate": 0.0002903376295881199, + "loss": 1.155, + "step": 3300 + }, + { + "epoch": 0.4697430229345123, + "grad_norm": 0.12515653669834137, + 
"learning_rate": 0.00028991734379377974, + "loss": 1.1519, + "step": 3400 + }, + { + "epoch": 0.4835589941972921, + "grad_norm": 0.26255738735198975, + "learning_rate": 0.0002894970579994396, + "loss": 1.1523, + "step": 3500 + }, + { + "epoch": 0.4973749654600718, + "grad_norm": 0.281464546918869, + "learning_rate": 0.00028907677220509943, + "loss": 1.1511, + "step": 3600 + }, + { + "epoch": 0.5111909367228517, + "grad_norm": 0.11816036701202393, + "learning_rate": 0.0002886564864107593, + "loss": 1.1469, + "step": 3700 + }, + { + "epoch": 0.5250069079856314, + "grad_norm": 0.25923675298690796, + "learning_rate": 0.00028823620061641913, + "loss": 1.1456, + "step": 3800 + }, + { + "epoch": 0.5388228792484112, + "grad_norm": 0.2766472399234772, + "learning_rate": 0.00028781591482207895, + "loss": 1.1442, + "step": 3900 + }, + { + "epoch": 0.5526388505111909, + "grad_norm": 0.1701624095439911, + "learning_rate": 0.00028739562902773883, + "loss": 1.1445, + "step": 4000 + }, + { + "epoch": 0.5664548217739707, + "grad_norm": 0.3141656219959259, + "learning_rate": 0.0002869753432333987, + "loss": 1.1392, + "step": 4100 + }, + { + "epoch": 0.5802707930367504, + "grad_norm": 0.11816743016242981, + "learning_rate": 0.0002865550574390585, + "loss": 1.1406, + "step": 4200 + }, + { + "epoch": 0.5940867642995302, + "grad_norm": 0.12762723863124847, + "learning_rate": 0.0002861347716447184, + "loss": 1.1361, + "step": 4300 + }, + { + "epoch": 0.60790273556231, + "grad_norm": 0.09322622418403625, + "learning_rate": 0.0002857144858503782, + "loss": 1.134, + "step": 4400 + }, + { + "epoch": 0.6217187068250898, + "grad_norm": 0.1586735099554062, + "learning_rate": 0.0002852942000560381, + "loss": 1.1336, + "step": 4500 + }, + { + "epoch": 0.6355346780878696, + "grad_norm": 0.13594642281532288, + "learning_rate": 0.0002848739142616979, + "loss": 1.1328, + "step": 4600 + }, + { + "epoch": 0.6493506493506493, + "grad_norm": 0.21865279972553253, + "learning_rate": 0.00028445362846735774, + "loss": 1.1311, + "step": 4700 + }, + { + "epoch": 0.6631666206134291, + "grad_norm": 0.22787001729011536, + "learning_rate": 0.0002840333426730176, + "loss": 1.1271, + "step": 4800 + }, + { + "epoch": 0.6769825918762089, + "grad_norm": 0.2334531843662262, + "learning_rate": 0.0002836130568786775, + "loss": 1.1291, + "step": 4900 + }, + { + "epoch": 0.6907985631389887, + "grad_norm": 0.11103236675262451, + "learning_rate": 0.0002831927710843373, + "loss": 1.1252, + "step": 5000 + }, + { + "epoch": 0.6907985631389887, + "eval_accuracy": 0.4745045939970608, + "eval_loss": 1.1205766201019287, + "eval_runtime": 1027.9902, + "eval_samples_per_second": 200.256, + "eval_steps_per_second": 6.259, + "step": 5000 + }, + { + "epoch": 0.7046145344017685, + "grad_norm": 0.21742330491542816, + "learning_rate": 0.0002827724852899972, + "loss": 1.1235, + "step": 5100 + }, + { + "epoch": 0.7184305056645482, + "grad_norm": 0.23728515207767487, + "learning_rate": 0.000282352199495657, + "loss": 1.1233, + "step": 5200 + }, + { + "epoch": 0.732246476927328, + "grad_norm": 0.21022765338420868, + "learning_rate": 0.0002819319137013169, + "loss": 1.1236, + "step": 5300 + }, + { + "epoch": 0.7460624481901078, + "grad_norm": 0.0924484059214592, + "learning_rate": 0.0002815116279069767, + "loss": 1.1215, + "step": 5400 + }, + { + "epoch": 0.7598784194528876, + "grad_norm": 0.1716778427362442, + "learning_rate": 0.00028109134211263653, + "loss": 1.1238, + "step": 5500 + }, + { + "epoch": 0.7736943907156673, + "grad_norm": 0.13049638271331787, + 
"learning_rate": 0.0002806710563182964, + "loss": 1.1185, + "step": 5600 + }, + { + "epoch": 0.787510361978447, + "grad_norm": 0.16255174577236176, + "learning_rate": 0.0002802507705239563, + "loss": 1.1169, + "step": 5700 + }, + { + "epoch": 0.8013263332412268, + "grad_norm": 0.10065080225467682, + "learning_rate": 0.0002798304847296161, + "loss": 1.1184, + "step": 5800 + }, + { + "epoch": 0.8151423045040066, + "grad_norm": 0.1182553768157959, + "learning_rate": 0.000279410198935276, + "loss": 1.1141, + "step": 5900 + }, + { + "epoch": 0.8289582757667864, + "grad_norm": 0.14556263387203217, + "learning_rate": 0.0002789899131409358, + "loss": 1.1154, + "step": 6000 + }, + { + "epoch": 0.8427742470295662, + "grad_norm": 0.1383764147758484, + "learning_rate": 0.00027857383020453907, + "loss": 1.1118, + "step": 6100 + }, + { + "epoch": 0.8565902182923459, + "grad_norm": 0.2821154296398163, + "learning_rate": 0.00027815354441019895, + "loss": 1.1104, + "step": 6200 + }, + { + "epoch": 0.8704061895551257, + "grad_norm": 0.22286450862884521, + "learning_rate": 0.00027773325861585877, + "loss": 1.1109, + "step": 6300 + }, + { + "epoch": 0.8842221608179055, + "grad_norm": 0.2058987319469452, + "learning_rate": 0.0002773129728215186, + "loss": 1.1093, + "step": 6400 + }, + { + "epoch": 0.8980381320806853, + "grad_norm": 0.21338045597076416, + "learning_rate": 0.00027689268702717847, + "loss": 1.1091, + "step": 6500 + }, + { + "epoch": 0.9118541033434651, + "grad_norm": 0.0900028795003891, + "learning_rate": 0.0002764724012328383, + "loss": 1.1067, + "step": 6600 + }, + { + "epoch": 0.9256700746062448, + "grad_norm": 0.10679551959037781, + "learning_rate": 0.00027605211543849816, + "loss": 1.108, + "step": 6700 + }, + { + "epoch": 0.9394860458690246, + "grad_norm": 0.07972779124975204, + "learning_rate": 0.000275631829644158, + "loss": 1.1057, + "step": 6800 + }, + { + "epoch": 0.9533020171318044, + "grad_norm": 0.24500218033790588, + "learning_rate": 0.00027521154384981786, + "loss": 1.105, + "step": 6900 + }, + { + "epoch": 0.9671179883945842, + "grad_norm": 0.11576998978853226, + "learning_rate": 0.00027479125805547774, + "loss": 1.1029, + "step": 7000 + }, + { + "epoch": 0.980933959657364, + "grad_norm": 0.10553757101297379, + "learning_rate": 0.00027437097226113756, + "loss": 1.1041, + "step": 7100 + }, + { + "epoch": 0.9947499309201436, + "grad_norm": 0.15332186222076416, + "learning_rate": 0.0002739506864667974, + "loss": 1.0982, + "step": 7200 + }, + { + "epoch": 1.0085659021829234, + "grad_norm": 0.11897014081478119, + "learning_rate": 0.00027353040067245725, + "loss": 1.0996, + "step": 7300 + }, + { + "epoch": 1.0223818734457033, + "grad_norm": 0.1156444102525711, + "learning_rate": 0.0002731101148781171, + "loss": 1.1032, + "step": 7400 + }, + { + "epoch": 1.036197844708483, + "grad_norm": 0.06223931908607483, + "learning_rate": 0.00027268982908377695, + "loss": 1.0982, + "step": 7500 + }, + { + "epoch": 1.0500138159712629, + "grad_norm": 0.14377152919769287, + "learning_rate": 0.00027226954328943677, + "loss": 1.1003, + "step": 7600 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 0.12667153775691986, + "learning_rate": 0.00027184925749509665, + "loss": 1.0989, + "step": 7700 + }, + { + "epoch": 1.0776457584968224, + "grad_norm": 0.16101804375648499, + "learning_rate": 0.0002714289717007565, + "loss": 1.0968, + "step": 7800 + }, + { + "epoch": 1.091461729759602, + "grad_norm": 0.06424383819103241, + "learning_rate": 0.00027100868590641635, + "loss": 1.0955, + "step": 7900 + }, + { 
+ "epoch": 1.105277701022382, + "grad_norm": 0.09638939052820206, + "learning_rate": 0.00027058840011207617, + "loss": 1.095, + "step": 8000 + }, + { + "epoch": 1.1190936722851617, + "grad_norm": 0.08098015189170837, + "learning_rate": 0.00027016811431773604, + "loss": 1.0969, + "step": 8100 + }, + { + "epoch": 1.1329096435479413, + "grad_norm": 0.10837887227535248, + "learning_rate": 0.00026974782852339586, + "loss": 1.096, + "step": 8200 + }, + { + "epoch": 1.1467256148107212, + "grad_norm": 0.05644046515226364, + "learning_rate": 0.00026932754272905574, + "loss": 1.0944, + "step": 8300 + }, + { + "epoch": 1.1605415860735009, + "grad_norm": 0.12965446710586548, + "learning_rate": 0.00026890725693471556, + "loss": 1.0953, + "step": 8400 + }, + { + "epoch": 1.1743575573362808, + "grad_norm": 0.12333771586418152, + "learning_rate": 0.00026848697114037544, + "loss": 1.095, + "step": 8500 + }, + { + "epoch": 1.1881735285990604, + "grad_norm": 0.1270703673362732, + "learning_rate": 0.0002680666853460353, + "loss": 1.0929, + "step": 8600 + }, + { + "epoch": 1.2019894998618403, + "grad_norm": 0.16918766498565674, + "learning_rate": 0.00026764639955169513, + "loss": 1.0918, + "step": 8700 + }, + { + "epoch": 1.21580547112462, + "grad_norm": 0.08776108920574188, + "learning_rate": 0.00026722611375735496, + "loss": 1.0952, + "step": 8800 + }, + { + "epoch": 1.2296214423874, + "grad_norm": 0.08252176642417908, + "learning_rate": 0.00026680582796301483, + "loss": 1.09, + "step": 8900 + }, + { + "epoch": 1.2434374136501796, + "grad_norm": 0.16331979632377625, + "learning_rate": 0.00026638554216867465, + "loss": 1.0898, + "step": 9000 + }, + { + "epoch": 1.2572533849129595, + "grad_norm": 0.17065368592739105, + "learning_rate": 0.00026596525637433453, + "loss": 1.0907, + "step": 9100 + }, + { + "epoch": 1.2710693561757391, + "grad_norm": 0.12038784474134445, + "learning_rate": 0.00026554497057999435, + "loss": 1.0856, + "step": 9200 + }, + { + "epoch": 1.284885327438519, + "grad_norm": 0.11924347281455994, + "learning_rate": 0.0002651246847856542, + "loss": 1.0895, + "step": 9300 + }, + { + "epoch": 1.2987012987012987, + "grad_norm": 0.1443828046321869, + "learning_rate": 0.0002647043989913141, + "loss": 1.0874, + "step": 9400 + }, + { + "epoch": 1.3125172699640784, + "grad_norm": 0.14472317695617676, + "learning_rate": 0.0002642841131969739, + "loss": 1.0879, + "step": 9500 + }, + { + "epoch": 1.3263332412268583, + "grad_norm": 0.15847088396549225, + "learning_rate": 0.00026386382740263374, + "loss": 1.0873, + "step": 9600 + }, + { + "epoch": 1.3401492124896381, + "grad_norm": 0.17960332334041595, + "learning_rate": 0.0002634435416082936, + "loss": 1.0887, + "step": 9700 + }, + { + "epoch": 1.3539651837524178, + "grad_norm": 0.1566227227449417, + "learning_rate": 0.00026302325581395344, + "loss": 1.0884, + "step": 9800 + }, + { + "epoch": 1.3677811550151975, + "grad_norm": 0.1431213617324829, + "learning_rate": 0.0002626029700196133, + "loss": 1.0864, + "step": 9900 + }, + { + "epoch": 1.3815971262779774, + "grad_norm": 0.10321222990751266, + "learning_rate": 0.0002621826842252732, + "loss": 1.0835, + "step": 10000 + }, + { + "epoch": 1.3815971262779774, + "eval_accuracy": 0.49913821881815945, + "eval_loss": 1.081355094909668, + "eval_runtime": 748.8314, + "eval_samples_per_second": 274.91, + "eval_steps_per_second": 8.592, + "step": 10000 + }, + { + "epoch": 1.395413097540757, + "grad_norm": 0.10260605067014694, + "learning_rate": 0.0002617666012888764, + "loss": 1.0843, + "step": 10100 + }, + { + 
"epoch": 1.409229068803537, + "grad_norm": 0.1076885387301445, + "learning_rate": 0.0002613463154945363, + "loss": 1.0845, + "step": 10200 + }, + { + "epoch": 1.4230450400663166, + "grad_norm": 0.0723571702837944, + "learning_rate": 0.0002609260297001961, + "loss": 1.0814, + "step": 10300 + }, + { + "epoch": 1.4368610113290965, + "grad_norm": 0.10695687681436539, + "learning_rate": 0.00026050574390585593, + "loss": 1.0842, + "step": 10400 + }, + { + "epoch": 1.4506769825918762, + "grad_norm": 0.11008185893297195, + "learning_rate": 0.0002600854581115158, + "loss": 1.0832, + "step": 10500 + }, + { + "epoch": 1.464492953854656, + "grad_norm": 0.12239653617143631, + "learning_rate": 0.0002596651723171756, + "loss": 1.0813, + "step": 10600 + }, + { + "epoch": 1.4783089251174357, + "grad_norm": 0.11045056581497192, + "learning_rate": 0.0002592448865228355, + "loss": 1.0848, + "step": 10700 + }, + { + "epoch": 1.4921248963802154, + "grad_norm": 0.07234488427639008, + "learning_rate": 0.0002588246007284954, + "loss": 1.0826, + "step": 10800 + }, + { + "epoch": 1.5059408676429953, + "grad_norm": 0.11086778342723846, + "learning_rate": 0.0002584043149341552, + "loss": 1.0804, + "step": 10900 + }, + { + "epoch": 1.5197568389057752, + "grad_norm": 0.10693442821502686, + "learning_rate": 0.0002579840291398151, + "loss": 1.0784, + "step": 11000 + }, + { + "epoch": 1.5335728101685548, + "grad_norm": 0.11604110896587372, + "learning_rate": 0.0002575637433454749, + "loss": 1.0792, + "step": 11100 + }, + { + "epoch": 1.5473887814313345, + "grad_norm": 0.0809662714600563, + "learning_rate": 0.0002571434575511347, + "loss": 1.083, + "step": 11200 + }, + { + "epoch": 1.5612047526941144, + "grad_norm": 0.1850002408027649, + "learning_rate": 0.0002567231717567946, + "loss": 1.0802, + "step": 11300 + }, + { + "epoch": 1.5750207239568943, + "grad_norm": 0.0779227465391159, + "learning_rate": 0.0002563028859624544, + "loss": 1.0811, + "step": 11400 + }, + { + "epoch": 1.588836695219674, + "grad_norm": 0.16764625906944275, + "learning_rate": 0.0002558826001681143, + "loss": 1.0763, + "step": 11500 + }, + { + "epoch": 1.6026526664824536, + "grad_norm": 0.11104313284158707, + "learning_rate": 0.00025546231437377417, + "loss": 1.0782, + "step": 11600 + }, + { + "epoch": 1.6164686377452335, + "grad_norm": 0.16667212545871735, + "learning_rate": 0.000255042028579434, + "loss": 1.0781, + "step": 11700 + }, + { + "epoch": 1.6302846090080134, + "grad_norm": 0.2246047705411911, + "learning_rate": 0.00025462174278509386, + "loss": 1.08, + "step": 11800 + }, + { + "epoch": 1.644100580270793, + "grad_norm": 0.2305343896150589, + "learning_rate": 0.0002542014569907537, + "loss": 1.0756, + "step": 11900 + }, + { + "epoch": 1.6579165515335728, + "grad_norm": 0.13618823885917664, + "learning_rate": 0.0002537811711964135, + "loss": 1.076, + "step": 12000 + }, + { + "epoch": 1.6717325227963524, + "grad_norm": 0.15795475244522095, + "learning_rate": 0.0002533608854020734, + "loss": 1.0749, + "step": 12100 + }, + { + "epoch": 1.6855484940591323, + "grad_norm": 0.20267115533351898, + "learning_rate": 0.00025294480246567665, + "loss": 1.077, + "step": 12200 + }, + { + "epoch": 1.6993644653219122, + "grad_norm": 0.08052489906549454, + "learning_rate": 0.0002525245166713365, + "loss": 1.073, + "step": 12300 + }, + { + "epoch": 1.7131804365846919, + "grad_norm": 0.11914093047380447, + "learning_rate": 0.00025210423087699635, + "loss": 1.0755, + "step": 12400 + }, + { + "epoch": 1.7269964078474715, + "grad_norm": 0.12703542411327362, + 
"learning_rate": 0.00025168394508265617, + "loss": 1.0765, + "step": 12500 + }, + { + "epoch": 1.7408123791102514, + "grad_norm": 0.12948518991470337, + "learning_rate": 0.00025126365928831605, + "loss": 1.0748, + "step": 12600 + }, + { + "epoch": 1.7546283503730313, + "grad_norm": 0.1027710810303688, + "learning_rate": 0.00025084337349397587, + "loss": 1.0745, + "step": 12700 + }, + { + "epoch": 1.768444321635811, + "grad_norm": 0.20131652057170868, + "learning_rate": 0.0002504230876996357, + "loss": 1.0731, + "step": 12800 + }, + { + "epoch": 1.7822602928985907, + "grad_norm": 0.0673370212316513, + "learning_rate": 0.00025000280190529557, + "loss": 1.0721, + "step": 12900 + }, + { + "epoch": 1.7960762641613706, + "grad_norm": 0.10322799533605576, + "learning_rate": 0.00024958251611095544, + "loss": 1.0731, + "step": 13000 + }, + { + "epoch": 1.8098922354241505, + "grad_norm": 0.08498311042785645, + "learning_rate": 0.00024916223031661526, + "loss": 1.0722, + "step": 13100 + }, + { + "epoch": 1.8237082066869301, + "grad_norm": 0.07025079429149628, + "learning_rate": 0.00024874194452227514, + "loss": 1.0725, + "step": 13200 + }, + { + "epoch": 1.8375241779497098, + "grad_norm": 0.13933932781219482, + "learning_rate": 0.00024832165872793496, + "loss": 1.0714, + "step": 13300 + }, + { + "epoch": 1.8513401492124897, + "grad_norm": 0.10513993352651596, + "learning_rate": 0.00024790137293359484, + "loss": 1.0725, + "step": 13400 + }, + { + "epoch": 1.8651561204752696, + "grad_norm": 0.1704607903957367, + "learning_rate": 0.0002474810871392547, + "loss": 1.0712, + "step": 13500 + }, + { + "epoch": 1.8789720917380492, + "grad_norm": 0.08315689861774445, + "learning_rate": 0.0002470608013449145, + "loss": 1.0697, + "step": 13600 + }, + { + "epoch": 1.892788063000829, + "grad_norm": 0.09900273382663727, + "learning_rate": 0.00024664051555057436, + "loss": 1.0735, + "step": 13700 + }, + { + "epoch": 1.9066040342636086, + "grad_norm": 0.05560864508152008, + "learning_rate": 0.00024622022975623423, + "loss": 1.0711, + "step": 13800 + }, + { + "epoch": 1.9204200055263885, + "grad_norm": 0.13863462209701538, + "learning_rate": 0.00024579994396189405, + "loss": 1.0681, + "step": 13900 + }, + { + "epoch": 1.9342359767891684, + "grad_norm": 0.07841744273900986, + "learning_rate": 0.00024537965816755393, + "loss": 1.0711, + "step": 14000 + }, + { + "epoch": 1.948051948051948, + "grad_norm": 0.058312736451625824, + "learning_rate": 0.00024495937237321375, + "loss": 1.0709, + "step": 14100 + }, + { + "epoch": 1.9618679193147277, + "grad_norm": 0.11208023875951767, + "learning_rate": 0.000244543289436817, + "loss": 1.0686, + "step": 14200 + }, + { + "epoch": 1.9756838905775076, + "grad_norm": 0.10133163630962372, + "learning_rate": 0.00024412300364247687, + "loss": 1.0683, + "step": 14300 + }, + { + "epoch": 1.9894998618402875, + "grad_norm": 0.08370282500982285, + "learning_rate": 0.0002437027178481367, + "loss": 1.0709, + "step": 14400 + }, + { + "epoch": 2.003315833103067, + "grad_norm": 0.09476770460605621, + "learning_rate": 0.00024328243205379654, + "loss": 1.0697, + "step": 14500 + }, + { + "epoch": 2.017131804365847, + "grad_norm": 0.0733637660741806, + "learning_rate": 0.0002428621462594564, + "loss": 1.0681, + "step": 14600 + }, + { + "epoch": 2.0309477756286265, + "grad_norm": 0.09925834089517593, + "learning_rate": 0.00024244186046511627, + "loss": 1.0702, + "step": 14700 + }, + { + "epoch": 2.0447637468914066, + "grad_norm": 0.15911750495433807, + "learning_rate": 0.00024202157467077611, + "loss": 
1.0665, + "step": 14800 + }, + { + "epoch": 2.0585797181541863, + "grad_norm": 0.13638247549533844, + "learning_rate": 0.00024160128887643596, + "loss": 1.0696, + "step": 14900 + }, + { + "epoch": 2.072395689416966, + "grad_norm": 0.16883982717990875, + "learning_rate": 0.0002411810030820958, + "loss": 1.0641, + "step": 15000 + }, + { + "epoch": 2.072395689416966, + "eval_accuracy": 0.5102966510685876, + "eval_loss": 1.0638896226882935, + "eval_runtime": 924.2494, + "eval_samples_per_second": 222.733, + "eval_steps_per_second": 6.961, + "step": 15000 + }, + { + "epoch": 2.0862116606797456, + "grad_norm": 0.09925784170627594, + "learning_rate": 0.00024076071728775566, + "loss": 1.0683, + "step": 15100 + }, + { + "epoch": 2.1000276319425257, + "grad_norm": 0.06180203706026077, + "learning_rate": 0.00024034043149341548, + "loss": 1.066, + "step": 15200 + }, + { + "epoch": 2.1138436032053054, + "grad_norm": 0.10063247382640839, + "learning_rate": 0.00023992014569907533, + "loss": 1.0668, + "step": 15300 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 0.11476041376590729, + "learning_rate": 0.0002394998599047352, + "loss": 1.0644, + "step": 15400 + }, + { + "epoch": 2.1414755457308647, + "grad_norm": 0.11798429489135742, + "learning_rate": 0.00023907957411039505, + "loss": 1.0626, + "step": 15500 + }, + { + "epoch": 2.155291516993645, + "grad_norm": 0.13165287673473358, + "learning_rate": 0.0002386592883160549, + "loss": 1.0648, + "step": 15600 + }, + { + "epoch": 2.1691074882564245, + "grad_norm": 0.1705123484134674, + "learning_rate": 0.00023823900252171475, + "loss": 1.0639, + "step": 15700 + }, + { + "epoch": 2.182923459519204, + "grad_norm": 0.13375049829483032, + "learning_rate": 0.0002378187167273746, + "loss": 1.062, + "step": 15800 + }, + { + "epoch": 2.196739430781984, + "grad_norm": 0.09405038505792618, + "learning_rate": 0.00023739843093303445, + "loss": 1.0634, + "step": 15900 + }, + { + "epoch": 2.210555402044764, + "grad_norm": 0.11285752803087234, + "learning_rate": 0.00023697814513869427, + "loss": 1.0667, + "step": 16000 + }, + { + "epoch": 2.2243713733075436, + "grad_norm": 0.12377699464559555, + "learning_rate": 0.00023655785934435412, + "loss": 1.064, + "step": 16100 + }, + { + "epoch": 2.2381873445703233, + "grad_norm": 0.0979316234588623, + "learning_rate": 0.000236137573550014, + "loss": 1.0621, + "step": 16200 + }, + { + "epoch": 2.252003315833103, + "grad_norm": 0.11494515091180801, + "learning_rate": 0.00023572149061361724, + "loss": 1.0645, + "step": 16300 + }, + { + "epoch": 2.2658192870958827, + "grad_norm": 0.07066236436367035, + "learning_rate": 0.0002353012048192771, + "loss": 1.063, + "step": 16400 + }, + { + "epoch": 2.2796352583586628, + "grad_norm": 0.08686563372612, + "learning_rate": 0.00023488091902493694, + "loss": 1.066, + "step": 16500 + }, + { + "epoch": 2.2934512296214424, + "grad_norm": 0.058148209005594254, + "learning_rate": 0.00023446063323059678, + "loss": 1.0643, + "step": 16600 + }, + { + "epoch": 2.307267200884222, + "grad_norm": 0.14033359289169312, + "learning_rate": 0.00023404034743625666, + "loss": 1.0634, + "step": 16700 + }, + { + "epoch": 2.3210831721470018, + "grad_norm": 0.09940097481012344, + "learning_rate": 0.00023362006164191645, + "loss": 1.0629, + "step": 16800 + }, + { + "epoch": 2.334899143409782, + "grad_norm": 0.08228994905948639, + "learning_rate": 0.00023319977584757633, + "loss": 1.0626, + "step": 16900 + }, + { + "epoch": 2.3487151146725616, + "grad_norm": 0.05418753623962402, + "learning_rate": 
0.00023277949005323618, + "loss": 1.0611, + "step": 17000 + }, + { + "epoch": 2.3625310859353412, + "grad_norm": 0.09691222757101059, + "learning_rate": 0.00023235920425889603, + "loss": 1.0626, + "step": 17100 + }, + { + "epoch": 2.376347057198121, + "grad_norm": 0.1607312560081482, + "learning_rate": 0.00023193891846455588, + "loss": 1.0623, + "step": 17200 + }, + { + "epoch": 2.3901630284609006, + "grad_norm": 0.1193649098277092, + "learning_rate": 0.00023151863267021572, + "loss": 1.0627, + "step": 17300 + }, + { + "epoch": 2.4039789997236807, + "grad_norm": 0.05427398905158043, + "learning_rate": 0.00023109834687587557, + "loss": 1.0609, + "step": 17400 + }, + { + "epoch": 2.4177949709864603, + "grad_norm": 0.10591702163219452, + "learning_rate": 0.00023067806108153545, + "loss": 1.0637, + "step": 17500 + }, + { + "epoch": 2.43161094224924, + "grad_norm": 0.057032886892557144, + "learning_rate": 0.00023025777528719524, + "loss": 1.0612, + "step": 17600 + }, + { + "epoch": 2.44542691351202, + "grad_norm": 0.08455175161361694, + "learning_rate": 0.00022983748949285512, + "loss": 1.0606, + "step": 17700 + }, + { + "epoch": 2.4592428847748, + "grad_norm": 0.13975144922733307, + "learning_rate": 0.00022941720369851497, + "loss": 1.0624, + "step": 17800 + }, + { + "epoch": 2.4730588560375795, + "grad_norm": 0.11535393446683884, + "learning_rate": 0.00022899691790417482, + "loss": 1.0603, + "step": 17900 + }, + { + "epoch": 2.486874827300359, + "grad_norm": 0.10047648102045059, + "learning_rate": 0.00022857663210983466, + "loss": 1.0607, + "step": 18000 + }, + { + "epoch": 2.500690798563139, + "grad_norm": 0.08474704623222351, + "learning_rate": 0.0002281563463154945, + "loss": 1.062, + "step": 18100 + }, + { + "epoch": 2.514506769825919, + "grad_norm": 0.15308576822280884, + "learning_rate": 0.00022773606052115436, + "loss": 1.0603, + "step": 18200 + }, + { + "epoch": 2.5283227410886986, + "grad_norm": 0.05684039369225502, + "learning_rate": 0.00022731577472681424, + "loss": 1.0589, + "step": 18300 + }, + { + "epoch": 2.5421387123514783, + "grad_norm": 0.10712555050849915, + "learning_rate": 0.00022689548893247409, + "loss": 1.0592, + "step": 18400 + }, + { + "epoch": 2.555954683614258, + "grad_norm": 0.0800655260682106, + "learning_rate": 0.0002264794059960773, + "loss": 1.0603, + "step": 18500 + }, + { + "epoch": 2.569770654877038, + "grad_norm": 0.05980188027024269, + "learning_rate": 0.00022605912020173715, + "loss": 1.0608, + "step": 18600 + }, + { + "epoch": 2.5835866261398177, + "grad_norm": 0.052051473408937454, + "learning_rate": 0.000225638834407397, + "loss": 1.0603, + "step": 18700 + }, + { + "epoch": 2.5974025974025974, + "grad_norm": 0.11966883391141891, + "learning_rate": 0.00022521854861305685, + "loss": 1.057, + "step": 18800 + }, + { + "epoch": 2.611218568665377, + "grad_norm": 0.08861220628023148, + "learning_rate": 0.00022479826281871673, + "loss": 1.0603, + "step": 18900 + }, + { + "epoch": 2.6250345399281567, + "grad_norm": 0.12264814227819443, + "learning_rate": 0.00022437797702437657, + "loss": 1.0602, + "step": 19000 + }, + { + "epoch": 2.638850511190937, + "grad_norm": 0.08384163677692413, + "learning_rate": 0.00022395769123003642, + "loss": 1.057, + "step": 19100 + }, + { + "epoch": 2.6526664824537165, + "grad_norm": 0.11168386787176132, + "learning_rate": 0.00022353740543569624, + "loss": 1.0572, + "step": 19200 + }, + { + "epoch": 2.666482453716496, + "grad_norm": 0.12558519840240479, + "learning_rate": 0.0002231171196413561, + "loss": 1.0592, + "step": 19300 + 
}, + { + "epoch": 2.6802984249792763, + "grad_norm": 0.06810207664966583, + "learning_rate": 0.00022269683384701594, + "loss": 1.055, + "step": 19400 + }, + { + "epoch": 2.694114396242056, + "grad_norm": 0.16571113467216492, + "learning_rate": 0.0002222765480526758, + "loss": 1.0599, + "step": 19500 + }, + { + "epoch": 2.7079303675048356, + "grad_norm": 0.07613151520490646, + "learning_rate": 0.00022185626225833564, + "loss": 1.0564, + "step": 19600 + }, + { + "epoch": 2.7217463387676153, + "grad_norm": 0.08713393658399582, + "learning_rate": 0.00022143597646399551, + "loss": 1.0582, + "step": 19700 + }, + { + "epoch": 2.735562310030395, + "grad_norm": 0.11707925796508789, + "learning_rate": 0.00022101569066965536, + "loss": 1.056, + "step": 19800 + }, + { + "epoch": 2.749378281293175, + "grad_norm": 0.1053171455860138, + "learning_rate": 0.0002205954048753152, + "loss": 1.0608, + "step": 19900 + }, + { + "epoch": 2.7631942525559547, + "grad_norm": 0.056531500071287155, + "learning_rate": 0.00022017511908097506, + "loss": 1.0563, + "step": 20000 + }, + { + "epoch": 2.7631942525559547, + "eval_accuracy": 0.516310033016185, + "eval_loss": 1.054749608039856, + "eval_runtime": 731.5154, + "eval_samples_per_second": 281.417, + "eval_steps_per_second": 8.795, + "step": 20000 + }, + { + "epoch": 2.7770102238187344, + "grad_norm": 0.10811367630958557, + "learning_rate": 0.00021975483328663488, + "loss": 1.0556, + "step": 20100 + }, + { + "epoch": 2.790826195081514, + "grad_norm": 0.06601472198963165, + "learning_rate": 0.00021933454749229473, + "loss": 1.0578, + "step": 20200 + }, + { + "epoch": 2.804642166344294, + "grad_norm": 0.06906837224960327, + "learning_rate": 0.00021891426169795458, + "loss": 1.06, + "step": 20300 + }, + { + "epoch": 2.818458137607074, + "grad_norm": 0.08911406248807907, + "learning_rate": 0.00021849397590361443, + "loss": 1.0583, + "step": 20400 + }, + { + "epoch": 2.8322741088698535, + "grad_norm": 0.06497912108898163, + "learning_rate": 0.0002180778929672177, + "loss": 1.0575, + "step": 20500 + }, + { + "epoch": 2.846090080132633, + "grad_norm": 0.0886107012629509, + "learning_rate": 0.00021765760717287755, + "loss": 1.0552, + "step": 20600 + }, + { + "epoch": 2.859906051395413, + "grad_norm": 0.05942055955529213, + "learning_rate": 0.0002172373213785374, + "loss": 1.0533, + "step": 20700 + }, + { + "epoch": 2.873722022658193, + "grad_norm": 0.13015809655189514, + "learning_rate": 0.00021681703558419725, + "loss": 1.0549, + "step": 20800 + }, + { + "epoch": 2.8875379939209727, + "grad_norm": 0.06085093691945076, + "learning_rate": 0.00021639674978985707, + "loss": 1.057, + "step": 20900 + }, + { + "epoch": 2.9013539651837523, + "grad_norm": 0.17039401829242706, + "learning_rate": 0.00021597646399551692, + "loss": 1.0571, + "step": 21000 + }, + { + "epoch": 2.9151699364465324, + "grad_norm": 0.07950026541948318, + "learning_rate": 0.00021555617820117676, + "loss": 1.0535, + "step": 21100 + }, + { + "epoch": 2.928985907709312, + "grad_norm": 0.1195695698261261, + "learning_rate": 0.00021513589240683664, + "loss": 1.0535, + "step": 21200 + }, + { + "epoch": 2.942801878972092, + "grad_norm": 0.0896124541759491, + "learning_rate": 0.0002147156066124965, + "loss": 1.0534, + "step": 21300 + }, + { + "epoch": 2.9566178502348714, + "grad_norm": 0.07629978656768799, + "learning_rate": 0.00021429532081815634, + "loss": 1.0564, + "step": 21400 + }, + { + "epoch": 2.970433821497651, + "grad_norm": 0.07431907206773758, + "learning_rate": 0.00021387503502381618, + "loss": 1.0559, + 
"step": 21500 + }, + { + "epoch": 2.984249792760431, + "grad_norm": 0.0771278440952301, + "learning_rate": 0.00021345474922947603, + "loss": 1.0562, + "step": 21600 + }, + { + "epoch": 2.998065764023211, + "grad_norm": 0.11643990874290466, + "learning_rate": 0.00021303446343513585, + "loss": 1.0525, + "step": 21700 + }, + { + "epoch": 3.0118817352859906, + "grad_norm": 0.058162059634923935, + "learning_rate": 0.0002126141776407957, + "loss": 1.0509, + "step": 21800 + }, + { + "epoch": 3.0256977065487702, + "grad_norm": 0.12037301808595657, + "learning_rate": 0.00021219389184645558, + "loss": 1.0513, + "step": 21900 + }, + { + "epoch": 3.0395136778115504, + "grad_norm": 0.052515506744384766, + "learning_rate": 0.00021177360605211543, + "loss": 1.051, + "step": 22000 + }, + { + "epoch": 3.05332964907433, + "grad_norm": 0.10646827518939972, + "learning_rate": 0.00021135332025777528, + "loss": 1.0542, + "step": 22100 + }, + { + "epoch": 3.0671456203371097, + "grad_norm": 0.1113181784749031, + "learning_rate": 0.00021093303446343512, + "loss": 1.0531, + "step": 22200 + }, + { + "epoch": 3.0809615915998894, + "grad_norm": 0.07355222851037979, + "learning_rate": 0.00021051274866909497, + "loss": 1.0524, + "step": 22300 + }, + { + "epoch": 3.094777562862669, + "grad_norm": 0.06925370544195175, + "learning_rate": 0.00021009246287475482, + "loss": 1.0535, + "step": 22400 + }, + { + "epoch": 3.108593534125449, + "grad_norm": 0.048475924879312515, + "learning_rate": 0.00020967217708041464, + "loss": 1.0564, + "step": 22500 + }, + { + "epoch": 3.122409505388229, + "grad_norm": 0.08578319102525711, + "learning_rate": 0.0002092518912860745, + "loss": 1.0519, + "step": 22600 + }, + { + "epoch": 3.1362254766510085, + "grad_norm": 0.08585724979639053, + "learning_rate": 0.00020883160549173437, + "loss": 1.0525, + "step": 22700 + }, + { + "epoch": 3.150041447913788, + "grad_norm": 0.06518802791833878, + "learning_rate": 0.00020841131969739422, + "loss": 1.0543, + "step": 22800 + }, + { + "epoch": 3.1638574191765683, + "grad_norm": 0.046030618250370026, + "learning_rate": 0.00020799103390305406, + "loss": 1.0525, + "step": 22900 + }, + { + "epoch": 3.177673390439348, + "grad_norm": 0.04972764104604721, + "learning_rate": 0.0002075707481087139, + "loss": 1.0512, + "step": 23000 + }, + { + "epoch": 3.1914893617021276, + "grad_norm": 0.11977583914995193, + "learning_rate": 0.00020715046231437376, + "loss": 1.052, + "step": 23100 + }, + { + "epoch": 3.2053053329649073, + "grad_norm": 0.08040472120046616, + "learning_rate": 0.0002067301765200336, + "loss": 1.0491, + "step": 23200 + }, + { + "epoch": 3.2191213042276874, + "grad_norm": 0.10473213344812393, + "learning_rate": 0.00020630989072569343, + "loss": 1.0525, + "step": 23300 + }, + { + "epoch": 3.232937275490467, + "grad_norm": 0.0790744498372078, + "learning_rate": 0.00020588960493135328, + "loss": 1.0508, + "step": 23400 + }, + { + "epoch": 3.2467532467532467, + "grad_norm": 0.12807689607143402, + "learning_rate": 0.00020547352199495655, + "loss": 1.0485, + "step": 23500 + }, + { + "epoch": 3.2605692180160264, + "grad_norm": 0.10298227518796921, + "learning_rate": 0.0002050532362006164, + "loss": 1.049, + "step": 23600 + }, + { + "epoch": 3.2743851892788065, + "grad_norm": 0.11504103243350983, + "learning_rate": 0.00020463295040627625, + "loss": 1.0511, + "step": 23700 + }, + { + "epoch": 3.288201160541586, + "grad_norm": 0.05548229441046715, + "learning_rate": 0.0002042126646119361, + "loss": 1.0499, + "step": 23800 + }, + { + "epoch": 3.302017131804366, + 
"grad_norm": 0.06242981553077698, + "learning_rate": 0.00020379237881759595, + "loss": 1.0543, + "step": 23900 + }, + { + "epoch": 3.3158331030671455, + "grad_norm": 0.12101748585700989, + "learning_rate": 0.00020337209302325582, + "loss": 1.0482, + "step": 24000 + }, + { + "epoch": 3.329649074329925, + "grad_norm": 0.09176388382911682, + "learning_rate": 0.00020295180722891562, + "loss": 1.0514, + "step": 24100 + }, + { + "epoch": 3.3434650455927053, + "grad_norm": 0.08758760988712311, + "learning_rate": 0.0002025315214345755, + "loss": 1.0505, + "step": 24200 + }, + { + "epoch": 3.357281016855485, + "grad_norm": 0.06818066537380219, + "learning_rate": 0.00020211123564023534, + "loss": 1.0511, + "step": 24300 + }, + { + "epoch": 3.3710969881182646, + "grad_norm": 0.10384306311607361, + "learning_rate": 0.0002016909498458952, + "loss": 1.0513, + "step": 24400 + }, + { + "epoch": 3.3849129593810443, + "grad_norm": 0.12452493607997894, + "learning_rate": 0.00020127066405155504, + "loss": 1.0502, + "step": 24500 + }, + { + "epoch": 3.3987289306438244, + "grad_norm": 0.07460072636604309, + "learning_rate": 0.0002008503782572149, + "loss": 1.0526, + "step": 24600 + }, + { + "epoch": 3.412544901906604, + "grad_norm": 0.1017543151974678, + "learning_rate": 0.00020043009246287474, + "loss": 1.0501, + "step": 24700 + }, + { + "epoch": 3.4263608731693838, + "grad_norm": 0.0900358185172081, + "learning_rate": 0.0002000098066685346, + "loss": 1.0512, + "step": 24800 + }, + { + "epoch": 3.4401768444321634, + "grad_norm": 0.10934050381183624, + "learning_rate": 0.00019958952087419443, + "loss": 1.0495, + "step": 24900 + }, + { + "epoch": 3.4539928156949435, + "grad_norm": 0.0656353011727333, + "learning_rate": 0.00019916923507985428, + "loss": 1.0504, + "step": 25000 + }, + { + "epoch": 3.4539928156949435, + "eval_accuracy": 0.520419659075542, + "eval_loss": 1.0485948324203491, + "eval_runtime": 728.0613, + "eval_samples_per_second": 282.752, + "eval_steps_per_second": 8.837, + "step": 25000 + }, + { + "epoch": 3.467808786957723, + "grad_norm": 0.07246037572622299, + "learning_rate": 0.00019874894928551413, + "loss": 1.0493, + "step": 25100 + }, + { + "epoch": 3.481624758220503, + "grad_norm": 0.14033739268779755, + "learning_rate": 0.00019832866349117398, + "loss": 1.05, + "step": 25200 + }, + { + "epoch": 3.4954407294832825, + "grad_norm": 0.05688853561878204, + "learning_rate": 0.00019790837769683383, + "loss": 1.0509, + "step": 25300 + }, + { + "epoch": 3.5092567007460627, + "grad_norm": 0.053916674107313156, + "learning_rate": 0.00019748809190249368, + "loss": 1.0503, + "step": 25400 + }, + { + "epoch": 3.5230726720088423, + "grad_norm": 0.12233688682317734, + "learning_rate": 0.00019706780610815352, + "loss": 1.05, + "step": 25500 + }, + { + "epoch": 3.536888643271622, + "grad_norm": 0.10314755886793137, + "learning_rate": 0.0001966475203138134, + "loss": 1.0501, + "step": 25600 + }, + { + "epoch": 3.5507046145344017, + "grad_norm": 0.05037887394428253, + "learning_rate": 0.00019623143737741662, + "loss": 1.0468, + "step": 25700 + }, + { + "epoch": 3.5645205857971813, + "grad_norm": 0.13344399631023407, + "learning_rate": 0.00019581115158307647, + "loss": 1.0477, + "step": 25800 + }, + { + "epoch": 3.5783365570599615, + "grad_norm": 0.07191654294729233, + "learning_rate": 0.00019539086578873632, + "loss": 1.0498, + "step": 25900 + }, + { + "epoch": 3.592152528322741, + "grad_norm": 0.05592725798487663, + "learning_rate": 0.00019497057999439616, + "loss": 1.0506, + "step": 26000 + }, + { + "epoch": 
3.605968499585521, + "grad_norm": 0.10346696525812149, + "learning_rate": 0.000194550294200056, + "loss": 1.0499, + "step": 26100 + }, + { + "epoch": 3.619784470848301, + "grad_norm": 0.09233855456113815, + "learning_rate": 0.0001941300084057159, + "loss": 1.0456, + "step": 26200 + }, + { + "epoch": 3.6336004421110806, + "grad_norm": 0.060603220015764236, + "learning_rate": 0.00019370972261137574, + "loss": 1.0475, + "step": 26300 + }, + { + "epoch": 3.6474164133738602, + "grad_norm": 0.11710167676210403, + "learning_rate": 0.00019328943681703559, + "loss": 1.0497, + "step": 26400 + }, + { + "epoch": 3.66123238463664, + "grad_norm": 0.16325397789478302, + "learning_rate": 0.0001928691510226954, + "loss": 1.0487, + "step": 26500 + }, + { + "epoch": 3.6750483558994196, + "grad_norm": 0.08937475085258484, + "learning_rate": 0.00019244886522835526, + "loss": 1.0468, + "step": 26600 + }, + { + "epoch": 3.6888643271621993, + "grad_norm": 0.07486152648925781, + "learning_rate": 0.0001920285794340151, + "loss": 1.0479, + "step": 26700 + }, + { + "epoch": 3.7026802984249794, + "grad_norm": 0.1263752579689026, + "learning_rate": 0.00019160829363967495, + "loss": 1.0449, + "step": 26800 + }, + { + "epoch": 3.716496269687759, + "grad_norm": 0.11803583055734634, + "learning_rate": 0.0001911880078453348, + "loss": 1.0512, + "step": 26900 + }, + { + "epoch": 3.7303122409505387, + "grad_norm": 0.07918773591518402, + "learning_rate": 0.00019076772205099468, + "loss": 1.0486, + "step": 27000 + }, + { + "epoch": 3.744128212213319, + "grad_norm": 0.11923271417617798, + "learning_rate": 0.00019034743625665453, + "loss": 1.0465, + "step": 27100 + }, + { + "epoch": 3.7579441834760985, + "grad_norm": 0.12752223014831543, + "learning_rate": 0.00018992715046231437, + "loss": 1.0472, + "step": 27200 + }, + { + "epoch": 3.771760154738878, + "grad_norm": 0.07391146570444107, + "learning_rate": 0.0001895068646679742, + "loss": 1.0493, + "step": 27300 + }, + { + "epoch": 3.785576126001658, + "grad_norm": 0.06606881320476532, + "learning_rate": 0.00018908657887363404, + "loss": 1.0485, + "step": 27400 + }, + { + "epoch": 3.7993920972644375, + "grad_norm": 0.04949864745140076, + "learning_rate": 0.0001886662930792939, + "loss": 1.0481, + "step": 27500 + }, + { + "epoch": 3.8132080685272176, + "grad_norm": 0.05234380066394806, + "learning_rate": 0.00018824600728495374, + "loss": 1.0476, + "step": 27600 + }, + { + "epoch": 3.8270240397899973, + "grad_norm": 0.04995539411902428, + "learning_rate": 0.0001878257214906136, + "loss": 1.0466, + "step": 27700 + }, + { + "epoch": 3.840840011052777, + "grad_norm": 0.09871330112218857, + "learning_rate": 0.00018740543569627347, + "loss": 1.0501, + "step": 27800 + }, + { + "epoch": 3.8546559823155566, + "grad_norm": 0.06254375725984573, + "learning_rate": 0.00018698514990193331, + "loss": 1.0467, + "step": 27900 + }, + { + "epoch": 3.8684719535783367, + "grad_norm": 0.07971449941396713, + "learning_rate": 0.00018656486410759316, + "loss": 1.0502, + "step": 28000 + }, + { + "epoch": 3.8822879248411164, + "grad_norm": 0.12627951800823212, + "learning_rate": 0.000186144578313253, + "loss": 1.0446, + "step": 28100 + }, + { + "epoch": 3.896103896103896, + "grad_norm": 0.08057064563035965, + "learning_rate": 0.00018572429251891283, + "loss": 1.0468, + "step": 28200 + }, + { + "epoch": 3.9099198673666757, + "grad_norm": 0.0501413568854332, + "learning_rate": 0.00018530400672457268, + "loss": 1.0453, + "step": 28300 + }, + { + "epoch": 3.9237358386294554, + "grad_norm": 0.09999352693557739, + 
"learning_rate": 0.00018488372093023253, + "loss": 1.0502, + "step": 28400 + }, + { + "epoch": 3.9375518098922355, + "grad_norm": 0.12323564291000366, + "learning_rate": 0.00018446343513589238, + "loss": 1.0478, + "step": 28500 + }, + { + "epoch": 3.951367781155015, + "grad_norm": 0.0877193808555603, + "learning_rate": 0.00018404314934155225, + "loss": 1.049, + "step": 28600 + }, + { + "epoch": 3.965183752417795, + "grad_norm": 0.09397170692682266, + "learning_rate": 0.0001836228635472121, + "loss": 1.0474, + "step": 28700 + }, + { + "epoch": 3.978999723680575, + "grad_norm": 0.09532420337200165, + "learning_rate": 0.00018320257775287195, + "loss": 1.0496, + "step": 28800 + }, + { + "epoch": 3.9928156949433546, + "grad_norm": 0.0442403182387352, + "learning_rate": 0.0001827822919585318, + "loss": 1.0466, + "step": 28900 + }, + { + "epoch": 4.006631666206134, + "grad_norm": 0.06309514492750168, + "learning_rate": 0.00018236200616419162, + "loss": 1.0479, + "step": 29000 + }, + { + "epoch": 4.020447637468914, + "grad_norm": 0.06191420555114746, + "learning_rate": 0.00018194172036985147, + "loss": 1.0442, + "step": 29100 + }, + { + "epoch": 4.034263608731694, + "grad_norm": 0.06752864271402359, + "learning_rate": 0.00018152143457551132, + "loss": 1.045, + "step": 29200 + }, + { + "epoch": 4.048079579994473, + "grad_norm": 0.07383009046316147, + "learning_rate": 0.00018110114878117117, + "loss": 1.0429, + "step": 29300 + }, + { + "epoch": 4.061895551257253, + "grad_norm": 0.11942852288484573, + "learning_rate": 0.00018068086298683104, + "loss": 1.0433, + "step": 29400 + }, + { + "epoch": 4.0757115225200335, + "grad_norm": 0.0840003713965416, + "learning_rate": 0.0001802605771924909, + "loss": 1.0434, + "step": 29500 + }, + { + "epoch": 4.089527493782813, + "grad_norm": 0.07768476754426956, + "learning_rate": 0.00017984029139815074, + "loss": 1.0421, + "step": 29600 + }, + { + "epoch": 4.103343465045593, + "grad_norm": 0.07166603952646255, + "learning_rate": 0.00017942420846175398, + "loss": 1.0443, + "step": 29700 + }, + { + "epoch": 4.1171594363083726, + "grad_norm": 0.07380765676498413, + "learning_rate": 0.0001790039226674138, + "loss": 1.0448, + "step": 29800 + }, + { + "epoch": 4.130975407571152, + "grad_norm": 0.1263025552034378, + "learning_rate": 0.00017858363687307365, + "loss": 1.0437, + "step": 29900 + }, + { + "epoch": 4.144791378833932, + "grad_norm": 0.09632286429405212, + "learning_rate": 0.00017816335107873353, + "loss": 1.0439, + "step": 30000 + }, + { + "epoch": 4.144791378833932, + "eval_accuracy": 0.5233148259844476, + "eval_loss": 1.0439139604568481, + "eval_runtime": 787.8404, + "eval_samples_per_second": 261.298, + "eval_steps_per_second": 8.167, + "step": 30000 + }, + { + "epoch": 4.158607350096712, + "grad_norm": 0.09395026415586472, + "learning_rate": 0.00017774306528439338, + "loss": 1.0447, + "step": 30100 + }, + { + "epoch": 4.172423321359491, + "grad_norm": 0.07320912927389145, + "learning_rate": 0.00017732277949005323, + "loss": 1.0477, + "step": 30200 + }, + { + "epoch": 4.186239292622272, + "grad_norm": 0.05703623965382576, + "learning_rate": 0.00017690249369571308, + "loss": 1.0443, + "step": 30300 + }, + { + "epoch": 4.2000552638850515, + "grad_norm": 0.04885410889983177, + "learning_rate": 0.00017648220790137292, + "loss": 1.0467, + "step": 30400 + }, + { + "epoch": 4.213871235147831, + "grad_norm": 0.10649748146533966, + "learning_rate": 0.00017606192210703277, + "loss": 1.0448, + "step": 30500 + }, + { + "epoch": 4.227687206410611, + "grad_norm": 
0.05844441428780556, + "learning_rate": 0.0001756416363126926, + "loss": 1.044, + "step": 30600 + }, + { + "epoch": 4.2415031776733905, + "grad_norm": 0.07287675887346268, + "learning_rate": 0.00017522135051835244, + "loss": 1.0428, + "step": 30700 + }, + { + "epoch": 4.25531914893617, + "grad_norm": 0.05190150439739227, + "learning_rate": 0.00017480106472401232, + "loss": 1.0413, + "step": 30800 + }, + { + "epoch": 4.26913512019895, + "grad_norm": 0.06985218822956085, + "learning_rate": 0.00017438077892967217, + "loss": 1.0455, + "step": 30900 + }, + { + "epoch": 4.2829510914617295, + "grad_norm": 0.06930764764547348, + "learning_rate": 0.00017396049313533202, + "loss": 1.0444, + "step": 31000 + }, + { + "epoch": 4.296767062724509, + "grad_norm": 0.07905230671167374, + "learning_rate": 0.00017354020734099186, + "loss": 1.0445, + "step": 31100 + }, + { + "epoch": 4.31058303398729, + "grad_norm": 0.04994554817676544, + "learning_rate": 0.0001731199215466517, + "loss": 1.0432, + "step": 31200 + }, + { + "epoch": 4.324399005250069, + "grad_norm": 0.08036911487579346, + "learning_rate": 0.00017269963575231156, + "loss": 1.0424, + "step": 31300 + }, + { + "epoch": 4.338214976512849, + "grad_norm": 0.07251475006341934, + "learning_rate": 0.00017227934995797138, + "loss": 1.0465, + "step": 31400 + }, + { + "epoch": 4.352030947775629, + "grad_norm": 0.09622683376073837, + "learning_rate": 0.00017185906416363123, + "loss": 1.0441, + "step": 31500 + }, + { + "epoch": 4.365846919038408, + "grad_norm": 0.07545050978660583, + "learning_rate": 0.0001714387783692911, + "loss": 1.0423, + "step": 31600 + }, + { + "epoch": 4.379662890301188, + "grad_norm": 0.07171428948640823, + "learning_rate": 0.00017102269543289435, + "loss": 1.0434, + "step": 31700 + }, + { + "epoch": 4.393478861563968, + "grad_norm": 0.06658755987882614, + "learning_rate": 0.0001706024096385542, + "loss": 1.0415, + "step": 31800 + }, + { + "epoch": 4.407294832826747, + "grad_norm": 0.10734014213085175, + "learning_rate": 0.00017018212384421405, + "loss": 1.0406, + "step": 31900 + }, + { + "epoch": 4.421110804089528, + "grad_norm": 0.06358776986598969, + "learning_rate": 0.0001697618380498739, + "loss": 1.0405, + "step": 32000 + }, + { + "epoch": 4.434926775352308, + "grad_norm": 0.06078578904271126, + "learning_rate": 0.00016934155225553377, + "loss": 1.0458, + "step": 32100 + }, + { + "epoch": 4.448742746615087, + "grad_norm": 0.09674441814422607, + "learning_rate": 0.000168925469319137, + "loss": 1.0433, + "step": 32200 + }, + { + "epoch": 4.462558717877867, + "grad_norm": 0.11840452253818512, + "learning_rate": 0.00016850518352479684, + "loss": 1.0448, + "step": 32300 + }, + { + "epoch": 4.476374689140647, + "grad_norm": 0.08742488920688629, + "learning_rate": 0.0001680848977304567, + "loss": 1.0409, + "step": 32400 + }, + { + "epoch": 4.490190660403426, + "grad_norm": 0.09082327783107758, + "learning_rate": 0.00016766461193611654, + "loss": 1.0432, + "step": 32500 + }, + { + "epoch": 4.504006631666206, + "grad_norm": 0.06259270012378693, + "learning_rate": 0.0001672443261417764, + "loss": 1.0406, + "step": 32600 + }, + { + "epoch": 4.517822602928986, + "grad_norm": 0.06466669589281082, + "learning_rate": 0.00016682404034743626, + "loss": 1.0404, + "step": 32700 + }, + { + "epoch": 4.531638574191765, + "grad_norm": 0.07167832553386688, + "learning_rate": 0.0001664037545530961, + "loss": 1.0457, + "step": 32800 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.055970191955566406, + "learning_rate": 0.00016598346875875596, + 
"loss": 1.0433, + "step": 32900 + }, + { + "epoch": 4.5592705167173255, + "grad_norm": 0.05038364604115486, + "learning_rate": 0.00016556318296441578, + "loss": 1.0414, + "step": 33000 + }, + { + "epoch": 4.573086487980105, + "grad_norm": 0.11647244542837143, + "learning_rate": 0.00016514289717007563, + "loss": 1.0408, + "step": 33100 + }, + { + "epoch": 4.586902459242885, + "grad_norm": 0.08881094306707382, + "learning_rate": 0.00016472261137573548, + "loss": 1.0468, + "step": 33200 + }, + { + "epoch": 4.6007184305056645, + "grad_norm": 0.0706004872918129, + "learning_rate": 0.00016430232558139533, + "loss": 1.0433, + "step": 33300 + }, + { + "epoch": 4.614534401768444, + "grad_norm": 0.07594550400972366, + "learning_rate": 0.00016388203978705518, + "loss": 1.0401, + "step": 33400 + }, + { + "epoch": 4.628350373031224, + "grad_norm": 0.06709697842597961, + "learning_rate": 0.00016346175399271505, + "loss": 1.0406, + "step": 33500 + }, + { + "epoch": 4.6421663442940035, + "grad_norm": 0.055218733847141266, + "learning_rate": 0.0001630414681983749, + "loss": 1.0439, + "step": 33600 + }, + { + "epoch": 4.655982315556784, + "grad_norm": 0.09484557062387466, + "learning_rate": 0.00016262118240403475, + "loss": 1.0445, + "step": 33700 + }, + { + "epoch": 4.669798286819564, + "grad_norm": 0.08181110769510269, + "learning_rate": 0.00016220089660969457, + "loss": 1.0404, + "step": 33800 + }, + { + "epoch": 4.683614258082343, + "grad_norm": 0.07101566344499588, + "learning_rate": 0.00016178061081535442, + "loss": 1.0418, + "step": 33900 + }, + { + "epoch": 4.697430229345123, + "grad_norm": 0.07521411031484604, + "learning_rate": 0.00016136032502101427, + "loss": 1.0413, + "step": 34000 + }, + { + "epoch": 4.711246200607903, + "grad_norm": 0.06438640505075455, + "learning_rate": 0.00016094003922667412, + "loss": 1.0413, + "step": 34100 + }, + { + "epoch": 4.7250621718706824, + "grad_norm": 0.0852956548333168, + "learning_rate": 0.00016051975343233396, + "loss": 1.0411, + "step": 34200 + }, + { + "epoch": 4.738878143133462, + "grad_norm": 0.041669171303510666, + "learning_rate": 0.00016009946763799384, + "loss": 1.043, + "step": 34300 + }, + { + "epoch": 4.752694114396242, + "grad_norm": 0.07866424322128296, + "learning_rate": 0.0001596791818436537, + "loss": 1.0416, + "step": 34400 + }, + { + "epoch": 4.7665100856590215, + "grad_norm": 0.06820093840360641, + "learning_rate": 0.00015925889604931354, + "loss": 1.0419, + "step": 34500 + }, + { + "epoch": 4.780326056921801, + "grad_norm": 0.08769433945417404, + "learning_rate": 0.00015883861025497336, + "loss": 1.0436, + "step": 34600 + }, + { + "epoch": 4.794142028184582, + "grad_norm": 0.11472765356302261, + "learning_rate": 0.0001584183244606332, + "loss": 1.0448, + "step": 34700 + }, + { + "epoch": 4.807957999447361, + "grad_norm": 0.10286398231983185, + "learning_rate": 0.00015799803866629305, + "loss": 1.0396, + "step": 34800 + }, + { + "epoch": 4.821773970710141, + "grad_norm": 0.08412828296422958, + "learning_rate": 0.0001575777528719529, + "loss": 1.0432, + "step": 34900 + }, + { + "epoch": 4.835589941972921, + "grad_norm": 0.06536369025707245, + "learning_rate": 0.00015715746707761275, + "loss": 1.0425, + "step": 35000 + }, + { + "epoch": 4.835589941972921, + "eval_accuracy": 0.5253784900927014, + "eval_loss": 1.0407328605651855, + "eval_runtime": 804.3369, + "eval_samples_per_second": 255.939, + "eval_steps_per_second": 7.999, + "step": 35000 + }, + { + "epoch": 4.8494059132357, + "grad_norm": 0.05366332083940506, + "learning_rate": 
0.00015673718128327263, + "loss": 1.0401, + "step": 35100 + }, + { + "epoch": 4.86322188449848, + "grad_norm": 0.05627182498574257, + "learning_rate": 0.00015631689548893248, + "loss": 1.0413, + "step": 35200 + }, + { + "epoch": 4.87703785576126, + "grad_norm": 0.06880544126033783, + "learning_rate": 0.00015589660969459232, + "loss": 1.0399, + "step": 35300 + }, + { + "epoch": 4.89085382702404, + "grad_norm": 0.06326279044151306, + "learning_rate": 0.00015547632390025215, + "loss": 1.0424, + "step": 35400 + }, + { + "epoch": 4.90466979828682, + "grad_norm": 0.050615083426237106, + "learning_rate": 0.000155056038105912, + "loss": 1.0419, + "step": 35500 + }, + { + "epoch": 4.9184857695496, + "grad_norm": 0.09092865139245987, + "learning_rate": 0.00015463575231157184, + "loss": 1.0417, + "step": 35600 + }, + { + "epoch": 4.932301740812379, + "grad_norm": 0.10828616470098495, + "learning_rate": 0.0001542154665172317, + "loss": 1.0461, + "step": 35700 + }, + { + "epoch": 4.946117712075159, + "grad_norm": 0.10398013889789581, + "learning_rate": 0.00015379518072289154, + "loss": 1.0402, + "step": 35800 + }, + { + "epoch": 4.959933683337939, + "grad_norm": 0.060978490859270096, + "learning_rate": 0.00015337489492855142, + "loss": 1.0428, + "step": 35900 + }, + { + "epoch": 4.973749654600718, + "grad_norm": 0.09474412351846695, + "learning_rate": 0.00015295460913421126, + "loss": 1.0426, + "step": 36000 + }, + { + "epoch": 4.987565625863498, + "grad_norm": 0.055337630212306976, + "learning_rate": 0.0001525343233398711, + "loss": 1.0424, + "step": 36100 + }, + { + "epoch": 5.001381597126278, + "grad_norm": 0.062282662838697433, + "learning_rate": 0.00015211824040347433, + "loss": 1.0408, + "step": 36200 + }, + { + "epoch": 5.015197568389058, + "grad_norm": 0.08418793976306915, + "learning_rate": 0.00015169795460913418, + "loss": 1.0423, + "step": 36300 + }, + { + "epoch": 5.029013539651838, + "grad_norm": 0.056806761771440506, + "learning_rate": 0.00015127766881479403, + "loss": 1.0397, + "step": 36400 + }, + { + "epoch": 5.0428295109146175, + "grad_norm": 0.050782449543476105, + "learning_rate": 0.0001508573830204539, + "loss": 1.0397, + "step": 36500 + }, + { + "epoch": 5.056645482177397, + "grad_norm": 0.04436805471777916, + "learning_rate": 0.00015043709722611375, + "loss": 1.0372, + "step": 36600 + }, + { + "epoch": 5.070461453440177, + "grad_norm": 0.056697145104408264, + "learning_rate": 0.0001500168114317736, + "loss": 1.0396, + "step": 36700 + }, + { + "epoch": 5.0842774247029565, + "grad_norm": 0.0936078131198883, + "learning_rate": 0.00014959652563743342, + "loss": 1.0366, + "step": 36800 + }, + { + "epoch": 5.098093395965736, + "grad_norm": 0.058340467512607574, + "learning_rate": 0.0001491762398430933, + "loss": 1.038, + "step": 36900 + }, + { + "epoch": 5.111909367228516, + "grad_norm": 0.07920562475919724, + "learning_rate": 0.00014875595404875315, + "loss": 1.0389, + "step": 37000 + }, + { + "epoch": 5.1257253384912955, + "grad_norm": 0.054546140134334564, + "learning_rate": 0.000148335668254413, + "loss": 1.0352, + "step": 37100 + }, + { + "epoch": 5.139541309754076, + "grad_norm": 0.0779619961977005, + "learning_rate": 0.00014791538246007282, + "loss": 1.0362, + "step": 37200 + }, + { + "epoch": 5.153357281016856, + "grad_norm": 0.06077539920806885, + "learning_rate": 0.0001474950966657327, + "loss": 1.0395, + "step": 37300 + }, + { + "epoch": 5.167173252279635, + "grad_norm": 0.07015964388847351, + "learning_rate": 0.00014707481087139254, + "loss": 1.0378, + "step": 37400 + }, + 
{ + "epoch": 5.180989223542415, + "grad_norm": 0.07821048051118851, + "learning_rate": 0.0001466545250770524, + "loss": 1.0358, + "step": 37500 + }, + { + "epoch": 5.194805194805195, + "grad_norm": 0.06446918845176697, + "learning_rate": 0.0001462342392827122, + "loss": 1.0401, + "step": 37600 + }, + { + "epoch": 5.208621166067974, + "grad_norm": 0.0754179060459137, + "learning_rate": 0.0001458139534883721, + "loss": 1.0372, + "step": 37700 + }, + { + "epoch": 5.222437137330754, + "grad_norm": 0.06225774064660072, + "learning_rate": 0.00014539366769403194, + "loss": 1.0396, + "step": 37800 + }, + { + "epoch": 5.236253108593534, + "grad_norm": 0.09567879885435104, + "learning_rate": 0.00014497338189969178, + "loss": 1.0427, + "step": 37900 + }, + { + "epoch": 5.250069079856313, + "grad_norm": 0.0810612216591835, + "learning_rate": 0.00014455309610535163, + "loss": 1.0368, + "step": 38000 + }, + { + "epoch": 5.263885051119094, + "grad_norm": 0.058250732719898224, + "learning_rate": 0.00014413281031101148, + "loss": 1.039, + "step": 38100 + }, + { + "epoch": 5.277701022381874, + "grad_norm": 0.07354842871427536, + "learning_rate": 0.00014371252451667133, + "loss": 1.0393, + "step": 38200 + }, + { + "epoch": 5.291516993644653, + "grad_norm": 0.04756517335772514, + "learning_rate": 0.00014329223872233118, + "loss": 1.0369, + "step": 38300 + }, + { + "epoch": 5.305332964907433, + "grad_norm": 0.05551883205771446, + "learning_rate": 0.00014287195292799103, + "loss": 1.038, + "step": 38400 + }, + { + "epoch": 5.319148936170213, + "grad_norm": 0.05476289987564087, + "learning_rate": 0.00014245166713365088, + "loss": 1.0391, + "step": 38500 + }, + { + "epoch": 5.332964907432992, + "grad_norm": 0.041929882019758224, + "learning_rate": 0.00014203138133931072, + "loss": 1.0377, + "step": 38600 + }, + { + "epoch": 5.346780878695772, + "grad_norm": 0.05916072428226471, + "learning_rate": 0.00014161109554497057, + "loss": 1.0417, + "step": 38700 + }, + { + "epoch": 5.360596849958552, + "grad_norm": 0.0609772689640522, + "learning_rate": 0.00014119080975063042, + "loss": 1.0386, + "step": 38800 + }, + { + "epoch": 5.374412821221332, + "grad_norm": 0.06430498510599136, + "learning_rate": 0.00014077052395629027, + "loss": 1.0397, + "step": 38900 + }, + { + "epoch": 5.388228792484112, + "grad_norm": 0.07042800635099411, + "learning_rate": 0.00014035023816195012, + "loss": 1.038, + "step": 39000 + }, + { + "epoch": 5.402044763746892, + "grad_norm": 0.05623612925410271, + "learning_rate": 0.00013992995236760997, + "loss": 1.0405, + "step": 39100 + }, + { + "epoch": 5.415860735009671, + "grad_norm": 0.04936366528272629, + "learning_rate": 0.00013950966657326982, + "loss": 1.0404, + "step": 39200 + }, + { + "epoch": 5.429676706272451, + "grad_norm": 0.05738508701324463, + "learning_rate": 0.00013908938077892966, + "loss": 1.0364, + "step": 39300 + }, + { + "epoch": 5.443492677535231, + "grad_norm": 0.09567712992429733, + "learning_rate": 0.0001386690949845895, + "loss": 1.0381, + "step": 39400 + }, + { + "epoch": 5.45730864879801, + "grad_norm": 0.07306545972824097, + "learning_rate": 0.00013824880919024936, + "loss": 1.0394, + "step": 39500 + }, + { + "epoch": 5.47112462006079, + "grad_norm": 0.060108475387096405, + "learning_rate": 0.0001378285233959092, + "loss": 1.0379, + "step": 39600 + }, + { + "epoch": 5.48494059132357, + "grad_norm": 0.08150669932365417, + "learning_rate": 0.00013740823760156906, + "loss": 1.0391, + "step": 39700 + }, + { + "epoch": 5.49875656258635, + "grad_norm": 0.06265643239021301, + 
"learning_rate": 0.0001369879518072289, + "loss": 1.0419, + "step": 39800 + }, + { + "epoch": 5.51257253384913, + "grad_norm": 0.09023050218820572, + "learning_rate": 0.00013656766601288876, + "loss": 1.0374, + "step": 39900 + }, + { + "epoch": 5.5263885051119095, + "grad_norm": 0.06600885838270187, + "learning_rate": 0.0001361473802185486, + "loss": 1.0365, + "step": 40000 + }, + { + "epoch": 5.5263885051119095, + "eval_accuracy": 0.52706640122358, + "eval_loss": 1.0380040407180786, + "eval_runtime": 773.4583, + "eval_samples_per_second": 266.157, + "eval_steps_per_second": 8.318, + "step": 40000 + }, + { + "epoch": 5.540204476374689, + "grad_norm": 0.07041644304990768, + "learning_rate": 0.00013572709442420845, + "loss": 1.038, + "step": 40100 + }, + { + "epoch": 5.554020447637469, + "grad_norm": 0.0819341391324997, + "learning_rate": 0.0001353110114878117, + "loss": 1.0383, + "step": 40200 + }, + { + "epoch": 5.5678364189002485, + "grad_norm": 0.04390214383602142, + "learning_rate": 0.00013489072569347155, + "loss": 1.0381, + "step": 40300 + }, + { + "epoch": 5.581652390163028, + "grad_norm": 0.0681944414973259, + "learning_rate": 0.0001344704398991314, + "loss": 1.0368, + "step": 40400 + }, + { + "epoch": 5.595468361425809, + "grad_norm": 0.0888848677277565, + "learning_rate": 0.00013405015410479124, + "loss": 1.0369, + "step": 40500 + }, + { + "epoch": 5.609284332688588, + "grad_norm": 0.07275230437517166, + "learning_rate": 0.0001336298683104511, + "loss": 1.0353, + "step": 40600 + }, + { + "epoch": 5.623100303951368, + "grad_norm": 0.10200846940279007, + "learning_rate": 0.00013320958251611094, + "loss": 1.0381, + "step": 40700 + }, + { + "epoch": 5.636916275214148, + "grad_norm": 0.056480832397937775, + "learning_rate": 0.0001327892967217708, + "loss": 1.0383, + "step": 40800 + }, + { + "epoch": 5.650732246476927, + "grad_norm": 0.0845484584569931, + "learning_rate": 0.00013236901092743064, + "loss": 1.0385, + "step": 40900 + }, + { + "epoch": 5.664548217739707, + "grad_norm": 0.05990500748157501, + "learning_rate": 0.0001319487251330905, + "loss": 1.0381, + "step": 41000 + }, + { + "epoch": 5.678364189002487, + "grad_norm": 0.04566818103194237, + "learning_rate": 0.00013152843933875034, + "loss": 1.0409, + "step": 41100 + }, + { + "epoch": 5.692180160265266, + "grad_norm": 0.05529521405696869, + "learning_rate": 0.00013110815354441018, + "loss": 1.039, + "step": 41200 + }, + { + "epoch": 5.705996131528046, + "grad_norm": 0.08812158554792404, + "learning_rate": 0.00013068786775007003, + "loss": 1.0393, + "step": 41300 + }, + { + "epoch": 5.719812102790826, + "grad_norm": 0.0714721605181694, + "learning_rate": 0.00013026758195572988, + "loss": 1.0365, + "step": 41400 + }, + { + "epoch": 5.733628074053606, + "grad_norm": 0.050889432430267334, + "learning_rate": 0.00012984729616138973, + "loss": 1.0399, + "step": 41500 + }, + { + "epoch": 5.747444045316386, + "grad_norm": 0.05863107368350029, + "learning_rate": 0.00012942701036704958, + "loss": 1.0401, + "step": 41600 + }, + { + "epoch": 5.761260016579166, + "grad_norm": 0.05279000476002693, + "learning_rate": 0.00012900672457270943, + "loss": 1.0368, + "step": 41700 + }, + { + "epoch": 5.775075987841945, + "grad_norm": 0.06430874019861221, + "learning_rate": 0.00012858643877836928, + "loss": 1.0347, + "step": 41800 + }, + { + "epoch": 5.788891959104725, + "grad_norm": 0.1187288910150528, + "learning_rate": 0.00012816615298402912, + "loss": 1.0372, + "step": 41900 + }, + { + "epoch": 5.802707930367505, + "grad_norm": 
0.05984746664762497, + "learning_rate": 0.00012774586718968897, + "loss": 1.036, + "step": 42000 + }, + { + "epoch": 5.816523901630284, + "grad_norm": 0.047202371060848236, + "learning_rate": 0.00012732558139534882, + "loss": 1.0341, + "step": 42100 + }, + { + "epoch": 5.830339872893065, + "grad_norm": 0.0888022631406784, + "learning_rate": 0.00012690949845895207, + "loss": 1.0358, + "step": 42200 + }, + { + "epoch": 5.8441558441558445, + "grad_norm": 0.071753591299057, + "learning_rate": 0.00012648921266461191, + "loss": 1.0356, + "step": 42300 + }, + { + "epoch": 5.857971815418624, + "grad_norm": 0.06311481446027756, + "learning_rate": 0.0001260689268702718, + "loss": 1.0381, + "step": 42400 + }, + { + "epoch": 5.871787786681404, + "grad_norm": 0.05733519420027733, + "learning_rate": 0.0001256486410759316, + "loss": 1.0366, + "step": 42500 + }, + { + "epoch": 5.885603757944184, + "grad_norm": 0.05296749621629715, + "learning_rate": 0.00012522835528159146, + "loss": 1.0391, + "step": 42600 + }, + { + "epoch": 5.899419729206963, + "grad_norm": 0.05728083476424217, + "learning_rate": 0.0001248080694872513, + "loss": 1.0393, + "step": 42700 + }, + { + "epoch": 5.913235700469743, + "grad_norm": 0.10918726772069931, + "learning_rate": 0.00012438778369291118, + "loss": 1.0375, + "step": 42800 + }, + { + "epoch": 5.927051671732523, + "grad_norm": 0.043641045689582825, + "learning_rate": 0.000123967497898571, + "loss": 1.0342, + "step": 42900 + }, + { + "epoch": 5.940867642995302, + "grad_norm": 0.07793564349412918, + "learning_rate": 0.00012354721210423085, + "loss": 1.037, + "step": 43000 + }, + { + "epoch": 5.954683614258082, + "grad_norm": 0.10596407949924469, + "learning_rate": 0.0001231269263098907, + "loss": 1.0361, + "step": 43100 + }, + { + "epoch": 5.9684995855208625, + "grad_norm": 0.05018968880176544, + "learning_rate": 0.00012270664051555058, + "loss": 1.0352, + "step": 43200 + }, + { + "epoch": 5.982315556783642, + "grad_norm": 0.06663347035646439, + "learning_rate": 0.0001222863547212104, + "loss": 1.0379, + "step": 43300 + }, + { + "epoch": 5.996131528046422, + "grad_norm": 0.05061174929141998, + "learning_rate": 0.00012186606892687026, + "loss": 1.0378, + "step": 43400 + }, + { + "epoch": 6.0099474993092015, + "grad_norm": 0.07496211677789688, + "learning_rate": 0.00012144578313253011, + "loss": 1.0357, + "step": 43500 + }, + { + "epoch": 6.023763470571981, + "grad_norm": 0.058973684906959534, + "learning_rate": 0.00012102549733818996, + "loss": 1.0336, + "step": 43600 + }, + { + "epoch": 6.037579441834761, + "grad_norm": 0.07304850965738297, + "learning_rate": 0.0001206052115438498, + "loss": 1.0366, + "step": 43700 + }, + { + "epoch": 6.0513954130975405, + "grad_norm": 0.05964922904968262, + "learning_rate": 0.00012018492574950966, + "loss": 1.0358, + "step": 43800 + }, + { + "epoch": 6.06521138436032, + "grad_norm": 0.10107408463954926, + "learning_rate": 0.0001197646399551695, + "loss": 1.0363, + "step": 43900 + }, + { + "epoch": 6.079027355623101, + "grad_norm": 0.05830320343375206, + "learning_rate": 0.00011934435416082935, + "loss": 1.0374, + "step": 44000 + }, + { + "epoch": 6.09284332688588, + "grad_norm": 0.06493101269006729, + "learning_rate": 0.00011892406836648919, + "loss": 1.0358, + "step": 44100 + }, + { + "epoch": 6.10665929814866, + "grad_norm": 0.06381756067276001, + "learning_rate": 0.00011850798543009245, + "loss": 1.0345, + "step": 44200 + }, + { + "epoch": 6.12047526941144, + "grad_norm": 0.057328786700963974, + "learning_rate": 0.0001180876996357523, + 
"loss": 1.0347, + "step": 44300 + }, + { + "epoch": 6.134291240674219, + "grad_norm": 0.09036822617053986, + "learning_rate": 0.00011766741384141216, + "loss": 1.0352, + "step": 44400 + }, + { + "epoch": 6.148107211936999, + "grad_norm": 0.05485937371850014, + "learning_rate": 0.000117247128047072, + "loss": 1.0371, + "step": 44500 + }, + { + "epoch": 6.161923183199779, + "grad_norm": 0.06304465979337692, + "learning_rate": 0.00011682684225273184, + "loss": 1.0302, + "step": 44600 + }, + { + "epoch": 6.175739154462558, + "grad_norm": 0.045126065611839294, + "learning_rate": 0.0001164065564583917, + "loss": 1.0338, + "step": 44700 + }, + { + "epoch": 6.189555125725338, + "grad_norm": 0.06636038422584534, + "learning_rate": 0.00011598627066405155, + "loss": 1.0353, + "step": 44800 + }, + { + "epoch": 6.203371096988119, + "grad_norm": 0.05977385491132736, + "learning_rate": 0.00011556598486971139, + "loss": 1.0346, + "step": 44900 + }, + { + "epoch": 6.217187068250898, + "grad_norm": 0.07459376752376556, + "learning_rate": 0.00011514569907537124, + "loss": 1.0325, + "step": 45000 + }, + { + "epoch": 6.217187068250898, + "eval_accuracy": 0.5284276106869993, + "eval_loss": 1.0360603332519531, + "eval_runtime": 770.702, + "eval_samples_per_second": 267.108, + "eval_steps_per_second": 8.348, + "step": 45000 + }, + { + "epoch": 6.231003039513678, + "grad_norm": 0.050757069140672684, + "learning_rate": 0.0001147254132810311, + "loss": 1.0337, + "step": 45100 + }, + { + "epoch": 6.244819010776458, + "grad_norm": 0.065644271671772, + "learning_rate": 0.00011430512748669095, + "loss": 1.035, + "step": 45200 + }, + { + "epoch": 6.258634982039237, + "grad_norm": 0.06008651480078697, + "learning_rate": 0.00011388484169235078, + "loss": 1.0323, + "step": 45300 + }, + { + "epoch": 6.272450953302017, + "grad_norm": 0.050868868827819824, + "learning_rate": 0.00011346455589801063, + "loss": 1.0341, + "step": 45400 + }, + { + "epoch": 6.286266924564797, + "grad_norm": 0.0535401850938797, + "learning_rate": 0.00011304427010367049, + "loss": 1.0349, + "step": 45500 + }, + { + "epoch": 6.300082895827576, + "grad_norm": 0.07083383947610855, + "learning_rate": 0.00011262398430933034, + "loss": 1.0327, + "step": 45600 + }, + { + "epoch": 6.313898867090357, + "grad_norm": 0.06998474150896072, + "learning_rate": 0.00011220369851499018, + "loss": 1.035, + "step": 45700 + }, + { + "epoch": 6.3277148383531365, + "grad_norm": 0.06696050614118576, + "learning_rate": 0.00011178341272065002, + "loss": 1.0342, + "step": 45800 + }, + { + "epoch": 6.341530809615916, + "grad_norm": 0.050143785774707794, + "learning_rate": 0.00011136312692630989, + "loss": 1.0342, + "step": 45900 + }, + { + "epoch": 6.355346780878696, + "grad_norm": 0.066258005797863, + "learning_rate": 0.00011094284113196974, + "loss": 1.0368, + "step": 46000 + }, + { + "epoch": 6.3691627521414755, + "grad_norm": 0.057613175362348557, + "learning_rate": 0.00011052255533762957, + "loss": 1.0357, + "step": 46100 + }, + { + "epoch": 6.382978723404255, + "grad_norm": 0.07405593246221542, + "learning_rate": 0.00011010647240123283, + "loss": 1.033, + "step": 46200 + }, + { + "epoch": 6.396794694667035, + "grad_norm": 0.07005150616168976, + "learning_rate": 0.00010968618660689268, + "loss": 1.0329, + "step": 46300 + }, + { + "epoch": 6.4106106659298145, + "grad_norm": 0.057546067982912064, + "learning_rate": 0.00010926590081255253, + "loss": 1.033, + "step": 46400 + }, + { + "epoch": 6.424426637192594, + "grad_norm": 0.08016248792409897, + "learning_rate": 
0.00010884561501821236, + "loss": 1.0389, + "step": 46500 + }, + { + "epoch": 6.438242608455375, + "grad_norm": 0.08346617966890335, + "learning_rate": 0.00010842532922387222, + "loss": 1.0332, + "step": 46600 + }, + { + "epoch": 6.452058579718154, + "grad_norm": 0.048157453536987305, + "learning_rate": 0.00010800504342953207, + "loss": 1.0342, + "step": 46700 + }, + { + "epoch": 6.465874550980934, + "grad_norm": 0.06816009432077408, + "learning_rate": 0.00010758475763519192, + "loss": 1.0357, + "step": 46800 + }, + { + "epoch": 6.479690522243714, + "grad_norm": 0.05210613086819649, + "learning_rate": 0.00010716447184085176, + "loss": 1.0345, + "step": 46900 + }, + { + "epoch": 6.4935064935064934, + "grad_norm": 0.08138227462768555, + "learning_rate": 0.00010674418604651162, + "loss": 1.035, + "step": 47000 + }, + { + "epoch": 6.507322464769273, + "grad_norm": 0.07494477927684784, + "learning_rate": 0.00010632390025217147, + "loss": 1.0361, + "step": 47100 + }, + { + "epoch": 6.521138436032053, + "grad_norm": 0.07473413646221161, + "learning_rate": 0.00010590361445783132, + "loss": 1.0339, + "step": 47200 + }, + { + "epoch": 6.5349544072948325, + "grad_norm": 0.07200802862644196, + "learning_rate": 0.00010548332866349115, + "loss": 1.0333, + "step": 47300 + }, + { + "epoch": 6.548770378557613, + "grad_norm": 0.06346756964921951, + "learning_rate": 0.00010506304286915101, + "loss": 1.0345, + "step": 47400 + }, + { + "epoch": 6.562586349820393, + "grad_norm": 0.06382066756486893, + "learning_rate": 0.00010464275707481086, + "loss": 1.0352, + "step": 47500 + }, + { + "epoch": 6.576402321083172, + "grad_norm": 0.1000475063920021, + "learning_rate": 0.00010422247128047071, + "loss": 1.0344, + "step": 47600 + }, + { + "epoch": 6.590218292345952, + "grad_norm": 0.06456384807825089, + "learning_rate": 0.00010380218548613057, + "loss": 1.0356, + "step": 47700 + }, + { + "epoch": 6.604034263608732, + "grad_norm": 0.052929963916540146, + "learning_rate": 0.0001033818996917904, + "loss": 1.0343, + "step": 47800 + }, + { + "epoch": 6.617850234871511, + "grad_norm": 0.07275223731994629, + "learning_rate": 0.00010296161389745025, + "loss": 1.033, + "step": 47900 + }, + { + "epoch": 6.631666206134291, + "grad_norm": 0.060610584914684296, + "learning_rate": 0.0001025413281031101, + "loss": 1.0334, + "step": 48000 + }, + { + "epoch": 6.645482177397071, + "grad_norm": 0.0514766089618206, + "learning_rate": 0.00010212104230876997, + "loss": 1.0351, + "step": 48100 + }, + { + "epoch": 6.65929814865985, + "grad_norm": 0.08950326591730118, + "learning_rate": 0.0001017049593723732, + "loss": 1.0341, + "step": 48200 + }, + { + "epoch": 6.673114119922631, + "grad_norm": 0.052268847823143005, + "learning_rate": 0.00010128467357803306, + "loss": 1.0342, + "step": 48300 + }, + { + "epoch": 6.686930091185411, + "grad_norm": 0.059182267636060715, + "learning_rate": 0.00010086438778369291, + "loss": 1.0303, + "step": 48400 + }, + { + "epoch": 6.70074606244819, + "grad_norm": 0.06220945715904236, + "learning_rate": 0.00010044410198935274, + "loss": 1.032, + "step": 48500 + }, + { + "epoch": 6.71456203371097, + "grad_norm": 0.0486241914331913, + "learning_rate": 0.00010002381619501259, + "loss": 1.0338, + "step": 48600 + }, + { + "epoch": 6.72837800497375, + "grad_norm": 0.04813262075185776, + "learning_rate": 9.960353040067245e-05, + "loss": 1.0344, + "step": 48700 + }, + { + "epoch": 6.742193976236529, + "grad_norm": 0.04981222748756409, + "learning_rate": 9.91832446063323e-05, + "loss": 1.0347, + "step": 48800 + }, + { 
+ "epoch": 6.756009947499309, + "grad_norm": 0.050560541450977325, + "learning_rate": 9.876295881199214e-05, + "loss": 1.0338, + "step": 48900 + }, + { + "epoch": 6.769825918762089, + "grad_norm": 0.05338674411177635, + "learning_rate": 9.834267301765199e-05, + "loss": 1.0369, + "step": 49000 + }, + { + "epoch": 6.783641890024869, + "grad_norm": 0.042156435549259186, + "learning_rate": 9.792238722331185e-05, + "loss": 1.0345, + "step": 49100 + }, + { + "epoch": 6.797457861287649, + "grad_norm": 0.0622396394610405, + "learning_rate": 9.75021014289717e-05, + "loss": 1.0321, + "step": 49200 + }, + { + "epoch": 6.8112738325504285, + "grad_norm": 0.08523661643266678, + "learning_rate": 9.708181563463155e-05, + "loss": 1.0317, + "step": 49300 + }, + { + "epoch": 6.825089803813208, + "grad_norm": 0.055176641792058945, + "learning_rate": 9.666152984029138e-05, + "loss": 1.0368, + "step": 49400 + }, + { + "epoch": 6.838905775075988, + "grad_norm": 0.07358380407094955, + "learning_rate": 9.624124404595124e-05, + "loss": 1.0318, + "step": 49500 + }, + { + "epoch": 6.8527217463387675, + "grad_norm": 0.055568769574165344, + "learning_rate": 9.582095825161109e-05, + "loss": 1.0343, + "step": 49600 + }, + { + "epoch": 6.866537717601547, + "grad_norm": 0.04249552637338638, + "learning_rate": 9.540067245727094e-05, + "loss": 1.0331, + "step": 49700 + }, + { + "epoch": 6.880353688864327, + "grad_norm": 0.05274058133363724, + "learning_rate": 9.498038666293077e-05, + "loss": 1.0351, + "step": 49800 + }, + { + "epoch": 6.8941696601271065, + "grad_norm": 0.04792112484574318, + "learning_rate": 9.456010086859064e-05, + "loss": 1.0333, + "step": 49900 + }, + { + "epoch": 6.907985631389887, + "grad_norm": 0.05513302981853485, + "learning_rate": 9.413981507425049e-05, + "loss": 1.0322, + "step": 50000 + }, + { + "epoch": 6.907985631389887, + "eval_accuracy": 0.5296076152096916, + "eval_loss": 1.0341060161590576, + "eval_runtime": 725.8939, + "eval_samples_per_second": 283.597, + "eval_steps_per_second": 8.864, + "step": 50000 + }, + { + "epoch": 6.921801602652667, + "grad_norm": 0.05296773836016655, + "learning_rate": 9.371952927991033e-05, + "loss": 1.031, + "step": 50100 + }, + { + "epoch": 6.935617573915446, + "grad_norm": 0.062248583883047104, + "learning_rate": 9.330344634351358e-05, + "loss": 1.0341, + "step": 50200 + }, + { + "epoch": 6.949433545178226, + "grad_norm": 0.07751675695180893, + "learning_rate": 9.288316054917343e-05, + "loss": 1.0352, + "step": 50300 + }, + { + "epoch": 6.963249516441006, + "grad_norm": 0.04984898492693901, + "learning_rate": 9.246287475483328e-05, + "loss": 1.0302, + "step": 50400 + }, + { + "epoch": 6.977065487703785, + "grad_norm": 0.04315504804253578, + "learning_rate": 9.204258896049314e-05, + "loss": 1.0327, + "step": 50500 + }, + { + "epoch": 6.990881458966565, + "grad_norm": 0.053620435297489166, + "learning_rate": 9.162230316615297e-05, + "loss": 1.0328, + "step": 50600 + }, + { + "epoch": 7.004697430229345, + "grad_norm": 0.04611975699663162, + "learning_rate": 9.120201737181282e-05, + "loss": 1.0336, + "step": 50700 + }, + { + "epoch": 7.018513401492125, + "grad_norm": 0.04269848018884659, + "learning_rate": 9.078173157747267e-05, + "loss": 1.0282, + "step": 50800 + }, + { + "epoch": 7.032329372754905, + "grad_norm": 0.055365532636642456, + "learning_rate": 9.036144578313253e-05, + "loss": 1.0339, + "step": 50900 + }, + { + "epoch": 7.046145344017685, + "grad_norm": 0.06129321828484535, + "learning_rate": 8.994115998879237e-05, + "loss": 1.0304, + "step": 51000 + }, 
+ { + "epoch": 7.059961315280464, + "grad_norm": 0.06094348803162575, + "learning_rate": 8.952507705239563e-05, + "loss": 1.0288, + "step": 51100 + }, + { + "epoch": 7.073777286543244, + "grad_norm": 0.048849135637283325, + "learning_rate": 8.910479125805548e-05, + "loss": 1.0322, + "step": 51200 + }, + { + "epoch": 7.087593257806024, + "grad_norm": 0.05081125721335411, + "learning_rate": 8.868450546371531e-05, + "loss": 1.0303, + "step": 51300 + }, + { + "epoch": 7.101409229068803, + "grad_norm": 0.07727497071027756, + "learning_rate": 8.826421966937516e-05, + "loss": 1.03, + "step": 51400 + }, + { + "epoch": 7.115225200331583, + "grad_norm": 0.06357153505086899, + "learning_rate": 8.784393387503502e-05, + "loss": 1.0342, + "step": 51500 + }, + { + "epoch": 7.129041171594363, + "grad_norm": 0.05598052963614464, + "learning_rate": 8.742364808069487e-05, + "loss": 1.0312, + "step": 51600 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.06753697246313095, + "learning_rate": 8.70033622863547e-05, + "loss": 1.0306, + "step": 51700 + }, + { + "epoch": 7.156673114119923, + "grad_norm": 0.06586912274360657, + "learning_rate": 8.658307649201455e-05, + "loss": 1.0311, + "step": 51800 + }, + { + "epoch": 7.170489085382703, + "grad_norm": 0.10361455380916595, + "learning_rate": 8.616279069767442e-05, + "loss": 1.0326, + "step": 51900 + }, + { + "epoch": 7.184305056645482, + "grad_norm": 0.09442713856697083, + "learning_rate": 8.574250490333426e-05, + "loss": 1.0339, + "step": 52000 + }, + { + "epoch": 7.198121027908262, + "grad_norm": 0.08114325255155563, + "learning_rate": 8.532221910899411e-05, + "loss": 1.0335, + "step": 52100 + }, + { + "epoch": 7.211936999171042, + "grad_norm": 0.054252710193395615, + "learning_rate": 8.490193331465395e-05, + "loss": 1.0316, + "step": 52200 + }, + { + "epoch": 7.225752970433821, + "grad_norm": 0.059643086045980453, + "learning_rate": 8.448164752031381e-05, + "loss": 1.027, + "step": 52300 + }, + { + "epoch": 7.239568941696601, + "grad_norm": 0.045472096651792526, + "learning_rate": 8.406136172597366e-05, + "loss": 1.0311, + "step": 52400 + }, + { + "epoch": 7.2533849129593815, + "grad_norm": 0.0669686570763588, + "learning_rate": 8.36410759316335e-05, + "loss": 1.0309, + "step": 52500 + }, + { + "epoch": 7.267200884222161, + "grad_norm": 0.0454520583152771, + "learning_rate": 8.322079013729334e-05, + "loss": 1.0327, + "step": 52600 + }, + { + "epoch": 7.281016855484941, + "grad_norm": 0.05776028707623482, + "learning_rate": 8.28005043429532e-05, + "loss": 1.0318, + "step": 52700 + }, + { + "epoch": 7.2948328267477205, + "grad_norm": 0.051905229687690735, + "learning_rate": 8.238021854861305e-05, + "loss": 1.0313, + "step": 52800 + }, + { + "epoch": 7.3086487980105, + "grad_norm": 0.056912437081336975, + "learning_rate": 8.19599327542729e-05, + "loss": 1.0325, + "step": 52900 + }, + { + "epoch": 7.32246476927328, + "grad_norm": 0.04940250515937805, + "learning_rate": 8.153964695993274e-05, + "loss": 1.0323, + "step": 53000 + }, + { + "epoch": 7.3362807405360595, + "grad_norm": 0.04186444729566574, + "learning_rate": 8.11193611655926e-05, + "loss": 1.0285, + "step": 53100 + }, + { + "epoch": 7.350096711798839, + "grad_norm": 0.041809357702732086, + "learning_rate": 8.069907537125245e-05, + "loss": 1.0289, + "step": 53200 + }, + { + "epoch": 7.363912683061619, + "grad_norm": 0.05794375389814377, + "learning_rate": 8.02787895769123e-05, + "loss": 1.031, + "step": 53300 + }, + { + "epoch": 7.377728654324399, + "grad_norm": 0.08333911001682281, + 
"learning_rate": 7.985850378257213e-05, + "loss": 1.0316, + "step": 53400 + }, + { + "epoch": 7.391544625587179, + "grad_norm": 0.06473658233880997, + "learning_rate": 7.943821798823199e-05, + "loss": 1.0317, + "step": 53500 + }, + { + "epoch": 7.405360596849959, + "grad_norm": 0.05173886939883232, + "learning_rate": 7.901793219389184e-05, + "loss": 1.0308, + "step": 53600 + }, + { + "epoch": 7.419176568112738, + "grad_norm": 0.06362345069646835, + "learning_rate": 7.859764639955169e-05, + "loss": 1.0324, + "step": 53700 + }, + { + "epoch": 7.432992539375518, + "grad_norm": 0.054053716361522675, + "learning_rate": 7.817736060521152e-05, + "loss": 1.0303, + "step": 53800 + }, + { + "epoch": 7.446808510638298, + "grad_norm": 0.048420459032058716, + "learning_rate": 7.775707481087139e-05, + "loss": 1.0299, + "step": 53900 + }, + { + "epoch": 7.460624481901077, + "grad_norm": 0.0606950968503952, + "learning_rate": 7.733678901653123e-05, + "loss": 1.0317, + "step": 54000 + }, + { + "epoch": 7.474440453163857, + "grad_norm": 0.06072583049535751, + "learning_rate": 7.691650322219108e-05, + "loss": 1.033, + "step": 54100 + }, + { + "epoch": 7.488256424426638, + "grad_norm": 0.05064817890524864, + "learning_rate": 7.649621742785093e-05, + "loss": 1.0287, + "step": 54200 + }, + { + "epoch": 7.502072395689417, + "grad_norm": 0.09318757057189941, + "learning_rate": 7.607593163351078e-05, + "loss": 1.0296, + "step": 54300 + }, + { + "epoch": 7.515888366952197, + "grad_norm": 0.0935215950012207, + "learning_rate": 7.565564583917063e-05, + "loss": 1.0322, + "step": 54400 + }, + { + "epoch": 7.529704338214977, + "grad_norm": 0.07255256175994873, + "learning_rate": 7.523536004483048e-05, + "loss": 1.0333, + "step": 54500 + }, + { + "epoch": 7.543520309477756, + "grad_norm": 0.05486008897423744, + "learning_rate": 7.481507425049033e-05, + "loss": 1.032, + "step": 54600 + }, + { + "epoch": 7.557336280740536, + "grad_norm": 0.0525212287902832, + "learning_rate": 7.439478845615017e-05, + "loss": 1.0293, + "step": 54700 + }, + { + "epoch": 7.571152252003316, + "grad_norm": 0.047569695860147476, + "learning_rate": 7.397450266181002e-05, + "loss": 1.0282, + "step": 54800 + }, + { + "epoch": 7.584968223266095, + "grad_norm": 0.06165711581707001, + "learning_rate": 7.355421686746987e-05, + "loss": 1.0312, + "step": 54900 + }, + { + "epoch": 7.598784194528875, + "grad_norm": 0.0578945092856884, + "learning_rate": 7.313393107312972e-05, + "loss": 1.0307, + "step": 55000 + }, + { + "epoch": 7.598784194528875, + "eval_accuracy": 0.5305025000901846, + "eval_loss": 1.0327985286712646, + "eval_runtime": 731.5754, + "eval_samples_per_second": 281.394, + "eval_steps_per_second": 8.795, + "step": 55000 + }, + { + "epoch": 7.612600165791655, + "grad_norm": 0.0795338973402977, + "learning_rate": 7.271784813673297e-05, + "loss": 1.0294, + "step": 55100 + }, + { + "epoch": 7.626416137054435, + "grad_norm": 0.06103779003024101, + "learning_rate": 7.229756234239283e-05, + "loss": 1.033, + "step": 55200 + }, + { + "epoch": 7.640232108317215, + "grad_norm": 0.0635315552353859, + "learning_rate": 7.187727654805266e-05, + "loss": 1.0296, + "step": 55300 + }, + { + "epoch": 7.654048079579995, + "grad_norm": 0.05289231240749359, + "learning_rate": 7.145699075371253e-05, + "loss": 1.034, + "step": 55400 + }, + { + "epoch": 7.667864050842774, + "grad_norm": 0.07801427692174911, + "learning_rate": 7.103670495937236e-05, + "loss": 1.0332, + "step": 55500 + }, + { + "epoch": 7.681680022105554, + "grad_norm": 0.07564268261194229, + 
"learning_rate": 7.061641916503222e-05, + "loss": 1.0299, + "step": 55600 + }, + { + "epoch": 7.695495993368334, + "grad_norm": 0.04168133810162544, + "learning_rate": 7.019613337069206e-05, + "loss": 1.03, + "step": 55700 + }, + { + "epoch": 7.709311964631113, + "grad_norm": 0.11210035532712936, + "learning_rate": 6.977584757635192e-05, + "loss": 1.0301, + "step": 55800 + }, + { + "epoch": 7.723127935893894, + "grad_norm": 0.09023060649633408, + "learning_rate": 6.935556178201175e-05, + "loss": 1.0285, + "step": 55900 + }, + { + "epoch": 7.7369439071566735, + "grad_norm": 0.05271260067820549, + "learning_rate": 6.893527598767162e-05, + "loss": 1.0315, + "step": 56000 + }, + { + "epoch": 7.750759878419453, + "grad_norm": 0.06293012201786041, + "learning_rate": 6.851499019333145e-05, + "loss": 1.0286, + "step": 56100 + }, + { + "epoch": 7.764575849682233, + "grad_norm": 0.04555558040738106, + "learning_rate": 6.809470439899131e-05, + "loss": 1.0308, + "step": 56200 + }, + { + "epoch": 7.7783918209450125, + "grad_norm": 0.042364273220300674, + "learning_rate": 6.767441860465115e-05, + "loss": 1.0311, + "step": 56300 + }, + { + "epoch": 7.792207792207792, + "grad_norm": 0.05084213241934776, + "learning_rate": 6.725413281031101e-05, + "loss": 1.0298, + "step": 56400 + }, + { + "epoch": 7.806023763470572, + "grad_norm": 0.059168051928281784, + "learning_rate": 6.683384701597085e-05, + "loss": 1.0303, + "step": 56500 + }, + { + "epoch": 7.8198397347333515, + "grad_norm": 0.05535740405321121, + "learning_rate": 6.641356122163071e-05, + "loss": 1.0306, + "step": 56600 + }, + { + "epoch": 7.833655705996131, + "grad_norm": 0.06625715643167496, + "learning_rate": 6.599327542729054e-05, + "loss": 1.0283, + "step": 56700 + }, + { + "epoch": 7.847471677258911, + "grad_norm": 0.04644458368420601, + "learning_rate": 6.55729896329504e-05, + "loss": 1.0289, + "step": 56800 + }, + { + "epoch": 7.861287648521691, + "grad_norm": 0.05319574847817421, + "learning_rate": 6.515270383861024e-05, + "loss": 1.0303, + "step": 56900 + }, + { + "epoch": 7.875103619784471, + "grad_norm": 0.06394356489181519, + "learning_rate": 6.47324180442701e-05, + "loss": 1.0315, + "step": 57000 + }, + { + "epoch": 7.888919591047251, + "grad_norm": 0.0535539835691452, + "learning_rate": 6.431633510787335e-05, + "loss": 1.0323, + "step": 57100 + }, + { + "epoch": 7.90273556231003, + "grad_norm": 0.05220150947570801, + "learning_rate": 6.38960493135332e-05, + "loss": 1.032, + "step": 57200 + }, + { + "epoch": 7.91655153357281, + "grad_norm": 0.04795517399907112, + "learning_rate": 6.347576351919304e-05, + "loss": 1.03, + "step": 57300 + }, + { + "epoch": 7.93036750483559, + "grad_norm": 0.0748489499092102, + "learning_rate": 6.30554777248529e-05, + "loss": 1.0338, + "step": 57400 + }, + { + "epoch": 7.944183476098369, + "grad_norm": 0.08164035528898239, + "learning_rate": 6.263519193051274e-05, + "loss": 1.0318, + "step": 57500 + }, + { + "epoch": 7.95799944736115, + "grad_norm": 0.0764247477054596, + "learning_rate": 6.221490613617259e-05, + "loss": 1.0278, + "step": 57600 + }, + { + "epoch": 7.97181541862393, + "grad_norm": 0.05609816685318947, + "learning_rate": 6.179462034183244e-05, + "loss": 1.0307, + "step": 57700 + }, + { + "epoch": 7.985631389886709, + "grad_norm": 0.05001819133758545, + "learning_rate": 6.137433454749229e-05, + "loss": 1.0297, + "step": 57800 + }, + { + "epoch": 7.999447361149489, + "grad_norm": 0.10084258019924164, + "learning_rate": 6.0954048753152136e-05, + "loss": 1.0339, + "step": 57900 + }, + { + 
"epoch": 8.013263332412269, + "grad_norm": 0.07571733742952347, + "learning_rate": 6.0533762958811985e-05, + "loss": 1.0305, + "step": 58000 + }, + { + "epoch": 8.027079303675048, + "grad_norm": 0.059294216334819794, + "learning_rate": 6.011347716447183e-05, + "loss": 1.026, + "step": 58100 + }, + { + "epoch": 8.040895274937828, + "grad_norm": 0.04530787095427513, + "learning_rate": 5.969319137013168e-05, + "loss": 1.0282, + "step": 58200 + }, + { + "epoch": 8.054711246200608, + "grad_norm": 0.05052864924073219, + "learning_rate": 5.927290557579153e-05, + "loss": 1.0271, + "step": 58300 + }, + { + "epoch": 8.068527217463387, + "grad_norm": 0.04923342168331146, + "learning_rate": 5.885261978145138e-05, + "loss": 1.029, + "step": 58400 + }, + { + "epoch": 8.082343188726167, + "grad_norm": 0.04905908182263374, + "learning_rate": 5.843233398711123e-05, + "loss": 1.0277, + "step": 58500 + }, + { + "epoch": 8.096159159988947, + "grad_norm": 0.046151451766490936, + "learning_rate": 5.801204819277108e-05, + "loss": 1.0289, + "step": 58600 + }, + { + "epoch": 8.109975131251726, + "grad_norm": 0.06011873856186867, + "learning_rate": 5.7591762398430925e-05, + "loss": 1.0245, + "step": 58700 + }, + { + "epoch": 8.123791102514506, + "grad_norm": 0.06879663467407227, + "learning_rate": 5.717147660409078e-05, + "loss": 1.0271, + "step": 58800 + }, + { + "epoch": 8.137607073777286, + "grad_norm": 0.04675479233264923, + "learning_rate": 5.675119080975063e-05, + "loss": 1.0263, + "step": 58900 + }, + { + "epoch": 8.151423045040067, + "grad_norm": 0.08497285097837448, + "learning_rate": 5.633090501541048e-05, + "loss": 1.0287, + "step": 59000 + }, + { + "epoch": 8.165239016302847, + "grad_norm": 0.07600156217813492, + "learning_rate": 5.5910619221070326e-05, + "loss": 1.0262, + "step": 59100 + }, + { + "epoch": 8.179054987565626, + "grad_norm": 0.04951677843928337, + "learning_rate": 5.549453628467357e-05, + "loss": 1.0283, + "step": 59200 + }, + { + "epoch": 8.192870958828406, + "grad_norm": 0.05662324279546738, + "learning_rate": 5.507425049033342e-05, + "loss": 1.0295, + "step": 59300 + }, + { + "epoch": 8.206686930091186, + "grad_norm": 0.05791959911584854, + "learning_rate": 5.465396469599327e-05, + "loss": 1.0285, + "step": 59400 + }, + { + "epoch": 8.220502901353965, + "grad_norm": 0.058768805116415024, + "learning_rate": 5.423367890165312e-05, + "loss": 1.0272, + "step": 59500 + }, + { + "epoch": 8.234318872616745, + "grad_norm": 0.05399869754910469, + "learning_rate": 5.381339310731297e-05, + "loss": 1.0301, + "step": 59600 + }, + { + "epoch": 8.248134843879525, + "grad_norm": 0.06434085965156555, + "learning_rate": 5.3393107312972814e-05, + "loss": 1.0277, + "step": 59700 + }, + { + "epoch": 8.261950815142304, + "grad_norm": 0.054656483232975006, + "learning_rate": 5.297282151863267e-05, + "loss": 1.0295, + "step": 59800 + }, + { + "epoch": 8.275766786405084, + "grad_norm": 0.04396641626954079, + "learning_rate": 5.255253572429251e-05, + "loss": 1.0276, + "step": 59900 + }, + { + "epoch": 8.289582757667864, + "grad_norm": 0.058395449072122574, + "learning_rate": 5.2132249929952366e-05, + "loss": 1.0267, + "step": 60000 + }, + { + "epoch": 8.289582757667864, + "eval_accuracy": 0.5312832658873073, + "eval_loss": 1.0315501689910889, + "eval_runtime": 729.415, + "eval_samples_per_second": 282.228, + "eval_steps_per_second": 8.821, + "step": 60000 + }, + { + "epoch": 8.303398728930643, + "grad_norm": 0.06770013272762299, + "learning_rate": 5.171196413561221e-05, + "loss": 1.029, + "step": 60100 + }, + { 
+ "epoch": 8.317214700193423, + "grad_norm": 0.06161688268184662, + "learning_rate": 5.1291678341272063e-05, + "loss": 1.0242, + "step": 60200 + }, + { + "epoch": 8.331030671456203, + "grad_norm": 0.04140911623835564, + "learning_rate": 5.087139254693191e-05, + "loss": 1.029, + "step": 60300 + }, + { + "epoch": 8.344846642718982, + "grad_norm": 0.07091998308897018, + "learning_rate": 5.045110675259176e-05, + "loss": 1.0268, + "step": 60400 + }, + { + "epoch": 8.358662613981762, + "grad_norm": 0.05135732889175415, + "learning_rate": 5.003082095825161e-05, + "loss": 1.0264, + "step": 60500 + }, + { + "epoch": 8.372478585244544, + "grad_norm": 0.05828474089503288, + "learning_rate": 4.961053516391146e-05, + "loss": 1.0271, + "step": 60600 + }, + { + "epoch": 8.386294556507323, + "grad_norm": 0.05920015275478363, + "learning_rate": 4.9190249369571306e-05, + "loss": 1.0263, + "step": 60700 + }, + { + "epoch": 8.400110527770103, + "grad_norm": 0.048502273857593536, + "learning_rate": 4.8769963575231155e-05, + "loss": 1.029, + "step": 60800 + }, + { + "epoch": 8.413926499032883, + "grad_norm": 0.049063604325056076, + "learning_rate": 4.8349677780891e-05, + "loss": 1.0294, + "step": 60900 + }, + { + "epoch": 8.427742470295662, + "grad_norm": 0.05672093480825424, + "learning_rate": 4.792939198655085e-05, + "loss": 1.0297, + "step": 61000 + }, + { + "epoch": 8.441558441558442, + "grad_norm": 0.06934633105993271, + "learning_rate": 4.75091061922107e-05, + "loss": 1.0261, + "step": 61100 + }, + { + "epoch": 8.455374412821222, + "grad_norm": 0.04098910838365555, + "learning_rate": 4.709302325581395e-05, + "loss": 1.0292, + "step": 61200 + }, + { + "epoch": 8.469190384084001, + "grad_norm": 0.06421385705471039, + "learning_rate": 4.6672737461473794e-05, + "loss": 1.0315, + "step": 61300 + }, + { + "epoch": 8.483006355346781, + "grad_norm": 0.05238828435540199, + "learning_rate": 4.625245166713365e-05, + "loss": 1.0309, + "step": 61400 + }, + { + "epoch": 8.49682232660956, + "grad_norm": 0.049910806119441986, + "learning_rate": 4.583216587279349e-05, + "loss": 1.0257, + "step": 61500 + }, + { + "epoch": 8.51063829787234, + "grad_norm": 0.06672196090221405, + "learning_rate": 4.541188007845335e-05, + "loss": 1.0328, + "step": 61600 + }, + { + "epoch": 8.52445426913512, + "grad_norm": 0.05466538295149803, + "learning_rate": 4.4991594284113195e-05, + "loss": 1.0284, + "step": 61700 + }, + { + "epoch": 8.5382702403979, + "grad_norm": 0.05218784883618355, + "learning_rate": 4.4571308489773044e-05, + "loss": 1.0285, + "step": 61800 + }, + { + "epoch": 8.55208621166068, + "grad_norm": 0.04263923689723015, + "learning_rate": 4.415102269543289e-05, + "loss": 1.0307, + "step": 61900 + }, + { + "epoch": 8.565902182923459, + "grad_norm": 0.054478637874126434, + "learning_rate": 4.373073690109274e-05, + "loss": 1.0291, + "step": 62000 + }, + { + "epoch": 8.579718154186239, + "grad_norm": 0.05667020007967949, + "learning_rate": 4.331045110675259e-05, + "loss": 1.0296, + "step": 62100 + }, + { + "epoch": 8.593534125449018, + "grad_norm": 0.0490160770714283, + "learning_rate": 4.289016531241244e-05, + "loss": 1.029, + "step": 62200 + }, + { + "epoch": 8.607350096711798, + "grad_norm": 0.049655403941869736, + "learning_rate": 4.246987951807229e-05, + "loss": 1.0298, + "step": 62300 + }, + { + "epoch": 8.62116606797458, + "grad_norm": 0.047429408878088, + "learning_rate": 4.2049593723732135e-05, + "loss": 1.0277, + "step": 62400 + }, + { + "epoch": 8.634982039237359, + "grad_norm": 0.05222218483686447, + "learning_rate": 
4.1629307929391984e-05, + "loss": 1.0292, + "step": 62500 + }, + { + "epoch": 8.648798010500139, + "grad_norm": 0.05841238424181938, + "learning_rate": 4.120902213505183e-05, + "loss": 1.029, + "step": 62600 + }, + { + "epoch": 8.662613981762918, + "grad_norm": 0.0452195480465889, + "learning_rate": 4.078873634071168e-05, + "loss": 1.0265, + "step": 62700 + }, + { + "epoch": 8.676429953025698, + "grad_norm": 0.049306340515613556, + "learning_rate": 4.036845054637153e-05, + "loss": 1.0308, + "step": 62800 + }, + { + "epoch": 8.690245924288478, + "grad_norm": 0.050401389598846436, + "learning_rate": 3.994816475203138e-05, + "loss": 1.0294, + "step": 62900 + }, + { + "epoch": 8.704061895551257, + "grad_norm": 0.04503024369478226, + "learning_rate": 3.952787895769123e-05, + "loss": 1.0291, + "step": 63000 + }, + { + "epoch": 8.717877866814037, + "grad_norm": 0.0738733783364296, + "learning_rate": 3.9107593163351075e-05, + "loss": 1.0279, + "step": 63100 + }, + { + "epoch": 8.731693838076817, + "grad_norm": 0.04586975276470184, + "learning_rate": 3.869151022695433e-05, + "loss": 1.026, + "step": 63200 + }, + { + "epoch": 8.745509809339596, + "grad_norm": 0.04988343268632889, + "learning_rate": 3.8271224432614176e-05, + "loss": 1.0257, + "step": 63300 + }, + { + "epoch": 8.759325780602376, + "grad_norm": 0.07822008430957794, + "learning_rate": 3.7850938638274025e-05, + "loss": 1.0254, + "step": 63400 + }, + { + "epoch": 8.773141751865156, + "grad_norm": 0.058496229350566864, + "learning_rate": 3.743065284393387e-05, + "loss": 1.0263, + "step": 63500 + }, + { + "epoch": 8.786957723127935, + "grad_norm": 0.04458677023649216, + "learning_rate": 3.701036704959372e-05, + "loss": 1.0292, + "step": 63600 + }, + { + "epoch": 8.800773694390715, + "grad_norm": 0.06616061180830002, + "learning_rate": 3.659008125525357e-05, + "loss": 1.0309, + "step": 63700 + }, + { + "epoch": 8.814589665653495, + "grad_norm": 0.06473194807767868, + "learning_rate": 3.616979546091342e-05, + "loss": 1.0265, + "step": 63800 + }, + { + "epoch": 8.828405636916274, + "grad_norm": 0.047700874507427216, + "learning_rate": 3.574950966657327e-05, + "loss": 1.0303, + "step": 63900 + }, + { + "epoch": 8.842221608179056, + "grad_norm": 0.055733323097229004, + "learning_rate": 3.5329223872233116e-05, + "loss": 1.0279, + "step": 64000 + }, + { + "epoch": 8.856037579441836, + "grad_norm": 0.04398791491985321, + "learning_rate": 3.4908938077892965e-05, + "loss": 1.0284, + "step": 64100 + }, + { + "epoch": 8.869853550704615, + "grad_norm": 0.08901511132717133, + "learning_rate": 3.448865228355281e-05, + "loss": 1.0283, + "step": 64200 + }, + { + "epoch": 8.883669521967395, + "grad_norm": 0.05853118374943733, + "learning_rate": 3.406836648921266e-05, + "loss": 1.0291, + "step": 64300 + }, + { + "epoch": 8.897485493230175, + "grad_norm": 0.043922308832407, + "learning_rate": 3.364808069487251e-05, + "loss": 1.0294, + "step": 64400 + }, + { + "epoch": 8.911301464492954, + "grad_norm": 0.04332153871655464, + "learning_rate": 3.322779490053236e-05, + "loss": 1.0277, + "step": 64500 + }, + { + "epoch": 8.925117435755734, + "grad_norm": 0.09197825193405151, + "learning_rate": 3.280750910619221e-05, + "loss": 1.0295, + "step": 64600 + }, + { + "epoch": 8.938933407018514, + "grad_norm": 0.05589272826910019, + "learning_rate": 3.2387223311852056e-05, + "loss": 1.0274, + "step": 64700 + }, + { + "epoch": 8.952749378281293, + "grad_norm": 0.06028933823108673, + "learning_rate": 3.1966937517511904e-05, + "loss": 1.0285, + "step": 64800 + }, + { + 
"epoch": 8.966565349544073, + "grad_norm": 0.05357721447944641, + "learning_rate": 3.154665172317175e-05, + "loss": 1.027, + "step": 64900 + }, + { + "epoch": 8.980381320806853, + "grad_norm": 0.07362578809261322, + "learning_rate": 3.11263659288316e-05, + "loss": 1.0273, + "step": 65000 + }, + { + "epoch": 8.980381320806853, + "eval_accuracy": 0.5319501927585898, + "eval_loss": 1.0305662155151367, + "eval_runtime": 722.9505, + "eval_samples_per_second": 284.751, + "eval_steps_per_second": 8.9, + "step": 65000 + }, + { + "epoch": 8.994197292069632, + "grad_norm": 0.04831722378730774, + "learning_rate": 3.070608013449145e-05, + "loss": 1.0294, + "step": 65100 + }, + { + "epoch": 9.008013263332412, + "grad_norm": 0.06001870334148407, + "learning_rate": 3.0289997198094702e-05, + "loss": 1.0306, + "step": 65200 + }, + { + "epoch": 9.021829234595192, + "grad_norm": 0.04466562718153, + "learning_rate": 2.986971140375455e-05, + "loss": 1.0267, + "step": 65300 + }, + { + "epoch": 9.035645205857971, + "grad_norm": 0.059990085661411285, + "learning_rate": 2.94494256094144e-05, + "loss": 1.0248, + "step": 65400 + }, + { + "epoch": 9.049461177120751, + "grad_norm": 0.05244195833802223, + "learning_rate": 2.9029139815074248e-05, + "loss": 1.0282, + "step": 65500 + }, + { + "epoch": 9.06327714838353, + "grad_norm": 0.060148317366838455, + "learning_rate": 2.8608854020734097e-05, + "loss": 1.0266, + "step": 65600 + }, + { + "epoch": 9.07709311964631, + "grad_norm": 0.051530059427022934, + "learning_rate": 2.8188568226393945e-05, + "loss": 1.0257, + "step": 65700 + }, + { + "epoch": 9.090909090909092, + "grad_norm": 0.06650034338235855, + "learning_rate": 2.7768282432053794e-05, + "loss": 1.0276, + "step": 65800 + }, + { + "epoch": 9.104725062171871, + "grad_norm": 0.04850700497627258, + "learning_rate": 2.7347996637713642e-05, + "loss": 1.0249, + "step": 65900 + }, + { + "epoch": 9.118541033434651, + "grad_norm": 0.057128727436065674, + "learning_rate": 2.692771084337349e-05, + "loss": 1.0264, + "step": 66000 + }, + { + "epoch": 9.13235700469743, + "grad_norm": 0.056875213980674744, + "learning_rate": 2.650742504903334e-05, + "loss": 1.0285, + "step": 66100 + }, + { + "epoch": 9.14617297596021, + "grad_norm": 0.05632421374320984, + "learning_rate": 2.6087139254693188e-05, + "loss": 1.0286, + "step": 66200 + }, + { + "epoch": 9.15998894722299, + "grad_norm": 0.04903789609670639, + "learning_rate": 2.5666853460353037e-05, + "loss": 1.0233, + "step": 66300 + }, + { + "epoch": 9.17380491848577, + "grad_norm": 0.04932420328259468, + "learning_rate": 2.5246567666012885e-05, + "loss": 1.0273, + "step": 66400 + }, + { + "epoch": 9.18762088974855, + "grad_norm": 0.0668862909078598, + "learning_rate": 2.4826281871672734e-05, + "loss": 1.0264, + "step": 66500 + }, + { + "epoch": 9.201436861011329, + "grad_norm": 0.05283021926879883, + "learning_rate": 2.4405996077332586e-05, + "loss": 1.0278, + "step": 66600 + }, + { + "epoch": 9.215252832274109, + "grad_norm": 0.04914732649922371, + "learning_rate": 2.3985710282992434e-05, + "loss": 1.0276, + "step": 66700 + }, + { + "epoch": 9.229068803536888, + "grad_norm": 0.06511181592941284, + "learning_rate": 2.3565424488652283e-05, + "loss": 1.0268, + "step": 66800 + }, + { + "epoch": 9.242884774799668, + "grad_norm": 0.06101306900382042, + "learning_rate": 2.314513869431213e-05, + "loss": 1.0267, + "step": 66900 + }, + { + "epoch": 9.256700746062448, + "grad_norm": 0.05272289365530014, + "learning_rate": 2.272485289997198e-05, + "loss": 1.0242, + "step": 67000 + }, + { + 
"epoch": 9.270516717325227, + "grad_norm": 0.04828105494379997, + "learning_rate": 2.230456710563183e-05, + "loss": 1.0258, + "step": 67100 + }, + { + "epoch": 9.284332688588007, + "grad_norm": 0.054294098168611526, + "learning_rate": 2.1888484169235077e-05, + "loss": 1.0262, + "step": 67200 + }, + { + "epoch": 9.298148659850787, + "grad_norm": 0.04951765388250351, + "learning_rate": 2.1468198374894926e-05, + "loss": 1.0254, + "step": 67300 + }, + { + "epoch": 9.311964631113566, + "grad_norm": 0.047647446393966675, + "learning_rate": 2.1047912580554774e-05, + "loss": 1.0262, + "step": 67400 + }, + { + "epoch": 9.325780602376348, + "grad_norm": 0.062047079205513, + "learning_rate": 2.0627626786214623e-05, + "loss": 1.0287, + "step": 67500 + }, + { + "epoch": 9.339596573639128, + "grad_norm": 0.05751033127307892, + "learning_rate": 2.020734099187447e-05, + "loss": 1.027, + "step": 67600 + }, + { + "epoch": 9.353412544901907, + "grad_norm": 0.058642346411943436, + "learning_rate": 1.978705519753432e-05, + "loss": 1.0276, + "step": 67700 + }, + { + "epoch": 9.367228516164687, + "grad_norm": 0.050882838666439056, + "learning_rate": 1.936676940319417e-05, + "loss": 1.0223, + "step": 67800 + }, + { + "epoch": 9.381044487427467, + "grad_norm": 0.053814638406038284, + "learning_rate": 1.8946483608854017e-05, + "loss": 1.0271, + "step": 67900 + }, + { + "epoch": 9.394860458690246, + "grad_norm": 0.05407179519534111, + "learning_rate": 1.852619781451387e-05, + "loss": 1.0242, + "step": 68000 + }, + { + "epoch": 9.408676429953026, + "grad_norm": 0.05431421846151352, + "learning_rate": 1.8105912020173718e-05, + "loss": 1.0246, + "step": 68100 + }, + { + "epoch": 9.422492401215806, + "grad_norm": 0.05826635658740997, + "learning_rate": 1.7685626225833566e-05, + "loss": 1.024, + "step": 68200 + }, + { + "epoch": 9.436308372478585, + "grad_norm": 0.043603766709566116, + "learning_rate": 1.7265340431493415e-05, + "loss": 1.025, + "step": 68300 + }, + { + "epoch": 9.450124343741365, + "grad_norm": 0.0555894561111927, + "learning_rate": 1.6845054637153263e-05, + "loss": 1.0267, + "step": 68400 + }, + { + "epoch": 9.463940315004145, + "grad_norm": 0.046029891818761826, + "learning_rate": 1.6424768842813112e-05, + "loss": 1.0247, + "step": 68500 + }, + { + "epoch": 9.477756286266924, + "grad_norm": 0.04906938225030899, + "learning_rate": 1.600448304847296e-05, + "loss": 1.0233, + "step": 68600 + }, + { + "epoch": 9.491572257529704, + "grad_norm": 0.07827210426330566, + "learning_rate": 1.558419725413281e-05, + "loss": 1.0262, + "step": 68700 + }, + { + "epoch": 9.505388228792484, + "grad_norm": 0.04391390085220337, + "learning_rate": 1.5163911459792658e-05, + "loss": 1.0255, + "step": 68800 + }, + { + "epoch": 9.519204200055263, + "grad_norm": 0.05310402810573578, + "learning_rate": 1.4743625665452506e-05, + "loss": 1.0268, + "step": 68900 + }, + { + "epoch": 9.533020171318043, + "grad_norm": 0.060242168605327606, + "learning_rate": 1.4323339871112355e-05, + "loss": 1.0257, + "step": 69000 + }, + { + "epoch": 9.546836142580823, + "grad_norm": 0.04949665814638138, + "learning_rate": 1.3903054076772205e-05, + "loss": 1.0294, + "step": 69100 + }, + { + "epoch": 9.560652113843604, + "grad_norm": 0.05413687229156494, + "learning_rate": 1.3482768282432054e-05, + "loss": 1.0272, + "step": 69200 + }, + { + "epoch": 9.574468085106384, + "grad_norm": 0.05380227789282799, + "learning_rate": 1.3062482488091902e-05, + "loss": 1.025, + "step": 69300 + }, + { + "epoch": 9.588284056369163, + "grad_norm": 0.04961249604821205, 
+ "learning_rate": 1.2646399551695151e-05, + "loss": 1.0289, + "step": 69400 + }, + { + "epoch": 9.602100027631943, + "grad_norm": 0.045629873871803284, + "learning_rate": 1.2226113757355e-05, + "loss": 1.0269, + "step": 69500 + }, + { + "epoch": 9.615915998894723, + "grad_norm": 0.04661751165986061, + "learning_rate": 1.1805827963014848e-05, + "loss": 1.0277, + "step": 69600 + }, + { + "epoch": 9.629731970157502, + "grad_norm": 0.06289409101009369, + "learning_rate": 1.1385542168674697e-05, + "loss": 1.0246, + "step": 69700 + }, + { + "epoch": 9.643547941420282, + "grad_norm": 0.061526406556367874, + "learning_rate": 1.0965256374334547e-05, + "loss": 1.0252, + "step": 69800 + }, + { + "epoch": 9.657363912683062, + "grad_norm": 0.05611636862158775, + "learning_rate": 1.0544970579994395e-05, + "loss": 1.0281, + "step": 69900 + }, + { + "epoch": 9.671179883945841, + "grad_norm": 0.05305150896310806, + "learning_rate": 1.0124684785654244e-05, + "loss": 1.027, + "step": 70000 + }, + { + "epoch": 9.671179883945841, + "eval_accuracy": 0.5323623139821072, + "eval_loss": 1.029943823814392, + "eval_runtime": 726.2479, + "eval_samples_per_second": 283.458, + "eval_steps_per_second": 8.859, + "step": 70000 + }, + { + "epoch": 9.684995855208621, + "grad_norm": 0.06483161449432373, + "learning_rate": 9.704398991314093e-06, + "loss": 1.0262, + "step": 70100 + }, + { + "epoch": 9.6988118264714, + "grad_norm": 0.05063271522521973, + "learning_rate": 9.284113196973941e-06, + "loss": 1.0246, + "step": 70200 + }, + { + "epoch": 9.71262779773418, + "grad_norm": 0.04985768347978592, + "learning_rate": 8.86382740263379e-06, + "loss": 1.03, + "step": 70300 + }, + { + "epoch": 9.72644376899696, + "grad_norm": 0.04751725122332573, + "learning_rate": 8.443541608293638e-06, + "loss": 1.0272, + "step": 70400 + }, + { + "epoch": 9.74025974025974, + "grad_norm": 0.042586106806993484, + "learning_rate": 8.023255813953487e-06, + "loss": 1.0254, + "step": 70500 + }, + { + "epoch": 9.75407571152252, + "grad_norm": 0.059688206762075424, + "learning_rate": 7.602970019613336e-06, + "loss": 1.0255, + "step": 70600 + }, + { + "epoch": 9.767891682785299, + "grad_norm": 0.04823042452335358, + "learning_rate": 7.182684225273185e-06, + "loss": 1.028, + "step": 70700 + }, + { + "epoch": 9.78170765404808, + "grad_norm": 0.0480177104473114, + "learning_rate": 6.762398430933033e-06, + "loss": 1.025, + "step": 70800 + }, + { + "epoch": 9.79552362531086, + "grad_norm": 0.045797545462846756, + "learning_rate": 6.342112636592882e-06, + "loss": 1.023, + "step": 70900 + }, + { + "epoch": 9.80933959657364, + "grad_norm": 0.04858710244297981, + "learning_rate": 5.921826842252732e-06, + "loss": 1.0216, + "step": 71000 + }, + { + "epoch": 9.82315556783642, + "grad_norm": 0.05248698219656944, + "learning_rate": 5.501541047912581e-06, + "loss": 1.0252, + "step": 71100 + }, + { + "epoch": 9.8369715390992, + "grad_norm": 0.045856546610593796, + "learning_rate": 5.081255253572429e-06, + "loss": 1.0249, + "step": 71200 + }, + { + "epoch": 9.850787510361979, + "grad_norm": 0.047852564603090286, + "learning_rate": 4.660969459232278e-06, + "loss": 1.0238, + "step": 71300 + }, + { + "epoch": 9.864603481624759, + "grad_norm": 0.044457610696554184, + "learning_rate": 4.240683664892126e-06, + "loss": 1.0245, + "step": 71400 + }, + { + "epoch": 9.878419452887538, + "grad_norm": 0.06768154352903366, + "learning_rate": 3.824600728495377e-06, + "loss": 1.0256, + "step": 71500 + }, + { + "epoch": 9.892235424150318, + "grad_norm": 0.050749246031045914, + 
"learning_rate": 3.4043149341552255e-06, + "loss": 1.0224, + "step": 71600 + }, + { + "epoch": 9.906051395413098, + "grad_norm": 0.04643206670880318, + "learning_rate": 2.984029139815074e-06, + "loss": 1.0279, + "step": 71700 + }, + { + "epoch": 9.919867366675877, + "grad_norm": 0.05305636674165726, + "learning_rate": 2.5637433454749226e-06, + "loss": 1.0257, + "step": 71800 + }, + { + "epoch": 9.933683337938657, + "grad_norm": 0.06888972967863083, + "learning_rate": 2.1434575511347716e-06, + "loss": 1.0301, + "step": 71900 + }, + { + "epoch": 9.947499309201437, + "grad_norm": 0.04633474349975586, + "learning_rate": 1.7231717567946201e-06, + "loss": 1.0228, + "step": 72000 + }, + { + "epoch": 9.961315280464216, + "grad_norm": 0.05391710251569748, + "learning_rate": 1.302885962454469e-06, + "loss": 1.0284, + "step": 72100 + }, + { + "epoch": 9.975131251726996, + "grad_norm": 0.048064954578876495, + "learning_rate": 8.826001681143176e-07, + "loss": 1.0257, + "step": 72200 + }, + { + "epoch": 9.988947222989776, + "grad_norm": 0.04865507408976555, + "learning_rate": 4.623143737741664e-07, + "loss": 1.0274, + "step": 72300 + }, + { + "epoch": 10.0, + "step": 72380, + "total_flos": 9.683106445125485e+18, + "train_loss": 1.0569831334908848, + "train_runtime": 55085.3166, + "train_samples_per_second": 336.373, + "train_steps_per_second": 1.314 + } + ], + "logging_steps": 100, + "max_steps": 72380, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.683106445125485e+18, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..45f2f12b913e85908e1565ce4b13c8763ea7a1ca --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19e8fb7657adc13bdcaf635b1c6fb616dd082a6870cdd6aecd3b669d8cac0873 +size 5304 diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..d0809a2e3e28811023f05ed415122e24681bc9d1 --- /dev/null +++ b/vocab.json @@ -0,0 +1 @@ +{"<|endoftext|>":0,"A":1,"C":2,"G":3,"T":4} \ No newline at end of file