Upload 13 files
Browse files- config.json +38 -0
- generation_config.json +7 -0
- merges.txt +0 -0
- model.safetensors +3 -0
- optimizer.pt +3 -0
- rng_state.pth +3 -0
- scheduler.pt +3 -0
- special_tokens_map.json +34 -0
- tokenizer.json +0 -0
- tokenizer_config.json +155 -0
- trainer_state.json +216 -0
- training_args.bin +3 -0
- vocab.json +0 -0
    	
        config.json
    ADDED
    
    | @@ -0,0 +1,38 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "architectures": [
         | 
| 3 | 
            +
                "LlamaForCausalLM"
         | 
| 4 | 
            +
              ],
         | 
| 5 | 
            +
              "attention_bias": false,
         | 
| 6 | 
            +
              "attention_dropout": 0.0,
         | 
| 7 | 
            +
              "bos_token_id": 1,
         | 
| 8 | 
            +
              "eos_token_id": 2,
         | 
| 9 | 
            +
              "head_dim": 64,
         | 
| 10 | 
            +
              "hidden_act": "silu",
         | 
| 11 | 
            +
              "hidden_size": 960,
         | 
| 12 | 
            +
              "initializer_range": 0.02,
         | 
| 13 | 
            +
              "intermediate_size": 2560,
         | 
| 14 | 
            +
              "is_llama_config": true,
         | 
| 15 | 
            +
              "max_position_embeddings": 8192,
         | 
| 16 | 
            +
              "mlp_bias": false,
         | 
| 17 | 
            +
              "model_type": "llama",
         | 
| 18 | 
            +
              "num_attention_heads": 15,
         | 
| 19 | 
            +
              "num_hidden_layers": 32,
         | 
| 20 | 
            +
              "num_key_value_heads": 5,
         | 
| 21 | 
            +
              "pad_token_id": 2,
         | 
| 22 | 
            +
              "pretraining_tp": 1,
         | 
| 23 | 
            +
              "rms_norm_eps": 1e-05,
         | 
| 24 | 
            +
              "rope_interleaved": false,
         | 
| 25 | 
            +
              "rope_scaling": null,
         | 
| 26 | 
            +
              "rope_theta": 100000,
         | 
| 27 | 
            +
              "tie_word_embeddings": true,
         | 
| 28 | 
            +
              "torch_dtype": "float32",
         | 
| 29 | 
            +
              "transformers.js_config": {
         | 
| 30 | 
            +
                "kv_cache_dtype": {
         | 
| 31 | 
            +
                  "fp16": "float16",
         | 
| 32 | 
            +
                  "q4f16": "float16"
         | 
| 33 | 
            +
                }
         | 
| 34 | 
            +
              },
         | 
| 35 | 
            +
              "transformers_version": "4.51.3",
         | 
| 36 | 
            +
              "use_cache": true,
         | 
| 37 | 
            +
              "vocab_size": 49152
         | 
| 38 | 
            +
            }
         | 
    	
        generation_config.json
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "_from_model_config": true,
         | 
| 3 | 
            +
              "bos_token_id": 1,
         | 
| 4 | 
            +
              "eos_token_id": 2,
         | 
| 5 | 
            +
              "pad_token_id": 2,
         | 
| 6 | 
            +
              "transformers_version": "4.51.3"
         | 
| 7 | 
            +
            }
         | 
    	
        merges.txt
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        model.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:06f2c4a5e2f95070ea789067ffcee980df7a8b467d28a9b201ed2b784f989f45
         | 
| 3 | 
            +
            size 1447317080
         | 
    	
        optimizer.pt
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:4a8ba8e01c178ef6fa98339c2359fb6fe6db898f6ab073664ef04e762d436767
         | 
| 3 | 
            +
            size 2894813242
         | 
    	
        rng_state.pth
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
         | 
| 3 | 
            +
            size 14244
         | 
    	
        scheduler.pt
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:ed5c253a48e6001901218a2e9de2ab738bbeb1dc1ea233eaecf6152cbd749c0c
         | 
| 3 | 
            +
            size 1064
         | 
    	
        special_tokens_map.json
    ADDED
    
    | @@ -0,0 +1,34 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "additional_special_tokens": [
         | 
| 3 | 
            +
                "<|im_start|>",
         | 
| 4 | 
            +
                "<|im_end|>"
         | 
| 5 | 
            +
              ],
         | 
| 6 | 
            +
              "bos_token": {
         | 
| 7 | 
            +
                "content": "<|im_start|>",
         | 
| 8 | 
            +
                "lstrip": false,
         | 
| 9 | 
            +
                "normalized": false,
         | 
| 10 | 
            +
                "rstrip": false,
         | 
| 11 | 
            +
                "single_word": false
         | 
| 12 | 
            +
              },
         | 
| 13 | 
            +
              "eos_token": {
         | 
| 14 | 
            +
                "content": "<|im_end|>",
         | 
| 15 | 
            +
                "lstrip": false,
         | 
| 16 | 
            +
                "normalized": false,
         | 
| 17 | 
            +
                "rstrip": false,
         | 
| 18 | 
            +
                "single_word": false
         | 
| 19 | 
            +
              },
         | 
| 20 | 
            +
              "pad_token": {
         | 
| 21 | 
            +
                "content": "<|im_end|>",
         | 
| 22 | 
            +
                "lstrip": false,
         | 
| 23 | 
            +
                "normalized": false,
         | 
| 24 | 
            +
                "rstrip": false,
         | 
| 25 | 
            +
                "single_word": false
         | 
| 26 | 
            +
              },
         | 
| 27 | 
            +
              "unk_token": {
         | 
| 28 | 
            +
                "content": "<|endoftext|>",
         | 
| 29 | 
            +
                "lstrip": false,
         | 
| 30 | 
            +
                "normalized": false,
         | 
| 31 | 
            +
                "rstrip": false,
         | 
| 32 | 
            +
                "single_word": false
         | 
| 33 | 
            +
              }
         | 
| 34 | 
            +
            }
         | 
    	
        tokenizer.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        tokenizer_config.json
    ADDED
    
    | @@ -0,0 +1,155 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "add_prefix_space": false,
         | 
| 3 | 
            +
              "added_tokens_decoder": {
         | 
| 4 | 
            +
                "0": {
         | 
| 5 | 
            +
                  "content": "<|endoftext|>",
         | 
| 6 | 
            +
                  "lstrip": false,
         | 
| 7 | 
            +
                  "normalized": false,
         | 
| 8 | 
            +
                  "rstrip": false,
         | 
| 9 | 
            +
                  "single_word": false,
         | 
| 10 | 
            +
                  "special": true
         | 
| 11 | 
            +
                },
         | 
| 12 | 
            +
                "1": {
         | 
| 13 | 
            +
                  "content": "<|im_start|>",
         | 
| 14 | 
            +
                  "lstrip": false,
         | 
| 15 | 
            +
                  "normalized": false,
         | 
| 16 | 
            +
                  "rstrip": false,
         | 
| 17 | 
            +
                  "single_word": false,
         | 
| 18 | 
            +
                  "special": true
         | 
| 19 | 
            +
                },
         | 
| 20 | 
            +
                "2": {
         | 
| 21 | 
            +
                  "content": "<|im_end|>",
         | 
| 22 | 
            +
                  "lstrip": false,
         | 
| 23 | 
            +
                  "normalized": false,
         | 
| 24 | 
            +
                  "rstrip": false,
         | 
| 25 | 
            +
                  "single_word": false,
         | 
| 26 | 
            +
                  "special": true
         | 
| 27 | 
            +
                },
         | 
| 28 | 
            +
                "3": {
         | 
| 29 | 
            +
                  "content": "<repo_name>",
         | 
| 30 | 
            +
                  "lstrip": false,
         | 
| 31 | 
            +
                  "normalized": false,
         | 
| 32 | 
            +
                  "rstrip": false,
         | 
| 33 | 
            +
                  "single_word": false,
         | 
| 34 | 
            +
                  "special": true
         | 
| 35 | 
            +
                },
         | 
| 36 | 
            +
                "4": {
         | 
| 37 | 
            +
                  "content": "<reponame>",
         | 
| 38 | 
            +
                  "lstrip": false,
         | 
| 39 | 
            +
                  "normalized": false,
         | 
| 40 | 
            +
                  "rstrip": false,
         | 
| 41 | 
            +
                  "single_word": false,
         | 
| 42 | 
            +
                  "special": true
         | 
| 43 | 
            +
                },
         | 
| 44 | 
            +
                "5": {
         | 
| 45 | 
            +
                  "content": "<file_sep>",
         | 
| 46 | 
            +
                  "lstrip": false,
         | 
| 47 | 
            +
                  "normalized": false,
         | 
| 48 | 
            +
                  "rstrip": false,
         | 
| 49 | 
            +
                  "single_word": false,
         | 
| 50 | 
            +
                  "special": true
         | 
| 51 | 
            +
                },
         | 
| 52 | 
            +
                "6": {
         | 
| 53 | 
            +
                  "content": "<filename>",
         | 
| 54 | 
            +
                  "lstrip": false,
         | 
| 55 | 
            +
                  "normalized": false,
         | 
| 56 | 
            +
                  "rstrip": false,
         | 
| 57 | 
            +
                  "single_word": false,
         | 
| 58 | 
            +
                  "special": true
         | 
| 59 | 
            +
                },
         | 
| 60 | 
            +
                "7": {
         | 
| 61 | 
            +
                  "content": "<gh_stars>",
         | 
| 62 | 
            +
                  "lstrip": false,
         | 
| 63 | 
            +
                  "normalized": false,
         | 
| 64 | 
            +
                  "rstrip": false,
         | 
| 65 | 
            +
                  "single_word": false,
         | 
| 66 | 
            +
                  "special": true
         | 
| 67 | 
            +
                },
         | 
| 68 | 
            +
                "8": {
         | 
| 69 | 
            +
                  "content": "<issue_start>",
         | 
| 70 | 
            +
                  "lstrip": false,
         | 
| 71 | 
            +
                  "normalized": false,
         | 
| 72 | 
            +
                  "rstrip": false,
         | 
| 73 | 
            +
                  "single_word": false,
         | 
| 74 | 
            +
                  "special": true
         | 
| 75 | 
            +
                },
         | 
| 76 | 
            +
                "9": {
         | 
| 77 | 
            +
                  "content": "<issue_comment>",
         | 
| 78 | 
            +
                  "lstrip": false,
         | 
| 79 | 
            +
                  "normalized": false,
         | 
| 80 | 
            +
                  "rstrip": false,
         | 
| 81 | 
            +
                  "single_word": false,
         | 
| 82 | 
            +
                  "special": true
         | 
| 83 | 
            +
                },
         | 
| 84 | 
            +
                "10": {
         | 
| 85 | 
            +
                  "content": "<issue_closed>",
         | 
| 86 | 
            +
                  "lstrip": false,
         | 
| 87 | 
            +
                  "normalized": false,
         | 
| 88 | 
            +
                  "rstrip": false,
         | 
| 89 | 
            +
                  "single_word": false,
         | 
| 90 | 
            +
                  "special": true
         | 
| 91 | 
            +
                },
         | 
| 92 | 
            +
                "11": {
         | 
| 93 | 
            +
                  "content": "<jupyter_start>",
         | 
| 94 | 
            +
                  "lstrip": false,
         | 
| 95 | 
            +
                  "normalized": false,
         | 
| 96 | 
            +
                  "rstrip": false,
         | 
| 97 | 
            +
                  "single_word": false,
         | 
| 98 | 
            +
                  "special": true
         | 
| 99 | 
            +
                },
         | 
| 100 | 
            +
                "12": {
         | 
| 101 | 
            +
                  "content": "<jupyter_text>",
         | 
| 102 | 
            +
                  "lstrip": false,
         | 
| 103 | 
            +
                  "normalized": false,
         | 
| 104 | 
            +
                  "rstrip": false,
         | 
| 105 | 
            +
                  "single_word": false,
         | 
| 106 | 
            +
                  "special": true
         | 
| 107 | 
            +
                },
         | 
| 108 | 
            +
                "13": {
         | 
| 109 | 
            +
                  "content": "<jupyter_code>",
         | 
| 110 | 
            +
                  "lstrip": false,
         | 
| 111 | 
            +
                  "normalized": false,
         | 
| 112 | 
            +
                  "rstrip": false,
         | 
| 113 | 
            +
                  "single_word": false,
         | 
| 114 | 
            +
                  "special": true
         | 
| 115 | 
            +
                },
         | 
| 116 | 
            +
                "14": {
         | 
| 117 | 
            +
                  "content": "<jupyter_output>",
         | 
| 118 | 
            +
                  "lstrip": false,
         | 
| 119 | 
            +
                  "normalized": false,
         | 
| 120 | 
            +
                  "rstrip": false,
         | 
| 121 | 
            +
                  "single_word": false,
         | 
| 122 | 
            +
                  "special": true
         | 
| 123 | 
            +
                },
         | 
| 124 | 
            +
                "15": {
         | 
| 125 | 
            +
                  "content": "<jupyter_script>",
         | 
| 126 | 
            +
                  "lstrip": false,
         | 
| 127 | 
            +
                  "normalized": false,
         | 
| 128 | 
            +
                  "rstrip": false,
         | 
| 129 | 
            +
                  "single_word": false,
         | 
| 130 | 
            +
                  "special": true
         | 
| 131 | 
            +
                },
         | 
| 132 | 
            +
                "16": {
         | 
| 133 | 
            +
                  "content": "<empty_output>",
         | 
| 134 | 
            +
                  "lstrip": false,
         | 
| 135 | 
            +
                  "normalized": false,
         | 
| 136 | 
            +
                  "rstrip": false,
         | 
| 137 | 
            +
                  "single_word": false,
         | 
| 138 | 
            +
                  "special": true
         | 
| 139 | 
            +
                }
         | 
| 140 | 
            +
              },
         | 
| 141 | 
            +
              "additional_special_tokens": [
         | 
| 142 | 
            +
                "<|im_start|>",
         | 
| 143 | 
            +
                "<|im_end|>"
         | 
| 144 | 
            +
              ],
         | 
| 145 | 
            +
              "bos_token": "<|im_start|>",
         | 
| 146 | 
            +
              "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
         | 
| 147 | 
            +
              "clean_up_tokenization_spaces": false,
         | 
| 148 | 
            +
              "eos_token": "<|im_end|>",
         | 
| 149 | 
            +
              "extra_special_tokens": {},
         | 
| 150 | 
            +
              "model_max_length": 8192,
         | 
| 151 | 
            +
              "pad_token": "<|im_end|>",
         | 
| 152 | 
            +
              "tokenizer_class": "GPT2Tokenizer",
         | 
| 153 | 
            +
              "unk_token": "<|endoftext|>",
         | 
| 154 | 
            +
              "vocab_size": 49152
         | 
| 155 | 
            +
            }
         | 
    	
        trainer_state.json
    ADDED
    
    | @@ -0,0 +1,216 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "best_global_step": null,
         | 
| 3 | 
            +
              "best_metric": null,
         | 
| 4 | 
            +
              "best_model_checkpoint": null,
         | 
| 5 | 
            +
              "epoch": 0.2,
         | 
| 6 | 
            +
              "eval_steps": 500,
         | 
| 7 | 
            +
              "global_step": 267,
         | 
| 8 | 
            +
              "is_hyper_param_search": false,
         | 
| 9 | 
            +
              "is_local_process_zero": true,
         | 
| 10 | 
            +
              "is_world_process_zero": true,
         | 
| 11 | 
            +
              "log_history": [
         | 
| 12 | 
            +
                {
         | 
| 13 | 
            +
                  "epoch": 0.00749063670411985,
         | 
| 14 | 
            +
                  "grad_norm": 4.558547019958496,
         | 
| 15 | 
            +
                  "learning_rate": 4.831460674157304e-05,
         | 
| 16 | 
            +
                  "loss": 3.2586,
         | 
| 17 | 
            +
                  "step": 10
         | 
| 18 | 
            +
                },
         | 
| 19 | 
            +
                {
         | 
| 20 | 
            +
                  "epoch": 0.0149812734082397,
         | 
| 21 | 
            +
                  "grad_norm": 4.060137748718262,
         | 
| 22 | 
            +
                  "learning_rate": 4.644194756554308e-05,
         | 
| 23 | 
            +
                  "loss": 2.9414,
         | 
| 24 | 
            +
                  "step": 20
         | 
| 25 | 
            +
                },
         | 
| 26 | 
            +
                {
         | 
| 27 | 
            +
                  "epoch": 0.02247191011235955,
         | 
| 28 | 
            +
                  "grad_norm": 4.330236434936523,
         | 
| 29 | 
            +
                  "learning_rate": 4.456928838951311e-05,
         | 
| 30 | 
            +
                  "loss": 2.7364,
         | 
| 31 | 
            +
                  "step": 30
         | 
| 32 | 
            +
                },
         | 
| 33 | 
            +
                {
         | 
| 34 | 
            +
                  "epoch": 0.0299625468164794,
         | 
| 35 | 
            +
                  "grad_norm": 3.8646533489227295,
         | 
| 36 | 
            +
                  "learning_rate": 4.269662921348315e-05,
         | 
| 37 | 
            +
                  "loss": 2.6617,
         | 
| 38 | 
            +
                  "step": 40
         | 
| 39 | 
            +
                },
         | 
| 40 | 
            +
                {
         | 
| 41 | 
            +
                  "epoch": 0.03745318352059925,
         | 
| 42 | 
            +
                  "grad_norm": 4.332949161529541,
         | 
| 43 | 
            +
                  "learning_rate": 4.082397003745319e-05,
         | 
| 44 | 
            +
                  "loss": 2.5693,
         | 
| 45 | 
            +
                  "step": 50
         | 
| 46 | 
            +
                },
         | 
| 47 | 
            +
                {
         | 
| 48 | 
            +
                  "epoch": 0.0449438202247191,
         | 
| 49 | 
            +
                  "grad_norm": 3.7690067291259766,
         | 
| 50 | 
            +
                  "learning_rate": 3.8951310861423226e-05,
         | 
| 51 | 
            +
                  "loss": 2.6648,
         | 
| 52 | 
            +
                  "step": 60
         | 
| 53 | 
            +
                },
         | 
| 54 | 
            +
                {
         | 
| 55 | 
            +
                  "epoch": 0.052434456928838954,
         | 
| 56 | 
            +
                  "grad_norm": 3.539844274520874,
         | 
| 57 | 
            +
                  "learning_rate": 3.7078651685393264e-05,
         | 
| 58 | 
            +
                  "loss": 2.6171,
         | 
| 59 | 
            +
                  "step": 70
         | 
| 60 | 
            +
                },
         | 
| 61 | 
            +
                {
         | 
| 62 | 
            +
                  "epoch": 0.0599250936329588,
         | 
| 63 | 
            +
                  "grad_norm": 4.24369478225708,
         | 
| 64 | 
            +
                  "learning_rate": 3.52059925093633e-05,
         | 
| 65 | 
            +
                  "loss": 2.4795,
         | 
| 66 | 
            +
                  "step": 80
         | 
| 67 | 
            +
                },
         | 
| 68 | 
            +
                {
         | 
| 69 | 
            +
                  "epoch": 0.06741573033707865,
         | 
| 70 | 
            +
                  "grad_norm": 4.186371326446533,
         | 
| 71 | 
            +
                  "learning_rate": 3.3333333333333335e-05,
         | 
| 72 | 
            +
                  "loss": 2.6167,
         | 
| 73 | 
            +
                  "step": 90
         | 
| 74 | 
            +
                },
         | 
| 75 | 
            +
                {
         | 
| 76 | 
            +
                  "epoch": 0.0749063670411985,
         | 
| 77 | 
            +
                  "grad_norm": 3.6899819374084473,
         | 
| 78 | 
            +
                  "learning_rate": 3.1460674157303374e-05,
         | 
| 79 | 
            +
                  "loss": 2.4597,
         | 
| 80 | 
            +
                  "step": 100
         | 
| 81 | 
            +
                },
         | 
| 82 | 
            +
                {
         | 
| 83 | 
            +
                  "epoch": 0.08239700374531835,
         | 
| 84 | 
            +
                  "grad_norm": 4.035410404205322,
         | 
| 85 | 
            +
                  "learning_rate": 2.958801498127341e-05,
         | 
| 86 | 
            +
                  "loss": 2.3755,
         | 
| 87 | 
            +
                  "step": 110
         | 
| 88 | 
            +
                },
         | 
| 89 | 
            +
                {
         | 
| 90 | 
            +
                  "epoch": 0.0898876404494382,
         | 
| 91 | 
            +
                  "grad_norm": 3.86106538772583,
         | 
| 92 | 
            +
                  "learning_rate": 2.7715355805243448e-05,
         | 
| 93 | 
            +
                  "loss": 2.4617,
         | 
| 94 | 
            +
                  "step": 120
         | 
| 95 | 
            +
                },
         | 
| 96 | 
            +
                {
         | 
| 97 | 
            +
                  "epoch": 0.09737827715355805,
         | 
| 98 | 
            +
                  "grad_norm": 5.214311599731445,
         | 
| 99 | 
            +
                  "learning_rate": 2.5842696629213486e-05,
         | 
| 100 | 
            +
                  "loss": 2.3476,
         | 
| 101 | 
            +
                  "step": 130
         | 
| 102 | 
            +
                },
         | 
| 103 | 
            +
                {
         | 
| 104 | 
            +
                  "epoch": 0.10486891385767791,
         | 
| 105 | 
            +
                  "grad_norm": 4.270963191986084,
         | 
| 106 | 
            +
                  "learning_rate": 2.3970037453183522e-05,
         | 
| 107 | 
            +
                  "loss": 2.4596,
         | 
| 108 | 
            +
                  "step": 140
         | 
| 109 | 
            +
                },
         | 
| 110 | 
            +
                {
         | 
| 111 | 
            +
                  "epoch": 0.11235955056179775,
         | 
| 112 | 
            +
                  "grad_norm": 3.6309258937835693,
         | 
| 113 | 
            +
                  "learning_rate": 2.209737827715356e-05,
         | 
| 114 | 
            +
                  "loss": 2.3118,
         | 
| 115 | 
            +
                  "step": 150
         | 
| 116 | 
            +
                },
         | 
| 117 | 
            +
                {
         | 
| 118 | 
            +
                  "epoch": 0.1198501872659176,
         | 
| 119 | 
            +
                  "grad_norm": 3.888986349105835,
         | 
| 120 | 
            +
                  "learning_rate": 2.0224719101123596e-05,
         | 
| 121 | 
            +
                  "loss": 2.3443,
         | 
| 122 | 
            +
                  "step": 160
         | 
| 123 | 
            +
                },
         | 
| 124 | 
            +
                {
         | 
| 125 | 
            +
                  "epoch": 0.12734082397003746,
         | 
| 126 | 
            +
                  "grad_norm": 2.937702178955078,
         | 
| 127 | 
            +
                  "learning_rate": 1.8352059925093635e-05,
         | 
| 128 | 
            +
                  "loss": 2.4183,
         | 
| 129 | 
            +
                  "step": 170
         | 
| 130 | 
            +
                },
         | 
| 131 | 
            +
                {
         | 
| 132 | 
            +
                  "epoch": 0.1348314606741573,
         | 
| 133 | 
            +
                  "grad_norm": 3.4217689037323,
         | 
| 134 | 
            +
                  "learning_rate": 1.647940074906367e-05,
         | 
| 135 | 
            +
                  "loss": 2.4306,
         | 
| 136 | 
            +
                  "step": 180
         | 
| 137 | 
            +
                },
         | 
| 138 | 
            +
                {
         | 
| 139 | 
            +
                  "epoch": 0.14232209737827714,
         | 
| 140 | 
            +
                  "grad_norm": 3.5683271884918213,
         | 
| 141 | 
            +
                  "learning_rate": 1.4606741573033709e-05,
         | 
| 142 | 
            +
                  "loss": 2.3407,
         | 
| 143 | 
            +
                  "step": 190
         | 
| 144 | 
            +
                },
         | 
| 145 | 
            +
                {
         | 
| 146 | 
            +
                  "epoch": 0.149812734082397,
         | 
| 147 | 
            +
                  "grad_norm": 3.605642318725586,
         | 
| 148 | 
            +
                  "learning_rate": 1.2734082397003746e-05,
         | 
| 149 | 
            +
                  "loss": 2.3402,
         | 
| 150 | 
            +
                  "step": 200
         | 
| 151 | 
            +
                },
         | 
| 152 | 
            +
                {
         | 
| 153 | 
            +
                  "epoch": 0.15730337078651685,
         | 
| 154 | 
            +
                  "grad_norm": 2.3034493923187256,
         | 
| 155 | 
            +
                  "learning_rate": 1.0861423220973783e-05,
         | 
| 156 | 
            +
                  "loss": 2.1756,
         | 
| 157 | 
            +
                  "step": 210
         | 
| 158 | 
            +
                },
         | 
| 159 | 
            +
                {
         | 
| 160 | 
            +
                  "epoch": 0.1647940074906367,
         | 
| 161 | 
            +
                  "grad_norm": 3.481696605682373,
         | 
| 162 | 
            +
                  "learning_rate": 8.98876404494382e-06,
         | 
| 163 | 
            +
                  "loss": 2.1487,
         | 
| 164 | 
            +
                  "step": 220
         | 
| 165 | 
            +
                },
         | 
| 166 | 
            +
                {
         | 
| 167 | 
            +
                  "epoch": 0.17228464419475656,
         | 
| 168 | 
            +
                  "grad_norm": 3.515986919403076,
         | 
| 169 | 
            +
                  "learning_rate": 7.116104868913858e-06,
         | 
| 170 | 
            +
                  "loss": 2.2253,
         | 
| 171 | 
            +
                  "step": 230
         | 
| 172 | 
            +
                },
         | 
| 173 | 
            +
                {
         | 
| 174 | 
            +
                  "epoch": 0.1797752808988764,
         | 
| 175 | 
            +
                  "grad_norm": 3.26123046875,
         | 
| 176 | 
            +
                  "learning_rate": 5.243445692883896e-06,
         | 
| 177 | 
            +
                  "loss": 2.3215,
         | 
| 178 | 
            +
                  "step": 240
         | 
| 179 | 
            +
                },
         | 
| 180 | 
            +
                {
         | 
| 181 | 
            +
                  "epoch": 0.18726591760299627,
         | 
| 182 | 
            +
                  "grad_norm": 4.033005714416504,
         | 
| 183 | 
            +
                  "learning_rate": 3.3707865168539327e-06,
         | 
| 184 | 
            +
                  "loss": 2.2975,
         | 
| 185 | 
            +
                  "step": 250
         | 
| 186 | 
            +
                },
         | 
| 187 | 
            +
                {
         | 
| 188 | 
            +
                  "epoch": 0.1947565543071161,
         | 
| 189 | 
            +
                  "grad_norm": 3.7090864181518555,
         | 
| 190 | 
            +
                  "learning_rate": 1.4981273408239701e-06,
         | 
| 191 | 
            +
                  "loss": 2.2616,
         | 
| 192 | 
            +
                  "step": 260
         | 
| 193 | 
            +
                }
         | 
| 194 | 
            +
              ],
         | 
| 195 | 
            +
              "logging_steps": 10,
         | 
| 196 | 
            +
              "max_steps": 267,
         | 
| 197 | 
            +
              "num_input_tokens_seen": 0,
         | 
| 198 | 
            +
              "num_train_epochs": 1,
         | 
| 199 | 
            +
              "save_steps": 500,
         | 
| 200 | 
            +
              "stateful_callbacks": {
         | 
| 201 | 
            +
                "TrainerControl": {
         | 
| 202 | 
            +
                  "args": {
         | 
| 203 | 
            +
                    "should_epoch_stop": false,
         | 
| 204 | 
            +
                    "should_evaluate": false,
         | 
| 205 | 
            +
                    "should_log": false,
         | 
| 206 | 
            +
                    "should_save": true,
         | 
| 207 | 
            +
                    "should_training_stop": true
         | 
| 208 | 
            +
                  },
         | 
| 209 | 
            +
                  "attributes": {}
         | 
| 210 | 
            +
                }
         | 
| 211 | 
            +
              },
         | 
| 212 | 
            +
              "total_flos": 1032285369139200.0,
         | 
| 213 | 
            +
              "train_batch_size": 4,
         | 
| 214 | 
            +
              "trial_name": null,
         | 
| 215 | 
            +
              "trial_params": null
         | 
| 216 | 
            +
            }
         | 
    	
        training_args.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:f1ac74783d34e42ccc2da737bfff93ba37dddbb94719fd2396015daa942159ee
         | 
| 3 | 
            +
            size 5304
         | 
    	
        vocab.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 

