C10X committed on
Commit
a91e884
·
verified ·
1 Parent(s): 6e2d39c

Upload trainer_state.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. trainer_state.json +139 -0
trainer_state.json ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 700,
3
+ "best_metric": 5.510611534118652,
4
+ "best_model_checkpoint": "./qwen3moe_tinystories_sft/checkpoint-700",
5
+ "epoch": 0.9996631862579993,
6
+ "eval_steps": 100,
7
+ "global_step": 742,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.13472549680026946,
14
+ "grad_norm": 58221.64453125,
15
+ "learning_rate": 3.3221476510067115e-05,
16
+ "loss": 5.6966,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.13472549680026946,
21
+ "eval_loss": 10.694306373596191,
22
+ "eval_runtime": 119.9061,
23
+ "eval_samples_per_second": 41.699,
24
+ "eval_steps_per_second": 2.61,
25
+ "step": 100
26
+ },
27
+ {
28
+ "epoch": 0.2694509936005389,
29
+ "grad_norm": 53346.890625,
30
+ "learning_rate": 4.5784148397976396e-05,
31
+ "loss": 4.7677,
32
+ "step": 200
33
+ },
34
+ {
35
+ "epoch": 0.2694509936005389,
36
+ "eval_loss": 8.538475036621094,
37
+ "eval_runtime": 118.4911,
38
+ "eval_samples_per_second": 42.197,
39
+ "eval_steps_per_second": 2.642,
40
+ "step": 200
41
+ },
42
+ {
43
+ "epoch": 0.40417649040080833,
44
+ "grad_norm": 60593.984375,
45
+ "learning_rate": 3.735244519392918e-05,
46
+ "loss": 3.8182,
47
+ "step": 300
48
+ },
49
+ {
50
+ "epoch": 0.40417649040080833,
51
+ "eval_loss": 6.992630481719971,
52
+ "eval_runtime": 122.3318,
53
+ "eval_samples_per_second": 40.872,
54
+ "eval_steps_per_second": 2.559,
55
+ "step": 300
56
+ },
57
+ {
58
+ "epoch": 0.5389019872010778,
59
+ "grad_norm": 45884.55078125,
60
+ "learning_rate": 2.8920741989881955e-05,
61
+ "loss": 3.268,
62
+ "step": 400
63
+ },
64
+ {
65
+ "epoch": 0.5389019872010778,
66
+ "eval_loss": 6.24953031539917,
67
+ "eval_runtime": 122.2793,
68
+ "eval_samples_per_second": 40.89,
69
+ "eval_steps_per_second": 2.56,
70
+ "step": 400
71
+ },
72
+ {
73
+ "epoch": 0.6736274840013473,
74
+ "grad_norm": 37802.12890625,
75
+ "learning_rate": 2.048903878583474e-05,
76
+ "loss": 2.9965,
77
+ "step": 500
78
+ },
79
+ {
80
+ "epoch": 0.6736274840013473,
81
+ "eval_loss": 5.843188762664795,
82
+ "eval_runtime": 121.956,
83
+ "eval_samples_per_second": 40.998,
84
+ "eval_steps_per_second": 2.567,
85
+ "step": 500
86
+ },
87
+ {
88
+ "epoch": 0.8083529808016167,
89
+ "grad_norm": 34398.84375,
90
+ "learning_rate": 1.205733558178752e-05,
91
+ "loss": 2.8499,
92
+ "step": 600
93
+ },
94
+ {
95
+ "epoch": 0.8083529808016167,
96
+ "eval_loss": 5.611515045166016,
97
+ "eval_runtime": 121.7403,
98
+ "eval_samples_per_second": 41.071,
99
+ "eval_steps_per_second": 2.571,
100
+ "step": 600
101
+ },
102
+ {
103
+ "epoch": 0.9430784776018861,
104
+ "grad_norm": 29294.10546875,
105
+ "learning_rate": 3.625632377740304e-06,
106
+ "loss": 2.7606,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 0.9430784776018861,
111
+ "eval_loss": 5.510611534118652,
112
+ "eval_runtime": 121.6665,
113
+ "eval_samples_per_second": 41.096,
114
+ "eval_steps_per_second": 2.573,
115
+ "step": 700
116
+ }
117
+ ],
118
+ "logging_steps": 100,
119
+ "max_steps": 742,
120
+ "num_input_tokens_seen": 0,
121
+ "num_train_epochs": 1,
122
+ "save_steps": 100,
123
+ "stateful_callbacks": {
124
+ "TrainerControl": {
125
+ "args": {
126
+ "should_epoch_stop": false,
127
+ "should_evaluate": false,
128
+ "should_log": false,
129
+ "should_save": true,
130
+ "should_training_stop": true
131
+ },
132
+ "attributes": {}
133
+ }
134
+ },
135
+ "total_flos": 1.0700859494762496e+16,
136
+ "train_batch_size": 16,
137
+ "trial_name": null,
138
+ "trial_params": null
139
+ }