nhxnnz committed
Commit 4217876 · verified · 1 Parent(s): 7832a46

End of training

README.md CHANGED
@@ -45,7 +45,7 @@ The following hyperparameters were used during training:
  - total_train_batch_size: 8
  - optimizer: Use OptimizerNames.PAGED_ADAMW_8BIT with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
  - lr_scheduler_type: linear
- - num_epochs: 5
+ - num_epochs: 3
  - mixed_precision_training: Native AMP
 
  ### Training results
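
For context on the hyperparameter block diffed above, here is a minimal sketch of a transformers.TrainingArguments setup that matches it after this commit. The output_dir, the learning_rate, and the 4 × 2 split of the total batch size of 8 are assumptions inferred from the other files in this commit (train_batch_size: 4 in trainer_state.json), not values stated in the README itself.

```python
# Minimal sketch (not from this repo) of TrainingArguments matching the README
# hyperparameters after this commit; hedged values are marked as assumptions.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="outputs",              # placeholder
    num_train_epochs=3,                # changed from 5 in this commit
    per_device_train_batch_size=4,     # "train_batch_size": 4 in trainer_state.json
    gradient_accumulation_steps=2,     # assumed: 4 * 2 = total_train_batch_size 8
    learning_rate=2e-4,                # assumed from the logged linear LR schedule
    optim="paged_adamw_8bit",          # OptimizerNames.PAGED_ADAMW_8BIT (needs bitsandbytes)
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    lr_scheduler_type="linear",
    fp16=True,                         # "Native AMP" mixed-precision training
    logging_steps=100,
    eval_steps=500,
    save_steps=500,
)
```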
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e5aa93c1b7c6b9c41d9c025d794d95d11a02e903be58aa7c8109a86cabf31a19
+ oid sha256:f13b5bc3bfd7e71cc212611755dcdb8e8ff2fa2781efeef6f372a56a4ce707d3
  size 7098064
all_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-  "epoch": 5.0,
-  "total_flos": 4.0365651456e+18,
-  "train_loss": 0.6317709513287723,
-  "train_runtime": 3451.1568,
-  "train_samples_per_second": 4.017,
-  "train_steps_per_second": 0.503
+  "epoch": 3.0,
+  "total_flos": 2.42193908736e+18,
+  "train_loss": 0.6824458238599852,
+  "train_runtime": 2058.2511,
+  "train_samples_per_second": 4.042,
+  "train_steps_per_second": 0.506
 }
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-  "epoch": 5.0,
-  "total_flos": 4.0365651456e+18,
-  "train_loss": 0.6317709513287723,
-  "train_runtime": 3451.1568,
-  "train_samples_per_second": 4.017,
-  "train_steps_per_second": 0.503
+  "epoch": 3.0,
+  "total_flos": 2.42193908736e+18,
+  "train_loss": 0.6824458238599852,
+  "train_runtime": 2058.2511,
+  "train_samples_per_second": 4.042,
+  "train_steps_per_second": 0.506
 }
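
The two results files above carry the same metrics. As a quick sanity check on the new numbers (the implied dataset size below is an inference for illustration, not a value stored anywhere in this commit), they are mutually consistent:

```python
# Rough consistency check of the updated metrics above.
epochs = 3.0
train_runtime = 2058.2511          # seconds
samples_per_second = 4.042
steps_per_second = 0.506

samples_processed = train_runtime * samples_per_second   # ~8,319 samples over 3 epochs
optimizer_steps = train_runtime * steps_per_second       # ~1,041, matching global_step below
implied_dataset_size = samples_processed / epochs        # ~2,773 training examples (approx.)

print(round(samples_processed), round(optimizer_steps), round(implied_dataset_size))
```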
trainer_state.json CHANGED
@@ -1,167 +1,111 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 5.0,
+  "epoch": 3.0,
   "eval_steps": 500,
-  "global_step": 1735,
+  "global_step": 1041,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.2881844380403458,
-      "grad_norm": 1.1160194873809814,
-      "learning_rate": 0.000189164265129683,
-      "loss": 4.0205,
+      "grad_norm": 1.4422333240509033,
+      "learning_rate": 0.0001815561959654179,
+      "loss": 3.1655,
       "step": 100
     },
     {
       "epoch": 0.5763688760806917,
-      "grad_norm": 2.0156755447387695,
-      "learning_rate": 0.0001776368876080692,
-      "loss": 1.605,
+      "grad_norm": 1.9075777530670166,
+      "learning_rate": 0.00016234390009606147,
+      "loss": 0.8772,
       "step": 200
     },
     {
       "epoch": 0.8645533141210374,
-      "grad_norm": 1.7225794792175293,
-      "learning_rate": 0.00016610951008645534,
-      "loss": 0.504,
+      "grad_norm": 1.9097354412078857,
+      "learning_rate": 0.0001431316042267051,
+      "loss": 0.451,
       "step": 300
     },
     {
       "epoch": 1.1527377521613833,
-      "grad_norm": 1.113335132598877,
-      "learning_rate": 0.00015458213256484151,
-      "loss": 0.4403,
+      "grad_norm": 1.6237609386444092,
+      "learning_rate": 0.00012391930835734872,
+      "loss": 0.3981,
       "step": 400
     },
     {
       "epoch": 1.440922190201729,
-      "grad_norm": 1.4553213119506836,
-      "learning_rate": 0.00014305475504322766,
-      "loss": 0.3956,
+      "grad_norm": 1.9261122941970825,
+      "learning_rate": 0.00010470701248799233,
+      "loss": 0.367,
       "step": 500
     },
     {
       "epoch": 1.440922190201729,
-      "eval_runtime": 222.0213,
-      "eval_samples_per_second": 5.572,
-      "eval_steps_per_second": 2.788,
+      "eval_runtime": 216.0803,
+      "eval_samples_per_second": 5.725,
+      "eval_steps_per_second": 2.865,
       "step": 500
     },
     {
       "epoch": 1.729106628242075,
-      "grad_norm": 1.1621837615966797,
-      "learning_rate": 0.00013152737752161384,
-      "loss": 0.4173,
+      "grad_norm": 1.2721047401428223,
+      "learning_rate": 8.549471661863592e-05,
+      "loss": 0.3705,
       "step": 600
     },
     {
       "epoch": 2.0172910662824206,
-      "grad_norm": 1.2616935968399048,
-      "learning_rate": 0.00012,
-      "loss": 0.4011,
+      "grad_norm": 1.371832251548767,
+      "learning_rate": 6.628242074927953e-05,
+      "loss": 0.3746,
       "step": 700
     },
     {
       "epoch": 2.3054755043227666,
-      "grad_norm": 1.4558714628219604,
-      "learning_rate": 0.00010847262247838617,
-      "loss": 0.3526,
+      "grad_norm": 1.7075700759887695,
+      "learning_rate": 4.7070124879923156e-05,
+      "loss": 0.3314,
       "step": 800
     },
     {
       "epoch": 2.5936599423631126,
-      "grad_norm": 0.9764755964279175,
-      "learning_rate": 9.694524495677234e-05,
-      "loss": 0.3396,
+      "grad_norm": 1.1070783138275146,
+      "learning_rate": 2.7857829010566765e-05,
+      "loss": 0.3145,
       "step": 900
     },
     {
       "epoch": 2.881844380403458,
-      "grad_norm": 1.3010659217834473,
-      "learning_rate": 8.54178674351585e-05,
-      "loss": 0.331,
+      "grad_norm": 1.6173722743988037,
+      "learning_rate": 8.645533141210376e-06,
+      "loss": 0.3149,
       "step": 1000
     },
     {
       "epoch": 2.881844380403458,
-      "eval_runtime": 221.1814,
-      "eval_samples_per_second": 5.593,
-      "eval_steps_per_second": 2.799,
+      "eval_runtime": 213.2376,
+      "eval_samples_per_second": 5.801,
+      "eval_steps_per_second": 2.903,
       "step": 1000
     },
     {
-      "epoch": 3.170028818443804,
-      "grad_norm": 1.0955506563186646,
-      "learning_rate": 7.389048991354467e-05,
-      "loss": 0.3304,
-      "step": 1100
-    },
-    {
-      "epoch": 3.4582132564841497,
-      "grad_norm": 1.3507652282714844,
-      "learning_rate": 6.236311239193083e-05,
-      "loss": 0.2925,
-      "step": 1200
-    },
-    {
-      "epoch": 3.7463976945244957,
-      "grad_norm": 1.1415163278579712,
-      "learning_rate": 5.083573487031701e-05,
-      "loss": 0.3203,
-      "step": 1300
-    },
-    {
-      "epoch": 4.034582132564841,
-      "grad_norm": 1.4559412002563477,
-      "learning_rate": 3.930835734870317e-05,
-      "loss": 0.2941,
-      "step": 1400
-    },
-    {
-      "epoch": 4.322766570605188,
-      "grad_norm": 1.3477973937988281,
-      "learning_rate": 2.7780979827089336e-05,
-      "loss": 0.269,
-      "step": 1500
-    },
-    {
-      "epoch": 4.322766570605188,
-      "eval_runtime": 221.2997,
-      "eval_samples_per_second": 5.59,
-      "eval_steps_per_second": 2.797,
-      "step": 1500
-    },
-    {
-      "epoch": 4.610951008645533,
-      "grad_norm": 0.9907336831092834,
-      "learning_rate": 1.6253602305475506e-05,
-      "loss": 0.2738,
-      "step": 1600
-    },
-    {
-      "epoch": 4.899135446685879,
-      "grad_norm": 1.5455944538116455,
-      "learning_rate": 4.726224783861672e-06,
-      "loss": 0.2767,
-      "step": 1700
-    },
-    {
-      "epoch": 5.0,
-      "step": 1735,
-      "total_flos": 4.0365651456e+18,
-      "train_loss": 0.6317709513287723,
-      "train_runtime": 3451.1568,
-      "train_samples_per_second": 4.017,
-      "train_steps_per_second": 0.503
+      "epoch": 3.0,
+      "step": 1041,
+      "total_flos": 2.42193908736e+18,
+      "train_loss": 0.6824458238599852,
+      "train_runtime": 2058.2511,
+      "train_samples_per_second": 4.042,
+      "train_steps_per_second": 0.506
     }
   ],
   "logging_steps": 100,
-  "max_steps": 1735,
+  "max_steps": 1041,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 5,
+  "num_train_epochs": 3,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -175,7 +119,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 4.0365651456e+18,
+  "total_flos": 2.42193908736e+18,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b30f32d48e6fc7612992dedde0fe7264e5f8905c5bd332ddc71486c1d962cef3
+ oid sha256:5c9472bc82f396ae3da78adc9241f75284ff06546b9dfc82b30aa8a4384c5c76
  size 5304