RefalMachine committed
Commit e26204b · 1 Parent(s): 3157642
README.md ADDED
@@ -0,0 +1,156 @@
+ ---
+ library_name: peft
+ tags:
+ - generated_from_trainer
+ metrics:
+ - accuracy
+ base_model: outputs/solar_10.7_darulm_unigram_proj_init_8node_darulm_part1_v3_1.0_512_12_02_24
+ model-index:
+ - name: solar_10.7_darulm_unigram_proj_init_darulm_part2_r128_a512_v3_1.0_512_28_02_24
+ results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # solar_10.7_darulm_unigram_proj_init_darulm_part2_r128_a512_v3_1.0_512_28_02_24
+
+ This model is a fine-tuned version of `outputs/solar_10.7_darulm_unigram_proj_init_8node_darulm_part1_v3_1.0_512_12_02_24` (a local checkpoint path, not a published Hub repository) on an unspecified dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 2.2309
+ - Accuracy: 0.5309
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-05
+ - train_batch_size: 1
+ - eval_batch_size: 1
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 24
+ - gradient_accumulation_steps: 5
+ - total_train_batch_size: 120
+ - total_eval_batch_size: 24
+ - optimizer: Adam with betas=(0.9,0.95) and epsilon=1e-05
+ - lr_scheduler_type: linear
+ - num_epochs: 1.0
+ - mixed_precision_training: Native AMP
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
+ |:-------------:|:-----:|:-----:|:---------------:|:--------:|
+ | No log | 0.0 | 1 | 2.3534 | 0.5148 |
+ | 2.4427 | 0.01 | 500 | 2.3338 | 0.5155 |
+ | 2.4399 | 0.02 | 1000 | 2.3276 | 0.5164 |
+ | 2.4244 | 0.03 | 1500 | 2.3231 | 0.5169 |
+ | 2.4336 | 0.04 | 2000 | 2.3194 | 0.5177 |
+ | 2.4201 | 0.06 | 2500 | 2.3156 | 0.5180 |
+ | 2.4245 | 0.07 | 3000 | 2.3128 | 0.5185 |
+ | 2.4157 | 0.08 | 3500 | 2.3097 | 0.5187 |
+ | 2.4054 | 0.09 | 4000 | 2.3070 | 0.5194 |
+ | 2.4161 | 0.1 | 4500 | 2.3033 | 0.5197 |
+ | 2.395 | 0.11 | 5000 | 2.3020 | 0.5201 |
+ | 2.4037 | 0.12 | 5500 | 2.3001 | 0.5204 |
+ | 2.4188 | 0.13 | 6000 | 2.2977 | 0.5206 |
+ | 2.406 | 0.15 | 6500 | 2.2961 | 0.5208 |
+ | 2.4022 | 0.16 | 7000 | 2.2943 | 0.5210 |
+ | 2.3952 | 0.17 | 7500 | 2.2926 | 0.5217 |
+ | 2.394 | 0.18 | 8000 | 2.2909 | 0.5217 |
+ | 2.3828 | 0.19 | 8500 | 2.2891 | 0.5218 |
+ | 2.3903 | 0.2 | 9000 | 2.2882 | 0.5223 |
+ | 2.3943 | 0.21 | 9500 | 2.2861 | 0.5224 |
+ | 2.3944 | 0.22 | 10000 | 2.2851 | 0.5224 |
+ | 2.3872 | 0.23 | 10500 | 2.2841 | 0.5227 |
+ | 2.381 | 0.25 | 11000 | 2.2820 | 0.5228 |
+ | 2.3832 | 0.26 | 11500 | 2.2798 | 0.5232 |
+ | 2.3813 | 0.27 | 12000 | 2.2793 | 0.5237 |
+ | 2.3715 | 0.28 | 12500 | 2.2779 | 0.5241 |
+ | 2.3898 | 0.29 | 13000 | 2.2764 | 0.5240 |
+ | 2.3717 | 0.3 | 13500 | 2.2757 | 0.5240 |
+ | 2.3745 | 0.31 | 14000 | 2.2742 | 0.5244 |
+ | 2.3657 | 0.32 | 14500 | 2.2732 | 0.5244 |
+ | 2.3782 | 0.34 | 15000 | 2.2715 | 0.5247 |
+ | 2.3761 | 0.35 | 15500 | 2.2706 | 0.5247 |
+ | 2.3827 | 0.36 | 16000 | 2.2692 | 0.5249 |
+ | 2.3659 | 0.37 | 16500 | 2.2678 | 0.5251 |
+ | 2.3551 | 0.38 | 17000 | 2.2674 | 0.5252 |
+ | 2.3605 | 0.39 | 17500 | 2.2662 | 0.5255 |
+ | 2.3579 | 0.4 | 18000 | 2.2654 | 0.5256 |
+ | 2.361 | 0.41 | 18500 | 2.2642 | 0.5257 |
+ | 2.3632 | 0.42 | 19000 | 2.2652 | 0.5254 |
+ | 2.3409 | 0.44 | 19500 | 2.2625 | 0.5261 |
+ | 2.3546 | 0.45 | 20000 | 2.2631 | 0.5259 |
+ | 2.361 | 0.46 | 20500 | 2.2611 | 0.5264 |
+ | 2.355 | 0.47 | 21000 | 2.2598 | 0.5264 |
+ | 2.3599 | 0.48 | 21500 | 2.2588 | 0.5265 |
+ | 2.3554 | 0.49 | 22000 | 2.2583 | 0.5265 |
+ | 2.3552 | 0.5 | 22500 | 2.2571 | 0.5268 |
+ | 2.3574 | 0.51 | 23000 | 2.2565 | 0.5268 |
+ | 2.3527 | 0.53 | 23500 | 2.2557 | 0.5272 |
+ | 2.3574 | 0.54 | 24000 | 2.2548 | 0.5272 |
+ | 2.3395 | 0.55 | 24500 | 2.2534 | 0.5274 |
+ | 2.3517 | 0.56 | 25000 | 2.2531 | 0.5272 |
+ | 2.346 | 0.57 | 25500 | 2.2521 | 0.5275 |
+ | 2.3469 | 0.58 | 26000 | 2.2515 | 0.5275 |
+ | 2.3451 | 0.59 | 26500 | 2.2509 | 0.5278 |
+ | 2.3373 | 0.6 | 27000 | 2.2501 | 0.5277 |
+ | 2.3512 | 0.61 | 27500 | 2.2493 | 0.5281 |
+ | 2.3351 | 0.63 | 28000 | 2.2485 | 0.5282 |
+ | 2.3431 | 0.64 | 28500 | 2.2476 | 0.5282 |
+ | 2.3399 | 0.65 | 29000 | 2.2463 | 0.5283 |
+ | 2.3376 | 0.66 | 29500 | 2.2463 | 0.5284 |
+ | 2.3574 | 0.67 | 30000 | 2.2456 | 0.5285 |
+ | 2.3312 | 0.68 | 30500 | 2.2447 | 0.5289 |
+ | 2.3442 | 0.69 | 31000 | 2.2442 | 0.5288 |
+ | 2.338 | 0.7 | 31500 | 2.2434 | 0.5289 |
+ | 2.3345 | 0.72 | 32000 | 2.2433 | 0.5291 |
+ | 2.3314 | 0.73 | 32500 | 2.2420 | 0.5292 |
+ | 2.326 | 0.74 | 33000 | 2.2414 | 0.5293 |
+ | 2.3247 | 0.75 | 33500 | 2.2409 | 0.5295 |
+ | 2.3363 | 0.76 | 34000 | 2.2403 | 0.5296 |
+ | 2.3409 | 0.77 | 34500 | 2.2395 | 0.5297 |
+ | 2.335 | 0.78 | 35000 | 2.2391 | 0.5295 |
+ | 2.3194 | 0.79 | 35500 | 2.2383 | 0.5298 |
+ | 2.3367 | 0.8 | 36000 | 2.2379 | 0.5301 |
+ | 2.3286 | 0.82 | 36500 | 2.2372 | 0.5301 |
+ | 2.3225 | 0.83 | 37000 | 2.2366 | 0.5302 |
+ | 2.3198 | 0.84 | 37500 | 2.2363 | 0.5301 |
+ | 2.3274 | 0.85 | 38000 | 2.2355 | 0.5301 |
+ | 2.3195 | 0.86 | 38500 | 2.2349 | 0.5303 |
+ | 2.3418 | 0.87 | 39000 | 2.2344 | 0.5303 |
+ | 2.323 | 0.88 | 39500 | 2.2340 | 0.5304 |
+ | 2.3211 | 0.89 | 40000 | 2.2336 | 0.5304 |
+ | 2.3332 | 0.91 | 40500 | 2.2334 | 0.5306 |
+ | 2.3226 | 0.92 | 41000 | 2.2329 | 0.5307 |
+ | 2.3329 | 0.93 | 41500 | 2.2325 | 0.5308 |
+ | 2.3172 | 0.94 | 42000 | 2.2321 | 0.5307 |
+ | 2.3231 | 0.95 | 42500 | 2.2319 | 0.5308 |
+ | 2.314 | 0.96 | 43000 | 2.2316 | 0.5309 |
+ | 2.3205 | 0.97 | 43500 | 2.2315 | 0.5308 |
+ | 2.3208 | 0.98 | 44000 | 2.2312 | 0.5309 |
+ | 2.3228 | 0.99 | 44500 | 2.2310 | 0.5309 |
+
+
+ ### Framework versions
+
+ - Transformers 4.37.2
+ - PyTorch 2.1.2
+ - Datasets 2.16.1
+ - Tokenizers 0.15.2
+ - PEFT 0.6.0
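For orientation, here is a minimal sketch of how the hyperparameters listed in the card above might be expressed as `transformers.TrainingArguments`. The authoritative record is `training_args.bin` in this commit; the `output_dir` value and the choice of fp16 over bf16 for "Native AMP" are assumptions.

```python
from transformers import TrainingArguments

# Hypothetical reconstruction of the README hyperparameters.
args = TrainingArguments(
    output_dir="outputs/solar_10.7_lora",  # placeholder
    learning_rate=5e-05,
    per_device_train_batch_size=1,   # "train_batch_size: 1" above
    per_device_eval_batch_size=1,    # "eval_batch_size: 1" above
    gradient_accumulation_steps=5,
    seed=42,
    num_train_epochs=1.0,
    lr_scheduler_type="linear",
    adam_beta1=0.9,
    adam_beta2=0.95,
    adam_epsilon=1e-05,
    fp16=True,  # "Native AMP"; bf16=True would be equally plausible
)
# Effective train batch: 1 per device * 24 devices * 5 accumulation steps = 120
```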
adapter_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "/data/models/gpt/solar/ruadapt_solar_10.7_darulm_unigram_proj_init_part1",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "lora_alpha": 512.0,
+   "lora_dropout": 0.05,
+   "modules_to_save": [
+     "embed_tokens"
+   ],
+   "peft_type": "LORA",
+   "r": 128,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "lm_head",
+     "v_proj",
+     "q_proj",
+     "o_proj",
+     "k_proj"
+   ],
+   "task_type": "CAUSAL_LM"
+ }
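A minimal sketch of loading this adapter with PEFT, assuming the base weights and this adapter are reachable locally or on the Hub (the paths below are placeholders; the `base_model_name_or_path` in the config points at a private local directory):

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_path = "path/to/ruadapt_solar_10.7_base"  # placeholder for the base model
adapter_path = "path/to/this/adapter"          # placeholder for this repo

base = AutoModelForCausalLM.from_pretrained(base_path, torch_dtype=torch.bfloat16)
# Applies the LoRA deltas (r=128, alpha=512) to q/k/v/o_proj and lm_head,
# and swaps in the fully fine-tuned embed_tokens from modules_to_save.
model = PeftModel.from_pretrained(base, adapter_path)
model.eval()
```

Note the scaling implied by the config: with lora_alpha / r = 512 / 128, the LoRA updates are scaled by a factor of 4 before being added to the frozen weights.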
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c25dbeae543388f47ecd3a1453c586252aa56845eed10041dd3411534263e578
+ size 598593576
all_results.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "epoch": 1.0,
+   "eval_accuracy": 0.5309375409708164,
+   "eval_loss": 2.230886697769165,
+   "eval_runtime": 23.8504,
+   "eval_samples": 2433,
+   "eval_samples_per_second": 102.011,
+   "eval_steps_per_second": 4.277,
+   "perplexity": 9.3081159070011,
+   "train_loss": 2.361179032145941,
+   "train_runtime": 131652.555,
+   "train_samples": 5368743,
+   "train_samples_per_second": 40.78,
+   "train_steps_per_second": 0.34
+ }
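The metrics above are internally consistent: the reported perplexity is simply the exponential of the evaluation loss, and the throughput matches the sample counts.

```python
import math

print(math.exp(2.230886697769165))  # ≈ 9.3081159070011, the reported perplexity
print(2433 / 23.8504)               # ≈ 102.011 eval samples per second
```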
eval_results.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "epoch": 1.0,
+   "eval_accuracy": 0.5309375409708164,
+   "eval_loss": 2.230886697769165,
+   "eval_runtime": 23.8504,
+   "eval_samples": 2433,
+   "eval_samples_per_second": 102.011,
+   "eval_steps_per_second": 4.277,
+   "perplexity": 9.3081159070011
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [],
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": null,
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": true
+ }
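Per this config, the tokenizer prepends `<s>` but never appends `</s>`, and no pad token is set. A small sketch of the practical implications (the repo path is a placeholder):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/this/repo")  # placeholder

ids = tok("пример").input_ids
assert ids[0] == tok.bos_token_id   # add_bos_token: true -> leading <s> (id 1)
assert ids[-1] != tok.eos_token_id  # add_eos_token: false -> no trailing </s>

# pad_token is null in the config; set one before padded batching, e.g.:
tok.pad_token = tok.eos_token
```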
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 1.0,
+   "train_loss": 2.361179032145941,
+   "train_runtime": 131652.555,
+   "train_samples": 5368743,
+   "train_samples_per_second": 40.78,
+   "train_steps_per_second": 0.34
+ }
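These figures line up with the batch geometry in the README: with an effective batch of 120 sequences, one epoch over 5,368,743 samples is about 44.7k optimizer steps, which is where the results table above stops.

```python
total_batch = 1 * 24 * 5        # per-device batch * devices * grad accum = 120
print(5_368_743 / total_batch)  # ≈ 44739.5 optimizer steps in one epoch
print(5_368_743 / 131_652.555)  # ≈ 40.78, matching train_samples_per_second
```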
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ab89d60b8b75fab8a51a1930151b21e966b67837dd3751bec79dcd3e1e42db87
+ size 6328