Update README.md
Browse files
README.md
CHANGED
@@ -5,10 +5,10 @@ datasets:
|
|
5 |
language:
|
6 |
- en
|
7 |
|
8 |
-
flan_t5_z3_config.json
|
9 |
|
10 |
{
|
11 |
-
"fp16"
|
12 |
"enabled": "auto",
|
13 |
"loss_scale": 0,
|
14 |
"loss_scale_window": 1000,
|
@@ -17,7 +17,7 @@ flan_t5_z3_config.json:
|
|
17 |
"min_loss_scale": 1
|
18 |
},
|
19 |
|
20 |
-
"optimizer"
|
21 |
"type": "AdamW",
|
22 |
"params": {
|
23 |
"lr": "auto",
|
@@ -27,7 +27,7 @@ flan_t5_z3_config.json:
|
|
27 |
}
|
28 |
},
|
29 |
|
30 |
-
"scheduler"
|
31 |
"type": "WarmupLR",
|
32 |
"params": {
|
33 |
"warmup_min_lr": "auto",
|
@@ -37,7 +37,7 @@ flan_t5_z3_config.json:
|
|
37 |
},
|
38 |
|
39 |
|
40 |
-
"zero_optimization"
|
41 |
"stage": 3,
|
42 |
"overlap_comm": true,
|
43 |
"contiguous_gradients": true,
|
@@ -50,7 +50,7 @@ flan_t5_z3_config.json:
|
|
50 |
"stage3_gather_16bit_weights_on_model_save": true
|
51 |
},
|
52 |
|
53 |
-
"gradient_accumulation_steps"
|
54 |
"gradient_clipping": "auto",
|
55 |
"steps_per_print": 2000,
|
56 |
"train_batch_size": "auto",
|
|
|
5 |
language:
|
6 |
- en
|
7 |
|
8 |
+
**flan_t5_z3_config.json:**
|
9 |
|
10 |
{
|
11 |
+
**"fp16":** {
|
12 |
"enabled": "auto",
|
13 |
"loss_scale": 0,
|
14 |
"loss_scale_window": 1000,
|
|
|
17 |
"min_loss_scale": 1
|
18 |
},
|
19 |
|
20 |
+
**"optimizer":** {
|
21 |
"type": "AdamW",
|
22 |
"params": {
|
23 |
"lr": "auto",
|
|
|
27 |
}
|
28 |
},
|
29 |
|
30 |
+
**"scheduler":** {
|
31 |
"type": "WarmupLR",
|
32 |
"params": {
|
33 |
"warmup_min_lr": "auto",
|
|
|
37 |
},
|
38 |
|
39 |
|
40 |
+
**"zero_optimization":** {
|
41 |
"stage": 3,
|
42 |
"overlap_comm": true,
|
43 |
"contiguous_gradients": true,
|
|
|
50 |
"stage3_gather_16bit_weights_on_model_save": true
|
51 |
},
|
52 |
|
53 |
+
**"gradient_accumulation_steps":** "auto",
|
54 |
"gradient_clipping": "auto",
|
55 |
"steps_per_print": 2000,
|
56 |
"train_batch_size": "auto",
|